src/backend/access/transam/multixact.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * multixact.c
   4  *              PostgreSQL multi-transaction-log manager
   5  *
   6  * The pg_multixact manager is a pg_xact-like manager that stores an array of
   7  * MultiXactMember for each MultiXactId.  It is a fundamental part of the
   8  * shared-row-lock implementation.  Each MultiXactMember is comprised of a
   9  * TransactionId and a set of flag bits.  The name is a bit historical:
  10  * originally, a MultiXactId consisted of more than one TransactionId (except
  11  * in rare corner cases), hence "multi".  Nowadays, however, it's perfectly
  12  * legitimate to have MultiXactIds that only include a single Xid.
  13  *
  14  * The meaning of the flag bits is opaque to this module, but they are mostly
  15  * used in heapam.c to identify lock modes that each of the member transactions
  16  * is holding on any given tuple.  This module just contains support to store
  17  * and retrieve the arrays.
  18  *
  19  * We use two SLRU areas, one for storing the offsets at which the data
  20  * starts for each MultiXactId in the other one.  This trick allows us to
  21  * store variable length arrays of TransactionIds.  (We could alternatively
  22  * use one area containing counts and TransactionIds, with valid MultiXactId
  23  * values pointing at slots containing counts; but that way seems less robust
  24  * since it would get completely confused if someone inquired about a bogus
  25  * MultiXactId that pointed to an intermediate slot containing an XID.)
  26  *
  27  * XLOG interactions: this module generates a record whenever a new OFFSETs or
  28  * MEMBERs page is initialized to zeroes, as well as an
  29  * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
  30  * This module ignores the WAL rule "write xlog before data," because it
  31  * suffices that actions recording a MultiXactId in a heap xmax do follow that
  32  * rule.  The only way for the MXID to be referenced from any data page is for
  33  * heap_lock_tuple() or heap_update() to have put it there, and each generates
  34  * an XLOG record that must follow ours.  The normal LSN interlock between the
  35  * data page and that XLOG record will ensure that our XLOG record reaches
  36  * disk first.  If the SLRU members/offsets data reaches disk sooner than the
  37  * XLOG records, we do not care; after recovery, no xmax will refer to it.  On
  38  * the flip side, to ensure that all referenced entries _do_ reach disk, this
  39  * module's XLOG records completely rebuild the data entered since the last
  40  * checkpoint.  We flush and sync all dirty OFFSETs and MEMBERs pages to disk
  41  * before each checkpoint is considered complete.
  42  *
  43  * Like clog.c, and unlike subtrans.c, we have to preserve state across
  44  * crashes and ensure that MXID and offset numbering increases monotonically
  45  * across a crash.  We do this in the same way as it's done for transaction
  46  * IDs: the WAL record is guaranteed to contain evidence of every MXID we
  47  * could need to worry about, and we just make sure that at the end of
  48  * replay, the next-MXID and next-offset counters are at least as large as
  49  * anything we saw during replay.
  50  *
  51  * We are able to remove segments no longer necessary by carefully tracking
  52  * each table's used values: during vacuum, any multixact older than a certain
  53  * value is removed; the cutoff value is stored in pg_class.  The minimum value
  54  * across all tables in each database is stored in pg_database, and the global
  55  * minimum across all databases is part of pg_control and is kept in shared
  56  * memory.  Whenever that minimum is advanced, the SLRUs are truncated.
  57  *
  58  * When new multixactid values are to be created, care is taken that the
  59  * counter does not fall within the wraparound horizon considering the global
  60  * minimum value.
  61  *
  62  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  63  * Portions Copyright (c) 1994, Regents of the University of California
  64  *
  65  * src/backend/access/transam/multixact.c
  66  *
  67  *-------------------------------------------------------------------------
  68  */
  69 #include "postgres.h"
  70
  71 #include "access/multixact.h"
  72 #include "access/slru.h"
  73 #include "access/transam.h"
  74 #include "access/twophase.h"
  75 #include "access/twophase_rmgr.h"
  76 #include "access/xact.h"
  77 #include "access/xlog.h"
  78 #include "access/xloginsert.h"
  79 #include "access/xlogutils.h"
  80 #include "commands/dbcommands.h"
  81 #include "funcapi.h"
  82 #include "lib/ilist.h"
  83 #include "miscadmin.h"
  84 #include "pg_trace.h"
  85 #include "pgstat.h"
  86 #include "postmaster/autovacuum.h"
  87 #include "storage/pmsignal.h"
  88 #include "storage/proc.h"
  89 #include "storage/procarray.h"
  90 #include "utils/fmgrprotos.h"
  91 #include "utils/guc_hooks.h"
  92 #include "utils/injection_point.h"
  93 #include "utils/memutils.h"
  94
  95
  96 /*
  97  * Defines for MultiXactOffset page sizes.  A page is the same BLCKSZ as is
  98  * used everywhere else in Postgres.
  99  *
 100  * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
 101  * MultiXact page numbering also wraps around at
 102  * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
 103  * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need
 104  * take no explicit notice of that fact in this module, except when comparing
 105  * segment and page numbers in TruncateMultiXact (see
 106  * MultiXactOffsetPagePrecedes).
 107  */
 108
 109 /* We need four bytes per offset */
 110 #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
 111
 112 static inline int64
 113 MultiXactIdToOffsetPage(MultiXactId multi)
 114 {
 115         return multi / MULTIXACT_OFFSETS_PER_PAGE;
 116 }
 117
 118 static inline int
 119 MultiXactIdToOffsetEntry(MultiXactId multi)
 120 {
 121         return multi % MULTIXACT_OFFSETS_PER_PAGE;
 122 }
 123
 124 static inline int64
 125 MultiXactIdToOffsetSegment(MultiXactId multi)
 126 {
 127         return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT;
 128 }
 129
 130 /*
 131  * The situation for members is a bit more complex: we store one byte of
 132  * additional flag bits for each TransactionId.  To do this without getting
 133  * into alignment issues, we store four bytes of flags, and then the
 134  * corresponding 4 Xids.  Each such 5-word (20-byte) set we call a "group", and
  135  * groups are stored whole in pages.  Thus, with 8kB BLCKSZ, we keep 409 groups
 136  * per page.  This wastes 12 bytes per page, but that's OK -- simplicity (and
 137  * performance) trumps space efficiency here.
 138  *
 139  * Note that the "offset" macros work with byte offset, not array indexes, so
 140  * arithmetic must be done using "char *" pointers.
 141  */
 142 /* We need eight bits per xact, so one xact fits in a byte */
 143 #define MXACT_MEMBER_BITS_PER_XACT                      8
 144 #define MXACT_MEMBER_FLAGS_PER_BYTE                     1
 145 #define MXACT_MEMBER_XACT_BITMASK       ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
 146
 147 /* how many full bytes of flags are there in a group? */
 148 #define MULTIXACT_FLAGBYTES_PER_GROUP           4
 149 #define MULTIXACT_MEMBERS_PER_MEMBERGROUP       \
 150         (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
 151 /* size in bytes of a complete group */
 152 #define MULTIXACT_MEMBERGROUP_SIZE \
 153         (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
 154 #define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
 155 #define MULTIXACT_MEMBERS_PER_PAGE      \
 156         (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
 157
 158 /*
 159  * Because the number of items per page is not a divisor of the last item
 160  * number (member 0xFFFFFFFF), the last segment does not use the maximum number
 161  * of pages, and moreover the last used page therein does not use the same
 162  * number of items as previous pages.  (Another way to say it is that the
 163  * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
 164  * has some empty space after that item.)
 165  *
 166  * This constant is the number of members in the last page of the last segment.
 167  */
 168 #define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
 169                 ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
 170
 171 /* page in which a member is to be found */
 172 static inline int64
 173 MXOffsetToMemberPage(MultiXactOffset offset)
 174 {
 175         return offset / MULTIXACT_MEMBERS_PER_PAGE;
 176 }
 177
 178 static inline int64
 179 MXOffsetToMemberSegment(MultiXactOffset offset)
 180 {
 181         return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT;
 182 }
 183
 184 /* Location (byte offset within page) of flag word for a given member */
 185 static inline int
 186 MXOffsetToFlagsOffset(MultiXactOffset offset)
 187 {
 188         MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
 189         int                     grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
 190         int                     byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
 191
 192         return byteoff;
 193 }
 194
 195 static inline int
 196 MXOffsetToFlagsBitShift(MultiXactOffset offset)
 197 {
 198         int                     member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
 199         int                     bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
 200
 201         return bshift;
 202 }
 203
 204 /* Location (byte offset within page) of TransactionId of given member */
 205 static inline int
 206 MXOffsetToMemberOffset(MultiXactOffset offset)
 207 {
 208         int                     member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
 209
 210         return MXOffsetToFlagsOffset(offset) +
 211                 MULTIXACT_FLAGBYTES_PER_GROUP +
 212                 member_in_group * sizeof(TransactionId);
 213 }
 214
 215 /* Multixact members wraparound thresholds. */
 216 #define MULTIXACT_MEMBER_SAFE_THRESHOLD         (MaxMultiXactOffset / 2)
 217 #define MULTIXACT_MEMBER_DANGER_THRESHOLD       \
 218         (MaxMultiXactOffset - MaxMultiXactOffset / 4)
 219
 220 static inline MultiXactId
 221 PreviousMultiXactId(MultiXactId multi)
 222 {
 223         return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1;
 224 }
 225
 226 /*
 227  * Links to shared-memory data structures for MultiXact control
 228  */
 229 static SlruCtlData MultiXactOffsetCtlData;
 230 static SlruCtlData MultiXactMemberCtlData;
 231
 232 #define MultiXactOffsetCtl      (&MultiXactOffsetCtlData)
 233 #define MultiXactMemberCtl      (&MultiXactMemberCtlData)
 234
 235 /*
 236  * MultiXact state shared across all backends.  All this state is protected
 237  * by MultiXactGenLock.  (We also use SLRU bank's lock of MultiXactOffset and
 238  * MultiXactMember to guard accesses to the two sets of SLRU buffers.  For
 239  * concurrency's sake, we avoid holding more than one of these locks at a
 240  * time.)
 241  */
 242 typedef struct MultiXactStateData
 243 {
 244         /* next-to-be-assigned MultiXactId */
 245         MultiXactId nextMXact;
 246
 247         /* next-to-be-assigned offset */
 248         MultiXactOffset nextOffset;
 249
 250         /* Have we completed multixact startup? */
 251         bool            finishedStartup;
 252
 253         /*
 254          * Oldest multixact that is still potentially referenced by a relation.
 255          * Anything older than this should not be consulted.  These values are
 256          * updated by vacuum.
 257          */
 258         MultiXactId oldestMultiXactId;
 259         Oid                     oldestMultiXactDB;
 260
 261         /*
 262          * Oldest multixact offset that is potentially referenced by a multixact
 263          * referenced by a relation.  We don't always know this value, so there's
 264          * a flag here to indicate whether or not we currently do.
 265          */
 266         MultiXactOffset oldestOffset;
 267         bool            oldestOffsetKnown;
 268
 269         /* support for anti-wraparound measures */
 270         MultiXactId multiVacLimit;
 271         MultiXactId multiWarnLimit;
 272         MultiXactId multiStopLimit;
 273         MultiXactId multiWrapLimit;
 274
 275         /* support for members anti-wraparound measures */
 276         MultiXactOffset offsetStopLimit;        /* known if oldestOffsetKnown */
 277
 278         /*
 279          * This is used to sleep until a multixact offset is written when we want
 280          * to create the next one.
 281          */
 282         ConditionVariable nextoff_cv;
 283
 284         /*
 285          * Per-backend data starts here.  We have two arrays stored in the area
 286          * immediately following the MultiXactStateData struct. Each is indexed by
 287          * ProcNumber.
 288          *
 289          * In both arrays, there's a slot for all normal backends
 290          * (0..MaxBackends-1) followed by a slot for max_prepared_xacts prepared
 291          * transactions.
 292          *
 293          * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
 294          * transaction(s) could possibly be a member of, or InvalidMultiXactId
 295          * when the backend has no live transaction that could possibly be a
 296          * member of a MultiXact.  Each backend sets its entry to the current
 297          * nextMXact counter just before first acquiring a shared lock in a given
 298          * transaction, and clears it at transaction end. (This works because only
 299          * during or after acquiring a shared lock could an XID possibly become a
 300          * member of a MultiXact, and that MultiXact would have to be created
 301          * during or after the lock acquisition.)
 302          *
 303          * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
 304          * current transaction(s) think is potentially live, or InvalidMultiXactId
 305          * when not in a transaction or not in a transaction that's paid any
 306          * attention to MultiXacts yet.  This is computed when first needed in a
 307          * given transaction, and cleared at transaction end.  We can compute it
 308          * as the minimum of the valid OldestMemberMXactId[] entries at the time
 309          * we compute it (using nextMXact if none are valid).  Each backend is
 310          * required not to attempt to access any SLRU data for MultiXactIds older
 311          * than its own OldestVisibleMXactId[] setting; this is necessary because
 312          * the relevant SLRU data can be concurrently truncated away.
 313          *
 314          * The oldest valid value among all of the OldestMemberMXactId[] and
 315          * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
 316          * possible value still having any live member transaction -- OldestMxact.
 317          * Any value older than that is typically removed from tuple headers, or
 318          * "frozen" via being replaced with a new xmax.  VACUUM can sometimes even
 319          * remove an individual MultiXact xmax whose value is >= its OldestMxact
 320          * cutoff, though typically only when no individual member XID is still
 321          * running.  See FreezeMultiXactId for full details.
 322          *
 323          * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
 324          * or the oldest extant Multi remaining in the table is used as the new
 325          * pg_class.relminmxid value (whichever is earlier).  The minimum of all
 326          * relminmxid values in each database is stored in pg_database.datminmxid.
 327          * In turn, the minimum of all of those values is stored in pg_control.
 328          * This is used as the truncation point for pg_multixact when unneeded
 329          * segments get removed by vac_truncate_clog() during vacuuming.
 330          */
 331         MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
 332 } MultiXactStateData;
 333
 334 /*
 335  * Size of OldestMemberMXactId and OldestVisibleMXactId arrays.
 336  */
 337 #define MaxOldestSlot   (MaxBackends + max_prepared_xacts)
 338
 339 /* Pointers to the state data in shared memory */
 340 static MultiXactStateData *MultiXactState;
 341 static MultiXactId *OldestMemberMXactId;
 342 static MultiXactId *OldestVisibleMXactId;
 343
 344
 345 /*
 346  * Definitions for the backend-local MultiXactId cache.
 347  *
 348  * We use this cache to store known MultiXacts, so we don't need to go to
 349  * SLRU areas every time.
 350  *
 351  * The cache lasts for the duration of a single transaction, the rationale
 352  * for this being that most entries will contain our own TransactionId and
 353  * so they will be uninteresting by the time our next transaction starts.
 354  * (XXX not clear that this is correct --- other members of the MultiXact
 355  * could hang around longer than we did.  However, it's not clear what a
 356  * better policy for flushing old cache entries would be.)      FIXME actually
 357  * this is plain wrong now that multixact's may contain update Xids.
 358  *
 359  * We allocate the cache entries in a memory context that is deleted at
 360  * transaction end, so we don't need to do retail freeing of entries.
 361  */
 362 typedef struct mXactCacheEnt
 363 {
 364         MultiXactId multi;
 365         int                     nmembers;
 366         dlist_node      node;
 367         MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
 368 } mXactCacheEnt;
 369
 370 #define MAX_CACHE_ENTRIES       256
 371 static dclist_head MXactCache = DCLIST_STATIC_INIT(MXactCache);
 372 static MemoryContext MXactContext = NULL;
 373
 374 #ifdef MULTIXACT_DEBUG
 375 #define debug_elog2(a,b) elog(a,b)
 376 #define debug_elog3(a,b,c) elog(a,b,c)
 377 #define debug_elog4(a,b,c,d) elog(a,b,c,d)
 378 #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
 379 #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
 380 #else
 381 #define debug_elog2(a,b)
 382 #define debug_elog3(a,b,c)
 383 #define debug_elog4(a,b,c,d)
 384 #define debug_elog5(a,b,c,d,e)
 385 #define debug_elog6(a,b,c,d,e,f)
 386 #endif
 387
 388 /* internal MultiXactId management */
 389 static void MultiXactIdSetOldestVisible(void);
 390 static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 391                                                            int nmembers, MultiXactMember *members);
 392 static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
 393
 394 /* MultiXact cache management */
 395 static int      mxactMemberComparator(const void *arg1, const void *arg2);
 396 static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
 397 static int      mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
 398 static void mXactCachePut(MultiXactId multi, int nmembers,
 399                                                   MultiXactMember *members);
 400
 401 static char *mxstatus_to_string(MultiXactStatus status);
 402
 403 /* management of SLRU infrastructure */
 404 static int      ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog);
 405 static int      ZeroMultiXactMemberPage(int64 pageno, bool writeXlog);
 406 static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
 407 static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
 408 static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
 409                                                                         MultiXactOffset offset2);
 410 static void ExtendMultiXactOffset(MultiXactId multi);
 411 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
 412 static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
 413                                                                          MultiXactOffset start, uint32 distance);
 414 static bool SetOffsetVacuumLimit(bool is_startup);
 415 static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
 416 static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
 417 static void WriteMTruncateXlogRec(Oid oldestMultiDB,
 418                                                                   MultiXactId startTruncOff,
 419                                                                   MultiXactId endTruncOff,
 420                                                                   MultiXactOffset startTruncMemb,
 421                                                                   MultiXactOffset endTruncMemb);
 422
 423
 424 /*
 425  * MultiXactIdCreate
 426  *              Construct a MultiXactId representing two TransactionIds.
 427  *
 428  * The two XIDs must be different, or be requesting different statuses.
 429  *
 430  * NB - we don't worry about our local MultiXactId cache here, because that
 431  * is handled by the lower-level routines.
 432  */
 433 MultiXactId
 434 MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
 435                                   TransactionId xid2, MultiXactStatus status2)
 436 {
 437         MultiXactId newMulti;
 438         MultiXactMember members[2];
 439
 440         Assert(TransactionIdIsValid(xid1));
 441         Assert(TransactionIdIsValid(xid2));
 442
 443         Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
 444
 445         /* MultiXactIdSetOldestMember() must have been called already. */
 446         Assert(MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber]));
 447
 448         /*
 449          * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
 450          * are still running.  In typical usage, xid2 will be our own XID and the
 451          * caller just did a check on xid1, so it'd be wasted effort.
 452          */
 453
 454         members[0].xid = xid1;
 455         members[0].status = status1;
 456         members[1].xid = xid2;
 457         members[1].status = status2;
 458
 459         newMulti = MultiXactIdCreateFromMembers(2, members);
 460
 461         debug_elog3(DEBUG2, "Create: %s",
 462                                 mxid_to_string(newMulti, 2, members));
 463
 464         return newMulti;
 465 }
 466
 467 /*
 468  * MultiXactIdExpand
 469  *              Add a TransactionId to a pre-existing MultiXactId.
 470  *
 471  * If the TransactionId is already a member of the passed MultiXactId with the
 472  * same status, just return it as-is.
 473  *
 474  * Note that we do NOT actually modify the membership of a pre-existing
 475  * MultiXactId; instead we create a new one.  This is necessary to avoid
 476  * a race condition against code trying to wait for one MultiXactId to finish;
 477  * see notes in heapam.c.
 478  *
 479  * NB - we don't worry about our local MultiXactId cache here, because that
 480  * is handled by the lower-level routines.
 481  *
 482  * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
 483  * one upgraded by pg_upgrade from a cluster older than this feature) are not
 484  * passed in.
 485  */
 486 MultiXactId
 487 MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 488 {
 489         MultiXactId newMulti;
 490         MultiXactMember *members;
 491         MultiXactMember *newMembers;
 492         int                     nmembers;
 493         int                     i;
 494         int                     j;
 495
 496         Assert(MultiXactIdIsValid(multi));
 497         Assert(TransactionIdIsValid(xid));
 498
 499         /* MultiXactIdSetOldestMember() must have been called already. */
 500         Assert(MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber]));
 501
 502         debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
 503                                 multi, xid, mxstatus_to_string(status));
 504
 505         /*
 506          * Note: we don't allow for old multis here.  The reason is that the only
 507          * caller of this function does a check that the multixact is no longer
 508          * running.
 509          */
 510         nmembers = GetMultiXactIdMembers(multi, &members, false, false);
 511
 512         if (nmembers < 0)
 513         {
 514                 MultiXactMember member;
 515
 516                 /*
 517                  * The MultiXactId is obsolete.  This can only happen if all the
 518                  * MultiXactId members stop running between the caller checking and
 519                  * passing it to us.  It would be better to return that fact to the
 520                  * caller, but it would complicate the API and it's unlikely to happen
 521                  * too often, so just deal with it by creating a singleton MultiXact.
 522                  */
 523                 member.xid = xid;
 524                 member.status = status;
 525                 newMulti = MultiXactIdCreateFromMembers(1, &member);
 526
 527                 debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
 528                                         multi, newMulti);
 529                 return newMulti;
 530         }
 531
 532         /*
 533          * If the TransactionId is already a member of the MultiXactId with the
 534          * same status, just return the existing MultiXactId.
 535          */
 536         for (i = 0; i < nmembers; i++)
 537         {
 538                 if (TransactionIdEquals(members[i].xid, xid) &&
 539                         (members[i].status == status))
 540                 {
 541                         debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
 542                                                 xid, multi);
 543                         pfree(members);
 544                         return multi;
 545                 }
 546         }
 547
 548         /*
 549          * Determine which of the members of the MultiXactId are still of
 550          * interest. This is any running transaction, and also any transaction
 551          * that grabbed something stronger than just a lock and was committed. (An
 552          * update that aborted is of no interest here; and having more than one
 553          * update Xid in a multixact would cause errors elsewhere.)
 554          *
 555          * Removing dead members is not just an optimization: freezing of tuples
 556          * whose Xmax are multis depends on this behavior.
 557          *
 558          * Note we have the same race condition here as above: j could be 0 at the
 559          * end of the loop.
 560          */
 561         newMembers = (MultiXactMember *)
 562                 palloc(sizeof(MultiXactMember) * (nmembers + 1));
 563
 564         for (i = 0, j = 0; i < nmembers; i++)
 565         {
 566                 if (TransactionIdIsInProgress(members[i].xid) ||
 567                         (ISUPDATE_from_mxstatus(members[i].status) &&
 568                          TransactionIdDidCommit(members[i].xid)))
 569                 {
 570                         newMembers[j].xid = members[i].xid;
 571                         newMembers[j++].status = members[i].status;
 572                 }
 573         }
 574
 575         newMembers[j].xid = xid;
 576         newMembers[j++].status = status;
 577         newMulti = MultiXactIdCreateFromMembers(j, newMembers);
 578
 579         pfree(members);
 580         pfree(newMembers);
 581
 582         debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
 583
 584         return newMulti;
 585 }
 586
 587 /*
 588  * MultiXactIdIsRunning
 589  *              Returns whether a MultiXactId is "running".
 590  *
 591  * We return true if at least one member of the given MultiXactId is still
 592  * running.  Note that a "false" result is certain not to change,
 593  * because it is not legal to add members to an existing MultiXactId.
 594  *
 595  * Caller is expected to have verified that the multixact does not come from
 596  * a pg_upgraded share-locked tuple.
 597  */
 598 bool
 599 MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
 600 {
 601         MultiXactMember *members;
 602         int                     nmembers;
 603         int                     i;
 604
 605         debug_elog3(DEBUG2, "IsRunning %u?", multi);
 606
 607         /*
 608          * "false" here means we assume our callers have checked that the given
 609          * multi cannot possibly come from a pg_upgraded database.
 610          */
 611         nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
 612
 613         if (nmembers <= 0)
 614         {
 615                 debug_elog2(DEBUG2, "IsRunning: no members");
 616                 return false;
 617         }
 618
 619         /*
 620          * Checking for myself is cheap compared to looking in shared memory;
 621          * return true if any live subtransaction of the current top-level
 622          * transaction is a member.
 623          *
 624          * This is not needed for correctness, it's just a fast path.
 625          */
 626         for (i = 0; i < nmembers; i++)
 627         {
 628                 if (TransactionIdIsCurrentTransactionId(members[i].xid))
 629                 {
 630                         debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
 631                         pfree(members);
 632                         return true;
 633                 }
 634         }
 635
 636         /*
 637          * This could be made faster by having another entry point in procarray.c,
 638          * walking the PGPROC array only once for all the members.  But in most
 639          * cases nmembers should be small enough that it doesn't much matter.
 640          */
 641         for (i = 0; i < nmembers; i++)
 642         {
 643                 if (TransactionIdIsInProgress(members[i].xid))
 644                 {
 645                         debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
 646                                                 i, members[i].xid);
 647                         pfree(members);
 648                         return true;
 649                 }
 650         }
 651
 652         pfree(members);
 653
 654         debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
 655
 656         return false;
 657 }
 658
 659 /*
 660  * MultiXactIdSetOldestMember
 661  *              Save the oldest MultiXactId this transaction could be a member of.
 662  *
 663  * We set the OldestMemberMXactId for a given transaction the first time it's
 664  * going to do some operation that might require a MultiXactId (tuple lock,
 665  * update or delete).  We need to do this even if we end up using a
 666  * TransactionId instead of a MultiXactId, because there is a chance that
 667  * another transaction would add our XID to a MultiXactId.
 668  *
 669  * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
 670  * be called just before doing any such possibly-MultiXactId-able operation.
 671  */
 672 void
 673 MultiXactIdSetOldestMember(void)
 674 {
 675         if (!MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber]))
 676         {
 677                 MultiXactId nextMXact;
 678
 679                 /*
 680                  * You might think we don't need to acquire a lock here, since
 681                  * fetching and storing of TransactionIds is probably atomic, but in
 682                  * fact we do: suppose we pick up nextMXact and then lose the CPU for
 683                  * a long time.  Someone else could advance nextMXact, and then
 684                  * another someone else could compute an OldestVisibleMXactId that
 685                  * would be after the value we are going to store when we get control
 686                  * back.  Which would be wrong.
 687                  *
 688                  * Note that a shared lock is sufficient, because it's enough to stop
 689                  * someone from advancing nextMXact; and nobody else could be trying
 690                  * to write to our OldestMember entry, only reading (and we assume
 691                  * storing it is atomic.)
 692                  */
 693                 LWLockAcquire(MultiXactGenLock, LW_SHARED);
 694
 695                 /*
 696                  * We have to beware of the possibility that nextMXact is in the
 697                  * wrapped-around state.  We don't fix the counter itself here, but we
 698                  * must be sure to store a valid value in our array entry.
 699                  */
 700                 nextMXact = MultiXactState->nextMXact;
 701                 if (nextMXact < FirstMultiXactId)
 702                         nextMXact = FirstMultiXactId;
 703
 704                 OldestMemberMXactId[MyProcNumber] = nextMXact;
 705
 706                 LWLockRelease(MultiXactGenLock);
 707
 708                 debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
 709                                         MyProcNumber, nextMXact);
 710         }
 711 }
 712
 713 /*
 714  * MultiXactIdSetOldestVisible
 715  *              Save the oldest MultiXactId this transaction considers possibly live.
 716  *
 717  * We set the OldestVisibleMXactId for a given transaction the first time
 718  * it's going to inspect any MultiXactId.  Once we have set this, we are
 719  * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
 720  * won't be truncated away.
 721  *
 722  * The value to set is the oldest of nextMXact and all the valid per-backend
 723  * OldestMemberMXactId[] entries.  Because of the locking we do, we can be
 724  * certain that no subsequent call to MultiXactIdSetOldestMember can set
 725  * an OldestMemberMXactId[] entry older than what we compute here.  Therefore
 726  * there is no live transaction, now or later, that can be a member of any
 727  * MultiXactId older than the OldestVisibleMXactId we compute here.
 728  */
 729 static void
 730 MultiXactIdSetOldestVisible(void)
 731 {
 732         if (!MultiXactIdIsValid(OldestVisibleMXactId[MyProcNumber]))
 733         {
 734                 MultiXactId oldestMXact;
 735                 int                     i;
 736
 737                 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
 738
 739                 /*
 740                  * We have to beware of the possibility that nextMXact is in the
 741                  * wrapped-around state.  We don't fix the counter itself here, but we
 742                  * must be sure to store a valid value in our array entry.
 743                  */
 744                 oldestMXact = MultiXactState->nextMXact;
 745                 if (oldestMXact < FirstMultiXactId)
 746                         oldestMXact = FirstMultiXactId;
 747
 748                 for (i = 0; i < MaxOldestSlot; i++)
 749                 {
 750                         MultiXactId thisoldest = OldestMemberMXactId[i];
 751
 752                         if (MultiXactIdIsValid(thisoldest) &&
 753                                 MultiXactIdPrecedes(thisoldest, oldestMXact))
 754                                 oldestMXact = thisoldest;
 755                 }
 756
 757                 OldestVisibleMXactId[MyProcNumber] = oldestMXact;
 758
 759                 LWLockRelease(MultiXactGenLock);
 760
 761                 debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
 762                                         MyProcNumber, oldestMXact);
 763         }
 764 }
 765
 766 /*
 767  * ReadNextMultiXactId
 768  *              Return the next MultiXactId to be assigned, but don't allocate it
 769  */
 770 MultiXactId
 771 ReadNextMultiXactId(void)
 772 {
 773         MultiXactId mxid;
 774
 775         /* XXX we could presumably do this without a lock. */
 776         LWLockAcquire(MultiXactGenLock, LW_SHARED);
 777         mxid = MultiXactState->nextMXact;
 778         LWLockRelease(MultiXactGenLock);
 779
 780         if (mxid < FirstMultiXactId)
 781                 mxid = FirstMultiXactId;
 782
 783         return mxid;
 784 }
 785
 786 /*
 787  * ReadMultiXactIdRange
 788  *              Get the range of IDs that may still be referenced by a relation.
 789  */
 790 void
 791 ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
 792 {
 793         LWLockAcquire(MultiXactGenLock, LW_SHARED);
 794         *oldest = MultiXactState->oldestMultiXactId;
 795         *next = MultiXactState->nextMXact;
 796         LWLockRelease(MultiXactGenLock);
 797
 798         if (*oldest < FirstMultiXactId)
 799                 *oldest = FirstMultiXactId;
 800         if (*next < FirstMultiXactId)
 801                 *next = FirstMultiXactId;
 802 }
 803
 804
 805 /*
 806  * MultiXactIdCreateFromMembers
 807  *              Make a new MultiXactId from the specified set of members
 808  *
 809  * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
 810  * given TransactionIds as members.  Returns the newly created MultiXactId.
 811  *
 812  * NB: the passed members[] array will be sorted in-place.
 813  */
 814 MultiXactId
 815 MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
 816 {
 817         MultiXactId multi;
 818         MultiXactOffset offset;
 819         xl_multixact_create xlrec;
 820
 821         debug_elog3(DEBUG2, "Create: %s",
 822                                 mxid_to_string(InvalidMultiXactId, nmembers, members));
 823
 824         /*
 825          * See if the same set of members already exists in our cache; if so, just
 826          * re-use that MultiXactId.  (Note: it might seem that looking in our
 827          * cache is insufficient, and we ought to search disk to see if a
 828          * duplicate definition already exists.  But since we only ever create
 829          * MultiXacts containing our own XID, in most cases any such MultiXacts
 830          * were in fact created by us, and so will be in our cache.  There are
 831          * corner cases where someone else added us to a MultiXact without our
 832          * knowledge, but it's not worth checking for.)
 833          */
 834         multi = mXactCacheGetBySet(nmembers, members);
 835         if (MultiXactIdIsValid(multi))
 836         {
 837                 debug_elog2(DEBUG2, "Create: in cache!");
 838                 return multi;
 839         }
 840
 841         /* Verify that there is a single update Xid among the given members. */
 842         {
 843                 int                     i;
 844                 bool            has_update = false;
 845
 846                 for (i = 0; i < nmembers; i++)
 847                 {
 848                         if (ISUPDATE_from_mxstatus(members[i].status))
 849                         {
 850                                 if (has_update)
 851                                         elog(ERROR, "new multixact has more than one updating member: %s",
 852                                                  mxid_to_string(InvalidMultiXactId, nmembers, members));
 853                                 has_update = true;
 854                         }
 855                 }
 856         }
 857
 858         /* Load the injection point before entering the critical section */
 859         INJECTION_POINT_LOAD("multixact-create-from-members");
 860
 861         /*
 862          * Assign the MXID and offsets range to use, and make sure there is space
 863          * in the OFFSETs and MEMBERs files.  NB: this routine does
 864          * START_CRIT_SECTION().
 865          *
 866          * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
 867          * that we've called MultiXactIdSetOldestMember here.  This is because
 868          * this routine is used in some places to create new MultiXactIds of which
 869          * the current backend is not a member, notably during freezing of multis
 870          * in vacuum.  During vacuum, in particular, it would be unacceptable to
 871          * keep OldestMulti set, in case it runs for long.
 872          */
 873         multi = GetNewMultiXactId(nmembers, &offset);
 874
 875         INJECTION_POINT_CACHED("multixact-create-from-members");
 876
 877         /* Make an XLOG entry describing the new MXID. */
 878         xlrec.mid = multi;
 879         xlrec.moff = offset;
 880         xlrec.nmembers = nmembers;
 881
 882         /*
 883          * XXX Note: there's a lot of padding space in MultiXactMember.  We could
 884          * find a more compact representation of this Xlog record -- perhaps all
 885          * the status flags in one XLogRecData, then all the xids in another one?
 886          * Not clear that it's worth the trouble though.
 887          */
 888         XLogBeginInsert();
 889         XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate);
 890         XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember));
 891
 892         (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
 893
 894         /* Now enter the information into the OFFSETs and MEMBERs logs */
 895         RecordNewMultiXact(multi, offset, nmembers, members);
 896
 897         /* Done with critical section */
 898         END_CRIT_SECTION();
 899
 900         /* Store the new MultiXactId in the local cache, too */
 901         mXactCachePut(multi, nmembers, members);
 902
 903         debug_elog2(DEBUG2, "Create: all done");
 904
 905         return multi;
 906 }
 907
 908 /*
 909  * RecordNewMultiXact
 910  *              Write info about a new multixact into the offsets and members files
 911  *
 912  * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
 913  * use it.
 914  */
 915 static void
 916 RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 917                                    int nmembers, MultiXactMember *members)
 918 {
 919         int64           pageno;
 920         int64           prev_pageno;
 921         int                     entryno;
 922         int                     slotno;
 923         MultiXactOffset *offptr;
 924         int                     i;
 925         LWLock     *lock;
 926         LWLock     *prevlock = NULL;
 927
 928         pageno = MultiXactIdToOffsetPage(multi);
 929         entryno = MultiXactIdToOffsetEntry(multi);
 930
 931         lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 932         LWLockAcquire(lock, LW_EXCLUSIVE);
 933
 934         /*
 935          * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
 936          * to complain about if there's any I/O error.  This is kinda bogus, but
 937          * since the errors will always give the full pathname, it should be clear
 938          * enough that a MultiXactId is really involved.  Perhaps someday we'll
 939          * take the trouble to generalize the slru.c error reporting code.
 940          */
 941         slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
 942         offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 943         offptr += entryno;
 944
 945         *offptr = offset;
 946
 947         MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
 948
 949         /* Release MultiXactOffset SLRU lock. */
 950         LWLockRelease(lock);
 951
 952         /*
 953          * If anybody was waiting to know the offset of this multixact ID we just
 954          * wrote, they can read it now, so wake them up.
 955          */
 956         ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
 957
 958         prev_pageno = -1;
 959
 960         for (i = 0; i < nmembers; i++, offset++)
 961         {
 962                 TransactionId *memberptr;
 963                 uint32     *flagsptr;
 964                 uint32          flagsval;
 965                 int                     bshift;
 966                 int                     flagsoff;
 967                 int                     memberoff;
 968
 969                 Assert(members[i].status <= MultiXactStatusUpdate);
 970
 971                 pageno = MXOffsetToMemberPage(offset);
 972                 memberoff = MXOffsetToMemberOffset(offset);
 973                 flagsoff = MXOffsetToFlagsOffset(offset);
 974                 bshift = MXOffsetToFlagsBitShift(offset);
 975
 976                 if (pageno != prev_pageno)
 977                 {
 978                         /*
 979                          * MultiXactMember SLRU page is changed so check if this new page
 980                          * fall into the different SLRU bank then release the old bank's
 981                          * lock and acquire lock on the new bank.
 982                          */
 983                         lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
 984                         if (lock != prevlock)
 985                         {
 986                                 if (prevlock != NULL)
 987                                         LWLockRelease(prevlock);
 988
 989                                 LWLockAcquire(lock, LW_EXCLUSIVE);
 990                                 prevlock = lock;
 991                         }
 992                         slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
 993                         prev_pageno = pageno;
 994                 }
 995
 996                 memberptr = (TransactionId *)
 997                         (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
 998
 999                 *memberptr = members[i].xid;
1000
1001                 flagsptr = (uint32 *)
1002                         (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1003
1004                 flagsval = *flagsptr;
1005                 flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
1006                 flagsval |= (members[i].status << bshift);
1007                 *flagsptr = flagsval;
1008
1009                 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
1010         }
1011
1012         if (prevlock != NULL)
1013                 LWLockRelease(prevlock);
1014 }
1015
1016 /*
1017  * GetNewMultiXactId
1018  *              Get the next MultiXactId.
1019  *
1020  * Also, reserve the needed amount of space in the "members" area.  The
1021  * starting offset of the reserved space is returned in *offset.
1022  *
1023  * This may generate XLOG records for expansion of the offsets and/or members
1024  * files.  Unfortunately, we have to do that while holding MultiXactGenLock
1025  * to avoid race conditions --- the XLOG record for zeroing a page must appear
1026  * before any backend can possibly try to store data in that page!
1027  *
1028  * We start a critical section before advancing the shared counters.  The
1029  * caller must end the critical section after writing SLRU data.
1030  */
1031 static MultiXactId
1032 GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
1033 {
1034         MultiXactId result;
1035         MultiXactOffset nextOffset;
1036
1037         debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
1038
1039         /* safety check, we should never get this far in a HS standby */
1040         if (RecoveryInProgress())
1041                 elog(ERROR, "cannot assign MultiXactIds during recovery");
1042
1043         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1044
1045         /* Handle wraparound of the nextMXact counter */
1046         if (MultiXactState->nextMXact < FirstMultiXactId)
1047                 MultiXactState->nextMXact = FirstMultiXactId;
1048
1049         /* Assign the MXID */
1050         result = MultiXactState->nextMXact;
1051
1052         /*----------
1053          * Check to see if it's safe to assign another MultiXactId.  This protects
1054          * against catastrophic data loss due to multixact wraparound.  The basic
1055          * rules are:
1056          *
1057          * If we're past multiVacLimit or the safe threshold for member storage
1058          * space, or we don't know what the safe threshold for member storage is,
1059          * start trying to force autovacuum cycles.
1060          * If we're past multiWarnLimit, start issuing warnings.
1061          * If we're past multiStopLimit, refuse to create new MultiXactIds.
1062          *
1063          * Note these are pretty much the same protections in GetNewTransactionId.
1064          *----------
1065          */
1066         if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
1067         {
1068                 /*
1069                  * For safety's sake, we release MultiXactGenLock while sending
1070                  * signals, warnings, etc.  This is not so much because we care about
1071                  * preserving concurrency in this situation, as to avoid any
1072                  * possibility of deadlock while doing get_database_name(). First,
1073                  * copy all the shared values we'll need in this path.
1074                  */
1075                 MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
1076                 MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
1077                 MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
1078                 Oid                     oldest_datoid = MultiXactState->oldestMultiXactDB;
1079
1080                 LWLockRelease(MultiXactGenLock);
1081
1082                 if (IsUnderPostmaster &&
1083                         !MultiXactIdPrecedes(result, multiStopLimit))
1084                 {
1085                         char       *oldest_datname = get_database_name(oldest_datoid);
1086
1087                         /*
1088                          * Immediately kick autovacuum into action as we're already in
1089                          * ERROR territory.
1090                          */
1091                         SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1092
1093                         /* complain even if that DB has disappeared */
1094                         if (oldest_datname)
1095                                 ereport(ERROR,
1096                                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1097                                                  errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1098                                                                 oldest_datname),
1099                                                  errhint("Execute a database-wide VACUUM in that database.\n"
1100                                                                  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1101                         else
1102                                 ereport(ERROR,
1103                                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1104                                                  errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
1105                                                                 oldest_datoid),
1106                                                  errhint("Execute a database-wide VACUUM in that database.\n"
1107                                                                  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1108                 }
1109
1110                 /*
1111                  * To avoid swamping the postmaster with signals, we issue the autovac
1112                  * request only once per 64K multis generated.  This still gives
1113                  * plenty of chances before we get into real trouble.
1114                  */
1115                 if (IsUnderPostmaster && (result % 65536) == 0)
1116                         SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1117
1118                 if (!MultiXactIdPrecedes(result, multiWarnLimit))
1119                 {
1120                         char       *oldest_datname = get_database_name(oldest_datoid);
1121
1122                         /* complain even if that DB has disappeared */
1123                         if (oldest_datname)
1124                                 ereport(WARNING,
1125                                                 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1126                                                                            "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1127                                                                            multiWrapLimit - result,
1128                                                                            oldest_datname,
1129                                                                            multiWrapLimit - result),
1130                                                  errhint("Execute a database-wide VACUUM in that database.\n"
1131                                                                  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1132                         else
1133                                 ereport(WARNING,
1134                                                 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1135                                                                            "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1136                                                                            multiWrapLimit - result,
1137                                                                            oldest_datoid,
1138                                                                            multiWrapLimit - result),
1139                                                  errhint("Execute a database-wide VACUUM in that database.\n"
1140                                                                  "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1141                 }
1142
1143                 /* Re-acquire lock and start over */
1144                 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1145                 result = MultiXactState->nextMXact;
1146                 if (result < FirstMultiXactId)
1147                         result = FirstMultiXactId;
1148         }
1149
1150         /* Make sure there is room for the MXID in the file.  */
1151         ExtendMultiXactOffset(result);
1152
1153         /*
1154          * Reserve the members space, similarly to above.  Also, be careful not to
1155          * return zero as the starting offset for any multixact. See
1156          * GetMultiXactIdMembers() for motivation.
1157          */
1158         nextOffset = MultiXactState->nextOffset;
1159         if (nextOffset == 0)
1160         {
1161                 *offset = 1;
1162                 nmembers++;                             /* allocate member slot 0 too */
1163         }
1164         else
1165                 *offset = nextOffset;
1166
1167         /*----------
1168          * Protect against overrun of the members space as well, with the
1169          * following rules:
1170          *
1171          * If we're past offsetStopLimit, refuse to generate more multis.
1172          * If we're close to offsetStopLimit, emit a warning.
1173          *
1174          * Arbitrarily, we start emitting warnings when we're 20 segments or less
1175          * from offsetStopLimit.
1176          *
1177          * Note we haven't updated the shared state yet, so if we fail at this
1178          * point, the multixact ID we grabbed can still be used by the next guy.
1179          *
1180          * Note that there is no point in forcing autovacuum runs here: the
1181          * multixact freeze settings would have to be reduced for that to have any
1182          * effect.
1183          *----------
1184          */
1185 #define OFFSET_WARN_SEGMENTS    20
1186         if (MultiXactState->oldestOffsetKnown &&
1187                 MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
1188                                                                  nmembers))
1189         {
1190                 /* see comment in the corresponding offsets wraparound case */
1191                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1192
1193                 ereport(ERROR,
1194                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1195                                  errmsg("multixact \"members\" limit exceeded"),
1196                                  errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1197                                                                   "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1198                                                                   MultiXactState->offsetStopLimit - nextOffset - 1,
1199                                                                   nmembers,
1200                                                                   MultiXactState->offsetStopLimit - nextOffset - 1),
1201                                  errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
1202                                                  MultiXactState->oldestMultiXactDB)));
1203         }
1204
1205         /*
1206          * Check whether we should kick autovacuum into action, to prevent members
1207          * wraparound. NB we use a much larger window to trigger autovacuum than
1208          * just the warning limit. The warning is just a measure of last resort -
1209          * this is in line with GetNewTransactionId's behaviour.
1210          */
1211         if (!MultiXactState->oldestOffsetKnown ||
1212                 (MultiXactState->nextOffset - MultiXactState->oldestOffset
1213                  > MULTIXACT_MEMBER_SAFE_THRESHOLD))
1214         {
1215                 /*
1216                  * To avoid swamping the postmaster with signals, we issue the autovac
1217                  * request only when crossing a segment boundary. With default
1218                  * compilation settings that's roughly after 50k members.  This still
1219                  * gives plenty of chances before we get into real trouble.
1220                  */
1221                 if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1222                         (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1223                         SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1224         }
1225
1226         if (MultiXactState->oldestOffsetKnown &&
1227                 MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
1228                                                                  nextOffset,
1229                                                                  nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
1230                 ereport(WARNING,
1231                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1232                                  errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1233                                                            "database with OID %u must be vacuumed before %d more multixact members are used",
1234                                                            MultiXactState->offsetStopLimit - nextOffset + nmembers,
1235                                                            MultiXactState->oldestMultiXactDB,
1236                                                            MultiXactState->offsetStopLimit - nextOffset + nmembers),
1237                                  errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
1238
1239         ExtendMultiXactMember(nextOffset, nmembers);
1240
1241         /*
1242          * Critical section from here until caller has written the data into the
1243          * just-reserved SLRU space; we don't want to error out with a partly
1244          * written MultiXact structure.  (In particular, failing to write our
1245          * start offset after advancing nextMXact would effectively corrupt the
1246          * previous MultiXact.)
1247          */
1248         START_CRIT_SECTION();
1249
1250         /*
1251          * Advance counters.  As in GetNewTransactionId(), this must not happen
1252          * until after file extension has succeeded!
1253          *
1254          * We don't care about MultiXactId wraparound here; it will be handled by
1255          * the next iteration.  But note that nextMXact may be InvalidMultiXactId
1256          * or the first value on a segment-beginning page after this routine
1257          * exits, so anyone else looking at the variable must be prepared to deal
1258          * with either case.  Similarly, nextOffset may be zero, but we won't use
1259          * that as the actual start offset of the next multixact.
1260          */
1261         (MultiXactState->nextMXact)++;
1262
1263         MultiXactState->nextOffset += nmembers;
1264
1265         LWLockRelease(MultiXactGenLock);
1266
1267         debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1268         return result;
1269 }
1270
1271 /*
1272  * GetMultiXactIdMembers
1273  *              Return the set of MultiXactMembers that make up a MultiXactId
1274  *
1275  * Return value is the number of members found, or -1 if there are none,
1276  * and *members is set to a newly palloc'ed array of members.  It's the
1277  * caller's responsibility to free it when done with it.
1278  *
 * from_pgupgrade must be passed as true if and only if the multixact
1280  * corresponds to a value from a tuple that was locked in a 9.2-or-older
1281  * installation and later pg_upgrade'd (that is, the infomask is
1282  * HEAP_LOCKED_UPGRADED).  In this case, we know for certain that no members
1283  * can still be running, so we return -1 just like for an empty multixact
1284  * without any further checking.  It would be wrong to try to resolve such a
1285  * multixact: either the multixact is within the current valid multixact
1286  * range, in which case the returned result would be bogus, or outside that
1287  * range, in which case an error would be raised.
1288  *
1289  * In all other cases, the passed multixact must be within the known valid
 * range, that is, greater than or equal to oldestMultiXactId, and less than
1291  * nextMXact.  Otherwise, an error is raised.
1292  *
1293  * isLockOnly must be set to true if caller is certain that the given multi
1294  * is used only to lock tuples; can be false without loss of correctness,
1295  * but passing a true means we can return quickly without checking for
1296  * old updates.
1297  */
1298 int
1299 GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
1300                                           bool from_pgupgrade, bool isLockOnly)
1301 {
1302         int64           pageno;
1303         int64           prev_pageno;
1304         int                     entryno;
1305         int                     slotno;
1306         MultiXactOffset *offptr;
1307         MultiXactOffset offset;
1308         int                     length;
1309         int                     truelength;
1310         MultiXactId oldestMXact;
1311         MultiXactId nextMXact;
1312         MultiXactId tmpMXact;
1313         MultiXactOffset nextOffset;
1314         MultiXactMember *ptr;
1315         LWLock     *lock;
1316         bool            slept = false;
1317
1318         debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1319
1320         if (!MultiXactIdIsValid(multi) || from_pgupgrade)
1321         {
1322                 *members = NULL;
1323                 return -1;
1324         }
1325
1326         /* See if the MultiXactId is in the local cache */
1327         length = mXactCacheGetById(multi, members);
1328         if (length >= 0)
1329         {
1330                 debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1331                                         mxid_to_string(multi, length, *members));
1332                 return length;
1333         }
1334
1335         /* Set our OldestVisibleMXactId[] entry if we didn't already */
1336         MultiXactIdSetOldestVisible();
1337
1338         /*
1339          * If we know the multi is used only for locking and not for updates, then
1340          * we can skip checking if the value is older than our oldest visible
1341          * multi.  It cannot possibly still be running.
1342          */
1343         if (isLockOnly &&
1344                 MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyProcNumber]))
1345         {
1346                 debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1347                 *members = NULL;
1348                 return -1;
1349         }
1350
1351         /*
1352          * We check known limits on MultiXact before resorting to the SLRU area.
1353          *
1354          * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1355          * useful; it has already been removed, or will be removed shortly, by
1356          * truncation.  If one is passed, an error is raised.
1357          *
1358          * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1359          * implies undetected ID wraparound has occurred.  This raises a hard
1360          * error.
1361          *
1362          * Shared lock is enough here since we aren't modifying any global state.
1363          * Acquire it just long enough to grab the current counter values.  We may
1364          * need both nextMXact and nextOffset; see below.
1365          */
1366         LWLockAcquire(MultiXactGenLock, LW_SHARED);
1367
1368         oldestMXact = MultiXactState->oldestMultiXactId;
1369         nextMXact = MultiXactState->nextMXact;
1370         nextOffset = MultiXactState->nextOffset;
1371
1372         LWLockRelease(MultiXactGenLock);
1373
1374         if (MultiXactIdPrecedes(multi, oldestMXact))
1375                 ereport(ERROR,
1376                                 (errcode(ERRCODE_INTERNAL_ERROR),
1377                                  errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1378                                                 multi)));
1379
1380         if (!MultiXactIdPrecedes(multi, nextMXact))
1381                 ereport(ERROR,
1382                                 (errcode(ERRCODE_INTERNAL_ERROR),
1383                                  errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1384                                                 multi)));
1385
1386         /*
1387          * Find out the offset at which we need to start reading MultiXactMembers
1388          * and the number of members in the multixact.  We determine the latter as
1389          * the difference between this multixact's starting offset and the next
1390          * one's.  However, there are some corner cases to worry about:
1391          *
1392          * 1. This multixact may be the latest one created, in which case there is
1393          * no next one to look at.  In this case the nextOffset value we just
1394          * saved is the correct endpoint.
1395          *
1396          * 2. The next multixact may still be in process of being filled in: that
1397          * is, another process may have done GetNewMultiXactId but not yet written
1398          * the offset entry for that ID.  In that scenario, it is guaranteed that
1399          * the offset entry for that multixact exists (because GetNewMultiXactId
1400          * won't release MultiXactGenLock until it does) but contains zero
1401          * (because we are careful to pre-zero offset pages). Because
1402          * GetNewMultiXactId will never return zero as the starting offset for a
1403          * multixact, when we read zero as the next multixact's offset, we know we
1404          * have this case.  We handle this by sleeping on the condition variable
1405          * we have just for this; the process in charge will signal the CV as soon
1406          * as it has finished writing the multixact offset.
1407          *
1408          * 3. Because GetNewMultiXactId increments offset zero to offset one to
1409          * handle case #2, there is an ambiguity near the point of offset
1410          * wraparound.  If we see next multixact's offset is one, is that our
1411          * multixact's actual endpoint, or did it end at zero with a subsequent
1412          * increment?  We handle this using the knowledge that if the zero'th
1413          * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1414          * transaction ID so it can't be a multixact member.  Therefore, if we
1415          * read a zero from the members array, just ignore it.
1416          *
1417          * This is all pretty messy, but the mess occurs only in infrequent corner
1418          * cases, so it seems better than holding the MultiXactGenLock for a long
1419          * time on every multixact creation.
1420          */
1421 retry:
1422         pageno = MultiXactIdToOffsetPage(multi);
1423         entryno = MultiXactIdToOffsetEntry(multi);
1424
1425         /* Acquire the bank lock for the page we need. */
1426         lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1427         LWLockAcquire(lock, LW_EXCLUSIVE);
1428
1429         slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1430         offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1431         offptr += entryno;
1432         offset = *offptr;
1433
1434         Assert(offset != 0);
1435
1436         /*
1437          * Use the same increment rule as GetNewMultiXactId(), that is, don't
1438          * handle wraparound explicitly until needed.
1439          */
1440         tmpMXact = multi + 1;
1441
1442         if (nextMXact == tmpMXact)
1443         {
1444                 /* Corner case 1: there is no next multixact */
1445                 length = nextOffset - offset;
1446         }
1447         else
1448         {
1449                 MultiXactOffset nextMXOffset;
1450
1451                 /* handle wraparound if needed */
1452                 if (tmpMXact < FirstMultiXactId)
1453                         tmpMXact = FirstMultiXactId;
1454
1455                 prev_pageno = pageno;
1456
1457                 pageno = MultiXactIdToOffsetPage(tmpMXact);
1458                 entryno = MultiXactIdToOffsetEntry(tmpMXact);
1459
1460                 if (pageno != prev_pageno)
1461                 {
1462                         LWLock     *newlock;
1463
1464                         /*
1465                          * Since we're going to access a different SLRU page, if this page
1466                          * falls under a different bank, release the old bank's lock and
1467                          * acquire the lock of the new bank.
1468                          */
1469                         newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1470                         if (newlock != lock)
1471                         {
1472                                 LWLockRelease(lock);
1473                                 LWLockAcquire(newlock, LW_EXCLUSIVE);
1474                                 lock = newlock;
1475                         }
1476                         slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1477                 }
1478
1479                 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1480                 offptr += entryno;
1481                 nextMXOffset = *offptr;
1482
1483                 if (nextMXOffset == 0)
1484                 {
1485                         /* Corner case 2: next multixact is still being filled in */
1486                         LWLockRelease(lock);
1487                         CHECK_FOR_INTERRUPTS();
1488
1489                         INJECTION_POINT("multixact-get-members-cv-sleep");
1490
1491                         ConditionVariableSleep(&MultiXactState->nextoff_cv,
1492                                                                    WAIT_EVENT_MULTIXACT_CREATION);
1493                         slept = true;
1494                         goto retry;
1495                 }
1496
1497                 length = nextMXOffset - offset;
1498         }
1499
1500         LWLockRelease(lock);
1501         lock = NULL;
1502
1503         /*
1504          * If we slept above, clean up state; it's no longer needed.
1505          */
1506         if (slept)
1507                 ConditionVariableCancelSleep();
1508
1509         ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1510
1511         truelength = 0;
1512         prev_pageno = -1;
1513         for (int i = 0; i < length; i++, offset++)
1514         {
1515                 TransactionId *xactptr;
1516                 uint32     *flagsptr;
1517                 int                     flagsoff;
1518                 int                     bshift;
1519                 int                     memberoff;
1520
1521                 pageno = MXOffsetToMemberPage(offset);
1522                 memberoff = MXOffsetToMemberOffset(offset);
1523
1524                 if (pageno != prev_pageno)
1525                 {
1526                         LWLock     *newlock;
1527
1528                         /*
1529                          * Since we're going to access a different SLRU page, if this page
1530                          * falls under a different bank, release the old bank's lock and
1531                          * acquire the lock of the new bank.
1532                          */
1533                         newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1534                         if (newlock != lock)
1535                         {
1536                                 if (lock)
1537                                         LWLockRelease(lock);
1538                                 LWLockAcquire(newlock, LW_EXCLUSIVE);
1539                                 lock = newlock;
1540                         }
1541
1542                         slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1543                         prev_pageno = pageno;
1544                 }
1545
1546                 xactptr = (TransactionId *)
1547                         (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1548
1549                 if (!TransactionIdIsValid(*xactptr))
1550                 {
1551                         /* Corner case 3: we must be looking at unused slot zero */
1552                         Assert(offset == 0);
1553                         continue;
1554                 }
1555
1556                 flagsoff = MXOffsetToFlagsOffset(offset);
1557                 bshift = MXOffsetToFlagsBitShift(offset);
1558                 flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1559
1560                 ptr[truelength].xid = *xactptr;
1561                 ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1562                 truelength++;
1563         }
1564
1565         LWLockRelease(lock);
1566
1567         /* A multixid with zero members should not happen */
1568         Assert(truelength > 0);
1569
1570         /*
1571          * Copy the result into the local cache.
1572          */
1573         mXactCachePut(multi, truelength, ptr);
1574
1575         debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1576                                 mxid_to_string(multi, truelength, ptr));
1577         *members = ptr;
1578         return truelength;
1579 }
1580
1581 /*
1582  * mxactMemberComparator
1583  *              qsort comparison function for MultiXactMember
1584  *
1585  * We can't use wraparound comparison for XIDs because that does not respect
1586  * the triangle inequality!  Any old sort order will do.
1587  */
1588 static int
1589 mxactMemberComparator(const void *arg1, const void *arg2)
1590 {
1591         MultiXactMember member1 = *(const MultiXactMember *) arg1;
1592         MultiXactMember member2 = *(const MultiXactMember *) arg2;
1593
1594         if (member1.xid > member2.xid)
1595                 return 1;
1596         if (member1.xid < member2.xid)
1597                 return -1;
1598         if (member1.status > member2.status)
1599                 return 1;
1600         if (member1.status < member2.status)
1601                 return -1;
1602         return 0;
1603 }
1604
1605 /*
1606  * mXactCacheGetBySet
1607  *              returns a MultiXactId from the cache based on the set of
1608  *              TransactionIds that compose it, or InvalidMultiXactId if
1609  *              none matches.
1610  *
1611  * This is helpful, for example, if two transactions want to lock a huge
1612  * table.  By using the cache, the second will use the same MultiXactId
1613  * for the majority of tuples, thus keeping MultiXactId usage low (saving
1614  * both I/O and wraparound issues).
1615  *
1616  * NB: the passed members array will be sorted in-place.
1617  */
1618 static MultiXactId
1619 mXactCacheGetBySet(int nmembers, MultiXactMember *members)
1620 {
1621         dlist_iter      iter;
1622
1623         debug_elog3(DEBUG2, "CacheGet: looking for %s",
1624                                 mxid_to_string(InvalidMultiXactId, nmembers, members));
1625
1626         /* sort the array so comparison is easy */
1627         qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1628
1629         dclist_foreach(iter, &MXactCache)
1630         {
1631                 mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1632                                                                                                 iter.cur);
1633
1634                 if (entry->nmembers != nmembers)
1635                         continue;
1636
1637                 /*
1638                  * We assume the cache entries are sorted, and that the unused bits in
1639                  * "status" are zeroed.
1640                  */
1641                 if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1642                 {
1643                         debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1644                         dclist_move_head(&MXactCache, iter.cur);
1645                         return entry->multi;
1646                 }
1647         }
1648
1649         debug_elog2(DEBUG2, "CacheGet: not found :-(");
1650         return InvalidMultiXactId;
1651 }
1652
1653 /*
1654  * mXactCacheGetById
1655  *              returns the composing MultiXactMember set from the cache for a
1656  *              given MultiXactId, if present.
1657  *
1658  * If successful, *xids is set to the address of a palloc'd copy of the
1659  * MultiXactMember set.  Return value is number of members, or -1 on failure.
1660  */
1661 static int
1662 mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
1663 {
1664         dlist_iter      iter;
1665
1666         debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1667
1668         dclist_foreach(iter, &MXactCache)
1669         {
1670                 mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1671                                                                                                 iter.cur);
1672
1673                 if (entry->multi == multi)
1674                 {
1675                         MultiXactMember *ptr;
1676                         Size            size;
1677
1678                         size = sizeof(MultiXactMember) * entry->nmembers;
1679                         ptr = (MultiXactMember *) palloc(size);
1680
1681                         memcpy(ptr, entry->members, size);
1682
1683                         debug_elog3(DEBUG2, "CacheGet: found %s",
1684                                                 mxid_to_string(multi,
1685                                                                            entry->nmembers,
1686                                                                            entry->members));
1687
1688                         /*
1689                          * Note we modify the list while not using a modifiable iterator.
1690                          * This is acceptable only because we exit the iteration
1691                          * immediately afterwards.
1692                          */
1693                         dclist_move_head(&MXactCache, iter.cur);
1694
1695                         *members = ptr;
1696                         return entry->nmembers;
1697                 }
1698         }
1699
1700         debug_elog2(DEBUG2, "CacheGet: not found");
1701         return -1;
1702 }
1703
1704 /*
1705  * mXactCachePut
1706  *              Add a new MultiXactId and its composing set into the local cache.
1707  */
1708 static void
1709 mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1710 {
1711         mXactCacheEnt *entry;
1712
1713         debug_elog3(DEBUG2, "CachePut: storing %s",
1714                                 mxid_to_string(multi, nmembers, members));
1715
1716         if (MXactContext == NULL)
1717         {
1718                 /* The cache only lives as long as the current transaction */
1719                 debug_elog2(DEBUG2, "CachePut: initializing memory context");
1720                 MXactContext = AllocSetContextCreate(TopTransactionContext,
1721                                                                                          "MultiXact cache context",
1722                                                                                          ALLOCSET_SMALL_SIZES);
1723         }
1724
1725         entry = (mXactCacheEnt *)
1726                 MemoryContextAlloc(MXactContext,
1727                                                    offsetof(mXactCacheEnt, members) +
1728                                                    nmembers * sizeof(MultiXactMember));
1729
1730         entry->multi = multi;
1731         entry->nmembers = nmembers;
1732         memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1733
1734         /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1735         qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1736
1737         dclist_push_head(&MXactCache, &entry->node);
1738         if (dclist_count(&MXactCache) > MAX_CACHE_ENTRIES)
1739         {
1740                 dlist_node *node;
1741
1742                 node = dclist_tail_node(&MXactCache);
1743                 dclist_delete_from(&MXactCache, node);
1744
1745                 entry = dclist_container(mXactCacheEnt, node, node);
1746                 debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1747                                         entry->multi);
1748
1749                 pfree(entry);
1750         }
1751 }
1752
1753 static char *
1754 mxstatus_to_string(MultiXactStatus status)
1755 {
1756         switch (status)
1757         {
1758                 case MultiXactStatusForKeyShare:
1759                         return "keysh";
1760                 case MultiXactStatusForShare:
1761                         return "sh";
1762                 case MultiXactStatusForNoKeyUpdate:
1763                         return "fornokeyupd";
1764                 case MultiXactStatusForUpdate:
1765                         return "forupd";
1766                 case MultiXactStatusNoKeyUpdate:
1767                         return "nokeyupd";
1768                 case MultiXactStatusUpdate:
1769                         return "upd";
1770                 default:
1771                         elog(ERROR, "unrecognized multixact status %d", status);
1772                         return "";
1773         }
1774 }
1775
1776 char *
1777 mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1778 {
1779         static char *str = NULL;
1780         StringInfoData buf;
1781         int                     i;
1782
1783         if (str != NULL)
1784                 pfree(str);
1785
1786         initStringInfo(&buf);
1787
1788         appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1789                                          mxstatus_to_string(members[0].status));
1790
1791         for (i = 1; i < nmembers; i++)
1792                 appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1793                                                  mxstatus_to_string(members[i].status));
1794
1795         appendStringInfoChar(&buf, ']');
1796         str = MemoryContextStrdup(TopMemoryContext, buf.data);
1797         pfree(buf.data);
1798         return str;
1799 }
1800
1801 /*
1802  * AtEOXact_MultiXact
1803  *              Handle transaction end for MultiXact
1804  *
1805  * This is called at top transaction commit or abort (we don't care which).
1806  */
1807 void
1808 AtEOXact_MultiXact(void)
1809 {
1810         /*
1811          * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1812          * which should only be valid while within a transaction.
1813          *
1814          * We assume that storing a MultiXactId is atomic and so we need not take
1815          * MultiXactGenLock to do this.
1816          */
1817         OldestMemberMXactId[MyProcNumber] = InvalidMultiXactId;
1818         OldestVisibleMXactId[MyProcNumber] = InvalidMultiXactId;
1819
1820         /*
1821          * Discard the local MultiXactId cache.  Since MXactContext was created as
1822          * a child of TopTransactionContext, we needn't delete it explicitly.
1823          */
1824         MXactContext = NULL;
1825         dclist_init(&MXactCache);
1826 }
1827
1828 /*
1829  * AtPrepare_MultiXact
1830  *              Save multixact state at 2PC transaction prepare
1831  *
1832  * In this phase, we only store our OldestMemberMXactId value in the two-phase
1833  * state file.
1834  */
1835 void
1836 AtPrepare_MultiXact(void)
1837 {
1838         MultiXactId myOldestMember = OldestMemberMXactId[MyProcNumber];
1839
1840         if (MultiXactIdIsValid(myOldestMember))
1841                 RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
1842                                                            &myOldestMember, sizeof(MultiXactId));
1843 }
1844
1845 /*
1846  * PostPrepare_MultiXact
1847  *              Clean up after successful PREPARE TRANSACTION
1848  */
1849 void
1850 PostPrepare_MultiXact(TransactionId xid)
1851 {
1852         MultiXactId myOldestMember;
1853
1854         /*
1855          * Transfer our OldestMemberMXactId value to the slot reserved for the
1856          * prepared transaction.
1857          */
1858         myOldestMember = OldestMemberMXactId[MyProcNumber];
1859         if (MultiXactIdIsValid(myOldestMember))
1860         {
1861                 ProcNumber      dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
1862
1863                 /*
1864                  * Even though storing MultiXactId is atomic, acquire lock to make
1865                  * sure others see both changes, not just the reset of the slot of the
1866                  * current backend. Using a volatile pointer might suffice, but this
1867                  * isn't a hot spot.
1868                  */
1869                 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1870
1871                 OldestMemberMXactId[dummyProcNumber] = myOldestMember;
1872                 OldestMemberMXactId[MyProcNumber] = InvalidMultiXactId;
1873
1874                 LWLockRelease(MultiXactGenLock);
1875         }
1876
1877         /*
1878          * We don't need to transfer OldestVisibleMXactId value, because the
1879          * transaction is not going to be looking at any more multixacts once it's
1880          * prepared.
1881          *
1882          * We assume that storing a MultiXactId is atomic and so we need not take
1883          * MultiXactGenLock to do this.
1884          */
1885         OldestVisibleMXactId[MyProcNumber] = InvalidMultiXactId;
1886
1887         /*
1888          * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1889          */
1890         MXactContext = NULL;
1891         dclist_init(&MXactCache);
1892 }
1893
1894 /*
1895  * multixact_twophase_recover
1896  *              Recover the state of a prepared transaction at startup
1897  */
1898 void
1899 multixact_twophase_recover(TransactionId xid, uint16 info,
1900                                                    void *recdata, uint32 len)
1901 {
1902         ProcNumber      dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
1903         MultiXactId oldestMember;
1904
1905         /*
1906          * Get the oldest member XID from the state file record, and set it in the
1907          * OldestMemberMXactId slot reserved for this prepared transaction.
1908          */
1909         Assert(len == sizeof(MultiXactId));
1910         oldestMember = *((MultiXactId *) recdata);
1911
1912         OldestMemberMXactId[dummyProcNumber] = oldestMember;
1913 }
1914
1915 /*
1916  * multixact_twophase_postcommit
1917  *              Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1918  */
1919 void
1920 multixact_twophase_postcommit(TransactionId xid, uint16 info,
1921                                                           void *recdata, uint32 len)
1922 {
1923         ProcNumber      dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, true);
1924
1925         Assert(len == sizeof(MultiXactId));
1926
1927         OldestMemberMXactId[dummyProcNumber] = InvalidMultiXactId;
1928 }
1929
1930 /*
1931  * multixact_twophase_postabort
1932  *              This is actually just the same as the COMMIT case.
1933  */
1934 void
1935 multixact_twophase_postabort(TransactionId xid, uint16 info,
1936                                                          void *recdata, uint32 len)
1937 {
1938         multixact_twophase_postcommit(xid, info, recdata, len);
1939 }
1940
1941 /*
1942  * Initialization of shared memory for MultiXact.  We use two SLRU areas,
1943  * thus double memory.  Also, reserve space for the shared MultiXactState
1944  * struct and the per-backend MultiXactId arrays (two of those, too).
1945  */
1946 Size
1947 MultiXactShmemSize(void)
1948 {
1949         Size            size;
1950
1951         /* We need 2*MaxOldestSlot perBackendXactIds[] entries */
1952 #define SHARED_MULTIXACT_STATE_SIZE \
1953         add_size(offsetof(MultiXactStateData, perBackendXactIds), \
1954                          mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1955
1956         size = SHARED_MULTIXACT_STATE_SIZE;
1957         size = add_size(size, SimpleLruShmemSize(multixact_offset_buffers, 0));
1958         size = add_size(size, SimpleLruShmemSize(multixact_member_buffers, 0));
1959
1960         return size;
1961 }
1962
1963 void
1964 MultiXactShmemInit(void)
1965 {
1966         bool            found;
1967
1968         debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1969
1970         MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes;
1971         MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
1972
1973         SimpleLruInit(MultiXactOffsetCtl,
1974                                   "multixact_offset", multixact_offset_buffers, 0,
1975                                   "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
1976                                   LWTRANCHE_MULTIXACTOFFSET_SLRU,
1977                                   SYNC_HANDLER_MULTIXACT_OFFSET,
1978                                   false);
1979         SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
1980         SimpleLruInit(MultiXactMemberCtl,
1981                                   "multixact_member", multixact_member_buffers, 0,
1982                                   "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
1983                                   LWTRANCHE_MULTIXACTMEMBER_SLRU,
1984                                   SYNC_HANDLER_MULTIXACT_MEMBER,
1985                                   false);
1986         /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
1987
1988         /* Initialize our shared state struct */
1989         MultiXactState = ShmemInitStruct("Shared MultiXact State",
1990                                                                          SHARED_MULTIXACT_STATE_SIZE,
1991                                                                          &found);
1992         if (!IsUnderPostmaster)
1993         {
1994                 Assert(!found);
1995
1996                 /* Make sure we zero out the per-backend state */
1997                 MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
1998                 ConditionVariableInit(&MultiXactState->nextoff_cv);
1999         }
2000         else
2001                 Assert(found);
2002
2003         /*
2004          * Set up array pointers.
2005          */
2006         OldestMemberMXactId = MultiXactState->perBackendXactIds;
2007         OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot;
2008 }
2009
2010 /*
2011  * GUC check_hook for multixact_offset_buffers
2012  */
2013 bool
2014 check_multixact_offset_buffers(int *newval, void **extra, GucSource source)
2015 {
2016         return check_slru_buffers("multixact_offset_buffers", newval);
2017 }
2018
2019 /*
2020  * GUC check_hook for multixact_member_buffers
2021  */
2022 bool
2023 check_multixact_member_buffers(int *newval, void **extra, GucSource source)
2024 {
2025         return check_slru_buffers("multixact_member_buffers", newval);
2026 }
2027
2028 /*
2029  * This func must be called ONCE on system install.  It creates the initial
2030  * MultiXact segments.  (The MultiXacts directories are assumed to have been
2031  * created by initdb, and MultiXactShmemInit must have been called already.)
2032  */
2033 void
2034 BootStrapMultiXact(void)
2035 {
2036         int                     slotno;
2037         LWLock     *lock;
2038
2039         lock = SimpleLruGetBankLock(MultiXactOffsetCtl, 0);
2040         LWLockAcquire(lock, LW_EXCLUSIVE);
2041
2042         /* Create and zero the first page of the offsets log */
2043         slotno = ZeroMultiXactOffsetPage(0, false);
2044
2045         /* Make sure it's written out */
2046         SimpleLruWritePage(MultiXactOffsetCtl, slotno);
2047         Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
2048
2049         LWLockRelease(lock);
2050
2051         lock = SimpleLruGetBankLock(MultiXactMemberCtl, 0);
2052         LWLockAcquire(lock, LW_EXCLUSIVE);
2053
2054         /* Create and zero the first page of the members log */
2055         slotno = ZeroMultiXactMemberPage(0, false);
2056
2057         /* Make sure it's written out */
2058         SimpleLruWritePage(MultiXactMemberCtl, slotno);
2059         Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
2060
2061         LWLockRelease(lock);
2062 }
2063
2064 /*
2065  * Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
2066  * If writeXlog is true, also emit an XLOG record saying we did this.
2067  *
2068  * The page is not actually written, just set up in shared memory.
2069  * The slot number of the new page is returned.
2070  *
2071  * Control lock must be held at entry, and will be held at exit.
2072  */
2073 static int
2074 ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
2075 {
2076         int                     slotno;
2077
2078         slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2079
2080         if (writeXlog)
2081                 WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
2082
2083         return slotno;
2084 }
2085
2086 /*
2087  * Ditto, for MultiXactMember
2088  */
2089 static int
2090 ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
2091 {
2092         int                     slotno;
2093
2094         slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
2095
2096         if (writeXlog)
2097                 WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
2098
2099         return slotno;
2100 }
2101
2102 /*
2103  * MaybeExtendOffsetSlru
2104  *              Extend the offsets SLRU area, if necessary
2105  *
2106  * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
2107  * contain files that are shorter than necessary; this would occur if the old
2108  * installation had used multixacts beyond the first page (files cannot be
2109  * copied, because the on-disk representation is different).  pg_upgrade would
2110  * update pg_control to set the next offset value to be at that position, so
2111  * that tuples marked as locked by such MultiXacts would be seen as visible
2112  * without having to consult multixact.  However, trying to create and use a
2113  * new MultiXactId would result in an error because the page on which the new
2114  * value would reside does not exist.  This routine is in charge of creating
2115  * such pages.
2116  */
2117 static void
2118 MaybeExtendOffsetSlru(void)
2119 {
2120         int64           pageno;
2121         LWLock     *lock;
2122
2123         pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
2124         lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2125
2126         LWLockAcquire(lock, LW_EXCLUSIVE);
2127
2128         if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2129         {
2130                 int                     slotno;
2131
2132                 /*
2133                  * Fortunately for us, SimpleLruWritePage is already prepared to deal
2134                  * with creating a new segment file even if the page we're writing is
2135                  * not the first in it, so this is enough.
2136                  */
2137                 slotno = ZeroMultiXactOffsetPage(pageno, false);
2138                 SimpleLruWritePage(MultiXactOffsetCtl, slotno);
2139         }
2140
2141         LWLockRelease(lock);
2142 }
2143
2144 /*
2145  * This must be called ONCE during postmaster or standalone-backend startup.
2146  *
2147  * StartupXLOG has already established nextMXact/nextOffset by calling
2148  * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
2149  * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
2150  * replayed WAL.
2151  */
2152 void
2153 StartupMultiXact(void)
2154 {
2155         MultiXactId multi = MultiXactState->nextMXact;
2156         MultiXactOffset offset = MultiXactState->nextOffset;
2157         int64           pageno;
2158
2159         /*
2160          * Initialize offset's idea of the latest page number.
2161          */
2162         pageno = MultiXactIdToOffsetPage(multi);
2163         pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2164                                                 pageno);
2165
2166         /*
2167          * Initialize member's idea of the latest page number.
2168          */
2169         pageno = MXOffsetToMemberPage(offset);
2170         pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2171                                                 pageno);
2172 }
2173
2174 /*
2175  * This must be called ONCE at the end of startup/recovery.
2176  */
2177 void
2178 TrimMultiXact(void)
2179 {
2180         MultiXactId nextMXact;
2181         MultiXactOffset offset;
2182         MultiXactId oldestMXact;
2183         Oid                     oldestMXactDB;
2184         int64           pageno;
2185         int                     entryno;
2186         int                     flagsoff;
2187
2188         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2189         nextMXact = MultiXactState->nextMXact;
2190         offset = MultiXactState->nextOffset;
2191         oldestMXact = MultiXactState->oldestMultiXactId;
2192         oldestMXactDB = MultiXactState->oldestMultiXactDB;
2193         LWLockRelease(MultiXactGenLock);
2194
2195         /* Clean up offsets state */
2196
2197         /*
2198          * (Re-)Initialize our idea of the latest page number for offsets.
2199          */
2200         pageno = MultiXactIdToOffsetPage(nextMXact);
2201         pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2202                                                 pageno);
2203
2204         /*
2205          * Zero out the remainder of the current offsets page.  See notes in
2206          * TrimCLOG() for background.  Unlike CLOG, some WAL record covers every
2207          * pg_multixact SLRU mutation.  Since, also unlike CLOG, we ignore the WAL
2208          * rule "write xlog before data," nextMXact successors may carry obsolete,
2209          * nonzero offset values.  Zero those so case 2 of GetMultiXactIdMembers()
2210          * operates normally.
2211          */
2212         entryno = MultiXactIdToOffsetEntry(nextMXact);
2213         if (entryno != 0)
2214         {
2215                 int                     slotno;
2216                 MultiXactOffset *offptr;
2217                 LWLock     *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2218
2219                 LWLockAcquire(lock, LW_EXCLUSIVE);
2220                 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2221                 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2222                 offptr += entryno;
2223
2224                 MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2225
2226                 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2227                 LWLockRelease(lock);
2228         }
2229
2230         /*
2231          * And the same for members.
2232          *
2233          * (Re-)Initialize our idea of the latest page number for members.
2234          */
2235         pageno = MXOffsetToMemberPage(offset);
2236         pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2237                                                 pageno);
2238
2239         /*
2240          * Zero out the remainder of the current members page.  See notes in
2241          * TrimCLOG() for motivation.
2242          */
2243         flagsoff = MXOffsetToFlagsOffset(offset);
2244         if (flagsoff != 0)
2245         {
2246                 int                     slotno;
2247                 TransactionId *xidptr;
2248                 int                     memberoff;
2249                 LWLock     *lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
2250
2251                 LWLockAcquire(lock, LW_EXCLUSIVE);
2252                 memberoff = MXOffsetToMemberOffset(offset);
2253                 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2254                 xidptr = (TransactionId *)
2255                         (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
2256
2257                 MemSet(xidptr, 0, BLCKSZ - memberoff);
2258
2259                 /*
2260                  * Note: we don't need to zero out the flag bits in the remaining
2261                  * members of the current group, because they are always reset before
2262                  * writing.
2263                  */
2264
2265                 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2266                 LWLockRelease(lock);
2267         }
2268
2269         /* signal that we're officially up */
2270         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2271         MultiXactState->finishedStartup = true;
2272         LWLockRelease(MultiXactGenLock);
2273
2274         /* Now compute how far away the next members wraparound is. */
2275         SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
2276 }
2277
2278 /*
2279  * Get the MultiXact data to save in a checkpoint record
2280  */
2281 void
2282 MultiXactGetCheckptMulti(bool is_shutdown,
2283                                                  MultiXactId *nextMulti,
2284                                                  MultiXactOffset *nextMultiOffset,
2285                                                  MultiXactId *oldestMulti,
2286                                                  Oid *oldestMultiDB)
2287 {
2288         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2289         *nextMulti = MultiXactState->nextMXact;
2290         *nextMultiOffset = MultiXactState->nextOffset;
2291         *oldestMulti = MultiXactState->oldestMultiXactId;
2292         *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2293         LWLockRelease(MultiXactGenLock);
2294
2295         debug_elog6(DEBUG2,
2296                                 "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2297                                 *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2298 }
2299
2300 /*
2301  * Perform a checkpoint --- either during shutdown, or on-the-fly
2302  */
2303 void
2304 CheckPointMultiXact(void)
2305 {
2306         TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2307
2308         /*
2309          * Write dirty MultiXact pages to disk.  This may result in sync requests
2310          * queued for later handling by ProcessSyncRequests(), as part of the
2311          * checkpoint.
2312          */
2313         SimpleLruWriteAll(MultiXactOffsetCtl, true);
2314         SimpleLruWriteAll(MultiXactMemberCtl, true);
2315
2316         TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2317 }
2318
2319 /*
2320  * Set the next-to-be-assigned MultiXactId and offset
2321  *
2322  * This is used when we can determine the correct next ID/offset exactly
2323  * from a checkpoint record.  Although this is only called during bootstrap
2324  * and XLog replay, we take the lock in case any hot-standby backends are
2325  * examining the values.
2326  */
2327 void
2328 MultiXactSetNextMXact(MultiXactId nextMulti,
2329                                           MultiXactOffset nextMultiOffset)
2330 {
2331         debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2332                                 nextMulti, nextMultiOffset);
2333         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2334         MultiXactState->nextMXact = nextMulti;
2335         MultiXactState->nextOffset = nextMultiOffset;
2336         LWLockRelease(MultiXactGenLock);
2337
2338         /*
2339          * During a binary upgrade, make sure that the offsets SLRU is large
2340          * enough to contain the next value that would be created.
2341          *
2342          * We need to do this pretty early during the first startup in binary
2343          * upgrade mode: before StartupMultiXact() in fact, because this routine
2344          * is called even before that by StartupXLOG().  And we can't do it
2345          * earlier than at this point, because during that first call of this
2346          * routine we determine the MultiXactState->nextMXact value that
2347          * MaybeExtendOffsetSlru needs.
2348          */
2349         if (IsBinaryUpgrade)
2350                 MaybeExtendOffsetSlru();
2351 }
2352
2353 /*
2354  * Determine the last safe MultiXactId to allocate given the currently oldest
2355  * datminmxid (ie, the oldest MultiXactId that might exist in any database
2356  * of our cluster), and the OID of the (or a) database with that value.
2357  *
2358  * is_startup is true when we are just starting the cluster, false when we
2359  * are updating state in a running cluster.  This only affects log messages.
2360  */
2361 void
2362 SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
2363                                         bool is_startup)
2364 {
2365         MultiXactId multiVacLimit;
2366         MultiXactId multiWarnLimit;
2367         MultiXactId multiStopLimit;
2368         MultiXactId multiWrapLimit;
2369         MultiXactId curMulti;
2370         bool            needs_offset_vacuum;
2371
2372         Assert(MultiXactIdIsValid(oldest_datminmxid));
2373
2374         /*
2375          * We pretend that a wrap will happen halfway through the multixact ID
2376          * space, but that's not really true, because multixacts wrap differently
2377          * from transaction IDs.  Note that, separately from any concern about
2378          * multixact IDs wrapping, we must ensure that multixact members do not
2379          * wrap.  Limits for that are set in SetOffsetVacuumLimit, not here.
2380          */
2381         multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
2382         if (multiWrapLimit < FirstMultiXactId)
2383                 multiWrapLimit += FirstMultiXactId;
2384
2385         /*
2386          * We'll refuse to continue assigning MultiXactIds once we get within 3M
2387          * multi of data loss.  See SetTransactionIdLimit.
2388          */
2389         multiStopLimit = multiWrapLimit - 3000000;
2390         if (multiStopLimit < FirstMultiXactId)
2391                 multiStopLimit -= FirstMultiXactId;
2392
2393         /*
2394          * We'll start complaining loudly when we get within 40M multis of data
2395          * loss.  This is kind of arbitrary, but if you let your gas gauge get
2396          * down to 2% of full, would you be looking for the next gas station?  We
2397          * need to be fairly liberal about this number because there are lots of
2398          * scenarios where most transactions are done by automatic clients that
2399          * won't pay attention to warnings.  (No, we're not gonna make this
2400          * configurable.  If you know enough to configure it, you know enough to
2401          * not get in this kind of trouble in the first place.)
2402          */
2403         multiWarnLimit = multiWrapLimit - 40000000;
2404         if (multiWarnLimit < FirstMultiXactId)
2405                 multiWarnLimit -= FirstMultiXactId;
2406
2407         /*
2408          * We'll start trying to force autovacuums when oldest_datminmxid gets to
2409          * be more than autovacuum_multixact_freeze_max_age mxids old.
2410          *
2411          * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2412          * so that we don't have to worry about dealing with on-the-fly changes in
2413          * its value.  See SetTransactionIdLimit.
2414          */
2415         multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2416         if (multiVacLimit < FirstMultiXactId)
2417                 multiVacLimit += FirstMultiXactId;
2418
2419         /* Grab lock for just long enough to set the new limit values */
2420         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2421         MultiXactState->oldestMultiXactId = oldest_datminmxid;
2422         MultiXactState->oldestMultiXactDB = oldest_datoid;
2423         MultiXactState->multiVacLimit = multiVacLimit;
2424         MultiXactState->multiWarnLimit = multiWarnLimit;
2425         MultiXactState->multiStopLimit = multiStopLimit;
2426         MultiXactState->multiWrapLimit = multiWrapLimit;
2427         curMulti = MultiXactState->nextMXact;
2428         LWLockRelease(MultiXactGenLock);
2429
2430         /* Log the info */
2431         ereport(DEBUG1,
2432                         (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
2433                                                          multiWrapLimit, oldest_datoid)));
2434
2435         /*
2436          * Computing the actual limits is only possible once the data directory is
2437          * in a consistent state. There's no need to compute the limits while
2438          * still replaying WAL - no decisions about new multis are made even
2439          * though multixact creations might be replayed. So we'll only do further
2440          * checks after TrimMultiXact() has been called.
2441          */
2442         if (!MultiXactState->finishedStartup)
2443                 return;
2444
2445         Assert(!InRecovery);
2446
2447         /* Set limits for offset vacuum. */
2448         needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
2449
2450         /*
2451          * If past the autovacuum force point, immediately signal an autovac
2452          * request.  The reason for this is that autovac only processes one
2453          * database per invocation.  Once it's finished cleaning up the oldest
2454          * database, it'll call here, and we'll signal the postmaster to start
2455          * another iteration immediately if there are still any old databases.
2456          */
2457         if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
2458                  needs_offset_vacuum) && IsUnderPostmaster)
2459                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
2460
2461         /* Give an immediate warning if past the wrap warn point */
2462         if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2463         {
2464                 char       *oldest_datname;
2465
2466                 /*
2467                  * We can be called when not inside a transaction, for example during
2468                  * StartupXLOG().  In such a case we cannot do database access, so we
2469                  * must just report the oldest DB's OID.
2470                  *
2471                  * Note: it's also possible that get_database_name fails and returns
2472                  * NULL, for example because the database just got dropped.  We'll
2473                  * still warn, even though the warning might now be unnecessary.
2474                  */
2475                 if (IsTransactionState())
2476                         oldest_datname = get_database_name(oldest_datoid);
2477                 else
2478                         oldest_datname = NULL;
2479
2480                 if (oldest_datname)
2481                         ereport(WARNING,
2482                                         (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2483                                                                    "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2484                                                                    multiWrapLimit - curMulti,
2485                                                                    oldest_datname,
2486                                                                    multiWrapLimit - curMulti),
2487                                          errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2488                                                          "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2489                 else
2490                         ereport(WARNING,
2491                                         (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2492                                                                    "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2493                                                                    multiWrapLimit - curMulti,
2494                                                                    oldest_datoid,
2495                                                                    multiWrapLimit - curMulti),
2496                                          errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2497                                                          "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2498         }
2499 }
2500
2501 /*
2502  * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2503  * and similarly nextOffset is at least minMultiOffset.
2504  *
2505  * This is used when we can determine minimum safe values from an XLog
2506  * record (either an on-line checkpoint or an mxact creation log entry).
2507  * Although this is only called during XLog replay, we take the lock in case
2508  * any hot-standby backends are examining the values.
2509  */
2510 void
2511 MultiXactAdvanceNextMXact(MultiXactId minMulti,
2512                                                   MultiXactOffset minMultiOffset)
2513 {
2514         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2515         if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
2516         {
2517                 debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2518                 MultiXactState->nextMXact = minMulti;
2519         }
2520         if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
2521         {
2522                 debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2523                                         minMultiOffset);
2524                 MultiXactState->nextOffset = minMultiOffset;
2525         }
2526         LWLockRelease(MultiXactGenLock);
2527 }
2528
2529 /*
2530  * Update our oldestMultiXactId value, but only if it's more recent than what
2531  * we had.
2532  *
2533  * This may only be called during WAL replay.
2534  */
2535 void
2536 MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2537 {
2538         Assert(InRecovery);
2539
2540         if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
2541                 SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
2542 }
2543
2544 /*
2545  * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2546  *
2547  * NB: this is called while holding MultiXactGenLock.  We want it to be very
2548  * fast most of the time; even when it's not so fast, no actual I/O need
2549  * happen unless we're forced to write out a dirty log or xlog page to make
2550  * room in shared memory.
2551  */
2552 static void
2553 ExtendMultiXactOffset(MultiXactId multi)
2554 {
2555         int64           pageno;
2556         LWLock     *lock;
2557
2558         /*
2559          * No work except at first MultiXactId of a page.  But beware: just after
2560          * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2561          */
2562         if (MultiXactIdToOffsetEntry(multi) != 0 &&
2563                 multi != FirstMultiXactId)
2564                 return;
2565
2566         pageno = MultiXactIdToOffsetPage(multi);
2567         lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2568
2569         LWLockAcquire(lock, LW_EXCLUSIVE);
2570
2571         /* Zero the page and make an XLOG entry about it */
2572         ZeroMultiXactOffsetPage(pageno, true);
2573
2574         LWLockRelease(lock);
2575 }
2576
2577 /*
2578  * Make sure that MultiXactMember has room for the members of a newly-
2579  * allocated MultiXactId.
2580  *
2581  * Like the above routine, this is called while holding MultiXactGenLock;
2582  * same comments apply.
2583  */
2584 static void
2585 ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
2586 {
2587         /*
2588          * It's possible that the members span more than one page of the members
2589          * file, so we loop to ensure we consider each page.  The coding is not
2590          * optimal if the members span several pages, but that seems unusual
2591          * enough to not worry much about.
2592          */
2593         while (nmembers > 0)
2594         {
2595                 int                     flagsoff;
2596                 int                     flagsbit;
2597                 uint32          difference;
2598
2599                 /*
2600                  * Only zero when at first entry of a page.
2601                  */
2602                 flagsoff = MXOffsetToFlagsOffset(offset);
2603                 flagsbit = MXOffsetToFlagsBitShift(offset);
2604                 if (flagsoff == 0 && flagsbit == 0)
2605                 {
2606                         int64           pageno;
2607                         LWLock     *lock;
2608
2609                         pageno = MXOffsetToMemberPage(offset);
2610                         lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
2611
2612                         LWLockAcquire(lock, LW_EXCLUSIVE);
2613
2614                         /* Zero the page and make an XLOG entry about it */
2615                         ZeroMultiXactMemberPage(pageno, true);
2616
2617                         LWLockRelease(lock);
2618                 }
2619
2620                 /*
2621                  * Compute the number of items till end of current page.  Careful: if
2622                  * addition of unsigned ints wraps around, we're at the last page of
2623                  * the last segment; since that page holds a different number of items
2624                  * than other pages, we need to do it differently.
2625                  */
2626                 if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2627                 {
2628                         /*
2629                          * This is the last page of the last segment; we can compute the
2630                          * number of items left to allocate in it without modulo
2631                          * arithmetic.
2632                          */
2633                         difference = MaxMultiXactOffset - offset + 1;
2634                 }
2635                 else
2636                         difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
2637
2638                 /*
2639                  * Advance to next page, taking care to properly handle the wraparound
2640                  * case.  OK if nmembers goes negative.
2641                  */
2642                 nmembers -= difference;
2643                 offset += difference;
2644         }
2645 }
2646
2647 /*
2648  * GetOldestMultiXactId
2649  *
2650  * Return the oldest MultiXactId that's still possibly still seen as live by
2651  * any running transaction.  Older ones might still exist on disk, but they no
2652  * longer have any running member transaction.
2653  *
2654  * It's not safe to truncate MultiXact SLRU segments on the value returned by
2655  * this function; however, it can be set as the new relminmxid for any table
2656  * that VACUUM knows has no remaining MXIDs < the same value.  It is only safe
2657  * to truncate SLRUs when no table can possibly still have a referencing MXID.
2658  */
2659 MultiXactId
2660 GetOldestMultiXactId(void)
2661 {
2662         MultiXactId oldestMXact;
2663         MultiXactId nextMXact;
2664         int                     i;
2665
2666         /*
2667          * This is the oldest valid value among all the OldestMemberMXactId[] and
2668          * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2669          */
2670         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2671
2672         /*
2673          * We have to beware of the possibility that nextMXact is in the
2674          * wrapped-around state.  We don't fix the counter itself here, but we
2675          * must be sure to use a valid value in our calculation.
2676          */
2677         nextMXact = MultiXactState->nextMXact;
2678         if (nextMXact < FirstMultiXactId)
2679                 nextMXact = FirstMultiXactId;
2680
2681         oldestMXact = nextMXact;
2682         for (i = 0; i < MaxOldestSlot; i++)
2683         {
2684                 MultiXactId thisoldest;
2685
2686                 thisoldest = OldestMemberMXactId[i];
2687                 if (MultiXactIdIsValid(thisoldest) &&
2688                         MultiXactIdPrecedes(thisoldest, oldestMXact))
2689                         oldestMXact = thisoldest;
2690                 thisoldest = OldestVisibleMXactId[i];
2691                 if (MultiXactIdIsValid(thisoldest) &&
2692                         MultiXactIdPrecedes(thisoldest, oldestMXact))
2693                         oldestMXact = thisoldest;
2694         }
2695
2696         LWLockRelease(MultiXactGenLock);
2697
2698         return oldestMXact;
2699 }
2700
2701 /*
2702  * Determine how aggressively we need to vacuum in order to prevent member
2703  * wraparound.
2704  *
2705  * To do so determine what's the oldest member offset and install the limit
2706  * info in MultiXactState, where it can be used to prevent overrun of old data
2707  * in the members SLRU area.
2708  *
2709  * The return value is true if emergency autovacuum is required and false
2710  * otherwise.
2711  */
2712 static bool
2713 SetOffsetVacuumLimit(bool is_startup)
2714 {
2715         MultiXactId oldestMultiXactId;
2716         MultiXactId nextMXact;
2717         MultiXactOffset oldestOffset = 0;       /* placate compiler */
2718         MultiXactOffset prevOldestOffset;
2719         MultiXactOffset nextOffset;
2720         bool            oldestOffsetKnown = false;
2721         bool            prevOldestOffsetKnown;
2722         MultiXactOffset offsetStopLimit = 0;
2723         MultiXactOffset prevOffsetStopLimit;
2724
2725         /*
2726          * NB: Have to prevent concurrent truncation, we might otherwise try to
2727          * lookup an oldestMulti that's concurrently getting truncated away.
2728          */
2729         LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2730
2731         /* Read relevant fields from shared memory. */
2732         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2733         oldestMultiXactId = MultiXactState->oldestMultiXactId;
2734         nextMXact = MultiXactState->nextMXact;
2735         nextOffset = MultiXactState->nextOffset;
2736         prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2737         prevOldestOffset = MultiXactState->oldestOffset;
2738         prevOffsetStopLimit = MultiXactState->offsetStopLimit;
2739         Assert(MultiXactState->finishedStartup);
2740         LWLockRelease(MultiXactGenLock);
2741
2742         /*
2743          * Determine the offset of the oldest multixact.  Normally, we can read
2744          * the offset from the multixact itself, but there's an important special
2745          * case: if there are no multixacts in existence at all, oldestMXact
2746          * obviously can't point to one.  It will instead point to the multixact
2747          * ID that will be assigned the next time one is needed.
2748          */
2749         if (oldestMultiXactId == nextMXact)
2750         {
2751                 /*
2752                  * When the next multixact gets created, it will be stored at the next
2753                  * offset.
2754                  */
2755                 oldestOffset = nextOffset;
2756                 oldestOffsetKnown = true;
2757         }
2758         else
2759         {
2760                 /*
2761                  * Figure out where the oldest existing multixact's offsets are
2762                  * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X,
2763                  * the supposedly-earliest multixact might not really exist.  We are
2764                  * careful not to fail in that case.
2765                  */
2766                 oldestOffsetKnown =
2767                         find_multixact_start(oldestMultiXactId, &oldestOffset);
2768
2769                 if (oldestOffsetKnown)
2770                         ereport(DEBUG1,
2771                                         (errmsg_internal("oldest MultiXactId member is at offset %u",
2772                                                                          oldestOffset)));
2773                 else
2774                         ereport(LOG,
2775                                         (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2776                                                         oldestMultiXactId)));
2777         }
2778
2779         LWLockRelease(MultiXactTruncationLock);
2780
2781         /*
2782          * If we can, compute limits (and install them MultiXactState) to prevent
2783          * overrun of old data in the members SLRU area. We can only do so if the
2784          * oldest offset is known though.
2785          */
2786         if (oldestOffsetKnown)
2787         {
2788                 /* move back to start of the corresponding segment */
2789                 offsetStopLimit = oldestOffset - (oldestOffset %
2790                                                                                   (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
2791
2792                 /* always leave one segment before the wraparound point */
2793                 offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
2794
2795                 if (!prevOldestOffsetKnown && !is_startup)
2796                         ereport(LOG,
2797                                         (errmsg("MultiXact member wraparound protections are now enabled")));
2798
2799                 ereport(DEBUG1,
2800                                 (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
2801                                                                  offsetStopLimit, oldestMultiXactId)));
2802         }
2803         else if (prevOldestOffsetKnown)
2804         {
2805                 /*
2806                  * If we failed to get the oldest offset this time, but we have a
2807                  * value from a previous pass through this function, use the old
2808                  * values rather than automatically forcing an emergency autovacuum
2809                  * cycle again.
2810                  */
2811                 oldestOffset = prevOldestOffset;
2812                 oldestOffsetKnown = true;
2813                 offsetStopLimit = prevOffsetStopLimit;
2814         }
2815
2816         /* Install the computed values */
2817         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2818         MultiXactState->oldestOffset = oldestOffset;
2819         MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2820         MultiXactState->offsetStopLimit = offsetStopLimit;
2821         LWLockRelease(MultiXactGenLock);
2822
2823         /*
2824          * Do we need an emergency autovacuum?  If we're not sure, assume yes.
2825          */
2826         return !oldestOffsetKnown ||
2827                 (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2828 }
2829
2830 /*
2831  * Return whether adding "distance" to "start" would move past "boundary".
2832  *
2833  * We use this to determine whether the addition is "wrapping around" the
2834  * boundary point, hence the name.  The reason we don't want to use the regular
2835  * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2836  * the 2^32-1 space here, allowing for more multixacts than would fit
2837  * otherwise.
2838  */
2839 static bool
2840 MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
2841                                                  uint32 distance)
2842 {
2843         MultiXactOffset finish;
2844
2845         /*
2846          * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2847          * if the addition wraps around the UINT_MAX boundary, skip that value.
2848          */
2849         finish = start + distance;
2850         if (finish < start)
2851                 finish++;
2852
2853         /*-----------------------------------------------------------------------
2854          * When the boundary is numerically greater than the starting point, any
2855          * value numerically between the two is not wrapped:
2856          *
2857          *      <----S----B---->
2858          *      [---)                    = F wrapped past B (and UINT_MAX)
2859          *               [---)           = F not wrapped
2860          *                        [----] = F wrapped past B
2861          *
2862          * When the boundary is numerically less than the starting point (i.e. the
2863          * UINT_MAX wraparound occurs somewhere in between) then all values in
2864          * between are wrapped:
2865          *
2866          *      <----B----S---->
2867          *      [---)                    = F not wrapped past B (but wrapped past UINT_MAX)
2868          *               [---)           = F wrapped past B (and UINT_MAX)
2869          *                        [----] = F not wrapped
2870          *-----------------------------------------------------------------------
2871          */
2872         if (start < boundary)
2873                 return finish >= boundary || finish < start;
2874         else
2875                 return finish >= boundary && finish < start;
2876 }
2877
2878 /*
2879  * Find the starting offset of the given MultiXactId.
2880  *
2881  * Returns false if the file containing the multi does not exist on disk.
2882  * Otherwise, returns true and sets *result to the starting member offset.
2883  *
2884  * This function does not prevent concurrent truncation, so if that's
2885  * required, the caller has to protect against that.
2886  */
2887 static bool
2888 find_multixact_start(MultiXactId multi, MultiXactOffset *result)
2889 {
2890         MultiXactOffset offset;
2891         int64           pageno;
2892         int                     entryno;
2893         int                     slotno;
2894         MultiXactOffset *offptr;
2895
2896         Assert(MultiXactState->finishedStartup);
2897
2898         pageno = MultiXactIdToOffsetPage(multi);
2899         entryno = MultiXactIdToOffsetEntry(multi);
2900
2901         /*
2902          * Write out dirty data, so PhysicalPageExists can work correctly.
2903          */
2904         SimpleLruWriteAll(MultiXactOffsetCtl, true);
2905         SimpleLruWriteAll(MultiXactMemberCtl, true);
2906
2907         if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2908                 return false;
2909
2910         /* lock is acquired by SimpleLruReadPage_ReadOnly */
2911         slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2912         offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2913         offptr += entryno;
2914         offset = *offptr;
2915         LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno));
2916
2917         *result = offset;
2918         return true;
2919 }
2920
2921 /*
2922  * Determine how many multixacts, and how many multixact members, currently
2923  * exist.  Return false if unable to determine.
2924  */
2925 static bool
2926 ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members)
2927 {
2928         MultiXactOffset nextOffset;
2929         MultiXactOffset oldestOffset;
2930         MultiXactId oldestMultiXactId;
2931         MultiXactId nextMultiXactId;
2932         bool            oldestOffsetKnown;
2933
2934         LWLockAcquire(MultiXactGenLock, LW_SHARED);
2935         nextOffset = MultiXactState->nextOffset;
2936         oldestMultiXactId = MultiXactState->oldestMultiXactId;
2937         nextMultiXactId = MultiXactState->nextMXact;
2938         oldestOffset = MultiXactState->oldestOffset;
2939         oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2940         LWLockRelease(MultiXactGenLock);
2941
2942         if (!oldestOffsetKnown)
2943                 return false;
2944
2945         *members = nextOffset - oldestOffset;
2946         *multixacts = nextMultiXactId - oldestMultiXactId;
2947         return true;
2948 }
2949
2950 /*
2951  * Multixact members can be removed once the multixacts that refer to them
2952  * are older than every datminmxid.  autovacuum_multixact_freeze_max_age and
2953  * vacuum_multixact_freeze_table_age work together to make sure we never have
2954  * too many multixacts; we hope that, at least under normal circumstances,
2955  * this will also be sufficient to keep us from using too many offsets.
2956  * However, if the average multixact has many members, we might exhaust the
2957  * members space while still using few enough members that these limits fail
2958  * to trigger relminmxid advancement by VACUUM.  At that point, we'd have no
2959  * choice but to start failing multixact-creating operations with an error.
2960  *
2961  * To prevent that, if more than a threshold portion of the members space is
2962  * used, we effectively reduce autovacuum_multixact_freeze_max_age and
2963  * to a value just less than the number of multixacts in use.  We hope that
2964  * this will quickly trigger autovacuuming on the table or tables with the
2965  * oldest relminmxid, thus allowing datminmxid values to advance and removing
2966  * some members.
2967  *
2968  * As the fraction of the member space currently in use grows, we become
2969  * more aggressive in clamping this value.  That not only causes autovacuum
2970  * to ramp up, but also makes any manual vacuums the user issues more
2971  * aggressive.  This happens because vacuum_get_cutoffs() will clamp the
2972  * freeze table and the minimum freeze age cutoffs based on the effective
2973  * autovacuum_multixact_freeze_max_age this function returns.  In the worst
2974  * case, we'll claim the freeze_max_age to zero, and every vacuum of any
2975  * table will freeze every multixact.
2976  */
2977 int
2978 MultiXactMemberFreezeThreshold(void)
2979 {
2980         MultiXactOffset members;
2981         uint32          multixacts;
2982         uint32          victim_multixacts;
2983         double          fraction;
2984         int                     result;
2985
2986         /* If we can't determine member space utilization, assume the worst. */
2987         if (!ReadMultiXactCounts(&multixacts, &members))
2988                 return 0;
2989
2990         /* If member space utilization is low, no special action is required. */
2991         if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
2992                 return autovacuum_multixact_freeze_max_age;
2993
2994         /*
2995          * Compute a target for relminmxid advancement.  The number of multixacts
2996          * we try to eliminate from the system is based on how far we are past
2997          * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2998          */
2999         fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
3000                 (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
3001         victim_multixacts = multixacts * fraction;
3002
3003         /* fraction could be > 1.0, but lowest possible freeze age is zero */
3004         if (victim_multixacts > multixacts)
3005                 return 0;
3006         result = multixacts - victim_multixacts;
3007
3008         /*
3009          * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
3010          * autovacuum less aggressive than it would otherwise be.
3011          */
3012         return Min(result, autovacuum_multixact_freeze_max_age);
3013 }
3014
/*
 * Working state passed (as the opaque "data" pointer) to
 * SlruScanDirCbFindEarliest while scanning an SLRU directory.
 */
typedef struct mxtruncinfo
{
        /* earliest page found so far; -1 means no segment has been seen yet */
        int64           earliestExistingPage;
} mxtruncinfo;
3019
3020 /*
3021  * SlruScanDirectory callback
3022  *              This callback determines the earliest existing page number.
3023  */
3024 static bool
3025 SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data)
3026 {
3027         mxtruncinfo *trunc = (mxtruncinfo *) data;
3028
3029         if (trunc->earliestExistingPage == -1 ||
3030                 ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
3031         {
3032                 trunc->earliestExistingPage = segpage;
3033         }
3034
3035         return false;                           /* keep going */
3036 }
3037
3038
3039 /*
3040  * Delete members segments [oldest, newOldest)
3041  *
3042  * The members SLRU can, in contrast to the offsets one, be filled to almost
3043  * the full range at once. This means SimpleLruTruncate() can't trivially be
3044  * used - instead the to-be-deleted range is computed using the offsets
3045  * SLRU. C.f. TruncateMultiXact().
3046  */
3047 static void
3048 PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
3049 {
3050         const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
3051         int64           startsegment = MXOffsetToMemberSegment(oldestOffset);
3052         int64           endsegment = MXOffsetToMemberSegment(newOldestOffset);
3053         int64           segment = startsegment;
3054
3055         /*
3056          * Delete all the segments but the last one. The last segment can still
3057          * contain, possibly partially, valid data.
3058          */
3059         while (segment != endsegment)
3060         {
3061                 elog(DEBUG2, "truncating multixact members segment %llx",
3062                          (unsigned long long) segment);
3063                 SlruDeleteSegment(MultiXactMemberCtl, segment);
3064
3065                 /* move to next segment, handling wraparound correctly */
3066                 if (segment == maxsegment)
3067                         segment = 0;
3068                 else
3069                         segment += 1;
3070         }
3071 }
3072
3073 /*
3074  * Delete offsets segments [oldest, newOldest)
3075  */
3076 static void
3077 PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
3078 {
3079         /*
3080          * We step back one multixact to avoid passing a cutoff page that hasn't
3081          * been created yet in the rare case that oldestMulti would be the first
3082          * item on a page and oldestMulti == nextMulti.  In that case, if we
3083          * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
3084          * detection.
3085          */
3086         SimpleLruTruncate(MultiXactOffsetCtl,
3087                                           MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
3088 }
3089
/*
 * Remove all MultiXactOffset and MultiXactMember segments before the oldest
 * ones still of interest.
 *
 * This is only called on a primary as part of vacuum (via
 * vac_truncate_clog()). During recovery truncation is done by replaying
 * truncation WAL records logged here.
 *
 * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
 * is one of the databases preventing newOldestMulti from increasing.
 *
 * MultiXactTruncationLock is held exclusively for the whole operation and is
 * released on every return path.  The function returns without truncating
 * anything when newOldestMulti does not advance past the current oldest
 * multixact, when the current oldest multixact precedes the earliest offsets
 * page found on disk, or when the member offset of either endpoint cannot be
 * looked up.
 */
void
TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
{
        MultiXactId oldestMulti;
        MultiXactId nextMulti;
        MultiXactOffset newOldestOffset;
        MultiXactOffset oldestOffset;
        MultiXactOffset nextOffset;
        mxtruncinfo trunc;
        MultiXactId earliest;

        Assert(!RecoveryInProgress());
        Assert(MultiXactState->finishedStartup);

        /*
         * We can only allow one truncation to happen at once. Otherwise parts of
         * members might vanish while we're doing lookups or similar. There's no
         * need to have an interlock with creating new multis or such, since those
         * are constrained by the limits (which only grow, never shrink).
         */
        LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);

        /* Snapshot the shared counters we need under MultiXactGenLock. */
        LWLockAcquire(MultiXactGenLock, LW_SHARED);
        nextMulti = MultiXactState->nextMXact;
        nextOffset = MultiXactState->nextOffset;
        oldestMulti = MultiXactState->oldestMultiXactId;
        LWLockRelease(MultiXactGenLock);
        Assert(MultiXactIdIsValid(oldestMulti));

        /*
         * Make sure to only attempt truncation if there's values to truncate
         * away. In normal processing values shouldn't go backwards, but there's
         * some corner cases (due to bugs) where that's possible.
         */
        if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
        {
                LWLockRelease(MultiXactTruncationLock);
                return;
        }

        /*
         * Note we can't just plow ahead with the truncation; it's possible that
         * there are no segments to truncate, which is a problem because we are
         * going to attempt to read the offsets page to determine where to
         * truncate the members SLRU.  So we first scan the directory to determine
         * the earliest offsets page number that we can read without error.
         *
         * When nextMXact is less than one segment away from multiWrapLimit,
         * SlruScanDirCbFindEarliest can find some early segment other than the
         * actual earliest.  (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
         * returns false, because not all pairs of entries have the same answer.)
         * That can also arise when an earlier truncation attempt failed unlink()
         * or returned early from this function.  The only consequence is
         * returning early, which wastes space that we could have liberated.
         *
         * NB: It's also possible that the page that oldestMulti is on has already
         * been truncated away, and we crashed before updating oldestMulti.
         */
        trunc.earliestExistingPage = -1;
        SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
        /* convert the earliest page number into the first multixact on it */
        earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
        if (earliest < FirstMultiXactId)
                earliest = FirstMultiXactId;

        /* If there's nothing to remove, we can bail out early. */
        if (MultiXactIdPrecedes(oldestMulti, earliest))
        {
                LWLockRelease(MultiXactTruncationLock);
                return;
        }

        /*
         * First, compute the safe truncation point for MultiXactMember. This is
         * the starting offset of the oldest multixact.
         *
         * Hopefully, find_multixact_start will always work here, because we've
         * already checked that it doesn't precede the earliest MultiXact on disk.
         * But if it fails, don't truncate anything, and log a message.
         */
        if (oldestMulti == nextMulti)
        {
                /* there are NO MultiXacts */
                oldestOffset = nextOffset;
        }
        else if (!find_multixact_start(oldestMulti, &oldestOffset))
        {
                ereport(LOG,
                                (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
                                                oldestMulti, earliest)));
                LWLockRelease(MultiXactTruncationLock);
                return;
        }

        /*
         * Secondly compute up to where to truncate. Lookup the corresponding
         * member offset for newOldestMulti for that.
         */
        if (newOldestMulti == nextMulti)
        {
                /* there are NO MultiXacts */
                newOldestOffset = nextOffset;
        }
        else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
        {
                ereport(LOG,
                                (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
                                                newOldestMulti)));
                LWLockRelease(MultiXactTruncationLock);
                return;
        }

        elog(DEBUG1, "performing multixact truncation: "
                 "offsets [%u, %u), offsets segments [%llx, %llx), "
                 "members [%u, %u), members segments [%llx, %llx)",
                 oldestMulti, newOldestMulti,
                 (unsigned long long) MultiXactIdToOffsetSegment(oldestMulti),
                 (unsigned long long) MultiXactIdToOffsetSegment(newOldestMulti),
                 oldestOffset, newOldestOffset,
                 (unsigned long long) MXOffsetToMemberSegment(oldestOffset),
                 (unsigned long long) MXOffsetToMemberSegment(newOldestOffset));

        /*
         * Do truncation, and the WAL logging of the truncation, in a critical
         * section. That way offsets/members cannot get out of sync anymore, i.e.
         * once consistent the newOldestMulti will always exist in members, even
         * if we crashed in the wrong moment.
         */
        START_CRIT_SECTION();

        /*
         * Prevent checkpoints from being scheduled concurrently. This is critical
         * because otherwise a truncation record might not be replayed after a
         * crash/basebackup, even though the state of the data directory would
         * require it.
         */
        Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
        MyProc->delayChkptFlags |= DELAY_CHKPT_START;

        /* WAL log truncation */
        WriteMTruncateXlogRec(newOldestMultiDB,
                                                  oldestMulti, newOldestMulti,
                                                  oldestOffset, newOldestOffset);

        /*
         * Update in-memory limits before performing the truncation, while inside
         * the critical section: Have to do it before truncation, to prevent
         * concurrent lookups of those values. Has to be inside the critical
         * section as otherwise a future call to this function would error out,
         * while looking up the oldest member in offsets, if our caller crashes
         * before updating the limits.
         */
        LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
        MultiXactState->oldestMultiXactId = newOldestMulti;
        MultiXactState->oldestMultiXactDB = newOldestMultiDB;
        LWLockRelease(MultiXactGenLock);

        /* First truncate members */
        PerformMembersTruncation(oldestOffset, newOldestOffset);

        /* Then offsets */
        PerformOffsetsTruncation(oldestMulti, newOldestMulti);

        /* Truncation is done; stop holding off checkpoint starts. */
        MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;

        END_CRIT_SECTION();
        LWLockRelease(MultiXactTruncationLock);
}
3268
3269 /*
3270  * Decide whether a MultiXactOffset page number is "older" for truncation
3271  * purposes.  Analogous to CLOGPagePrecedes().
3272  *
3273  * Offsetting the values is optional, because MultiXactIdPrecedes() has
3274  * translational symmetry.
3275  */
3276 static bool
3277 MultiXactOffsetPagePrecedes(int64 page1, int64 page2)
3278 {
3279         MultiXactId multi1;
3280         MultiXactId multi2;
3281
3282         multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3283         multi1 += FirstMultiXactId + 1;
3284         multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3285         multi2 += FirstMultiXactId + 1;
3286
3287         return (MultiXactIdPrecedes(multi1, multi2) &&
3288                         MultiXactIdPrecedes(multi1,
3289                                                                 multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
3290 }
3291
3292 /*
3293  * Decide whether a MultiXactMember page number is "older" for truncation
3294  * purposes.  There is no "invalid offset number" so use the numbers verbatim.
3295  */
3296 static bool
3297 MultiXactMemberPagePrecedes(int64 page1, int64 page2)
3298 {
3299         MultiXactOffset offset1;
3300         MultiXactOffset offset2;
3301
3302         offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3303         offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3304
3305         return (MultiXactOffsetPrecedes(offset1, offset2) &&
3306                         MultiXactOffsetPrecedes(offset1,
3307                                                                         offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
3308 }
3309
3310 /*
3311  * Decide which of two MultiXactIds is earlier.
3312  *
3313  * XXX do we need to do something special for InvalidMultiXactId?
3314  * (Doesn't look like it.)
3315  */
3316 bool
3317 MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
3318 {
3319         int32           diff = (int32) (multi1 - multi2);
3320
3321         return (diff < 0);
3322 }
3323
3324 /*
3325  * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3326  *
3327  * XXX do we need to do something special for InvalidMultiXactId?
3328  * (Doesn't look like it.)
3329  */
3330 bool
3331 MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
3332 {
3333         int32           diff = (int32) (multi1 - multi2);
3334
3335         return (diff <= 0);
3336 }
3337
3338
3339 /*
3340  * Decide which of two offsets is earlier.
3341  */
3342 static bool
3343 MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
3344 {
3345         int32           diff = (int32) (offset1 - offset2);
3346
3347         return (diff < 0);
3348 }
3349
3350 /*
3351  * Write an xlog record reflecting the zeroing of either a MEMBERs or
3352  * OFFSETs page (info shows which)
3353  */
3354 static void
3355 WriteMZeroPageXlogRec(int64 pageno, uint8 info)
3356 {
3357         XLogBeginInsert();
3358         XLogRegisterData((char *) (&pageno), sizeof(pageno));
3359         (void) XLogInsert(RM_MULTIXACT_ID, info);
3360 }
3361
3362 /*
3363  * Write a TRUNCATE xlog record
3364  *
3365  * We must flush the xlog record to disk before returning --- see notes in
3366  * TruncateCLOG().
3367  */
3368 static void
3369 WriteMTruncateXlogRec(Oid oldestMultiDB,
3370                                           MultiXactId startTruncOff, MultiXactId endTruncOff,
3371                                           MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3372 {
3373         XLogRecPtr      recptr;
3374         xl_multixact_truncate xlrec;
3375
3376         xlrec.oldestMultiDB = oldestMultiDB;
3377
3378         xlrec.startTruncOff = startTruncOff;
3379         xlrec.endTruncOff = endTruncOff;
3380
3381         xlrec.startTruncMemb = startTruncMemb;
3382         xlrec.endTruncMemb = endTruncMemb;
3383
3384         XLogBeginInsert();
3385         XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate);
3386         recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
3387         XLogFlush(recptr);
3388 }
3389
/*
 * MULTIXACT resource manager's routines
 */

/*
 * Replay one WAL record belonging to the MULTIXACT resource manager.
 *
 * The record type is taken from the info bits of the record.  Four kinds
 * are handled: zeroing an offsets page, zeroing a members page, creation of
 * a new MultiXactId, and truncation.  Any other op code raises a PANIC.
 */
void
multixact_redo(XLogReaderState *record)
{
        uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

        /* Backup blocks are not used in multixact records */
        Assert(!XLogRecHasAnyBlockRefs(record));

        if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
        {
                int64           pageno;
                int                     slotno;
                LWLock     *lock;

                /* the record's payload is just the page number */
                memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));

                lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
                LWLockAcquire(lock, LW_EXCLUSIVE);

                /* zero the page, then write it out right away */
                slotno = ZeroMultiXactOffsetPage(pageno, false);
                SimpleLruWritePage(MultiXactOffsetCtl, slotno);
                Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);

                LWLockRelease(lock);
        }
        else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
        {
                int64           pageno;
                int                     slotno;
                LWLock     *lock;

                /* the record's payload is just the page number */
                memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));

                lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
                LWLockAcquire(lock, LW_EXCLUSIVE);

                /* zero the page, then write it out right away */
                slotno = ZeroMultiXactMemberPage(pageno, false);
                SimpleLruWritePage(MultiXactMemberCtl, slotno);
                Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);

                LWLockRelease(lock);
        }
        else if (info == XLOG_MULTIXACT_CREATE_ID)
        {
                xl_multixact_create *xlrec =
                        (xl_multixact_create *) XLogRecGetData(record);
                TransactionId max_xid;
                int                     i;

                /* Store the data back into the SLRU files */
                RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
                                                   xlrec->members);

                /* Make sure nextMXact/nextOffset are beyond what this record has */
                MultiXactAdvanceNextMXact(xlrec->mid + 1,
                                                                  xlrec->moff + xlrec->nmembers);

                /*
                 * Make sure nextXid is beyond any XID mentioned in the record. This
                 * should be unnecessary, since any XID found here ought to have other
                 * evidence in the XLOG, but let's be safe.
                 */
                max_xid = XLogRecGetXid(record);
                for (i = 0; i < xlrec->nmembers; i++)
                {
                        if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
                                max_xid = xlrec->members[i].xid;
                }

                AdvanceNextFullTransactionIdPastXid(max_xid);
        }
        else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
        {
                xl_multixact_truncate xlrec;
                int64           pageno;

                /* work on a local copy of the record's payload */
                memcpy(&xlrec, XLogRecGetData(record),
                           SizeOfMultiXactTruncate);

                elog(DEBUG1, "replaying multixact truncation: "
                         "offsets [%u, %u), offsets segments [%llx, %llx), "
                         "members [%u, %u), members segments [%llx, %llx)",
                         xlrec.startTruncOff, xlrec.endTruncOff,
                         (unsigned long long) MultiXactIdToOffsetSegment(xlrec.startTruncOff),
                         (unsigned long long) MultiXactIdToOffsetSegment(xlrec.endTruncOff),
                         xlrec.startTruncMemb, xlrec.endTruncMemb,
                         (unsigned long long) MXOffsetToMemberSegment(xlrec.startTruncMemb),
                         (unsigned long long) MXOffsetToMemberSegment(xlrec.endTruncMemb));

                /* should not be required, but more than cheap enough */
                LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);

                /*
                 * Advance the horizon values, so they're current at the end of
                 * recovery.
                 */
                SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);

                /* members first, then offsets */
                PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb);

                /*
                 * During XLOG replay, latest_page_number isn't necessarily set up
                 * yet; insert a suitable value to bypass the sanity test in
                 * SimpleLruTruncate.
                 */
                pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
                pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
                                                        pageno);
                PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff);

                LWLockRelease(MultiXactTruncationLock);
        }
        else
                elog(PANIC, "multixact_redo: unknown op code %u", info);
}
3508
3509 Datum
3510 pg_get_multixact_members(PG_FUNCTION_ARGS)
3511 {
3512         typedef struct
3513         {
3514                 MultiXactMember *members;
3515                 int                     nmembers;
3516                 int                     iter;
3517         } mxact;
3518         MultiXactId mxid = PG_GETARG_TRANSACTIONID(0);
3519         mxact      *multi;
3520         FuncCallContext *funccxt;
3521
3522         if (mxid < FirstMultiXactId)
3523                 ereport(ERROR,
3524                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3525                                  errmsg("invalid MultiXactId: %u", mxid)));
3526
3527         if (SRF_IS_FIRSTCALL())
3528         {
3529                 MemoryContext oldcxt;
3530                 TupleDesc       tupdesc;
3531
3532                 funccxt = SRF_FIRSTCALL_INIT();
3533                 oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx);
3534
3535                 multi = palloc(sizeof(mxact));
3536                 /* no need to allow for old values here */
3537                 multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false,
3538                                                                                                 false);
3539                 multi->iter = 0;
3540
3541                 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
3542                         elog(ERROR, "return type must be a row type");
3543                 funccxt->tuple_desc = tupdesc;
3544                 funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc);
3545                 funccxt->user_fctx = multi;
3546
3547                 MemoryContextSwitchTo(oldcxt);
3548         }
3549
3550         funccxt = SRF_PERCALL_SETUP();
3551         multi = (mxact *) funccxt->user_fctx;
3552
3553         while (multi->iter < multi->nmembers)
3554         {
3555                 HeapTuple       tuple;
3556                 char       *values[2];
3557
3558                 values[0] = psprintf("%u", multi->members[multi->iter].xid);
3559                 values[1] = mxstatus_to_string(multi->members[multi->iter].status);
3560
3561                 tuple = BuildTupleFromCStrings(funccxt->attinmeta, values);
3562
3563                 multi->iter++;
3564                 pfree(values[0]);
3565                 SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple));
3566         }
3567
3568         SRF_RETURN_DONE(funccxt);
3569 }
3570
3571 /*
3572  * Entrypoint for sync.c to sync offsets files.
3573  */
3574 int
3575 multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
3576 {
3577         return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
3578 }
3579
3580 /*
3581  * Entrypoint for sync.c to sync members files.
3582  */
3583 int
3584 multixactmemberssyncfiletag(const FileTag *ftag, char *path)
3585 {
3586         return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
3587 }