doc: Update links which returned 404
[pgsql.git] / src / backend / access / transam / multixact.c
blob27ccdf9500fb251632b080679de03ffe57edd6c7
1 /*-------------------------------------------------------------------------
3 * multixact.c
4 * PostgreSQL multi-transaction-log manager
6 * The pg_multixact manager is a pg_xact-like manager that stores an array of
7 * MultiXactMember for each MultiXactId. It is a fundamental part of the
8 * shared-row-lock implementation. Each MultiXactMember is comprised of a
9 * TransactionId and a set of flag bits. The name is a bit historical:
10 * originally, a MultiXactId consisted of more than one TransactionId (except
11 * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12 * legitimate to have MultiXactIds that only include a single Xid.
14 * The meaning of the flag bits is opaque to this module, but they are mostly
15 * used in heapam.c to identify lock modes that each of the member transactions
16 * is holding on any given tuple. This module just contains support to store
17 * and retrieve the arrays.
19 * We use two SLRU areas, one for storing the offsets at which the data
20 * starts for each MultiXactId in the other one. This trick allows us to
21 * store variable length arrays of TransactionIds. (We could alternatively
22 * use one area containing counts and TransactionIds, with valid MultiXactId
23 * values pointing at slots containing counts; but that way seems less robust
24 * since it would get completely confused if someone inquired about a bogus
25 * MultiXactId that pointed to an intermediate slot containing an XID.)
27 * XLOG interactions: this module generates a record whenever a new OFFSETs or
28 * MEMBERs page is initialized to zeroes, as well as an
29 * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30 * This module ignores the WAL rule "write xlog before data," because it
31 * suffices that actions recording a MultiXactId in a heap xmax do follow that
32 * rule. The only way for the MXID to be referenced from any data page is for
33 * heap_lock_tuple() or heap_update() to have put it there, and each generates
34 * an XLOG record that must follow ours. The normal LSN interlock between the
35 * data page and that XLOG record will ensure that our XLOG record reaches
36 * disk first. If the SLRU members/offsets data reaches disk sooner than the
37 * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38 * the flip side, to ensure that all referenced entries _do_ reach disk, this
39 * module's XLOG records completely rebuild the data entered since the last
40 * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41 * before each checkpoint is considered complete.
43 * Like clog.c, and unlike subtrans.c, we have to preserve state across
44 * crashes and ensure that MXID and offset numbering increases monotonically
45 * across a crash. We do this in the same way as it's done for transaction
46 * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47 * could need to worry about, and we just make sure that at the end of
48 * replay, the next-MXID and next-offset counters are at least as large as
49 * anything we saw during replay.
51 * We are able to remove segments no longer necessary by carefully tracking
52 * each table's used values: during vacuum, any multixact older than a certain
53 * value is removed; the cutoff value is stored in pg_class. The minimum value
54 * across all tables in each database is stored in pg_database, and the global
55 * minimum across all databases is part of pg_control and is kept in shared
56 * memory. Whenever that minimum is advanced, the SLRUs are truncated.
58 * When new multixactid values are to be created, care is taken that the
59 * counter does not fall within the wraparound horizon considering the global
60 * minimum value.
62 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
63 * Portions Copyright (c) 1994, Regents of the University of California
65 * src/backend/access/transam/multixact.c
67 *-------------------------------------------------------------------------
69 #include "postgres.h"
71 #include "access/multixact.h"
72 #include "access/slru.h"
73 #include "access/transam.h"
74 #include "access/twophase.h"
75 #include "access/twophase_rmgr.h"
76 #include "access/xact.h"
77 #include "access/xlog.h"
78 #include "access/xloginsert.h"
79 #include "access/xlogutils.h"
80 #include "commands/dbcommands.h"
81 #include "funcapi.h"
82 #include "lib/ilist.h"
83 #include "miscadmin.h"
84 #include "pg_trace.h"
85 #include "pgstat.h"
86 #include "postmaster/autovacuum.h"
87 #include "storage/pmsignal.h"
88 #include "storage/proc.h"
89 #include "storage/procarray.h"
90 #include "utils/fmgrprotos.h"
91 #include "utils/guc_hooks.h"
92 #include "utils/injection_point.h"
93 #include "utils/memutils.h"
/*
 * Page geometry for the MultiXactOffset SLRU.  A page is BLCKSZ bytes, the
 * same as everywhere else in Postgres.
 *
 * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
 * MultiXact page numbering also wraps around at
 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  This module
 * does not need to take explicit notice of that fact, except when comparing
 * segment and page numbers in TruncateMultiXact (see
 * MultiXactOffsetPagePrecedes).
 */

/* Each offset entry takes four bytes */
#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
112 static inline int64
113 MultiXactIdToOffsetPage(MultiXactId multi)
115 return multi / MULTIXACT_OFFSETS_PER_PAGE;
118 static inline int
119 MultiXactIdToOffsetEntry(MultiXactId multi)
121 return multi % MULTIXACT_OFFSETS_PER_PAGE;
124 static inline int64
125 MultiXactIdToOffsetSegment(MultiXactId multi)
127 return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT;
/*
 * The situation for members is a bit more complex: we store one byte of
 * additional flag bits for each TransactionId.  To do this without getting
 * into alignment issues, we store four bytes of flags, and then the
 * corresponding 4 Xids.  Each such 5-word (20-byte) set is called a "group",
 * and groups are stored whole in pages.  Thus, with 8kB BLCKSZ, we keep 409
 * groups per page.  This wastes 12 bytes per page, but that's OK --
 * simplicity (and performance) trumps space efficiency here.
 *
 * Note that the "offset" helpers below work with byte offsets, not array
 * indexes, so arithmetic must be done using "char *" pointers.
 */

/* We need eight bits per xact, so one xact fits in a byte */
#define MXACT_MEMBER_BITS_PER_XACT			8
#define MXACT_MEMBER_FLAGS_PER_BYTE			1
#define MXACT_MEMBER_XACT_BITMASK	((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)

/* how many full bytes of flags are there in a group? */
#define MULTIXACT_FLAGBYTES_PER_GROUP		4
#define MULTIXACT_MEMBERS_PER_MEMBERGROUP	\
	(MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)

/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
	(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE	\
	(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
/*
 * Because the number of items per page is not a divisor of the last item
 * number (member 0xFFFFFFFF), the last segment does not use the maximum number
 * of pages, and moreover the last used page therein does not hold the same
 * number of items as the pages before it.  (Put another way: the 0xFFFFFFFF
 * member falls somewhere in the middle of the last page, so that page has
 * some empty space after that item.)
 *
 * This constant is the number of members in the last page of the last segment.
 */
#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
	((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
171 /* page in which a member is to be found */
172 static inline int64
173 MXOffsetToMemberPage(MultiXactOffset offset)
175 return offset / MULTIXACT_MEMBERS_PER_PAGE;
178 static inline int64
179 MXOffsetToMemberSegment(MultiXactOffset offset)
181 return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT;
184 /* Location (byte offset within page) of flag word for a given member */
185 static inline int
186 MXOffsetToFlagsOffset(MultiXactOffset offset)
188 MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
189 int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
190 int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
192 return byteoff;
195 static inline int
196 MXOffsetToFlagsBitShift(MultiXactOffset offset)
198 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
199 int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
201 return bshift;
204 /* Location (byte offset within page) of TransactionId of given member */
205 static inline int
206 MXOffsetToMemberOffset(MultiXactOffset offset)
208 int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
210 return MXOffsetToFlagsOffset(offset) +
211 MULTIXACT_FLAGBYTES_PER_GROUP +
212 member_in_group * sizeof(TransactionId);
/*
 * Multixact members wraparound thresholds: "safe" is half of the member
 * offset space, "danger" is three quarters of it.
 */
#define MULTIXACT_MEMBER_SAFE_THRESHOLD		(MaxMultiXactOffset / 2)
#define MULTIXACT_MEMBER_DANGER_THRESHOLD	\
	(MaxMultiXactOffset - MaxMultiXactOffset / 4)
220 static inline MultiXactId
221 PreviousMultiXactId(MultiXactId multi)
223 return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1;
227 * Links to shared-memory data structures for MultiXact control
229 static SlruCtlData MultiXactOffsetCtlData;
230 static SlruCtlData MultiXactMemberCtlData;
232 #define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
233 #define MultiXactMemberCtl (&MultiXactMemberCtlData)
236 * MultiXact state shared across all backends. All this state is protected
237 * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and
238 * MultiXactMember to guard accesses to the two sets of SLRU buffers. For
239 * concurrency's sake, we avoid holding more than one of these locks at a
240 * time.)
242 typedef struct MultiXactStateData
244 /* next-to-be-assigned MultiXactId */
245 MultiXactId nextMXact;
247 /* next-to-be-assigned offset */
248 MultiXactOffset nextOffset;
250 /* Have we completed multixact startup? */
251 bool finishedStartup;
254 * Oldest multixact that is still potentially referenced by a relation.
255 * Anything older than this should not be consulted. These values are
256 * updated by vacuum.
258 MultiXactId oldestMultiXactId;
259 Oid oldestMultiXactDB;
262 * Oldest multixact offset that is potentially referenced by a multixact
263 * referenced by a relation. We don't always know this value, so there's
264 * a flag here to indicate whether or not we currently do.
266 MultiXactOffset oldestOffset;
267 bool oldestOffsetKnown;
269 /* support for anti-wraparound measures */
270 MultiXactId multiVacLimit;
271 MultiXactId multiWarnLimit;
272 MultiXactId multiStopLimit;
273 MultiXactId multiWrapLimit;
275 /* support for members anti-wraparound measures */
276 MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
279 * This is used to sleep until a multixact offset is written when we want
280 * to create the next one.
282 ConditionVariable nextoff_cv;
285 * Per-backend data starts here. We have two arrays stored in the area
286 * immediately following the MultiXactStateData struct. Each is indexed by
287 * ProcNumber.
289 * In both arrays, there's a slot for all normal backends
290 * (0..MaxBackends-1) followed by a slot for max_prepared_xacts prepared
291 * transactions.
293 * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
294 * transaction(s) could possibly be a member of, or InvalidMultiXactId
295 * when the backend has no live transaction that could possibly be a
296 * member of a MultiXact. Each backend sets its entry to the current
297 * nextMXact counter just before first acquiring a shared lock in a given
298 * transaction, and clears it at transaction end. (This works because only
299 * during or after acquiring a shared lock could an XID possibly become a
300 * member of a MultiXact, and that MultiXact would have to be created
301 * during or after the lock acquisition.)
303 * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
304 * current transaction(s) think is potentially live, or InvalidMultiXactId
305 * when not in a transaction or not in a transaction that's paid any
306 * attention to MultiXacts yet. This is computed when first needed in a
307 * given transaction, and cleared at transaction end. We can compute it
308 * as the minimum of the valid OldestMemberMXactId[] entries at the time
309 * we compute it (using nextMXact if none are valid). Each backend is
310 * required not to attempt to access any SLRU data for MultiXactIds older
311 * than its own OldestVisibleMXactId[] setting; this is necessary because
312 * the relevant SLRU data can be concurrently truncated away.
314 * The oldest valid value among all of the OldestMemberMXactId[] and
315 * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
316 * possible value still having any live member transaction -- OldestMxact.
317 * Any value older than that is typically removed from tuple headers, or
318 * "frozen" via being replaced with a new xmax. VACUUM can sometimes even
319 * remove an individual MultiXact xmax whose value is >= its OldestMxact
320 * cutoff, though typically only when no individual member XID is still
321 * running. See FreezeMultiXactId for full details.
323 * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
324 * or the oldest extant Multi remaining in the table is used as the new
325 * pg_class.relminmxid value (whichever is earlier). The minimum of all
326 * relminmxid values in each database is stored in pg_database.datminmxid.
327 * In turn, the minimum of all of those values is stored in pg_control.
328 * This is used as the truncation point for pg_multixact when unneeded
329 * segments get removed by vac_truncate_clog() during vacuuming.
331 MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
332 } MultiXactStateData;
335 * Size of OldestMemberMXactId and OldestVisibleMXactId arrays.
337 #define MaxOldestSlot (MaxBackends + max_prepared_xacts)
339 /* Pointers to the state data in shared memory */
340 static MultiXactStateData *MultiXactState;
341 static MultiXactId *OldestMemberMXactId;
342 static MultiXactId *OldestVisibleMXactId;
346 * Definitions for the backend-local MultiXactId cache.
348 * We use this cache to store known MultiXacts, so we don't need to go to
349 * SLRU areas every time.
351 * The cache lasts for the duration of a single transaction, the rationale
352 * for this being that most entries will contain our own TransactionId and
353 * so they will be uninteresting by the time our next transaction starts.
354 * (XXX not clear that this is correct --- other members of the MultiXact
355 * could hang around longer than we did. However, it's not clear what a
356 * better policy for flushing old cache entries would be.) FIXME actually
357 * this is plain wrong now that multixact's may contain update Xids.
359 * We allocate the cache entries in a memory context that is deleted at
360 * transaction end, so we don't need to do retail freeing of entries.
362 typedef struct mXactCacheEnt
364 MultiXactId multi;
365 int nmembers;
366 dlist_node node;
367 MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
368 } mXactCacheEnt;
370 #define MAX_CACHE_ENTRIES 256
371 static dclist_head MXactCache = DCLIST_STATIC_INIT(MXactCache);
372 static MemoryContext MXactContext = NULL;
374 #ifdef MULTIXACT_DEBUG
375 #define debug_elog2(a,b) elog(a,b)
376 #define debug_elog3(a,b,c) elog(a,b,c)
377 #define debug_elog4(a,b,c,d) elog(a,b,c,d)
378 #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
379 #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
380 #else
381 #define debug_elog2(a,b)
382 #define debug_elog3(a,b,c)
383 #define debug_elog4(a,b,c,d)
384 #define debug_elog5(a,b,c,d,e)
385 #define debug_elog6(a,b,c,d,e,f)
386 #endif
388 /* internal MultiXactId management */
389 static void MultiXactIdSetOldestVisible(void);
390 static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
391 int nmembers, MultiXactMember *members);
392 static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
394 /* MultiXact cache management */
395 static int mxactMemberComparator(const void *arg1, const void *arg2);
396 static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
397 static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
398 static void mXactCachePut(MultiXactId multi, int nmembers,
399 MultiXactMember *members);
401 static char *mxstatus_to_string(MultiXactStatus status);
403 /* management of SLRU infrastructure */
404 static int ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog);
405 static int ZeroMultiXactMemberPage(int64 pageno, bool writeXlog);
406 static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2);
407 static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2);
408 static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
409 MultiXactOffset offset2);
410 static void ExtendMultiXactOffset(MultiXactId multi);
411 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
412 static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
413 MultiXactOffset start, uint32 distance);
414 static bool SetOffsetVacuumLimit(bool is_startup);
415 static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
416 static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
417 static void WriteMTruncateXlogRec(Oid oldestMultiDB,
418 MultiXactId startTruncOff,
419 MultiXactId endTruncOff,
420 MultiXactOffset startTruncMemb,
421 MultiXactOffset endTruncMemb);
425 * MultiXactIdCreate
426 * Construct a MultiXactId representing two TransactionIds.
428 * The two XIDs must be different, or be requesting different statuses.
430 * NB - we don't worry about our local MultiXactId cache here, because that
431 * is handled by the lower-level routines.
433 MultiXactId
434 MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
435 TransactionId xid2, MultiXactStatus status2)
437 MultiXactId newMulti;
438 MultiXactMember members[2];
440 Assert(TransactionIdIsValid(xid1));
441 Assert(TransactionIdIsValid(xid2));
443 Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
445 /* MultiXactIdSetOldestMember() must have been called already. */
446 Assert(MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber]));
449 * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
450 * are still running. In typical usage, xid2 will be our own XID and the
451 * caller just did a check on xid1, so it'd be wasted effort.
454 members[0].xid = xid1;
455 members[0].status = status1;
456 members[1].xid = xid2;
457 members[1].status = status2;
459 newMulti = MultiXactIdCreateFromMembers(2, members);
461 debug_elog3(DEBUG2, "Create: %s",
462 mxid_to_string(newMulti, 2, members));
464 return newMulti;
468 * MultiXactIdExpand
469 * Add a TransactionId to a pre-existing MultiXactId.
471 * If the TransactionId is already a member of the passed MultiXactId with the
472 * same status, just return it as-is.
474 * Note that we do NOT actually modify the membership of a pre-existing
475 * MultiXactId; instead we create a new one. This is necessary to avoid
476 * a race condition against code trying to wait for one MultiXactId to finish;
477 * see notes in heapam.c.
479 * NB - we don't worry about our local MultiXactId cache here, because that
480 * is handled by the lower-level routines.
482 * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
483 * one upgraded by pg_upgrade from a cluster older than this feature) are not
484 * passed in.
486 MultiXactId
487 MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
489 MultiXactId newMulti;
490 MultiXactMember *members;
491 MultiXactMember *newMembers;
492 int nmembers;
493 int i;
494 int j;
496 Assert(MultiXactIdIsValid(multi));
497 Assert(TransactionIdIsValid(xid));
499 /* MultiXactIdSetOldestMember() must have been called already. */
500 Assert(MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber]));
502 debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
503 multi, xid, mxstatus_to_string(status));
506 * Note: we don't allow for old multis here. The reason is that the only
507 * caller of this function does a check that the multixact is no longer
508 * running.
510 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
512 if (nmembers < 0)
514 MultiXactMember member;
517 * The MultiXactId is obsolete. This can only happen if all the
518 * MultiXactId members stop running between the caller checking and
519 * passing it to us. It would be better to return that fact to the
520 * caller, but it would complicate the API and it's unlikely to happen
521 * too often, so just deal with it by creating a singleton MultiXact.
523 member.xid = xid;
524 member.status = status;
525 newMulti = MultiXactIdCreateFromMembers(1, &member);
527 debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
528 multi, newMulti);
529 return newMulti;
533 * If the TransactionId is already a member of the MultiXactId with the
534 * same status, just return the existing MultiXactId.
536 for (i = 0; i < nmembers; i++)
538 if (TransactionIdEquals(members[i].xid, xid) &&
539 (members[i].status == status))
541 debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
542 xid, multi);
543 pfree(members);
544 return multi;
549 * Determine which of the members of the MultiXactId are still of
550 * interest. This is any running transaction, and also any transaction
551 * that grabbed something stronger than just a lock and was committed. (An
552 * update that aborted is of no interest here; and having more than one
553 * update Xid in a multixact would cause errors elsewhere.)
555 * Removing dead members is not just an optimization: freezing of tuples
556 * whose Xmax are multis depends on this behavior.
558 * Note we have the same race condition here as above: j could be 0 at the
559 * end of the loop.
561 newMembers = (MultiXactMember *)
562 palloc(sizeof(MultiXactMember) * (nmembers + 1));
564 for (i = 0, j = 0; i < nmembers; i++)
566 if (TransactionIdIsInProgress(members[i].xid) ||
567 (ISUPDATE_from_mxstatus(members[i].status) &&
568 TransactionIdDidCommit(members[i].xid)))
570 newMembers[j].xid = members[i].xid;
571 newMembers[j++].status = members[i].status;
575 newMembers[j].xid = xid;
576 newMembers[j++].status = status;
577 newMulti = MultiXactIdCreateFromMembers(j, newMembers);
579 pfree(members);
580 pfree(newMembers);
582 debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
584 return newMulti;
588 * MultiXactIdIsRunning
589 * Returns whether a MultiXactId is "running".
591 * We return true if at least one member of the given MultiXactId is still
592 * running. Note that a "false" result is certain not to change,
593 * because it is not legal to add members to an existing MultiXactId.
595 * Caller is expected to have verified that the multixact does not come from
596 * a pg_upgraded share-locked tuple.
598 bool
599 MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
601 MultiXactMember *members;
602 int nmembers;
603 int i;
605 debug_elog3(DEBUG2, "IsRunning %u?", multi);
608 * "false" here means we assume our callers have checked that the given
609 * multi cannot possibly come from a pg_upgraded database.
611 nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
613 if (nmembers <= 0)
615 debug_elog2(DEBUG2, "IsRunning: no members");
616 return false;
620 * Checking for myself is cheap compared to looking in shared memory;
621 * return true if any live subtransaction of the current top-level
622 * transaction is a member.
624 * This is not needed for correctness, it's just a fast path.
626 for (i = 0; i < nmembers; i++)
628 if (TransactionIdIsCurrentTransactionId(members[i].xid))
630 debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
631 pfree(members);
632 return true;
637 * This could be made faster by having another entry point in procarray.c,
638 * walking the PGPROC array only once for all the members. But in most
639 * cases nmembers should be small enough that it doesn't much matter.
641 for (i = 0; i < nmembers; i++)
643 if (TransactionIdIsInProgress(members[i].xid))
645 debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
646 i, members[i].xid);
647 pfree(members);
648 return true;
652 pfree(members);
654 debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
656 return false;
660 * MultiXactIdSetOldestMember
661 * Save the oldest MultiXactId this transaction could be a member of.
663 * We set the OldestMemberMXactId for a given transaction the first time it's
664 * going to do some operation that might require a MultiXactId (tuple lock,
665 * update or delete). We need to do this even if we end up using a
666 * TransactionId instead of a MultiXactId, because there is a chance that
667 * another transaction would add our XID to a MultiXactId.
669 * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
670 * be called just before doing any such possibly-MultiXactId-able operation.
672 void
673 MultiXactIdSetOldestMember(void)
675 if (!MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber]))
677 MultiXactId nextMXact;
680 * You might think we don't need to acquire a lock here, since
681 * fetching and storing of TransactionIds is probably atomic, but in
682 * fact we do: suppose we pick up nextMXact and then lose the CPU for
683 * a long time. Someone else could advance nextMXact, and then
684 * another someone else could compute an OldestVisibleMXactId that
685 * would be after the value we are going to store when we get control
686 * back. Which would be wrong.
688 * Note that a shared lock is sufficient, because it's enough to stop
689 * someone from advancing nextMXact; and nobody else could be trying
690 * to write to our OldestMember entry, only reading (and we assume
691 * storing it is atomic.)
693 LWLockAcquire(MultiXactGenLock, LW_SHARED);
696 * We have to beware of the possibility that nextMXact is in the
697 * wrapped-around state. We don't fix the counter itself here, but we
698 * must be sure to store a valid value in our array entry.
700 nextMXact = MultiXactState->nextMXact;
701 if (nextMXact < FirstMultiXactId)
702 nextMXact = FirstMultiXactId;
704 OldestMemberMXactId[MyProcNumber] = nextMXact;
706 LWLockRelease(MultiXactGenLock);
708 debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
709 MyProcNumber, nextMXact);
714 * MultiXactIdSetOldestVisible
715 * Save the oldest MultiXactId this transaction considers possibly live.
717 * We set the OldestVisibleMXactId for a given transaction the first time
718 * it's going to inspect any MultiXactId. Once we have set this, we are
719 * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
720 * won't be truncated away.
722 * The value to set is the oldest of nextMXact and all the valid per-backend
723 * OldestMemberMXactId[] entries. Because of the locking we do, we can be
724 * certain that no subsequent call to MultiXactIdSetOldestMember can set
725 * an OldestMemberMXactId[] entry older than what we compute here. Therefore
726 * there is no live transaction, now or later, that can be a member of any
727 * MultiXactId older than the OldestVisibleMXactId we compute here.
729 static void
730 MultiXactIdSetOldestVisible(void)
732 if (!MultiXactIdIsValid(OldestVisibleMXactId[MyProcNumber]))
734 MultiXactId oldestMXact;
735 int i;
737 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
740 * We have to beware of the possibility that nextMXact is in the
741 * wrapped-around state. We don't fix the counter itself here, but we
742 * must be sure to store a valid value in our array entry.
744 oldestMXact = MultiXactState->nextMXact;
745 if (oldestMXact < FirstMultiXactId)
746 oldestMXact = FirstMultiXactId;
748 for (i = 0; i < MaxOldestSlot; i++)
750 MultiXactId thisoldest = OldestMemberMXactId[i];
752 if (MultiXactIdIsValid(thisoldest) &&
753 MultiXactIdPrecedes(thisoldest, oldestMXact))
754 oldestMXact = thisoldest;
757 OldestVisibleMXactId[MyProcNumber] = oldestMXact;
759 LWLockRelease(MultiXactGenLock);
761 debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
762 MyProcNumber, oldestMXact);
767 * ReadNextMultiXactId
768 * Return the next MultiXactId to be assigned, but don't allocate it
770 MultiXactId
771 ReadNextMultiXactId(void)
773 MultiXactId mxid;
775 /* XXX we could presumably do this without a lock. */
776 LWLockAcquire(MultiXactGenLock, LW_SHARED);
777 mxid = MultiXactState->nextMXact;
778 LWLockRelease(MultiXactGenLock);
780 if (mxid < FirstMultiXactId)
781 mxid = FirstMultiXactId;
783 return mxid;
787 * ReadMultiXactIdRange
788 * Get the range of IDs that may still be referenced by a relation.
790 void
791 ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
793 LWLockAcquire(MultiXactGenLock, LW_SHARED);
794 *oldest = MultiXactState->oldestMultiXactId;
795 *next = MultiXactState->nextMXact;
796 LWLockRelease(MultiXactGenLock);
798 if (*oldest < FirstMultiXactId)
799 *oldest = FirstMultiXactId;
800 if (*next < FirstMultiXactId)
801 *next = FirstMultiXactId;
806 * MultiXactIdCreateFromMembers
807 * Make a new MultiXactId from the specified set of members
809 * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
810 * given TransactionIds as members. Returns the newly created MultiXactId.
812 * NB: the passed members[] array will be sorted in-place.
814 MultiXactId
815 MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
817 MultiXactId multi;
818 MultiXactOffset offset;
819 xl_multixact_create xlrec;
821 debug_elog3(DEBUG2, "Create: %s",
822 mxid_to_string(InvalidMultiXactId, nmembers, members));
825 * See if the same set of members already exists in our cache; if so, just
826 * re-use that MultiXactId. (Note: it might seem that looking in our
827 * cache is insufficient, and we ought to search disk to see if a
828 * duplicate definition already exists. But since we only ever create
829 * MultiXacts containing our own XID, in most cases any such MultiXacts
830 * were in fact created by us, and so will be in our cache. There are
831 * corner cases where someone else added us to a MultiXact without our
832 * knowledge, but it's not worth checking for.)
834 multi = mXactCacheGetBySet(nmembers, members);
835 if (MultiXactIdIsValid(multi))
837 debug_elog2(DEBUG2, "Create: in cache!");
838 return multi;
841 /* Verify that there is a single update Xid among the given members. */
843 int i;
844 bool has_update = false;
846 for (i = 0; i < nmembers; i++)
848 if (ISUPDATE_from_mxstatus(members[i].status))
850 if (has_update)
851 elog(ERROR, "new multixact has more than one updating member: %s",
852 mxid_to_string(InvalidMultiXactId, nmembers, members));
853 has_update = true;
858 /* Load the injection point before entering the critical section */
859 INJECTION_POINT_LOAD("multixact-create-from-members");
862 * Assign the MXID and offsets range to use, and make sure there is space
863 * in the OFFSETs and MEMBERs files. NB: this routine does
864 * START_CRIT_SECTION().
866 * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
867 * that we've called MultiXactIdSetOldestMember here. This is because
868 * this routine is used in some places to create new MultiXactIds of which
869 * the current backend is not a member, notably during freezing of multis
870 * in vacuum. During vacuum, in particular, it would be unacceptable to
871 * keep OldestMulti set, in case it runs for long.
873 multi = GetNewMultiXactId(nmembers, &offset);
875 INJECTION_POINT_CACHED("multixact-create-from-members");
877 /* Make an XLOG entry describing the new MXID. */
878 xlrec.mid = multi;
879 xlrec.moff = offset;
880 xlrec.nmembers = nmembers;
883 * XXX Note: there's a lot of padding space in MultiXactMember. We could
884 * find a more compact representation of this Xlog record -- perhaps all
885 * the status flags in one XLogRecData, then all the xids in another one?
886 * Not clear that it's worth the trouble though.
888 XLogBeginInsert();
889 XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate);
890 XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember));
892 (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
894 /* Now enter the information into the OFFSETs and MEMBERs logs */
895 RecordNewMultiXact(multi, offset, nmembers, members);
897 /* Done with critical section */
898 END_CRIT_SECTION();
900 /* Store the new MultiXactId in the local cache, too */
901 mXactCachePut(multi, nmembers, members);
903 debug_elog2(DEBUG2, "Create: all done");
905 return multi;
909 * RecordNewMultiXact
910 * Write info about a new multixact into the offsets and members files
912 * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
913 * use it.
915 static void
916 RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
917 int nmembers, MultiXactMember *members)
919 int64 pageno;
920 int64 prev_pageno;
921 int entryno;
922 int slotno;
923 MultiXactOffset *offptr;
924 int i;
925 LWLock *lock;
926 LWLock *prevlock = NULL;
928 pageno = MultiXactIdToOffsetPage(multi);
929 entryno = MultiXactIdToOffsetEntry(multi);
931 lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
932 LWLockAcquire(lock, LW_EXCLUSIVE);
935 * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
936 * to complain about if there's any I/O error. This is kinda bogus, but
937 * since the errors will always give the full pathname, it should be clear
938 * enough that a MultiXactId is really involved. Perhaps someday we'll
939 * take the trouble to generalize the slru.c error reporting code.
941 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
942 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
943 offptr += entryno;
945 *offptr = offset;
947 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
949 /* Release MultiXactOffset SLRU lock. */
950 LWLockRelease(lock);
953 * If anybody was waiting to know the offset of this multixact ID we just
954 * wrote, they can read it now, so wake them up.
956 ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
958 prev_pageno = -1;
960 for (i = 0; i < nmembers; i++, offset++)
962 TransactionId *memberptr;
963 uint32 *flagsptr;
964 uint32 flagsval;
965 int bshift;
966 int flagsoff;
967 int memberoff;
969 Assert(members[i].status <= MultiXactStatusUpdate);
971 pageno = MXOffsetToMemberPage(offset);
972 memberoff = MXOffsetToMemberOffset(offset);
973 flagsoff = MXOffsetToFlagsOffset(offset);
974 bshift = MXOffsetToFlagsBitShift(offset);
976 if (pageno != prev_pageno)
979 * MultiXactMember SLRU page is changed so check if this new page
980 * fall into the different SLRU bank then release the old bank's
981 * lock and acquire lock on the new bank.
983 lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
984 if (lock != prevlock)
986 if (prevlock != NULL)
987 LWLockRelease(prevlock);
989 LWLockAcquire(lock, LW_EXCLUSIVE);
990 prevlock = lock;
992 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
993 prev_pageno = pageno;
996 memberptr = (TransactionId *)
997 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
999 *memberptr = members[i].xid;
1001 flagsptr = (uint32 *)
1002 (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1004 flagsval = *flagsptr;
1005 flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
1006 flagsval |= (members[i].status << bshift);
1007 *flagsptr = flagsval;
1009 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
1012 if (prevlock != NULL)
1013 LWLockRelease(prevlock);
1017 * GetNewMultiXactId
1018 * Get the next MultiXactId.
1020 * Also, reserve the needed amount of space in the "members" area. The
1021 * starting offset of the reserved space is returned in *offset.
1023 * This may generate XLOG records for expansion of the offsets and/or members
1024 * files. Unfortunately, we have to do that while holding MultiXactGenLock
1025 * to avoid race conditions --- the XLOG record for zeroing a page must appear
1026 * before any backend can possibly try to store data in that page!
1028 * We start a critical section before advancing the shared counters. The
1029 * caller must end the critical section after writing SLRU data.
1031 static MultiXactId
1032 GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
1034 MultiXactId result;
1035 MultiXactOffset nextOffset;
1037 debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
1039 /* safety check, we should never get this far in a HS standby */
1040 if (RecoveryInProgress())
1041 elog(ERROR, "cannot assign MultiXactIds during recovery");
1043 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1045 /* Handle wraparound of the nextMXact counter */
1046 if (MultiXactState->nextMXact < FirstMultiXactId)
1047 MultiXactState->nextMXact = FirstMultiXactId;
1049 /* Assign the MXID */
1050 result = MultiXactState->nextMXact;
1052 /*----------
1053 * Check to see if it's safe to assign another MultiXactId. This protects
1054 * against catastrophic data loss due to multixact wraparound. The basic
1055 * rules are:
1057 * If we're past multiVacLimit or the safe threshold for member storage
1058 * space, or we don't know what the safe threshold for member storage is,
1059 * start trying to force autovacuum cycles.
1060 * If we're past multiWarnLimit, start issuing warnings.
1061 * If we're past multiStopLimit, refuse to create new MultiXactIds.
1063 * Note these are pretty much the same protections in GetNewTransactionId.
1064 *----------
1066 if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
1069 * For safety's sake, we release MultiXactGenLock while sending
1070 * signals, warnings, etc. This is not so much because we care about
1071 * preserving concurrency in this situation, as to avoid any
1072 * possibility of deadlock while doing get_database_name(). First,
1073 * copy all the shared values we'll need in this path.
1075 MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
1076 MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
1077 MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
1078 Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
1080 LWLockRelease(MultiXactGenLock);
1082 if (IsUnderPostmaster &&
1083 !MultiXactIdPrecedes(result, multiStopLimit))
1085 char *oldest_datname = get_database_name(oldest_datoid);
1088 * Immediately kick autovacuum into action as we're already in
1089 * ERROR territory.
1091 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1093 /* complain even if that DB has disappeared */
1094 if (oldest_datname)
1095 ereport(ERROR,
1096 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1097 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1098 oldest_datname),
1099 errhint("Execute a database-wide VACUUM in that database.\n"
1100 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1101 else
1102 ereport(ERROR,
1103 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1104 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
1105 oldest_datoid),
1106 errhint("Execute a database-wide VACUUM in that database.\n"
1107 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1111 * To avoid swamping the postmaster with signals, we issue the autovac
1112 * request only once per 64K multis generated. This still gives
1113 * plenty of chances before we get into real trouble.
1115 if (IsUnderPostmaster && (result % 65536) == 0)
1116 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1118 if (!MultiXactIdPrecedes(result, multiWarnLimit))
1120 char *oldest_datname = get_database_name(oldest_datoid);
1122 /* complain even if that DB has disappeared */
1123 if (oldest_datname)
1124 ereport(WARNING,
1125 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1126 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1127 multiWrapLimit - result,
1128 oldest_datname,
1129 multiWrapLimit - result),
1130 errhint("Execute a database-wide VACUUM in that database.\n"
1131 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1132 else
1133 ereport(WARNING,
1134 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1135 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1136 multiWrapLimit - result,
1137 oldest_datoid,
1138 multiWrapLimit - result),
1139 errhint("Execute a database-wide VACUUM in that database.\n"
1140 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1143 /* Re-acquire lock and start over */
1144 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1145 result = MultiXactState->nextMXact;
1146 if (result < FirstMultiXactId)
1147 result = FirstMultiXactId;
1150 /* Make sure there is room for the MXID in the file. */
1151 ExtendMultiXactOffset(result);
1154 * Reserve the members space, similarly to above. Also, be careful not to
1155 * return zero as the starting offset for any multixact. See
1156 * GetMultiXactIdMembers() for motivation.
1158 nextOffset = MultiXactState->nextOffset;
1159 if (nextOffset == 0)
1161 *offset = 1;
1162 nmembers++; /* allocate member slot 0 too */
1164 else
1165 *offset = nextOffset;
1167 /*----------
1168 * Protect against overrun of the members space as well, with the
1169 * following rules:
1171 * If we're past offsetStopLimit, refuse to generate more multis.
1172 * If we're close to offsetStopLimit, emit a warning.
1174 * Arbitrarily, we start emitting warnings when we're 20 segments or less
1175 * from offsetStopLimit.
1177 * Note we haven't updated the shared state yet, so if we fail at this
1178 * point, the multixact ID we grabbed can still be used by the next guy.
1180 * Note that there is no point in forcing autovacuum runs here: the
1181 * multixact freeze settings would have to be reduced for that to have any
1182 * effect.
1183 *----------
1185 #define OFFSET_WARN_SEGMENTS 20
1186 if (MultiXactState->oldestOffsetKnown &&
1187 MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
1188 nmembers))
1190 /* see comment in the corresponding offsets wraparound case */
1191 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1193 ereport(ERROR,
1194 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1195 errmsg("multixact \"members\" limit exceeded"),
1196 errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1197 "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1198 MultiXactState->offsetStopLimit - nextOffset - 1,
1199 nmembers,
1200 MultiXactState->offsetStopLimit - nextOffset - 1),
1201 errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
1202 MultiXactState->oldestMultiXactDB)));
1206 * Check whether we should kick autovacuum into action, to prevent members
1207 * wraparound. NB we use a much larger window to trigger autovacuum than
1208 * just the warning limit. The warning is just a measure of last resort -
1209 * this is in line with GetNewTransactionId's behaviour.
1211 if (!MultiXactState->oldestOffsetKnown ||
1212 (MultiXactState->nextOffset - MultiXactState->oldestOffset
1213 > MULTIXACT_MEMBER_SAFE_THRESHOLD))
1216 * To avoid swamping the postmaster with signals, we issue the autovac
1217 * request only when crossing a segment boundary. With default
1218 * compilation settings that's roughly after 50k members. This still
1219 * gives plenty of chances before we get into real trouble.
1221 if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
1222 (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
1223 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
1226 if (MultiXactState->oldestOffsetKnown &&
1227 MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
1228 nextOffset,
1229 nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
1230 ereport(WARNING,
1231 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1232 errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1233 "database with OID %u must be vacuumed before %d more multixact members are used",
1234 MultiXactState->offsetStopLimit - nextOffset + nmembers,
1235 MultiXactState->oldestMultiXactDB,
1236 MultiXactState->offsetStopLimit - nextOffset + nmembers),
1237 errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
1239 ExtendMultiXactMember(nextOffset, nmembers);
1242 * Critical section from here until caller has written the data into the
1243 * just-reserved SLRU space; we don't want to error out with a partly
1244 * written MultiXact structure. (In particular, failing to write our
1245 * start offset after advancing nextMXact would effectively corrupt the
1246 * previous MultiXact.)
1248 START_CRIT_SECTION();
1251 * Advance counters. As in GetNewTransactionId(), this must not happen
1252 * until after file extension has succeeded!
1254 * We don't care about MultiXactId wraparound here; it will be handled by
1255 * the next iteration. But note that nextMXact may be InvalidMultiXactId
1256 * or the first value on a segment-beginning page after this routine
1257 * exits, so anyone else looking at the variable must be prepared to deal
1258 * with either case. Similarly, nextOffset may be zero, but we won't use
1259 * that as the actual start offset of the next multixact.
1261 (MultiXactState->nextMXact)++;
1263 MultiXactState->nextOffset += nmembers;
1265 LWLockRelease(MultiXactGenLock);
1267 debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
1268 return result;
1272 * GetMultiXactIdMembers
1273 * Return the set of MultiXactMembers that make up a MultiXactId
1275 * Return value is the number of members found, or -1 if there are none,
1276 * and *members is set to a newly palloc'ed array of members. It's the
1277 * caller's responsibility to free it when done with it.
1279 * from_pgupgrade must be passed as true if and only if only the multixact
1280 * corresponds to a value from a tuple that was locked in a 9.2-or-older
1281 * installation and later pg_upgrade'd (that is, the infomask is
1282 * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1283 * can still be running, so we return -1 just like for an empty multixact
1284 * without any further checking. It would be wrong to try to resolve such a
1285 * multixact: either the multixact is within the current valid multixact
1286 * range, in which case the returned result would be bogus, or outside that
1287 * range, in which case an error would be raised.
1289 * In all other cases, the passed multixact must be within the known valid
1290 * range, that is, greater to or equal than oldestMultiXactId, and less than
1291 * nextMXact. Otherwise, an error is raised.
1293 * isLockOnly must be set to true if caller is certain that the given multi
1294 * is used only to lock tuples; can be false without loss of correctness,
1295 * but passing a true means we can return quickly without checking for
1296 * old updates.
1299 GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
1300 bool from_pgupgrade, bool isLockOnly)
1302 int64 pageno;
1303 int64 prev_pageno;
1304 int entryno;
1305 int slotno;
1306 MultiXactOffset *offptr;
1307 MultiXactOffset offset;
1308 int length;
1309 int truelength;
1310 MultiXactId oldestMXact;
1311 MultiXactId nextMXact;
1312 MultiXactId tmpMXact;
1313 MultiXactOffset nextOffset;
1314 MultiXactMember *ptr;
1315 LWLock *lock;
1316 bool slept = false;
1318 debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
1320 if (!MultiXactIdIsValid(multi) || from_pgupgrade)
1322 *members = NULL;
1323 return -1;
1326 /* See if the MultiXactId is in the local cache */
1327 length = mXactCacheGetById(multi, members);
1328 if (length >= 0)
1330 debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
1331 mxid_to_string(multi, length, *members));
1332 return length;
1335 /* Set our OldestVisibleMXactId[] entry if we didn't already */
1336 MultiXactIdSetOldestVisible();
1339 * If we know the multi is used only for locking and not for updates, then
1340 * we can skip checking if the value is older than our oldest visible
1341 * multi. It cannot possibly still be running.
1343 if (isLockOnly &&
1344 MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyProcNumber]))
1346 debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
1347 *members = NULL;
1348 return -1;
1352 * We check known limits on MultiXact before resorting to the SLRU area.
1354 * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1355 * useful; it has already been removed, or will be removed shortly, by
1356 * truncation. If one is passed, an error is raised.
1358 * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1359 * implies undetected ID wraparound has occurred. This raises a hard
1360 * error.
1362 * Shared lock is enough here since we aren't modifying any global state.
1363 * Acquire it just long enough to grab the current counter values. We may
1364 * need both nextMXact and nextOffset; see below.
1366 LWLockAcquire(MultiXactGenLock, LW_SHARED);
1368 oldestMXact = MultiXactState->oldestMultiXactId;
1369 nextMXact = MultiXactState->nextMXact;
1370 nextOffset = MultiXactState->nextOffset;
1372 LWLockRelease(MultiXactGenLock);
1374 if (MultiXactIdPrecedes(multi, oldestMXact))
1375 ereport(ERROR,
1376 (errcode(ERRCODE_INTERNAL_ERROR),
1377 errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1378 multi)));
1380 if (!MultiXactIdPrecedes(multi, nextMXact))
1381 ereport(ERROR,
1382 (errcode(ERRCODE_INTERNAL_ERROR),
1383 errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1384 multi)));
1387 * Find out the offset at which we need to start reading MultiXactMembers
1388 * and the number of members in the multixact. We determine the latter as
1389 * the difference between this multixact's starting offset and the next
1390 * one's. However, there are some corner cases to worry about:
1392 * 1. This multixact may be the latest one created, in which case there is
1393 * no next one to look at. In this case the nextOffset value we just
1394 * saved is the correct endpoint.
1396 * 2. The next multixact may still be in process of being filled in: that
1397 * is, another process may have done GetNewMultiXactId but not yet written
1398 * the offset entry for that ID. In that scenario, it is guaranteed that
1399 * the offset entry for that multixact exists (because GetNewMultiXactId
1400 * won't release MultiXactGenLock until it does) but contains zero
1401 * (because we are careful to pre-zero offset pages). Because
1402 * GetNewMultiXactId will never return zero as the starting offset for a
1403 * multixact, when we read zero as the next multixact's offset, we know we
1404 * have this case. We handle this by sleeping on the condition variable
1405 * we have just for this; the process in charge will signal the CV as soon
1406 * as it has finished writing the multixact offset.
1408 * 3. Because GetNewMultiXactId increments offset zero to offset one to
1409 * handle case #2, there is an ambiguity near the point of offset
1410 * wraparound. If we see next multixact's offset is one, is that our
1411 * multixact's actual endpoint, or did it end at zero with a subsequent
1412 * increment? We handle this using the knowledge that if the zero'th
1413 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1414 * transaction ID so it can't be a multixact member. Therefore, if we
1415 * read a zero from the members array, just ignore it.
1417 * This is all pretty messy, but the mess occurs only in infrequent corner
1418 * cases, so it seems better than holding the MultiXactGenLock for a long
1419 * time on every multixact creation.
1421 retry:
1422 pageno = MultiXactIdToOffsetPage(multi);
1423 entryno = MultiXactIdToOffsetEntry(multi);
1425 /* Acquire the bank lock for the page we need. */
1426 lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1427 LWLockAcquire(lock, LW_EXCLUSIVE);
1429 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
1430 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1431 offptr += entryno;
1432 offset = *offptr;
1434 Assert(offset != 0);
1437 * Use the same increment rule as GetNewMultiXactId(), that is, don't
1438 * handle wraparound explicitly until needed.
1440 tmpMXact = multi + 1;
1442 if (nextMXact == tmpMXact)
1444 /* Corner case 1: there is no next multixact */
1445 length = nextOffset - offset;
1447 else
1449 MultiXactOffset nextMXOffset;
1451 /* handle wraparound if needed */
1452 if (tmpMXact < FirstMultiXactId)
1453 tmpMXact = FirstMultiXactId;
1455 prev_pageno = pageno;
1457 pageno = MultiXactIdToOffsetPage(tmpMXact);
1458 entryno = MultiXactIdToOffsetEntry(tmpMXact);
1460 if (pageno != prev_pageno)
1462 LWLock *newlock;
1465 * Since we're going to access a different SLRU page, if this page
1466 * falls under a different bank, release the old bank's lock and
1467 * acquire the lock of the new bank.
1469 newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
1470 if (newlock != lock)
1472 LWLockRelease(lock);
1473 LWLockAcquire(newlock, LW_EXCLUSIVE);
1474 lock = newlock;
1476 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
1479 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1480 offptr += entryno;
1481 nextMXOffset = *offptr;
1483 if (nextMXOffset == 0)
1485 /* Corner case 2: next multixact is still being filled in */
1486 LWLockRelease(lock);
1487 CHECK_FOR_INTERRUPTS();
1489 INJECTION_POINT("multixact-get-members-cv-sleep");
1491 ConditionVariableSleep(&MultiXactState->nextoff_cv,
1492 WAIT_EVENT_MULTIXACT_CREATION);
1493 slept = true;
1494 goto retry;
1497 length = nextMXOffset - offset;
1500 LWLockRelease(lock);
1501 lock = NULL;
1504 * If we slept above, clean up state; it's no longer needed.
1506 if (slept)
1507 ConditionVariableCancelSleep();
1509 ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
1511 truelength = 0;
1512 prev_pageno = -1;
1513 for (int i = 0; i < length; i++, offset++)
1515 TransactionId *xactptr;
1516 uint32 *flagsptr;
1517 int flagsoff;
1518 int bshift;
1519 int memberoff;
1521 pageno = MXOffsetToMemberPage(offset);
1522 memberoff = MXOffsetToMemberOffset(offset);
1524 if (pageno != prev_pageno)
1526 LWLock *newlock;
1529 * Since we're going to access a different SLRU page, if this page
1530 * falls under a different bank, release the old bank's lock and
1531 * acquire the lock of the new bank.
1533 newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
1534 if (newlock != lock)
1536 if (lock)
1537 LWLockRelease(lock);
1538 LWLockAcquire(newlock, LW_EXCLUSIVE);
1539 lock = newlock;
1542 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
1543 prev_pageno = pageno;
1546 xactptr = (TransactionId *)
1547 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
1549 if (!TransactionIdIsValid(*xactptr))
1551 /* Corner case 3: we must be looking at unused slot zero */
1552 Assert(offset == 0);
1553 continue;
1556 flagsoff = MXOffsetToFlagsOffset(offset);
1557 bshift = MXOffsetToFlagsBitShift(offset);
1558 flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
1560 ptr[truelength].xid = *xactptr;
1561 ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
1562 truelength++;
1565 LWLockRelease(lock);
1567 /* A multixid with zero members should not happen */
1568 Assert(truelength > 0);
1571 * Copy the result into the local cache.
1573 mXactCachePut(multi, truelength, ptr);
1575 debug_elog3(DEBUG2, "GetMembers: no cache for %s",
1576 mxid_to_string(multi, truelength, ptr));
1577 *members = ptr;
1578 return truelength;
1582 * mxactMemberComparator
1583 * qsort comparison function for MultiXactMember
1585 * We can't use wraparound comparison for XIDs because that does not respect
1586 * the triangle inequality! Any old sort order will do.
1588 static int
1589 mxactMemberComparator(const void *arg1, const void *arg2)
1591 MultiXactMember member1 = *(const MultiXactMember *) arg1;
1592 MultiXactMember member2 = *(const MultiXactMember *) arg2;
1594 if (member1.xid > member2.xid)
1595 return 1;
1596 if (member1.xid < member2.xid)
1597 return -1;
1598 if (member1.status > member2.status)
1599 return 1;
1600 if (member1.status < member2.status)
1601 return -1;
1602 return 0;
1606 * mXactCacheGetBySet
1607 * returns a MultiXactId from the cache based on the set of
1608 * TransactionIds that compose it, or InvalidMultiXactId if
1609 * none matches.
1611 * This is helpful, for example, if two transactions want to lock a huge
1612 * table. By using the cache, the second will use the same MultiXactId
1613 * for the majority of tuples, thus keeping MultiXactId usage low (saving
1614 * both I/O and wraparound issues).
1616 * NB: the passed members array will be sorted in-place.
1618 static MultiXactId
1619 mXactCacheGetBySet(int nmembers, MultiXactMember *members)
1621 dlist_iter iter;
1623 debug_elog3(DEBUG2, "CacheGet: looking for %s",
1624 mxid_to_string(InvalidMultiXactId, nmembers, members));
1626 /* sort the array so comparison is easy */
1627 qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1629 dclist_foreach(iter, &MXactCache)
1631 mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1632 iter.cur);
1634 if (entry->nmembers != nmembers)
1635 continue;
1638 * We assume the cache entries are sorted, and that the unused bits in
1639 * "status" are zeroed.
1641 if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
1643 debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
1644 dclist_move_head(&MXactCache, iter.cur);
1645 return entry->multi;
1649 debug_elog2(DEBUG2, "CacheGet: not found :-(");
1650 return InvalidMultiXactId;
1654 * mXactCacheGetById
1655 * returns the composing MultiXactMember set from the cache for a
1656 * given MultiXactId, if present.
1658 * If successful, *xids is set to the address of a palloc'd copy of the
1659 * MultiXactMember set. Return value is number of members, or -1 on failure.
1661 static int
1662 mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
1664 dlist_iter iter;
1666 debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
1668 dclist_foreach(iter, &MXactCache)
1670 mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node,
1671 iter.cur);
1673 if (entry->multi == multi)
1675 MultiXactMember *ptr;
1676 Size size;
1678 size = sizeof(MultiXactMember) * entry->nmembers;
1679 ptr = (MultiXactMember *) palloc(size);
1681 memcpy(ptr, entry->members, size);
1683 debug_elog3(DEBUG2, "CacheGet: found %s",
1684 mxid_to_string(multi,
1685 entry->nmembers,
1686 entry->members));
1689 * Note we modify the list while not using a modifiable iterator.
1690 * This is acceptable only because we exit the iteration
1691 * immediately afterwards.
1693 dclist_move_head(&MXactCache, iter.cur);
1695 *members = ptr;
1696 return entry->nmembers;
1700 debug_elog2(DEBUG2, "CacheGet: not found");
1701 return -1;
1705 * mXactCachePut
1706 * Add a new MultiXactId and its composing set into the local cache.
1708 static void
1709 mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
1711 mXactCacheEnt *entry;
1713 debug_elog3(DEBUG2, "CachePut: storing %s",
1714 mxid_to_string(multi, nmembers, members));
1716 if (MXactContext == NULL)
1718 /* The cache only lives as long as the current transaction */
1719 debug_elog2(DEBUG2, "CachePut: initializing memory context");
1720 MXactContext = AllocSetContextCreate(TopTransactionContext,
1721 "MultiXact cache context",
1722 ALLOCSET_SMALL_SIZES);
1725 entry = (mXactCacheEnt *)
1726 MemoryContextAlloc(MXactContext,
1727 offsetof(mXactCacheEnt, members) +
1728 nmembers * sizeof(MultiXactMember));
1730 entry->multi = multi;
1731 entry->nmembers = nmembers;
1732 memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
1734 /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1735 qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
1737 dclist_push_head(&MXactCache, &entry->node);
1738 if (dclist_count(&MXactCache) > MAX_CACHE_ENTRIES)
1740 dlist_node *node;
1742 node = dclist_tail_node(&MXactCache);
1743 dclist_delete_from(&MXactCache, node);
1745 entry = dclist_container(mXactCacheEnt, node, node);
1746 debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
1747 entry->multi);
1749 pfree(entry);
1753 static char *
1754 mxstatus_to_string(MultiXactStatus status)
1756 switch (status)
1758 case MultiXactStatusForKeyShare:
1759 return "keysh";
1760 case MultiXactStatusForShare:
1761 return "sh";
1762 case MultiXactStatusForNoKeyUpdate:
1763 return "fornokeyupd";
1764 case MultiXactStatusForUpdate:
1765 return "forupd";
1766 case MultiXactStatusNoKeyUpdate:
1767 return "nokeyupd";
1768 case MultiXactStatusUpdate:
1769 return "upd";
1770 default:
1771 elog(ERROR, "unrecognized multixact status %d", status);
1772 return "";
1776 char *
1777 mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
1779 static char *str = NULL;
1780 StringInfoData buf;
1781 int i;
1783 if (str != NULL)
1784 pfree(str);
1786 initStringInfo(&buf);
1788 appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
1789 mxstatus_to_string(members[0].status));
1791 for (i = 1; i < nmembers; i++)
1792 appendStringInfo(&buf, ", %u (%s)", members[i].xid,
1793 mxstatus_to_string(members[i].status));
1795 appendStringInfoChar(&buf, ']');
1796 str = MemoryContextStrdup(TopMemoryContext, buf.data);
1797 pfree(buf.data);
1798 return str;
1802 * AtEOXact_MultiXact
1803 * Handle transaction end for MultiXact
1805 * This is called at top transaction commit or abort (we don't care which).
1807 void
1808 AtEOXact_MultiXact(void)
1811 * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1812 * which should only be valid while within a transaction.
1814 * We assume that storing a MultiXactId is atomic and so we need not take
1815 * MultiXactGenLock to do this.
1817 OldestMemberMXactId[MyProcNumber] = InvalidMultiXactId;
1818 OldestVisibleMXactId[MyProcNumber] = InvalidMultiXactId;
1821 * Discard the local MultiXactId cache. Since MXactContext was created as
1822 * a child of TopTransactionContext, we needn't delete it explicitly.
1824 MXactContext = NULL;
1825 dclist_init(&MXactCache);
1829 * AtPrepare_MultiXact
1830 * Save multixact state at 2PC transaction prepare
1832 * In this phase, we only store our OldestMemberMXactId value in the two-phase
1833 * state file.
1835 void
1836 AtPrepare_MultiXact(void)
1838 MultiXactId myOldestMember = OldestMemberMXactId[MyProcNumber];
1840 if (MultiXactIdIsValid(myOldestMember))
1841 RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
1842 &myOldestMember, sizeof(MultiXactId));
1846 * PostPrepare_MultiXact
1847 * Clean up after successful PREPARE TRANSACTION
1849 void
1850 PostPrepare_MultiXact(TransactionId xid)
1852 MultiXactId myOldestMember;
1855 * Transfer our OldestMemberMXactId value to the slot reserved for the
1856 * prepared transaction.
1858 myOldestMember = OldestMemberMXactId[MyProcNumber];
1859 if (MultiXactIdIsValid(myOldestMember))
1861 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
1864 * Even though storing MultiXactId is atomic, acquire lock to make
1865 * sure others see both changes, not just the reset of the slot of the
1866 * current backend. Using a volatile pointer might suffice, but this
1867 * isn't a hot spot.
1869 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
1871 OldestMemberMXactId[dummyProcNumber] = myOldestMember;
1872 OldestMemberMXactId[MyProcNumber] = InvalidMultiXactId;
1874 LWLockRelease(MultiXactGenLock);
1878 * We don't need to transfer OldestVisibleMXactId value, because the
1879 * transaction is not going to be looking at any more multixacts once it's
1880 * prepared.
1882 * We assume that storing a MultiXactId is atomic and so we need not take
1883 * MultiXactGenLock to do this.
1885 OldestVisibleMXactId[MyProcNumber] = InvalidMultiXactId;
1888 * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1890 MXactContext = NULL;
1891 dclist_init(&MXactCache);
1895 * multixact_twophase_recover
1896 * Recover the state of a prepared transaction at startup
1898 void
1899 multixact_twophase_recover(TransactionId xid, uint16 info,
1900 void *recdata, uint32 len)
1902 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
1903 MultiXactId oldestMember;
1906 * Get the oldest member XID from the state file record, and set it in the
1907 * OldestMemberMXactId slot reserved for this prepared transaction.
1909 Assert(len == sizeof(MultiXactId));
1910 oldestMember = *((MultiXactId *) recdata);
1912 OldestMemberMXactId[dummyProcNumber] = oldestMember;
1916 * multixact_twophase_postcommit
1917 * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1919 void
1920 multixact_twophase_postcommit(TransactionId xid, uint16 info,
1921 void *recdata, uint32 len)
1923 ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, true);
1925 Assert(len == sizeof(MultiXactId));
1927 OldestMemberMXactId[dummyProcNumber] = InvalidMultiXactId;
1931 * multixact_twophase_postabort
1932 * This is actually just the same as the COMMIT case.
1934 void
1935 multixact_twophase_postabort(TransactionId xid, uint16 info,
1936 void *recdata, uint32 len)
1938 multixact_twophase_postcommit(xid, info, recdata, len);
1942 * Initialization of shared memory for MultiXact. We use two SLRU areas,
1943 * thus double memory. Also, reserve space for the shared MultiXactState
1944 * struct and the per-backend MultiXactId arrays (two of those, too).
1946 Size
1947 MultiXactShmemSize(void)
1949 Size size;
1951 /* We need 2*MaxOldestSlot perBackendXactIds[] entries */
1952 #define SHARED_MULTIXACT_STATE_SIZE \
1953 add_size(offsetof(MultiXactStateData, perBackendXactIds), \
1954 mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1956 size = SHARED_MULTIXACT_STATE_SIZE;
1957 size = add_size(size, SimpleLruShmemSize(multixact_offset_buffers, 0));
1958 size = add_size(size, SimpleLruShmemSize(multixact_member_buffers, 0));
1960 return size;
1963 void
1964 MultiXactShmemInit(void)
1966 bool found;
1968 debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
1970 MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes;
1971 MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
1973 SimpleLruInit(MultiXactOffsetCtl,
1974 "multixact_offset", multixact_offset_buffers, 0,
1975 "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
1976 LWTRANCHE_MULTIXACTOFFSET_SLRU,
1977 SYNC_HANDLER_MULTIXACT_OFFSET,
1978 false);
1979 SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
1980 SimpleLruInit(MultiXactMemberCtl,
1981 "multixact_member", multixact_member_buffers, 0,
1982 "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
1983 LWTRANCHE_MULTIXACTMEMBER_SLRU,
1984 SYNC_HANDLER_MULTIXACT_MEMBER,
1985 false);
1986 /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
1988 /* Initialize our shared state struct */
1989 MultiXactState = ShmemInitStruct("Shared MultiXact State",
1990 SHARED_MULTIXACT_STATE_SIZE,
1991 &found);
1992 if (!IsUnderPostmaster)
1994 Assert(!found);
1996 /* Make sure we zero out the per-backend state */
1997 MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
1998 ConditionVariableInit(&MultiXactState->nextoff_cv);
2000 else
2001 Assert(found);
2004 * Set up array pointers.
2006 OldestMemberMXactId = MultiXactState->perBackendXactIds;
2007 OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot;
2011 * GUC check_hook for multixact_offset_buffers
2013 bool
2014 check_multixact_offset_buffers(int *newval, void **extra, GucSource source)
2016 return check_slru_buffers("multixact_offset_buffers", newval);
2020 * GUC check_hook for multixact_member_buffers
2022 bool
2023 check_multixact_member_buffers(int *newval, void **extra, GucSource source)
2025 return check_slru_buffers("multixact_member_buffers", newval);
2029 * This func must be called ONCE on system install. It creates the initial
2030 * MultiXact segments. (The MultiXacts directories are assumed to have been
2031 * created by initdb, and MultiXactShmemInit must have been called already.)
2033 void
2034 BootStrapMultiXact(void)
2036 int slotno;
2037 LWLock *lock;
2039 lock = SimpleLruGetBankLock(MultiXactOffsetCtl, 0);
2040 LWLockAcquire(lock, LW_EXCLUSIVE);
2042 /* Create and zero the first page of the offsets log */
2043 slotno = ZeroMultiXactOffsetPage(0, false);
2045 /* Make sure it's written out */
2046 SimpleLruWritePage(MultiXactOffsetCtl, slotno);
2047 Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
2049 LWLockRelease(lock);
2051 lock = SimpleLruGetBankLock(MultiXactMemberCtl, 0);
2052 LWLockAcquire(lock, LW_EXCLUSIVE);
2054 /* Create and zero the first page of the members log */
2055 slotno = ZeroMultiXactMemberPage(0, false);
2057 /* Make sure it's written out */
2058 SimpleLruWritePage(MultiXactMemberCtl, slotno);
2059 Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
2061 LWLockRelease(lock);
2065 * Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
2066 * If writeXlog is true, also emit an XLOG record saying we did this.
2068 * The page is not actually written, just set up in shared memory.
2069 * The slot number of the new page is returned.
2071 * Control lock must be held at entry, and will be held at exit.
2073 static int
2074 ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
2076 int slotno;
2078 slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2080 if (writeXlog)
2081 WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
2083 return slotno;
2087 * Ditto, for MultiXactMember
2089 static int
2090 ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
2092 int slotno;
2094 slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
2096 if (writeXlog)
2097 WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
2099 return slotno;
2103 * MaybeExtendOffsetSlru
2104 * Extend the offsets SLRU area, if necessary
2106 * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
2107 * contain files that are shorter than necessary; this would occur if the old
2108 * installation had used multixacts beyond the first page (files cannot be
2109 * copied, because the on-disk representation is different). pg_upgrade would
2110 * update pg_control to set the next offset value to be at that position, so
2111 * that tuples marked as locked by such MultiXacts would be seen as visible
2112 * without having to consult multixact. However, trying to create and use a
2113 * new MultiXactId would result in an error because the page on which the new
2114 * value would reside does not exist. This routine is in charge of creating
2115 * such pages.
2117 static void
2118 MaybeExtendOffsetSlru(void)
2120 int64 pageno;
2121 LWLock *lock;
2123 pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
2124 lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2126 LWLockAcquire(lock, LW_EXCLUSIVE);
2128 if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2130 int slotno;
2133 * Fortunately for us, SimpleLruWritePage is already prepared to deal
2134 * with creating a new segment file even if the page we're writing is
2135 * not the first in it, so this is enough.
2137 slotno = ZeroMultiXactOffsetPage(pageno, false);
2138 SimpleLruWritePage(MultiXactOffsetCtl, slotno);
2141 LWLockRelease(lock);
2145 * This must be called ONCE during postmaster or standalone-backend startup.
2147 * StartupXLOG has already established nextMXact/nextOffset by calling
2148 * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
2149 * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
2150 * replayed WAL.
2152 void
2153 StartupMultiXact(void)
2155 MultiXactId multi = MultiXactState->nextMXact;
2156 MultiXactOffset offset = MultiXactState->nextOffset;
2157 int64 pageno;
2160 * Initialize offset's idea of the latest page number.
2162 pageno = MultiXactIdToOffsetPage(multi);
2163 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2164 pageno);
2167 * Initialize member's idea of the latest page number.
2169 pageno = MXOffsetToMemberPage(offset);
2170 pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2171 pageno);
2175 * This must be called ONCE at the end of startup/recovery.
2177 void
2178 TrimMultiXact(void)
2180 MultiXactId nextMXact;
2181 MultiXactOffset offset;
2182 MultiXactId oldestMXact;
2183 Oid oldestMXactDB;
2184 int64 pageno;
2185 int entryno;
2186 int flagsoff;
2188 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2189 nextMXact = MultiXactState->nextMXact;
2190 offset = MultiXactState->nextOffset;
2191 oldestMXact = MultiXactState->oldestMultiXactId;
2192 oldestMXactDB = MultiXactState->oldestMultiXactDB;
2193 LWLockRelease(MultiXactGenLock);
2195 /* Clean up offsets state */
2198 * (Re-)Initialize our idea of the latest page number for offsets.
2200 pageno = MultiXactIdToOffsetPage(nextMXact);
2201 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
2202 pageno);
2205 * Zero out the remainder of the current offsets page. See notes in
2206 * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2207 * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2208 * rule "write xlog before data," nextMXact successors may carry obsolete,
2209 * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2210 * operates normally.
2212 entryno = MultiXactIdToOffsetEntry(nextMXact);
2213 if (entryno != 0)
2215 int slotno;
2216 MultiXactOffset *offptr;
2217 LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2219 LWLockAcquire(lock, LW_EXCLUSIVE);
2220 slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2221 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2222 offptr += entryno;
2224 MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2226 MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
2227 LWLockRelease(lock);
2231 * And the same for members.
2233 * (Re-)Initialize our idea of the latest page number for members.
2235 pageno = MXOffsetToMemberPage(offset);
2236 pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number,
2237 pageno);
2240 * Zero out the remainder of the current members page. See notes in
2241 * TrimCLOG() for motivation.
2243 flagsoff = MXOffsetToFlagsOffset(offset);
2244 if (flagsoff != 0)
2246 int slotno;
2247 TransactionId *xidptr;
2248 int memberoff;
2249 LWLock *lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
2251 LWLockAcquire(lock, LW_EXCLUSIVE);
2252 memberoff = MXOffsetToMemberOffset(offset);
2253 slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
2254 xidptr = (TransactionId *)
2255 (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
2257 MemSet(xidptr, 0, BLCKSZ - memberoff);
2260 * Note: we don't need to zero out the flag bits in the remaining
2261 * members of the current group, because they are always reset before
2262 * writing.
2265 MultiXactMemberCtl->shared->page_dirty[slotno] = true;
2266 LWLockRelease(lock);
2269 /* signal that we're officially up */
2270 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2271 MultiXactState->finishedStartup = true;
2272 LWLockRelease(MultiXactGenLock);
2274 /* Now compute how far away the next members wraparound is. */
2275 SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
2279 * Get the MultiXact data to save in a checkpoint record
2281 void
2282 MultiXactGetCheckptMulti(bool is_shutdown,
2283 MultiXactId *nextMulti,
2284 MultiXactOffset *nextMultiOffset,
2285 MultiXactId *oldestMulti,
2286 Oid *oldestMultiDB)
2288 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2289 *nextMulti = MultiXactState->nextMXact;
2290 *nextMultiOffset = MultiXactState->nextOffset;
2291 *oldestMulti = MultiXactState->oldestMultiXactId;
2292 *oldestMultiDB = MultiXactState->oldestMultiXactDB;
2293 LWLockRelease(MultiXactGenLock);
2295 debug_elog6(DEBUG2,
2296 "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2297 *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
2301 * Perform a checkpoint --- either during shutdown, or on-the-fly
2303 void
2304 CheckPointMultiXact(void)
2306 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2309 * Write dirty MultiXact pages to disk. This may result in sync requests
2310 * queued for later handling by ProcessSyncRequests(), as part of the
2311 * checkpoint.
2313 SimpleLruWriteAll(MultiXactOffsetCtl, true);
2314 SimpleLruWriteAll(MultiXactMemberCtl, true);
2316 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2320 * Set the next-to-be-assigned MultiXactId and offset
2322 * This is used when we can determine the correct next ID/offset exactly
2323 * from a checkpoint record. Although this is only called during bootstrap
2324 * and XLog replay, we take the lock in case any hot-standby backends are
2325 * examining the values.
2327 void
2328 MultiXactSetNextMXact(MultiXactId nextMulti,
2329 MultiXactOffset nextMultiOffset)
2331 debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
2332 nextMulti, nextMultiOffset);
2333 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2334 MultiXactState->nextMXact = nextMulti;
2335 MultiXactState->nextOffset = nextMultiOffset;
2336 LWLockRelease(MultiXactGenLock);
2339 * During a binary upgrade, make sure that the offsets SLRU is large
2340 * enough to contain the next value that would be created.
2342 * We need to do this pretty early during the first startup in binary
2343 * upgrade mode: before StartupMultiXact() in fact, because this routine
2344 * is called even before that by StartupXLOG(). And we can't do it
2345 * earlier than at this point, because during that first call of this
2346 * routine we determine the MultiXactState->nextMXact value that
2347 * MaybeExtendOffsetSlru needs.
2349 if (IsBinaryUpgrade)
2350 MaybeExtendOffsetSlru();
2354 * Determine the last safe MultiXactId to allocate given the currently oldest
2355 * datminmxid (ie, the oldest MultiXactId that might exist in any database
2356 * of our cluster), and the OID of the (or a) database with that value.
2358 * is_startup is true when we are just starting the cluster, false when we
2359 * are updating state in a running cluster. This only affects log messages.
2361 void
2362 SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
2363 bool is_startup)
2365 MultiXactId multiVacLimit;
2366 MultiXactId multiWarnLimit;
2367 MultiXactId multiStopLimit;
2368 MultiXactId multiWrapLimit;
2369 MultiXactId curMulti;
2370 bool needs_offset_vacuum;
2372 Assert(MultiXactIdIsValid(oldest_datminmxid));
2375 * We pretend that a wrap will happen halfway through the multixact ID
2376 * space, but that's not really true, because multixacts wrap differently
2377 * from transaction IDs. Note that, separately from any concern about
2378 * multixact IDs wrapping, we must ensure that multixact members do not
2379 * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
2381 multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
2382 if (multiWrapLimit < FirstMultiXactId)
2383 multiWrapLimit += FirstMultiXactId;
2386 * We'll refuse to continue assigning MultiXactIds once we get within 3M
2387 * multi of data loss. See SetTransactionIdLimit.
2389 multiStopLimit = multiWrapLimit - 3000000;
2390 if (multiStopLimit < FirstMultiXactId)
2391 multiStopLimit -= FirstMultiXactId;
2394 * We'll start complaining loudly when we get within 40M multis of data
2395 * loss. This is kind of arbitrary, but if you let your gas gauge get
2396 * down to 2% of full, would you be looking for the next gas station? We
2397 * need to be fairly liberal about this number because there are lots of
2398 * scenarios where most transactions are done by automatic clients that
2399 * won't pay attention to warnings. (No, we're not gonna make this
2400 * configurable. If you know enough to configure it, you know enough to
2401 * not get in this kind of trouble in the first place.)
2403 multiWarnLimit = multiWrapLimit - 40000000;
2404 if (multiWarnLimit < FirstMultiXactId)
2405 multiWarnLimit -= FirstMultiXactId;
2408 * We'll start trying to force autovacuums when oldest_datminmxid gets to
2409 * be more than autovacuum_multixact_freeze_max_age mxids old.
2411 * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2412 * so that we don't have to worry about dealing with on-the-fly changes in
2413 * its value. See SetTransactionIdLimit.
2415 multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
2416 if (multiVacLimit < FirstMultiXactId)
2417 multiVacLimit += FirstMultiXactId;
2419 /* Grab lock for just long enough to set the new limit values */
2420 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2421 MultiXactState->oldestMultiXactId = oldest_datminmxid;
2422 MultiXactState->oldestMultiXactDB = oldest_datoid;
2423 MultiXactState->multiVacLimit = multiVacLimit;
2424 MultiXactState->multiWarnLimit = multiWarnLimit;
2425 MultiXactState->multiStopLimit = multiStopLimit;
2426 MultiXactState->multiWrapLimit = multiWrapLimit;
2427 curMulti = MultiXactState->nextMXact;
2428 LWLockRelease(MultiXactGenLock);
2430 /* Log the info */
2431 ereport(DEBUG1,
2432 (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
2433 multiWrapLimit, oldest_datoid)));
2436 * Computing the actual limits is only possible once the data directory is
2437 * in a consistent state. There's no need to compute the limits while
2438 * still replaying WAL - no decisions about new multis are made even
2439 * though multixact creations might be replayed. So we'll only do further
2440 * checks after TrimMultiXact() has been called.
2442 if (!MultiXactState->finishedStartup)
2443 return;
2445 Assert(!InRecovery);
2447 /* Set limits for offset vacuum. */
2448 needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
2451 * If past the autovacuum force point, immediately signal an autovac
2452 * request. The reason for this is that autovac only processes one
2453 * database per invocation. Once it's finished cleaning up the oldest
2454 * database, it'll call here, and we'll signal the postmaster to start
2455 * another iteration immediately if there are still any old databases.
2457 if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
2458 needs_offset_vacuum) && IsUnderPostmaster)
2459 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
2461 /* Give an immediate warning if past the wrap warn point */
2462 if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
2464 char *oldest_datname;
2467 * We can be called when not inside a transaction, for example during
2468 * StartupXLOG(). In such a case we cannot do database access, so we
2469 * must just report the oldest DB's OID.
2471 * Note: it's also possible that get_database_name fails and returns
2472 * NULL, for example because the database just got dropped. We'll
2473 * still warn, even though the warning might now be unnecessary.
2475 if (IsTransactionState())
2476 oldest_datname = get_database_name(oldest_datoid);
2477 else
2478 oldest_datname = NULL;
2480 if (oldest_datname)
2481 ereport(WARNING,
2482 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2483 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2484 multiWrapLimit - curMulti,
2485 oldest_datname,
2486 multiWrapLimit - curMulti),
2487 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2488 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2489 else
2490 ereport(WARNING,
2491 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2492 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2493 multiWrapLimit - curMulti,
2494 oldest_datoid,
2495 multiWrapLimit - curMulti),
2496 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2497 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2502 * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2503 * and similarly nextOffset is at least minMultiOffset.
2505 * This is used when we can determine minimum safe values from an XLog
2506 * record (either an on-line checkpoint or an mxact creation log entry).
2507 * Although this is only called during XLog replay, we take the lock in case
2508 * any hot-standby backends are examining the values.
2510 void
2511 MultiXactAdvanceNextMXact(MultiXactId minMulti,
2512 MultiXactOffset minMultiOffset)
2514 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2515 if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
2517 debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
2518 MultiXactState->nextMXact = minMulti;
2520 if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
2522 debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
2523 minMultiOffset);
2524 MultiXactState->nextOffset = minMultiOffset;
2526 LWLockRelease(MultiXactGenLock);
2530 * Update our oldestMultiXactId value, but only if it's more recent than what
2531 * we had.
2533 * This may only be called during WAL replay.
2535 void
2536 MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
2538 Assert(InRecovery);
2540 if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
2541 SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
2545 * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2547 * NB: this is called while holding MultiXactGenLock. We want it to be very
2548 * fast most of the time; even when it's not so fast, no actual I/O need
2549 * happen unless we're forced to write out a dirty log or xlog page to make
2550 * room in shared memory.
2552 static void
2553 ExtendMultiXactOffset(MultiXactId multi)
2555 int64 pageno;
2556 LWLock *lock;
2559 * No work except at first MultiXactId of a page. But beware: just after
2560 * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2562 if (MultiXactIdToOffsetEntry(multi) != 0 &&
2563 multi != FirstMultiXactId)
2564 return;
2566 pageno = MultiXactIdToOffsetPage(multi);
2567 lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
2569 LWLockAcquire(lock, LW_EXCLUSIVE);
2571 /* Zero the page and make an XLOG entry about it */
2572 ZeroMultiXactOffsetPage(pageno, true);
2574 LWLockRelease(lock);
2578 * Make sure that MultiXactMember has room for the members of a newly-
2579 * allocated MultiXactId.
2581 * Like the above routine, this is called while holding MultiXactGenLock;
2582 * same comments apply.
2584 static void
2585 ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
2588 * It's possible that the members span more than one page of the members
2589 * file, so we loop to ensure we consider each page. The coding is not
2590 * optimal if the members span several pages, but that seems unusual
2591 * enough to not worry much about.
2593 while (nmembers > 0)
2595 int flagsoff;
2596 int flagsbit;
2597 uint32 difference;
2600 * Only zero when at first entry of a page.
2602 flagsoff = MXOffsetToFlagsOffset(offset);
2603 flagsbit = MXOffsetToFlagsBitShift(offset);
2604 if (flagsoff == 0 && flagsbit == 0)
2606 int64 pageno;
2607 LWLock *lock;
2609 pageno = MXOffsetToMemberPage(offset);
2610 lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
2612 LWLockAcquire(lock, LW_EXCLUSIVE);
2614 /* Zero the page and make an XLOG entry about it */
2615 ZeroMultiXactMemberPage(pageno, true);
2617 LWLockRelease(lock);
2621 * Compute the number of items till end of current page. Careful: if
2622 * addition of unsigned ints wraps around, we're at the last page of
2623 * the last segment; since that page holds a different number of items
2624 * than other pages, we need to do it differently.
2626 if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
2629 * This is the last page of the last segment; we can compute the
2630 * number of items left to allocate in it without modulo
2631 * arithmetic.
2633 difference = MaxMultiXactOffset - offset + 1;
2635 else
2636 difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
2639 * Advance to next page, taking care to properly handle the wraparound
2640 * case. OK if nmembers goes negative.
2642 nmembers -= difference;
2643 offset += difference;
2648 * GetOldestMultiXactId
2650 * Return the oldest MultiXactId that's still possibly still seen as live by
2651 * any running transaction. Older ones might still exist on disk, but they no
2652 * longer have any running member transaction.
2654 * It's not safe to truncate MultiXact SLRU segments on the value returned by
2655 * this function; however, it can be set as the new relminmxid for any table
2656 * that VACUUM knows has no remaining MXIDs < the same value. It is only safe
2657 * to truncate SLRUs when no table can possibly still have a referencing MXID.
2659 MultiXactId
2660 GetOldestMultiXactId(void)
2662 MultiXactId oldestMXact;
2663 MultiXactId nextMXact;
2664 int i;
2667 * This is the oldest valid value among all the OldestMemberMXactId[] and
2668 * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2670 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2673 * We have to beware of the possibility that nextMXact is in the
2674 * wrapped-around state. We don't fix the counter itself here, but we
2675 * must be sure to use a valid value in our calculation.
2677 nextMXact = MultiXactState->nextMXact;
2678 if (nextMXact < FirstMultiXactId)
2679 nextMXact = FirstMultiXactId;
2681 oldestMXact = nextMXact;
2682 for (i = 0; i < MaxOldestSlot; i++)
2684 MultiXactId thisoldest;
2686 thisoldest = OldestMemberMXactId[i];
2687 if (MultiXactIdIsValid(thisoldest) &&
2688 MultiXactIdPrecedes(thisoldest, oldestMXact))
2689 oldestMXact = thisoldest;
2690 thisoldest = OldestVisibleMXactId[i];
2691 if (MultiXactIdIsValid(thisoldest) &&
2692 MultiXactIdPrecedes(thisoldest, oldestMXact))
2693 oldestMXact = thisoldest;
2696 LWLockRelease(MultiXactGenLock);
2698 return oldestMXact;
2702 * Determine how aggressively we need to vacuum in order to prevent member
2703 * wraparound.
2705 * To do so determine what's the oldest member offset and install the limit
2706 * info in MultiXactState, where it can be used to prevent overrun of old data
2707 * in the members SLRU area.
2709 * The return value is true if emergency autovacuum is required and false
2710 * otherwise.
2712 static bool
2713 SetOffsetVacuumLimit(bool is_startup)
2715 MultiXactId oldestMultiXactId;
2716 MultiXactId nextMXact;
2717 MultiXactOffset oldestOffset = 0; /* placate compiler */
2718 MultiXactOffset prevOldestOffset;
2719 MultiXactOffset nextOffset;
2720 bool oldestOffsetKnown = false;
2721 bool prevOldestOffsetKnown;
2722 MultiXactOffset offsetStopLimit = 0;
2723 MultiXactOffset prevOffsetStopLimit;
2726 * NB: Have to prevent concurrent truncation, we might otherwise try to
2727 * lookup an oldestMulti that's concurrently getting truncated away.
2729 LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
2731 /* Read relevant fields from shared memory. */
2732 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2733 oldestMultiXactId = MultiXactState->oldestMultiXactId;
2734 nextMXact = MultiXactState->nextMXact;
2735 nextOffset = MultiXactState->nextOffset;
2736 prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2737 prevOldestOffset = MultiXactState->oldestOffset;
2738 prevOffsetStopLimit = MultiXactState->offsetStopLimit;
2739 Assert(MultiXactState->finishedStartup);
2740 LWLockRelease(MultiXactGenLock);
2743 * Determine the offset of the oldest multixact. Normally, we can read
2744 * the offset from the multixact itself, but there's an important special
2745 * case: if there are no multixacts in existence at all, oldestMXact
2746 * obviously can't point to one. It will instead point to the multixact
2747 * ID that will be assigned the next time one is needed.
2749 if (oldestMultiXactId == nextMXact)
2752 * When the next multixact gets created, it will be stored at the next
2753 * offset.
2755 oldestOffset = nextOffset;
2756 oldestOffsetKnown = true;
2758 else
2761 * Figure out where the oldest existing multixact's offsets are
2762 * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X,
2763 * the supposedly-earliest multixact might not really exist. We are
2764 * careful not to fail in that case.
2766 oldestOffsetKnown =
2767 find_multixact_start(oldestMultiXactId, &oldestOffset);
2769 if (oldestOffsetKnown)
2770 ereport(DEBUG1,
2771 (errmsg_internal("oldest MultiXactId member is at offset %u",
2772 oldestOffset)));
2773 else
2774 ereport(LOG,
2775 (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2776 oldestMultiXactId)));
2779 LWLockRelease(MultiXactTruncationLock);
2782 * If we can, compute limits (and install them MultiXactState) to prevent
2783 * overrun of old data in the members SLRU area. We can only do so if the
2784 * oldest offset is known though.
2786 if (oldestOffsetKnown)
2788 /* move back to start of the corresponding segment */
2789 offsetStopLimit = oldestOffset - (oldestOffset %
2790 (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
2792 /* always leave one segment before the wraparound point */
2793 offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
2795 if (!prevOldestOffsetKnown && !is_startup)
2796 ereport(LOG,
2797 (errmsg("MultiXact member wraparound protections are now enabled")));
2799 ereport(DEBUG1,
2800 (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
2801 offsetStopLimit, oldestMultiXactId)));
2803 else if (prevOldestOffsetKnown)
2806 * If we failed to get the oldest offset this time, but we have a
2807 * value from a previous pass through this function, use the old
2808 * values rather than automatically forcing an emergency autovacuum
2809 * cycle again.
2811 oldestOffset = prevOldestOffset;
2812 oldestOffsetKnown = true;
2813 offsetStopLimit = prevOffsetStopLimit;
2816 /* Install the computed values */
2817 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
2818 MultiXactState->oldestOffset = oldestOffset;
2819 MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
2820 MultiXactState->offsetStopLimit = offsetStopLimit;
2821 LWLockRelease(MultiXactGenLock);
2824 * Do we need an emergency autovacuum? If we're not sure, assume yes.
2826 return !oldestOffsetKnown ||
2827 (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
2831 * Return whether adding "distance" to "start" would move past "boundary".
2833 * We use this to determine whether the addition is "wrapping around" the
2834 * boundary point, hence the name. The reason we don't want to use the regular
2835 * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2836 * the 2^32-1 space here, allowing for more multixacts than would fit
2837 * otherwise.
2839 static bool
2840 MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
2841 uint32 distance)
2843 MultiXactOffset finish;
2846 * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2847 * if the addition wraps around the UINT_MAX boundary, skip that value.
2849 finish = start + distance;
2850 if (finish < start)
2851 finish++;
2853 /*-----------------------------------------------------------------------
2854 * When the boundary is numerically greater than the starting point, any
2855 * value numerically between the two is not wrapped:
2857 * <----S----B---->
2858 * [---) = F wrapped past B (and UINT_MAX)
2859 * [---) = F not wrapped
2860 * [----] = F wrapped past B
2862 * When the boundary is numerically less than the starting point (i.e. the
2863 * UINT_MAX wraparound occurs somewhere in between) then all values in
2864 * between are wrapped:
2866 * <----B----S---->
2867 * [---) = F not wrapped past B (but wrapped past UINT_MAX)
2868 * [---) = F wrapped past B (and UINT_MAX)
2869 * [----] = F not wrapped
2870 *-----------------------------------------------------------------------
2872 if (start < boundary)
2873 return finish >= boundary || finish < start;
2874 else
2875 return finish >= boundary && finish < start;
2879 * Find the starting offset of the given MultiXactId.
2881 * Returns false if the file containing the multi does not exist on disk.
2882 * Otherwise, returns true and sets *result to the starting member offset.
2884 * This function does not prevent concurrent truncation, so if that's
2885 * required, the caller has to protect against that.
2887 static bool
2888 find_multixact_start(MultiXactId multi, MultiXactOffset *result)
2890 MultiXactOffset offset;
2891 int64 pageno;
2892 int entryno;
2893 int slotno;
2894 MultiXactOffset *offptr;
2896 Assert(MultiXactState->finishedStartup);
2898 pageno = MultiXactIdToOffsetPage(multi);
2899 entryno = MultiXactIdToOffsetEntry(multi);
2902 * Write out dirty data, so PhysicalPageExists can work correctly.
2904 SimpleLruWriteAll(MultiXactOffsetCtl, true);
2905 SimpleLruWriteAll(MultiXactMemberCtl, true);
2907 if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
2908 return false;
2910 /* lock is acquired by SimpleLruReadPage_ReadOnly */
2911 slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
2912 offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
2913 offptr += entryno;
2914 offset = *offptr;
2915 LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno));
2917 *result = offset;
2918 return true;
2922 * Determine how many multixacts, and how many multixact members, currently
2923 * exist. Return false if unable to determine.
2925 static bool
2926 ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members)
2928 MultiXactOffset nextOffset;
2929 MultiXactOffset oldestOffset;
2930 MultiXactId oldestMultiXactId;
2931 MultiXactId nextMultiXactId;
2932 bool oldestOffsetKnown;
2934 LWLockAcquire(MultiXactGenLock, LW_SHARED);
2935 nextOffset = MultiXactState->nextOffset;
2936 oldestMultiXactId = MultiXactState->oldestMultiXactId;
2937 nextMultiXactId = MultiXactState->nextMXact;
2938 oldestOffset = MultiXactState->oldestOffset;
2939 oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
2940 LWLockRelease(MultiXactGenLock);
2942 if (!oldestOffsetKnown)
2943 return false;
2945 *members = nextOffset - oldestOffset;
2946 *multixacts = nextMultiXactId - oldestMultiXactId;
2947 return true;
2951 * Multixact members can be removed once the multixacts that refer to them
2952 * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2953 * vacuum_multixact_freeze_table_age work together to make sure we never have
2954 * too many multixacts; we hope that, at least under normal circumstances,
2955 * this will also be sufficient to keep us from using too many offsets.
2956 * However, if the average multixact has many members, we might exhaust the
2957 * members space while still using few enough members that these limits fail
2958 * to trigger relminmxid advancement by VACUUM. At that point, we'd have no
2959 * choice but to start failing multixact-creating operations with an error.
2961 * To prevent that, if more than a threshold portion of the members space is
2962 * used, we effectively reduce autovacuum_multixact_freeze_max_age and
2963 * to a value just less than the number of multixacts in use. We hope that
2964 * this will quickly trigger autovacuuming on the table or tables with the
2965 * oldest relminmxid, thus allowing datminmxid values to advance and removing
2966 * some members.
2968 * As the fraction of the member space currently in use grows, we become
2969 * more aggressive in clamping this value. That not only causes autovacuum
2970 * to ramp up, but also makes any manual vacuums the user issues more
2971 * aggressive. This happens because vacuum_get_cutoffs() will clamp the
2972 * freeze table and the minimum freeze age cutoffs based on the effective
2973 * autovacuum_multixact_freeze_max_age this function returns. In the worst
2974 * case, we'll claim the freeze_max_age to zero, and every vacuum of any
2975 * table will freeze every multixact.
2978 MultiXactMemberFreezeThreshold(void)
2980 MultiXactOffset members;
2981 uint32 multixacts;
2982 uint32 victim_multixacts;
2983 double fraction;
2984 int result;
2986 /* If we can't determine member space utilization, assume the worst. */
2987 if (!ReadMultiXactCounts(&multixacts, &members))
2988 return 0;
2990 /* If member space utilization is low, no special action is required. */
2991 if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
2992 return autovacuum_multixact_freeze_max_age;
2995 * Compute a target for relminmxid advancement. The number of multixacts
2996 * we try to eliminate from the system is based on how far we are past
2997 * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2999 fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
3000 (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
3001 victim_multixacts = multixacts * fraction;
3003 /* fraction could be > 1.0, but lowest possible freeze age is zero */
3004 if (victim_multixacts > multixacts)
3005 return 0;
3006 result = multixacts - victim_multixacts;
3009 * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
3010 * autovacuum less aggressive than it would otherwise be.
3012 return Min(result, autovacuum_multixact_freeze_max_age);
3015 typedef struct mxtruncinfo
3017 int64 earliestExistingPage;
3018 } mxtruncinfo;
3021 * SlruScanDirectory callback
3022 * This callback determines the earliest existing page number.
3024 static bool
3025 SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data)
3027 mxtruncinfo *trunc = (mxtruncinfo *) data;
3029 if (trunc->earliestExistingPage == -1 ||
3030 ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
3032 trunc->earliestExistingPage = segpage;
3035 return false; /* keep going */
3040 * Delete members segments [oldest, newOldest)
3042 * The members SLRU can, in contrast to the offsets one, be filled to almost
3043 * the full range at once. This means SimpleLruTruncate() can't trivially be
3044 * used - instead the to-be-deleted range is computed using the offsets
3045 * SLRU. C.f. TruncateMultiXact().
3047 static void
3048 PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
3050 const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
3051 int64 startsegment = MXOffsetToMemberSegment(oldestOffset);
3052 int64 endsegment = MXOffsetToMemberSegment(newOldestOffset);
3053 int64 segment = startsegment;
3056 * Delete all the segments but the last one. The last segment can still
3057 * contain, possibly partially, valid data.
3059 while (segment != endsegment)
3061 elog(DEBUG2, "truncating multixact members segment %llx",
3062 (unsigned long long) segment);
3063 SlruDeleteSegment(MultiXactMemberCtl, segment);
3065 /* move to next segment, handling wraparound correctly */
3066 if (segment == maxsegment)
3067 segment = 0;
3068 else
3069 segment += 1;
3074 * Delete offsets segments [oldest, newOldest)
3076 static void
3077 PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
3080 * We step back one multixact to avoid passing a cutoff page that hasn't
3081 * been created yet in the rare case that oldestMulti would be the first
3082 * item on a page and oldestMulti == nextMulti. In that case, if we
3083 * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
3084 * detection.
3086 SimpleLruTruncate(MultiXactOffsetCtl,
3087 MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
3091 * Remove all MultiXactOffset and MultiXactMember segments before the oldest
3092 * ones still of interest.
3094 * This is only called on a primary as part of vacuum (via
3095 * vac_truncate_clog()). During recovery truncation is done by replaying
3096 * truncation WAL records logged here.
3098 * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
3099 * is one of the databases preventing newOldestMulti from increasing.
3101 void
3102 TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
3104 MultiXactId oldestMulti;
3105 MultiXactId nextMulti;
3106 MultiXactOffset newOldestOffset;
3107 MultiXactOffset oldestOffset;
3108 MultiXactOffset nextOffset;
3109 mxtruncinfo trunc;
3110 MultiXactId earliest;
3112 Assert(!RecoveryInProgress());
3113 Assert(MultiXactState->finishedStartup);
3116 * We can only allow one truncation to happen at once. Otherwise parts of
3117 * members might vanish while we're doing lookups or similar. There's no
3118 * need to have an interlock with creating new multis or such, since those
3119 * are constrained by the limits (which only grow, never shrink).
3121 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3123 LWLockAcquire(MultiXactGenLock, LW_SHARED);
3124 nextMulti = MultiXactState->nextMXact;
3125 nextOffset = MultiXactState->nextOffset;
3126 oldestMulti = MultiXactState->oldestMultiXactId;
3127 LWLockRelease(MultiXactGenLock);
3128 Assert(MultiXactIdIsValid(oldestMulti));
3131 * Make sure to only attempt truncation if there's values to truncate
3132 * away. In normal processing values shouldn't go backwards, but there's
3133 * some corner cases (due to bugs) where that's possible.
3135 if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
3137 LWLockRelease(MultiXactTruncationLock);
3138 return;
3142 * Note we can't just plow ahead with the truncation; it's possible that
3143 * there are no segments to truncate, which is a problem because we are
3144 * going to attempt to read the offsets page to determine where to
3145 * truncate the members SLRU. So we first scan the directory to determine
3146 * the earliest offsets page number that we can read without error.
3148 * When nextMXact is less than one segment away from multiWrapLimit,
3149 * SlruScanDirCbFindEarliest can find some early segment other than the
3150 * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
3151 * returns false, because not all pairs of entries have the same answer.)
3152 * That can also arise when an earlier truncation attempt failed unlink()
3153 * or returned early from this function. The only consequence is
3154 * returning early, which wastes space that we could have liberated.
3156 * NB: It's also possible that the page that oldestMulti is on has already
3157 * been truncated away, and we crashed before updating oldestMulti.
3159 trunc.earliestExistingPage = -1;
3160 SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
3161 earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
3162 if (earliest < FirstMultiXactId)
3163 earliest = FirstMultiXactId;
3165 /* If there's nothing to remove, we can bail out early. */
3166 if (MultiXactIdPrecedes(oldestMulti, earliest))
3168 LWLockRelease(MultiXactTruncationLock);
3169 return;
3173 * First, compute the safe truncation point for MultiXactMember. This is
3174 * the starting offset of the oldest multixact.
3176 * Hopefully, find_multixact_start will always work here, because we've
3177 * already checked that it doesn't precede the earliest MultiXact on disk.
3178 * But if it fails, don't truncate anything, and log a message.
3180 if (oldestMulti == nextMulti)
3182 /* there are NO MultiXacts */
3183 oldestOffset = nextOffset;
3185 else if (!find_multixact_start(oldestMulti, &oldestOffset))
3187 ereport(LOG,
3188 (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3189 oldestMulti, earliest)));
3190 LWLockRelease(MultiXactTruncationLock);
3191 return;
3195 * Secondly compute up to where to truncate. Lookup the corresponding
3196 * member offset for newOldestMulti for that.
3198 if (newOldestMulti == nextMulti)
3200 /* there are NO MultiXacts */
3201 newOldestOffset = nextOffset;
3203 else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
3205 ereport(LOG,
3206 (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3207 newOldestMulti)));
3208 LWLockRelease(MultiXactTruncationLock);
3209 return;
3212 elog(DEBUG1, "performing multixact truncation: "
3213 "offsets [%u, %u), offsets segments [%llx, %llx), "
3214 "members [%u, %u), members segments [%llx, %llx)",
3215 oldestMulti, newOldestMulti,
3216 (unsigned long long) MultiXactIdToOffsetSegment(oldestMulti),
3217 (unsigned long long) MultiXactIdToOffsetSegment(newOldestMulti),
3218 oldestOffset, newOldestOffset,
3219 (unsigned long long) MXOffsetToMemberSegment(oldestOffset),
3220 (unsigned long long) MXOffsetToMemberSegment(newOldestOffset));
3223 * Do truncation, and the WAL logging of the truncation, in a critical
3224 * section. That way offsets/members cannot get out of sync anymore, i.e.
3225 * once consistent the newOldestMulti will always exist in members, even
3226 * if we crashed in the wrong moment.
3228 START_CRIT_SECTION();
3231 * Prevent checkpoints from being scheduled concurrently. This is critical
3232 * because otherwise a truncation record might not be replayed after a
3233 * crash/basebackup, even though the state of the data directory would
3234 * require it.
3236 Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
3237 MyProc->delayChkptFlags |= DELAY_CHKPT_START;
3239 /* WAL log truncation */
3240 WriteMTruncateXlogRec(newOldestMultiDB,
3241 oldestMulti, newOldestMulti,
3242 oldestOffset, newOldestOffset);
3245 * Update in-memory limits before performing the truncation, while inside
3246 * the critical section: Have to do it before truncation, to prevent
3247 * concurrent lookups of those values. Has to be inside the critical
3248 * section as otherwise a future call to this function would error out,
3249 * while looking up the oldest member in offsets, if our caller crashes
3250 * before updating the limits.
3252 LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
3253 MultiXactState->oldestMultiXactId = newOldestMulti;
3254 MultiXactState->oldestMultiXactDB = newOldestMultiDB;
3255 LWLockRelease(MultiXactGenLock);
3257 /* First truncate members */
3258 PerformMembersTruncation(oldestOffset, newOldestOffset);
3260 /* Then offsets */
3261 PerformOffsetsTruncation(oldestMulti, newOldestMulti);
3263 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
3265 END_CRIT_SECTION();
3266 LWLockRelease(MultiXactTruncationLock);
3270 * Decide whether a MultiXactOffset page number is "older" for truncation
3271 * purposes. Analogous to CLOGPagePrecedes().
3273 * Offsetting the values is optional, because MultiXactIdPrecedes() has
3274 * translational symmetry.
3276 static bool
3277 MultiXactOffsetPagePrecedes(int64 page1, int64 page2)
3279 MultiXactId multi1;
3280 MultiXactId multi2;
3282 multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3283 multi1 += FirstMultiXactId + 1;
3284 multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3285 multi2 += FirstMultiXactId + 1;
3287 return (MultiXactIdPrecedes(multi1, multi2) &&
3288 MultiXactIdPrecedes(multi1,
3289 multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
3293 * Decide whether a MultiXactMember page number is "older" for truncation
3294 * purposes. There is no "invalid offset number" so use the numbers verbatim.
3296 static bool
3297 MultiXactMemberPagePrecedes(int64 page1, int64 page2)
3299 MultiXactOffset offset1;
3300 MultiXactOffset offset2;
3302 offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
3303 offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
3305 return (MultiXactOffsetPrecedes(offset1, offset2) &&
3306 MultiXactOffsetPrecedes(offset1,
3307 offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
3311 * Decide which of two MultiXactIds is earlier.
3313 * XXX do we need to do something special for InvalidMultiXactId?
3314 * (Doesn't look like it.)
3316 bool
3317 MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
3319 int32 diff = (int32) (multi1 - multi2);
3321 return (diff < 0);
3325 * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3327 * XXX do we need to do something special for InvalidMultiXactId?
3328 * (Doesn't look like it.)
3330 bool
3331 MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
3333 int32 diff = (int32) (multi1 - multi2);
3335 return (diff <= 0);
3340 * Decide which of two offsets is earlier.
3342 static bool
3343 MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
3345 int32 diff = (int32) (offset1 - offset2);
3347 return (diff < 0);
3351 * Write an xlog record reflecting the zeroing of either a MEMBERs or
3352 * OFFSETs page (info shows which)
3354 static void
3355 WriteMZeroPageXlogRec(int64 pageno, uint8 info)
3357 XLogBeginInsert();
3358 XLogRegisterData((char *) (&pageno), sizeof(pageno));
3359 (void) XLogInsert(RM_MULTIXACT_ID, info);
3363 * Write a TRUNCATE xlog record
3365 * We must flush the xlog record to disk before returning --- see notes in
3366 * TruncateCLOG().
3368 static void
3369 WriteMTruncateXlogRec(Oid oldestMultiDB,
3370 MultiXactId startTruncOff, MultiXactId endTruncOff,
3371 MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
3373 XLogRecPtr recptr;
3374 xl_multixact_truncate xlrec;
3376 xlrec.oldestMultiDB = oldestMultiDB;
3378 xlrec.startTruncOff = startTruncOff;
3379 xlrec.endTruncOff = endTruncOff;
3381 xlrec.startTruncMemb = startTruncMemb;
3382 xlrec.endTruncMemb = endTruncMemb;
3384 XLogBeginInsert();
3385 XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate);
3386 recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
3387 XLogFlush(recptr);
3391 * MULTIXACT resource manager's routines
3393 void
3394 multixact_redo(XLogReaderState *record)
3396 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
3398 /* Backup blocks are not used in multixact records */
3399 Assert(!XLogRecHasAnyBlockRefs(record));
3401 if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
3403 int64 pageno;
3404 int slotno;
3405 LWLock *lock;
3407 memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3409 lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
3410 LWLockAcquire(lock, LW_EXCLUSIVE);
3412 slotno = ZeroMultiXactOffsetPage(pageno, false);
3413 SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3414 Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3416 LWLockRelease(lock);
3418 else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
3420 int64 pageno;
3421 int slotno;
3422 LWLock *lock;
3424 memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
3426 lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno);
3427 LWLockAcquire(lock, LW_EXCLUSIVE);
3429 slotno = ZeroMultiXactMemberPage(pageno, false);
3430 SimpleLruWritePage(MultiXactMemberCtl, slotno);
3431 Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
3433 LWLockRelease(lock);
3435 else if (info == XLOG_MULTIXACT_CREATE_ID)
3437 xl_multixact_create *xlrec =
3438 (xl_multixact_create *) XLogRecGetData(record);
3439 TransactionId max_xid;
3440 int i;
3442 /* Store the data back into the SLRU files */
3443 RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
3444 xlrec->members);
3446 /* Make sure nextMXact/nextOffset are beyond what this record has */
3447 MultiXactAdvanceNextMXact(xlrec->mid + 1,
3448 xlrec->moff + xlrec->nmembers);
3451 * Make sure nextXid is beyond any XID mentioned in the record. This
3452 * should be unnecessary, since any XID found here ought to have other
3453 * evidence in the XLOG, but let's be safe.
3455 max_xid = XLogRecGetXid(record);
3456 for (i = 0; i < xlrec->nmembers; i++)
3458 if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
3459 max_xid = xlrec->members[i].xid;
3462 AdvanceNextFullTransactionIdPastXid(max_xid);
3464 else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
3466 xl_multixact_truncate xlrec;
3467 int64 pageno;
3469 memcpy(&xlrec, XLogRecGetData(record),
3470 SizeOfMultiXactTruncate);
3472 elog(DEBUG1, "replaying multixact truncation: "
3473 "offsets [%u, %u), offsets segments [%llx, %llx), "
3474 "members [%u, %u), members segments [%llx, %llx)",
3475 xlrec.startTruncOff, xlrec.endTruncOff,
3476 (unsigned long long) MultiXactIdToOffsetSegment(xlrec.startTruncOff),
3477 (unsigned long long) MultiXactIdToOffsetSegment(xlrec.endTruncOff),
3478 xlrec.startTruncMemb, xlrec.endTruncMemb,
3479 (unsigned long long) MXOffsetToMemberSegment(xlrec.startTruncMemb),
3480 (unsigned long long) MXOffsetToMemberSegment(xlrec.endTruncMemb));
3482 /* should not be required, but more than cheap enough */
3483 LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
3486 * Advance the horizon values, so they're current at the end of
3487 * recovery.
3489 SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
3491 PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb);
3494 * During XLOG replay, latest_page_number isn't necessarily set up
3495 * yet; insert a suitable value to bypass the sanity test in
3496 * SimpleLruTruncate.
3498 pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
3499 pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number,
3500 pageno);
3501 PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff);
3503 LWLockRelease(MultiXactTruncationLock);
3505 else
3506 elog(PANIC, "multixact_redo: unknown op code %u", info);
3509 Datum
3510 pg_get_multixact_members(PG_FUNCTION_ARGS)
3512 typedef struct
3514 MultiXactMember *members;
3515 int nmembers;
3516 int iter;
3517 } mxact;
3518 MultiXactId mxid = PG_GETARG_TRANSACTIONID(0);
3519 mxact *multi;
3520 FuncCallContext *funccxt;
3522 if (mxid < FirstMultiXactId)
3523 ereport(ERROR,
3524 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3525 errmsg("invalid MultiXactId: %u", mxid)));
3527 if (SRF_IS_FIRSTCALL())
3529 MemoryContext oldcxt;
3530 TupleDesc tupdesc;
3532 funccxt = SRF_FIRSTCALL_INIT();
3533 oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx);
3535 multi = palloc(sizeof(mxact));
3536 /* no need to allow for old values here */
3537 multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false,
3538 false);
3539 multi->iter = 0;
3541 if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
3542 elog(ERROR, "return type must be a row type");
3543 funccxt->tuple_desc = tupdesc;
3544 funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc);
3545 funccxt->user_fctx = multi;
3547 MemoryContextSwitchTo(oldcxt);
3550 funccxt = SRF_PERCALL_SETUP();
3551 multi = (mxact *) funccxt->user_fctx;
3553 while (multi->iter < multi->nmembers)
3555 HeapTuple tuple;
3556 char *values[2];
3558 values[0] = psprintf("%u", multi->members[multi->iter].xid);
3559 values[1] = mxstatus_to_string(multi->members[multi->iter].status);
3561 tuple = BuildTupleFromCStrings(funccxt->attinmeta, values);
3563 multi->iter++;
3564 pfree(values[0]);
3565 SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple));
3568 SRF_RETURN_DONE(funccxt);
3572 * Entrypoint for sync.c to sync offsets files.
3575 multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
3577 return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
3581 * Entrypoint for sync.c to sync members files.
3584 multixactmemberssyncfiletag(const FileTag *ftag, char *path)
3586 return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);