1 /*-------------------------------------------------------------------------
4 * PostgreSQL multi-transaction-log manager
6 * The pg_multixact manager is a pg_xact-like manager that stores an array of
7 * MultiXactMember for each MultiXactId. It is a fundamental part of the
8 * shared-row-lock implementation. Each MultiXactMember is comprised of a
9 * TransactionId and a set of flag bits. The name is a bit historical:
10 * originally, a MultiXactId consisted of more than one TransactionId (except
11 * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
12 * legitimate to have MultiXactIds that only include a single Xid.
14 * The meaning of the flag bits is opaque to this module, but they are mostly
15 * used in heapam.c to identify lock modes that each of the member transactions
16 * is holding on any given tuple. This module just contains support to store
17 * and retrieve the arrays.
19 * We use two SLRU areas, one for storing the offsets at which the data
20 * starts for each MultiXactId in the other one. This trick allows us to
21 * store variable length arrays of TransactionIds. (We could alternatively
22 * use one area containing counts and TransactionIds, with valid MultiXactId
23 * values pointing at slots containing counts; but that way seems less robust
24 * since it would get completely confused if someone inquired about a bogus
25 * MultiXactId that pointed to an intermediate slot containing an XID.)
27 * XLOG interactions: this module generates a record whenever a new OFFSETs or
28 * MEMBERs page is initialized to zeroes, as well as an
29 * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
30 * This module ignores the WAL rule "write xlog before data," because it
31 * suffices that actions recording a MultiXactId in a heap xmax do follow that
32 * rule. The only way for the MXID to be referenced from any data page is for
33 * heap_lock_tuple() or heap_update() to have put it there, and each generates
34 * an XLOG record that must follow ours. The normal LSN interlock between the
35 * data page and that XLOG record will ensure that our XLOG record reaches
36 * disk first. If the SLRU members/offsets data reaches disk sooner than the
37 * XLOG records, we do not care; after recovery, no xmax will refer to it. On
38 * the flip side, to ensure that all referenced entries _do_ reach disk, this
39 * module's XLOG records completely rebuild the data entered since the last
40 * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
41 * before each checkpoint is considered complete.
43 * Like clog.c, and unlike subtrans.c, we have to preserve state across
44 * crashes and ensure that MXID and offset numbering increases monotonically
45 * across a crash. We do this in the same way as it's done for transaction
46 * IDs: the WAL record is guaranteed to contain evidence of every MXID we
47 * could need to worry about, and we just make sure that at the end of
48 * replay, the next-MXID and next-offset counters are at least as large as
49 * anything we saw during replay.
51 * We are able to remove segments no longer necessary by carefully tracking
52 * each table's used values: during vacuum, any multixact older than a certain
53 * value is removed; the cutoff value is stored in pg_class. The minimum value
54 * across all tables in each database is stored in pg_database, and the global
55 * minimum across all databases is part of pg_control and is kept in shared
56 * memory. Whenever that minimum is advanced, the SLRUs are truncated.
 * When new multixactid values are to be created, care is taken that the
 * counter does not fall within the wraparound horizon considering the global
 * minimum value.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/transam/multixact.c
 *
 *-------------------------------------------------------------------------
 */
71 #include "access/multixact.h"
72 #include "access/slru.h"
73 #include "access/transam.h"
74 #include "access/twophase.h"
75 #include "access/twophase_rmgr.h"
76 #include "access/xact.h"
77 #include "access/xlog.h"
78 #include "access/xloginsert.h"
79 #include "access/xlogutils.h"
80 #include "commands/dbcommands.h"
82 #include "lib/ilist.h"
83 #include "miscadmin.h"
86 #include "postmaster/autovacuum.h"
87 #include "storage/pmsignal.h"
88 #include "storage/proc.h"
89 #include "storage/procarray.h"
90 #include "utils/fmgrprotos.h"
91 #include "utils/guc_hooks.h"
92 #include "utils/injection_point.h"
93 #include "utils/memutils.h"
/*
 * Defines for MultiXactOffset page sizes.  A page is the same BLCKSZ as is
 * used everywhere else in Postgres.
 *
 * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
 * MultiXact page numbering also wraps around at
 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
 * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need
 * take no explicit notice of that fact in this module, except when comparing
 * segment and page numbers in TruncateMultiXact (see
 * MultiXactOffsetPagePrecedes).
 */
109 /* We need four bytes per offset */
110 #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
113 MultiXactIdToOffsetPage(MultiXactId multi
)
115 return multi
/ MULTIXACT_OFFSETS_PER_PAGE
;
119 MultiXactIdToOffsetEntry(MultiXactId multi
)
121 return multi
% MULTIXACT_OFFSETS_PER_PAGE
;
125 MultiXactIdToOffsetSegment(MultiXactId multi
)
127 return MultiXactIdToOffsetPage(multi
) / SLRU_PAGES_PER_SEGMENT
;
/*
 * The situation for members is a bit more complex: we store one byte of
 * additional flag bits for each TransactionId.  To do this without getting
 * into alignment issues, we store four bytes of flags, and then the
 * corresponding 4 Xids.  We call each such 5-word (20-byte) set a "group";
 * groups are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep 409
 * groups per page.  This wastes 12 bytes per page, but that's OK --
 * simplicity (and performance) trumps space efficiency here.
 *
 * Note that the "offset" macros work with byte offset, not array indexes, so
 * arithmetic must be done using "char *" pointers.
 */
142 /* We need eight bits per xact, so one xact fits in a byte */
143 #define MXACT_MEMBER_BITS_PER_XACT 8
144 #define MXACT_MEMBER_FLAGS_PER_BYTE 1
145 #define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
147 /* how many full bytes of flags are there in a group? */
148 #define MULTIXACT_FLAGBYTES_PER_GROUP 4
149 #define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
150 (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
151 /* size in bytes of a complete group */
152 #define MULTIXACT_MEMBERGROUP_SIZE \
153 (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
154 #define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
155 #define MULTIXACT_MEMBERS_PER_PAGE \
156 (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
/*
 * Because the number of items per page is not a divisor of the last item
 * number (member 0xFFFFFFFF), the last segment does not use the maximum number
 * of pages, and moreover the last used page therein does not use the same
 * number of items as previous pages.  (Another way to say it is that the
 * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
 * has some empty space after that item.)
 *
 * This constant is the number of members in the last page of the last segment.
 */
168 #define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
169 ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
171 /* page in which a member is to be found */
173 MXOffsetToMemberPage(MultiXactOffset offset
)
175 return offset
/ MULTIXACT_MEMBERS_PER_PAGE
;
179 MXOffsetToMemberSegment(MultiXactOffset offset
)
181 return MXOffsetToMemberPage(offset
) / SLRU_PAGES_PER_SEGMENT
;
184 /* Location (byte offset within page) of flag word for a given member */
186 MXOffsetToFlagsOffset(MultiXactOffset offset
)
188 MultiXactOffset group
= offset
/ MULTIXACT_MEMBERS_PER_MEMBERGROUP
;
189 int grouponpg
= group
% MULTIXACT_MEMBERGROUPS_PER_PAGE
;
190 int byteoff
= grouponpg
* MULTIXACT_MEMBERGROUP_SIZE
;
196 MXOffsetToFlagsBitShift(MultiXactOffset offset
)
198 int member_in_group
= offset
% MULTIXACT_MEMBERS_PER_MEMBERGROUP
;
199 int bshift
= member_in_group
* MXACT_MEMBER_BITS_PER_XACT
;
204 /* Location (byte offset within page) of TransactionId of given member */
206 MXOffsetToMemberOffset(MultiXactOffset offset
)
208 int member_in_group
= offset
% MULTIXACT_MEMBERS_PER_MEMBERGROUP
;
210 return MXOffsetToFlagsOffset(offset
) +
211 MULTIXACT_FLAGBYTES_PER_GROUP
+
212 member_in_group
* sizeof(TransactionId
);
215 /* Multixact members wraparound thresholds. */
216 #define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
217 #define MULTIXACT_MEMBER_DANGER_THRESHOLD \
218 (MaxMultiXactOffset - MaxMultiXactOffset / 4)
220 static inline MultiXactId
221 PreviousMultiXactId(MultiXactId multi
)
223 return multi
== FirstMultiXactId
? MaxMultiXactId
: multi
- 1;
227 * Links to shared-memory data structures for MultiXact control
229 static SlruCtlData MultiXactOffsetCtlData
;
230 static SlruCtlData MultiXactMemberCtlData
;
232 #define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
233 #define MultiXactMemberCtl (&MultiXactMemberCtlData)
236 * MultiXact state shared across all backends. All this state is protected
237 * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and
238 * MultiXactMember to guard accesses to the two sets of SLRU buffers. For
239 * concurrency's sake, we avoid holding more than one of these locks at a
242 typedef struct MultiXactStateData
244 /* next-to-be-assigned MultiXactId */
245 MultiXactId nextMXact
;
247 /* next-to-be-assigned offset */
248 MultiXactOffset nextOffset
;
250 /* Have we completed multixact startup? */
251 bool finishedStartup
;
254 * Oldest multixact that is still potentially referenced by a relation.
255 * Anything older than this should not be consulted. These values are
258 MultiXactId oldestMultiXactId
;
259 Oid oldestMultiXactDB
;
262 * Oldest multixact offset that is potentially referenced by a multixact
263 * referenced by a relation. We don't always know this value, so there's
264 * a flag here to indicate whether or not we currently do.
266 MultiXactOffset oldestOffset
;
267 bool oldestOffsetKnown
;
269 /* support for anti-wraparound measures */
270 MultiXactId multiVacLimit
;
271 MultiXactId multiWarnLimit
;
272 MultiXactId multiStopLimit
;
273 MultiXactId multiWrapLimit
;
275 /* support for members anti-wraparound measures */
276 MultiXactOffset offsetStopLimit
; /* known if oldestOffsetKnown */
279 * This is used to sleep until a multixact offset is written when we want
280 * to create the next one.
282 ConditionVariable nextoff_cv
;
285 * Per-backend data starts here. We have two arrays stored in the area
286 * immediately following the MultiXactStateData struct. Each is indexed by
289 * In both arrays, there's a slot for all normal backends
290 * (0..MaxBackends-1) followed by a slot for max_prepared_xacts prepared
293 * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
294 * transaction(s) could possibly be a member of, or InvalidMultiXactId
295 * when the backend has no live transaction that could possibly be a
296 * member of a MultiXact. Each backend sets its entry to the current
297 * nextMXact counter just before first acquiring a shared lock in a given
298 * transaction, and clears it at transaction end. (This works because only
299 * during or after acquiring a shared lock could an XID possibly become a
300 * member of a MultiXact, and that MultiXact would have to be created
301 * during or after the lock acquisition.)
303 * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
304 * current transaction(s) think is potentially live, or InvalidMultiXactId
305 * when not in a transaction or not in a transaction that's paid any
306 * attention to MultiXacts yet. This is computed when first needed in a
307 * given transaction, and cleared at transaction end. We can compute it
308 * as the minimum of the valid OldestMemberMXactId[] entries at the time
309 * we compute it (using nextMXact if none are valid). Each backend is
310 * required not to attempt to access any SLRU data for MultiXactIds older
311 * than its own OldestVisibleMXactId[] setting; this is necessary because
312 * the relevant SLRU data can be concurrently truncated away.
314 * The oldest valid value among all of the OldestMemberMXactId[] and
315 * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
316 * possible value still having any live member transaction -- OldestMxact.
317 * Any value older than that is typically removed from tuple headers, or
318 * "frozen" via being replaced with a new xmax. VACUUM can sometimes even
319 * remove an individual MultiXact xmax whose value is >= its OldestMxact
320 * cutoff, though typically only when no individual member XID is still
321 * running. See FreezeMultiXactId for full details.
323 * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff
324 * or the oldest extant Multi remaining in the table is used as the new
325 * pg_class.relminmxid value (whichever is earlier). The minimum of all
326 * relminmxid values in each database is stored in pg_database.datminmxid.
327 * In turn, the minimum of all of those values is stored in pg_control.
328 * This is used as the truncation point for pg_multixact when unneeded
329 * segments get removed by vac_truncate_clog() during vacuuming.
331 MultiXactId perBackendXactIds
[FLEXIBLE_ARRAY_MEMBER
];
332 } MultiXactStateData
;
335 * Size of OldestMemberMXactId and OldestVisibleMXactId arrays.
337 #define MaxOldestSlot (MaxBackends + max_prepared_xacts)
339 /* Pointers to the state data in shared memory */
340 static MultiXactStateData
*MultiXactState
;
341 static MultiXactId
*OldestMemberMXactId
;
342 static MultiXactId
*OldestVisibleMXactId
;
346 * Definitions for the backend-local MultiXactId cache.
348 * We use this cache to store known MultiXacts, so we don't need to go to
349 * SLRU areas every time.
351 * The cache lasts for the duration of a single transaction, the rationale
352 * for this being that most entries will contain our own TransactionId and
353 * so they will be uninteresting by the time our next transaction starts.
354 * (XXX not clear that this is correct --- other members of the MultiXact
355 * could hang around longer than we did. However, it's not clear what a
356 * better policy for flushing old cache entries would be.) FIXME actually
357 * this is plain wrong now that multixact's may contain update Xids.
359 * We allocate the cache entries in a memory context that is deleted at
360 * transaction end, so we don't need to do retail freeing of entries.
362 typedef struct mXactCacheEnt
367 MultiXactMember members
[FLEXIBLE_ARRAY_MEMBER
];
370 #define MAX_CACHE_ENTRIES 256
371 static dclist_head MXactCache
= DCLIST_STATIC_INIT(MXactCache
);
372 static MemoryContext MXactContext
= NULL
;
/*
 * Debug logging helpers: these expand to elog() calls only when
 * MULTIXACT_DEBUG is defined, and to nothing otherwise.  The numeric suffix
 * is the total number of macro arguments.
 */
#ifdef MULTIXACT_DEBUG
#define debug_elog2(a,b) elog(a,b)
#define debug_elog3(a,b,c) elog(a,b,c)
#define debug_elog4(a,b,c,d) elog(a,b,c,d)
#define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
#define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
#else
#define debug_elog2(a,b)
#define debug_elog3(a,b,c)
#define debug_elog4(a,b,c,d)
#define debug_elog5(a,b,c,d,e)
#define debug_elog6(a,b,c,d,e,f)
#endif
388 /* internal MultiXactId management */
389 static void MultiXactIdSetOldestVisible(void);
390 static void RecordNewMultiXact(MultiXactId multi
, MultiXactOffset offset
,
391 int nmembers
, MultiXactMember
*members
);
392 static MultiXactId
GetNewMultiXactId(int nmembers
, MultiXactOffset
*offset
);
394 /* MultiXact cache management */
395 static int mxactMemberComparator(const void *arg1
, const void *arg2
);
396 static MultiXactId
mXactCacheGetBySet(int nmembers
, MultiXactMember
*members
);
397 static int mXactCacheGetById(MultiXactId multi
, MultiXactMember
**members
);
398 static void mXactCachePut(MultiXactId multi
, int nmembers
,
399 MultiXactMember
*members
);
401 static char *mxstatus_to_string(MultiXactStatus status
);
403 /* management of SLRU infrastructure */
404 static int ZeroMultiXactOffsetPage(int64 pageno
, bool writeXlog
);
405 static int ZeroMultiXactMemberPage(int64 pageno
, bool writeXlog
);
406 static bool MultiXactOffsetPagePrecedes(int64 page1
, int64 page2
);
407 static bool MultiXactMemberPagePrecedes(int64 page1
, int64 page2
);
408 static bool MultiXactOffsetPrecedes(MultiXactOffset offset1
,
409 MultiXactOffset offset2
);
410 static void ExtendMultiXactOffset(MultiXactId multi
);
411 static void ExtendMultiXactMember(MultiXactOffset offset
, int nmembers
);
412 static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary
,
413 MultiXactOffset start
, uint32 distance
);
414 static bool SetOffsetVacuumLimit(bool is_startup
);
415 static bool find_multixact_start(MultiXactId multi
, MultiXactOffset
*result
);
416 static void WriteMZeroPageXlogRec(int64 pageno
, uint8 info
);
417 static void WriteMTruncateXlogRec(Oid oldestMultiDB
,
418 MultiXactId startTruncOff
,
419 MultiXactId endTruncOff
,
420 MultiXactOffset startTruncMemb
,
421 MultiXactOffset endTruncMemb
);
426 * Construct a MultiXactId representing two TransactionIds.
428 * The two XIDs must be different, or be requesting different statuses.
430 * NB - we don't worry about our local MultiXactId cache here, because that
431 * is handled by the lower-level routines.
434 MultiXactIdCreate(TransactionId xid1
, MultiXactStatus status1
,
435 TransactionId xid2
, MultiXactStatus status2
)
437 MultiXactId newMulti
;
438 MultiXactMember members
[2];
440 Assert(TransactionIdIsValid(xid1
));
441 Assert(TransactionIdIsValid(xid2
));
443 Assert(!TransactionIdEquals(xid1
, xid2
) || (status1
!= status2
));
445 /* MultiXactIdSetOldestMember() must have been called already. */
446 Assert(MultiXactIdIsValid(OldestMemberMXactId
[MyProcNumber
]));
449 * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
450 * are still running. In typical usage, xid2 will be our own XID and the
451 * caller just did a check on xid1, so it'd be wasted effort.
454 members
[0].xid
= xid1
;
455 members
[0].status
= status1
;
456 members
[1].xid
= xid2
;
457 members
[1].status
= status2
;
459 newMulti
= MultiXactIdCreateFromMembers(2, members
);
461 debug_elog3(DEBUG2
, "Create: %s",
462 mxid_to_string(newMulti
, 2, members
));
469 * Add a TransactionId to a pre-existing MultiXactId.
471 * If the TransactionId is already a member of the passed MultiXactId with the
472 * same status, just return it as-is.
474 * Note that we do NOT actually modify the membership of a pre-existing
475 * MultiXactId; instead we create a new one. This is necessary to avoid
476 * a race condition against code trying to wait for one MultiXactId to finish;
477 * see notes in heapam.c.
479 * NB - we don't worry about our local MultiXactId cache here, because that
480 * is handled by the lower-level routines.
482 * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
483 * one upgraded by pg_upgrade from a cluster older than this feature) are not
487 MultiXactIdExpand(MultiXactId multi
, TransactionId xid
, MultiXactStatus status
)
489 MultiXactId newMulti
;
490 MultiXactMember
*members
;
491 MultiXactMember
*newMembers
;
496 Assert(MultiXactIdIsValid(multi
));
497 Assert(TransactionIdIsValid(xid
));
499 /* MultiXactIdSetOldestMember() must have been called already. */
500 Assert(MultiXactIdIsValid(OldestMemberMXactId
[MyProcNumber
]));
502 debug_elog5(DEBUG2
, "Expand: received multi %u, xid %u status %s",
503 multi
, xid
, mxstatus_to_string(status
));
506 * Note: we don't allow for old multis here. The reason is that the only
507 * caller of this function does a check that the multixact is no longer
510 nmembers
= GetMultiXactIdMembers(multi
, &members
, false, false);
514 MultiXactMember member
;
517 * The MultiXactId is obsolete. This can only happen if all the
518 * MultiXactId members stop running between the caller checking and
519 * passing it to us. It would be better to return that fact to the
520 * caller, but it would complicate the API and it's unlikely to happen
521 * too often, so just deal with it by creating a singleton MultiXact.
524 member
.status
= status
;
525 newMulti
= MultiXactIdCreateFromMembers(1, &member
);
527 debug_elog4(DEBUG2
, "Expand: %u has no members, create singleton %u",
533 * If the TransactionId is already a member of the MultiXactId with the
534 * same status, just return the existing MultiXactId.
536 for (i
= 0; i
< nmembers
; i
++)
538 if (TransactionIdEquals(members
[i
].xid
, xid
) &&
539 (members
[i
].status
== status
))
541 debug_elog4(DEBUG2
, "Expand: %u is already a member of %u",
549 * Determine which of the members of the MultiXactId are still of
550 * interest. This is any running transaction, and also any transaction
551 * that grabbed something stronger than just a lock and was committed. (An
552 * update that aborted is of no interest here; and having more than one
553 * update Xid in a multixact would cause errors elsewhere.)
555 * Removing dead members is not just an optimization: freezing of tuples
556 * whose Xmax are multis depends on this behavior.
558 * Note we have the same race condition here as above: j could be 0 at the
561 newMembers
= (MultiXactMember
*)
562 palloc(sizeof(MultiXactMember
) * (nmembers
+ 1));
564 for (i
= 0, j
= 0; i
< nmembers
; i
++)
566 if (TransactionIdIsInProgress(members
[i
].xid
) ||
567 (ISUPDATE_from_mxstatus(members
[i
].status
) &&
568 TransactionIdDidCommit(members
[i
].xid
)))
570 newMembers
[j
].xid
= members
[i
].xid
;
571 newMembers
[j
++].status
= members
[i
].status
;
575 newMembers
[j
].xid
= xid
;
576 newMembers
[j
++].status
= status
;
577 newMulti
= MultiXactIdCreateFromMembers(j
, newMembers
);
582 debug_elog3(DEBUG2
, "Expand: returning new multi %u", newMulti
);
588 * MultiXactIdIsRunning
589 * Returns whether a MultiXactId is "running".
591 * We return true if at least one member of the given MultiXactId is still
592 * running. Note that a "false" result is certain not to change,
593 * because it is not legal to add members to an existing MultiXactId.
595 * Caller is expected to have verified that the multixact does not come from
596 * a pg_upgraded share-locked tuple.
599 MultiXactIdIsRunning(MultiXactId multi
, bool isLockOnly
)
601 MultiXactMember
*members
;
605 debug_elog3(DEBUG2
, "IsRunning %u?", multi
);
608 * "false" here means we assume our callers have checked that the given
609 * multi cannot possibly come from a pg_upgraded database.
611 nmembers
= GetMultiXactIdMembers(multi
, &members
, false, isLockOnly
);
615 debug_elog2(DEBUG2
, "IsRunning: no members");
620 * Checking for myself is cheap compared to looking in shared memory;
621 * return true if any live subtransaction of the current top-level
622 * transaction is a member.
624 * This is not needed for correctness, it's just a fast path.
626 for (i
= 0; i
< nmembers
; i
++)
628 if (TransactionIdIsCurrentTransactionId(members
[i
].xid
))
630 debug_elog3(DEBUG2
, "IsRunning: I (%d) am running!", i
);
637 * This could be made faster by having another entry point in procarray.c,
638 * walking the PGPROC array only once for all the members. But in most
639 * cases nmembers should be small enough that it doesn't much matter.
641 for (i
= 0; i
< nmembers
; i
++)
643 if (TransactionIdIsInProgress(members
[i
].xid
))
645 debug_elog4(DEBUG2
, "IsRunning: member %d (%u) is running",
654 debug_elog3(DEBUG2
, "IsRunning: %u is not running", multi
);
660 * MultiXactIdSetOldestMember
661 * Save the oldest MultiXactId this transaction could be a member of.
663 * We set the OldestMemberMXactId for a given transaction the first time it's
664 * going to do some operation that might require a MultiXactId (tuple lock,
665 * update or delete). We need to do this even if we end up using a
666 * TransactionId instead of a MultiXactId, because there is a chance that
667 * another transaction would add our XID to a MultiXactId.
669 * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
670 * be called just before doing any such possibly-MultiXactId-able operation.
673 MultiXactIdSetOldestMember(void)
675 if (!MultiXactIdIsValid(OldestMemberMXactId
[MyProcNumber
]))
677 MultiXactId nextMXact
;
680 * You might think we don't need to acquire a lock here, since
681 * fetching and storing of TransactionIds is probably atomic, but in
682 * fact we do: suppose we pick up nextMXact and then lose the CPU for
683 * a long time. Someone else could advance nextMXact, and then
684 * another someone else could compute an OldestVisibleMXactId that
685 * would be after the value we are going to store when we get control
686 * back. Which would be wrong.
688 * Note that a shared lock is sufficient, because it's enough to stop
689 * someone from advancing nextMXact; and nobody else could be trying
690 * to write to our OldestMember entry, only reading (and we assume
691 * storing it is atomic.)
693 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
696 * We have to beware of the possibility that nextMXact is in the
697 * wrapped-around state. We don't fix the counter itself here, but we
698 * must be sure to store a valid value in our array entry.
700 nextMXact
= MultiXactState
->nextMXact
;
701 if (nextMXact
< FirstMultiXactId
)
702 nextMXact
= FirstMultiXactId
;
704 OldestMemberMXactId
[MyProcNumber
] = nextMXact
;
706 LWLockRelease(MultiXactGenLock
);
708 debug_elog4(DEBUG2
, "MultiXact: setting OldestMember[%d] = %u",
709 MyProcNumber
, nextMXact
);
714 * MultiXactIdSetOldestVisible
715 * Save the oldest MultiXactId this transaction considers possibly live.
717 * We set the OldestVisibleMXactId for a given transaction the first time
718 * it's going to inspect any MultiXactId. Once we have set this, we are
719 * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId
720 * won't be truncated away.
722 * The value to set is the oldest of nextMXact and all the valid per-backend
723 * OldestMemberMXactId[] entries. Because of the locking we do, we can be
724 * certain that no subsequent call to MultiXactIdSetOldestMember can set
725 * an OldestMemberMXactId[] entry older than what we compute here. Therefore
726 * there is no live transaction, now or later, that can be a member of any
727 * MultiXactId older than the OldestVisibleMXactId we compute here.
730 MultiXactIdSetOldestVisible(void)
732 if (!MultiXactIdIsValid(OldestVisibleMXactId
[MyProcNumber
]))
734 MultiXactId oldestMXact
;
737 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
740 * We have to beware of the possibility that nextMXact is in the
741 * wrapped-around state. We don't fix the counter itself here, but we
742 * must be sure to store a valid value in our array entry.
744 oldestMXact
= MultiXactState
->nextMXact
;
745 if (oldestMXact
< FirstMultiXactId
)
746 oldestMXact
= FirstMultiXactId
;
748 for (i
= 0; i
< MaxOldestSlot
; i
++)
750 MultiXactId thisoldest
= OldestMemberMXactId
[i
];
752 if (MultiXactIdIsValid(thisoldest
) &&
753 MultiXactIdPrecedes(thisoldest
, oldestMXact
))
754 oldestMXact
= thisoldest
;
757 OldestVisibleMXactId
[MyProcNumber
] = oldestMXact
;
759 LWLockRelease(MultiXactGenLock
);
761 debug_elog4(DEBUG2
, "MultiXact: setting OldestVisible[%d] = %u",
762 MyProcNumber
, oldestMXact
);
767 * ReadNextMultiXactId
768 * Return the next MultiXactId to be assigned, but don't allocate it
771 ReadNextMultiXactId(void)
775 /* XXX we could presumably do this without a lock. */
776 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
777 mxid
= MultiXactState
->nextMXact
;
778 LWLockRelease(MultiXactGenLock
);
780 if (mxid
< FirstMultiXactId
)
781 mxid
= FirstMultiXactId
;
787 * ReadMultiXactIdRange
788 * Get the range of IDs that may still be referenced by a relation.
791 ReadMultiXactIdRange(MultiXactId
*oldest
, MultiXactId
*next
)
793 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
794 *oldest
= MultiXactState
->oldestMultiXactId
;
795 *next
= MultiXactState
->nextMXact
;
796 LWLockRelease(MultiXactGenLock
);
798 if (*oldest
< FirstMultiXactId
)
799 *oldest
= FirstMultiXactId
;
800 if (*next
< FirstMultiXactId
)
801 *next
= FirstMultiXactId
;
806 * MultiXactIdCreateFromMembers
807 * Make a new MultiXactId from the specified set of members
809 * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
810 * given TransactionIds as members. Returns the newly created MultiXactId.
812 * NB: the passed members[] array will be sorted in-place.
815 MultiXactIdCreateFromMembers(int nmembers
, MultiXactMember
*members
)
818 MultiXactOffset offset
;
819 xl_multixact_create xlrec
;
821 debug_elog3(DEBUG2
, "Create: %s",
822 mxid_to_string(InvalidMultiXactId
, nmembers
, members
));
825 * See if the same set of members already exists in our cache; if so, just
826 * re-use that MultiXactId. (Note: it might seem that looking in our
827 * cache is insufficient, and we ought to search disk to see if a
828 * duplicate definition already exists. But since we only ever create
829 * MultiXacts containing our own XID, in most cases any such MultiXacts
830 * were in fact created by us, and so will be in our cache. There are
831 * corner cases where someone else added us to a MultiXact without our
832 * knowledge, but it's not worth checking for.)
834 multi
= mXactCacheGetBySet(nmembers
, members
);
835 if (MultiXactIdIsValid(multi
))
837 debug_elog2(DEBUG2
, "Create: in cache!");
841 /* Verify that there is a single update Xid among the given members. */
844 bool has_update
= false;
846 for (i
= 0; i
< nmembers
; i
++)
848 if (ISUPDATE_from_mxstatus(members
[i
].status
))
851 elog(ERROR
, "new multixact has more than one updating member: %s",
852 mxid_to_string(InvalidMultiXactId
, nmembers
, members
));
858 /* Load the injection point before entering the critical section */
859 INJECTION_POINT_LOAD("multixact-create-from-members");
862 * Assign the MXID and offsets range to use, and make sure there is space
863 * in the OFFSETs and MEMBERs files. NB: this routine does
864 * START_CRIT_SECTION().
866 * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
867 * that we've called MultiXactIdSetOldestMember here. This is because
868 * this routine is used in some places to create new MultiXactIds of which
869 * the current backend is not a member, notably during freezing of multis
870 * in vacuum. During vacuum, in particular, it would be unacceptable to
871 * keep OldestMulti set, in case it runs for long.
873 multi
= GetNewMultiXactId(nmembers
, &offset
);
875 INJECTION_POINT_CACHED("multixact-create-from-members");
877 /* Make an XLOG entry describing the new MXID. */
880 xlrec
.nmembers
= nmembers
;
883 * XXX Note: there's a lot of padding space in MultiXactMember. We could
884 * find a more compact representation of this Xlog record -- perhaps all
885 * the status flags in one XLogRecData, then all the xids in another one?
886 * Not clear that it's worth the trouble though.
889 XLogRegisterData((char *) (&xlrec
), SizeOfMultiXactCreate
);
890 XLogRegisterData((char *) members
, nmembers
* sizeof(MultiXactMember
));
892 (void) XLogInsert(RM_MULTIXACT_ID
, XLOG_MULTIXACT_CREATE_ID
);
894 /* Now enter the information into the OFFSETs and MEMBERs logs */
895 RecordNewMultiXact(multi
, offset
, nmembers
, members
);
897 /* Done with critical section */
900 /* Store the new MultiXactId in the local cache, too */
901 mXactCachePut(multi
, nmembers
, members
);
903 debug_elog2(DEBUG2
, "Create: all done");
910 * Write info about a new multixact into the offsets and members files
912 * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
916 RecordNewMultiXact(MultiXactId multi
, MultiXactOffset offset
,
917 int nmembers
, MultiXactMember
*members
)
923 MultiXactOffset
*offptr
;
926 LWLock
*prevlock
= NULL
;
928 pageno
= MultiXactIdToOffsetPage(multi
);
929 entryno
= MultiXactIdToOffsetEntry(multi
);
931 lock
= SimpleLruGetBankLock(MultiXactOffsetCtl
, pageno
);
932 LWLockAcquire(lock
, LW_EXCLUSIVE
);
935 * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
936 * to complain about if there's any I/O error. This is kinda bogus, but
937 * since the errors will always give the full pathname, it should be clear
938 * enough that a MultiXactId is really involved. Perhaps someday we'll
939 * take the trouble to generalize the slru.c error reporting code.
941 slotno
= SimpleLruReadPage(MultiXactOffsetCtl
, pageno
, true, multi
);
942 offptr
= (MultiXactOffset
*) MultiXactOffsetCtl
->shared
->page_buffer
[slotno
];
947 MultiXactOffsetCtl
->shared
->page_dirty
[slotno
] = true;
949 /* Release MultiXactOffset SLRU lock. */
953 * If anybody was waiting to know the offset of this multixact ID we just
954 * wrote, they can read it now, so wake them up.
956 ConditionVariableBroadcast(&MultiXactState
->nextoff_cv
);
960 for (i
= 0; i
< nmembers
; i
++, offset
++)
962 TransactionId
*memberptr
;
969 Assert(members
[i
].status
<= MultiXactStatusUpdate
);
971 pageno
= MXOffsetToMemberPage(offset
);
972 memberoff
= MXOffsetToMemberOffset(offset
);
973 flagsoff
= MXOffsetToFlagsOffset(offset
);
974 bshift
= MXOffsetToFlagsBitShift(offset
);
976 if (pageno
!= prev_pageno
)
979 * MultiXactMember SLRU page is changed so check if this new page
980 * fall into the different SLRU bank then release the old bank's
981 * lock and acquire lock on the new bank.
983 lock
= SimpleLruGetBankLock(MultiXactMemberCtl
, pageno
);
984 if (lock
!= prevlock
)
986 if (prevlock
!= NULL
)
987 LWLockRelease(prevlock
);
989 LWLockAcquire(lock
, LW_EXCLUSIVE
);
992 slotno
= SimpleLruReadPage(MultiXactMemberCtl
, pageno
, true, multi
);
993 prev_pageno
= pageno
;
996 memberptr
= (TransactionId
*)
997 (MultiXactMemberCtl
->shared
->page_buffer
[slotno
] + memberoff
);
999 *memberptr
= members
[i
].xid
;
1001 flagsptr
= (uint32
*)
1002 (MultiXactMemberCtl
->shared
->page_buffer
[slotno
] + flagsoff
);
1004 flagsval
= *flagsptr
;
1005 flagsval
&= ~(((1 << MXACT_MEMBER_BITS_PER_XACT
) - 1) << bshift
);
1006 flagsval
|= (members
[i
].status
<< bshift
);
1007 *flagsptr
= flagsval
;
1009 MultiXactMemberCtl
->shared
->page_dirty
[slotno
] = true;
1012 if (prevlock
!= NULL
)
1013 LWLockRelease(prevlock
);
1018 * Get the next MultiXactId.
1020 * Also, reserve the needed amount of space in the "members" area. The
1021 * starting offset of the reserved space is returned in *offset.
1023 * This may generate XLOG records for expansion of the offsets and/or members
1024 * files. Unfortunately, we have to do that while holding MultiXactGenLock
1025 * to avoid race conditions --- the XLOG record for zeroing a page must appear
1026 * before any backend can possibly try to store data in that page!
1028 * We start a critical section before advancing the shared counters. The
1029 * caller must end the critical section after writing SLRU data.
1032 GetNewMultiXactId(int nmembers
, MultiXactOffset
*offset
)
1035 MultiXactOffset nextOffset
;
1037 debug_elog3(DEBUG2
, "GetNew: for %d xids", nmembers
);
1039 /* safety check, we should never get this far in a HS standby */
1040 if (RecoveryInProgress())
1041 elog(ERROR
, "cannot assign MultiXactIds during recovery");
1043 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
1045 /* Handle wraparound of the nextMXact counter */
1046 if (MultiXactState
->nextMXact
< FirstMultiXactId
)
1047 MultiXactState
->nextMXact
= FirstMultiXactId
;
1049 /* Assign the MXID */
1050 result
= MultiXactState
->nextMXact
;
1053 * Check to see if it's safe to assign another MultiXactId. This protects
1054 * against catastrophic data loss due to multixact wraparound. The basic
1057 * If we're past multiVacLimit or the safe threshold for member storage
1058 * space, or we don't know what the safe threshold for member storage is,
1059 * start trying to force autovacuum cycles.
1060 * If we're past multiWarnLimit, start issuing warnings.
1061 * If we're past multiStopLimit, refuse to create new MultiXactIds.
1063 * Note these are pretty much the same protections in GetNewTransactionId.
1066 if (!MultiXactIdPrecedes(result
, MultiXactState
->multiVacLimit
))
1069 * For safety's sake, we release MultiXactGenLock while sending
1070 * signals, warnings, etc. This is not so much because we care about
1071 * preserving concurrency in this situation, as to avoid any
1072 * possibility of deadlock while doing get_database_name(). First,
1073 * copy all the shared values we'll need in this path.
1075 MultiXactId multiWarnLimit
= MultiXactState
->multiWarnLimit
;
1076 MultiXactId multiStopLimit
= MultiXactState
->multiStopLimit
;
1077 MultiXactId multiWrapLimit
= MultiXactState
->multiWrapLimit
;
1078 Oid oldest_datoid
= MultiXactState
->oldestMultiXactDB
;
1080 LWLockRelease(MultiXactGenLock
);
1082 if (IsUnderPostmaster
&&
1083 !MultiXactIdPrecedes(result
, multiStopLimit
))
1085 char *oldest_datname
= get_database_name(oldest_datoid
);
1088 * Immediately kick autovacuum into action as we're already in
1091 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER
);
1093 /* complain even if that DB has disappeared */
1096 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
1097 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"",
1099 errhint("Execute a database-wide VACUUM in that database.\n"
1100 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1103 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
1104 errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u",
1106 errhint("Execute a database-wide VACUUM in that database.\n"
1107 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1111 * To avoid swamping the postmaster with signals, we issue the autovac
1112 * request only once per 64K multis generated. This still gives
1113 * plenty of chances before we get into real trouble.
1115 if (IsUnderPostmaster
&& (result
% 65536) == 0)
1116 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER
);
1118 if (!MultiXactIdPrecedes(result
, multiWarnLimit
))
1120 char *oldest_datname
= get_database_name(oldest_datoid
);
1122 /* complain even if that DB has disappeared */
1125 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
1126 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
1127 multiWrapLimit
- result
,
1129 multiWrapLimit
- result
),
1130 errhint("Execute a database-wide VACUUM in that database.\n"
1131 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1134 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
1135 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
1136 multiWrapLimit
- result
,
1138 multiWrapLimit
- result
),
1139 errhint("Execute a database-wide VACUUM in that database.\n"
1140 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
1143 /* Re-acquire lock and start over */
1144 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
1145 result
= MultiXactState
->nextMXact
;
1146 if (result
< FirstMultiXactId
)
1147 result
= FirstMultiXactId
;
1150 /* Make sure there is room for the MXID in the file. */
1151 ExtendMultiXactOffset(result
);
1154 * Reserve the members space, similarly to above. Also, be careful not to
1155 * return zero as the starting offset for any multixact. See
1156 * GetMultiXactIdMembers() for motivation.
1158 nextOffset
= MultiXactState
->nextOffset
;
1159 if (nextOffset
== 0)
1162 nmembers
++; /* allocate member slot 0 too */
1165 *offset
= nextOffset
;
1168 * Protect against overrun of the members space as well, with the
1171 * If we're past offsetStopLimit, refuse to generate more multis.
1172 * If we're close to offsetStopLimit, emit a warning.
1174 * Arbitrarily, we start emitting warnings when we're 20 segments or less
1175 * from offsetStopLimit.
1177 * Note we haven't updated the shared state yet, so if we fail at this
1178 * point, the multixact ID we grabbed can still be used by the next guy.
1180 * Note that there is no point in forcing autovacuum runs here: the
1181 * multixact freeze settings would have to be reduced for that to have any
1185 #define OFFSET_WARN_SEGMENTS 20
1186 if (MultiXactState
->oldestOffsetKnown
&&
1187 MultiXactOffsetWouldWrap(MultiXactState
->offsetStopLimit
, nextOffset
,
1190 /* see comment in the corresponding offsets wraparound case */
1191 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER
);
1194 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
1195 errmsg("multixact \"members\" limit exceeded"),
1196 errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
1197 "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
1198 MultiXactState
->offsetStopLimit
- nextOffset
- 1,
1200 MultiXactState
->offsetStopLimit
- nextOffset
- 1),
1201 errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.",
1202 MultiXactState
->oldestMultiXactDB
)));
1206 * Check whether we should kick autovacuum into action, to prevent members
1207 * wraparound. NB we use a much larger window to trigger autovacuum than
1208 * just the warning limit. The warning is just a measure of last resort -
1209 * this is in line with GetNewTransactionId's behaviour.
1211 if (!MultiXactState
->oldestOffsetKnown
||
1212 (MultiXactState
->nextOffset
- MultiXactState
->oldestOffset
1213 > MULTIXACT_MEMBER_SAFE_THRESHOLD
))
1216 * To avoid swamping the postmaster with signals, we issue the autovac
1217 * request only when crossing a segment boundary. With default
1218 * compilation settings that's roughly after 50k members. This still
1219 * gives plenty of chances before we get into real trouble.
1221 if ((MXOffsetToMemberPage(nextOffset
) / SLRU_PAGES_PER_SEGMENT
) !=
1222 (MXOffsetToMemberPage(nextOffset
+ nmembers
) / SLRU_PAGES_PER_SEGMENT
))
1223 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER
);
1226 if (MultiXactState
->oldestOffsetKnown
&&
1227 MultiXactOffsetWouldWrap(MultiXactState
->offsetStopLimit
,
1229 nmembers
+ MULTIXACT_MEMBERS_PER_PAGE
* SLRU_PAGES_PER_SEGMENT
* OFFSET_WARN_SEGMENTS
))
1231 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
1232 errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
1233 "database with OID %u must be vacuumed before %d more multixact members are used",
1234 MultiXactState
->offsetStopLimit
- nextOffset
+ nmembers
,
1235 MultiXactState
->oldestMultiXactDB
,
1236 MultiXactState
->offsetStopLimit
- nextOffset
+ nmembers
),
1237 errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.")));
1239 ExtendMultiXactMember(nextOffset
, nmembers
);
1242 * Critical section from here until caller has written the data into the
1243 * just-reserved SLRU space; we don't want to error out with a partly
1244 * written MultiXact structure. (In particular, failing to write our
1245 * start offset after advancing nextMXact would effectively corrupt the
1246 * previous MultiXact.)
1248 START_CRIT_SECTION();
1251 * Advance counters. As in GetNewTransactionId(), this must not happen
1252 * until after file extension has succeeded!
1254 * We don't care about MultiXactId wraparound here; it will be handled by
1255 * the next iteration. But note that nextMXact may be InvalidMultiXactId
1256 * or the first value on a segment-beginning page after this routine
1257 * exits, so anyone else looking at the variable must be prepared to deal
1258 * with either case. Similarly, nextOffset may be zero, but we won't use
1259 * that as the actual start offset of the next multixact.
1261 (MultiXactState
->nextMXact
)++;
1263 MultiXactState
->nextOffset
+= nmembers
;
1265 LWLockRelease(MultiXactGenLock
);
1267 debug_elog4(DEBUG2
, "GetNew: returning %u offset %u", result
, *offset
);
1272 * GetMultiXactIdMembers
1273 * Return the set of MultiXactMembers that make up a MultiXactId
1275 * Return value is the number of members found, or -1 if there are none,
1276 * and *members is set to a newly palloc'ed array of members. It's the
1277 * caller's responsibility to free it when done with it.
1279 * from_pgupgrade must be passed as true if and only if only the multixact
1280 * corresponds to a value from a tuple that was locked in a 9.2-or-older
1281 * installation and later pg_upgrade'd (that is, the infomask is
1282 * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
1283 * can still be running, so we return -1 just like for an empty multixact
1284 * without any further checking. It would be wrong to try to resolve such a
1285 * multixact: either the multixact is within the current valid multixact
1286 * range, in which case the returned result would be bogus, or outside that
1287 * range, in which case an error would be raised.
1289 * In all other cases, the passed multixact must be within the known valid
1290 * range, that is, greater to or equal than oldestMultiXactId, and less than
1291 * nextMXact. Otherwise, an error is raised.
1293 * isLockOnly must be set to true if caller is certain that the given multi
1294 * is used only to lock tuples; can be false without loss of correctness,
1295 * but passing a true means we can return quickly without checking for
1299 GetMultiXactIdMembers(MultiXactId multi
, MultiXactMember
**members
,
1300 bool from_pgupgrade
, bool isLockOnly
)
1306 MultiXactOffset
*offptr
;
1307 MultiXactOffset offset
;
1310 MultiXactId oldestMXact
;
1311 MultiXactId nextMXact
;
1312 MultiXactId tmpMXact
;
1313 MultiXactOffset nextOffset
;
1314 MultiXactMember
*ptr
;
1318 debug_elog3(DEBUG2
, "GetMembers: asked for %u", multi
);
1320 if (!MultiXactIdIsValid(multi
) || from_pgupgrade
)
1326 /* See if the MultiXactId is in the local cache */
1327 length
= mXactCacheGetById(multi
, members
);
1330 debug_elog3(DEBUG2
, "GetMembers: found %s in the cache",
1331 mxid_to_string(multi
, length
, *members
));
1335 /* Set our OldestVisibleMXactId[] entry if we didn't already */
1336 MultiXactIdSetOldestVisible();
1339 * If we know the multi is used only for locking and not for updates, then
1340 * we can skip checking if the value is older than our oldest visible
1341 * multi. It cannot possibly still be running.
1344 MultiXactIdPrecedes(multi
, OldestVisibleMXactId
[MyProcNumber
]))
1346 debug_elog2(DEBUG2
, "GetMembers: a locker-only multi is too old");
1352 * We check known limits on MultiXact before resorting to the SLRU area.
1354 * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
1355 * useful; it has already been removed, or will be removed shortly, by
1356 * truncation. If one is passed, an error is raised.
1358 * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
1359 * implies undetected ID wraparound has occurred. This raises a hard
1362 * Shared lock is enough here since we aren't modifying any global state.
1363 * Acquire it just long enough to grab the current counter values. We may
1364 * need both nextMXact and nextOffset; see below.
1366 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
1368 oldestMXact
= MultiXactState
->oldestMultiXactId
;
1369 nextMXact
= MultiXactState
->nextMXact
;
1370 nextOffset
= MultiXactState
->nextOffset
;
1372 LWLockRelease(MultiXactGenLock
);
1374 if (MultiXactIdPrecedes(multi
, oldestMXact
))
1376 (errcode(ERRCODE_INTERNAL_ERROR
),
1377 errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
1380 if (!MultiXactIdPrecedes(multi
, nextMXact
))
1382 (errcode(ERRCODE_INTERNAL_ERROR
),
1383 errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
1387 * Find out the offset at which we need to start reading MultiXactMembers
1388 * and the number of members in the multixact. We determine the latter as
1389 * the difference between this multixact's starting offset and the next
1390 * one's. However, there are some corner cases to worry about:
1392 * 1. This multixact may be the latest one created, in which case there is
1393 * no next one to look at. In this case the nextOffset value we just
1394 * saved is the correct endpoint.
1396 * 2. The next multixact may still be in process of being filled in: that
1397 * is, another process may have done GetNewMultiXactId but not yet written
1398 * the offset entry for that ID. In that scenario, it is guaranteed that
1399 * the offset entry for that multixact exists (because GetNewMultiXactId
1400 * won't release MultiXactGenLock until it does) but contains zero
1401 * (because we are careful to pre-zero offset pages). Because
1402 * GetNewMultiXactId will never return zero as the starting offset for a
1403 * multixact, when we read zero as the next multixact's offset, we know we
1404 * have this case. We handle this by sleeping on the condition variable
1405 * we have just for this; the process in charge will signal the CV as soon
1406 * as it has finished writing the multixact offset.
1408 * 3. Because GetNewMultiXactId increments offset zero to offset one to
1409 * handle case #2, there is an ambiguity near the point of offset
1410 * wraparound. If we see next multixact's offset is one, is that our
1411 * multixact's actual endpoint, or did it end at zero with a subsequent
1412 * increment? We handle this using the knowledge that if the zero'th
1413 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
1414 * transaction ID so it can't be a multixact member. Therefore, if we
1415 * read a zero from the members array, just ignore it.
1417 * This is all pretty messy, but the mess occurs only in infrequent corner
1418 * cases, so it seems better than holding the MultiXactGenLock for a long
1419 * time on every multixact creation.
1422 pageno
= MultiXactIdToOffsetPage(multi
);
1423 entryno
= MultiXactIdToOffsetEntry(multi
);
1425 /* Acquire the bank lock for the page we need. */
1426 lock
= SimpleLruGetBankLock(MultiXactOffsetCtl
, pageno
);
1427 LWLockAcquire(lock
, LW_EXCLUSIVE
);
1429 slotno
= SimpleLruReadPage(MultiXactOffsetCtl
, pageno
, true, multi
);
1430 offptr
= (MultiXactOffset
*) MultiXactOffsetCtl
->shared
->page_buffer
[slotno
];
1434 Assert(offset
!= 0);
1437 * Use the same increment rule as GetNewMultiXactId(), that is, don't
1438 * handle wraparound explicitly until needed.
1440 tmpMXact
= multi
+ 1;
1442 if (nextMXact
== tmpMXact
)
1444 /* Corner case 1: there is no next multixact */
1445 length
= nextOffset
- offset
;
1449 MultiXactOffset nextMXOffset
;
1451 /* handle wraparound if needed */
1452 if (tmpMXact
< FirstMultiXactId
)
1453 tmpMXact
= FirstMultiXactId
;
1455 prev_pageno
= pageno
;
1457 pageno
= MultiXactIdToOffsetPage(tmpMXact
);
1458 entryno
= MultiXactIdToOffsetEntry(tmpMXact
);
1460 if (pageno
!= prev_pageno
)
1465 * Since we're going to access a different SLRU page, if this page
1466 * falls under a different bank, release the old bank's lock and
1467 * acquire the lock of the new bank.
1469 newlock
= SimpleLruGetBankLock(MultiXactOffsetCtl
, pageno
);
1470 if (newlock
!= lock
)
1472 LWLockRelease(lock
);
1473 LWLockAcquire(newlock
, LW_EXCLUSIVE
);
1476 slotno
= SimpleLruReadPage(MultiXactOffsetCtl
, pageno
, true, tmpMXact
);
1479 offptr
= (MultiXactOffset
*) MultiXactOffsetCtl
->shared
->page_buffer
[slotno
];
1481 nextMXOffset
= *offptr
;
1483 if (nextMXOffset
== 0)
1485 /* Corner case 2: next multixact is still being filled in */
1486 LWLockRelease(lock
);
1487 CHECK_FOR_INTERRUPTS();
1489 INJECTION_POINT("multixact-get-members-cv-sleep");
1491 ConditionVariableSleep(&MultiXactState
->nextoff_cv
,
1492 WAIT_EVENT_MULTIXACT_CREATION
);
1497 length
= nextMXOffset
- offset
;
1500 LWLockRelease(lock
);
1504 * If we slept above, clean up state; it's no longer needed.
1507 ConditionVariableCancelSleep();
1509 ptr
= (MultiXactMember
*) palloc(length
* sizeof(MultiXactMember
));
1513 for (int i
= 0; i
< length
; i
++, offset
++)
1515 TransactionId
*xactptr
;
1521 pageno
= MXOffsetToMemberPage(offset
);
1522 memberoff
= MXOffsetToMemberOffset(offset
);
1524 if (pageno
!= prev_pageno
)
1529 * Since we're going to access a different SLRU page, if this page
1530 * falls under a different bank, release the old bank's lock and
1531 * acquire the lock of the new bank.
1533 newlock
= SimpleLruGetBankLock(MultiXactMemberCtl
, pageno
);
1534 if (newlock
!= lock
)
1537 LWLockRelease(lock
);
1538 LWLockAcquire(newlock
, LW_EXCLUSIVE
);
1542 slotno
= SimpleLruReadPage(MultiXactMemberCtl
, pageno
, true, multi
);
1543 prev_pageno
= pageno
;
1546 xactptr
= (TransactionId
*)
1547 (MultiXactMemberCtl
->shared
->page_buffer
[slotno
] + memberoff
);
1549 if (!TransactionIdIsValid(*xactptr
))
1551 /* Corner case 3: we must be looking at unused slot zero */
1552 Assert(offset
== 0);
1556 flagsoff
= MXOffsetToFlagsOffset(offset
);
1557 bshift
= MXOffsetToFlagsBitShift(offset
);
1558 flagsptr
= (uint32
*) (MultiXactMemberCtl
->shared
->page_buffer
[slotno
] + flagsoff
);
1560 ptr
[truelength
].xid
= *xactptr
;
1561 ptr
[truelength
].status
= (*flagsptr
>> bshift
) & MXACT_MEMBER_XACT_BITMASK
;
1565 LWLockRelease(lock
);
1567 /* A multixid with zero members should not happen */
1568 Assert(truelength
> 0);
1571 * Copy the result into the local cache.
1573 mXactCachePut(multi
, truelength
, ptr
);
1575 debug_elog3(DEBUG2
, "GetMembers: no cache for %s",
1576 mxid_to_string(multi
, truelength
, ptr
));
1582 * mxactMemberComparator
1583 * qsort comparison function for MultiXactMember
1585 * We can't use wraparound comparison for XIDs because that does not respect
1586 * the triangle inequality! Any old sort order will do.
1589 mxactMemberComparator(const void *arg1
, const void *arg2
)
1591 MultiXactMember member1
= *(const MultiXactMember
*) arg1
;
1592 MultiXactMember member2
= *(const MultiXactMember
*) arg2
;
1594 if (member1
.xid
> member2
.xid
)
1596 if (member1
.xid
< member2
.xid
)
1598 if (member1
.status
> member2
.status
)
1600 if (member1
.status
< member2
.status
)
1606 * mXactCacheGetBySet
1607 * returns a MultiXactId from the cache based on the set of
1608 * TransactionIds that compose it, or InvalidMultiXactId if
1611 * This is helpful, for example, if two transactions want to lock a huge
1612 * table. By using the cache, the second will use the same MultiXactId
1613 * for the majority of tuples, thus keeping MultiXactId usage low (saving
1614 * both I/O and wraparound issues).
1616 * NB: the passed members array will be sorted in-place.
1619 mXactCacheGetBySet(int nmembers
, MultiXactMember
*members
)
1623 debug_elog3(DEBUG2
, "CacheGet: looking for %s",
1624 mxid_to_string(InvalidMultiXactId
, nmembers
, members
));
1626 /* sort the array so comparison is easy */
1627 qsort(members
, nmembers
, sizeof(MultiXactMember
), mxactMemberComparator
);
1629 dclist_foreach(iter
, &MXactCache
)
1631 mXactCacheEnt
*entry
= dclist_container(mXactCacheEnt
, node
,
1634 if (entry
->nmembers
!= nmembers
)
1638 * We assume the cache entries are sorted, and that the unused bits in
1639 * "status" are zeroed.
1641 if (memcmp(members
, entry
->members
, nmembers
* sizeof(MultiXactMember
)) == 0)
1643 debug_elog3(DEBUG2
, "CacheGet: found %u", entry
->multi
);
1644 dclist_move_head(&MXactCache
, iter
.cur
);
1645 return entry
->multi
;
1649 debug_elog2(DEBUG2
, "CacheGet: not found :-(");
1650 return InvalidMultiXactId
;
1655 * returns the composing MultiXactMember set from the cache for a
1656 * given MultiXactId, if present.
1658 * If successful, *xids is set to the address of a palloc'd copy of the
1659 * MultiXactMember set. Return value is number of members, or -1 on failure.
1662 mXactCacheGetById(MultiXactId multi
, MultiXactMember
**members
)
1666 debug_elog3(DEBUG2
, "CacheGet: looking for %u", multi
);
1668 dclist_foreach(iter
, &MXactCache
)
1670 mXactCacheEnt
*entry
= dclist_container(mXactCacheEnt
, node
,
1673 if (entry
->multi
== multi
)
1675 MultiXactMember
*ptr
;
1678 size
= sizeof(MultiXactMember
) * entry
->nmembers
;
1679 ptr
= (MultiXactMember
*) palloc(size
);
1681 memcpy(ptr
, entry
->members
, size
);
1683 debug_elog3(DEBUG2
, "CacheGet: found %s",
1684 mxid_to_string(multi
,
1689 * Note we modify the list while not using a modifiable iterator.
1690 * This is acceptable only because we exit the iteration
1691 * immediately afterwards.
1693 dclist_move_head(&MXactCache
, iter
.cur
);
1696 return entry
->nmembers
;
1700 debug_elog2(DEBUG2
, "CacheGet: not found");
1706 * Add a new MultiXactId and its composing set into the local cache.
1709 mXactCachePut(MultiXactId multi
, int nmembers
, MultiXactMember
*members
)
1711 mXactCacheEnt
*entry
;
1713 debug_elog3(DEBUG2
, "CachePut: storing %s",
1714 mxid_to_string(multi
, nmembers
, members
));
1716 if (MXactContext
== NULL
)
1718 /* The cache only lives as long as the current transaction */
1719 debug_elog2(DEBUG2
, "CachePut: initializing memory context");
1720 MXactContext
= AllocSetContextCreate(TopTransactionContext
,
1721 "MultiXact cache context",
1722 ALLOCSET_SMALL_SIZES
);
1725 entry
= (mXactCacheEnt
*)
1726 MemoryContextAlloc(MXactContext
,
1727 offsetof(mXactCacheEnt
, members
) +
1728 nmembers
* sizeof(MultiXactMember
));
1730 entry
->multi
= multi
;
1731 entry
->nmembers
= nmembers
;
1732 memcpy(entry
->members
, members
, nmembers
* sizeof(MultiXactMember
));
1734 /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
1735 qsort(entry
->members
, nmembers
, sizeof(MultiXactMember
), mxactMemberComparator
);
1737 dclist_push_head(&MXactCache
, &entry
->node
);
1738 if (dclist_count(&MXactCache
) > MAX_CACHE_ENTRIES
)
1742 node
= dclist_tail_node(&MXactCache
);
1743 dclist_delete_from(&MXactCache
, node
);
1745 entry
= dclist_container(mXactCacheEnt
, node
, node
);
1746 debug_elog3(DEBUG2
, "CachePut: pruning cached multi %u",
1754 mxstatus_to_string(MultiXactStatus status
)
1758 case MultiXactStatusForKeyShare
:
1760 case MultiXactStatusForShare
:
1762 case MultiXactStatusForNoKeyUpdate
:
1763 return "fornokeyupd";
1764 case MultiXactStatusForUpdate
:
1766 case MultiXactStatusNoKeyUpdate
:
1768 case MultiXactStatusUpdate
:
1771 elog(ERROR
, "unrecognized multixact status %d", status
);
1777 mxid_to_string(MultiXactId multi
, int nmembers
, MultiXactMember
*members
)
1779 static char *str
= NULL
;
1786 initStringInfo(&buf
);
1788 appendStringInfo(&buf
, "%u %d[%u (%s)", multi
, nmembers
, members
[0].xid
,
1789 mxstatus_to_string(members
[0].status
));
1791 for (i
= 1; i
< nmembers
; i
++)
1792 appendStringInfo(&buf
, ", %u (%s)", members
[i
].xid
,
1793 mxstatus_to_string(members
[i
].status
));
1795 appendStringInfoChar(&buf
, ']');
1796 str
= MemoryContextStrdup(TopMemoryContext
, buf
.data
);
1802 * AtEOXact_MultiXact
1803 * Handle transaction end for MultiXact
1805 * This is called at top transaction commit or abort (we don't care which).
1808 AtEOXact_MultiXact(void)
1811 * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
1812 * which should only be valid while within a transaction.
1814 * We assume that storing a MultiXactId is atomic and so we need not take
1815 * MultiXactGenLock to do this.
1817 OldestMemberMXactId
[MyProcNumber
] = InvalidMultiXactId
;
1818 OldestVisibleMXactId
[MyProcNumber
] = InvalidMultiXactId
;
1821 * Discard the local MultiXactId cache. Since MXactContext was created as
1822 * a child of TopTransactionContext, we needn't delete it explicitly.
1824 MXactContext
= NULL
;
1825 dclist_init(&MXactCache
);
1829 * AtPrepare_MultiXact
1830 * Save multixact state at 2PC transaction prepare
1832 * In this phase, we only store our OldestMemberMXactId value in the two-phase
1836 AtPrepare_MultiXact(void)
1838 MultiXactId myOldestMember
= OldestMemberMXactId
[MyProcNumber
];
1840 if (MultiXactIdIsValid(myOldestMember
))
1841 RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID
, 0,
1842 &myOldestMember
, sizeof(MultiXactId
));
1846 * PostPrepare_MultiXact
1847 * Clean up after successful PREPARE TRANSACTION
1850 PostPrepare_MultiXact(TransactionId xid
)
1852 MultiXactId myOldestMember
;
1855 * Transfer our OldestMemberMXactId value to the slot reserved for the
1856 * prepared transaction.
1858 myOldestMember
= OldestMemberMXactId
[MyProcNumber
];
1859 if (MultiXactIdIsValid(myOldestMember
))
1861 ProcNumber dummyProcNumber
= TwoPhaseGetDummyProcNumber(xid
, false);
1864 * Even though storing MultiXactId is atomic, acquire lock to make
1865 * sure others see both changes, not just the reset of the slot of the
1866 * current backend. Using a volatile pointer might suffice, but this
1869 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
1871 OldestMemberMXactId
[dummyProcNumber
] = myOldestMember
;
1872 OldestMemberMXactId
[MyProcNumber
] = InvalidMultiXactId
;
1874 LWLockRelease(MultiXactGenLock
);
1878 * We don't need to transfer OldestVisibleMXactId value, because the
1879 * transaction is not going to be looking at any more multixacts once it's
1882 * We assume that storing a MultiXactId is atomic and so we need not take
1883 * MultiXactGenLock to do this.
1885 OldestVisibleMXactId
[MyProcNumber
] = InvalidMultiXactId
;
1888 * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
1890 MXactContext
= NULL
;
1891 dclist_init(&MXactCache
);
1895 * multixact_twophase_recover
1896 * Recover the state of a prepared transaction at startup
1899 multixact_twophase_recover(TransactionId xid
, uint16 info
,
1900 void *recdata
, uint32 len
)
1902 ProcNumber dummyProcNumber
= TwoPhaseGetDummyProcNumber(xid
, false);
1903 MultiXactId oldestMember
;
1906 * Get the oldest member XID from the state file record, and set it in the
1907 * OldestMemberMXactId slot reserved for this prepared transaction.
1909 Assert(len
== sizeof(MultiXactId
));
1910 oldestMember
= *((MultiXactId
*) recdata
);
1912 OldestMemberMXactId
[dummyProcNumber
] = oldestMember
;
1916 * multixact_twophase_postcommit
1917 * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
1920 multixact_twophase_postcommit(TransactionId xid
, uint16 info
,
1921 void *recdata
, uint32 len
)
1923 ProcNumber dummyProcNumber
= TwoPhaseGetDummyProcNumber(xid
, true);
1925 Assert(len
== sizeof(MultiXactId
));
1927 OldestMemberMXactId
[dummyProcNumber
] = InvalidMultiXactId
;
1931 * multixact_twophase_postabort
1932 * This is actually just the same as the COMMIT case.
1935 multixact_twophase_postabort(TransactionId xid
, uint16 info
,
1936 void *recdata
, uint32 len
)
1938 multixact_twophase_postcommit(xid
, info
, recdata
, len
);
1942 * Initialization of shared memory for MultiXact. We use two SLRU areas,
1943 * thus double memory. Also, reserve space for the shared MultiXactState
1944 * struct and the per-backend MultiXactId arrays (two of those, too).
1947 MultiXactShmemSize(void)
1951 /* We need 2*MaxOldestSlot perBackendXactIds[] entries */
1952 #define SHARED_MULTIXACT_STATE_SIZE \
1953 add_size(offsetof(MultiXactStateData, perBackendXactIds), \
1954 mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
1956 size
= SHARED_MULTIXACT_STATE_SIZE
;
1957 size
= add_size(size
, SimpleLruShmemSize(multixact_offset_buffers
, 0));
1958 size
= add_size(size
, SimpleLruShmemSize(multixact_member_buffers
, 0));
1964 MultiXactShmemInit(void)
1968 debug_elog2(DEBUG2
, "Shared Memory Init for MultiXact");
1970 MultiXactOffsetCtl
->PagePrecedes
= MultiXactOffsetPagePrecedes
;
1971 MultiXactMemberCtl
->PagePrecedes
= MultiXactMemberPagePrecedes
;
1973 SimpleLruInit(MultiXactOffsetCtl
,
1974 "multixact_offset", multixact_offset_buffers
, 0,
1975 "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER
,
1976 LWTRANCHE_MULTIXACTOFFSET_SLRU
,
1977 SYNC_HANDLER_MULTIXACT_OFFSET
,
1979 SlruPagePrecedesUnitTests(MultiXactOffsetCtl
, MULTIXACT_OFFSETS_PER_PAGE
);
1980 SimpleLruInit(MultiXactMemberCtl
,
1981 "multixact_member", multixact_member_buffers
, 0,
1982 "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER
,
1983 LWTRANCHE_MULTIXACTMEMBER_SLRU
,
1984 SYNC_HANDLER_MULTIXACT_MEMBER
,
1986 /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
1988 /* Initialize our shared state struct */
1989 MultiXactState
= ShmemInitStruct("Shared MultiXact State",
1990 SHARED_MULTIXACT_STATE_SIZE
,
1992 if (!IsUnderPostmaster
)
1996 /* Make sure we zero out the per-backend state */
1997 MemSet(MultiXactState
, 0, SHARED_MULTIXACT_STATE_SIZE
);
1998 ConditionVariableInit(&MultiXactState
->nextoff_cv
);
2004 * Set up array pointers.
2006 OldestMemberMXactId
= MultiXactState
->perBackendXactIds
;
2007 OldestVisibleMXactId
= OldestMemberMXactId
+ MaxOldestSlot
;
2011 * GUC check_hook for multixact_offset_buffers
2014 check_multixact_offset_buffers(int *newval
, void **extra
, GucSource source
)
2016 return check_slru_buffers("multixact_offset_buffers", newval
);
2020 * GUC check_hook for multixact_member_buffers
2023 check_multixact_member_buffers(int *newval
, void **extra
, GucSource source
)
2025 return check_slru_buffers("multixact_member_buffers", newval
);
2029 * This func must be called ONCE on system install. It creates the initial
2030 * MultiXact segments. (The MultiXacts directories are assumed to have been
2031 * created by initdb, and MultiXactShmemInit must have been called already.)
2034 BootStrapMultiXact(void)
2039 lock
= SimpleLruGetBankLock(MultiXactOffsetCtl
, 0);
2040 LWLockAcquire(lock
, LW_EXCLUSIVE
);
2042 /* Create and zero the first page of the offsets log */
2043 slotno
= ZeroMultiXactOffsetPage(0, false);
2045 /* Make sure it's written out */
2046 SimpleLruWritePage(MultiXactOffsetCtl
, slotno
);
2047 Assert(!MultiXactOffsetCtl
->shared
->page_dirty
[slotno
]);
2049 LWLockRelease(lock
);
2051 lock
= SimpleLruGetBankLock(MultiXactMemberCtl
, 0);
2052 LWLockAcquire(lock
, LW_EXCLUSIVE
);
2054 /* Create and zero the first page of the members log */
2055 slotno
= ZeroMultiXactMemberPage(0, false);
2057 /* Make sure it's written out */
2058 SimpleLruWritePage(MultiXactMemberCtl
, slotno
);
2059 Assert(!MultiXactMemberCtl
->shared
->page_dirty
[slotno
]);
2061 LWLockRelease(lock
);
2065 * Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
2066 * If writeXlog is true, also emit an XLOG record saying we did this.
2068 * The page is not actually written, just set up in shared memory.
2069 * The slot number of the new page is returned.
2071 * Control lock must be held at entry, and will be held at exit.
2074 ZeroMultiXactOffsetPage(int64 pageno
, bool writeXlog
)
2078 slotno
= SimpleLruZeroPage(MultiXactOffsetCtl
, pageno
);
2081 WriteMZeroPageXlogRec(pageno
, XLOG_MULTIXACT_ZERO_OFF_PAGE
);
2087 * Ditto, for MultiXactMember
2090 ZeroMultiXactMemberPage(int64 pageno
, bool writeXlog
)
2094 slotno
= SimpleLruZeroPage(MultiXactMemberCtl
, pageno
);
2097 WriteMZeroPageXlogRec(pageno
, XLOG_MULTIXACT_ZERO_MEM_PAGE
);
2103 * MaybeExtendOffsetSlru
2104 * Extend the offsets SLRU area, if necessary
2106 * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
2107 * contain files that are shorter than necessary; this would occur if the old
2108 * installation had used multixacts beyond the first page (files cannot be
2109 * copied, because the on-disk representation is different). pg_upgrade would
2110 * update pg_control to set the next offset value to be at that position, so
2111 * that tuples marked as locked by such MultiXacts would be seen as visible
2112 * without having to consult multixact. However, trying to create and use a
2113 * new MultiXactId would result in an error because the page on which the new
2114 * value would reside does not exist. This routine is in charge of creating
2118 MaybeExtendOffsetSlru(void)
2123 pageno
= MultiXactIdToOffsetPage(MultiXactState
->nextMXact
);
2124 lock
= SimpleLruGetBankLock(MultiXactOffsetCtl
, pageno
);
2126 LWLockAcquire(lock
, LW_EXCLUSIVE
);
2128 if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl
, pageno
))
2133 * Fortunately for us, SimpleLruWritePage is already prepared to deal
2134 * with creating a new segment file even if the page we're writing is
2135 * not the first in it, so this is enough.
2137 slotno
= ZeroMultiXactOffsetPage(pageno
, false);
2138 SimpleLruWritePage(MultiXactOffsetCtl
, slotno
);
2141 LWLockRelease(lock
);
2145 * This must be called ONCE during postmaster or standalone-backend startup.
2147 * StartupXLOG has already established nextMXact/nextOffset by calling
2148 * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
2149 * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
2153 StartupMultiXact(void)
2155 MultiXactId multi
= MultiXactState
->nextMXact
;
2156 MultiXactOffset offset
= MultiXactState
->nextOffset
;
2160 * Initialize offset's idea of the latest page number.
2162 pageno
= MultiXactIdToOffsetPage(multi
);
2163 pg_atomic_write_u64(&MultiXactOffsetCtl
->shared
->latest_page_number
,
2167 * Initialize member's idea of the latest page number.
2169 pageno
= MXOffsetToMemberPage(offset
);
2170 pg_atomic_write_u64(&MultiXactMemberCtl
->shared
->latest_page_number
,
2175 * This must be called ONCE at the end of startup/recovery.
2180 MultiXactId nextMXact
;
2181 MultiXactOffset offset
;
2182 MultiXactId oldestMXact
;
2188 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
2189 nextMXact
= MultiXactState
->nextMXact
;
2190 offset
= MultiXactState
->nextOffset
;
2191 oldestMXact
= MultiXactState
->oldestMultiXactId
;
2192 oldestMXactDB
= MultiXactState
->oldestMultiXactDB
;
2193 LWLockRelease(MultiXactGenLock
);
2195 /* Clean up offsets state */
2198 * (Re-)Initialize our idea of the latest page number for offsets.
2200 pageno
= MultiXactIdToOffsetPage(nextMXact
);
2201 pg_atomic_write_u64(&MultiXactOffsetCtl
->shared
->latest_page_number
,
2205 * Zero out the remainder of the current offsets page. See notes in
2206 * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2207 * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2208 * rule "write xlog before data," nextMXact successors may carry obsolete,
2209 * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2210 * operates normally.
2212 entryno
= MultiXactIdToOffsetEntry(nextMXact
);
2216 MultiXactOffset
*offptr
;
2217 LWLock
*lock
= SimpleLruGetBankLock(MultiXactOffsetCtl
, pageno
);
2219 LWLockAcquire(lock
, LW_EXCLUSIVE
);
2220 slotno
= SimpleLruReadPage(MultiXactOffsetCtl
, pageno
, true, nextMXact
);
2221 offptr
= (MultiXactOffset
*) MultiXactOffsetCtl
->shared
->page_buffer
[slotno
];
2224 MemSet(offptr
, 0, BLCKSZ
- (entryno
* sizeof(MultiXactOffset
)));
2226 MultiXactOffsetCtl
->shared
->page_dirty
[slotno
] = true;
2227 LWLockRelease(lock
);
2231 * And the same for members.
2233 * (Re-)Initialize our idea of the latest page number for members.
2235 pageno
= MXOffsetToMemberPage(offset
);
2236 pg_atomic_write_u64(&MultiXactMemberCtl
->shared
->latest_page_number
,
2240 * Zero out the remainder of the current members page. See notes in
2241 * TrimCLOG() for motivation.
2243 flagsoff
= MXOffsetToFlagsOffset(offset
);
2247 TransactionId
*xidptr
;
2249 LWLock
*lock
= SimpleLruGetBankLock(MultiXactMemberCtl
, pageno
);
2251 LWLockAcquire(lock
, LW_EXCLUSIVE
);
2252 memberoff
= MXOffsetToMemberOffset(offset
);
2253 slotno
= SimpleLruReadPage(MultiXactMemberCtl
, pageno
, true, offset
);
2254 xidptr
= (TransactionId
*)
2255 (MultiXactMemberCtl
->shared
->page_buffer
[slotno
] + memberoff
);
2257 MemSet(xidptr
, 0, BLCKSZ
- memberoff
);
2260 * Note: we don't need to zero out the flag bits in the remaining
2261 * members of the current group, because they are always reset before
2265 MultiXactMemberCtl
->shared
->page_dirty
[slotno
] = true;
2266 LWLockRelease(lock
);
2269 /* signal that we're officially up */
2270 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
2271 MultiXactState
->finishedStartup
= true;
2272 LWLockRelease(MultiXactGenLock
);
2274 /* Now compute how far away the next members wraparound is. */
2275 SetMultiXactIdLimit(oldestMXact
, oldestMXactDB
, true);
2279 * Get the MultiXact data to save in a checkpoint record
2282 MultiXactGetCheckptMulti(bool is_shutdown
,
2283 MultiXactId
*nextMulti
,
2284 MultiXactOffset
*nextMultiOffset
,
2285 MultiXactId
*oldestMulti
,
2288 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
2289 *nextMulti
= MultiXactState
->nextMXact
;
2290 *nextMultiOffset
= MultiXactState
->nextOffset
;
2291 *oldestMulti
= MultiXactState
->oldestMultiXactId
;
2292 *oldestMultiDB
= MultiXactState
->oldestMultiXactDB
;
2293 LWLockRelease(MultiXactGenLock
);
2296 "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
2297 *nextMulti
, *nextMultiOffset
, *oldestMulti
, *oldestMultiDB
);
2301 * Perform a checkpoint --- either during shutdown, or on-the-fly
2304 CheckPointMultiXact(void)
2306 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
2309 * Write dirty MultiXact pages to disk. This may result in sync requests
2310 * queued for later handling by ProcessSyncRequests(), as part of the
2313 SimpleLruWriteAll(MultiXactOffsetCtl
, true);
2314 SimpleLruWriteAll(MultiXactMemberCtl
, true);
2316 TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
2320 * Set the next-to-be-assigned MultiXactId and offset
2322 * This is used when we can determine the correct next ID/offset exactly
2323 * from a checkpoint record. Although this is only called during bootstrap
2324 * and XLog replay, we take the lock in case any hot-standby backends are
2325 * examining the values.
2328 MultiXactSetNextMXact(MultiXactId nextMulti
,
2329 MultiXactOffset nextMultiOffset
)
2331 debug_elog4(DEBUG2
, "MultiXact: setting next multi to %u offset %u",
2332 nextMulti
, nextMultiOffset
);
2333 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
2334 MultiXactState
->nextMXact
= nextMulti
;
2335 MultiXactState
->nextOffset
= nextMultiOffset
;
2336 LWLockRelease(MultiXactGenLock
);
2339 * During a binary upgrade, make sure that the offsets SLRU is large
2340 * enough to contain the next value that would be created.
2342 * We need to do this pretty early during the first startup in binary
2343 * upgrade mode: before StartupMultiXact() in fact, because this routine
2344 * is called even before that by StartupXLOG(). And we can't do it
2345 * earlier than at this point, because during that first call of this
2346 * routine we determine the MultiXactState->nextMXact value that
2347 * MaybeExtendOffsetSlru needs.
2349 if (IsBinaryUpgrade
)
2350 MaybeExtendOffsetSlru();
2354 * Determine the last safe MultiXactId to allocate given the currently oldest
2355 * datminmxid (ie, the oldest MultiXactId that might exist in any database
2356 * of our cluster), and the OID of the (or a) database with that value.
2358 * is_startup is true when we are just starting the cluster, false when we
2359 * are updating state in a running cluster. This only affects log messages.
2362 SetMultiXactIdLimit(MultiXactId oldest_datminmxid
, Oid oldest_datoid
,
2365 MultiXactId multiVacLimit
;
2366 MultiXactId multiWarnLimit
;
2367 MultiXactId multiStopLimit
;
2368 MultiXactId multiWrapLimit
;
2369 MultiXactId curMulti
;
2370 bool needs_offset_vacuum
;
2372 Assert(MultiXactIdIsValid(oldest_datminmxid
));
2375 * We pretend that a wrap will happen halfway through the multixact ID
2376 * space, but that's not really true, because multixacts wrap differently
2377 * from transaction IDs. Note that, separately from any concern about
2378 * multixact IDs wrapping, we must ensure that multixact members do not
2379 * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
2381 multiWrapLimit
= oldest_datminmxid
+ (MaxMultiXactId
>> 1);
2382 if (multiWrapLimit
< FirstMultiXactId
)
2383 multiWrapLimit
+= FirstMultiXactId
;
2386 * We'll refuse to continue assigning MultiXactIds once we get within 3M
2387 * multi of data loss. See SetTransactionIdLimit.
2389 multiStopLimit
= multiWrapLimit
- 3000000;
2390 if (multiStopLimit
< FirstMultiXactId
)
2391 multiStopLimit
-= FirstMultiXactId
;
2394 * We'll start complaining loudly when we get within 40M multis of data
2395 * loss. This is kind of arbitrary, but if you let your gas gauge get
2396 * down to 2% of full, would you be looking for the next gas station? We
2397 * need to be fairly liberal about this number because there are lots of
2398 * scenarios where most transactions are done by automatic clients that
2399 * won't pay attention to warnings. (No, we're not gonna make this
2400 * configurable. If you know enough to configure it, you know enough to
2401 * not get in this kind of trouble in the first place.)
2403 multiWarnLimit
= multiWrapLimit
- 40000000;
2404 if (multiWarnLimit
< FirstMultiXactId
)
2405 multiWarnLimit
-= FirstMultiXactId
;
2408 * We'll start trying to force autovacuums when oldest_datminmxid gets to
2409 * be more than autovacuum_multixact_freeze_max_age mxids old.
2411 * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
2412 * so that we don't have to worry about dealing with on-the-fly changes in
2413 * its value. See SetTransactionIdLimit.
2415 multiVacLimit
= oldest_datminmxid
+ autovacuum_multixact_freeze_max_age
;
2416 if (multiVacLimit
< FirstMultiXactId
)
2417 multiVacLimit
+= FirstMultiXactId
;
2419 /* Grab lock for just long enough to set the new limit values */
2420 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
2421 MultiXactState
->oldestMultiXactId
= oldest_datminmxid
;
2422 MultiXactState
->oldestMultiXactDB
= oldest_datoid
;
2423 MultiXactState
->multiVacLimit
= multiVacLimit
;
2424 MultiXactState
->multiWarnLimit
= multiWarnLimit
;
2425 MultiXactState
->multiStopLimit
= multiStopLimit
;
2426 MultiXactState
->multiWrapLimit
= multiWrapLimit
;
2427 curMulti
= MultiXactState
->nextMXact
;
2428 LWLockRelease(MultiXactGenLock
);
2432 (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
2433 multiWrapLimit
, oldest_datoid
)));
2436 * Computing the actual limits is only possible once the data directory is
2437 * in a consistent state. There's no need to compute the limits while
2438 * still replaying WAL - no decisions about new multis are made even
2439 * though multixact creations might be replayed. So we'll only do further
2440 * checks after TrimMultiXact() has been called.
2442 if (!MultiXactState
->finishedStartup
)
2445 Assert(!InRecovery
);
2447 /* Set limits for offset vacuum. */
2448 needs_offset_vacuum
= SetOffsetVacuumLimit(is_startup
);
2451 * If past the autovacuum force point, immediately signal an autovac
2452 * request. The reason for this is that autovac only processes one
2453 * database per invocation. Once it's finished cleaning up the oldest
2454 * database, it'll call here, and we'll signal the postmaster to start
2455 * another iteration immediately if there are still any old databases.
2457 if ((MultiXactIdPrecedes(multiVacLimit
, curMulti
) ||
2458 needs_offset_vacuum
) && IsUnderPostmaster
)
2459 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER
);
2461 /* Give an immediate warning if past the wrap warn point */
2462 if (MultiXactIdPrecedes(multiWarnLimit
, curMulti
))
2464 char *oldest_datname
;
2467 * We can be called when not inside a transaction, for example during
2468 * StartupXLOG(). In such a case we cannot do database access, so we
2469 * must just report the oldest DB's OID.
2471 * Note: it's also possible that get_database_name fails and returns
2472 * NULL, for example because the database just got dropped. We'll
2473 * still warn, even though the warning might now be unnecessary.
2475 if (IsTransactionState())
2476 oldest_datname
= get_database_name(oldest_datoid
);
2478 oldest_datname
= NULL
;
2482 (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
2483 "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
2484 multiWrapLimit
- curMulti
,
2486 multiWrapLimit
- curMulti
),
2487 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2488 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2491 (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
2492 "database with OID %u must be vacuumed before %u more MultiXactIds are used",
2493 multiWrapLimit
- curMulti
,
2495 multiWrapLimit
- curMulti
),
2496 errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n"
2497 "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
2502 * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
2503 * and similarly nextOffset is at least minMultiOffset.
2505 * This is used when we can determine minimum safe values from an XLog
2506 * record (either an on-line checkpoint or an mxact creation log entry).
2507 * Although this is only called during XLog replay, we take the lock in case
2508 * any hot-standby backends are examining the values.
2511 MultiXactAdvanceNextMXact(MultiXactId minMulti
,
2512 MultiXactOffset minMultiOffset
)
2514 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
2515 if (MultiXactIdPrecedes(MultiXactState
->nextMXact
, minMulti
))
2517 debug_elog3(DEBUG2
, "MultiXact: setting next multi to %u", minMulti
);
2518 MultiXactState
->nextMXact
= minMulti
;
2520 if (MultiXactOffsetPrecedes(MultiXactState
->nextOffset
, minMultiOffset
))
2522 debug_elog3(DEBUG2
, "MultiXact: setting next offset to %u",
2524 MultiXactState
->nextOffset
= minMultiOffset
;
2526 LWLockRelease(MultiXactGenLock
);
2530 * Update our oldestMultiXactId value, but only if it's more recent than what
2533 * This may only be called during WAL replay.
2536 MultiXactAdvanceOldest(MultiXactId oldestMulti
, Oid oldestMultiDB
)
2540 if (MultiXactIdPrecedes(MultiXactState
->oldestMultiXactId
, oldestMulti
))
2541 SetMultiXactIdLimit(oldestMulti
, oldestMultiDB
, false);
2545 * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
2547 * NB: this is called while holding MultiXactGenLock. We want it to be very
2548 * fast most of the time; even when it's not so fast, no actual I/O need
2549 * happen unless we're forced to write out a dirty log or xlog page to make
2550 * room in shared memory.
2553 ExtendMultiXactOffset(MultiXactId multi
)
2559 * No work except at first MultiXactId of a page. But beware: just after
2560 * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
2562 if (MultiXactIdToOffsetEntry(multi
) != 0 &&
2563 multi
!= FirstMultiXactId
)
2566 pageno
= MultiXactIdToOffsetPage(multi
);
2567 lock
= SimpleLruGetBankLock(MultiXactOffsetCtl
, pageno
);
2569 LWLockAcquire(lock
, LW_EXCLUSIVE
);
2571 /* Zero the page and make an XLOG entry about it */
2572 ZeroMultiXactOffsetPage(pageno
, true);
2574 LWLockRelease(lock
);
2578 * Make sure that MultiXactMember has room for the members of a newly-
2579 * allocated MultiXactId.
2581 * Like the above routine, this is called while holding MultiXactGenLock;
2582 * same comments apply.
2585 ExtendMultiXactMember(MultiXactOffset offset
, int nmembers
)
2588 * It's possible that the members span more than one page of the members
2589 * file, so we loop to ensure we consider each page. The coding is not
2590 * optimal if the members span several pages, but that seems unusual
2591 * enough to not worry much about.
2593 while (nmembers
> 0)
2600 * Only zero when at first entry of a page.
2602 flagsoff
= MXOffsetToFlagsOffset(offset
);
2603 flagsbit
= MXOffsetToFlagsBitShift(offset
);
2604 if (flagsoff
== 0 && flagsbit
== 0)
2609 pageno
= MXOffsetToMemberPage(offset
);
2610 lock
= SimpleLruGetBankLock(MultiXactMemberCtl
, pageno
);
2612 LWLockAcquire(lock
, LW_EXCLUSIVE
);
2614 /* Zero the page and make an XLOG entry about it */
2615 ZeroMultiXactMemberPage(pageno
, true);
2617 LWLockRelease(lock
);
2621 * Compute the number of items till end of current page. Careful: if
2622 * addition of unsigned ints wraps around, we're at the last page of
2623 * the last segment; since that page holds a different number of items
2624 * than other pages, we need to do it differently.
2626 if (offset
+ MAX_MEMBERS_IN_LAST_MEMBERS_PAGE
< offset
)
2629 * This is the last page of the last segment; we can compute the
2630 * number of items left to allocate in it without modulo
2633 difference
= MaxMultiXactOffset
- offset
+ 1;
2636 difference
= MULTIXACT_MEMBERS_PER_PAGE
- offset
% MULTIXACT_MEMBERS_PER_PAGE
;
2639 * Advance to next page, taking care to properly handle the wraparound
2640 * case. OK if nmembers goes negative.
2642 nmembers
-= difference
;
2643 offset
+= difference
;
2648 * GetOldestMultiXactId
2650 * Return the oldest MultiXactId that's still possibly still seen as live by
2651 * any running transaction. Older ones might still exist on disk, but they no
2652 * longer have any running member transaction.
2654 * It's not safe to truncate MultiXact SLRU segments on the value returned by
2655 * this function; however, it can be set as the new relminmxid for any table
2656 * that VACUUM knows has no remaining MXIDs < the same value. It is only safe
2657 * to truncate SLRUs when no table can possibly still have a referencing MXID.
2660 GetOldestMultiXactId(void)
2662 MultiXactId oldestMXact
;
2663 MultiXactId nextMXact
;
2667 * This is the oldest valid value among all the OldestMemberMXactId[] and
2668 * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
2670 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
2673 * We have to beware of the possibility that nextMXact is in the
2674 * wrapped-around state. We don't fix the counter itself here, but we
2675 * must be sure to use a valid value in our calculation.
2677 nextMXact
= MultiXactState
->nextMXact
;
2678 if (nextMXact
< FirstMultiXactId
)
2679 nextMXact
= FirstMultiXactId
;
2681 oldestMXact
= nextMXact
;
2682 for (i
= 0; i
< MaxOldestSlot
; i
++)
2684 MultiXactId thisoldest
;
2686 thisoldest
= OldestMemberMXactId
[i
];
2687 if (MultiXactIdIsValid(thisoldest
) &&
2688 MultiXactIdPrecedes(thisoldest
, oldestMXact
))
2689 oldestMXact
= thisoldest
;
2690 thisoldest
= OldestVisibleMXactId
[i
];
2691 if (MultiXactIdIsValid(thisoldest
) &&
2692 MultiXactIdPrecedes(thisoldest
, oldestMXact
))
2693 oldestMXact
= thisoldest
;
2696 LWLockRelease(MultiXactGenLock
);
2702 * Determine how aggressively we need to vacuum in order to prevent member
2705 * To do so determine what's the oldest member offset and install the limit
2706 * info in MultiXactState, where it can be used to prevent overrun of old data
2707 * in the members SLRU area.
2709 * The return value is true if emergency autovacuum is required and false
2713 SetOffsetVacuumLimit(bool is_startup
)
2715 MultiXactId oldestMultiXactId
;
2716 MultiXactId nextMXact
;
2717 MultiXactOffset oldestOffset
= 0; /* placate compiler */
2718 MultiXactOffset prevOldestOffset
;
2719 MultiXactOffset nextOffset
;
2720 bool oldestOffsetKnown
= false;
2721 bool prevOldestOffsetKnown
;
2722 MultiXactOffset offsetStopLimit
= 0;
2723 MultiXactOffset prevOffsetStopLimit
;
2726 * NB: Have to prevent concurrent truncation, we might otherwise try to
2727 * lookup an oldestMulti that's concurrently getting truncated away.
2729 LWLockAcquire(MultiXactTruncationLock
, LW_SHARED
);
2731 /* Read relevant fields from shared memory. */
2732 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
2733 oldestMultiXactId
= MultiXactState
->oldestMultiXactId
;
2734 nextMXact
= MultiXactState
->nextMXact
;
2735 nextOffset
= MultiXactState
->nextOffset
;
2736 prevOldestOffsetKnown
= MultiXactState
->oldestOffsetKnown
;
2737 prevOldestOffset
= MultiXactState
->oldestOffset
;
2738 prevOffsetStopLimit
= MultiXactState
->offsetStopLimit
;
2739 Assert(MultiXactState
->finishedStartup
);
2740 LWLockRelease(MultiXactGenLock
);
2743 * Determine the offset of the oldest multixact. Normally, we can read
2744 * the offset from the multixact itself, but there's an important special
2745 * case: if there are no multixacts in existence at all, oldestMXact
2746 * obviously can't point to one. It will instead point to the multixact
2747 * ID that will be assigned the next time one is needed.
2749 if (oldestMultiXactId
== nextMXact
)
2752 * When the next multixact gets created, it will be stored at the next
2755 oldestOffset
= nextOffset
;
2756 oldestOffsetKnown
= true;
2761 * Figure out where the oldest existing multixact's offsets are
2762 * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X,
2763 * the supposedly-earliest multixact might not really exist. We are
2764 * careful not to fail in that case.
2767 find_multixact_start(oldestMultiXactId
, &oldestOffset
);
2769 if (oldestOffsetKnown
)
2771 (errmsg_internal("oldest MultiXactId member is at offset %u",
2775 (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
2776 oldestMultiXactId
)));
2779 LWLockRelease(MultiXactTruncationLock
);
2782 * If we can, compute limits (and install them MultiXactState) to prevent
2783 * overrun of old data in the members SLRU area. We can only do so if the
2784 * oldest offset is known though.
2786 if (oldestOffsetKnown
)
2788 /* move back to start of the corresponding segment */
2789 offsetStopLimit
= oldestOffset
- (oldestOffset
%
2790 (MULTIXACT_MEMBERS_PER_PAGE
* SLRU_PAGES_PER_SEGMENT
));
2792 /* always leave one segment before the wraparound point */
2793 offsetStopLimit
-= (MULTIXACT_MEMBERS_PER_PAGE
* SLRU_PAGES_PER_SEGMENT
);
2795 if (!prevOldestOffsetKnown
&& !is_startup
)
2797 (errmsg("MultiXact member wraparound protections are now enabled")));
2800 (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
2801 offsetStopLimit
, oldestMultiXactId
)));
2803 else if (prevOldestOffsetKnown
)
2806 * If we failed to get the oldest offset this time, but we have a
2807 * value from a previous pass through this function, use the old
2808 * values rather than automatically forcing an emergency autovacuum
2811 oldestOffset
= prevOldestOffset
;
2812 oldestOffsetKnown
= true;
2813 offsetStopLimit
= prevOffsetStopLimit
;
2816 /* Install the computed values */
2817 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
2818 MultiXactState
->oldestOffset
= oldestOffset
;
2819 MultiXactState
->oldestOffsetKnown
= oldestOffsetKnown
;
2820 MultiXactState
->offsetStopLimit
= offsetStopLimit
;
2821 LWLockRelease(MultiXactGenLock
);
2824 * Do we need an emergency autovacuum? If we're not sure, assume yes.
2826 return !oldestOffsetKnown
||
2827 (nextOffset
- oldestOffset
> MULTIXACT_MEMBER_SAFE_THRESHOLD
);
2831 * Return whether adding "distance" to "start" would move past "boundary".
2833 * We use this to determine whether the addition is "wrapping around" the
2834 * boundary point, hence the name. The reason we don't want to use the regular
2835 * 2^31-modulo arithmetic here is that we want to be able to use the whole of
2836 * the 2^32-1 space here, allowing for more multixacts than would fit
2840 MultiXactOffsetWouldWrap(MultiXactOffset boundary
, MultiXactOffset start
,
2843 MultiXactOffset finish
;
2846 * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
2847 * if the addition wraps around the UINT_MAX boundary, skip that value.
2849 finish
= start
+ distance
;
2853 /*-----------------------------------------------------------------------
2854 * When the boundary is numerically greater than the starting point, any
2855 * value numerically between the two is not wrapped:
2858 * [---) = F wrapped past B (and UINT_MAX)
2859 * [---) = F not wrapped
2860 * [----] = F wrapped past B
2862 * When the boundary is numerically less than the starting point (i.e. the
2863 * UINT_MAX wraparound occurs somewhere in between) then all values in
2864 * between are wrapped:
2867 * [---) = F not wrapped past B (but wrapped past UINT_MAX)
2868 * [---) = F wrapped past B (and UINT_MAX)
2869 * [----] = F not wrapped
2870 *-----------------------------------------------------------------------
2872 if (start
< boundary
)
2873 return finish
>= boundary
|| finish
< start
;
2875 return finish
>= boundary
&& finish
< start
;
2879 * Find the starting offset of the given MultiXactId.
2881 * Returns false if the file containing the multi does not exist on disk.
2882 * Otherwise, returns true and sets *result to the starting member offset.
2884 * This function does not prevent concurrent truncation, so if that's
2885 * required, the caller has to protect against that.
2888 find_multixact_start(MultiXactId multi
, MultiXactOffset
*result
)
2890 MultiXactOffset offset
;
2894 MultiXactOffset
*offptr
;
2896 Assert(MultiXactState
->finishedStartup
);
2898 pageno
= MultiXactIdToOffsetPage(multi
);
2899 entryno
= MultiXactIdToOffsetEntry(multi
);
2902 * Write out dirty data, so PhysicalPageExists can work correctly.
2904 SimpleLruWriteAll(MultiXactOffsetCtl
, true);
2905 SimpleLruWriteAll(MultiXactMemberCtl
, true);
2907 if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl
, pageno
))
2910 /* lock is acquired by SimpleLruReadPage_ReadOnly */
2911 slotno
= SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl
, pageno
, multi
);
2912 offptr
= (MultiXactOffset
*) MultiXactOffsetCtl
->shared
->page_buffer
[slotno
];
2915 LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl
, pageno
));
2922 * Determine how many multixacts, and how many multixact members, currently
2923 * exist. Return false if unable to determine.
2926 ReadMultiXactCounts(uint32
*multixacts
, MultiXactOffset
*members
)
2928 MultiXactOffset nextOffset
;
2929 MultiXactOffset oldestOffset
;
2930 MultiXactId oldestMultiXactId
;
2931 MultiXactId nextMultiXactId
;
2932 bool oldestOffsetKnown
;
2934 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
2935 nextOffset
= MultiXactState
->nextOffset
;
2936 oldestMultiXactId
= MultiXactState
->oldestMultiXactId
;
2937 nextMultiXactId
= MultiXactState
->nextMXact
;
2938 oldestOffset
= MultiXactState
->oldestOffset
;
2939 oldestOffsetKnown
= MultiXactState
->oldestOffsetKnown
;
2940 LWLockRelease(MultiXactGenLock
);
2942 if (!oldestOffsetKnown
)
2945 *members
= nextOffset
- oldestOffset
;
2946 *multixacts
= nextMultiXactId
- oldestMultiXactId
;
2951 * Multixact members can be removed once the multixacts that refer to them
2952 * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
2953 * vacuum_multixact_freeze_table_age work together to make sure we never have
2954 * too many multixacts; we hope that, at least under normal circumstances,
2955 * this will also be sufficient to keep us from using too many offsets.
2956 * However, if the average multixact has many members, we might exhaust the
2957 * members space while still using few enough members that these limits fail
2958 * to trigger relminmxid advancement by VACUUM. At that point, we'd have no
2959 * choice but to start failing multixact-creating operations with an error.
2961 * To prevent that, if more than a threshold portion of the members space is
2962 * used, we effectively reduce autovacuum_multixact_freeze_max_age and
2963 * to a value just less than the number of multixacts in use. We hope that
2964 * this will quickly trigger autovacuuming on the table or tables with the
2965 * oldest relminmxid, thus allowing datminmxid values to advance and removing
2968 * As the fraction of the member space currently in use grows, we become
2969 * more aggressive in clamping this value. That not only causes autovacuum
2970 * to ramp up, but also makes any manual vacuums the user issues more
2971 * aggressive. This happens because vacuum_get_cutoffs() will clamp the
2972 * freeze table and the minimum freeze age cutoffs based on the effective
2973 * autovacuum_multixact_freeze_max_age this function returns. In the worst
2974 * case, we'll claim the freeze_max_age to zero, and every vacuum of any
2975 * table will freeze every multixact.
2978 MultiXactMemberFreezeThreshold(void)
2980 MultiXactOffset members
;
2982 uint32 victim_multixacts
;
2986 /* If we can't determine member space utilization, assume the worst. */
2987 if (!ReadMultiXactCounts(&multixacts
, &members
))
2990 /* If member space utilization is low, no special action is required. */
2991 if (members
<= MULTIXACT_MEMBER_SAFE_THRESHOLD
)
2992 return autovacuum_multixact_freeze_max_age
;
2995 * Compute a target for relminmxid advancement. The number of multixacts
2996 * we try to eliminate from the system is based on how far we are past
2997 * MULTIXACT_MEMBER_SAFE_THRESHOLD.
2999 fraction
= (double) (members
- MULTIXACT_MEMBER_SAFE_THRESHOLD
) /
3000 (MULTIXACT_MEMBER_DANGER_THRESHOLD
- MULTIXACT_MEMBER_SAFE_THRESHOLD
);
3001 victim_multixacts
= multixacts
* fraction
;
3003 /* fraction could be > 1.0, but lowest possible freeze age is zero */
3004 if (victim_multixacts
> multixacts
)
3006 result
= multixacts
- victim_multixacts
;
3009 * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
3010 * autovacuum less aggressive than it would otherwise be.
3012 return Min(result
, autovacuum_multixact_freeze_max_age
);
3015 typedef struct mxtruncinfo
3017 int64 earliestExistingPage
;
3021 * SlruScanDirectory callback
3022 * This callback determines the earliest existing page number.
3025 SlruScanDirCbFindEarliest(SlruCtl ctl
, char *filename
, int64 segpage
, void *data
)
3027 mxtruncinfo
*trunc
= (mxtruncinfo
*) data
;
3029 if (trunc
->earliestExistingPage
== -1 ||
3030 ctl
->PagePrecedes(segpage
, trunc
->earliestExistingPage
))
3032 trunc
->earliestExistingPage
= segpage
;
3035 return false; /* keep going */
3040 * Delete members segments [oldest, newOldest)
3042 * The members SLRU can, in contrast to the offsets one, be filled to almost
3043 * the full range at once. This means SimpleLruTruncate() can't trivially be
3044 * used - instead the to-be-deleted range is computed using the offsets
3045 * SLRU. C.f. TruncateMultiXact().
3048 PerformMembersTruncation(MultiXactOffset oldestOffset
, MultiXactOffset newOldestOffset
)
3050 const int64 maxsegment
= MXOffsetToMemberSegment(MaxMultiXactOffset
);
3051 int64 startsegment
= MXOffsetToMemberSegment(oldestOffset
);
3052 int64 endsegment
= MXOffsetToMemberSegment(newOldestOffset
);
3053 int64 segment
= startsegment
;
3056 * Delete all the segments but the last one. The last segment can still
3057 * contain, possibly partially, valid data.
3059 while (segment
!= endsegment
)
3061 elog(DEBUG2
, "truncating multixact members segment %llx",
3062 (unsigned long long) segment
);
3063 SlruDeleteSegment(MultiXactMemberCtl
, segment
);
3065 /* move to next segment, handling wraparound correctly */
3066 if (segment
== maxsegment
)
3074 * Delete offsets segments [oldest, newOldest)
3077 PerformOffsetsTruncation(MultiXactId oldestMulti
, MultiXactId newOldestMulti
)
3080 * We step back one multixact to avoid passing a cutoff page that hasn't
3081 * been created yet in the rare case that oldestMulti would be the first
3082 * item on a page and oldestMulti == nextMulti. In that case, if we
3083 * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
3086 SimpleLruTruncate(MultiXactOffsetCtl
,
3087 MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti
)));
3091 * Remove all MultiXactOffset and MultiXactMember segments before the oldest
3092 * ones still of interest.
3094 * This is only called on a primary as part of vacuum (via
3095 * vac_truncate_clog()). During recovery truncation is done by replaying
3096 * truncation WAL records logged here.
3098 * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
3099 * is one of the databases preventing newOldestMulti from increasing.
3102 TruncateMultiXact(MultiXactId newOldestMulti
, Oid newOldestMultiDB
)
3104 MultiXactId oldestMulti
;
3105 MultiXactId nextMulti
;
3106 MultiXactOffset newOldestOffset
;
3107 MultiXactOffset oldestOffset
;
3108 MultiXactOffset nextOffset
;
3110 MultiXactId earliest
;
3112 Assert(!RecoveryInProgress());
3113 Assert(MultiXactState
->finishedStartup
);
3116 * We can only allow one truncation to happen at once. Otherwise parts of
3117 * members might vanish while we're doing lookups or similar. There's no
3118 * need to have an interlock with creating new multis or such, since those
3119 * are constrained by the limits (which only grow, never shrink).
3121 LWLockAcquire(MultiXactTruncationLock
, LW_EXCLUSIVE
);
3123 LWLockAcquire(MultiXactGenLock
, LW_SHARED
);
3124 nextMulti
= MultiXactState
->nextMXact
;
3125 nextOffset
= MultiXactState
->nextOffset
;
3126 oldestMulti
= MultiXactState
->oldestMultiXactId
;
3127 LWLockRelease(MultiXactGenLock
);
3128 Assert(MultiXactIdIsValid(oldestMulti
));
3131 * Make sure to only attempt truncation if there's values to truncate
3132 * away. In normal processing values shouldn't go backwards, but there's
3133 * some corner cases (due to bugs) where that's possible.
3135 if (MultiXactIdPrecedesOrEquals(newOldestMulti
, oldestMulti
))
3137 LWLockRelease(MultiXactTruncationLock
);
3142 * Note we can't just plow ahead with the truncation; it's possible that
3143 * there are no segments to truncate, which is a problem because we are
3144 * going to attempt to read the offsets page to determine where to
3145 * truncate the members SLRU. So we first scan the directory to determine
3146 * the earliest offsets page number that we can read without error.
3148 * When nextMXact is less than one segment away from multiWrapLimit,
3149 * SlruScanDirCbFindEarliest can find some early segment other than the
3150 * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
3151 * returns false, because not all pairs of entries have the same answer.)
3152 * That can also arise when an earlier truncation attempt failed unlink()
3153 * or returned early from this function. The only consequence is
3154 * returning early, which wastes space that we could have liberated.
3156 * NB: It's also possible that the page that oldestMulti is on has already
3157 * been truncated away, and we crashed before updating oldestMulti.
3159 trunc
.earliestExistingPage
= -1;
3160 SlruScanDirectory(MultiXactOffsetCtl
, SlruScanDirCbFindEarliest
, &trunc
);
3161 earliest
= trunc
.earliestExistingPage
* MULTIXACT_OFFSETS_PER_PAGE
;
3162 if (earliest
< FirstMultiXactId
)
3163 earliest
= FirstMultiXactId
;
3165 /* If there's nothing to remove, we can bail out early. */
3166 if (MultiXactIdPrecedes(oldestMulti
, earliest
))
3168 LWLockRelease(MultiXactTruncationLock
);
3173 * First, compute the safe truncation point for MultiXactMember. This is
3174 * the starting offset of the oldest multixact.
3176 * Hopefully, find_multixact_start will always work here, because we've
3177 * already checked that it doesn't precede the earliest MultiXact on disk.
3178 * But if it fails, don't truncate anything, and log a message.
3180 if (oldestMulti
== nextMulti
)
3182 /* there are NO MultiXacts */
3183 oldestOffset
= nextOffset
;
3185 else if (!find_multixact_start(oldestMulti
, &oldestOffset
))
3188 (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
3189 oldestMulti
, earliest
)));
3190 LWLockRelease(MultiXactTruncationLock
);
3195 * Secondly compute up to where to truncate. Lookup the corresponding
3196 * member offset for newOldestMulti for that.
3198 if (newOldestMulti
== nextMulti
)
3200 /* there are NO MultiXacts */
3201 newOldestOffset
= nextOffset
;
3203 else if (!find_multixact_start(newOldestMulti
, &newOldestOffset
))
3206 (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
3208 LWLockRelease(MultiXactTruncationLock
);
3212 elog(DEBUG1
, "performing multixact truncation: "
3213 "offsets [%u, %u), offsets segments [%llx, %llx), "
3214 "members [%u, %u), members segments [%llx, %llx)",
3215 oldestMulti
, newOldestMulti
,
3216 (unsigned long long) MultiXactIdToOffsetSegment(oldestMulti
),
3217 (unsigned long long) MultiXactIdToOffsetSegment(newOldestMulti
),
3218 oldestOffset
, newOldestOffset
,
3219 (unsigned long long) MXOffsetToMemberSegment(oldestOffset
),
3220 (unsigned long long) MXOffsetToMemberSegment(newOldestOffset
));
3223 * Do truncation, and the WAL logging of the truncation, in a critical
3224 * section. That way offsets/members cannot get out of sync anymore, i.e.
3225 * once consistent the newOldestMulti will always exist in members, even
3226 * if we crashed in the wrong moment.
3228 START_CRIT_SECTION();
3231 * Prevent checkpoints from being scheduled concurrently. This is critical
3232 * because otherwise a truncation record might not be replayed after a
3233 * crash/basebackup, even though the state of the data directory would
3236 Assert((MyProc
->delayChkptFlags
& DELAY_CHKPT_START
) == 0);
3237 MyProc
->delayChkptFlags
|= DELAY_CHKPT_START
;
3239 /* WAL log truncation */
3240 WriteMTruncateXlogRec(newOldestMultiDB
,
3241 oldestMulti
, newOldestMulti
,
3242 oldestOffset
, newOldestOffset
);
3245 * Update in-memory limits before performing the truncation, while inside
3246 * the critical section: Have to do it before truncation, to prevent
3247 * concurrent lookups of those values. Has to be inside the critical
3248 * section as otherwise a future call to this function would error out,
3249 * while looking up the oldest member in offsets, if our caller crashes
3250 * before updating the limits.
3252 LWLockAcquire(MultiXactGenLock
, LW_EXCLUSIVE
);
3253 MultiXactState
->oldestMultiXactId
= newOldestMulti
;
3254 MultiXactState
->oldestMultiXactDB
= newOldestMultiDB
;
3255 LWLockRelease(MultiXactGenLock
);
3257 /* First truncate members */
3258 PerformMembersTruncation(oldestOffset
, newOldestOffset
);
3261 PerformOffsetsTruncation(oldestMulti
, newOldestMulti
);
3263 MyProc
->delayChkptFlags
&= ~DELAY_CHKPT_START
;
3266 LWLockRelease(MultiXactTruncationLock
);
3270 * Decide whether a MultiXactOffset page number is "older" for truncation
3271 * purposes. Analogous to CLOGPagePrecedes().
3273 * Offsetting the values is optional, because MultiXactIdPrecedes() has
3274 * translational symmetry.
3277 MultiXactOffsetPagePrecedes(int64 page1
, int64 page2
)
3282 multi1
= ((MultiXactId
) page1
) * MULTIXACT_OFFSETS_PER_PAGE
;
3283 multi1
+= FirstMultiXactId
+ 1;
3284 multi2
= ((MultiXactId
) page2
) * MULTIXACT_OFFSETS_PER_PAGE
;
3285 multi2
+= FirstMultiXactId
+ 1;
3287 return (MultiXactIdPrecedes(multi1
, multi2
) &&
3288 MultiXactIdPrecedes(multi1
,
3289 multi2
+ MULTIXACT_OFFSETS_PER_PAGE
- 1));
3293 * Decide whether a MultiXactMember page number is "older" for truncation
3294 * purposes. There is no "invalid offset number" so use the numbers verbatim.
3297 MultiXactMemberPagePrecedes(int64 page1
, int64 page2
)
3299 MultiXactOffset offset1
;
3300 MultiXactOffset offset2
;
3302 offset1
= ((MultiXactOffset
) page1
) * MULTIXACT_MEMBERS_PER_PAGE
;
3303 offset2
= ((MultiXactOffset
) page2
) * MULTIXACT_MEMBERS_PER_PAGE
;
3305 return (MultiXactOffsetPrecedes(offset1
, offset2
) &&
3306 MultiXactOffsetPrecedes(offset1
,
3307 offset2
+ MULTIXACT_MEMBERS_PER_PAGE
- 1));
3311 * Decide which of two MultiXactIds is earlier.
3313 * XXX do we need to do something special for InvalidMultiXactId?
3314 * (Doesn't look like it.)
3317 MultiXactIdPrecedes(MultiXactId multi1
, MultiXactId multi2
)
3319 int32 diff
= (int32
) (multi1
- multi2
);
3325 * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
3327 * XXX do we need to do something special for InvalidMultiXactId?
3328 * (Doesn't look like it.)
3331 MultiXactIdPrecedesOrEquals(MultiXactId multi1
, MultiXactId multi2
)
3333 int32 diff
= (int32
) (multi1
- multi2
);
3340 * Decide which of two offsets is earlier.
3343 MultiXactOffsetPrecedes(MultiXactOffset offset1
, MultiXactOffset offset2
)
3345 int32 diff
= (int32
) (offset1
- offset2
);
3351 * Write an xlog record reflecting the zeroing of either a MEMBERs or
3352 * OFFSETs page (info shows which)
3355 WriteMZeroPageXlogRec(int64 pageno
, uint8 info
)
3358 XLogRegisterData((char *) (&pageno
), sizeof(pageno
));
3359 (void) XLogInsert(RM_MULTIXACT_ID
, info
);
3363 * Write a TRUNCATE xlog record
3365 * We must flush the xlog record to disk before returning --- see notes in
3369 WriteMTruncateXlogRec(Oid oldestMultiDB
,
3370 MultiXactId startTruncOff
, MultiXactId endTruncOff
,
3371 MultiXactOffset startTruncMemb
, MultiXactOffset endTruncMemb
)
3374 xl_multixact_truncate xlrec
;
3376 xlrec
.oldestMultiDB
= oldestMultiDB
;
3378 xlrec
.startTruncOff
= startTruncOff
;
3379 xlrec
.endTruncOff
= endTruncOff
;
3381 xlrec
.startTruncMemb
= startTruncMemb
;
3382 xlrec
.endTruncMemb
= endTruncMemb
;
3385 XLogRegisterData((char *) (&xlrec
), SizeOfMultiXactTruncate
);
3386 recptr
= XLogInsert(RM_MULTIXACT_ID
, XLOG_MULTIXACT_TRUNCATE_ID
);
3391 * MULTIXACT resource manager's routines
3394 multixact_redo(XLogReaderState
*record
)
3396 uint8 info
= XLogRecGetInfo(record
) & ~XLR_INFO_MASK
;
3398 /* Backup blocks are not used in multixact records */
3399 Assert(!XLogRecHasAnyBlockRefs(record
));
3401 if (info
== XLOG_MULTIXACT_ZERO_OFF_PAGE
)
3407 memcpy(&pageno
, XLogRecGetData(record
), sizeof(pageno
));
3409 lock
= SimpleLruGetBankLock(MultiXactOffsetCtl
, pageno
);
3410 LWLockAcquire(lock
, LW_EXCLUSIVE
);
3412 slotno
= ZeroMultiXactOffsetPage(pageno
, false);
3413 SimpleLruWritePage(MultiXactOffsetCtl
, slotno
);
3414 Assert(!MultiXactOffsetCtl
->shared
->page_dirty
[slotno
]);
3416 LWLockRelease(lock
);
3418 else if (info
== XLOG_MULTIXACT_ZERO_MEM_PAGE
)
3424 memcpy(&pageno
, XLogRecGetData(record
), sizeof(pageno
));
3426 lock
= SimpleLruGetBankLock(MultiXactMemberCtl
, pageno
);
3427 LWLockAcquire(lock
, LW_EXCLUSIVE
);
3429 slotno
= ZeroMultiXactMemberPage(pageno
, false);
3430 SimpleLruWritePage(MultiXactMemberCtl
, slotno
);
3431 Assert(!MultiXactMemberCtl
->shared
->page_dirty
[slotno
]);
3433 LWLockRelease(lock
);
3435 else if (info
== XLOG_MULTIXACT_CREATE_ID
)
3437 xl_multixact_create
*xlrec
=
3438 (xl_multixact_create
*) XLogRecGetData(record
);
3439 TransactionId max_xid
;
3442 /* Store the data back into the SLRU files */
3443 RecordNewMultiXact(xlrec
->mid
, xlrec
->moff
, xlrec
->nmembers
,
3446 /* Make sure nextMXact/nextOffset are beyond what this record has */
3447 MultiXactAdvanceNextMXact(xlrec
->mid
+ 1,
3448 xlrec
->moff
+ xlrec
->nmembers
);
3451 * Make sure nextXid is beyond any XID mentioned in the record. This
3452 * should be unnecessary, since any XID found here ought to have other
3453 * evidence in the XLOG, but let's be safe.
3455 max_xid
= XLogRecGetXid(record
);
3456 for (i
= 0; i
< xlrec
->nmembers
; i
++)
3458 if (TransactionIdPrecedes(max_xid
, xlrec
->members
[i
].xid
))
3459 max_xid
= xlrec
->members
[i
].xid
;
3462 AdvanceNextFullTransactionIdPastXid(max_xid
);
3464 else if (info
== XLOG_MULTIXACT_TRUNCATE_ID
)
3466 xl_multixact_truncate xlrec
;
3469 memcpy(&xlrec
, XLogRecGetData(record
),
3470 SizeOfMultiXactTruncate
);
3472 elog(DEBUG1
, "replaying multixact truncation: "
3473 "offsets [%u, %u), offsets segments [%llx, %llx), "
3474 "members [%u, %u), members segments [%llx, %llx)",
3475 xlrec
.startTruncOff
, xlrec
.endTruncOff
,
3476 (unsigned long long) MultiXactIdToOffsetSegment(xlrec
.startTruncOff
),
3477 (unsigned long long) MultiXactIdToOffsetSegment(xlrec
.endTruncOff
),
3478 xlrec
.startTruncMemb
, xlrec
.endTruncMemb
,
3479 (unsigned long long) MXOffsetToMemberSegment(xlrec
.startTruncMemb
),
3480 (unsigned long long) MXOffsetToMemberSegment(xlrec
.endTruncMemb
));
3482 /* should not be required, but more than cheap enough */
3483 LWLockAcquire(MultiXactTruncationLock
, LW_EXCLUSIVE
);
3486 * Advance the horizon values, so they're current at the end of
3489 SetMultiXactIdLimit(xlrec
.endTruncOff
, xlrec
.oldestMultiDB
, false);
3491 PerformMembersTruncation(xlrec
.startTruncMemb
, xlrec
.endTruncMemb
);
3494 * During XLOG replay, latest_page_number isn't necessarily set up
3495 * yet; insert a suitable value to bypass the sanity test in
3496 * SimpleLruTruncate.
3498 pageno
= MultiXactIdToOffsetPage(xlrec
.endTruncOff
);
3499 pg_atomic_write_u64(&MultiXactOffsetCtl
->shared
->latest_page_number
,
3501 PerformOffsetsTruncation(xlrec
.startTruncOff
, xlrec
.endTruncOff
);
3503 LWLockRelease(MultiXactTruncationLock
);
3506 elog(PANIC
, "multixact_redo: unknown op code %u", info
);
3510 pg_get_multixact_members(PG_FUNCTION_ARGS
)
3514 MultiXactMember
*members
;
3518 MultiXactId mxid
= PG_GETARG_TRANSACTIONID(0);
3520 FuncCallContext
*funccxt
;
3522 if (mxid
< FirstMultiXactId
)
3524 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
3525 errmsg("invalid MultiXactId: %u", mxid
)));
3527 if (SRF_IS_FIRSTCALL())
3529 MemoryContext oldcxt
;
3532 funccxt
= SRF_FIRSTCALL_INIT();
3533 oldcxt
= MemoryContextSwitchTo(funccxt
->multi_call_memory_ctx
);
3535 multi
= palloc(sizeof(mxact
));
3536 /* no need to allow for old values here */
3537 multi
->nmembers
= GetMultiXactIdMembers(mxid
, &multi
->members
, false,
3541 if (get_call_result_type(fcinfo
, NULL
, &tupdesc
) != TYPEFUNC_COMPOSITE
)
3542 elog(ERROR
, "return type must be a row type");
3543 funccxt
->tuple_desc
= tupdesc
;
3544 funccxt
->attinmeta
= TupleDescGetAttInMetadata(tupdesc
);
3545 funccxt
->user_fctx
= multi
;
3547 MemoryContextSwitchTo(oldcxt
);
3550 funccxt
= SRF_PERCALL_SETUP();
3551 multi
= (mxact
*) funccxt
->user_fctx
;
3553 while (multi
->iter
< multi
->nmembers
)
3558 values
[0] = psprintf("%u", multi
->members
[multi
->iter
].xid
);
3559 values
[1] = mxstatus_to_string(multi
->members
[multi
->iter
].status
);
3561 tuple
= BuildTupleFromCStrings(funccxt
->attinmeta
, values
);
3565 SRF_RETURN_NEXT(funccxt
, HeapTupleGetDatum(tuple
));
3568 SRF_RETURN_DONE(funccxt
);
3572 * Entrypoint for sync.c to sync offsets files.
3575 multixactoffsetssyncfiletag(const FileTag
*ftag
, char *path
)
3577 return SlruSyncFileTag(MultiXactOffsetCtl
, ftag
, path
);
3581 * Entrypoint for sync.c to sync members files.
3584 multixactmemberssyncfiletag(const FileTag
*ftag
, char *path
)
3586 return SlruSyncFileTag(MultiXactMemberCtl
, ftag
, path
);