/*-------------------------------------------------------------------------
 *
 * Implementation of generic xlog records.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/transam/generic_xlog.c
 *
 *-------------------------------------------------------------------------
 */
16 #include "access/bufmask.h"
17 #include "access/generic_xlog.h"
18 #include "access/xlogutils.h"
19 #include "miscadmin.h"
/*-------------------------------------------------------------------------
 * Internally, a delta between pages consists of a set of fragments.  Each
 * fragment represents changes made in a given region of a page.  A fragment
 * is made up as follows:
 *
 * - offset of page region (OffsetNumber)
 * - length of page region (OffsetNumber)
 * - data - the data to place into the region ('length' number of bytes)
 *
 * Unchanged regions of a page are not represented in its delta.  As a result,
 * a delta can be more compact than the full page image.  But having an
 * unchanged region between two fragments that is smaller than the fragment
 * header (offset+length) does not pay off in terms of the overall size of
 * the delta.  For this reason, we merge adjacent fragments if the unchanged
 * region between them is <= MATCH_THRESHOLD bytes.
 *
 * We do not bother to merge fragments across the "lower" and "upper" parts
 * of a page; it's very seldom the case that pd_lower and pd_upper are within
 * MATCH_THRESHOLD bytes of each other, and handling that infrequent case
 * would complicate and slow down the delta-computation code unduly.
 * Therefore, the worst-case delta size includes two fragment headers plus
 * a full page's worth of data.
 *-------------------------------------------------------------------------
 */
/* Bytes taken by one fragment header: an offset plus a length. */
#define FRAGMENT_HEADER_SIZE	(sizeof(OffsetNumber) * 2)

/* Matching runs no longer than this are absorbed into the adjacent fragment. */
#define MATCH_THRESHOLD			FRAGMENT_HEADER_SIZE

/* Worst case: a whole page of data plus two fragment headers. */
#define MAX_DELTA_SIZE			(2 * FRAGMENT_HEADER_SIZE + BLCKSZ)
49 /* Struct of generic xlog data for single page */
52 Buffer buffer
; /* registered buffer */
53 int flags
; /* flags for this buffer */
54 int deltaLen
; /* space consumed in delta field */
55 char *image
; /* copy of page image for modification, do not
56 * do it in-place to have aligned memory chunk */
57 char delta
[MAX_DELTA_SIZE
]; /* delta between page images */
58 } GenericXLogPageData
;
61 * State of generic xlog record construction. Must be allocated at an I/O
64 struct GenericXLogState
66 /* Page images (properly aligned, must be first) */
67 PGIOAlignedBlock images
[MAX_GENERIC_XLOG_PAGES
];
68 /* Info about each page, see above */
69 GenericXLogPageData pages
[MAX_GENERIC_XLOG_PAGES
];
73 static void writeFragment(GenericXLogPageData
*pageData
, OffsetNumber offset
,
74 OffsetNumber length
, const char *data
);
75 static void computeRegionDelta(GenericXLogPageData
*pageData
,
76 const char *curpage
, const char *targetpage
,
77 int targetStart
, int targetEnd
,
78 int validStart
, int validEnd
);
79 static void computeDelta(GenericXLogPageData
*pageData
, Page curpage
, Page targetpage
);
80 static void applyPageRedo(Page page
, const char *delta
, Size deltaSize
);
84 * Write next fragment into pageData's delta.
86 * The fragment has the given offset and length, and data points to the
87 * actual data (of length length).
90 writeFragment(GenericXLogPageData
*pageData
, OffsetNumber offset
, OffsetNumber length
,
93 char *ptr
= pageData
->delta
+ pageData
->deltaLen
;
95 /* Verify we have enough space */
96 Assert(pageData
->deltaLen
+ sizeof(offset
) +
97 sizeof(length
) + length
<= sizeof(pageData
->delta
));
99 /* Write fragment data */
100 memcpy(ptr
, &offset
, sizeof(offset
));
101 ptr
+= sizeof(offset
);
102 memcpy(ptr
, &length
, sizeof(length
));
103 ptr
+= sizeof(length
);
104 memcpy(ptr
, data
, length
);
107 pageData
->deltaLen
= ptr
- pageData
->delta
;
111 * Compute the XLOG fragments needed to transform a region of curpage into the
112 * corresponding region of targetpage, and append them to pageData's delta
113 * field. The region to transform runs from targetStart to targetEnd-1.
114 * Bytes in curpage outside the range validStart to validEnd-1 should be
115 * considered invalid, and always overwritten with target data.
117 * This function is a hot spot, so it's worth being as tense as possible
118 * about the data-matching loops.
121 computeRegionDelta(GenericXLogPageData
*pageData
,
122 const char *curpage
, const char *targetpage
,
123 int targetStart
, int targetEnd
,
124 int validStart
, int validEnd
)
131 /* Deal with any invalid start region by including it in first fragment */
132 if (validStart
> targetStart
)
134 fragmentBegin
= targetStart
;
135 targetStart
= validStart
;
138 /* We'll deal with any invalid end region after the main loop */
139 loopEnd
= Min(targetEnd
, validEnd
);
141 /* Examine all the potentially matchable bytes */
145 if (curpage
[i
] != targetpage
[i
])
147 /* On unmatched byte, start new fragment if not already in one */
148 if (fragmentBegin
< 0)
150 /* Mark unmatched-data endpoint as uncertain */
152 /* Extend the fragment as far as possible in a tight loop */
154 while (i
< loopEnd
&& curpage
[i
] != targetpage
[i
])
160 /* Found a matched byte, so remember end of unmatched fragment */
164 * Extend the match as far as possible in a tight loop. (On typical
165 * workloads, this inner loop is the bulk of this function's runtime.)
168 while (i
< loopEnd
&& curpage
[i
] == targetpage
[i
])
172 * There are several possible cases at this point:
174 * 1. We have no unwritten fragment (fragmentBegin < 0). There's
175 * nothing to write; and it doesn't matter what fragmentEnd is.
177 * 2. We found more than MATCH_THRESHOLD consecutive matching bytes.
178 * Dump out the unwritten fragment, stopping at fragmentEnd.
180 * 3. The match extends to loopEnd. We'll do nothing here, exit the
181 * loop, and then dump the unwritten fragment, after merging it with
182 * the invalid end region if any. If we don't so merge, fragmentEnd
183 * establishes how much the final writeFragment call needs to write.
185 * 4. We found an unmatched byte before loopEnd. The loop will repeat
186 * and will enter the unmatched-byte stanza above. So in this case
187 * also, it doesn't matter what fragmentEnd is. The matched bytes
188 * will get merged into the continuing unmatched fragment.
190 * Only in case 3 do we reach the bottom of the loop with a meaningful
191 * fragmentEnd value, which is why it's OK that we unconditionally
192 * assign "fragmentEnd = i" above.
194 if (fragmentBegin
>= 0 && i
- fragmentEnd
> MATCH_THRESHOLD
)
196 writeFragment(pageData
, fragmentBegin
,
197 fragmentEnd
- fragmentBegin
,
198 targetpage
+ fragmentBegin
);
200 fragmentEnd
= -1; /* not really necessary */
204 /* Deal with any invalid end region by including it in final fragment */
205 if (loopEnd
< targetEnd
)
207 if (fragmentBegin
< 0)
208 fragmentBegin
= loopEnd
;
209 fragmentEnd
= targetEnd
;
212 /* Write final fragment if any */
213 if (fragmentBegin
>= 0)
216 fragmentEnd
= targetEnd
;
217 writeFragment(pageData
, fragmentBegin
,
218 fragmentEnd
- fragmentBegin
,
219 targetpage
+ fragmentBegin
);
224 * Compute the XLOG delta record needed to transform curpage into targetpage,
225 * and store it in pageData's delta field.
228 computeDelta(GenericXLogPageData
*pageData
, Page curpage
, Page targetpage
)
230 int targetLower
= ((PageHeader
) targetpage
)->pd_lower
,
231 targetUpper
= ((PageHeader
) targetpage
)->pd_upper
,
232 curLower
= ((PageHeader
) curpage
)->pd_lower
,
233 curUpper
= ((PageHeader
) curpage
)->pd_upper
;
235 pageData
->deltaLen
= 0;
237 /* Compute delta records for lower part of page ... */
238 computeRegionDelta(pageData
, curpage
, targetpage
,
241 /* ... and for upper part, ignoring what's between */
242 computeRegionDelta(pageData
, curpage
, targetpage
,
247 * If xlog debug is enabled, then check produced delta. Result of delta
248 * application to curpage should be equivalent to targetpage.
255 memcpy(tmp
.data
, curpage
, BLCKSZ
);
256 applyPageRedo(tmp
.data
, pageData
->delta
, pageData
->deltaLen
);
257 if (memcmp(tmp
.data
, targetpage
, targetLower
) != 0 ||
258 memcmp(tmp
.data
+ targetUpper
, targetpage
+ targetUpper
,
259 BLCKSZ
- targetUpper
) != 0)
260 elog(ERROR
, "result of generic xlog apply does not match");
266 * Start new generic xlog record for modifications to specified relation.
269 GenericXLogStart(Relation relation
)
271 GenericXLogState
*state
;
274 state
= (GenericXLogState
*) palloc_aligned(sizeof(GenericXLogState
),
277 state
->isLogged
= RelationNeedsWAL(relation
);
279 for (i
= 0; i
< MAX_GENERIC_XLOG_PAGES
; i
++)
281 state
->pages
[i
].image
= state
->images
[i
].data
;
282 state
->pages
[i
].buffer
= InvalidBuffer
;
289 * Register new buffer for generic xlog record.
291 * Returns pointer to the page's image in the GenericXLogState, which
292 * is what the caller should modify.
294 * If the buffer is already registered, just return its existing entry.
295 * (It's not very clear what to do with the flags in such a case, but
296 * for now we stay with the original flags.)
299 GenericXLogRegisterBuffer(GenericXLogState
*state
, Buffer buffer
, int flags
)
303 /* Search array for existing entry or first unused slot */
304 for (block_id
= 0; block_id
< MAX_GENERIC_XLOG_PAGES
; block_id
++)
306 GenericXLogPageData
*page
= &state
->pages
[block_id
];
308 if (BufferIsInvalid(page
->buffer
))
310 /* Empty slot, so use it (there cannot be a match later) */
311 page
->buffer
= buffer
;
313 memcpy(page
->image
, BufferGetPage(buffer
), BLCKSZ
);
314 return (Page
) page
->image
;
316 else if (page
->buffer
== buffer
)
319 * Buffer is already registered. Just return the image, which is
322 return (Page
) page
->image
;
326 elog(ERROR
, "maximum number %d of generic xlog buffers is exceeded",
327 MAX_GENERIC_XLOG_PAGES
);
328 /* keep compiler quiet */
333 * Apply changes represented by GenericXLogState to the actual buffers,
334 * and emit a generic xlog record.
337 GenericXLogFinish(GenericXLogState
*state
)
344 /* Logged relation: make xlog record in critical section. */
347 START_CRIT_SECTION();
350 * Compute deltas if necessary, write changes to buffers, mark buffers
351 * dirty, and register changes.
353 for (i
= 0; i
< MAX_GENERIC_XLOG_PAGES
; i
++)
355 GenericXLogPageData
*pageData
= &state
->pages
[i
];
357 PageHeader pageHeader
;
359 if (BufferIsInvalid(pageData
->buffer
))
362 page
= BufferGetPage(pageData
->buffer
);
363 pageHeader
= (PageHeader
) pageData
->image
;
366 * Compute delta while we still have both the unmodified page and
367 * the new image. Not needed if we are logging the full image.
369 if (!(pageData
->flags
& GENERIC_XLOG_FULL_IMAGE
))
370 computeDelta(pageData
, page
, (Page
) pageData
->image
);
373 * Apply the image, being careful to zero the "hole" between
374 * pd_lower and pd_upper in order to avoid divergence between
375 * actual page state and what replay would produce.
377 memcpy(page
, pageData
->image
, pageHeader
->pd_lower
);
378 memset(page
+ pageHeader
->pd_lower
, 0,
379 pageHeader
->pd_upper
- pageHeader
->pd_lower
);
380 memcpy(page
+ pageHeader
->pd_upper
,
381 pageData
->image
+ pageHeader
->pd_upper
,
382 BLCKSZ
- pageHeader
->pd_upper
);
384 MarkBufferDirty(pageData
->buffer
);
386 if (pageData
->flags
& GENERIC_XLOG_FULL_IMAGE
)
388 XLogRegisterBuffer(i
, pageData
->buffer
,
389 REGBUF_FORCE_IMAGE
| REGBUF_STANDARD
);
393 XLogRegisterBuffer(i
, pageData
->buffer
, REGBUF_STANDARD
);
394 XLogRegisterBufData(i
, pageData
->delta
, pageData
->deltaLen
);
398 /* Insert xlog record */
399 lsn
= XLogInsert(RM_GENERIC_ID
, 0);
402 for (i
= 0; i
< MAX_GENERIC_XLOG_PAGES
; i
++)
404 GenericXLogPageData
*pageData
= &state
->pages
[i
];
406 if (BufferIsInvalid(pageData
->buffer
))
408 PageSetLSN(BufferGetPage(pageData
->buffer
), lsn
);
414 /* Unlogged relation: skip xlog-related stuff */
415 START_CRIT_SECTION();
416 for (i
= 0; i
< MAX_GENERIC_XLOG_PAGES
; i
++)
418 GenericXLogPageData
*pageData
= &state
->pages
[i
];
420 if (BufferIsInvalid(pageData
->buffer
))
422 memcpy(BufferGetPage(pageData
->buffer
),
425 /* We don't worry about zeroing the "hole" in this case */
426 MarkBufferDirty(pageData
->buffer
);
429 /* We don't have a LSN to return, in this case */
430 lsn
= InvalidXLogRecPtr
;
439 * Abort generic xlog record construction. No changes are applied to buffers.
441 * Note: caller is responsible for releasing locks/pins on buffers, if needed.
444 GenericXLogAbort(GenericXLogState
*state
)
450 * Apply delta to given page image.
453 applyPageRedo(Page page
, const char *delta
, Size deltaSize
)
455 const char *ptr
= delta
;
456 const char *end
= delta
+ deltaSize
;
463 memcpy(&offset
, ptr
, sizeof(offset
));
464 ptr
+= sizeof(offset
);
465 memcpy(&length
, ptr
, sizeof(length
));
466 ptr
+= sizeof(length
);
468 memcpy(page
+ offset
, ptr
, length
);
475 * Redo function for generic xlog record.
478 generic_redo(XLogReaderState
*record
)
480 XLogRecPtr lsn
= record
->EndRecPtr
;
481 Buffer buffers
[MAX_GENERIC_XLOG_PAGES
];
484 /* Protect limited size of buffers[] array */
485 Assert(XLogRecMaxBlockId(record
) < MAX_GENERIC_XLOG_PAGES
);
487 /* Iterate over blocks */
488 for (block_id
= 0; block_id
<= XLogRecMaxBlockId(record
); block_id
++)
490 XLogRedoAction action
;
492 if (!XLogRecHasBlockRef(record
, block_id
))
494 buffers
[block_id
] = InvalidBuffer
;
498 action
= XLogReadBufferForRedo(record
, block_id
, &buffers
[block_id
]);
500 /* Apply redo to given block if needed */
501 if (action
== BLK_NEEDS_REDO
)
504 PageHeader pageHeader
;
508 page
= BufferGetPage(buffers
[block_id
]);
509 blockDelta
= XLogRecGetBlockData(record
, block_id
, &blockDeltaSize
);
510 applyPageRedo(page
, blockDelta
, blockDeltaSize
);
513 * Since the delta contains no information about what's in the
514 * "hole" between pd_lower and pd_upper, set that to zero to
515 * ensure we produce the same page state that application of the
516 * logged action by GenericXLogFinish did.
518 pageHeader
= (PageHeader
) page
;
519 memset(page
+ pageHeader
->pd_lower
, 0,
520 pageHeader
->pd_upper
- pageHeader
->pd_lower
);
522 PageSetLSN(page
, lsn
);
523 MarkBufferDirty(buffers
[block_id
]);
527 /* Changes are done: unlock and release all buffers */
528 for (block_id
= 0; block_id
<= XLogRecMaxBlockId(record
); block_id
++)
530 if (BufferIsValid(buffers
[block_id
]))
531 UnlockReleaseBuffer(buffers
[block_id
]);
536 * Mask a generic page before performing consistency checks on it.
539 generic_mask(char *page
, BlockNumber blkno
)
541 mask_page_lsn_and_checksum(page
);
543 mask_unused_space(page
);