/*-------------------------------------------------------------------------
 *
 * hashovfl.c
 *    Overflow page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hashovfl.c
 *
 * NOTES
 *    Overflow pages look like ordinary relation pages.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"


static uint32 _hash_firstfreebit(uint32 map);
/*
 * Convert overflow page bit number (its index in the free-page bitmaps)
 * to block number within the index.
 */
static BlockNumber
bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
{
    uint32      splitnum = metap->hashm_ovflpoint;
    uint32      i;

    /* Convert zero-based bitnumber to 1-based page number */
    ovflbitnum += 1;

    /* Determine the split number for this page (must be >= 1) */
    for (i = 1;
         i < splitnum && ovflbitnum > metap->hashm_spares[i];
         i++)
         /* loop */ ;

    /*
     * Convert to absolute page number by adding the number of bucket pages
     * that exist before this split point.
     */
    return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
}
/*
 * _hash_ovflblkno_to_bitno
 *
 * Convert overflow page block number to bit number for free-page bitmap.
 */
uint32
_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
{
    uint32      splitnum = metap->hashm_ovflpoint;
    uint32      i;
    uint32      bitnum;

    /* Determine the split number containing this page */
    for (i = 1; i <= splitnum; i++)
    {
        if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
            break;
        bitnum = ovflblkno - _hash_get_totalbuckets(i);

        /*
         * bitnum has to be greater than the number of overflow pages added
         * in previous split points.  The overflow pages at this splitnum
         * (i), if any, start from (_hash_get_totalbuckets(i) +
         * metap->hashm_spares[i - 1] + 1).
         */
        if (bitnum > metap->hashm_spares[i - 1] &&
            bitnum <= metap->hashm_spares[i])
            return bitnum - 1;  /* -1 to convert 1-based to 0-based */
    }

    ereport(ERROR,
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
             errmsg("invalid overflow block number %u", ovflblkno)));
    return 0;                   /* keep compiler quiet */
}
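/*
 * A sanity-check sketch (not used anywhere in this file): for a valid
 * overflow page the two conversions above should round-trip, e.g.
 *
 *      Assert(bitno_to_blkno(metap, _hash_ovflblkno_to_bitno(metap, blkno)) == blkno);
 *
 * should hold whenever 'blkno' really is an overflow page of the index.
 */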
/*
 *  _hash_addovflpage
 *
 *  Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *  On entry, the caller must hold a pin but no lock on 'buf'.  The pin is
 *  dropped before exiting (we assume the caller is not interested in 'buf'
 *  anymore) if not asked to retain.  The pin will be retained only for the
 *  primary bucket.  The returned overflow page will be pinned and
 *  write-locked; it is guaranteed to be empty.
 *
 *  The caller must hold a pin, but no lock, on the metapage buffer.
 *  That buffer is returned in the same state.
 *
 *  NB: since this could be executed concurrently by multiple processes,
 *  one should not assume that the returned overflow page will be the
 *  immediate successor of the originally passed 'buf'.  Additional overflow
 *  pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
{
    Buffer      ovflbuf;
    Page        page;
    Page        ovflpage;
    HashPageOpaque pageopaque;
    HashPageOpaque ovflopaque;
    HashMetaPage metap;
    Buffer      mapbuf = InvalidBuffer;
    Buffer      newmapbuf = InvalidBuffer;
    BlockNumber blkno;
    uint32      orig_firstfree;
    uint32      splitnum;
    uint32     *freep = NULL;
    uint32      max_ovflpg;
    uint32      bit;
    uint32      bitmap_page_bit;
    uint32      first_page;
    uint32      last_bit;
    uint32      last_page;
    uint32      i,
                j;
    bool        page_found = false;

    /*
     * Write-lock the tail page.  Here we need to maintain the following
     * locking order: first acquire the lock on the tail page of the bucket,
     * then on the meta page to find and lock the bitmap page; if one is
     * found, the lock on the meta page is released, and finally the lock on
     * the new overflow buffer is acquired.  We need this locking order to
     * avoid deadlock with backends that are doing inserts.
     *
     * Note: We could have avoided locking many buffers here if we made two
     * WAL records for acquiring an overflow page (one to allocate an
     * overflow page and another to add it to the overflow bucket chain).
     * However, doing so can leak an overflow page if the system crashes
     * after allocation.  Needless to say, it is better to have a single
     * record from a performance point of view as well.
     */
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    /* probably redundant... */
    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

    /* loop to find current tail page, in case someone else inserted too */
    for (;;)
    {
        BlockNumber nextblkno;

        page = BufferGetPage(buf);
        pageopaque = HashPageGetOpaque(page);
        nextblkno = pageopaque->hasho_nextblkno;

        if (!BlockNumberIsValid(nextblkno))
            break;

        /* we assume we do not need to write the unmodified page */
        if (retain_pin)
        {
            /* pin will be retained only for the primary bucket page */
            Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        }
        else
            _hash_relbuf(rel, buf);

        retain_pin = false;

        buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
    }

    /* Get exclusive lock on the meta page */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* start search at hashm_firstfree */
    orig_firstfree = metap->hashm_firstfree;
    first_page = orig_firstfree >> BMPG_SHIFT(metap);
    bit = orig_firstfree & BMPG_MASK(metap);
    i = first_page;
    j = bit / BITS_PER_MAP;
    bit &= ~(BITS_PER_MAP - 1);
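    /*
     * Worked example with hypothetical numbers (the real values depend on
     * the block size): if BMPGSZ_BIT(metap) were 1024 and orig_firstfree
     * were 1500, then first_page = 1500 >> 10 = 1, bit = 1500 & 1023 = 476,
     * j = 476 / 32 = 14 (BITS_PER_MAP bits per uint32 word), and bit is
     * rounded down to the word boundary 448 before the scan below.
     */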
    /* outer loop iterates once per bitmap page */
    for (;;)
    {
        BlockNumber mapblkno;
        Page        mappage;
        uint32      last_inpage;

        /* want to end search with the last existing overflow page */
        splitnum = metap->hashm_ovflpoint;
        max_ovflpg = metap->hashm_spares[splitnum] - 1;
        last_page = max_ovflpg >> BMPG_SHIFT(metap);
        last_bit = max_ovflpg & BMPG_MASK(metap);

        if (i > last_page)
            break;

        Assert(i < metap->hashm_nmaps);
        mapblkno = metap->hashm_mapp[i];

        if (i == last_page)
            last_inpage = last_bit;
        else
            last_inpage = BMPGSZ_BIT(metap) - 1;

        /* Release exclusive lock on metapage while reading bitmap page */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

        mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
        mappage = BufferGetPage(mapbuf);
        freep = HashPageGetBitmap(mappage);

        for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
        {
            if (freep[j] != ALL_SET)
            {
                page_found = true;

                /* Reacquire exclusive lock on the meta page */
                LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

                /* convert bit to bit number within page */
                bit += _hash_firstfreebit(freep[j]);
                bitmap_page_bit = bit;

                /* convert bit to absolute bit number */
                bit += (i << BMPG_SHIFT(metap));
                /* Calculate address of the recycled overflow page */
                blkno = bitno_to_blkno(metap, bit);

                /* Fetch and init the recycled page */
                ovflbuf = _hash_getinitbuf(rel, blkno);

                goto found;
            }
        }

        /* No free space here, try to advance to next map page */
        _hash_relbuf(rel, mapbuf);
        mapbuf = InvalidBuffer;
        i++;
        j = 0;                  /* scan from start of next map page */
        bit = 0;

        /* Reacquire exclusive lock on the meta page */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * No free pages --- have to extend the relation to add an overflow page.
     * First, check to see if we have to add a new bitmap page too.
     */
    if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
    {
        /*
         * We create the new bitmap page with all pages marked "in use".
         * Actually two pages in the new bitmap's range will exist
         * immediately: the bitmap page itself, and the following page which
         * is the one we return to the caller.  Both of these are correctly
         * marked "in use".  Subsequent pages do not exist yet, but it is
         * convenient to pre-mark them as "in use" too.
         */
        bit = metap->hashm_spares[splitnum];

        /* metapage already has a write lock */
        if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("out of overflow pages in hash index \"%s\"",
                            RelationGetRelationName(rel))));

        newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
    }
    else
    {
        /*
         * Nothing to do here; since the page will be past the last used page,
         * we know its bitmap bit was preinitialized to "in use".
         */
    }

    /* Calculate address of the new overflow page */
    bit = BufferIsValid(newmapbuf) ?
        metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
     * relation length stays in sync with ours.  XXX It's annoying to do this
     * with metapage write lock held; would be better to use a lock that
     * doesn't block incoming searches.
     *
     * It is okay to hold two buffer locks here (one on tail page of bucket
     * and other on new overflow page) since there cannot be anyone else
     * contending for access to ovflbuf.
     */
    ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

found:

    /*
     * Do the update.  No ereport(ERROR) until changes are logged.  We want to
     * log the changes for bitmap page and overflow page together to avoid
     * loss of pages in case the new page is added.
     */
    START_CRIT_SECTION();

    if (page_found)
    {
        Assert(BufferIsValid(mapbuf));

        /* mark page "in use" in the bitmap */
        SETBIT(freep, bitmap_page_bit);
        MarkBufferDirty(mapbuf);
    }
    else
    {
        /* update the count to indicate new overflow page is added */
        metap->hashm_spares[splitnum]++;

        if (BufferIsValid(newmapbuf))
        {
            _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
            MarkBufferDirty(newmapbuf);

            /* add the new bitmap page to the metapage's list of bitmaps */
            metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
            metap->hashm_nmaps++;
            metap->hashm_spares[splitnum]++;
        }

        MarkBufferDirty(metabuf);

        /*
         * for new overflow page, we don't need to explicitly set the bit in
         * bitmap page, as by default that will be set to "in use".
         */
    }

    /*
     * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
    {
        metap->hashm_firstfree = bit + 1;
        MarkBufferDirty(metabuf);
    }

    /* initialize new overflow page */
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = HashPageGetOpaque(ovflpage);
    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    /* logically chain overflow page to previous page */
    pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_hash_add_ovfl_page xlrec;

        xlrec.bmpage_found = page_found;
        xlrec.bmsize = metap->hashm_bmsize;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);

        XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
        XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));

        XLogRegisterBuffer(1, buf, REGBUF_STANDARD);

        if (BufferIsValid(mapbuf))
        {
            XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
            XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
        }

        if (BufferIsValid(newmapbuf))
            XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);

        XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
        XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);

        PageSetLSN(BufferGetPage(ovflbuf), recptr);
        PageSetLSN(BufferGetPage(buf), recptr);

        if (BufferIsValid(mapbuf))
            PageSetLSN(BufferGetPage(mapbuf), recptr);

        if (BufferIsValid(newmapbuf))
            PageSetLSN(BufferGetPage(newmapbuf), recptr);

        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    if (retain_pin)
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    else
        _hash_relbuf(rel, buf);

    if (BufferIsValid(mapbuf))
        _hash_relbuf(rel, mapbuf);

    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    if (BufferIsValid(newmapbuf))
        _hash_relbuf(rel, newmapbuf);

    return ovflbuf;
}
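/*
 * Usage sketch (illustrative only, not copied from the actual callers such
 * as the insert path in hashinsert.c): holding a pin on the metapage buffer
 * and a pin on the bucket's current last page, a caller does roughly
 *
 *      newbuf = _hash_addovflpage(rel, metabuf, buf, retain_pin);
 *      ... add the tuple to the page of 'newbuf' ...
 *      _hash_relbuf(rel, newbuf);
 *
 * On return 'newbuf' is pinned and write-locked, while the pin on 'buf' has
 * been dropped unless retain_pin was set for the primary bucket page.
 */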
/*
 *  _hash_firstfreebit()
 *
 *  Return the number of the first bit that is not set in the word 'map'.
 */
static uint32
_hash_firstfreebit(uint32 map)
{
    uint32      i,
                mask;

    mask = 0x1;
    for (i = 0; i < BITS_PER_MAP; i++)
    {
        if (!(mask & map))
            return i;
        mask <<= 1;
    }

    elog(ERROR, "firstfreebit found no free bit");

    return 0;                   /* keep compiler quiet */
}
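/*
 * For illustration: _hash_firstfreebit(0x0000000B) returns 2, since bits 0,
 * 1, and 3 are set in the map word and bit 2 is the first clear one.
 */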
/*
 *  _hash_freeovflpage() -
 *
 *  Remove this overflow page from its bucket's chain, and mark the page as
 *  free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *  Add the tuples (itups) to wbuf in this function.  We could do that in the
 *  caller as well, but the advantage of doing it here is we can easily write
 *  the WAL for the XLOG_HASH_SQUEEZE_PAGE operation.  The addition of tuples
 *  and the removal of the overflow page have to be done as an atomic
 *  operation, otherwise during replay on standby users might find duplicate
 *  records.
 *
 *  Since this function is invoked in VACUUM, we provide an access strategy
 *  parameter that controls fetches of the bucket pages.
 *
 *  Returns the block number of the page that followed the given page
 *  in the bucket, or InvalidBlockNumber if no following page.
 *
 *  NB: caller must not hold lock on metapage, nor on page, that's next to
 *  ovflbuf in the bucket chain.  We don't acquire the lock on the page that's
 *  prior to ovflbuf in the chain if it is the same as wbuf because the caller
 *  already has a lock on it.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
                   Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
                   Size *tups_size, uint16 nitups,
                   BufferAccessStrategy bstrategy)
{
    HashMetaPage metap;
    Buffer      metabuf;
    Buffer      mapbuf;
    BlockNumber ovflblkno;
    BlockNumber prevblkno;
    BlockNumber blkno;
    BlockNumber nextblkno;
    BlockNumber writeblkno;
    HashPageOpaque ovflopaque;
    Page        ovflpage;
    Page        mappage;
    uint32     *freep;
    uint32      ovflbitno;
    int32       bitmappage,
                bitmapbit;
    Bucket      bucket PG_USED_FOR_ASSERTS_ONLY;
    Buffer      prevbuf = InvalidBuffer;
    Buffer      nextbuf = InvalidBuffer;
    bool        update_metap = false;

    /* Get information from the doomed page */
    _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
    ovflblkno = BufferGetBlockNumber(ovflbuf);
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = HashPageGetOpaque(ovflpage);
    nextblkno = ovflopaque->hasho_nextblkno;
    prevblkno = ovflopaque->hasho_prevblkno;
    writeblkno = BufferGetBlockNumber(wbuf);
    bucket = ovflopaque->hasho_bucket;

    /*
     * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
     * up the bucket chain members behind and ahead of the overflow page
     * being deleted.  Concurrency issues are avoided by using lock chaining
     * as described atop hashbucketcleanup.
     */
    if (BlockNumberIsValid(prevblkno))
    {
        if (prevblkno == writeblkno)
            prevbuf = wbuf;
        else
            prevbuf = _hash_getbuf_with_strategy(rel,
                                                 prevblkno,
                                                 HASH_WRITE,
                                                 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                                 bstrategy);
    }
    if (BlockNumberIsValid(nextblkno))
        nextbuf = _hash_getbuf_with_strategy(rel,
                                             nextblkno,
                                             HASH_WRITE,
                                             LH_OVERFLOW_PAGE,
                                             bstrategy);

    /* Note: bstrategy is intentionally not used for metapage and bitmap */

    /* Read the metapage so we can determine which bitmap page to use */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* Identify which bit to set */
    ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);

    bitmappage = ovflbitno >> BMPG_SHIFT(metap);
    bitmapbit = ovflbitno & BMPG_MASK(metap);

    if (bitmappage >= metap->hashm_nmaps)
        elog(ERROR, "invalid overflow bit number %u", ovflbitno);
    blkno = metap->hashm_mapp[bitmappage];

    /* Release metapage lock while we access the bitmap page */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /* read the bitmap page to clear the bitmap bit */
    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
    mappage = BufferGetPage(mapbuf);
    freep = HashPageGetBitmap(mappage);
    Assert(ISSET(freep, bitmapbit));

    /* Get write-lock on metapage to update firstfree */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* This operation needs to log multiple tuples, prepare WAL for that */
    if (RelationNeedsWAL(rel))
        XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);

    START_CRIT_SECTION();

    /*
     * we have to insert tuples on the "write" page, being careful to
     * preserve hashkey ordering.  (If we insert many tuples into the same
     * "write" page it would be worth qsort'ing them).
     */
    if (nitups > 0)
    {
        _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
        MarkBufferDirty(wbuf);
    }

    /*
     * Reinitialize the freed overflow page.  Just zeroing the page won't
     * work, because WAL replay routines expect pages to be initialized.  See
     * explanation of RBM_NORMAL mode atop XLogReadBufferExtended.  We are
     * careful to make the special space valid here so that tools like
     * pageinspect won't get confused.
     */
    _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

    ovflopaque = HashPageGetOpaque(ovflpage);

    ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = InvalidBucket;
    ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    if (BufferIsValid(prevbuf))
    {
        Page        prevpage = BufferGetPage(prevbuf);
        HashPageOpaque prevopaque = HashPageGetOpaque(prevpage);

        Assert(prevopaque->hasho_bucket == bucket);
        prevopaque->hasho_nextblkno = nextblkno;
        MarkBufferDirty(prevbuf);
    }
    if (BufferIsValid(nextbuf))
    {
        Page        nextpage = BufferGetPage(nextbuf);
        HashPageOpaque nextopaque = HashPageGetOpaque(nextpage);

        Assert(nextopaque->hasho_bucket == bucket);
        nextopaque->hasho_prevblkno = prevblkno;
        MarkBufferDirty(nextbuf);
    }

    /* Clear the bitmap bit to indicate that this overflow page is free */
    CLRBIT(freep, bitmapbit);
    MarkBufferDirty(mapbuf);

    /* if this is now the first free page, update hashm_firstfree */
    if (ovflbitno < metap->hashm_firstfree)
    {
        metap->hashm_firstfree = ovflbitno;
        update_metap = true;
        MarkBufferDirty(metabuf);
    }

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_squeeze_page xlrec;
        XLogRecPtr  recptr;
        int         i;
        bool        mod_wbuf = false;

        xlrec.prevblkno = prevblkno;
        xlrec.nextblkno = nextblkno;
        xlrec.ntups = nitups;
        xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
        xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);

        /*
         * bucket buffer was not changed, but still needs to be registered to
         * ensure that we can acquire a cleanup lock on it during replay.
         */
        if (!xlrec.is_prim_bucket_same_wrt)
        {
            uint8       flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;

            XLogRegisterBuffer(0, bucketbuf, flags);
        }

        if (xlrec.ntups > 0)
        {
            XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);

            /* Remember that wbuf is modified. */
            mod_wbuf = true;

            XLogRegisterBufData(1, (char *) itup_offsets,
                                nitups * sizeof(OffsetNumber));
            for (i = 0; i < nitups; i++)
                XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
        }
        else if (xlrec.is_prim_bucket_same_wrt || xlrec.is_prev_bucket_same_wrt)
        {
            uint8       wbuf_flags;

            /*
             * A write buffer needs to be registered even if no tuples are
             * added to it, to ensure that we can acquire a cleanup lock on
             * it if it is the same as the primary bucket buffer, or update
             * the nextblkno if it is the same as the previous bucket buffer.
             */
            Assert(xlrec.ntups == 0);

            wbuf_flags = REGBUF_STANDARD;
            if (!xlrec.is_prev_bucket_same_wrt)
            {
                wbuf_flags |= REGBUF_NO_CHANGE;
            }
            else
            {
                /* Remember that wbuf is modified. */
                mod_wbuf = true;
            }
            XLogRegisterBuffer(1, wbuf, wbuf_flags);
        }

        XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);

        /*
         * If prevpage and the writepage (block in which we are moving tuples
         * from overflow) are same, then no need to separately register
         * prevpage.  During replay, we can directly update the nextblock in
         * writepage.
         */
        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);

        if (BufferIsValid(nextbuf))
            XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);

        XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
        XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));

        if (update_metap)
        {
            XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
            XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
        }

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);

        /* Set LSN iff wbuf is modified. */
        if (mod_wbuf)
            PageSetLSN(BufferGetPage(wbuf), recptr);

        PageSetLSN(BufferGetPage(ovflbuf), recptr);

        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            PageSetLSN(BufferGetPage(prevbuf), recptr);
        if (BufferIsValid(nextbuf))
            PageSetLSN(BufferGetPage(nextbuf), recptr);

        PageSetLSN(BufferGetPage(mapbuf), recptr);

        if (update_metap)
            PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* release previous bucket if it is not same as write bucket */
    if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
        _hash_relbuf(rel, prevbuf);

    if (BufferIsValid(ovflbuf))
        _hash_relbuf(rel, ovflbuf);

    if (BufferIsValid(nextbuf))
        _hash_relbuf(rel, nextbuf);

    _hash_relbuf(rel, mapbuf);
    _hash_relbuf(rel, metabuf);

    return nextblkno;
}
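/*
 * Call sketch (for orientation; see _hash_squeezebucket below for the real
 * call): after accumulating the tuples of the page to be freed, the caller
 * does roughly
 *
 *      _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
 *                         tups_size, nitups, bstrategy);
 *
 * which releases rbuf and unlinks it from the bucket chain.
 */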
/*
 *  _hash_initbitmapbuffer()
 *
 *  Initialize a new bitmap page.  All bits in the new bitmap page are set to
 *  "1", indicating "in use".
 */
void
_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
{
    Page        pg;
    HashPageOpaque op;
    uint32     *freep;

    pg = BufferGetPage(buf);

    /* initialize the page */
    if (initpage)
        _hash_pageinit(pg, BufferGetPageSize(buf));

    /* initialize the page's special space */
    op = HashPageGetOpaque(pg);
    op->hasho_prevblkno = InvalidBlockNumber;
    op->hasho_nextblkno = InvalidBlockNumber;
    op->hasho_bucket = InvalidBucket;
    op->hasho_flag = LH_BITMAP_PAGE;
    op->hasho_page_id = HASHO_PAGE_ID;

    /* set all of the bits to 1 */
    freep = HashPageGetBitmap(pg);
    memset(freep, 0xFF, bmsize);

    /*
     * Set pd_lower just past the end of the bitmap page data.  We could even
     * set pd_lower equal to pd_upper, but this is more precise and makes the
     * page look compressible to xlog.c.
     */
    ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
}
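/*
 * The bitmap words set up here are later consulted and updated with the
 * SETBIT/CLRBIT/ISSET macros, e.g. SETBIT(freep, bitmap_page_bit) when an
 * overflow page is recycled in _hash_addovflpage and CLRBIT(freep, bitmapbit)
 * when one is released in _hash_freeovflpage.
 */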
/*
 *  _hash_squeezebucket(rel, bucket)
 *
 *  Try to squeeze the tuples onto pages occurring earlier in the
 *  bucket chain in an attempt to free overflow pages.  When we start
 *  the "squeezing", the page from which we start taking tuples (the
 *  "read" page) is the last bucket in the bucket chain and the page
 *  onto which we start squeezing tuples (the "write" page) is the
 *  first page in the bucket chain.  The read page works backward and
 *  the write page works forward; the procedure terminates when the
 *  read page and write page are the same page.
 *
 *  At completion of this procedure, it is guaranteed that all pages in
 *  the bucket are nonempty, unless the bucket is totally empty (in
 *  which case all overflow pages will be freed).  The original implementation
 *  required that to be true on entry as well, but it's a lot easier for
 *  callers to leave empty overflow pages and let this guy clean it up.
 *
 *  Caller must acquire cleanup lock on the primary page of the target
 *  bucket to exclude any scans that are in progress, which could easily
 *  be confused into returning the same tuple more than once or some tuples
 *  not at all by the rearrangement we are performing here.  To prevent
 *  any concurrent scan from crossing the squeeze scan we use lock chaining
 *  similar to hashbucketcleanup.  Refer to the comments atop hashbucketcleanup.
 *
 *  We need to retain a pin on the primary bucket to ensure that no concurrent
 *  split can start.
 *
 *  Since this function is invoked in VACUUM, we provide an access strategy
 *  parameter that controls fetches of the bucket pages.
 */
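/*
 * Illustration of the procedure described above:
 *
 *      [primary page] -> [ovfl 1] -> [ovfl 2] -> ... -> [last ovfl page]
 *       "write" page moves forward -->    <-- "read" page moves backward
 *
 * The loop terminates when the read page and the write page are the same.
 */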
void
_hash_squeezebucket(Relation rel,
                    Bucket bucket,
                    BlockNumber bucket_blkno,
                    Buffer bucket_buf,
                    BufferAccessStrategy bstrategy)
{
    BlockNumber wblkno;
    BlockNumber rblkno;
    Buffer      wbuf;
    Buffer      rbuf;
    Page        wpage;
    Page        rpage;
    HashPageOpaque wopaque;
    HashPageOpaque ropaque;

    /*
     * start squeezing into the primary bucket page.
     */
    wblkno = bucket_blkno;
    wbuf = bucket_buf;
    wpage = BufferGetPage(wbuf);
    wopaque = HashPageGetOpaque(wpage);

    /*
     * if there aren't any overflow pages, there's nothing to squeeze.  caller
     * is responsible for releasing the pin on primary bucket page.
     */
    if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
    {
        LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
        return;
    }

    /*
     * Find the last page in the bucket chain by starting at the base bucket
     * page and working forward.  Note: we assume that a hash bucket chain is
     * usually smaller than the buffer ring being used by VACUUM, else using
     * the access strategy here would be counterproductive.
     */
    rbuf = InvalidBuffer;
    ropaque = wopaque;
    do
    {
        rblkno = ropaque->hasho_nextblkno;
        if (rbuf != InvalidBuffer)
            _hash_relbuf(rel, rbuf);
        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = HashPageGetOpaque(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));

    /*
     * squeeze the tuples.
     */
    for (;;)
    {
        OffsetNumber roffnum;
        OffsetNumber maxroffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        IndexTuple  itups[MaxIndexTuplesPerPage];
        Size        tups_size[MaxIndexTuplesPerPage];
        OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
        uint16      ndeletable = 0;
        uint16      nitups = 0;
        Size        all_tups_size = 0;
        int         i;
        bool        retain_pin = false;

readpage:
        /* Scan each tuple in "read" page */
        maxroffnum = PageGetMaxOffsetNumber(rpage);
        for (roffnum = FirstOffsetNumber;
             roffnum <= maxroffnum;
             roffnum = OffsetNumberNext(roffnum))
        {
            IndexTuple  itup;
            Size        itemsz;

            /* skip dead tuples */
            if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
                continue;

            itup = (IndexTuple) PageGetItem(rpage,
                                            PageGetItemId(rpage, roffnum));
            itemsz = IndexTupleSize(itup);
            itemsz = MAXALIGN(itemsz);

            /*
             * Walk up the bucket chain, looking for a page big enough for
             * this item and all other accumulated items.  Exit if we reach
             * the read page.
             */
            while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
            {
                Buffer      next_wbuf = InvalidBuffer;
                bool        tups_moved = false;

                Assert(!PageIsEmpty(wpage));

                if (wblkno == bucket_blkno)
                    retain_pin = true;

                wblkno = wopaque->hasho_nextblkno;
                Assert(BlockNumberIsValid(wblkno));

                /* don't need to move to next page if we reached the read page */
                if (wblkno != rblkno)
                    next_wbuf = _hash_getbuf_with_strategy(rel,
                                                           wblkno,
                                                           HASH_WRITE,
                                                           LH_OVERFLOW_PAGE,
                                                           bstrategy);

                if (nitups > 0)
                {
                    Assert(nitups == ndeletable);

                    /*
                     * This operation needs to log multiple tuples, prepare
                     * WAL for that.
                     */
                    if (RelationNeedsWAL(rel))
                        XLogEnsureRecordSpace(0, 3 + nitups);

                    START_CRIT_SECTION();

                    /*
                     * we have to insert tuples on the "write" page, being
                     * careful to preserve hashkey ordering.  (If we insert
                     * many tuples into the same "write" page it would be
                     * worth qsort'ing them).
                     */
                    _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
                    MarkBufferDirty(wbuf);

                    /* Delete tuples we already moved off read page */
                    PageIndexMultiDelete(rpage, deletable, ndeletable);
                    MarkBufferDirty(rbuf);

                    /* XLOG stuff */
                    if (RelationNeedsWAL(rel))
                    {
                        XLogRecPtr  recptr;
                        xl_hash_move_page_contents xlrec;

                        xlrec.ntups = nitups;
                        xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf);

                        XLogBeginInsert();
                        XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);

                        /*
                         * bucket buffer was not changed, but still needs to
                         * be registered to ensure that we can acquire a
                         * cleanup lock on it during replay.
                         */
                        if (!xlrec.is_prim_bucket_same_wrt)
                        {
                            int         flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;

                            XLogRegisterBuffer(0, bucket_buf, flags);
                        }

                        XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
                        XLogRegisterBufData(1, (char *) itup_offsets,
                                            nitups * sizeof(OffsetNumber));
                        for (i = 0; i < nitups; i++)
                            XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);

                        XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
                        XLogRegisterBufData(2, (char *) deletable,
                                            ndeletable * sizeof(OffsetNumber));

                        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);

                        PageSetLSN(BufferGetPage(wbuf), recptr);
                        PageSetLSN(BufferGetPage(rbuf), recptr);
                    }

                    END_CRIT_SECTION();

                    tups_moved = true;
                }

                /*
                 * release the lock on previous page after acquiring the lock
                 * on next page
                 */
                if (retain_pin)
                    LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
                else
                    _hash_relbuf(rel, wbuf);

                /* nothing more to do if we reached the read page */
                if (rblkno == wblkno)
                {
                    _hash_relbuf(rel, rbuf);
                    return;
                }

                wbuf = next_wbuf;
                wpage = BufferGetPage(wbuf);
                wopaque = HashPageGetOpaque(wpage);
                Assert(wopaque->hasho_bucket == bucket);
                retain_pin = false;

                /* be tidy */
                for (i = 0; i < nitups; i++)
                    pfree(itups[i]);
                nitups = 0;
                all_tups_size = 0;
                ndeletable = 0;

                /*
                 * after moving the tuples, rpage would have been compacted,
                 * so we need to rescan it.
                 */
                if (tups_moved)
                    goto readpage;
            }

            /* remember tuple for deletion from "read" page */
            deletable[ndeletable++] = roffnum;

            /*
             * we need a copy of the index tuples, as they can be freed as
             * part of the overflow page; however, we need them to write a
             * WAL record in _hash_freeovflpage.
             */
            itups[nitups] = CopyIndexTuple(itup);
            tups_size[nitups++] = itemsz;
            all_tups_size += itemsz;
        }

        /*
         * If we reach here, there are no live tuples on the "read" page ---
         * it was empty when we got to it, or we moved them all.  So we can
         * just free the page without bothering with deleting tuples
         * individually.  Then advance to the previous "read" page.
         *
         * Tricky point here: if our read and write pages are adjacent in the
         * bucket chain, our write lock on wbuf will conflict with
         * _hash_freeovflpage's attempt to update the sibling links of the
         * removed page.  In that case, we don't need to lock it again.
         */
        rblkno = ropaque->hasho_prevblkno;
        Assert(BlockNumberIsValid(rblkno));

        /* free this overflow page (releases rbuf) */
        _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
                           tups_size, nitups, bstrategy);

        /* be tidy */
        for (i = 0; i < nitups; i++)
            pfree(itups[i]);

        /* are we freeing the page adjacent to wbuf? */
        if (rblkno == wblkno)
        {
            /* retain the pin on primary bucket page till end of bucket scan */
            if (wblkno == bucket_blkno)
                LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
            else
                _hash_relbuf(rel, wbuf);
            return;
        }

        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = HashPageGetOpaque(rpage);
        Assert(ropaque->hasho_bucket == bucket);