/*-------------------------------------------------------------------------
 *
 * hashovfl.c
 *	  Overflow page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES
 *	  Overflow pages look like ordinary relation pages.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
static uint32 _hash_firstfreebit(uint32 map);

/*
 * Convert overflow page bit number (its index in the free-page bitmaps)
 * to block number within the index.
 */
static BlockNumber
bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
{
	uint32		splitnum = metap->hashm_ovflpoint;
	uint32		i;

	/* Convert zero-based bitnumber to 1-based page number */
	ovflbitnum += 1;

	/* Determine the split number for this page (must be >= 1) */
	for (i = 1;
		 i < splitnum && ovflbitnum > metap->hashm_spares[i];
		 i++)
		/* loop */ ;

	/*
	 * Convert to absolute page number by adding the number of bucket pages
	 * that exist before this split point.
	 */
	return (BlockNumber) ((1 << i) + ovflbitnum);
}
/*
 * Convert overflow page block number to bit number for free-page bitmap.
 */
static uint32
blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
{
	uint32		splitnum = metap->hashm_ovflpoint;
	uint32		i;
	uint32		bitnum;

	/* Determine the split number containing this page */
	for (i = 1; i <= splitnum; i++)
	{
		if (ovflblkno <= (BlockNumber) (1 << i))
			break;				/* oops */
		bitnum = ovflblkno - (1 << i);
		if (bitnum <= metap->hashm_spares[i])
			return bitnum - 1;	/* -1 to convert 1-based to 0-based */
	}

	elog(ERROR, "invalid overflow block number %u", ovflblkno);
	return 0;					/* keep compiler quiet */
}
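
/*
 * Worked example for the two conversions above, using hypothetical metapage
 * values: with hashm_ovflpoint = 3 and hashm_spares[1..3] = {0, 2, 5},
 * overflow bit number 3 becomes 1-based page number 4, which falls in split
 * point 3, so bitno_to_blkno() returns (1 << 3) + 4 = 12.  Conversely,
 * blkno_to_bitno() on block 12 computes 12 - (1 << 3) = 4, sees that
 * 4 <= hashm_spares[3], and returns 4 - 1 = 3, recovering the original bit.
 */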
/*
 *	_hash_addovflpage
 *
 *	Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *	On entry, the caller must hold a pin but no lock on 'buf'.  The pin is
 *	dropped before exiting (we assume the caller is not interested in 'buf'
 *	anymore).  The returned overflow page will be pinned and write-locked;
 *	it is guaranteed to be empty.
 *
 *	The caller must hold a pin, but no lock, on the metapage buffer.
 *	That buffer is returned in the same state.
 *
 *	The caller must hold at least share lock on the bucket, to ensure that
 *	no one else tries to compact the bucket meanwhile.  This guarantees that
 *	'buf' won't stop being part of the bucket while it's unlocked.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.  Additional overflow
 * pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
	Buffer		ovflbuf;
	Page		page;
	Page		ovflpage;
	HashPageOpaque pageopaque;
	HashPageOpaque ovflopaque;

	/* allocate and lock an empty overflow page */
	ovflbuf = _hash_getovflpage(rel, metabuf);

	/*
	 * Write-lock the tail page.  It is okay to hold two buffer locks here
	 * since there cannot be anyone else contending for access to ovflbuf.
	 */
	_hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE);

	/* probably redundant... */
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

	/* loop to find current tail page, in case someone else inserted too */
	for (;;)
	{
		BlockNumber nextblkno;

		page = BufferGetPage(buf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno))
			break;

		/* we assume we do not need to write the unmodified page */
		_hash_relbuf(rel, buf);

		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
	}

	/* now that we have correct backlink, initialize new overflow page */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* logically chain overflow page to previous page */
	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
	_hash_wrtbuf(rel, buf);

	return ovflbuf;
}
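
/*
 * Illustrative caller sketch for _hash_addovflpage(); names and intermediate
 * steps are hypothetical (cf. the real insertion path, _hash_doinsert in
 * hashinsert.c):
 *
 *		-- 'metabuf' and 'buf' (the bucket's tail page) are pinned, unlocked
 *		ovflbuf = _hash_addovflpage(rel, metabuf, buf);
 *		page = BufferGetPage(ovflbuf);	-- empty, pinned, write-locked
 *		-- ... add the new index tuple to 'page' ...
 *		_hash_wrtbuf(rel, ovflbuf);		-- write the page, drop lock and pin
 */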
/*
 *	_hash_getovflpage()
 *
 *	Find an available overflow page and return it.  The returned buffer
 *	is pinned and write-locked, and has had _hash_pageinit() applied,
 *	but it is caller's responsibility to fill the special space.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * That buffer is left in the same state at exit.
 */
static Buffer
_hash_getovflpage(Relation rel, Buffer metabuf)
{
	HashMetaPage metap;
	Buffer		mapbuf = 0;
	Buffer		newbuf;
	BlockNumber blkno;
	uint32		orig_firstfree;
	uint32		splitnum;
	uint32	   *freep = NULL;
	uint32		max_ovflpg;
	uint32		bit;
	uint32		first_page;
	uint32		last_bit;
	uint32		last_page;
	uint32		i,
				j;

	/* Get exclusive lock on the meta page */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	_hash_checkpage(rel, metabuf, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* start search at hashm_firstfree */
	orig_firstfree = metap->hashm_firstfree;
	first_page = orig_firstfree >> BMPG_SHIFT(metap);
	bit = orig_firstfree & BMPG_MASK(metap);
	i = first_page;
	j = bit / BITS_PER_MAP;
	bit &= ~(BITS_PER_MAP - 1);
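
	/*
	 * Worked example (assuming the common 4096-byte bitmap array, so
	 * BMPGSZ_BIT(metap) = 32768, BMPG_SHIFT(metap) = 15, and
	 * BITS_PER_MAP = 32): if hashm_firstfree is 40010, then
	 * i = 40010 >> 15 = 1 (the second bitmap page), bit = 40010 & 32767
	 * = 7242, j = 7242 / 32 = 226 (the map word to test first), and bit is
	 * rounded down to 7232, the first bit covered by that word.
	 */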

	/* outer loop iterates once per bitmap page */
	for (;;)
	{
		BlockNumber mapblkno;
		Page		mappage;
		uint32		last_inpage;

		/* want to end search with the last existing overflow page */
		splitnum = metap->hashm_ovflpoint;
		max_ovflpg = metap->hashm_spares[splitnum] - 1;
		last_page = max_ovflpg >> BMPG_SHIFT(metap);
		last_bit = max_ovflpg & BMPG_MASK(metap);

		if (i > last_page)
			break;

		Assert(i < metap->hashm_nmaps);
		mapblkno = metap->hashm_mapp[i];

		if (i == last_page)
			last_inpage = last_bit;
		else
			last_inpage = BMPGSZ_BIT(metap) - 1;

		/* Release exclusive lock on metapage while reading bitmap page */
		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

		mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
		mappage = BufferGetPage(mapbuf);
		freep = HashPageGetBitmap(mappage);

		for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
		{
			if (freep[j] != ALL_SET)
				goto found;
		}

		/* No free space here, try to advance to next map page */
		_hash_relbuf(rel, mapbuf);
		i++;
		j = 0;					/* scan from start of next map page */
		bit = 0;

		/* Reacquire exclusive lock on the meta page */
		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
	}

	/*
	 * No free pages --- have to extend the relation to add an overflow page.
	 * First, check to see if we have to add a new bitmap page too.
	 */
	if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
	{
		/*
		 * We create the new bitmap page with all pages marked "in use".
		 * Actually two pages in the new bitmap's range will exist
		 * immediately: the bitmap page itself, and the following page which
		 * is the one we return to the caller.  Both of these are correctly
		 * marked "in use".  Subsequent pages do not exist yet, but it is
		 * convenient to pre-mark them as "in use" too.
		 */
		bit = metap->hashm_spares[splitnum];
		_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
		metap->hashm_spares[splitnum]++;
	}
	else
	{
		/*
		 * Nothing to do here; since the page will be past the last used page,
		 * we know its bitmap bit was preinitialized to "in use".
		 */
	}
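
	/*
	 * Example (assuming a 4096-byte bitmap array, i.e. BMPGSZ_BIT(metap) =
	 * 32768): the test above is true only when the last existing overflow
	 * page occupies the final bit (32767) of its bitmap page, so a new
	 * bitmap page is allocated roughly once per 32768 overflow pages
	 * created through this path.
	 */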

	/* Calculate address of the new overflow page */
	bit = metap->hashm_spares[splitnum];
	blkno = bitno_to_blkno(metap, bit);

	/*
	 * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
	 * relation length stays in sync with ours.  XXX It's annoying to do this
	 * with metapage write lock held; would be better to use a lock that
	 * doesn't block incoming searches.
	 */
	newbuf = _hash_getnewbuf(rel, blkno);

	metap->hashm_spares[splitnum]++;

	/*
	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
	 * changing it if someone moved it while we were searching bitmap pages.
	 */
	if (metap->hashm_firstfree == orig_firstfree)
		metap->hashm_firstfree = bit + 1;

	/* Write updated metapage and release lock, but not pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	return newbuf;

found:
	/* convert bit to bit number within page */
	bit += _hash_firstfreebit(freep[j]);

	/* mark page "in use" in the bitmap */
	SETBIT(freep, bit);
	_hash_wrtbuf(rel, mapbuf);

	/* Reacquire exclusive lock on the meta page */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	/* convert bit to absolute bit number */
	bit += (i << BMPG_SHIFT(metap));

	/* Calculate address of the recycled overflow page */
	blkno = bitno_to_blkno(metap, bit);

	/*
	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
	 * changing it if someone moved it while we were searching bitmap pages.
	 */
	if (metap->hashm_firstfree == orig_firstfree)
	{
		metap->hashm_firstfree = bit + 1;

		/* Write updated metapage and release lock, but not pin */
		_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
	}
	else
	{
		/* We didn't change the metapage, so no need to write */
		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
	}

	/* Fetch, init, and return the recycled page */
	return _hash_getinitbuf(rel, blkno);
}
/*
 *	_hash_firstfreebit()
 *
 *	Return the number of the first bit that is not set in the word 'map'.
 */
static uint32
_hash_firstfreebit(uint32 map)
{
	uint32		i,
				mask;

	mask = 0x1;
	for (i = 0; i < BITS_PER_MAP; i++)
	{
		if (!(mask & map))
			return i;
		mask <<= 1;
	}

	elog(ERROR, "firstfreebit found no free bit");
	return 0;					/* keep compiler quiet */
}
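
/*
 * Example for _hash_firstfreebit(): map = 0x0000001F has bits 0-4 set, so
 * the first free bit is 5 and the function returns 5.  A fully set word
 * (ALL_SET) would reach the elog(ERROR) path, but callers only pass words
 * already known to differ from ALL_SET.
 */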
/*
 *	_hash_freeovflpage() -
 *
 *	Remove this overflow page from its bucket's chain, and mark the page as
 *	free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 *
 *	Returns the block number of the page that followed the given page
 *	in the bucket, or InvalidBlockNumber if no following page.
 *
 *	NB: caller must not hold lock on metapage, nor on either page that's
 *	adjacent in the bucket chain.  The caller had better hold exclusive lock
 *	on the bucket, too.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
				   BufferAccessStrategy bstrategy)
{
	HashMetaPage metap;
	Buffer		metabuf;
	Buffer		mapbuf;
	BlockNumber ovflblkno;
	BlockNumber prevblkno;
	BlockNumber blkno;
	BlockNumber nextblkno;
	HashPageOpaque ovflopaque;
	Page		ovflpage;
	Page		mappage;
	uint32	   *freep;
	uint32		ovflbitno;
	int32		bitmappage,
				bitmapbit;
	Bucket		bucket;

	/* Get information from the doomed page */
	_hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
	ovflblkno = BufferGetBlockNumber(ovflbuf);
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	nextblkno = ovflopaque->hasho_nextblkno;
	prevblkno = ovflopaque->hasho_prevblkno;
	bucket = ovflopaque->hasho_bucket;

	/*
	 * Zero the page for debugging's sake; then write and release it. (Note:
	 * if we failed to zero the page here, we'd have problems with the Assert
	 * in _hash_pageinit() when the page is reused.)
	 */
	MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
	_hash_wrtbuf(rel, ovflbuf);

	/*
	 * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
	 * up the bucket chain members behind and ahead of the overflow page being
	 * deleted.  No concurrency issues since we hold exclusive lock on the
	 * entire bucket.
	 */
	if (BlockNumberIsValid(prevblkno))
	{
		Buffer		prevbuf = _hash_getbuf_with_strategy(rel,
														 prevblkno,
														 HASH_WRITE,
										  LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
														 bstrategy);
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		Assert(prevopaque->hasho_bucket == bucket);
		prevopaque->hasho_nextblkno = nextblkno;
		_hash_wrtbuf(rel, prevbuf);
	}
	if (BlockNumberIsValid(nextblkno))
	{
		Buffer		nextbuf = _hash_getbuf_with_strategy(rel,
														 nextblkno,
														 HASH_WRITE,
														 LH_OVERFLOW_PAGE,
														 bstrategy);
		Page		nextpage = BufferGetPage(nextbuf);
		HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

		Assert(nextopaque->hasho_bucket == bucket);
		nextopaque->hasho_prevblkno = prevblkno;
		_hash_wrtbuf(rel, nextbuf);
	}

	/* Note: bstrategy is intentionally not used for metapage and bitmap */

	/* Read the metapage so we can determine which bitmap page to use */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* Identify which bit to clear */
	ovflbitno = blkno_to_bitno(metap, ovflblkno);

	bitmappage = ovflbitno >> BMPG_SHIFT(metap);
	bitmapbit = ovflbitno & BMPG_MASK(metap);

	if (bitmappage >= metap->hashm_nmaps)
		elog(ERROR, "invalid overflow bit number %u", ovflbitno);
	blkno = metap->hashm_mapp[bitmappage];

	/* Release metapage lock while we access the bitmap page */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/* Clear the bitmap bit to indicate that this overflow page is free */
	mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
	mappage = BufferGetPage(mapbuf);
	freep = HashPageGetBitmap(mappage);
	Assert(ISSET(freep, bitmapbit));
	CLRBIT(freep, bitmapbit);
	_hash_wrtbuf(rel, mapbuf);

	/* Get write-lock on metapage to update firstfree */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	/* if this is now the first free page, update hashm_firstfree */
	if (ovflbitno < metap->hashm_firstfree)
	{
		metap->hashm_firstfree = ovflbitno;
		_hash_wrtbuf(rel, metabuf);
	}
	else
	{
		/* no need to change metapage */
		_hash_relbuf(rel, metabuf);
	}

	return nextblkno;
}
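
/*
 * Illustration for _hash_freeovflpage(): if a bucket chain is
 * bucket -> A -> B -> C and B is freed, A's hasho_nextblkno is set to C's
 * block number and C's hasho_prevblkno to A's, leaving bucket -> A -> C;
 * B's bit in the free-page bitmap is then cleared so that a later
 * _hash_getovflpage() can hand the block out again.
 */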
/*
 *	_hash_initbitmap()
 *
 *	 Initialize a new bitmap page.  The metapage has a write-lock upon
 *	 entering the function, and must be written by caller after return.
 *
 * 'blkno' is the block number of the new bitmap page.
 *
 * All bits in the new bitmap page are set to "1", indicating "in use".
 */
void
_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
{
	Buffer		buf;
	Page		pg;
	HashPageOpaque op;
	uint32	   *freep;

	/*
	 * It is okay to write-lock the new bitmap page while holding metapage
	 * write lock, because no one else could be contending for the new page.
	 * Also, the metapage lock makes it safe to extend the index using
	 * _hash_getnewbuf.
	 *
	 * There is some loss of concurrency in possibly doing I/O for the new
	 * page while holding the metapage lock, but this path is taken so seldom
	 * that it's not worth worrying about.
	 */
	buf = _hash_getnewbuf(rel, blkno);
	pg = BufferGetPage(buf);

	/* initialize the page's special space */
	op = (HashPageOpaque) PageGetSpecialPointer(pg);
	op->hasho_prevblkno = InvalidBlockNumber;
	op->hasho_nextblkno = InvalidBlockNumber;
	op->hasho_bucket = -1;
	op->hasho_flag = LH_BITMAP_PAGE;
	op->hasho_page_id = HASHO_PAGE_ID;

	/* set all of the bits to 1 */
	freep = HashPageGetBitmap(pg);
	MemSet(freep, 0xFF, BMPGSZ_BYTE(metap));

	/* write out the new bitmap page (releasing write lock and pin) */
	_hash_wrtbuf(rel, buf);

	/* add the new bitmap page to the metapage's list of bitmaps */
	/* metapage already has a write lock */
	if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("out of overflow pages in hash index \"%s\"",
						RelationGetRelationName(rel))));

	metap->hashm_mapp[metap->hashm_nmaps] = blkno;

	metap->hashm_nmaps++;
}
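
/*
 * Capacity note for _hash_initbitmap(): each bitmap page tracks
 * BMPGSZ_BIT(metap) overflow pages, and at most HASH_MAX_BITMAPS bitmap
 * pages can be remembered in hashm_mapp[].  Assuming the common 4096-byte
 * bitmap array (32768 bits) and a limit of 128 bitmap pages, the "out of
 * overflow pages" error above fires only after roughly 4 million overflow
 * pages have been allocated.
 */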
/*
 *	_hash_squeezebucket(rel, bucket)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages.  When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last page in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this routine clean them up.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 */
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno,
					BufferAccessStrategy bstrategy)
{
	BlockNumber wblkno;
	BlockNumber rblkno;
	Buffer		wbuf;
	Buffer		rbuf;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;
	OffsetNumber woffnum;
	OffsetNumber roffnum;
	IndexTuple	itup;
	Size		itemsz;

	/*
	 * start squeezing into the base bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = _hash_getbuf_with_strategy(rel,
									  wblkno,
									  HASH_WRITE,
									  LH_BUCKET_PAGE,
									  bstrategy);
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		_hash_relbuf(rel, wbuf);
		return;
	}

	/*
	 * Find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.  Note: we assume that a hash bucket chain is
	 * usually smaller than the buffer ring being used by VACUUM, else using
	 * the access strategy here would be counterproductive.
	 */
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		if (ropaque != wopaque)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 */
	roffnum = FirstOffsetNumber;
	for (;;)
	{
		/* this test is needed in case page is empty on entry */
		if (roffnum <= PageGetMaxOffsetNumber(rpage))
		{
			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			itemsz = IndexTupleDSize(*itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item.  Exit if we reach the read page.
			 */
			while (PageGetFreeSpace(wpage) < itemsz)
			{
				Assert(!PageIsEmpty(wpage));

				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				_hash_wrtbuf(rel, wbuf);

				if (rblkno == wblkno)
				{
					/* wbuf is already released */
					_hash_wrtbuf(rel, rbuf);
					return;
				}

				wbuf = _hash_getbuf_with_strategy(rel,
												  wblkno,
												  HASH_WRITE,
												  LH_OVERFLOW_PAGE,
												  bstrategy);
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
			}

			/*
			 * we have found room so insert on the "write" page.
			 */
			woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
			if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false, false)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));

			/*
			 * delete the tuple from the "read" page.  PageIndexTupleDelete
			 * repacks the ItemId array, so 'roffnum' will be "advanced" to
			 * the "next" ItemId.
			 */
			PageIndexTupleDelete(rpage, roffnum);
		}

		/*
		 * if the "read" page is now empty because of the deletion (or because
		 * it was empty when we got to it), free it.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  However, in that case we are done anyway, so we can
		 * simply drop the write lock before calling _hash_freeovflpage.
		 */
		if (PageIsEmpty(rpage))
		{
			rblkno = ropaque->hasho_prevblkno;
			Assert(BlockNumberIsValid(rblkno));

			/* are we freeing the page adjacent to wbuf? */
			if (rblkno == wblkno)
			{
				/* yes, so release wbuf lock first */
				_hash_wrtbuf(rel, wbuf);
				/* free this overflow page (releases rbuf) */
				_hash_freeovflpage(rel, rbuf, bstrategy);
				/* done */
				return;
			}

			/* free this overflow page, then get the previous one */
			_hash_freeovflpage(rel, rbuf, bstrategy);

			rbuf = _hash_getbuf_with_strategy(rel,
											  rblkno,
											  HASH_WRITE,
											  LH_OVERFLOW_PAGE,
											  bstrategy);
			rpage = BufferGetPage(rbuf);
			ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
			Assert(ropaque->hasho_bucket == bucket);

			roffnum = FirstOffsetNumber;
		}
	}
}