/*-------------------------------------------------------------------------
 *
 * hashovfl.c
 *	  Overflow page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES
 *	  Overflow pages look like ordinary relation pages.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
static uint32 _hash_firstfreebit(uint32 map);

/*
 * Convert overflow page bit number (its index in the free-page bitmaps)
 * to block number within the index.
 */
static BlockNumber
bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
{
	uint32		splitnum = metap->hashm_ovflpoint;
	uint32		i;

	/* Convert zero-based bitnumber to 1-based page number */
	ovflbitnum += 1;

	/* Determine the split number for this page (must be >= 1) */
	for (i = 1;
		 i < splitnum && ovflbitnum > metap->hashm_spares[i];
		 i++)
		/* loop */ ;

	/*
	 * Convert to absolute page number by adding the number of bucket pages
	 * that exist before this split point.
	 */
	return (BlockNumber) ((1 << i) + ovflbitnum);
}
/*
 * Convert overflow page block number to bit number for free-page bitmap.
 */
static uint32
blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
{
	uint32		splitnum = metap->hashm_ovflpoint;
	uint32		i;
	uint32		bitnum;

	/* Determine the split number containing this page */
	for (i = 1; i <= splitnum; i++)
	{
		if (ovflblkno <= (BlockNumber) (1 << i))
			break;				/* oops */
		bitnum = ovflblkno - (1 << i);
		if (bitnum <= metap->hashm_spares[i])
			return bitnum - 1;	/* -1 to convert 1-based to 0-based */
	}

	elog(ERROR, "invalid overflow block number %u", ovflblkno);
	return 0;					/* keep compiler quiet */
}
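
/*
 * Worked example for the two conversions above, using hypothetical metapage
 * values: with hashm_ovflpoint = 3 and hashm_spares[1..3] = {0, 2, 5},
 * overflow bit number 3 becomes 1-based page number 4, which falls in split
 * point 3, so bitno_to_blkno() returns (1 << 3) + 4 = 12.  Conversely,
 * blkno_to_bitno() on block 12 computes 12 - (1 << 3) = 4, sees that
 * 4 <= hashm_spares[3], and returns 4 - 1 = 3, recovering the original bit.
 */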
/*
 *	_hash_addovflpage
 *
 *	Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *	On entry, the caller must hold a pin but no lock on 'buf'.  The pin is
 *	dropped before exiting (we assume the caller is not interested in 'buf'
 *	anymore).  The returned overflow page will be pinned and write-locked;
 *	it is guaranteed to be empty.
 *
 *	The caller must hold a pin, but no lock, on the metapage buffer.
 *	That buffer is returned in the same state.
 *
 *	The caller must hold at least share lock on the bucket, to ensure that
 *	no one else tries to compact the bucket meanwhile.  This guarantees that
 *	'buf' won't stop being part of the bucket while it's unlocked.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.  Additional overflow
 * pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
	Buffer		ovflbuf;
	Page		page;
	Page		ovflpage;
	HashPageOpaque pageopaque;
	HashPageOpaque ovflopaque;

	/* allocate and lock an empty overflow page */
	ovflbuf = _hash_getovflpage(rel, metabuf);

	/*
	 * Write-lock the tail page.  It is okay to hold two buffer locks here
	 * since there cannot be anyone else contending for access to ovflbuf.
	 */
	_hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE);

	/* probably redundant... */
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

	/* loop to find current tail page, in case someone else inserted too */
	for (;;)
	{
		BlockNumber nextblkno;

		page = BufferGetPage(buf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno))
			break;

		/* we assume we do not need to write the unmodified page */
		_hash_relbuf(rel, buf);

		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
	}

	/* now that we have correct backlink, initialize new overflow page */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* logically chain overflow page to previous page */
	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
	_hash_wrtbuf(rel, buf);

	return ovflbuf;
}
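
/*
 * Illustrative caller sketch for _hash_addovflpage(); names and intermediate
 * steps are hypothetical (cf. the real insertion path, _hash_doinsert in
 * hashinsert.c):
 *
 *		-- 'metabuf' and 'buf' (the bucket's tail page) are pinned, unlocked
 *		ovflbuf = _hash_addovflpage(rel, metabuf, buf);
 *		page = BufferGetPage(ovflbuf);	-- empty, pinned, write-locked
 *		-- ... add the new index tuple to 'page' ...
 *		_hash_wrtbuf(rel, ovflbuf);		-- write the page, drop lock and pin
 */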
/*
 *	_hash_getovflpage()
 *
 *	Find an available overflow page and return it.  The returned buffer
 *	is pinned and write-locked, and has had _hash_pageinit() applied,
 *	but it is caller's responsibility to fill the special space.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * That buffer is left in the same state at exit.
 */
static Buffer
_hash_getovflpage(Relation rel, Buffer metabuf)
{
	HashMetaPage metap;
	Buffer		mapbuf = 0;
	Buffer		newbuf;
	BlockNumber blkno;
	uint32		orig_firstfree;
	uint32		splitnum;
	uint32	   *freep = NULL;
	uint32		max_ovflpg;
	uint32		bit;
	uint32		first_page;
	uint32		last_bit;
	uint32		last_page;
	uint32		i,
				j;

	/* Get exclusive lock on the meta page */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	_hash_checkpage(rel, metabuf, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* start search at hashm_firstfree */
	orig_firstfree = metap->hashm_firstfree;
	first_page = orig_firstfree >> BMPG_SHIFT(metap);
	bit = orig_firstfree & BMPG_MASK(metap);
	i = first_page;
	j = bit / BITS_PER_MAP;
	bit &= ~(BITS_PER_MAP - 1);
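
	/*
	 * Worked example (assuming the common 4096-byte bitmap array, so
	 * BMPGSZ_BIT(metap) = 32768, BMPG_SHIFT(metap) = 15, and
	 * BITS_PER_MAP = 32): if hashm_firstfree is 40010, then
	 * i = 40010 >> 15 = 1 (the second bitmap page), bit = 40010 & 32767
	 * = 7242, j = 7242 / 32 = 226 (the map word to test first), and bit is
	 * rounded down to 7232, the first bit covered by that word.
	 */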

	/* outer loop iterates once per bitmap page */
	for (;;)
	{
		BlockNumber mapblkno;
		Page		mappage;
		uint32		last_inpage;

		/* want to end search with the last existing overflow page */
		splitnum = metap->hashm_ovflpoint;
		max_ovflpg = metap->hashm_spares[splitnum] - 1;
		last_page = max_ovflpg >> BMPG_SHIFT(metap);
		last_bit = max_ovflpg & BMPG_MASK(metap);

		if (i > last_page)
			break;

		Assert(i < metap->hashm_nmaps);
		mapblkno = metap->hashm_mapp[i];

		if (i == last_page)
			last_inpage = last_bit;
		else
			last_inpage = BMPGSZ_BIT(metap) - 1;

		/* Release exclusive lock on metapage while reading bitmap page */
		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

		mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
		mappage = BufferGetPage(mapbuf);
		freep = HashPageGetBitmap(mappage);

		for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
		{
			if (freep[j] != ALL_SET)
				goto found;
		}

		/* No free space here, try to advance to next map page */
		_hash_relbuf(rel, mapbuf);
		i++;
		j = 0;					/* scan from start of next map page */
		bit = 0;

		/* Reacquire exclusive lock on the meta page */
		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
	}

	/*
	 * No free pages --- have to extend the relation to add an overflow page.
	 * First, check to see if we have to add a new bitmap page too.
	 */
	if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
	{
		/*
		 * We create the new bitmap page with all pages marked "in use".
		 * Actually two pages in the new bitmap's range will exist
		 * immediately: the bitmap page itself, and the following page which
		 * is the one we return to the caller.  Both of these are correctly
		 * marked "in use".  Subsequent pages do not exist yet, but it is
		 * convenient to pre-mark them as "in use" too.
		 */
		bit = metap->hashm_spares[splitnum];
		_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
		metap->hashm_spares[splitnum]++;
	}
	else
	{
		/*
		 * Nothing to do here; since the page will be past the last used page,
		 * we know its bitmap bit was preinitialized to "in use".
		 */
	}
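
	/*
	 * Example (assuming a 4096-byte bitmap array, i.e. BMPGSZ_BIT(metap) =
	 * 32768): the test above is true only when the last existing overflow
	 * page occupies the final bit (32767) of its bitmap page, so a new
	 * bitmap page is allocated roughly once per 32768 overflow pages
	 * created through this path.
	 */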

	/* Calculate address of the new overflow page */
	bit = metap->hashm_spares[splitnum];
	blkno = bitno_to_blkno(metap, bit);

	/*
	 * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
	 * relation length stays in sync with ours.  XXX It's annoying to do this
	 * with metapage write lock held; would be better to use a lock that
	 * doesn't block incoming searches.
	 */
	newbuf = _hash_getnewbuf(rel, blkno);

	metap->hashm_spares[splitnum]++;

	/*
	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
	 * changing it if someone moved it while we were searching bitmap pages.
	 */
	if (metap->hashm_firstfree == orig_firstfree)
		metap->hashm_firstfree = bit + 1;

	/* Write updated metapage and release lock, but not pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	return newbuf;

found:
	/* convert bit to bit number within page */
	bit += _hash_firstfreebit(freep[j]);

	/* mark page "in use" in the bitmap */
	SETBIT(freep, bit);
	_hash_wrtbuf(rel, mapbuf);

	/* Reacquire exclusive lock on the meta page */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	/* convert bit to absolute bit number */
	bit += (i << BMPG_SHIFT(metap));

	/* Calculate address of the recycled overflow page */
	blkno = bitno_to_blkno(metap, bit);

	/*
	 * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
	 * changing it if someone moved it while we were searching bitmap pages.
	 */
	if (metap->hashm_firstfree == orig_firstfree)
	{
		metap->hashm_firstfree = bit + 1;

		/* Write updated metapage and release lock, but not pin */
		_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
	}
	else
	{
		/* We didn't change the metapage, so no need to write */
		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
	}

	/* Fetch, init, and return the recycled page */
	return _hash_getinitbuf(rel, blkno);
}
/*
 *	_hash_firstfreebit()
 *
 *	Return the number of the first bit that is not set in the word 'map'.
 */
static uint32
_hash_firstfreebit(uint32 map)
{
	uint32		i,
				mask;

	mask = 0x1;
	for (i = 0; i < BITS_PER_MAP; i++)
	{
		if (!(mask & map))
			return i;
		mask <<= 1;
	}

	elog(ERROR, "firstfreebit found no free bit");
	return 0;					/* keep compiler quiet */
}
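
/*
 * Example for _hash_firstfreebit(): map = 0x0000001F has bits 0-4 set, so
 * the first free bit is 5 and the function returns 5.  A fully set word
 * (ALL_SET) would reach the elog(ERROR) path, but callers only pass words
 * already known to differ from ALL_SET.
 */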
/*
 *	_hash_freeovflpage() -
 *
 *	Remove this overflow page from its bucket's chain, and mark the page as
 *	free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 *
 *	Returns the block number of the page that followed the given page
 *	in the bucket, or InvalidBlockNumber if no following page.
 *
 *	NB: caller must not hold lock on metapage, nor on either page that's
 *	adjacent in the bucket chain.  The caller had better hold exclusive lock
 *	on the bucket, too.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
				   BufferAccessStrategy bstrategy)
{
	HashMetaPage metap;
	Buffer		metabuf;
	Buffer		mapbuf;
	BlockNumber ovflblkno;
	BlockNumber prevblkno;
	BlockNumber blkno;
	BlockNumber nextblkno;
	HashPageOpaque ovflopaque;
	Page		ovflpage;
	Page		mappage;
	uint32	   *freep;
	uint32		ovflbitno;
	int32		bitmappage,
				bitmapbit;
	Bucket		bucket;

	/* Get information from the doomed page */
	_hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
	ovflblkno = BufferGetBlockNumber(ovflbuf);
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	nextblkno = ovflopaque->hasho_nextblkno;
	prevblkno = ovflopaque->hasho_prevblkno;
	bucket = ovflopaque->hasho_bucket;

	/*
	 * Zero the page for debugging's sake; then write and release it. (Note:
	 * if we failed to zero the page here, we'd have problems with the Assert
	 * in _hash_pageinit() when the page is reused.)
	 */
	MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
	_hash_wrtbuf(rel, ovflbuf);

	/*
	 * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
	 * up the bucket chain members behind and ahead of the overflow page being
	 * deleted.  No concurrency issues since we hold exclusive lock on the
	 * entire bucket.
	 */
	if (BlockNumberIsValid(prevblkno))
	{
		Buffer		prevbuf = _hash_getbuf_with_strategy(rel,
														 prevblkno,
														 HASH_WRITE,
										  LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
														 bstrategy);
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		Assert(prevopaque->hasho_bucket == bucket);
		prevopaque->hasho_nextblkno = nextblkno;
		_hash_wrtbuf(rel, prevbuf);
	}
	if (BlockNumberIsValid(nextblkno))
	{
		Buffer		nextbuf = _hash_getbuf_with_strategy(rel,
														 nextblkno,
														 HASH_WRITE,
														 LH_OVERFLOW_PAGE,
														 bstrategy);
		Page		nextpage = BufferGetPage(nextbuf);
		HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

		Assert(nextopaque->hasho_bucket == bucket);
		nextopaque->hasho_prevblkno = prevblkno;
		_hash_wrtbuf(rel, nextbuf);
	}

	/* Note: bstrategy is intentionally not used for metapage and bitmap */

	/* Read the metapage so we can determine which bitmap page to use */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* Identify which bit to clear */
	ovflbitno = blkno_to_bitno(metap, ovflblkno);

	bitmappage = ovflbitno >> BMPG_SHIFT(metap);
	bitmapbit = ovflbitno & BMPG_MASK(metap);

	if (bitmappage >= metap->hashm_nmaps)
		elog(ERROR, "invalid overflow bit number %u", ovflbitno);
	blkno = metap->hashm_mapp[bitmappage];

	/* Release metapage lock while we access the bitmap page */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/* Clear the bitmap bit to indicate that this overflow page is free */
	mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
	mappage = BufferGetPage(mapbuf);
	freep = HashPageGetBitmap(mappage);
	Assert(ISSET(freep, bitmapbit));
	CLRBIT(freep, bitmapbit);
	_hash_wrtbuf(rel, mapbuf);

	/* Get write-lock on metapage to update firstfree */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	/* if this is now the first free page, update hashm_firstfree */
	if (ovflbitno < metap->hashm_firstfree)
	{
		metap->hashm_firstfree = ovflbitno;
		_hash_wrtbuf(rel, metabuf);
	}
	else
	{
		/* no need to change metapage */
		_hash_relbuf(rel, metabuf);
	}

	return nextblkno;
}
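
/*
 * Illustration for _hash_freeovflpage(): if a bucket chain is
 * bucket -> A -> B -> C and B is freed, A's hasho_nextblkno is set to C's
 * block number and C's hasho_prevblkno to A's, leaving bucket -> A -> C;
 * B's bit in the free-page bitmap is then cleared so that a later
 * _hash_getovflpage() can hand the block out again.
 */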
/*
 *	_hash_initbitmap()
 *
 *	 Initialize a new bitmap page.  The metapage has a write-lock upon
 *	 entering the function, and must be written by caller after return.
 *
 * 'blkno' is the block number of the new bitmap page.
 *
 * All bits in the new bitmap page are set to "1", indicating "in use".
 */
void
_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
{
	Buffer		buf;
	Page		pg;
	HashPageOpaque op;
	uint32	   *freep;

	/*
	 * It is okay to write-lock the new bitmap page while holding metapage
	 * write lock, because no one else could be contending for the new page.
	 * Also, the metapage lock makes it safe to extend the index using
	 * _hash_getnewbuf.
	 *
	 * There is some loss of concurrency in possibly doing I/O for the new
	 * page while holding the metapage lock, but this path is taken so seldom
	 * that it's not worth worrying about.
	 */
	buf = _hash_getnewbuf(rel, blkno);
	pg = BufferGetPage(buf);

	/* initialize the page's special space */
	op = (HashPageOpaque) PageGetSpecialPointer(pg);
	op->hasho_prevblkno = InvalidBlockNumber;
	op->hasho_nextblkno = InvalidBlockNumber;
	op->hasho_bucket = -1;
	op->hasho_flag = LH_BITMAP_PAGE;
	op->hasho_page_id = HASHO_PAGE_ID;

	/* set all of the bits to 1 */
	freep = HashPageGetBitmap(pg);
	MemSet(freep, 0xFF, BMPGSZ_BYTE(metap));

	/* write out the new bitmap page (releasing write lock and pin) */
	_hash_wrtbuf(rel, buf);

	/* add the new bitmap page to the metapage's list of bitmaps */
	/* metapage already has a write lock */
	if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("out of overflow pages in hash index \"%s\"",
						RelationGetRelationName(rel))));

	metap->hashm_mapp[metap->hashm_nmaps] = blkno;

	metap->hashm_nmaps++;
}
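
/*
 * Capacity note for _hash_initbitmap(): each bitmap page tracks
 * BMPGSZ_BIT(metap) overflow pages, and at most HASH_MAX_BITMAPS bitmap
 * pages can be remembered in hashm_mapp[].  Assuming the common 4096-byte
 * bitmap array (32768 bits) and a limit of 128 bitmap pages, the "out of
 * overflow pages" error above fires only after roughly 4 million overflow
 * pages have been allocated.
 */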
/*
 *	_hash_squeezebucket(rel, bucket)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages.  When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last page in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this routine clean them up.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 */
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno,
					BufferAccessStrategy bstrategy)
{
	BlockNumber wblkno;
	BlockNumber rblkno;
	Buffer		wbuf;
	Buffer		rbuf;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;
	OffsetNumber woffnum;
	OffsetNumber roffnum;
	IndexTuple	itup;
	Size		itemsz;

	/*
	 * start squeezing into the base bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = _hash_getbuf_with_strategy(rel,
									  wblkno,
									  HASH_WRITE,
									  LH_BUCKET_PAGE,
									  bstrategy);
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		_hash_relbuf(rel, wbuf);
		return;
	}

	/*
	 * Find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.  Note: we assume that a hash bucket chain is
	 * usually smaller than the buffer ring being used by VACUUM, else using
	 * the access strategy here would be counterproductive.
	 */
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		if (ropaque != wopaque)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 */
	roffnum = FirstOffsetNumber;
	for (;;)
	{
		/* this test is needed in case page is empty on entry */
		if (roffnum <= PageGetMaxOffsetNumber(rpage))
		{
			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			itemsz = IndexTupleDSize(*itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item.  Exit if we reach the read page.
			 */
			while (PageGetFreeSpace(wpage) < itemsz)
			{
				Assert(!PageIsEmpty(wpage));

				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				_hash_wrtbuf(rel, wbuf);

				if (rblkno == wblkno)
				{
					/* wbuf is already released */
					_hash_wrtbuf(rel, rbuf);
					return;
				}

				wbuf = _hash_getbuf_with_strategy(rel,
												  wblkno,
												  HASH_WRITE,
												  LH_OVERFLOW_PAGE,
												  bstrategy);
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
			}

			/*
			 * we have found room so insert on the "write" page.
			 */
			woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
			if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false, false)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));

			/*
			 * delete the tuple from the "read" page.  PageIndexTupleDelete
			 * repacks the ItemId array, so 'roffnum' will be "advanced" to
			 * the "next" ItemId.
			 */
			PageIndexTupleDelete(rpage, roffnum);
		}

		/*
		 * if the "read" page is now empty because of the deletion (or because
		 * it was empty when we got to it), free it.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  However, in that case we are done anyway, so we can
		 * simply drop the write lock before calling _hash_freeovflpage.
		 */
		if (PageIsEmpty(rpage))
		{
			rblkno = ropaque->hasho_prevblkno;
			Assert(BlockNumberIsValid(rblkno));

			/* are we freeing the page adjacent to wbuf? */
			if (rblkno == wblkno)
			{
				/* yes, so release wbuf lock first */
				_hash_wrtbuf(rel, wbuf);
				/* free this overflow page (releases rbuf) */
				_hash_freeovflpage(rel, rbuf, bstrategy);
				/* done */
				return;
			}

			/* free this overflow page, then get the previous one */
			_hash_freeovflpage(rel, rbuf, bstrategy);

			rbuf = _hash_getbuf_with_strategy(rel,
											  rblkno,
											  HASH_WRITE,
											  LH_OVERFLOW_PAGE,
											  bstrategy);
			rpage = BufferGetPage(rbuf);
			ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
			Assert(ropaque->hasho_bucket == bucket);

			roffnum = FirstOffsetNumber;
		}
	}
}