/*-------------------------------------------------------------------------
 *
 * hashovfl.c
 *    Overflow page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hashovfl.c
 *
 * NOTES
 *    Overflow pages look like ordinary relation pages.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"


static uint32 _hash_firstfreebit(uint32 map);
/*
 * Convert overflow page bit number (its index in the free-page bitmaps)
 * to block number within the index.
 */
static BlockNumber
bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
{
    uint32      splitnum = metap->hashm_ovflpoint;
    uint32      i;

    /* Convert zero-based bitnumber to 1-based page number */
    ovflbitnum += 1;

    /* Determine the split number for this page (must be >= 1) */
    for (i = 1;
         i < splitnum && ovflbitnum > metap->hashm_spares[i];
         i++)
         /* loop */ ;

    /*
     * Convert to absolute page number by adding the number of bucket pages
     * that exist before this split point.
     */
    return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
}
/*
 * _hash_ovflblkno_to_bitno
 *
 * Convert overflow page block number to bit number for free-page bitmap.
 */
uint32
_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
{
    uint32      splitnum = metap->hashm_ovflpoint;
    uint32      i;
    uint32      bitnum;

    /* Determine the split number containing this page */
    for (i = 1; i <= splitnum; i++)
    {
        if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
            break;
        bitnum = ovflblkno - _hash_get_totalbuckets(i);

        /*
         * bitnum has to be greater than the number of overflow pages added
         * in previous split points.  The overflow pages at this splitnum
         * (i), if any, start from (_hash_get_totalbuckets(i) +
         * metap->hashm_spares[i - 1] + 1).
         */
        if (bitnum > metap->hashm_spares[i - 1] &&
            bitnum <= metap->hashm_spares[i])
            return bitnum - 1;  /* -1 to convert 1-based to 0-based */
    }

    ereport(ERROR,
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
             errmsg("invalid overflow block number %u", ovflblkno)));
    return 0;                   /* keep compiler quiet */
}
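/*
 * A sanity-check sketch (not used anywhere in this file): for a valid
 * overflow page the two conversions above should round-trip, e.g.
 *
 *      Assert(bitno_to_blkno(metap, _hash_ovflblkno_to_bitno(metap, blkno)) == blkno);
 *
 * should hold whenever 'blkno' really is an overflow page of the index.
 */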
/*
 *  _hash_addovflpage
 *
 *  Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *  On entry, the caller must hold a pin but no lock on 'buf'.  The pin is
 *  dropped before exiting (we assume the caller is not interested in 'buf'
 *  anymore) if not asked to retain.  The pin will be retained only for the
 *  primary bucket.  The returned overflow page will be pinned and
 *  write-locked; it is guaranteed to be empty.
 *
 *  The caller must hold a pin, but no lock, on the metapage buffer.
 *  That buffer is returned in the same state.
 *
 *  NB: since this could be executed concurrently by multiple processes,
 *  one should not assume that the returned overflow page will be the
 *  immediate successor of the originally passed 'buf'.  Additional overflow
 *  pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
{
    Buffer      ovflbuf;
    Page        page;
    Page        ovflpage;
    HashPageOpaque pageopaque;
    HashPageOpaque ovflopaque;
    HashMetaPage metap;
    Buffer      mapbuf = InvalidBuffer;
    Buffer      newmapbuf = InvalidBuffer;
    BlockNumber blkno;
    uint32      orig_firstfree;
    uint32      splitnum;
    uint32     *freep = NULL;
    uint32      max_ovflpg;
    uint32      bit;
    uint32      bitmap_page_bit;
    uint32      first_page;
    uint32      last_bit;
    uint32      last_page;
    uint32      i,
                j;
    bool        page_found = false;

    /*
     * Write-lock the tail page.  Here we need to maintain the following
     * locking order: first acquire the lock on the tail page of the bucket,
     * then on the meta page to find and lock the bitmap page; if one is
     * found, the lock on the meta page is released, and finally the lock on
     * the new overflow buffer is acquired.  We need this locking order to
     * avoid deadlock with backends that are doing inserts.
     *
     * Note: We could have avoided locking many buffers here if we made two
     * WAL records for acquiring an overflow page (one to allocate an
     * overflow page and another to add it to the overflow bucket chain).
     * However, doing so can leak an overflow page if the system crashes
     * after allocation.  Needless to say, it is better to have a single
     * record from a performance point of view as well.
     */
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    /* probably redundant... */
    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

    /* loop to find current tail page, in case someone else inserted too */
    for (;;)
    {
        BlockNumber nextblkno;

        page = BufferGetPage(buf);
        pageopaque = HashPageGetOpaque(page);
        nextblkno = pageopaque->hasho_nextblkno;

        if (!BlockNumberIsValid(nextblkno))
            break;

        /* we assume we do not need to write the unmodified page */
        if (retain_pin)
        {
            /* pin will be retained only for the primary bucket page */
            Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        }
        else
            _hash_relbuf(rel, buf);

        retain_pin = false;

        buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
    }

    /* Get exclusive lock on the meta page */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* start search at hashm_firstfree */
    orig_firstfree = metap->hashm_firstfree;
    first_page = orig_firstfree >> BMPG_SHIFT(metap);
    bit = orig_firstfree & BMPG_MASK(metap);
    i = first_page;
    j = bit / BITS_PER_MAP;
    bit &= ~(BITS_PER_MAP - 1);
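    /*
     * Worked example with hypothetical numbers (the real values depend on
     * the block size): if BMPGSZ_BIT(metap) were 1024 and orig_firstfree
     * were 1500, then first_page = 1500 >> 10 = 1, bit = 1500 & 1023 = 476,
     * j = 476 / 32 = 14 (BITS_PER_MAP bits per uint32 word), and bit is
     * rounded down to the word boundary 448 before the scan below.
     */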
    /* outer loop iterates once per bitmap page */
    for (;;)
    {
        BlockNumber mapblkno;
        Page        mappage;
        uint32      last_inpage;

        /* want to end search with the last existing overflow page */
        splitnum = metap->hashm_ovflpoint;
        max_ovflpg = metap->hashm_spares[splitnum] - 1;
        last_page = max_ovflpg >> BMPG_SHIFT(metap);
        last_bit = max_ovflpg & BMPG_MASK(metap);

        if (i > last_page)
            break;

        Assert(i < metap->hashm_nmaps);
        mapblkno = metap->hashm_mapp[i];

        if (i == last_page)
            last_inpage = last_bit;
        else
            last_inpage = BMPGSZ_BIT(metap) - 1;

        /* Release exclusive lock on metapage while reading bitmap page */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

        mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
        mappage = BufferGetPage(mapbuf);
        freep = HashPageGetBitmap(mappage);

        for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
        {
            if (freep[j] != ALL_SET)
            {
                page_found = true;

                /* Reacquire exclusive lock on the meta page */
                LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

                /* convert bit to bit number within page */
                bit += _hash_firstfreebit(freep[j]);
                bitmap_page_bit = bit;

                /* convert bit to absolute bit number */
                bit += (i << BMPG_SHIFT(metap));
                /* Calculate address of the recycled overflow page */
                blkno = bitno_to_blkno(metap, bit);

                /* Fetch and init the recycled page */
                ovflbuf = _hash_getinitbuf(rel, blkno);

                goto found;
            }
        }

        /* No free space here, try to advance to next map page */
        _hash_relbuf(rel, mapbuf);
        mapbuf = InvalidBuffer;
        i++;
        j = 0;                  /* scan from start of next map page */
        bit = 0;

        /* Reacquire exclusive lock on the meta page */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * No free pages --- have to extend the relation to add an overflow page.
     * First, check to see if we have to add a new bitmap page too.
     */
    if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
    {
        /*
         * We create the new bitmap page with all pages marked "in use".
         * Actually two pages in the new bitmap's range will exist
         * immediately: the bitmap page itself, and the following page which
         * is the one we return to the caller.  Both of these are correctly
         * marked "in use".  Subsequent pages do not exist yet, but it is
         * convenient to pre-mark them as "in use" too.
         */
        bit = metap->hashm_spares[splitnum];

        /* metapage already has a write lock */
        if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("out of overflow pages in hash index \"%s\"",
                            RelationGetRelationName(rel))));

        newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
    }
    else
    {
        /*
         * Nothing to do here; since the page will be past the last used page,
         * we know its bitmap bit was preinitialized to "in use".
         */
    }

    /* Calculate address of the new overflow page */
    bit = BufferIsValid(newmapbuf) ?
        metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
     * relation length stays in sync with ours.  XXX It's annoying to do this
     * with metapage write lock held; would be better to use a lock that
     * doesn't block incoming searches.
     *
     * It is okay to hold two buffer locks here (one on tail page of bucket
     * and other on new overflow page) since there cannot be anyone else
     * contending for access to ovflbuf.
     */
    ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

found:

    /*
     * Do the update.  No ereport(ERROR) until changes are logged.  We want to
     * log the changes for bitmap page and overflow page together to avoid
     * loss of pages in case the new page is added.
     */
    START_CRIT_SECTION();

    if (page_found)
    {
        Assert(BufferIsValid(mapbuf));

        /* mark page "in use" in the bitmap */
        SETBIT(freep, bitmap_page_bit);
        MarkBufferDirty(mapbuf);
    }
    else
    {
        /* update the count to indicate new overflow page is added */
        metap->hashm_spares[splitnum]++;

        if (BufferIsValid(newmapbuf))
        {
            _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
            MarkBufferDirty(newmapbuf);

            /* add the new bitmap page to the metapage's list of bitmaps */
            metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
            metap->hashm_nmaps++;
            metap->hashm_spares[splitnum]++;
        }

        MarkBufferDirty(metabuf);

        /*
         * for new overflow page, we don't need to explicitly set the bit in
         * bitmap page, as by default that will be set to "in use".
         */
    }

    /*
     * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
    {
        metap->hashm_firstfree = bit + 1;
        MarkBufferDirty(metabuf);
    }

    /* initialize new overflow page */
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = HashPageGetOpaque(ovflpage);
    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    /* logically chain overflow page to previous page */
    pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_hash_add_ovfl_page xlrec;

        xlrec.bmpage_found = page_found;
        xlrec.bmsize = metap->hashm_bmsize;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);

        XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
        XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));

        XLogRegisterBuffer(1, buf, REGBUF_STANDARD);

        if (BufferIsValid(mapbuf))
        {
            XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
            XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
        }

        if (BufferIsValid(newmapbuf))
            XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);

        XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
        XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);

        PageSetLSN(BufferGetPage(ovflbuf), recptr);
        PageSetLSN(BufferGetPage(buf), recptr);

        if (BufferIsValid(mapbuf))
            PageSetLSN(BufferGetPage(mapbuf), recptr);

        if (BufferIsValid(newmapbuf))
            PageSetLSN(BufferGetPage(newmapbuf), recptr);

        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    if (retain_pin)
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    else
        _hash_relbuf(rel, buf);

    if (BufferIsValid(mapbuf))
        _hash_relbuf(rel, mapbuf);

    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    if (BufferIsValid(newmapbuf))
        _hash_relbuf(rel, newmapbuf);

    return ovflbuf;
}
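/*
 * Usage sketch (illustrative only, not copied from the actual callers such
 * as the insert path in hashinsert.c): holding a pin on the metapage buffer
 * and a pin on the bucket's current last page, a caller does roughly
 *
 *      newbuf = _hash_addovflpage(rel, metabuf, buf, retain_pin);
 *      ... add the tuple to the page of 'newbuf' ...
 *      _hash_relbuf(rel, newbuf);
 *
 * On return 'newbuf' is pinned and write-locked, while the pin on 'buf' has
 * been dropped unless retain_pin was set for the primary bucket page.
 */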
/*
 *  _hash_firstfreebit()
 *
 *  Return the number of the first bit that is not set in the word 'map'.
 */
static uint32
_hash_firstfreebit(uint32 map)
{
    uint32      i,
                mask;

    mask = 0x1;
    for (i = 0; i < BITS_PER_MAP; i++)
    {
        if (!(mask & map))
            return i;
        mask <<= 1;
    }

    elog(ERROR, "firstfreebit found no free bit");

    return 0;                   /* keep compiler quiet */
}
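/*
 * For illustration: _hash_firstfreebit(0x0000000B) returns 2, since bits 0,
 * 1, and 3 are set in the map word and bit 2 is the first clear one.
 */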
/*
 *  _hash_freeovflpage() -
 *
 *  Remove this overflow page from its bucket's chain, and mark the page as
 *  free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *  Add the tuples (itups) to wbuf in this function.  We could do that in the
 *  caller as well, but the advantage of doing it here is we can easily write
 *  the WAL for the XLOG_HASH_SQUEEZE_PAGE operation.  The addition of tuples
 *  and the removal of the overflow page have to be done as an atomic
 *  operation, otherwise during replay on standby users might find duplicate
 *  records.
 *
 *  Since this function is invoked in VACUUM, we provide an access strategy
 *  parameter that controls fetches of the bucket pages.
 *
 *  Returns the block number of the page that followed the given page
 *  in the bucket, or InvalidBlockNumber if no following page.
 *
 *  NB: caller must not hold lock on metapage, nor on page, that's next to
 *  ovflbuf in the bucket chain.  We don't acquire the lock on the page that's
 *  prior to ovflbuf in the chain if it is the same as wbuf because the caller
 *  already has a lock on it.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
                   Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
                   Size *tups_size, uint16 nitups,
                   BufferAccessStrategy bstrategy)
{
    HashMetaPage metap;
    Buffer      metabuf;
    Buffer      mapbuf;
    BlockNumber ovflblkno;
    BlockNumber prevblkno;
    BlockNumber blkno;
    BlockNumber nextblkno;
    BlockNumber writeblkno;
    HashPageOpaque ovflopaque;
    Page        ovflpage;
    Page        mappage;
    uint32     *freep;
    uint32      ovflbitno;
    int32       bitmappage,
                bitmapbit;
    Bucket      bucket PG_USED_FOR_ASSERTS_ONLY;
    Buffer      prevbuf = InvalidBuffer;
    Buffer      nextbuf = InvalidBuffer;
    bool        update_metap = false;

    /* Get information from the doomed page */
    _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
    ovflblkno = BufferGetBlockNumber(ovflbuf);
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = HashPageGetOpaque(ovflpage);
    nextblkno = ovflopaque->hasho_nextblkno;
    prevblkno = ovflopaque->hasho_prevblkno;
    writeblkno = BufferGetBlockNumber(wbuf);
    bucket = ovflopaque->hasho_bucket;

    /*
     * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
     * up the bucket chain members behind and ahead of the overflow page
     * being deleted.  Concurrency issues are avoided by using lock chaining
     * as described atop hashbucketcleanup.
     */
    if (BlockNumberIsValid(prevblkno))
    {
        if (prevblkno == writeblkno)
            prevbuf = wbuf;
        else
            prevbuf = _hash_getbuf_with_strategy(rel,
                                                 prevblkno,
                                                 HASH_WRITE,
                                                 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                                 bstrategy);
    }
    if (BlockNumberIsValid(nextblkno))
        nextbuf = _hash_getbuf_with_strategy(rel,
                                             nextblkno,
                                             HASH_WRITE,
                                             LH_OVERFLOW_PAGE,
                                             bstrategy);

    /* Note: bstrategy is intentionally not used for metapage and bitmap */

    /* Read the metapage so we can determine which bitmap page to use */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* Identify which bit to set */
    ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);

    bitmappage = ovflbitno >> BMPG_SHIFT(metap);
    bitmapbit = ovflbitno & BMPG_MASK(metap);

    if (bitmappage >= metap->hashm_nmaps)
        elog(ERROR, "invalid overflow bit number %u", ovflbitno);
    blkno = metap->hashm_mapp[bitmappage];

    /* Release metapage lock while we access the bitmap page */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /* read the bitmap page to clear the bitmap bit */
    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
    mappage = BufferGetPage(mapbuf);
    freep = HashPageGetBitmap(mappage);
    Assert(ISSET(freep, bitmapbit));

    /* Get write-lock on metapage to update firstfree */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* This operation needs to log multiple tuples, prepare WAL for that */
    if (RelationNeedsWAL(rel))
        XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);

    START_CRIT_SECTION();

    /*
     * we have to insert tuples on the "write" page, being careful to
     * preserve hashkey ordering.  (If we insert many tuples into the same
     * "write" page it would be worth qsort'ing them).
     */
    if (nitups > 0)
    {
        _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
        MarkBufferDirty(wbuf);
    }

    /*
     * Reinitialize the freed overflow page.  Just zeroing the page won't
     * work, because WAL replay routines expect pages to be initialized.  See
     * explanation of RBM_NORMAL mode atop XLogReadBufferExtended.  We are
     * careful to make the special space valid here so that tools like
     * pageinspect won't get confused.
     */
    _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

    ovflopaque = HashPageGetOpaque(ovflpage);

    ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = InvalidBucket;
    ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    if (BufferIsValid(prevbuf))
    {
        Page        prevpage = BufferGetPage(prevbuf);
        HashPageOpaque prevopaque = HashPageGetOpaque(prevpage);

        Assert(prevopaque->hasho_bucket == bucket);
        prevopaque->hasho_nextblkno = nextblkno;
        MarkBufferDirty(prevbuf);
    }
    if (BufferIsValid(nextbuf))
    {
        Page        nextpage = BufferGetPage(nextbuf);
        HashPageOpaque nextopaque = HashPageGetOpaque(nextpage);

        Assert(nextopaque->hasho_bucket == bucket);
        nextopaque->hasho_prevblkno = prevblkno;
        MarkBufferDirty(nextbuf);
    }

    /* Clear the bitmap bit to indicate that this overflow page is free */
    CLRBIT(freep, bitmapbit);
    MarkBufferDirty(mapbuf);

    /* if this is now the first free page, update hashm_firstfree */
    if (ovflbitno < metap->hashm_firstfree)
    {
        metap->hashm_firstfree = ovflbitno;
        update_metap = true;
        MarkBufferDirty(metabuf);
    }

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_squeeze_page xlrec;
        XLogRecPtr  recptr;
        int         i;
        bool        mod_wbuf = false;

        xlrec.prevblkno = prevblkno;
        xlrec.nextblkno = nextblkno;
        xlrec.ntups = nitups;
        xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
        xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);

        /*
         * bucket buffer was not changed, but still needs to be registered to
         * ensure that we can acquire a cleanup lock on it during replay.
         */
        if (!xlrec.is_prim_bucket_same_wrt)
        {
            uint8       flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;

            XLogRegisterBuffer(0, bucketbuf, flags);
        }

        if (xlrec.ntups > 0)
        {
            XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);

            /* Remember that wbuf is modified. */
            mod_wbuf = true;

            XLogRegisterBufData(1, (char *) itup_offsets,
                                nitups * sizeof(OffsetNumber));
            for (i = 0; i < nitups; i++)
                XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
        }
        else if (xlrec.is_prim_bucket_same_wrt || xlrec.is_prev_bucket_same_wrt)
        {
            uint8       wbuf_flags;

            /*
             * A write buffer needs to be registered even if no tuples are
             * added to it, to ensure that we can acquire a cleanup lock on
             * it if it is the same as the primary bucket buffer, or update
             * the nextblkno if it is the same as the previous bucket buffer.
             */
            Assert(xlrec.ntups == 0);

            wbuf_flags = REGBUF_STANDARD;
            if (!xlrec.is_prev_bucket_same_wrt)
            {
                wbuf_flags |= REGBUF_NO_CHANGE;
            }
            else
            {
                /* Remember that wbuf is modified. */
                mod_wbuf = true;
            }
            XLogRegisterBuffer(1, wbuf, wbuf_flags);
        }

        XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);

        /*
         * If prevpage and the writepage (block in which we are moving tuples
         * from overflow) are same, then no need to separately register
         * prevpage.  During replay, we can directly update the nextblock in
         * writepage.
         */
        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);

        if (BufferIsValid(nextbuf))
            XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);

        XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
        XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));

        if (update_metap)
        {
            XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
            XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
        }

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);

        /* Set LSN iff wbuf is modified. */
        if (mod_wbuf)
            PageSetLSN(BufferGetPage(wbuf), recptr);

        PageSetLSN(BufferGetPage(ovflbuf), recptr);

        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            PageSetLSN(BufferGetPage(prevbuf), recptr);
        if (BufferIsValid(nextbuf))
            PageSetLSN(BufferGetPage(nextbuf), recptr);

        PageSetLSN(BufferGetPage(mapbuf), recptr);

        if (update_metap)
            PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* release previous bucket if it is not same as write bucket */
    if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
        _hash_relbuf(rel, prevbuf);

    if (BufferIsValid(ovflbuf))
        _hash_relbuf(rel, ovflbuf);

    if (BufferIsValid(nextbuf))
        _hash_relbuf(rel, nextbuf);

    _hash_relbuf(rel, mapbuf);
    _hash_relbuf(rel, metabuf);

    return nextblkno;
}
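/*
 * Call sketch (for orientation; see _hash_squeezebucket below for the real
 * call): after accumulating the tuples of the page to be freed, the caller
 * does roughly
 *
 *      _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
 *                         tups_size, nitups, bstrategy);
 *
 * which releases rbuf and unlinks it from the bucket chain.
 */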
/*
 *  _hash_initbitmapbuffer()
 *
 *  Initialize a new bitmap page.  All bits in the new bitmap page are set to
 *  "1", indicating "in use".
 */
void
_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
{
    Page        pg;
    HashPageOpaque op;
    uint32     *freep;

    pg = BufferGetPage(buf);

    /* initialize the page */
    if (initpage)
        _hash_pageinit(pg, BufferGetPageSize(buf));

    /* initialize the page's special space */
    op = HashPageGetOpaque(pg);
    op->hasho_prevblkno = InvalidBlockNumber;
    op->hasho_nextblkno = InvalidBlockNumber;
    op->hasho_bucket = InvalidBucket;
    op->hasho_flag = LH_BITMAP_PAGE;
    op->hasho_page_id = HASHO_PAGE_ID;

    /* set all of the bits to 1 */
    freep = HashPageGetBitmap(pg);
    memset(freep, 0xFF, bmsize);

    /*
     * Set pd_lower just past the end of the bitmap page data.  We could even
     * set pd_lower equal to pd_upper, but this is more precise and makes the
     * page look compressible to xlog.c.
     */
    ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
}
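/*
 * The bitmap words set up here are later consulted and updated with the
 * SETBIT/CLRBIT/ISSET macros, e.g. SETBIT(freep, bitmap_page_bit) when an
 * overflow page is recycled in _hash_addovflpage and CLRBIT(freep, bitmapbit)
 * when one is released in _hash_freeovflpage.
 */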
/*
 *  _hash_squeezebucket(rel, bucket)
 *
 *  Try to squeeze the tuples onto pages occurring earlier in the
 *  bucket chain in an attempt to free overflow pages.  When we start
 *  the "squeezing", the page from which we start taking tuples (the
 *  "read" page) is the last bucket in the bucket chain and the page
 *  onto which we start squeezing tuples (the "write" page) is the
 *  first page in the bucket chain.  The read page works backward and
 *  the write page works forward; the procedure terminates when the
 *  read page and write page are the same page.
 *
 *  At completion of this procedure, it is guaranteed that all pages in
 *  the bucket are nonempty, unless the bucket is totally empty (in
 *  which case all overflow pages will be freed).  The original implementation
 *  required that to be true on entry as well, but it's a lot easier for
 *  callers to leave empty overflow pages and let this guy clean it up.
 *
 *  Caller must acquire cleanup lock on the primary page of the target
 *  bucket to exclude any scans that are in progress, which could easily
 *  be confused into returning the same tuple more than once or some tuples
 *  not at all by the rearrangement we are performing here.  To prevent
 *  any concurrent scan from crossing the squeeze scan we use lock chaining
 *  similar to hashbucketcleanup.  Refer to the comments atop hashbucketcleanup.
 *
 *  We need to retain a pin on the primary bucket to ensure that no concurrent
 *  split can start.
 *
 *  Since this function is invoked in VACUUM, we provide an access strategy
 *  parameter that controls fetches of the bucket pages.
 */
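/*
 * Illustration of the procedure described above:
 *
 *      [primary page] -> [ovfl 1] -> [ovfl 2] -> ... -> [last ovfl page]
 *       "write" page moves forward -->    <-- "read" page moves backward
 *
 * The loop terminates when the read page and the write page are the same.
 */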
void
_hash_squeezebucket(Relation rel,
                    Bucket bucket,
                    BlockNumber bucket_blkno,
                    Buffer bucket_buf,
                    BufferAccessStrategy bstrategy)
{
    BlockNumber wblkno;
    BlockNumber rblkno;
    Buffer      wbuf;
    Buffer      rbuf;
    Page        wpage;
    Page        rpage;
    HashPageOpaque wopaque;
    HashPageOpaque ropaque;

    /*
     * start squeezing into the primary bucket page.
     */
    wblkno = bucket_blkno;
    wbuf = bucket_buf;
    wpage = BufferGetPage(wbuf);
    wopaque = HashPageGetOpaque(wpage);

    /*
     * if there aren't any overflow pages, there's nothing to squeeze.  caller
     * is responsible for releasing the pin on primary bucket page.
     */
    if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
    {
        LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
        return;
    }

    /*
     * Find the last page in the bucket chain by starting at the base bucket
     * page and working forward.  Note: we assume that a hash bucket chain is
     * usually smaller than the buffer ring being used by VACUUM, else using
     * the access strategy here would be counterproductive.
     */
    rbuf = InvalidBuffer;
    ropaque = wopaque;
    do
    {
        rblkno = ropaque->hasho_nextblkno;
        if (rbuf != InvalidBuffer)
            _hash_relbuf(rel, rbuf);
        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = HashPageGetOpaque(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));

    /*
     * squeeze the tuples.
     */
    for (;;)
    {
        OffsetNumber roffnum;
        OffsetNumber maxroffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        IndexTuple  itups[MaxIndexTuplesPerPage];
        Size        tups_size[MaxIndexTuplesPerPage];
        OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
        uint16      ndeletable = 0;
        uint16      nitups = 0;
        Size        all_tups_size = 0;
        int         i;
        bool        retain_pin = false;

readpage:
        /* Scan each tuple in "read" page */
        maxroffnum = PageGetMaxOffsetNumber(rpage);
        for (roffnum = FirstOffsetNumber;
             roffnum <= maxroffnum;
             roffnum = OffsetNumberNext(roffnum))
        {
            IndexTuple  itup;
            Size        itemsz;

            /* skip dead tuples */
            if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
                continue;

            itup = (IndexTuple) PageGetItem(rpage,
                                            PageGetItemId(rpage, roffnum));
            itemsz = IndexTupleSize(itup);
            itemsz = MAXALIGN(itemsz);

            /*
             * Walk up the bucket chain, looking for a page big enough for
             * this item and all other accumulated items.  Exit if we reach
             * the read page.
             */
            while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
            {
                Buffer      next_wbuf = InvalidBuffer;
                bool        tups_moved = false;

                Assert(!PageIsEmpty(wpage));

                if (wblkno == bucket_blkno)
                    retain_pin = true;

                wblkno = wopaque->hasho_nextblkno;
                Assert(BlockNumberIsValid(wblkno));

                /* don't need to move to next page if we reached the read page */
                if (wblkno != rblkno)
                    next_wbuf = _hash_getbuf_with_strategy(rel,
                                                           wblkno,
                                                           HASH_WRITE,
                                                           LH_OVERFLOW_PAGE,
                                                           bstrategy);

                if (nitups > 0)
                {
                    Assert(nitups == ndeletable);

                    /*
                     * This operation needs to log multiple tuples, prepare
                     * WAL for that.
                     */
                    if (RelationNeedsWAL(rel))
                        XLogEnsureRecordSpace(0, 3 + nitups);

                    START_CRIT_SECTION();

                    /*
                     * we have to insert tuples on the "write" page, being
                     * careful to preserve hashkey ordering.  (If we insert
                     * many tuples into the same "write" page it would be
                     * worth qsort'ing them).
                     */
                    _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
                    MarkBufferDirty(wbuf);

                    /* Delete tuples we already moved off read page */
                    PageIndexMultiDelete(rpage, deletable, ndeletable);
                    MarkBufferDirty(rbuf);

                    /* XLOG stuff */
                    if (RelationNeedsWAL(rel))
                    {
                        XLogRecPtr  recptr;
                        xl_hash_move_page_contents xlrec;

                        xlrec.ntups = nitups;
                        xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf);

                        XLogBeginInsert();
                        XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);

                        /*
                         * bucket buffer was not changed, but still needs to
                         * be registered to ensure that we can acquire a
                         * cleanup lock on it during replay.
                         */
                        if (!xlrec.is_prim_bucket_same_wrt)
                        {
                            int         flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;

                            XLogRegisterBuffer(0, bucket_buf, flags);
                        }

                        XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
                        XLogRegisterBufData(1, (char *) itup_offsets,
                                            nitups * sizeof(OffsetNumber));
                        for (i = 0; i < nitups; i++)
                            XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);

                        XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
                        XLogRegisterBufData(2, (char *) deletable,
                                            ndeletable * sizeof(OffsetNumber));

                        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);

                        PageSetLSN(BufferGetPage(wbuf), recptr);
                        PageSetLSN(BufferGetPage(rbuf), recptr);
                    }

                    END_CRIT_SECTION();

                    tups_moved = true;
                }

                /*
                 * release the lock on previous page after acquiring the lock
                 * on next page
                 */
                if (retain_pin)
                    LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
                else
                    _hash_relbuf(rel, wbuf);

                /* nothing more to do if we reached the read page */
                if (rblkno == wblkno)
                {
                    _hash_relbuf(rel, rbuf);
                    return;
                }

                wbuf = next_wbuf;
                wpage = BufferGetPage(wbuf);
                wopaque = HashPageGetOpaque(wpage);
                Assert(wopaque->hasho_bucket == bucket);
                retain_pin = false;

                /* be tidy */
                for (i = 0; i < nitups; i++)
                    pfree(itups[i]);
                nitups = 0;
                all_tups_size = 0;
                ndeletable = 0;

                /*
                 * after moving the tuples, rpage would have been compacted,
                 * so we need to rescan it.
                 */
                if (tups_moved)
                    goto readpage;
            }

            /* remember tuple for deletion from "read" page */
            deletable[ndeletable++] = roffnum;

            /*
             * we need a copy of the index tuples, as they can be freed as
             * part of the overflow page; however, we need them to write a
             * WAL record in _hash_freeovflpage.
             */
            itups[nitups] = CopyIndexTuple(itup);
            tups_size[nitups++] = itemsz;
            all_tups_size += itemsz;
        }

        /*
         * If we reach here, there are no live tuples on the "read" page ---
         * it was empty when we got to it, or we moved them all.  So we can
         * just free the page without bothering with deleting tuples
         * individually.  Then advance to the previous "read" page.
         *
         * Tricky point here: if our read and write pages are adjacent in the
         * bucket chain, our write lock on wbuf will conflict with
         * _hash_freeovflpage's attempt to update the sibling links of the
         * removed page.  In that case, we don't need to lock it again.
         */
        rblkno = ropaque->hasho_prevblkno;
        Assert(BlockNumberIsValid(rblkno));

        /* free this overflow page (releases rbuf) */
        _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
                           tups_size, nitups, bstrategy);

        /* be tidy */
        for (i = 0; i < nitups; i++)
            pfree(itups[i]);

        /* are we freeing the page adjacent to wbuf? */
        if (rblkno == wblkno)
        {
            /* retain the pin on primary bucket page till end of bucket scan */
            if (wblkno == bucket_blkno)
                LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
            else
                _hash_relbuf(rel, wbuf);
            return;
        }

        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = HashPageGetOpaque(rpage);
        Assert(ropaque->hasho_bucket == bucket);