/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *	  BTree-specific page management code for the Postgres btree access
 *	  method.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtpage.c
 *
 *	NOTES
 *	   Postgres btree pages look like ordinary relation pages.  The opaque
 *	   data at high addresses includes pointers to left and right siblings
 *	   and flag data describing page state.  The first page in a btree, page
 *	   zero, is special -- it stores meta-information describing the tree.
 *	   Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/snapmgr.h"

static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
                                     bool *rightsib_empty);
static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
                                     OffsetNumber *deletable, int ndeletable);
static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
                                   BTStack stack, Buffer *topparent, OffsetNumber *topoff,
                                   BlockNumber *target, BlockNumber *rightsib);
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
                               TransactionId latestRemovedXid);

/*
 *	_bt_initmetapage() -- Fill a page buffer with a correct metapage image
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
                 bool allequalimage)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    metad->btm_allequalimage = allequalimage;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is essential,
     * because without doing so, metadata will be lost if xlog.c compresses
     * the page.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
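
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * _bt_initmetapage() only fills a caller-supplied page image; it does no
 * buffer management or WAL logging of its own.  A caller along the lines of
 * btbuildempty() (name assumed, lives elsewhere in the tree) would use it
 * roughly like this:
 *
 *		Page	metapage = (Page) palloc(BLCKSZ);
 *
 *		_bt_initmetapage(metapage, P_NONE, 0, allequalimage);
 *		... caller writes out and/or WAL-logs the page image itself ...
 */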

/*
 *	_bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
 *		3, the last version that can be updated without broadly affecting
 *		on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
 *
 *		This routine does purely in-memory image upgrade.  Caller is
 *		responsible for locking, WAL-logging etc.
 */
void
_bt_upgrademetapage(Page page)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;

    metad = BTPageGetMeta(page);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* It must be really a meta page of upgradable version */
    Assert(metaopaque->btpo_flags & BTP_META);
    Assert(metad->btm_version < BTREE_NOVAC_VERSION);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);

    /* Set version number and fill extra fields added into version 3 */
    metad->btm_version = BTREE_NOVAC_VERSION;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    /* Only a REINDEX can set this field */
    Assert(!metad->btm_allequalimage);
    metad->btm_allequalimage = false;

    /* Adjust pd_lower (see _bt_initmetapage() for details) */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

/*
 * Get metadata from share-locked buffer containing metapage, while performing
 * standard sanity checks.
 *
 * Callers that cache data returned here in local cache should note that an
 * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
 * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
 */
static BTMetaPageData *
_bt_getmeta(Relation rel, Buffer metabuf)
{
    Page        metapg;
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    return metad;
}

/*
 *	_bt_update_meta_cleanup_info() -- Update cleanup-related information in
 *			the metapage.
 *
 *		This routine checks if provided cleanup-related information is matching
 *		to those written in the metapage.  On mismatch, metapage is overwritten.
 */
void
_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
                             float8 numHeapTuples)
{
    Buffer      metabuf;
    Page        metapg;
    BTMetaPageData *metad;
    bool        needsRewrite = false;
    XLogRecPtr  recptr;

    /* read the metapage and check if it needs rewrite */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);

    /* outdated version of metapage always needs rewrite */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        needsRewrite = true;
    else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
             metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
        needsRewrite = true;

    if (!needsRewrite)
    {
        _bt_relbuf(rel, metabuf);
        return;
    }

    /* trade in our read lock for a write lock */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    LockBuffer(metabuf, BT_WRITE);

    START_CRIT_SECTION();

    /* upgrade meta-page if needed */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        _bt_upgrademetapage(metapg);

    /* update cleanup-related information */
    metad->btm_oldest_btpo_xact = oldestBtpoXact;
    metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
    MarkBufferDirty(metabuf);

    /* write wal record if needed */
    if (RelationNeedsWAL(rel))
    {
        xl_btree_metadata md;

        XLogBeginInsert();
        XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

        Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
        md.version = metad->btm_version;
        md.root = metad->btm_root;
        md.level = metad->btm_level;
        md.fastroot = metad->btm_fastroot;
        md.fastlevel = metad->btm_fastlevel;
        md.oldest_btpo_xact = oldestBtpoXact;
        md.last_cleanup_num_heap_tuples = numHeapTuples;
        md.allequalimage = metad->btm_allequalimage;

        XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

        PageSetLSN(metapg, recptr);
    }

    END_CRIT_SECTION();

    _bt_relbuf(rel, metabuf);
}

/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.  For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		Also, if the root page is split while we are "in flight" to it,
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
 *		insist on finding the true root.  We do, however, guarantee to
 *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer      metabuf;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version >= BTREE_MIN_VERSION);
        Assert(metad->btm_version <= BTREE_VERSION);
        Assert(!metad->btm_allequalimage ||
               metad->btm_version > BTREE_NOVAC_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted. If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = _bt_getmeta(rel, metabuf);

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        Page        metapg;

        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);

        /*
         * Race condition:	if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee no
             * deadlocks, we have to release the metadata page and start all
             * over again.  (Is that really true? But it's hardly worth trying
             * to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page.  Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;
        /* Get raw page pointer for metapage */
        metapg = BufferGetPage(metabuf);

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        /* upgrade metapage if needed */
        if (metad->btm_version < BTREE_NOVAC_VERSION)
            _bt_upgrademetapage(metapg);

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;
        metad->btm_oldest_btpo_xact = InvalidTransactionId;
        metad->btm_last_cleanup_num_heap_tuples = -1.0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            xl_btree_metadata md;

            XLogBeginInsert();
            XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
            XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

            Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
            md.version = metad->btm_version;
            md.root = rootblkno;
            md.level = 0;
            md.fastroot = rootblkno;
            md.fastlevel = 0;
            md.oldest_btpo_xact = InvalidTransactionId;
            md.last_cleanup_num_heap_tuples = -1.0;
            md.allequalimage = metad->btm_allequalimage;

            XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it without caching */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}
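
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a read-only caller such as _bt_search() (defined elsewhere in the tree)
 * handles the no-root case by checking for InvalidBuffer, and eventually
 * releases the read-locked root with _bt_relbuf():
 *
 *		Buffer	rootbuf = _bt_getroot(rel, BT_READ);
 *
 *		if (!BufferIsValid(rootbuf))
 *			... index is completely empty, nothing to descend into ...
 *		... descend from BufferGetPage(rootbuf), then _bt_relbuf(rel, ...) ...
 */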

/*
 *	_bt_gettrueroot() -- Get the true root page of the btree.
 *
 *		This is the same as the BT_READ case of _bt_getroot(), except
 *		we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure. (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

/*
 *	_bt_getrootheight() -- Get the height of the btree search tree.
 *
 *		We return the level (counting from zero) of the current fast root.
 *		This represents the number of tree levels we'd have to descend through
 *		to start any btree index search.
 *
 *		This is used by the planner for cost-estimation purposes.  Since it's
 *		only an estimate, slightly-stale data is fine, hence we don't worry
 *		about updating previously cached data.
 */
int
_bt_getrootheight(Relation rel)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}
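
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * planner code such as get_relation_info() (name assumed, lives in the
 * planner) simply treats the result as an estimate of descent cost:
 *
 *		info->tree_height = _bt_getrootheight(indexRelation);
 *
 * Slightly stale cached metapage data is acceptable for this purpose.
 */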

/*
 *	_bt_metaversion() -- Get version/status info from metapage.
 *
 *		Sets caller's *heapkeyspace and *allequalimage arguments using data
 *		from the B-Tree metapage (could be locally-cached version).  This
 *		information needs to be stashed in insertion scankey, so we provide a
 *		single function that fetches both at once.
 *
 *		This is used to determine the rules that must be used to descend a
 *		btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
 *		pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
 *		performance when inserting a new BTScanInsert-wise duplicate tuple
 *		among many leaf pages already full of such duplicates.
 *
 *		Also sets allequalimage field, which indicates whether or not it is
 *		safe to apply deduplication.  We rely on the assumption that
 *		btm_allequalimage will be zero'ed on heapkeyspace indexes that were
 *		pg_upgrade'd from Postgres 12.
 */
void
_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
         * be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
            *allequalimage = metad->btm_allequalimage;

            _bt_relbuf(rel, metabuf);
            return;
        }

        /*
         * Cache the metapage data for next time
         *
         * An on-the-fly version upgrade performed by _bt_upgrademetapage()
         * can change the nbtree version for an index without invalidating any
         * local cache.  This is okay because it can only happen when moving
         * from version 2 to version 3, both of which are !heapkeyspace
         * versions.
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
    *allequalimage = metad->btm_allequalimage;
}

/*
 *	_bt_checkpage() -- Verify that a freshly-read page looks sane.
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

/*
 * Log the reuse of a page from the FSM.
 */
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
{
    xl_btree_reuse_page xlrec_reuse;

    /*
     * Note that we don't register the buffer with the record, because this
     * operation doesn't modify the page. This record only exists to provide a
     * conflict point for Hot Standby.
     */

    /* XLOG stuff */
    xlrec_reuse.node = rel->rd_node;
    xlrec_reuse.block = blkno;
    xlrec_reuse.latestRemovedXid = latestRemovedXid;

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);

    XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
}

/*
 *	_bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *		blkno == P_NEW means to get an unallocated index page.  The page
 *		will be initialized before returning it.
 *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count has been incremented
 *		(ie, the buffer is "locked and pinned").  Also, we apply
 *		_bt_checkpage to sanity-check the page (except in P_NEW case).
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        LockBuffer(buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool        needLock;
        Page        page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free.  (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page.  If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock.  (The
         * own-caller scenario is actually not improbable. Consider an index
         * on a serial or timestamp column.  Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM. A
         * VACUUM running concurrently with the previous split could well have
         * placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (ConditionalLockBuffer(buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby, in case they have snapshots older
                     * than btpo.xact.  This can only apply if the page does
                     * have a valid btpo.xact value, ie not if it's new.  (We
                     * must check that because an all-zero page has no special
                     * space.)
                     */
                    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
                        !PageIsNew(page))
                    {
                        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                        _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
                    }

                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page.  We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        LockBuffer(buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}
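
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the usual calling pattern pairs _bt_getbuf() with _bt_relbuf(), e.g. to
 * briefly inspect a sibling page while holding the lock:
 *
 *		Buffer	buf = _bt_getbuf(rel, blkno, BT_READ);
 *		Page	page = BufferGetPage(buf);
 *		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 *
 *		... examine page / opaque while locked and pinned ...
 *		_bt_relbuf(rel, buf);
 */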

/*
 *	_bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer      buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    LockBuffer(buf, access);
    _bt_checkpage(rel, buf);
    return buf;
}

/*
 *	_bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    UnlockReleaseBuffer(buf);
}

/*
 *	_bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 *	_bt_page_recyclable() -- Is an existing page recyclable?
 *
 * This exists to make sure _bt_getbuf and btvacuumscan have the same
 * policy about whether a page is safe to re-use.  But note that _bt_getbuf
 * knows enough to distinguish the PageIsNew condition from the other one.
 * At some point it might be appropriate to redesign this to have a three-way
 * result value.
 */
bool
_bt_page_recyclable(Page page)
{
    BTPageOpaque opaque;

    /*
     * It's possible to find an all-zeroes page in an index --- for example, a
     * backend might successfully extend the relation one page and then crash
     * before it is able to make a WAL entry for adding the page. If we find a
     * zeroed page then reclaim it.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recycle if deleted and too old to have any processes
     * interested in the page.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_ISDELETED(opaque) &&
        TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
        return true;
    return false;
}

/*
 * Delete item(s) from a btree leaf page during VACUUM.
 *
 * This routine assumes that the caller has a super-exclusive write lock on
 * the buffer.  Also, the given deletable and updatable arrays *must* be
 * sorted in ascending order.
 *
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
 * in an existing posting list item are to be removed by VACUUM.  This works
 * by updating/overwriting an existing item with caller's new version of the
 * item (a version that lacks the TIDs that are to be deleted).
 *
 * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
 * generate their own latestRemovedXid by accessing the heap directly, whereas
 * VACUUMs rely on the initial heap scan taking care of it indirectly.  Also,
 * only VACUUM can perform granular deletes of individual TIDs in posting list
 * tuples.
 */
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    BTVacuumPosting *updatable, int nupdatable)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    Size        itemsz;
    char       *updatedbuf = NULL;
    Size        updatedbuflen = 0;
    OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0 || nupdatable > 0);

    for (int i = 0; i < nupdatable; i++)
    {
        /* Replace work area IndexTuple with updated version */
        _bt_update_posting(updatable[i]);

        /* Maintain array of updatable page offsets for WAL record */
        updatedoffsets[i] = updatable[i]->updatedoffset;
    }

    /* XLOG stuff -- allocate and fill buffer before critical section */
    if (nupdatable > 0 && RelationNeedsWAL(rel))
    {
        Size        offset = 0;

        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];

            itemsz = SizeOfBtreeUpdate +
                vacposting->ndeletedtids * sizeof(uint16);
            updatedbuflen += itemsz;
        }

        updatedbuf = palloc(updatedbuflen);
        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];
            xl_btree_update update;

            update.ndeletedtids = vacposting->ndeletedtids;
            memcpy(updatedbuf + offset, &update.ndeletedtids,
                   SizeOfBtreeUpdate);
            offset += SizeOfBtreeUpdate;

            itemsz = update.ndeletedtids * sizeof(uint16);
            memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
            offset += itemsz;
        }
    }

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Handle posting tuple updates.
     *
     * Deliberately do this before handling simple deletes.  If we did it the
     * other way around (i.e. WAL record order -- simple deletes before
     * updates) then we'd have to make compensating changes to the 'updatable'
     * array of offset numbers.
     *
     * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
     * happens to already be set.  Although we unset the BTP_HAS_GARBAGE page
     * level flag, unsetting individual LP_DEAD bits should still be avoided.
     */
    for (int i = 0; i < nupdatable; i++)
    {
        OffsetNumber updatedoffset = updatedoffsets[i];
        IndexTuple  itup;

        itup = updatable[i]->itup;
        itemsz = MAXALIGN(IndexTupleSize(itup));
        if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
                                     itemsz))
            elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    }

    /* Now handle simple deletes of entire tuples */
    if (ndeletable > 0)
        PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't targeted by VACUUM's heap scan), but it will be true often
     * enough.  VACUUM does not delete items purely because they have their
     * LP_DEAD bit set, since doing so would necessitate explicitly logging a
     * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
     *
     * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
     * limited, since we never falsely unset an LP_DEAD bit.  Workloads that
     * are particularly dependent on LP_DEAD bits being set quickly will
     * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
     * again anyway.  Furthermore, attempting a deduplication pass will remove
     * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
     * is set or not.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.ndeleted = ndeletable;
        xlrec_vacuum.nupdated = nupdatable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);

        if (ndeletable > 0)
            XLogRegisterBufData(0, (char *) deletable,
                                ndeletable * sizeof(OffsetNumber));

        if (nupdatable > 0)
        {
            XLogRegisterBufData(0, (char *) updatedoffsets,
                                nupdatable * sizeof(OffsetNumber));
            XLogRegisterBufData(0, updatedbuf, updatedbuflen);
        }

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* can't leak memory here */
    if (updatedbuf != NULL)
        pfree(updatedbuf);
    /* free tuples generated by calling _bt_update_posting() */
    for (int i = 0; i < nupdatable; i++)
        pfree(updatable[i]->itup);
}

/*
 * Delete item(s) from a btree leaf page during single-page cleanup.
 *
 * This routine assumes that the caller has pinned and write locked the
 * buffer.  Also, the given deletable array *must* be sorted in ascending
 * order.
 *
 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
 * the page, but it needs to generate its own latestRemovedXid by accessing
 * the heap.  This is used by the REDO routine to generate recovery conflicts.
 * Also, it doesn't handle posting list tuples unless the entire tuple can be
 * deleted as a whole (since there is only one LP_DEAD bit per line pointer).
 */
void
_bt_delitems_delete(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    Relation heapRel)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    TransactionId latestRemovedXid = InvalidTransactionId;

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0);

    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
        latestRemovedXid =
            _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
     * because this is not called by VACUUM.  Just clear the BTP_HAS_GARBAGE
     * page flag, since we deleted all items with their LP_DEAD bit set.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_delete xlrec_delete;

        xlrec_delete.latestRemovedXid = latestRemovedXid;
        xlrec_delete.ndeleted = ndeletable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);

        /*
         * The deletable array is not in the buffer, but pretend that it is.
         * When XLogInsert stores the whole buffer, the array need not be
         * stored too.
         */
        XLogRegisterBufData(0, (char *) deletable,
                            ndeletable * sizeof(OffsetNumber));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Get the latestRemovedXid from the table entries pointed to by the non-pivot
 * tuples being deleted.
 *
 * This is a specialized version of index_compute_xid_horizon_for_tuples().
 * It's needed because btree tuples don't always store table TID using the
 * standard index tuple header field.
 */
static TransactionId
_bt_xid_horizon(Relation rel, Relation heapRel, Page page,
                OffsetNumber *deletable, int ndeletable)
{
    TransactionId latestRemovedXid = InvalidTransactionId;
    int         spacenhtids;
    int         nhtids;
    ItemPointer htids;

    /* Array will grow iff there are posting list tuples to consider */
    spacenhtids = ndeletable;
    nhtids = 0;
    htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids);
    for (int i = 0; i < ndeletable; i++)
    {
        ItemId      itemid;
        IndexTuple  itup;

        itemid = PageGetItemId(page, deletable[i]);
        itup = (IndexTuple) PageGetItem(page, itemid);

        Assert(ItemIdIsDead(itemid));
        Assert(!BTreeTupleIsPivot(itup));

        if (!BTreeTupleIsPosting(itup))
        {
            if (nhtids + 1 > spacenhtids)
            {
                spacenhtids *= 2;
                htids = (ItemPointer)
                    repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
            }

            Assert(ItemPointerIsValid(&itup->t_tid));
            ItemPointerCopy(&itup->t_tid, &htids[nhtids]);
            nhtids++;
        }
        else
        {
            int         nposting = BTreeTupleGetNPosting(itup);

            if (nhtids + nposting > spacenhtids)
            {
                spacenhtids = Max(spacenhtids * 2, nhtids + nposting);
                htids = (ItemPointer)
                    repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
            }

            for (int j = 0; j < nposting; j++)
            {
                ItemPointer htid = BTreeTupleGetPostingN(itup, j);

                Assert(ItemPointerIsValid(htid));
                ItemPointerCopy(htid, &htids[nhtids]);
                nhtids++;
            }
        }
    }

    Assert(nhtids >= ndeletable);

    latestRemovedXid =
        table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids);

    pfree(htids);

    return latestRemovedXid;
}

/*
 * Returns true, if the given block has the half-dead flag set.
 */
static bool
_bt_is_page_halfdead(Relation rel, BlockNumber blk)
{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    bool        result;

    buf = _bt_getbuf(rel, blk, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    result = P_ISHALFDEAD(opaque);
    _bt_relbuf(rel, buf);

    return result;
}

/*
 * Subroutine to find the parent of the branch we're deleting.  This climbs
 * up the tree until it finds a page with more than one child, i.e. a page
 * that will not be totally emptied by the deletion.  The chain of pages below
 * it, with one downlink each, will form the branch that we need to delete.
 *
 * If we cannot remove the downlink from the parent, because it's the
 * rightmost entry, returns false.  On success, *topparent and *topoff are set
 * to the buffer holding the parent, and the offset of the downlink in it.
 * *topparent is write-locked, the caller is responsible for releasing it when
 * done.  *target is set to the topmost page in the branch to-be-deleted, i.e.
 * the page whose downlink *topparent / *topoff point to, and *rightsib to its
 * right sibling.
 *
 * "child" is the leaf page we wish to delete, and "stack" is a search stack
 * leading to it (it actually leads to the leftmost leaf page with a high key
 * matching that of the page to be deleted in !heapkeyspace indexes).  Note
 * that we will update the stack entry(s) to reflect current downlink
 * positions --- this is essentially the same as the corresponding step of
 * splitting, and is not expected to affect caller.  The caller should
 * initialize *target and *rightsib to the leaf page and its right sibling.
 *
 * Note: it's OK to release page locks on any internal pages between the leaf
 * and *topparent, because a safe deletion can't become unsafe due to
 * concurrent activity.  An internal page can only acquire an entry if the
 * child is split, but that cannot happen as long as we hold a lock on the
 * leaf.
 */
static bool
_bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
                       Buffer *topparent, OffsetNumber *topoff,
                       BlockNumber *target, BlockNumber *rightsib)
{
    BlockNumber parent;
    OffsetNumber poffset,
                maxoff;
    Buffer      pbuf;
    Page        page;
    BTPageOpaque opaque;
    BlockNumber leftsib;

    /*
     * Locate the downlink of "child" in the parent, updating the stack entry
     * if needed.  This is how !heapkeyspace indexes deal with having
     * non-unique high keys in leaf level pages.  Even heapkeyspace indexes
     * can have a stale stack due to insertions into the parent.
     */
    pbuf = _bt_getstackbuf(rel, stack, child);
    if (pbuf == InvalidBuffer)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
                                 RelationGetRelationName(rel), child)));
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child.
     */
    if (poffset >= maxoff)
    {
        /* It's rightmost child... */
        if (poffset == P_FIRSTDATAKEY(opaque))
        {
            /*
             * It's only child, so safe if parent would itself be removable.
             * We have to check the parent itself, and then recurse to test
             * the conditions at the parent's parent.
             */
            if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
                P_INCOMPLETE_SPLIT(opaque))
            {
                _bt_relbuf(rel, pbuf);
                return false;
            }

            *target = parent;
            *rightsib = opaque->btpo_next;
            leftsib = opaque->btpo_prev;

            _bt_relbuf(rel, pbuf);

            /*
             * Like in _bt_pagedel, check that the left sibling is not marked
             * with INCOMPLETE_SPLIT flag.  That would mean that there is no
             * downlink to the page to be deleted, and the page deletion
             * algorithm isn't prepared to handle that.
             */
            if (leftsib != P_NONE)
            {
                Buffer      lbuf;
                Page        lpage;
                BTPageOpaque lopaque;

                lbuf = _bt_getbuf(rel, leftsib, BT_READ);
                lpage = BufferGetPage(lbuf);
                lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

                /*
                 * If the left sibling was concurrently split, so that its
                 * next-pointer doesn't point to the current page anymore, the
                 * split that created the current page must be completed. (We
                 * don't allow splitting an incompletely split page again
                 * until the previous split has been completed)
                 */
                if (lopaque->btpo_next == parent &&
                    P_INCOMPLETE_SPLIT(lopaque))
                {
                    _bt_relbuf(rel, lbuf);
                    return false;
                }
                _bt_relbuf(rel, lbuf);
            }

            return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
                                          topparent, topoff, target, rightsib);
        }
        else
        {
            /* Unsafe to delete */
            _bt_relbuf(rel, pbuf);
            return false;
        }
    }
    else
    {
        /* Not rightmost child, so safe to delete */
        *topparent = pbuf;
        *topoff = poffset;
        return true;
    }
}

/*
 * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and locked (either read or write
 * lock is OK).  This lock and pin will be dropped before exiting.
 *
 * Returns the number of pages successfully deleted (zero if page cannot
 * be deleted now; could be more than one if parent or sibling pages were
 * deleted too).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf)
{
    int         ndeleted = 0;
    BlockNumber rightsib;
    bool        rightsib_empty;
    Page        page;
    BTPageOpaque opaque;

    /*
     * "stack" is a search stack leading (approximately) to the target page.
     * It is initially NULL, but when iterating, we keep it to avoid
     * duplicated search effort.
     *
     * Also, when "stack" is not NULL, we have already checked that the
     * current page is not the right half of an incomplete split, i.e. the
     * left sibling does not have its INCOMPLETE_SPLIT flag set.
     */
    BTStack     stack = NULL;

    for (;;)
    {
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        /*
         * Internal pages are never deleted directly, only as part of deleting
         * the whole branch all the way down to leaf level.
         */
        if (!P_ISLEAF(opaque))
        {
            /*
             * Pre-9.4 page deletion only marked internal pages as half-dead,
             * but now we only use that flag on leaf pages. The old algorithm
             * was never supposed to leave half-dead pages in the tree, it was
             * just a transient state, but it was nevertheless possible in
             * error scenarios. We don't know how to deal with them here. They
             * are harmless as far as searches are considered, but inserts
             * into the deleted keyspace could add out-of-order downlinks in
             * the upper levels. Log a notice, hopefully the admin will notice
             * and reindex.
             */
            if (P_ISHALFDEAD(opaque))
                ereport(LOG,
                        (errcode(ERRCODE_INDEX_CORRUPTED),
                         errmsg("index \"%s\" contains a half-dead internal page",
                                RelationGetRelationName(rel)),
                         errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
            _bt_relbuf(rel, buf);
            return ndeleted;
        }

        /*
         * We can never delete rightmost pages nor root pages.  While at it,
         * check that page is not already deleted and is empty.
         *
         * To keep the algorithm simple, we also never delete an incompletely
         * split page (they should be rare enough that this doesn't make any
         * meaningful difference to disk usage):
         *
         * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
         * left half of an incomplete split, but ensuring that it's not the
         * right half is more complicated.  For that, we have to check that
         * the left sibling doesn't have its INCOMPLETE_SPLIT flag set.  On
         * the first iteration, we temporarily release the lock on the current
         * page, and check the left sibling and also construct a search stack
         * to.  On subsequent iterations, we know we stepped right from a page
         * that passed these tests, so it's OK.
         */
        if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
            P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
            P_INCOMPLETE_SPLIT(opaque))
        {
            /* Should never fail to delete a half-dead page */
            Assert(!P_ISHALFDEAD(opaque));

            _bt_relbuf(rel, buf);
            return ndeleted;
        }

        /*
         * First, remove downlink pointing to the page (or a parent of the
         * page, if we are going to delete a taller branch), and mark the page
         * as half-dead.
         */
        if (!P_ISHALFDEAD(opaque))
        {
            /*
             * We need an approximate pointer to the page's parent page.  We
             * use a variant of the standard search mechanism to search for
             * the page's high key; this will give us a link to either the
             * current parent or someplace to its left (if there are multiple
             * equal high keys, which is possible with !heapkeyspace indexes).
             *
             * Also check if this is the right-half of an incomplete split
             * (see comment above).
             */
            if (!stack)
            {
                BTScanInsert itup_key;
                ItemId      itemid;
                IndexTuple  targetkey;
                Buffer      lbuf;
                BlockNumber leftsib;

                itemid = PageGetItemId(page, P_HIKEY);
                targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

                leftsib = opaque->btpo_prev;

                /*
                 * To avoid deadlocks, we'd better drop the leaf page lock
                 * before going further.
                 */
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);

                /*
                 * Fetch the left sibling, to check that it's not marked with
                 * INCOMPLETE_SPLIT flag.  That would mean that the page
                 * to-be-deleted doesn't have a downlink, and the page
                 * deletion algorithm isn't prepared to handle that.
                 */
                if (leftsib != P_NONE)
                {
                    BTPageOpaque lopaque;
                    Page        lpage;

                    lbuf = _bt_getbuf(rel, leftsib, BT_READ);
                    lpage = BufferGetPage(lbuf);
                    lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

                    /*
                     * If the left sibling is split again by another backend,
                     * after we released the lock, we know that the first
                     * split must have finished, because we don't allow an
                     * incompletely-split page to be split again.  So we don't
                     * need to walk right here.
                     */
                    if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
                        P_INCOMPLETE_SPLIT(lopaque))
                    {
                        ReleaseBuffer(buf);
                        _bt_relbuf(rel, lbuf);
                        return ndeleted;
                    }
                    _bt_relbuf(rel, lbuf);
                }

                /* we need an insertion scan key for the search, so build one */
                itup_key = _bt_mkscankey(rel, targetkey);
                /* find the leftmost leaf page with matching pivot/high key */
                itup_key->pivotsearch = true;
                stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
                /* don't need a lock or second pin on the page */
                _bt_relbuf(rel, lbuf);

                /*
                 * Re-lock the leaf page, and start over, to re-check that the
                 * page can still be deleted.
                 */
                LockBuffer(buf, BT_WRITE);
                continue;
            }

            if (!_bt_mark_page_halfdead(rel, buf, stack))
            {
                _bt_relbuf(rel, buf);
                return ndeleted;
            }
        }

        /*
         * Then unlink it from its siblings.  Each call to
         * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
         * making it shallower.  Iterate until the leaf page is gone.
         */
        rightsib_empty = false;
        while (P_ISHALFDEAD(opaque))
        {
            /* will check for interrupts, once lock is released */
            if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
            {
                /* _bt_unlink_halfdead_page already released buffer */
                return ndeleted;
            }
            ndeleted++;
        }

        rightsib = opaque->btpo_next;

        _bt_relbuf(rel, buf);

        /*
         * Check here, as calling loops will have locks held, preventing
         * interrupts from being processed.
         */
        CHECK_FOR_INTERRUPTS();

        /*
         * The page has now been deleted. If its right sibling is completely
         * empty, it's possible that the reason we haven't deleted it earlier
         * is that it was the rightmost child of the parent. Now that we
         * removed the downlink for this page, the right sibling might now be
         * the only child of the parent, and could be removed. It would be
         * picked up by the next vacuum anyway, but might as well try to
         * remove it now, so loop back to process the right sibling.
         *
         * Note: This relies on the assumption that _bt_getstackbuf() will be
         * able to reuse our original descent stack with a different child
         * block (provided that the child block is to the right of the
         * original leaf page reached by _bt_search()).  It will even update
         * the descent stack each time we loop around, avoiding repeated work.
         */
        if (!rightsib_empty)
            break;

        buf = _bt_getbuf(rel, rightsib, BT_WRITE);
    }

    return ndeleted;
}
1715 * First stage of page deletion. Remove the downlink to the top of the
1716 * branch being deleted, and mark the leaf page as half-dead.
1719 _bt_mark_page_halfdead(Relation rel
, Buffer leafbuf
, BTStack stack
)
1721 BlockNumber leafblkno
;
1722 BlockNumber leafrightsib
;
1724 BlockNumber rightsib
;
1727 BTPageOpaque opaque
;
1729 OffsetNumber topoff
;
1730 OffsetNumber nextoffset
;
1732 IndexTupleData trunctuple
;
1734 page
= BufferGetPage(leafbuf
);
1735 opaque
= (BTPageOpaque
) PageGetSpecialPointer(page
);
1737 Assert(!P_RIGHTMOST(opaque
) && !P_ISROOT(opaque
) && !P_ISDELETED(opaque
) &&
1738 !P_ISHALFDEAD(opaque
) && P_ISLEAF(opaque
) &&
1739 P_FIRSTDATAKEY(opaque
) > PageGetMaxOffsetNumber(page
));
1742 * Save info about the leaf page.
1744 leafblkno
= BufferGetBlockNumber(leafbuf
);
1745 leafrightsib
= opaque
->btpo_next
;
1748 * Before attempting to lock the parent page, check that the right sibling
1749 * is not in half-dead state. A half-dead right sibling would have no
1750 * downlink in the parent, which would be highly confusing later when we
1751 * delete the downlink that follows the current page's downlink. (I
1752 * believe the deletion would work correctly, but it would fail the
1753 * cross-check we make that the following downlink points to the right
1754 * sibling of the delete page.)
1756 if (_bt_is_page_halfdead(rel
, leafrightsib
))
1758 elog(DEBUG1
, "could not delete page %u because its right sibling %u is half-dead",
1759 leafblkno
, leafrightsib
);
1764 * We cannot delete a page that is the rightmost child of its immediate
1765 * parent, unless it is the only child --- in which case the parent has to
1766 * be deleted too, and the same condition applies recursively to it. We
1767 * have to check this condition all the way up before trying to delete,
1768 * and lock the final parent of the to-be-deleted subtree.
1770 * However, we won't need to repeat the above _bt_is_page_halfdead() check
1771 * for parent/ancestor pages because of the rightmost restriction. The
1772 * leaf check will apply to a right "cousin" leaf page rather than a
1773 * simple right sibling leaf page in cases where we actually go on to
1774 * perform internal page deletion. The right cousin leaf page is
1775 * representative of the left edge of the subtree to the right of the
1776 * to-be-deleted subtree as a whole. (Besides, internal pages are never
1777 * marked half-dead, so it isn't even possible to directly assess if an
1778 * internal page is part of some other to-be-deleted subtree.)
1780 rightsib
= leafrightsib
;
1782 if (!_bt_lock_branch_parent(rel
, leafblkno
, stack
,
1783 &topparent
, &topoff
, &target
, &rightsib
))
1787 * Check that the parent-page index items we're about to delete/overwrite
1788 * contain what we expect. This can fail if the index has become corrupt
1789 * for some reason. We want to throw any error before entering the
1790 * critical section --- otherwise it'd be a PANIC.
1792 * The test on the target item is just an Assert because
1793 * _bt_lock_branch_parent should have guaranteed it has the expected
1794 * contents. The test on the next-child downlink is known to sometimes
1795 * fail in the field, though.
1797 page
= BufferGetPage(topparent
);
1798 opaque
= (BTPageOpaque
) PageGetSpecialPointer(page
);
1800 #ifdef USE_ASSERT_CHECKING
1801 itemid
= PageGetItemId(page
, topoff
);
1802 itup
= (IndexTuple
) PageGetItem(page
, itemid
);
1803 Assert(BTreeTupleGetDownLink(itup
) == target
);
1806 nextoffset
= OffsetNumberNext(topoff
);
1807 itemid
= PageGetItemId(page
, nextoffset
);
1808 itup
= (IndexTuple
) PageGetItem(page
, itemid
);
1809 if (BTreeTupleGetDownLink(itup
) != rightsib
)
1811 (errcode(ERRCODE_INDEX_CORRUPTED
),
1812 errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
1813 rightsib
, target
, BTreeTupleGetDownLink(itup
),
1814 BufferGetBlockNumber(topparent
), RelationGetRelationName(rel
))));
1817 * Any insert which would have gone on the leaf block will now go to its
1820 PredicateLockPageCombine(rel
, leafblkno
, leafrightsib
);
1822 /* No ereport(ERROR) until changes are logged */
1823 START_CRIT_SECTION();
1826 * Update parent. The normal case is a tad tricky because we want to
1827 * delete the target's downlink and the *following* key. Easiest way is
1828 * to copy the right sibling's downlink over the target downlink, and then
1829 * delete the following item.
1831 page
= BufferGetPage(topparent
);
1832 opaque
= (BTPageOpaque
) PageGetSpecialPointer(page
);
1834 itemid
= PageGetItemId(page
, topoff
);
1835 itup
= (IndexTuple
) PageGetItem(page
, itemid
);
1836 BTreeTupleSetDownLink(itup
, rightsib
);
1838 nextoffset
= OffsetNumberNext(topoff
);
1839 PageIndexTupleDelete(page
, nextoffset
);
	/*
	 * Mark the leaf page as half-dead, and stamp it with a pointer to the
	 * highest internal page in the branch we're deleting.  We use the tid of
	 * the high key to store it.
	 */
	page = BufferGetPage(leafbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags |= BTP_HALF_DEAD;

	Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
	trunctuple.t_info = sizeof(IndexTupleData);
	if (target != leafblkno)
		BTreeTupleSetTopParent(&trunctuple, target);
	else
		BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);

	if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
								 IndexTupleSize(&trunctuple)))
		elog(ERROR, "could not overwrite high key in half-dead page");
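	/*
	 * The leaf's high key is now a minimal "dummy" tuple: a bare header with
	 * no key data, whose t_tid (set by BTreeTupleSetTopParent above) records
	 * the topmost internal page of the to-be-deleted branch, or
	 * InvalidBlockNumber when the branch consists of the leaf alone.
	 */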
	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(topparent);
	MarkBufferDirty(leafbuf);
	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_mark_page_halfdead xlrec;
		XLogRecPtr	recptr;

		xlrec.poffset = topoff;
		xlrec.leafblk = leafblkno;
		if (target != leafblkno)
			xlrec.topparent = target;
		else
			xlrec.topparent = InvalidBlockNumber;

		XLogBeginInsert();
		XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
		XLogRegisterBuffer(1, topparent, REGBUF_STANDARD);
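		/*
		 * Registering leafbuf with REGBUF_WILL_INIT means replay rebuilds the
		 * half-dead leaf page from the record itself rather than restoring a
		 * full-page image; topparent is registered as a standard-layout page,
		 * so any full-page image taken of it can omit the free-space hole.
		 */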
		page = BufferGetPage(leafbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		xlrec.leftblk = opaque->btpo_prev;
		xlrec.rightblk = opaque->btpo_next;

		XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);

		page = BufferGetPage(topparent);
		PageSetLSN(page, recptr);
		page = BufferGetPage(leafbuf);
		PageSetLSN(page, recptr);
	}
	END_CRIT_SECTION();

	_bt_relbuf(rel, topparent);

	return true;
}
/*
 * Unlink a page in a branch of half-dead pages from its siblings.
 *
 * If the leaf page still has a downlink pointing to it, unlinks the highest
 * parent in the to-be-deleted branch instead of the leaf page.  To get rid
 * of the whole branch, including the leaf page itself, iterate until the
 * leaf page is deleted.
 *
 * Returns 'false' if the page could not be unlinked (shouldn't happen).
 * If the (current) right sibling of the page is empty, *rightsib_empty is
 * set to true.
 *
 * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
 * On success exit, we'll be holding pin and write lock.  On failure exit,
 * we'll release both pin and lock before returning (we define it that way
 * to avoid having to reacquire a lock we already released).
 */
static bool
_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
{
	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
	BlockNumber leafleftsib;
	BlockNumber leafrightsib;
	BlockNumber target;
	BlockNumber leftsib;
	BlockNumber rightsib;
	Buffer		lbuf = InvalidBuffer;
	Buffer		buf;
	Buffer		rbuf;
	Buffer		metabuf = InvalidBuffer;
	Page		metapg = NULL;
	BTMetaPageData *metad = NULL;
	ItemId		itemid;
	Page		page;
	BTPageOpaque opaque;
	bool		rightsib_is_rightmost;
	int			targetlevel;
	IndexTuple	leafhikey;
	BlockNumber nextchild;
	page = BufferGetPage(leafbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));

	/*
	 * Remember some information about the leaf page.
	 */
	itemid = PageGetItemId(page, P_HIKEY);
	leafhikey = (IndexTuple) PageGetItem(page, itemid);
	leafleftsib = opaque->btpo_prev;
	leafrightsib = opaque->btpo_next;

	LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
	/*
	 * Check here, as calling loops will have locks held, preventing
	 * interrupts from being processed.
	 */
	CHECK_FOR_INTERRUPTS();
	/*
	 * If the leaf page still has a parent pointing to it (or a chain of
	 * parents), we don't unlink the leaf page yet, but the topmost remaining
	 * parent in the branch.  Set 'target' and 'buf' to reference the page
	 * actually being unlinked.
	 */
	target = BTreeTupleGetTopParent(leafhikey);

	if (target != InvalidBlockNumber)
	{
		Assert(target != leafblkno);

		/* fetch the block number of the topmost parent's left sibling */
		buf = _bt_getbuf(rel, target, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		leftsib = opaque->btpo_prev;
		targetlevel = opaque->btpo.level;

		/*
		 * To avoid deadlocks, we'd better drop the target page lock before
		 * going further.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	else
	{
		target = leafblkno;

		buf = leafbuf;
		leftsib = leafleftsib;
		targetlevel = 0;
	}
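	/*
	 * At this point 'target' and 'buf' refer to the page actually being
	 * unlinked, and 'leftsib' to its left sibling as of the read above; the
	 * left link is re-verified once write locks are taken below.
	 */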
	/*
	 * We have to lock the pages we need to modify in the standard order:
	 * moving right, then up.  Else we will deadlock against other writers.
	 *
	 * So, first lock the leaf page, if it's not the target.  Then find and
	 * write-lock the current left sibling of the target page.  The sibling
	 * that was current a moment ago could have split, so we may have to move
	 * right.  This search could fail if either the sibling or the target page
	 * was deleted by someone else meanwhile; if so, give up.  (Right now,
	 * that should never happen, since page deletion is only done in VACUUM
	 * and there shouldn't be multiple VACUUMs concurrently on the same
	 * table.)
	 */
	if (target != leafblkno)
		LockBuffer(leafbuf, BT_WRITE);
	if (leftsib != P_NONE)
	{
		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		while (P_ISDELETED(opaque) || opaque->btpo_next != target)
		{
			/* step right one page */
			leftsib = opaque->btpo_next;
			_bt_relbuf(rel, lbuf);

			/*
			 * It'd be good to check for interrupts here, but it's not easy to
			 * do so because a lock is always held.  This block isn't
			 * frequently reached, so hopefully the consequences of not
			 * checking interrupts aren't too bad.
			 */

			if (leftsib == P_NONE)
			{
				elog(LOG, "no left sibling (concurrent deletion?) of block %u in \"%s\"",
					 target,
					 RelationGetRelationName(rel));
				if (target != leafblkno)
				{
					/* we have only a pin on target, but pin+lock on leafbuf */
					ReleaseBuffer(buf);
					_bt_relbuf(rel, leafbuf);
				}
				else
				{
					/* we have only a pin on leafbuf */
					ReleaseBuffer(leafbuf);
				}
				return false;
			}
			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
			page = BufferGetPage(lbuf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}
	}
	else
		lbuf = InvalidBuffer;
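	/*
	 * lbuf now holds a write lock on the target's current left sibling, or is
	 * InvalidBuffer if the target is leftmost on its level.
	 */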
	/*
	 * Next write-lock the target page itself.  It should be okay to take just
	 * a write lock not a superexclusive lock, since no scans would stop on an
	 * empty page.
	 */
	LockBuffer(buf, BT_WRITE);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	/*
	 * Check page is still empty etc, else abandon deletion.  This is just for
	 * paranoia's sake; a half-dead page cannot resurrect because there can be
	 * only one vacuum process running at a time.
	 */
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
		elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
			 target, RelationGetRelationName(rel));

	if (opaque->btpo_prev != leftsib)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("left link changed unexpectedly in block %u of index \"%s\"",
								 target, RelationGetRelationName(rel))));

	if (target == leafblkno)
	{
		if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
			!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
				 target, RelationGetRelationName(rel));
		nextchild = InvalidBlockNumber;
	}
	else
	{
		if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
			P_ISLEAF(opaque))
			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
				 target, RelationGetRelationName(rel));

		/* remember the next non-leaf child down in the branch. */
		itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
		nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
		if (nextchild == leafblkno)
			nextchild = InvalidBlockNumber;
	}
	/*
	 * And next write-lock the (current) right sibling.
	 */
	rightsib = opaque->btpo_next;
	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (opaque->btpo_prev != target)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("right sibling's left-link doesn't match: "
								 "block %u links to %u instead of expected %u in index \"%s\"",
								 rightsib, opaque->btpo_prev, target,
								 RelationGetRelationName(rel))));
	rightsib_is_rightmost = P_RIGHTMOST(opaque);
	*rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
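	/*
	 * Recording *rightsib_empty here lets the caller decide, after this page
	 * has been unlinked, whether the right sibling is worth visiting as the
	 * next deletion candidate.
	 */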
	/*
	 * If we are deleting the next-to-last page on the target's level, then
	 * the rightsib is a candidate to become the new fast root.  (In theory, it
	 * might be possible to push the fast root even further down, but the odds
	 * of doing so are slim, and the locking considerations daunting.)
	 *
	 * We can safely acquire a lock on the metapage here --- see comments for
	 * _bt_newroot().
	 */
	if (leftsib == P_NONE && rightsib_is_rightmost)
	{
		page = BufferGetPage(rbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_RIGHTMOST(opaque))
		{
			/* rightsib will be the only one left on the level */
			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			/*
			 * The expected case here is btm_fastlevel == targetlevel+1; if
			 * the fastlevel is <= targetlevel, something is wrong, and we
			 * choose to overwrite it to fix it.
			 */
			if (metad->btm_fastlevel > targetlevel + 1)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}
	}
	/*
	 * Here we begin doing the deletion.
	 */

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();
	/*
	 * Update siblings' side-links.  Note the target page's side-links will
	 * continue to point to the siblings.  Asserts here are just rechecking
	 * things we already verified above.
	 */
	if (BufferIsValid(lbuf))
	{
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo_next == target);
		opaque->btpo_next = rightsib;
	}
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->btpo_prev == target);
	opaque->btpo_prev = leftsib;
	/*
	 * If we deleted a parent of the targeted leaf page, instead of the leaf
	 * itself, update the leaf to point to the next remaining child in the
	 * branch.
	 */
	if (target != leafblkno)
		BTreeTupleSetTopParent(leafhikey, nextchild);
	/*
	 * Mark the page itself deleted.  It can be recycled when all current
	 * transactions are gone.  Storing GetTopTransactionId() would work, but
	 * we're in VACUUM and would not otherwise have an XID.  Having already
	 * updated links to the target, ReadNewTransactionId() suffices as an
	 * upper bound.  Any scan having retained a now-stale link is advertising
	 * in its PGXACT an xmin less than or equal to the value we read here.  It
	 * will continue to do so, holding back RecentGlobalXmin, for the duration
	 * of that scan.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags &= ~BTP_HALF_DEAD;
	opaque->btpo_flags |= BTP_DELETED;
	opaque->btpo.xact = ReadNewTransactionId();
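	/*
	 * The XID stored in btpo.xact is what later recycling checks compare
	 * against RecentGlobalXmin: only once every snapshot that could hold a
	 * stale link to this page has gone away can the page be returned to the
	 * free space map.
	 */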
	/* And update the metapage, if needed */
	if (BufferIsValid(metabuf))
	{
		/* upgrade metapage if needed */
		if (metad->btm_version < BTREE_NOVAC_VERSION)
			_bt_upgrademetapage(metapg);
		metad->btm_fastroot = rightsib;
		metad->btm_fastlevel = targetlevel;
		MarkBufferDirty(metabuf);
	}
	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(rbuf);
	MarkBufferDirty(buf);
	if (BufferIsValid(lbuf))
		MarkBufferDirty(lbuf);
	if (target != leafblkno)
		MarkBufferDirty(leafbuf);
	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_unlink_page xlrec;
		xl_btree_metadata xlmeta;
		uint8		xlinfo;
		XLogRecPtr	recptr;

		XLogBeginInsert();

		XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
		if (BufferIsValid(lbuf))
			XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
		XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
		if (target != leafblkno)
			XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
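		/*
		 * Block references 0-3 are the page being unlinked, its left and
		 * right siblings, and (when a parent page is the target) the
		 * half-dead leaf.  The WILL_INIT pages are reconstructed from the
		 * record during replay instead of being restored from full-page
		 * images.
		 */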
		/* information on the unlinked block */
		xlrec.leftsib = leftsib;
		xlrec.rightsib = rightsib;
		xlrec.btpo_xact = opaque->btpo.xact;

		/* information needed to recreate the leaf block (if not the target) */
		xlrec.leafleftsib = leafleftsib;
		xlrec.leafrightsib = leafrightsib;
		xlrec.topparent = nextchild;

		XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
		if (BufferIsValid(metabuf))
		{
			XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

			Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
			xlmeta.version = metad->btm_version;
			xlmeta.root = metad->btm_root;
			xlmeta.level = metad->btm_level;
			xlmeta.fastroot = metad->btm_fastroot;
			xlmeta.fastlevel = metad->btm_fastlevel;
			xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
			xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
			xlmeta.allequalimage = metad->btm_allequalimage;

			XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
			xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
		}
		else
			xlinfo = XLOG_BTREE_UNLINK_PAGE;
		recptr = XLogInsert(RM_BTREE_ID, xlinfo);

		if (BufferIsValid(metabuf))
		{
			PageSetLSN(metapg, recptr);
		}
		page = BufferGetPage(rbuf);
		PageSetLSN(page, recptr);
		page = BufferGetPage(buf);
		PageSetLSN(page, recptr);
		if (BufferIsValid(lbuf))
		{
			page = BufferGetPage(lbuf);
			PageSetLSN(page, recptr);
		}
		if (target != leafblkno)
		{
			page = BufferGetPage(leafbuf);
			PageSetLSN(page, recptr);
		}
	}
	END_CRIT_SECTION();

	/* release metapage */
	if (BufferIsValid(metabuf))
		_bt_relbuf(rel, metabuf);

	/* release siblings */
	if (BufferIsValid(lbuf))
		_bt_relbuf(rel, lbuf);
	_bt_relbuf(rel, rbuf);

	/*
	 * Release the target, if it was not the leaf block.  The leaf is always
	 * kept locked.
	 */
	if (target != leafblkno)
		_bt_relbuf(rel, buf);

	return true;
}