src/backend/access/nbtree/nbtpage.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * nbtpage.c
   4  *        BTree-specific page management code for the Postgres btree access
   5  *        method.
   6  *
   7  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  *
  11  * IDENTIFICATION
  12  *        src/backend/access/nbtree/nbtpage.c
  13  *
  14  *      NOTES
  15  *         Postgres btree pages look like ordinary relation pages.  The opaque
  16  *         data at high addresses includes pointers to left and right siblings
  17  *         and flag data describing page state.  The first page in a btree, page
  18  *         zero, is special -- it stores meta-information describing the tree.
  19  *         Pages one and higher store the actual tree data.
  20  *
  21  *-------------------------------------------------------------------------
  22  */
  23 #include "postgres.h"
  24
  25 #include "access/nbtree.h"
  26 #include "access/nbtxlog.h"
  27 #include "access/tableam.h"
  28 #include "access/transam.h"
  29 #include "access/xlog.h"
  30 #include "access/xloginsert.h"
  31 #include "miscadmin.h"
  32 #include "storage/indexfsm.h"
  33 #include "storage/lmgr.h"
  34 #include "storage/predicate.h"
  35 #include "utils/memdebug.h"
  36 #include "utils/snapmgr.h"
  37
  38 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
  39 static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
  40                                                            FullTransactionId safexid);
  41 static void _bt_delitems_delete(Relation rel, Buffer buf,
  42                                                                 TransactionId latestRemovedXid,
  43                                                                 OffsetNumber *deletable, int ndeletable,
  44                                                                 BTVacuumPosting *updatable, int nupdatable);
  45 static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
  46                                                                  OffsetNumber *updatedoffsets,
  47                                                                  Size *updatedbuflen, bool needswal);
  48 static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
  49                                                                    BTStack stack);
  50 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
  51                                                                          BlockNumber scanblkno,
  52                                                                          bool *rightsib_empty,
  53                                                                          BTVacState *vstate);
  54 static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
  55                                                                         BTStack stack,
  56                                                                         Buffer *subtreeparent,
  57                                                                         OffsetNumber *poffset,
  58                                                                         BlockNumber *topparent,
  59                                                                         BlockNumber *topparentrightsib);
  60
  61 /*
  62  *      _bt_initmetapage() -- Fill a page buffer with a correct metapage image
  63  */
  64 void
  65 _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
  66                                  bool allequalimage)
  67 {
  68         BTMetaPageData *metad;
  69         BTPageOpaque metaopaque;
  70
  71         _bt_pageinit(page, BLCKSZ);
  72
  73         metad = BTPageGetMeta(page);
  74         metad->btm_magic = BTREE_MAGIC;
  75         metad->btm_version = BTREE_VERSION;
  76         metad->btm_root = rootbknum;
  77         metad->btm_level = level;
  78         metad->btm_fastroot = rootbknum;
  79         metad->btm_fastlevel = level;
  80         metad->btm_last_cleanup_num_delpages = 0;
  81         metad->btm_last_cleanup_num_heap_tuples = -1.0;
  82         metad->btm_allequalimage = allequalimage;
  83
  84         metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
  85         metaopaque->btpo_flags = BTP_META;
  86
  87         /*
  88          * Set pd_lower just past the end of the metadata.  This is essential,
  89          * because without doing so, metadata will be lost if xlog.c compresses
  90          * the page.
  91          */
  92         ((PageHeader) page)->pd_lower =
  93                 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
  94 }
  95
  96 /*
  97  *      _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
  98  *              3, the last version that can be updated without broadly affecting
  99  *              on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
 100  *
 101  *              This routine does purely in-memory image upgrade.  Caller is
 102  *              responsible for locking, WAL-logging etc.
 103  */
 104 void
 105 _bt_upgrademetapage(Page page)
 106 {
 107         BTMetaPageData *metad;
 108         BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
 109
 110         metad = BTPageGetMeta(page);
 111         metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
 112
 113         /* It must be really a meta page of upgradable version */
 114         Assert(metaopaque->btpo_flags & BTP_META);
 115         Assert(metad->btm_version < BTREE_NOVAC_VERSION);
 116         Assert(metad->btm_version >= BTREE_MIN_VERSION);
 117
 118         /* Set version number and fill extra fields added into version 3 */
 119         metad->btm_version = BTREE_NOVAC_VERSION;
 120         metad->btm_last_cleanup_num_delpages = 0;
 121         metad->btm_last_cleanup_num_heap_tuples = -1.0;
 122         /* Only a REINDEX can set this field */
 123         Assert(!metad->btm_allequalimage);
 124         metad->btm_allequalimage = false;
 125
 126         /* Adjust pd_lower (see _bt_initmetapage() for details) */
 127         ((PageHeader) page)->pd_lower =
 128                 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
 129 }
 130
 131 /*
 132  * Get metadata from share-locked buffer containing metapage, while performing
 133  * standard sanity checks.
 134  *
 135  * Callers that cache data returned here in local cache should note that an
 136  * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
 137  * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
 138  */
 139 static BTMetaPageData *
 140 _bt_getmeta(Relation rel, Buffer metabuf)
 141 {
 142         Page            metapg;
 143         BTPageOpaque metaopaque;
 144         BTMetaPageData *metad;
 145
 146         metapg = BufferGetPage(metabuf);
 147         metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
 148         metad = BTPageGetMeta(metapg);
 149
 150         /* sanity-check the metapage */
 151         if (!P_ISMETA(metaopaque) ||
 152                 metad->btm_magic != BTREE_MAGIC)
 153                 ereport(ERROR,
 154                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 155                                  errmsg("index \"%s\" is not a btree",
 156                                                 RelationGetRelationName(rel))));
 157
 158         if (metad->btm_version < BTREE_MIN_VERSION ||
 159                 metad->btm_version > BTREE_VERSION)
 160                 ereport(ERROR,
 161                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 162                                  errmsg("version mismatch in index \"%s\": file version %d, "
 163                                                 "current version %d, minimal supported version %d",
 164                                                 RelationGetRelationName(rel),
 165                                                 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
 166
 167         return metad;
 168 }
 169
 170 /*
 171  *      _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup().
 172  *
 173  *              This routine is called at the end of each VACUUM's btvacuumcleanup()
 174  *              call.  Its purpose is to maintain the metapage fields that are used by
 175  *              _bt_vacuum_needs_cleanup() to decide whether or not a btvacuumscan()
 176  *              call should go ahead for an entire VACUUM operation.
 177  *
 178  *              See btvacuumcleanup() and _bt_vacuum_needs_cleanup() for details of
 179  *              the two fields that we maintain here.
 180  *
 181  *              The information that we maintain for btvacuumcleanup() describes the
 182  *              state of the index (as well as the table it indexes) just _after_ the
 183  *              ongoing VACUUM operation.  The next _bt_vacuum_needs_cleanup() call
 184  *              will consider the information we saved for it during the next VACUUM
 185  *              operation (assuming that there will be no btbulkdelete() call during
 186  *              the next VACUUM operation -- if there is then the question of skipping
 187  *              btvacuumscan() doesn't even arise).
 188  */
 189 void
 190 _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages,
 191                                          float8 num_heap_tuples)
 192 {
 193         Buffer          metabuf;
 194         Page            metapg;
 195         BTMetaPageData *metad;
 196         bool            rewrite = false;
 197         XLogRecPtr      recptr;
 198
 199         /*
 200          * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
 201          * field started out as a TransactionId field called btm_oldest_btpo_xact.
 202          * Both "versions" are just uint32 fields.  It was convenient to repurpose
 203          * the field when we began to use 64-bit XIDs in deleted pages.
 204          *
 205          * It's possible that a pg_upgrade'd database will contain an XID value in
 206          * what is now recognized as the metapage's btm_last_cleanup_num_delpages
 207          * field.  _bt_vacuum_needs_cleanup() may even believe that this value
 208          * indicates that there are lots of pages that it needs to recycle, when
 209          * in reality there are only one or two.  The worst that can happen is
 210          * that there will be a call to btvacuumscan a little earlier, which will
 211          * set btm_last_cleanup_num_delpages to a sane value when we're called.
 212          */
 213         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 214         metapg = BufferGetPage(metabuf);
 215         metad = BTPageGetMeta(metapg);
 216
 217         /* Always dynamically upgrade index/metapage when BTREE_MIN_VERSION */
 218         if (metad->btm_version < BTREE_NOVAC_VERSION)
 219                 rewrite = true;
 220         else if (metad->btm_last_cleanup_num_delpages != num_delpages)
 221                 rewrite = true;
 222         else if (metad->btm_last_cleanup_num_heap_tuples != num_heap_tuples)
 223                 rewrite = true;
 224
 225         if (!rewrite)
 226         {
 227                 _bt_relbuf(rel, metabuf);
 228                 return;
 229         }
 230
 231         /* trade in our read lock for a write lock */
 232         _bt_unlockbuf(rel, metabuf);
 233         _bt_lockbuf(rel, metabuf, BT_WRITE);
 234
 235         START_CRIT_SECTION();
 236
 237         /* upgrade meta-page if needed */
 238         if (metad->btm_version < BTREE_NOVAC_VERSION)
 239                 _bt_upgrademetapage(metapg);
 240
 241         /* update cleanup-related information */
 242         metad->btm_last_cleanup_num_delpages = num_delpages;
 243         metad->btm_last_cleanup_num_heap_tuples = num_heap_tuples;
 244         MarkBufferDirty(metabuf);
 245
 246         /* write wal record if needed */
 247         if (RelationNeedsWAL(rel))
 248         {
 249                 xl_btree_metadata md;
 250
 251                 XLogBeginInsert();
 252                 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 253
 254                 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
 255                 md.version = metad->btm_version;
 256                 md.root = metad->btm_root;
 257                 md.level = metad->btm_level;
 258                 md.fastroot = metad->btm_fastroot;
 259                 md.fastlevel = metad->btm_fastlevel;
 260                 md.last_cleanup_num_delpages = num_delpages;
 261                 md.last_cleanup_num_heap_tuples = num_heap_tuples;
 262                 md.allequalimage = metad->btm_allequalimage;
 263
 264                 XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
 265
 266                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
 267
 268                 PageSetLSN(metapg, recptr);
 269         }
 270
 271         END_CRIT_SECTION();
 272
 273         _bt_relbuf(rel, metabuf);
 274 }
 275
 276 /*
 277  *      _bt_getroot() -- Get the root page of the btree.
 278  *
 279  *              Since the root page can move around the btree file, we have to read
 280  *              its location from the metadata page, and then read the root page
 281  *              itself.  If no root page exists yet, we have to create one.
 282  *
 283  *              The access type parameter (BT_READ or BT_WRITE) controls whether
 284  *              a new root page will be created or not.  If access = BT_READ,
 285  *              and no root page exists, we just return InvalidBuffer.  For
 286  *              BT_WRITE, we try to create the root page if it doesn't exist.
 287  *              NOTE that the returned root page will have only a read lock set
 288  *              on it even if access = BT_WRITE!
 289  *
 290  *              The returned page is not necessarily the true root --- it could be
 291  *              a "fast root" (a page that is alone in its level due to deletions).
 292  *              Also, if the root page is split while we are "in flight" to it,
 293  *              what we will return is the old root, which is now just the leftmost
 294  *              page on a probably-not-very-wide level.  For most purposes this is
 295  *              as good as or better than the true root, so we do not bother to
 296  *              insist on finding the true root.  We do, however, guarantee to
 297  *              return a live (not deleted or half-dead) page.
 298  *
 299  *              On successful return, the root page is pinned and read-locked.
 300  *              The metadata page is not locked or pinned on exit.
 301  */
 302 Buffer
 303 _bt_getroot(Relation rel, int access)
 304 {
 305         Buffer          metabuf;
 306         Buffer          rootbuf;
 307         Page            rootpage;
 308         BTPageOpaque rootopaque;
 309         BlockNumber rootblkno;
 310         uint32          rootlevel;
 311         BTMetaPageData *metad;
 312
 313         /*
 314          * Try to use previously-cached metapage data to find the root.  This
 315          * normally saves one buffer access per index search, which is a very
 316          * helpful savings in bufmgr traffic and hence contention.
 317          */
 318         if (rel->rd_amcache != NULL)
 319         {
 320                 metad = (BTMetaPageData *) rel->rd_amcache;
 321                 /* We shouldn't have cached it if any of these fail */
 322                 Assert(metad->btm_magic == BTREE_MAGIC);
 323                 Assert(metad->btm_version >= BTREE_MIN_VERSION);
 324                 Assert(metad->btm_version <= BTREE_VERSION);
 325                 Assert(!metad->btm_allequalimage ||
 326                            metad->btm_version > BTREE_NOVAC_VERSION);
 327                 Assert(metad->btm_root != P_NONE);
 328
 329                 rootblkno = metad->btm_fastroot;
 330                 Assert(rootblkno != P_NONE);
 331                 rootlevel = metad->btm_fastlevel;
 332
 333                 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
 334                 rootpage = BufferGetPage(rootbuf);
 335                 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 336
 337                 /*
 338                  * Since the cache might be stale, we check the page more carefully
 339                  * here than normal.  We *must* check that it's not deleted. If it's
 340                  * not alone on its level, then we reject too --- this may be overly
 341                  * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
 342                  * because that's not set in a "fast root".
 343                  */
 344                 if (!P_IGNORE(rootopaque) &&
 345                         rootopaque->btpo_level == rootlevel &&
 346                         P_LEFTMOST(rootopaque) &&
 347                         P_RIGHTMOST(rootopaque))
 348                 {
 349                         /* OK, accept cached page as the root */
 350                         return rootbuf;
 351                 }
 352                 _bt_relbuf(rel, rootbuf);
 353                 /* Cache is stale, throw it away */
 354                 if (rel->rd_amcache)
 355                         pfree(rel->rd_amcache);
 356                 rel->rd_amcache = NULL;
 357         }
 358
 359         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 360         metad = _bt_getmeta(rel, metabuf);
 361
 362         /* if no root page initialized yet, do it */
 363         if (metad->btm_root == P_NONE)
 364         {
 365                 Page            metapg;
 366
 367                 /* If access = BT_READ, caller doesn't want us to create root yet */
 368                 if (access == BT_READ)
 369                 {
 370                         _bt_relbuf(rel, metabuf);
 371                         return InvalidBuffer;
 372                 }
 373
 374                 /* trade in our read lock for a write lock */
 375                 _bt_unlockbuf(rel, metabuf);
 376                 _bt_lockbuf(rel, metabuf, BT_WRITE);
 377
 378                 /*
 379                  * Race condition:      if someone else initialized the metadata between
 380                  * the time we released the read lock and acquired the write lock, we
 381                  * must avoid doing it again.
 382                  */
 383                 if (metad->btm_root != P_NONE)
 384                 {
 385                         /*
 386                          * Metadata initialized by someone else.  In order to guarantee no
 387                          * deadlocks, we have to release the metadata page and start all
 388                          * over again.  (Is that really true? But it's hardly worth trying
 389                          * to optimize this case.)
 390                          */
 391                         _bt_relbuf(rel, metabuf);
 392                         return _bt_getroot(rel, access);
 393                 }
 394
 395                 /*
 396                  * Get, initialize, write, and leave a lock of the appropriate type on
 397                  * the new root page.  Since this is the first page in the tree, it's
 398                  * a leaf as well as the root.
 399                  */
 400                 rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
 401                 rootblkno = BufferGetBlockNumber(rootbuf);
 402                 rootpage = BufferGetPage(rootbuf);
 403                 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 404                 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
 405                 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
 406                 rootopaque->btpo_level = 0;
 407                 rootopaque->btpo_cycleid = 0;
 408                 /* Get raw page pointer for metapage */
 409                 metapg = BufferGetPage(metabuf);
 410
 411                 /* NO ELOG(ERROR) till meta is updated */
 412                 START_CRIT_SECTION();
 413
 414                 /* upgrade metapage if needed */
 415                 if (metad->btm_version < BTREE_NOVAC_VERSION)
 416                         _bt_upgrademetapage(metapg);
 417
 418                 metad->btm_root = rootblkno;
 419                 metad->btm_level = 0;
 420                 metad->btm_fastroot = rootblkno;
 421                 metad->btm_fastlevel = 0;
 422                 metad->btm_last_cleanup_num_delpages = 0;
 423                 metad->btm_last_cleanup_num_heap_tuples = -1.0;
 424
 425                 MarkBufferDirty(rootbuf);
 426                 MarkBufferDirty(metabuf);
 427
 428                 /* XLOG stuff */
 429                 if (RelationNeedsWAL(rel))
 430                 {
 431                         xl_btree_newroot xlrec;
 432                         XLogRecPtr      recptr;
 433                         xl_btree_metadata md;
 434
 435                         XLogBeginInsert();
 436                         XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
 437                         XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
 438
 439                         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
 440                         md.version = metad->btm_version;
 441                         md.root = rootblkno;
 442                         md.level = 0;
 443                         md.fastroot = rootblkno;
 444                         md.fastlevel = 0;
 445                         md.last_cleanup_num_delpages = 0;
 446                         md.last_cleanup_num_heap_tuples = -1.0;
 447                         md.allequalimage = metad->btm_allequalimage;
 448
 449                         XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
 450
 451                         xlrec.rootblk = rootblkno;
 452                         xlrec.level = 0;
 453
 454                         XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
 455
 456                         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
 457
 458                         PageSetLSN(rootpage, recptr);
 459                         PageSetLSN(metapg, recptr);
 460                 }
 461
 462                 END_CRIT_SECTION();
 463
 464                 /*
 465                  * swap root write lock for read lock.  There is no danger of anyone
 466                  * else accessing the new root page while it's unlocked, since no one
 467                  * else knows where it is yet.
 468                  */
 469                 _bt_unlockbuf(rel, rootbuf);
 470                 _bt_lockbuf(rel, rootbuf, BT_READ);
 471
 472                 /* okay, metadata is correct, release lock on it without caching */
 473                 _bt_relbuf(rel, metabuf);
 474         }
 475         else
 476         {
 477                 rootblkno = metad->btm_fastroot;
 478                 Assert(rootblkno != P_NONE);
 479                 rootlevel = metad->btm_fastlevel;
 480
 481                 /*
 482                  * Cache the metapage data for next time
 483                  */
 484                 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 485                                                                                          sizeof(BTMetaPageData));
 486                 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
 487
 488                 /*
 489                  * We are done with the metapage; arrange to release it via first
 490                  * _bt_relandgetbuf call
 491                  */
 492                 rootbuf = metabuf;
 493
 494                 for (;;)
 495                 {
 496                         rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
 497                         rootpage = BufferGetPage(rootbuf);
 498                         rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 499
 500                         if (!P_IGNORE(rootopaque))
 501                                 break;
 502
 503                         /* it's dead, Jim.  step right one page */
 504                         if (P_RIGHTMOST(rootopaque))
 505                                 elog(ERROR, "no live root page found in index \"%s\"",
 506                                          RelationGetRelationName(rel));
 507                         rootblkno = rootopaque->btpo_next;
 508                 }
 509
 510                 if (rootopaque->btpo_level != rootlevel)
 511                         elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
 512                                  rootblkno, RelationGetRelationName(rel),
 513                                  rootopaque->btpo_level, rootlevel);
 514         }
 515
 516         /*
 517          * By here, we have a pin and read lock on the root page, and no lock set
 518          * on the metadata page.  Return the root page's buffer.
 519          */
 520         return rootbuf;
 521 }
 522
 523 /*
 524  *      _bt_gettrueroot() -- Get the true root page of the btree.
 525  *
 526  *              This is the same as the BT_READ case of _bt_getroot(), except
 527  *              we follow the true-root link not the fast-root link.
 528  *
 529  * By the time we acquire lock on the root page, it might have been split and
 530  * not be the true root anymore.  This is okay for the present uses of this
 531  * routine; we only really need to be able to move up at least one tree level
 532  * from whatever non-root page we were at.  If we ever do need to lock the
 533  * one true root page, we could loop here, re-reading the metapage on each
 534  * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 535  * moving to the root --- that'd deadlock against any concurrent root split.)
 536  */
 537 Buffer
 538 _bt_gettrueroot(Relation rel)
 539 {
 540         Buffer          metabuf;
 541         Page            metapg;
 542         BTPageOpaque metaopaque;
 543         Buffer          rootbuf;
 544         Page            rootpage;
 545         BTPageOpaque rootopaque;
 546         BlockNumber rootblkno;
 547         uint32          rootlevel;
 548         BTMetaPageData *metad;
 549
 550         /*
 551          * We don't try to use cached metapage data here, since (a) this path is
 552          * not performance-critical, and (b) if we are here it suggests our cache
 553          * is out-of-date anyway.  In light of point (b), it's probably safest to
 554          * actively flush any cached metapage info.
 555          */
 556         if (rel->rd_amcache)
 557                 pfree(rel->rd_amcache);
 558         rel->rd_amcache = NULL;
 559
 560         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 561         metapg = BufferGetPage(metabuf);
 562         metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
 563         metad = BTPageGetMeta(metapg);
 564
 565         if (!P_ISMETA(metaopaque) ||
 566                 metad->btm_magic != BTREE_MAGIC)
 567                 ereport(ERROR,
 568                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 569                                  errmsg("index \"%s\" is not a btree",
 570                                                 RelationGetRelationName(rel))));
 571
 572         if (metad->btm_version < BTREE_MIN_VERSION ||
 573                 metad->btm_version > BTREE_VERSION)
 574                 ereport(ERROR,
 575                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 576                                  errmsg("version mismatch in index \"%s\": file version %d, "
 577                                                 "current version %d, minimal supported version %d",
 578                                                 RelationGetRelationName(rel),
 579                                                 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
 580
 581         /* if no root page initialized yet, fail */
 582         if (metad->btm_root == P_NONE)
 583         {
 584                 _bt_relbuf(rel, metabuf);
 585                 return InvalidBuffer;
 586         }
 587
 588         rootblkno = metad->btm_root;
 589         rootlevel = metad->btm_level;
 590
 591         /*
 592          * We are done with the metapage; arrange to release it via first
 593          * _bt_relandgetbuf call
 594          */
 595         rootbuf = metabuf;
 596
 597         for (;;)
 598         {
 599                 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
 600                 rootpage = BufferGetPage(rootbuf);
 601                 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
 602
 603                 if (!P_IGNORE(rootopaque))
 604                         break;
 605
 606                 /* it's dead, Jim.  step right one page */
 607                 if (P_RIGHTMOST(rootopaque))
 608                         elog(ERROR, "no live root page found in index \"%s\"",
 609                                  RelationGetRelationName(rel));
 610                 rootblkno = rootopaque->btpo_next;
 611         }
 612
 613         if (rootopaque->btpo_level != rootlevel)
 614                 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
 615                          rootblkno, RelationGetRelationName(rel),
 616                          rootopaque->btpo_level, rootlevel);
 617
 618         return rootbuf;
 619 }
 620
 621 /*
 622  *      _bt_getrootheight() -- Get the height of the btree search tree.
 623  *
 624  *              We return the level (counting from zero) of the current fast root.
 625  *              This represents the number of tree levels we'd have to descend through
 626  *              to start any btree index search.
 627  *
 628  *              This is used by the planner for cost-estimation purposes.  Since it's
 629  *              only an estimate, slightly-stale data is fine, hence we don't worry
 630  *              about updating previously cached data.
 631  */
 632 int
 633 _bt_getrootheight(Relation rel)
 634 {
 635         BTMetaPageData *metad;
 636
 637         if (rel->rd_amcache == NULL)
 638         {
 639                 Buffer          metabuf;
 640
 641                 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 642                 metad = _bt_getmeta(rel, metabuf);
 643
 644                 /*
 645                  * If there's no root page yet, _bt_getroot() doesn't expect a cache
 646                  * to be made, so just stop here and report the index height is zero.
 647                  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
 648                  */
 649                 if (metad->btm_root == P_NONE)
 650                 {
 651                         _bt_relbuf(rel, metabuf);
 652                         return 0;
 653                 }
 654
 655                 /*
 656                  * Cache the metapage data for next time
 657                  */
 658                 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 659                                                                                          sizeof(BTMetaPageData));
 660                 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
 661                 _bt_relbuf(rel, metabuf);
 662         }
 663
 664         /* Get cached page */
 665         metad = (BTMetaPageData *) rel->rd_amcache;
 666         /* We shouldn't have cached it if any of these fail */
 667         Assert(metad->btm_magic == BTREE_MAGIC);
 668         Assert(metad->btm_version >= BTREE_MIN_VERSION);
 669         Assert(metad->btm_version <= BTREE_VERSION);
 670         Assert(!metad->btm_allequalimage ||
 671                    metad->btm_version > BTREE_NOVAC_VERSION);
 672         Assert(metad->btm_fastroot != P_NONE);
 673
 674         return metad->btm_fastlevel;
 675 }
 676
 677 /*
 678  *      _bt_metaversion() -- Get version/status info from metapage.
 679  *
 680  *              Sets caller's *heapkeyspace and *allequalimage arguments using data
 681  *              from the B-Tree metapage (could be locally-cached version).  This
 682  *              information needs to be stashed in insertion scankey, so we provide a
 683  *              single function that fetches both at once.
 684  *
 685  *              This is used to determine the rules that must be used to descend a
 686  *              btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
 687  *              pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
 688  *              performance when inserting a new BTScanInsert-wise duplicate tuple
 689  *              among many leaf pages already full of such duplicates.
 690  *
 691  *              Also sets allequalimage field, which indicates whether or not it is
 692  *              safe to apply deduplication.  We rely on the assumption that
 693  *              btm_allequalimage will be zero'ed on heapkeyspace indexes that were
 694  *              pg_upgrade'd from Postgres 12.
 695  */
 696 void
 697 _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
 698 {
 699         BTMetaPageData *metad;
 700
 701         if (rel->rd_amcache == NULL)
 702         {
 703                 Buffer          metabuf;
 704
 705                 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
 706                 metad = _bt_getmeta(rel, metabuf);
 707
 708                 /*
 709                  * If there's no root page yet, _bt_getroot() doesn't expect a cache
 710                  * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
 711                  * be changed to allow this case.)
 712                  */
 713                 if (metad->btm_root == P_NONE)
 714                 {
 715                         *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
 716                         *allequalimage = metad->btm_allequalimage;
 717
 718                         _bt_relbuf(rel, metabuf);
 719                         return;
 720                 }
 721
 722                 /*
 723                  * Cache the metapage data for next time
 724                  *
 725                  * An on-the-fly version upgrade performed by _bt_upgrademetapage()
 726                  * can change the nbtree version for an index without invalidating any
 727                  * local cache.  This is okay because it can only happen when moving
 728                  * from version 2 to version 3, both of which are !heapkeyspace
 729                  * versions.
 730                  */
 731                 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 732                                                                                          sizeof(BTMetaPageData));
 733                 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
 734                 _bt_relbuf(rel, metabuf);
 735         }
 736
 737         /* Get cached page */
 738         metad = (BTMetaPageData *) rel->rd_amcache;
 739         /* We shouldn't have cached it if any of these fail */
 740         Assert(metad->btm_magic == BTREE_MAGIC);
 741         Assert(metad->btm_version >= BTREE_MIN_VERSION);
 742         Assert(metad->btm_version <= BTREE_VERSION);
 743         Assert(!metad->btm_allequalimage ||
 744                    metad->btm_version > BTREE_NOVAC_VERSION);
 745         Assert(metad->btm_fastroot != P_NONE);
 746
 747         *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
 748         *allequalimage = metad->btm_allequalimage;
 749 }
 750
 751 /*
 752  *      _bt_checkpage() -- Verify that a freshly-read page looks sane.
 753  */
 754 void
 755 _bt_checkpage(Relation rel, Buffer buf)
 756 {
 757         Page            page = BufferGetPage(buf);
 758
 759         /*
 760          * ReadBuffer verifies that every newly-read page passes
 761          * PageHeaderIsValid, which means it either contains a reasonably sane
 762          * page header or is all-zero.  We have to defend against the all-zero
 763          * case, however.
 764          */
 765         if (PageIsNew(page))
 766                 ereport(ERROR,
 767                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 768                                  errmsg("index \"%s\" contains unexpected zero page at block %u",
 769                                                 RelationGetRelationName(rel),
 770                                                 BufferGetBlockNumber(buf)),
 771                                  errhint("Please REINDEX it.")));
 772
 773         /*
 774          * Additionally check that the special area looks sane.
 775          */
 776         if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
 777                 ereport(ERROR,
 778                                 (errcode(ERRCODE_INDEX_CORRUPTED),
 779                                  errmsg("index \"%s\" contains corrupted page at block %u",
 780                                                 RelationGetRelationName(rel),
 781                                                 BufferGetBlockNumber(buf)),
 782                                  errhint("Please REINDEX it.")));
 783 }
 784
 785 /*
 786  * Log the reuse of a page from the FSM.
 787  */
 788 static void
 789 _bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
 790 {
 791         xl_btree_reuse_page xlrec_reuse;
 792
 793         /*
 794          * Note that we don't register the buffer with the record, because this
 795          * operation doesn't modify the page. This record only exists to provide a
 796          * conflict point for Hot Standby.
 797          */
 798
 799         /* XLOG stuff */
 800         xlrec_reuse.node = rel->rd_node;
 801         xlrec_reuse.block = blkno;
 802         xlrec_reuse.latestRemovedFullXid = safexid;
 803
 804         XLogBeginInsert();
 805         XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
 806
 807         XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
 808 }
 809
 810 /*
 811  *      _bt_getbuf() -- Get a buffer by block number for read or write.
 812  *
 813  *              blkno == P_NEW means to get an unallocated index page.  The page
 814  *              will be initialized before returning it.
 815  *
 816  *              The general rule in nbtree is that it's never okay to access a
 817  *              page without holding both a buffer pin and a buffer lock on
 818  *              the page's buffer.
 819  *
 820  *              When this routine returns, the appropriate lock is set on the
 821  *              requested buffer and its reference count has been incremented
 822  *              (ie, the buffer is "locked and pinned").  Also, we apply
 823  *              _bt_checkpage to sanity-check the page (except in P_NEW case),
 824  *              and perform Valgrind client requests that help Valgrind detect
 825  *              unsafe page accesses.
 826  *
 827  *              Note: raw LockBuffer() calls are disallowed in nbtree; all
 828  *              buffer lock requests need to go through wrapper functions such
 829  *              as _bt_lockbuf().
 830  */
 831 Buffer
 832 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 833 {
 834         Buffer          buf;
 835
 836         if (blkno != P_NEW)
 837         {
 838                 /* Read an existing block of the relation */
 839                 buf = ReadBuffer(rel, blkno);
 840                 _bt_lockbuf(rel, buf, access);
 841                 _bt_checkpage(rel, buf);
 842         }
 843         else
 844         {
 845                 bool            needLock;
 846                 Page            page;
 847
 848                 Assert(access == BT_WRITE);
 849
 850                 /*
 851                  * First see if the FSM knows of any free pages.
 852                  *
 853                  * We can't trust the FSM's report unreservedly; we have to check that
 854                  * the page is still free.  (For example, an already-free page could
 855                  * have been re-used between the time the last VACUUM scanned it and
 856                  * the time the VACUUM made its FSM updates.)
 857                  *
 858                  * In fact, it's worse than that: we can't even assume that it's safe
 859                  * to take a lock on the reported page.  If somebody else has a lock
 860                  * on it, or even worse our own caller does, we could deadlock.  (The
 861                  * own-caller scenario is actually not improbable. Consider an index
 862                  * on a serial or timestamp column.  Nearly all splits will be at the
 863                  * rightmost page, so it's entirely likely that _bt_split will call us
 864                  * while holding a lock on the page most recently acquired from FSM. A
 865                  * VACUUM running concurrently with the previous split could well have
 866                  * placed that page back in FSM.)
 867                  *
 868                  * To get around that, we ask for only a conditional lock on the
 869                  * reported page.  If we fail, then someone else is using the page,
 870                  * and we may reasonably assume it's not free.  (If we happen to be
 871                  * wrong, the worst consequence is the page will be lost to use till
 872                  * the next VACUUM, which is no big problem.)
 873                  */
 874                 for (;;)
 875                 {
 876                         blkno = GetFreeIndexPage(rel);
 877                         if (blkno == InvalidBlockNumber)
 878                                 break;
 879                         buf = ReadBuffer(rel, blkno);
 880                         if (_bt_conditionallockbuf(rel, buf))
 881                         {
 882                                 page = BufferGetPage(buf);
 883
 884                                 /*
 885                                  * It's possible to find an all-zeroes page in an index.  For
 886                                  * example, a backend might successfully extend the relation
 887                                  * one page and then crash before it is able to make a WAL
 888                                  * entry for adding the page.  If we find a zeroed page then
 889                                  * reclaim it immediately.
 890                                  */
 891                                 if (PageIsNew(page))
 892                                 {
 893                                         /* Okay to use page.  Initialize and return it. */
 894                                         _bt_pageinit(page, BufferGetPageSize(buf));
 895                                         return buf;
 896                                 }
 897
 898                                 if (BTPageIsRecyclable(page))
 899                                 {
 900                                         /*
 901                                          * If we are generating WAL for Hot Standby then create a
 902                                          * WAL record that will allow us to conflict with queries
 903                                          * running on standby, in case they have snapshots older
 904                                          * than safexid value
 905                                          */
 906                                         if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
 907                                                 _bt_log_reuse_page(rel, blkno,
 908                                                                                    BTPageGetDeleteXid(page));
 909
 910                                         /* Okay to use page.  Re-initialize and return it. */
 911                                         _bt_pageinit(page, BufferGetPageSize(buf));
 912                                         return buf;
 913                                 }
 914                                 elog(DEBUG2, "FSM returned nonrecyclable page");
 915                                 _bt_relbuf(rel, buf);
 916                         }
 917                         else
 918                         {
 919                                 elog(DEBUG2, "FSM returned nonlockable page");
 920                                 /* couldn't get lock, so just drop pin */
 921                                 ReleaseBuffer(buf);
 922                         }
 923                 }
 924
 925                 /*
 926                  * Extend the relation by one page.
 927                  *
 928                  * We have to use a lock to ensure no one else is extending the rel at
 929                  * the same time, else we will both try to initialize the same new
 930                  * page.  We can skip locking for new or temp relations, however,
 931                  * since no one else could be accessing them.
 932                  */
 933                 needLock = !RELATION_IS_LOCAL(rel);
 934
 935                 if (needLock)
 936                         LockRelationForExtension(rel, ExclusiveLock);
 937
 938                 buf = ReadBuffer(rel, P_NEW);
 939
 940                 /* Acquire buffer lock on new page */
 941                 _bt_lockbuf(rel, buf, BT_WRITE);
 942
 943                 /*
 944                  * Release the file-extension lock; it's now OK for someone else to
 945                  * extend the relation some more.  Note that we cannot release this
 946                  * lock before we have buffer lock on the new page, or we risk a race
 947                  * condition against btvacuumscan --- see comments therein.
 948                  */
 949                 if (needLock)
 950                         UnlockRelationForExtension(rel, ExclusiveLock);
 951
 952                 /* Initialize the new page before returning it */
 953                 page = BufferGetPage(buf);
 954                 Assert(PageIsNew(page));
 955                 _bt_pageinit(page, BufferGetPageSize(buf));
 956         }
 957
 958         /* ref count and lock type are correct */
 959         return buf;
 960 }
 961
 962 /*
 963  *      _bt_relandgetbuf() -- release a locked buffer and get another one.
 964  *
 965  * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 966  * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 967  * then it reduces to just _bt_getbuf; allowing this case simplifies some
 968  * callers.
 969  *
 970  * The original motivation for using this was to avoid two entries to the
 971  * bufmgr when one would do.  However, now it's mainly just a notational
 972  * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 973  * is when the target page is the same one already in the buffer.
 974  */
 975 Buffer
 976 _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 977 {
 978         Buffer          buf;
 979
 980         Assert(blkno != P_NEW);
 981         if (BufferIsValid(obuf))
 982                 _bt_unlockbuf(rel, obuf);
 983         buf = ReleaseAndReadBuffer(obuf, rel, blkno);
 984         _bt_lockbuf(rel, buf, access);
 985
 986         _bt_checkpage(rel, buf);
 987         return buf;
 988 }
 989
 990 /*
 991  *      _bt_relbuf() -- release a locked buffer.
 992  *
 993  * Lock and pin (refcount) are both dropped.
 994  */
 995 void
 996 _bt_relbuf(Relation rel, Buffer buf)
 997 {
 998         _bt_unlockbuf(rel, buf);
 999         ReleaseBuffer(buf);
1000 }
1001
1002 /*
1003  *      _bt_lockbuf() -- lock a pinned buffer.
1004  *
1005  * Lock is acquired without acquiring another pin.  This is like a raw
1006  * LockBuffer() call, but performs extra steps needed by Valgrind.
1007  *
1008  * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1009  * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1010  */
1011 void
1012 _bt_lockbuf(Relation rel, Buffer buf, int access)
1013 {
1014         /* LockBuffer() asserts that pin is held by this backend */
1015         LockBuffer(buf, access);
1016
1017         /*
1018          * It doesn't matter that _bt_unlockbuf() won't get called in the
1019          * event of an nbtree error (e.g. a unique violation error).  That
1020          * won't cause Valgrind false positives.
1021          *
1022          * The nbtree client requests are superimposed on top of the
1023          * bufmgr.c buffer pin client requests.  In the event of an nbtree
1024          * error the buffer will certainly get marked as defined when the
1025          * backend once again acquires its first pin on the buffer. (Of
1026          * course, if the backend never touches the buffer again then it
1027          * doesn't matter that it remains non-accessible to Valgrind.)
1028          *
1029          * Note: When an IndexTuple C pointer gets computed using an
1030          * ItemId read from a page while a lock was held, the C pointer
1031          * becomes unsafe to dereference forever as soon as the lock is
1032          * released.  Valgrind can only detect cases where the pointer
1033          * gets dereferenced with no _current_ lock/pin held, though.
1034          */
1035         if (!RelationUsesLocalBuffers(rel))
1036                 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1037 }
1038
1039 /*
1040  *      _bt_unlockbuf() -- unlock a pinned buffer.
1041  */
1042 void
1043 _bt_unlockbuf(Relation rel, Buffer buf)
1044 {
1045         /*
1046          * Buffer is pinned and locked, which means that it is expected to be
1047          * defined and addressable.  Check that proactively.
1048          */
1049         VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1050
1051         /* LockBuffer() asserts that pin is held by this backend */
1052         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1053
1054         if (!RelationUsesLocalBuffers(rel))
1055                 VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1056 }
1057
1058 /*
1059  *      _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
1060  *      buffer.
1061  *
1062  * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1063  * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1064  */
1065 bool
1066 _bt_conditionallockbuf(Relation rel, Buffer buf)
1067 {
1068         /* ConditionalLockBuffer() asserts that pin is held by this backend */
1069         if (!ConditionalLockBuffer(buf))
1070                 return false;
1071
1072         if (!RelationUsesLocalBuffers(rel))
1073                 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1074
1075         return true;
1076 }
1077
1078 /*
1079  *      _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
1080  *      lock.
1081  */
1082 void
1083 _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
1084 {
1085         /*
1086          * Buffer is pinned and locked, which means that it is expected to be
1087          * defined and addressable.  Check that proactively.
1088          */
1089         VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1090
1091         /* LockBuffer() asserts that pin is held by this backend */
1092         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1093         LockBufferForCleanup(buf);
1094 }
1095
1096 /*
1097  *      _bt_pageinit() -- Initialize a new page.
1098  *
1099  * On return, the page header is initialized; data space is empty;
1100  * special space is zeroed out.
1101  */
1102 void
1103 _bt_pageinit(Page page, Size size)
1104 {
1105         PageInit(page, size, sizeof(BTPageOpaqueData));
1106 }
1107
1108 /*
1109  * Delete item(s) from a btree leaf page during VACUUM.
1110  *
1111  * This routine assumes that the caller has a super-exclusive write lock on
1112  * the buffer.  Also, the given deletable and updatable arrays *must* be
1113  * sorted in ascending order.
1114  *
1115  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1116  * in an existing posting list item are to be removed.  This works by
1117  * updating/overwriting an existing item with caller's new version of the item
1118  * (a version that lacks the TIDs that are to be deleted).
1119  *
1120  * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
1121  * generate their own latestRemovedXid by accessing the table directly,
1122  * whereas VACUUMs rely on the initial VACUUM table scan performing
1123  * WAL-logging that takes care of the issue for the table's indexes
1124  * indirectly.  Also, we remove the VACUUM cycle ID from pages, which b-tree
1125  * deletes don't do.
1126  */
1127 void
1128 _bt_delitems_vacuum(Relation rel, Buffer buf,
1129                                         OffsetNumber *deletable, int ndeletable,
1130                                         BTVacuumPosting *updatable, int nupdatable)
1131 {
1132         Page            page = BufferGetPage(buf);
1133         BTPageOpaque opaque;
1134         bool            needswal = RelationNeedsWAL(rel);
1135         char       *updatedbuf = NULL;
1136         Size            updatedbuflen = 0;
1137         OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1138
1139         /* Shouldn't be called unless there's something to do */
1140         Assert(ndeletable > 0 || nupdatable > 0);
1141
1142         /* Generate new version of posting lists without deleted TIDs */
1143         if (nupdatable > 0)
1144                 updatedbuf = _bt_delitems_update(updatable, nupdatable,
1145                                                                                  updatedoffsets, &updatedbuflen,
1146                                                                                  needswal);
1147
1148         /* No ereport(ERROR) until changes are logged */
1149         START_CRIT_SECTION();
1150
1151         /*
1152          * Handle posting tuple updates.
1153          *
1154          * Deliberately do this before handling simple deletes.  If we did it the
1155          * other way around (i.e. WAL record order -- simple deletes before
1156          * updates) then we'd have to make compensating changes to the 'updatable'
1157          * array of offset numbers.
1158          *
1159          * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1160          * happens to already be set.  It's important that we not interfere with
1161          * _bt_delitems_delete().
1162          */
1163         for (int i = 0; i < nupdatable; i++)
1164         {
1165                 OffsetNumber updatedoffset = updatedoffsets[i];
1166                 IndexTuple      itup;
1167                 Size            itemsz;
1168
1169                 itup = updatable[i]->itup;
1170                 itemsz = MAXALIGN(IndexTupleSize(itup));
1171                 if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1172                                                                          itemsz))
1173                         elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1174                                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1175         }
1176
1177         /* Now handle simple deletes of entire tuples */
1178         if (ndeletable > 0)
1179                 PageIndexMultiDelete(page, deletable, ndeletable);
1180
1181         /*
1182          * We can clear the vacuum cycle ID since this page has certainly been
1183          * processed by the current vacuum scan.
1184          */
1185         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1186         opaque->btpo_cycleid = 0;
1187
1188         /*
1189          * Clear the BTP_HAS_GARBAGE page flag.
1190          *
1191          * This flag indicates the presence of LP_DEAD items on the page (though
1192          * not reliably).  Note that we only rely on it with pg_upgrade'd
1193          * !heapkeyspace indexes.  That's why clearing it here won't usually
1194          * interfere with _bt_delitems_delete().
1195          */
1196         opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1197
1198         MarkBufferDirty(buf);
1199
1200         /* XLOG stuff */
1201         if (needswal)
1202         {
1203                 XLogRecPtr      recptr;
1204                 xl_btree_vacuum xlrec_vacuum;
1205
1206                 xlrec_vacuum.ndeleted = ndeletable;
1207                 xlrec_vacuum.nupdated = nupdatable;
1208
1209                 XLogBeginInsert();
1210                 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1211                 XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1212
1213                 if (ndeletable > 0)
1214                         XLogRegisterBufData(0, (char *) deletable,
1215                                                                 ndeletable * sizeof(OffsetNumber));
1216
1217                 if (nupdatable > 0)
1218                 {
1219                         XLogRegisterBufData(0, (char *) updatedoffsets,
1220                                                                 nupdatable * sizeof(OffsetNumber));
1221                         XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1222                 }
1223
1224                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1225
1226                 PageSetLSN(page, recptr);
1227         }
1228
1229         END_CRIT_SECTION();
1230
1231         /* can't leak memory here */
1232         if (updatedbuf != NULL)
1233                 pfree(updatedbuf);
1234         /* free tuples allocated within _bt_delitems_update() */
1235         for (int i = 0; i < nupdatable; i++)
1236                 pfree(updatable[i]->itup);
1237 }
1238
1239 /*
1240  * Delete item(s) from a btree leaf page during single-page cleanup.
1241  *
1242  * This routine assumes that the caller has pinned and write locked the
1243  * buffer.  Also, the given deletable and updatable arrays *must* be sorted in
1244  * ascending order.
1245  *
1246  * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1247  * in an existing posting list item are to be removed.  This works by
1248  * updating/overwriting an existing item with caller's new version of the item
1249  * (a version that lacks the TIDs that are to be deleted).
1250  *
1251  * This is nearly the same as _bt_delitems_vacuum as far as what it does to
1252  * the page, but it needs its own latestRemovedXid from caller (caller gets
1253  * this from tableam).  This is used by the REDO routine to generate recovery
1254  * conflicts.  The other difference is that only _bt_delitems_vacuum will
1255  * clear page's VACUUM cycle ID.
1256  */
1257 static void
1258 _bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid,
1259                                         OffsetNumber *deletable, int ndeletable,
1260                                         BTVacuumPosting *updatable, int nupdatable)
1261 {
1262         Page            page = BufferGetPage(buf);
1263         BTPageOpaque opaque;
1264         bool            needswal = RelationNeedsWAL(rel);
1265         char       *updatedbuf = NULL;
1266         Size            updatedbuflen = 0;
1267         OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1268
1269         /* Shouldn't be called unless there's something to do */
1270         Assert(ndeletable > 0 || nupdatable > 0);
1271
1272         /* Generate new versions of posting lists without deleted TIDs */
1273         if (nupdatable > 0)
1274                 updatedbuf = _bt_delitems_update(updatable, nupdatable,
1275                                                                                  updatedoffsets, &updatedbuflen,
1276                                                                                  needswal);
1277
1278         /* No ereport(ERROR) until changes are logged */
1279         START_CRIT_SECTION();
1280
1281         /* Handle updates and deletes just like _bt_delitems_vacuum */
1282         for (int i = 0; i < nupdatable; i++)
1283         {
1284                 OffsetNumber updatedoffset = updatedoffsets[i];
1285                 IndexTuple      itup;
1286                 Size            itemsz;
1287
1288                 itup = updatable[i]->itup;
1289                 itemsz = MAXALIGN(IndexTupleSize(itup));
1290                 if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1291                                                                          itemsz))
1292                         elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1293                                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1294         }
1295
1296         if (ndeletable > 0)
1297                 PageIndexMultiDelete(page, deletable, ndeletable);
1298
1299         /*
1300          * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
1301          * this point.  The VACUUM command alone controls vacuum cycle IDs.
1302          */
1303         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1304
1305         /*
1306          * Clear the BTP_HAS_GARBAGE page flag.
1307          *
1308          * This flag indicates the presence of LP_DEAD items on the page (though
1309          * not reliably).  Note that we only rely on it with pg_upgrade'd
1310          * !heapkeyspace indexes.
1311          */
1312         opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1313
1314         MarkBufferDirty(buf);
1315
1316         /* XLOG stuff */
1317         if (needswal)
1318         {
1319                 XLogRecPtr      recptr;
1320                 xl_btree_delete xlrec_delete;
1321
1322                 xlrec_delete.latestRemovedXid = latestRemovedXid;
1323                 xlrec_delete.ndeleted = ndeletable;
1324                 xlrec_delete.nupdated = nupdatable;
1325
1326                 XLogBeginInsert();
1327                 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1328                 XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1329
1330                 if (ndeletable > 0)
1331                         XLogRegisterBufData(0, (char *) deletable,
1332                                                                 ndeletable * sizeof(OffsetNumber));
1333
1334                 if (nupdatable > 0)
1335                 {
1336                         XLogRegisterBufData(0, (char *) updatedoffsets,
1337                                                                 nupdatable * sizeof(OffsetNumber));
1338                         XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1339                 }
1340
1341                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1342
1343                 PageSetLSN(page, recptr);
1344         }
1345
1346         END_CRIT_SECTION();
1347
1348         /* can't leak memory here */
1349         if (updatedbuf != NULL)
1350                 pfree(updatedbuf);
1351         /* free tuples allocated within _bt_delitems_update() */
1352         for (int i = 0; i < nupdatable; i++)
1353                 pfree(updatable[i]->itup);
1354 }
1355
1356 /*
1357  * Set up state needed to delete TIDs from posting list tuples via "updating"
1358  * the tuple.  Performs steps common to both _bt_delitems_vacuum and
1359  * _bt_delitems_delete.  These steps must take place before each function's
1360  * critical section begins.
1361  *
1362  * updatable and nupdatable are inputs, though note that we will use
1363  * _bt_update_posting() to replace the original itup with a pointer to a final
1364  * version in palloc()'d memory.  Caller should free the tuples when it's done.
1365  *
1366  * The first nupdatable entries from updatedoffsets are set to the page offset
1367  * number for posting list tuples that caller updates.  This is mostly useful
1368  * because caller may need to WAL-log the page offsets (though we always do
1369  * this for caller out of convenience).
1370  *
1371  * Returns buffer consisting of an array of xl_btree_update structs that
1372  * describe the steps we perform here for caller (though only when needswal is
1373  * true).  Also sets *updatedbuflen to the final size of the buffer.  This
1374  * buffer is used by caller when WAL logging is required.
1375  */
1376 static char *
1377 _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
1378                                         OffsetNumber *updatedoffsets, Size *updatedbuflen,
1379                                         bool needswal)
1380 {
1381         char       *updatedbuf = NULL;
1382         Size            buflen = 0;
1383
1384         /* Shouldn't be called unless there's something to do */
1385         Assert(nupdatable > 0);
1386
1387         for (int i = 0; i < nupdatable; i++)
1388         {
1389                 BTVacuumPosting vacposting = updatable[i];
1390                 Size            itemsz;
1391
1392                 /* Replace work area IndexTuple with updated version */
1393                 _bt_update_posting(vacposting);
1394
1395                 /* Keep track of size of xl_btree_update for updatedbuf in passing */
1396                 itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
1397                 buflen += itemsz;
1398
1399                 /* Build updatedoffsets buffer in passing */
1400                 updatedoffsets[i] = vacposting->updatedoffset;
1401         }
1402
1403         /* XLOG stuff */
1404         if (needswal)
1405         {
1406                 Size            offset = 0;
1407
1408                 /* Allocate, set final size for caller */
1409                 updatedbuf = palloc(buflen);
1410                 *updatedbuflen = buflen;
1411                 for (int i = 0; i < nupdatable; i++)
1412                 {
1413                         BTVacuumPosting vacposting = updatable[i];
1414                         Size            itemsz;
1415                         xl_btree_update update;
1416
1417                         update.ndeletedtids = vacposting->ndeletedtids;
1418                         memcpy(updatedbuf + offset, &update.ndeletedtids,
1419                                    SizeOfBtreeUpdate);
1420                         offset += SizeOfBtreeUpdate;
1421
1422                         itemsz = update.ndeletedtids * sizeof(uint16);
1423                         memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1424                         offset += itemsz;
1425                 }
1426         }
1427
1428         return updatedbuf;
1429 }
1430
1431 /*
1432  * Comparator used by _bt_delitems_delete_check() to restore deltids array
1433  * back to its original leaf-page-wise sort order
1434  */
1435 static int
1436 _bt_delitems_cmp(const void *a, const void *b)
1437 {
1438         TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
1439         TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
1440
1441         if (indexdelete1->id > indexdelete2->id)
1442                 return 1;
1443         if (indexdelete1->id < indexdelete2->id)
1444                 return -1;
1445
1446         Assert(false);
1447
1448         return 0;
1449 }
1450
1451 /*
1452  * Try to delete item(s) from a btree leaf page during single-page cleanup.
1453  *
1454  * nbtree interface to table_index_delete_tuples().  Deletes a subset of index
1455  * tuples from caller's deltids array: those whose TIDs are found safe to
1456  * delete by the tableam (or already marked LP_DEAD in index, and so already
1457  * known to be deletable by our simple index deletion caller).  We physically
1458  * delete index tuples from buf leaf page last of all (for index tuples where
1459  * that is known to be safe following our table_index_delete_tuples() call).
1460  *
1461  * Simple index deletion caller only includes TIDs from index tuples marked
1462  * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
1463  * included without increasing the total number of distinct table blocks for
1464  * the deletion operation as a whole.  This approach often allows us to delete
1465  * some extra index tuples that were practically free for tableam to check in
1466  * passing (when they actually turn out to be safe to delete).  It probably
1467  * only makes sense for the tableam to go ahead with these extra checks when
1468  * it is block-orientated (otherwise the checks probably won't be practically
1469  * free, which we rely on).  The tableam interface requires the tableam side
1470  * to handle the problem, though, so this is okay (we as an index AM are free
1471  * to make the simplifying assumption that all tableams must be block-based).
1472  *
1473  * Bottom-up index deletion caller provides all the TIDs from the leaf page,
1474  * without expecting that tableam will check most of them.  The tableam has
1475  * considerable discretion around which entries/blocks it checks.  Our role in
1476  * costing the bottom-up deletion operation is strictly advisory.
1477  *
1478  * Note: Caller must have added deltids entries (i.e. entries that go in
1479  * delstate's main array) in leaf-page-wise order: page offset number order,
1480  * TID order among entries taken from the same posting list tuple (tiebreak on
1481  * TID).  This order is convenient to work with here.
1482  *
1483  * Note: We also rely on the id field of each deltids element "capturing" this
1484  * original leaf-page-wise order.  That is, we expect to be able to get back
1485  * to the original leaf-page-wise order just by sorting deltids on the id
1486  * field (tableam will sort deltids for its own reasons, so we'll need to put
1487  * it back in leaf-page-wise order afterwards).
1488  */
1489 void
1490 _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
1491                                                   TM_IndexDeleteOp *delstate)
1492 {
1493         Page            page = BufferGetPage(buf);
1494         TransactionId latestRemovedXid;
1495         OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1496         int                     ndeletable = 0,
1497                                 nupdatable = 0;
1498         OffsetNumber deletable[MaxIndexTuplesPerPage];
1499         BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1500
1501         /* Use tableam interface to determine which tuples to delete first */
1502         latestRemovedXid = table_index_delete_tuples(heapRel, delstate);
1503
1504         /* Should not WAL-log latestRemovedXid unless it's required */
1505         if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel))
1506                 latestRemovedXid = InvalidTransactionId;
1507
1508         /*
1509          * Construct a leaf-page-wise description of what _bt_delitems_delete()
1510          * needs to do to physically delete index tuples from the page.
1511          *
1512          * Must sort deltids array to restore leaf-page-wise order (original order
1513          * before call to tableam).  This is the order that the loop expects.
1514          *
1515          * Note that deltids array might be a lot smaller now.  It might even have
1516          * no entries at all (with bottom-up deletion caller), in which case there
1517          * is nothing left to do.
1518          */
1519         qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1520                   _bt_delitems_cmp);
1521         if (delstate->ndeltids == 0)
1522         {
1523                 Assert(delstate->bottomup);
1524                 return;
1525         }
1526
1527         /* We definitely have to delete at least one index tuple (or one TID) */
1528         for (int i = 0; i < delstate->ndeltids; i++)
1529         {
1530                 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1531                 OffsetNumber idxoffnum = dstatus->idxoffnum;
1532                 ItemId          itemid = PageGetItemId(page, idxoffnum);
1533                 IndexTuple      itup = (IndexTuple) PageGetItem(page, itemid);
1534                 int                     nestedi,
1535                                         nitem;
1536                 BTVacuumPosting vacposting;
1537
1538                 Assert(OffsetNumberIsValid(idxoffnum));
1539
1540                 if (idxoffnum == postingidxoffnum)
1541                 {
1542                         /*
1543                          * This deltid entry is a TID from a posting list tuple that has
1544                          * already been completely processed
1545                          */
1546                         Assert(BTreeTupleIsPosting(itup));
1547                         Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1548                                                                           &delstate->deltids[i].tid) < 0);
1549                         Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1550                                                                           &delstate->deltids[i].tid) >= 0);
1551                         continue;
1552                 }
1553
1554                 if (!BTreeTupleIsPosting(itup))
1555                 {
1556                         /* Plain non-pivot tuple */
1557                         Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1558                         if (dstatus->knowndeletable)
1559                                 deletable[ndeletable++] = idxoffnum;
1560                         continue;
1561                 }
1562
1563                 /*
1564                  * itup is a posting list tuple whose lowest deltids entry (which may
1565                  * or may not be for the first TID from itup) is considered here now.
1566                  * We should process all of the deltids entries for the posting list
1567                  * together now, though (not just the lowest).  Remember to skip over
1568                  * later itup-related entries during later iterations of outermost
1569                  * loop.
1570                  */
1571                 postingidxoffnum = idxoffnum;   /* Remember work in outermost loop */
1572                 nestedi = i;                    /* Initialize for first itup deltids entry */
1573                 vacposting = NULL;              /* Describes final action for itup */
1574                 nitem = BTreeTupleGetNPosting(itup);
1575                 for (int p = 0; p < nitem; p++)
1576                 {
1577                         ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1578                         int                     ptidcmp = -1;
1579
1580                         /*
1581                          * This nested loop reuses work across ptid TIDs taken from itup.
1582                          * We take advantage of the fact that both itup's TIDs and deltids
1583                          * entries (within a single itup/posting list grouping) must both
1584                          * be in ascending TID order.
1585                          */
1586                         for (; nestedi < delstate->ndeltids; nestedi++)
1587                         {
1588                                 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1589                                 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1590
1591                                 /* Stop once we get past all itup related deltids entries */
1592                                 Assert(tdstatus->idxoffnum >= idxoffnum);
1593                                 if (tdstatus->idxoffnum != idxoffnum)
1594                                         break;
1595
1596                                 /* Skip past non-deletable itup related entries up front */
1597                                 if (!tdstatus->knowndeletable)
1598                                         continue;
1599
1600                                 /* Entry is first partial ptid match (or an exact match)? */
1601                                 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1602                                 if (ptidcmp >= 0)
1603                                 {
1604                                         /* Greater than or equal (partial or exact) match... */
1605                                         break;
1606                                 }
1607                         }
1608
1609                         /* ...exact ptid match to a deletable deltids entry? */
1610                         if (ptidcmp != 0)
1611                                 continue;
1612
1613                         /* Exact match for deletable deltids entry -- ptid gets deleted */
1614                         if (vacposting == NULL)
1615                         {
1616                                 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1617                                                                         nitem * sizeof(uint16));
1618                                 vacposting->itup = itup;
1619                                 vacposting->updatedoffset = idxoffnum;
1620                                 vacposting->ndeletedtids = 0;
1621                         }
1622                         vacposting->deletetids[vacposting->ndeletedtids++] = p;
1623                 }
1624
1625                 /* Final decision on itup, a posting list tuple */
1626
1627                 if (vacposting == NULL)
1628                 {
1629                         /* No TIDs to delete from itup -- do nothing */
1630                 }
1631                 else if (vacposting->ndeletedtids == nitem)
1632                 {
1633                         /* Straight delete of itup (to delete all TIDs) */
1634                         deletable[ndeletable++] = idxoffnum;
1635                         /* Turns out we won't need granular information */
1636                         pfree(vacposting);
1637                 }
1638                 else
1639                 {
1640                         /* Delete some (but not all) TIDs from itup */
1641                         Assert(vacposting->ndeletedtids > 0 &&
1642                                    vacposting->ndeletedtids < nitem);
1643                         updatable[nupdatable++] = vacposting;
1644                 }
1645         }
1646
1647         /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1648         _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable,
1649                                                 updatable, nupdatable);
1650
1651         /* be tidy */
1652         for (int i = 0; i < nupdatable; i++)
1653                 pfree(updatable[i]);
1654 }
1655
1656 /*
1657  * Check that leftsib page (the btpo_prev of target page) is not marked with
1658  * INCOMPLETE_SPLIT flag.  Used during page deletion.
1659  *
1660  * Returning true indicates that page flag is set in leftsib (which is
1661  * definitely still the left sibling of target).  When that happens, the
1662  * target doesn't have a downlink in parent, and the page deletion algorithm
1663  * isn't prepared to handle that.  Deletion of the target page (or the whole
1664  * subtree that contains the target page) cannot take place.
1665  *
1666  * Caller should not have a lock on the target page itself, since pages on the
1667  * same level must always be locked left to right to avoid deadlocks.
1668  */
1669 static bool
1670 _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
1671 {
1672         Buffer          buf;
1673         Page            page;
1674         BTPageOpaque opaque;
1675         bool            result;
1676
1677         /* Easy case: No left sibling */
1678         if (leftsib == P_NONE)
1679                 return false;
1680
1681         buf = _bt_getbuf(rel, leftsib, BT_READ);
1682         page = BufferGetPage(buf);
1683         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1684
1685         /*
1686          * If the left sibling was concurrently split, so that its next-pointer
1687          * doesn't point to the current page anymore, the split that created
1688          * target must be completed.  Caller can reasonably expect that there will
1689          * be a downlink to the target page that it can relocate using its stack.
1690          * (We don't allow splitting an incompletely split page again until the
1691          * previous split has been completed.)
1692          */
1693         result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1694         _bt_relbuf(rel, buf);
1695
1696         return result;
1697 }
1698
1699 /*
1700  * Check that leafrightsib page (the btpo_next of target leaf page) is not
1701  * marked with ISHALFDEAD flag.  Used during page deletion.
1702  *
1703  * Returning true indicates that page flag is set in leafrightsib, so page
1704  * deletion cannot go ahead.  Our caller is not prepared to deal with the case
1705  * where the parent page does not have a pivot tuples whose downlink points to
1706  * leafrightsib (due to an earlier interrupted VACUUM operation).  It doesn't
1707  * seem worth going to the trouble of teaching our caller to deal with it.
1708  * The situation will be resolved after VACUUM finishes the deletion of the
1709  * half-dead page (when a future VACUUM operation reaches the target page
1710  * again).
1711  *
1712  * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1713  * _bt_rightsib_halfdeadflag() is only called for leaf pages, though.  This is
1714  * okay because of the restriction on deleting pages that are the rightmost
1715  * page of their parent (i.e. that such deletions can only take place when the
1716  * entire subtree must be deleted).  The leaf level check made here will apply
1717  * to a right "cousin" leaf page rather than a simple right sibling leaf page
1718  * in cases where caller actually goes on to attempt deleting pages that are
1719  * above the leaf page.  The right cousin leaf page is representative of the
1720  * left edge of the subtree to the right of the to-be-deleted subtree as a
1721  * whole, which is exactly the condition that our caller cares about.
1722  * (Besides, internal pages are never marked half-dead, so it isn't even
1723  * possible to _directly_ assess if an internal page is part of some other
1724  * to-be-deleted subtree.)
1725  */
1726 static bool
1727 _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
1728 {
1729         Buffer          buf;
1730         Page            page;
1731         BTPageOpaque opaque;
1732         bool            result;
1733
1734         Assert(leafrightsib != P_NONE);
1735
1736         buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1737         page = BufferGetPage(buf);
1738         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1739
1740         Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1741         result = P_ISHALFDEAD(opaque);
1742         _bt_relbuf(rel, buf);
1743
1744         return result;
1745 }
1746
1747 /*
1748  * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1749  *
1750  * This action unlinks the leaf page from the b-tree structure, removing all
1751  * pointers leading to it --- but not touching its own left and right links.
1752  * The page cannot be physically reclaimed right away, since other processes
1753  * may currently be trying to follow links leading to the page; they have to
1754  * be allowed to use its right-link to recover.  See nbtree/README.
1755  *
1756  * On entry, the target buffer must be pinned and locked (either read or write
1757  * lock is OK).  The page must be an empty leaf page, which may be half-dead
1758  * already (a half-dead page should only be passed to us when an earlier
1759  * VACUUM operation was interrupted, though).  Note in particular that caller
1760  * should never pass a buffer containing an existing deleted page here.  The
1761  * lock and pin on caller's buffer will be dropped before we return.
1762  *
1763  * Maintains bulk delete stats for caller, which are taken from vstate.  We
1764  * need to cooperate closely with caller here so that whole VACUUM operation
1765  * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
1766  * delete in passing.  If such pages happen to be from a block number that is
1767  * ahead of the current scanblkno position, then caller is expected to count
1768  * them directly later on.  It's simpler for us to understand caller's
1769  * requirements than it would be for caller to understand when or how a
1770  * deleted page became deleted after the fact.
1771  *
1772  * NOTE: this leaks memory.  Rather than trying to clean up everything
1773  * carefully, it's better to run it in a temp context that can be reset
1774  * frequently.
1775  */
1776 void
1777 _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
1778 {
1779         BlockNumber rightsib;
1780         bool            rightsib_empty;
1781         Page            page;
1782         BTPageOpaque opaque;
1783
1784         /*
1785          * Save original leafbuf block number from caller.  Only deleted blocks
1786          * that are <= scanblkno are added to bulk delete stat's pages_deleted
1787          * count.
1788          */
1789         BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1790
1791         /*
1792          * "stack" is a search stack leading (approximately) to the target page.
1793          * It is initially NULL, but when iterating, we keep it to avoid
1794          * duplicated search effort.
1795          *
1796          * Also, when "stack" is not NULL, we have already checked that the
1797          * current page is not the right half of an incomplete split, i.e. the
1798          * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1799          * when the current target page is to the right of caller's initial page
1800          * (the scanblkno page).
1801          */
1802         BTStack         stack = NULL;
1803
1804         for (;;)
1805         {
1806                 page = BufferGetPage(leafbuf);
1807                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1808
1809                 /*
1810                  * Internal pages are never deleted directly, only as part of deleting
1811                  * the whole subtree all the way down to leaf level.
1812                  *
1813                  * Also check for deleted pages here.  Caller never passes us a fully
1814                  * deleted page.  Only VACUUM can delete pages, so there can't have
1815                  * been a concurrent deletion.  Assume that we reached any deleted
1816                  * page encountered here by following a sibling link, and that the
1817                  * index is corrupt.
1818                  */
1819                 Assert(!P_ISDELETED(opaque));
1820                 if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1821                 {
1822                         /*
1823                          * Pre-9.4 page deletion only marked internal pages as half-dead,
1824                          * but now we only use that flag on leaf pages. The old algorithm
1825                          * was never supposed to leave half-dead pages in the tree, it was
1826                          * just a transient state, but it was nevertheless possible in
1827                          * error scenarios. We don't know how to deal with them here. They
1828                          * are harmless as far as searches are considered, but inserts
1829                          * into the deleted keyspace could add out-of-order downlinks in
1830                          * the upper levels. Log a notice, hopefully the admin will notice
1831                          * and reindex.
1832                          */
1833                         if (P_ISHALFDEAD(opaque))
1834                                 ereport(LOG,
1835                                                 (errcode(ERRCODE_INDEX_CORRUPTED),
1836                                                  errmsg("index \"%s\" contains a half-dead internal page",
1837                                                                 RelationGetRelationName(rel)),
1838                                                  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1839
1840                         if (P_ISDELETED(opaque))
1841                                 ereport(LOG,
1842                                                 (errcode(ERRCODE_INDEX_CORRUPTED),
1843                                                  errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1844                                                                                  BufferGetBlockNumber(leafbuf),
1845                                                                                  scanblkno,
1846                                                                                  RelationGetRelationName(rel))));
1847
1848                         _bt_relbuf(rel, leafbuf);
1849                         return;
1850                 }
1851
1852                 /*
1853                  * We can never delete rightmost pages nor root pages.  While at it,
1854                  * check that page is empty, since it's possible that the leafbuf page
1855                  * was empty a moment ago, but has since had some inserts.
1856                  *
1857                  * To keep the algorithm simple, we also never delete an incompletely
1858                  * split page (they should be rare enough that this doesn't make any
1859                  * meaningful difference to disk usage):
1860                  *
1861                  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1862                  * left half of an incomplete split, but ensuring that it's not the
1863                  * right half is more complicated.  For that, we have to check that
1864                  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1865                  * _bt_leftsib_splitflag().  On the first iteration, we temporarily
1866                  * release the lock on scanblkno/leafbuf, check the left sibling, and
1867                  * construct a search stack to scanblkno.  On subsequent iterations,
1868                  * we know we stepped right from a page that passed these tests, so
1869                  * it's OK.
1870                  */
1871                 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1872                         P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1873                         P_INCOMPLETE_SPLIT(opaque))
1874                 {
1875                         /* Should never fail to delete a half-dead page */
1876                         Assert(!P_ISHALFDEAD(opaque));
1877
1878                         _bt_relbuf(rel, leafbuf);
1879                         return;
1880                 }
1881
1882                 /*
1883                  * First, remove downlink pointing to the page (or a parent of the
1884                  * page, if we are going to delete a taller subtree), and mark the
1885                  * leafbuf page half-dead
1886                  */
1887                 if (!P_ISHALFDEAD(opaque))
1888                 {
1889                         /*
1890                          * We need an approximate pointer to the page's parent page.  We
1891                          * use a variant of the standard search mechanism to search for
1892                          * the page's high key; this will give us a link to either the
1893                          * current parent or someplace to its left (if there are multiple
1894                          * equal high keys, which is possible with !heapkeyspace indexes).
1895                          *
1896                          * Also check if this is the right-half of an incomplete split
1897                          * (see comment above).
1898                          */
1899                         if (!stack)
1900                         {
1901                                 BTScanInsert itup_key;
1902                                 ItemId          itemid;
1903                                 IndexTuple      targetkey;
1904                                 BlockNumber leftsib,
1905                                                         leafblkno;
1906                                 Buffer          sleafbuf;
1907
1908                                 itemid = PageGetItemId(page, P_HIKEY);
1909                                 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1910
1911                                 leftsib = opaque->btpo_prev;
1912                                 leafblkno = BufferGetBlockNumber(leafbuf);
1913
1914                                 /*
1915                                  * To avoid deadlocks, we'd better drop the leaf page lock
1916                                  * before going further.
1917                                  */
1918                                 _bt_unlockbuf(rel, leafbuf);
1919
1920                                 /*
1921                                  * Check that the left sibling of leafbuf (if any) is not
1922                                  * marked with INCOMPLETE_SPLIT flag before proceeding
1923                                  */
1924                                 Assert(leafblkno == scanblkno);
1925                                 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1926                                 {
1927                                         ReleaseBuffer(leafbuf);
1928                                         return;
1929                                 }
1930
1931                                 /* we need an insertion scan key for the search, so build one */
1932                                 itup_key = _bt_mkscankey(rel, targetkey);
1933                                 /* find the leftmost leaf page with matching pivot/high key */
1934                                 itup_key->pivotsearch = true;
1935                                 stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
1936                                 /* won't need a second lock or pin on leafbuf */
1937                                 _bt_relbuf(rel, sleafbuf);
1938
1939                                 /*
1940                                  * Re-lock the leaf page, and start over to use our stack
1941                                  * within _bt_mark_page_halfdead.  We must do it that way
1942                                  * because it's possible that leafbuf can no longer be
1943                                  * deleted.  We need to recheck.
1944                                  *
1945                                  * Note: We can't simply hold on to the sleafbuf lock instead,
1946                                  * because it's barely possible that sleafbuf is not the same
1947                                  * page as leafbuf.  This happens when leafbuf split after our
1948                                  * original lock was dropped, but before _bt_search finished
1949                                  * its descent.  We rely on the assumption that we'll find
1950                                  * leafbuf isn't safe to delete anymore in this scenario.
1951                                  * (Page deletion can cope with the stack being to the left of
1952                                  * leafbuf, but not to the right of leafbuf.)
1953                                  */
1954                                 _bt_lockbuf(rel, leafbuf, BT_WRITE);
1955                                 continue;
1956                         }
1957
1958                         /*
1959                          * See if it's safe to delete the leaf page, and determine how
1960                          * many parent/internal pages above the leaf level will be
1961                          * deleted.  If it's safe then _bt_mark_page_halfdead will also
1962                          * perform the first phase of deletion, which includes marking the
1963                          * leafbuf page half-dead.
1964                          */
1965                         Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
1966                         if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
1967                         {
1968                                 _bt_relbuf(rel, leafbuf);
1969                                 return;
1970                         }
1971                 }
1972
1973                 /*
1974                  * Then unlink it from its siblings.  Each call to
1975                  * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
1976                  * making it shallower.  Iterate until the leafbuf page is deleted.
1977                  */
1978                 rightsib_empty = false;
1979                 Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
1980                 while (P_ISHALFDEAD(opaque))
1981                 {
1982                         /* Check for interrupts in _bt_unlink_halfdead_page */
1983                         if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
1984                                                                                   &rightsib_empty, vstate))
1985                         {
1986                                 /*
1987                                  * _bt_unlink_halfdead_page should never fail, since we
1988                                  * established that deletion is generally safe in
1989                                  * _bt_mark_page_halfdead -- index must be corrupt.
1990                                  *
1991                                  * Note that _bt_unlink_halfdead_page already released the
1992                                  * lock and pin on leafbuf for us.
1993                                  */
1994                                 Assert(false);
1995                                 return;
1996                         }
1997                 }
1998
1999                 Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2000
2001                 rightsib = opaque->btpo_next;
2002
2003                 _bt_relbuf(rel, leafbuf);
2004
2005                 /*
2006                  * Check here, as calling loops will have locks held, preventing
2007                  * interrupts from being processed.
2008                  */
2009                 CHECK_FOR_INTERRUPTS();
2010
2011                 /*
2012                  * The page has now been deleted. If its right sibling is completely
2013                  * empty, it's possible that the reason we haven't deleted it earlier
2014                  * is that it was the rightmost child of the parent. Now that we
2015                  * removed the downlink for this page, the right sibling might now be
2016                  * the only child of the parent, and could be removed. It would be
2017                  * picked up by the next vacuum anyway, but might as well try to
2018                  * remove it now, so loop back to process the right sibling.
2019                  *
2020                  * Note: This relies on the assumption that _bt_getstackbuf() will be
2021                  * able to reuse our original descent stack with a different child
2022                  * block (provided that the child block is to the right of the
2023                  * original leaf page reached by _bt_search()). It will even update
2024                  * the descent stack each time we loop around, avoiding repeated work.
2025                  */
2026                 if (!rightsib_empty)
2027                         break;
2028
2029                 leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2030         }
2031 }
2032
2033 /*
2034  * First stage of page deletion.
2035  *
2036  * Establish the height of the to-be-deleted subtree with leafbuf at its
2037  * lowest level, remove the downlink to the subtree, and mark leafbuf
2038  * half-dead.  The final to-be-deleted subtree is usually just leafbuf itself,
2039  * but may include additional internal pages (at most one per level of the
2040  * tree below the root).
2041  *
2042  * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
2043  * the rightmost child of its parent (and parent has more than one downlink).
2044  * Returns 'true' when the first stage of page deletion completed
2045  * successfully.
      *
      * 'false' is also returned, without touching any page, when the leaf's
      * right sibling is found to be half-dead.
      *
      * rel is the index relation.  leafbuf must be an empty leaf page that is
      * neither rightmost nor the root, and that is not marked ignorable
      * (!P_IGNORE); see the Assert at the top of the function.  stack is the
      * caller's descent stack, passed through to _bt_lock_subtree_parent().
      *
      * This function itself never unlocks or unpins leafbuf, whatever the
      * result.  The subtree parent buffer handed back by
      * _bt_lock_subtree_parent() is released before a 'true' return.
      *
      * Raises an ERROR (ERRCODE_INDEX_CORRUPTED), before any page is modified
      * here, when the pivot tuple following the subtree's downlink does not
      * point to the top parent's right sibling.
2046  */
2047 static bool
2048 _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
2049 {
2050         BlockNumber leafblkno;
2051         BlockNumber leafrightsib;
2052         BlockNumber topparent;
2053         BlockNumber topparentrightsib;
2054         ItemId          itemid;
2055         Page            page;
2056         BTPageOpaque opaque;
2057         Buffer          subtreeparent;
2058         OffsetNumber poffset;
2059         OffsetNumber nextoffset;
2060         IndexTuple      itup;
2061         IndexTupleData trunctuple;
2062
2063         page = BufferGetPage(leafbuf);
2064         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2065
2066         Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
2067                    P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2068                    P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2069
2070         /*
2071          * Save info about the leaf page.
2072          */
2073         leafblkno = BufferGetBlockNumber(leafbuf);
2074         leafrightsib = opaque->btpo_next;
2075
2076         /*
2077          * Before attempting to lock the parent page, check that the right sibling
2078          * is not in half-dead state.  A half-dead right sibling would have no
2079          * downlink in the parent, which would be highly confusing later when we
2080          * delete the downlink.  It would fail the "right sibling of target page
2081          * is also the next child in parent page" cross-check below.
2082          */
2083         if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2084         {
2085                 elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2086                          leafblkno, leafrightsib);
2087                 return false;
2088         }
2089
2090         /*
2091          * We cannot delete a page that is the rightmost child of its immediate
2092          * parent, unless it is the only child --- in which case the parent has to
2093          * be deleted too, and the same condition applies recursively to it. We
2094          * have to check this condition all the way up before trying to delete,
2095          * and lock the parent of the root of the to-be-deleted subtree (the
2096          * "subtree parent").  _bt_lock_subtree_parent() locks the subtree parent
2097          * for us.  We remove the downlink to the "top parent" page (subtree root
2098          * page) from the subtree parent page below.
2099          *
2100          * Initialize topparent to be leafbuf page now.  The final to-be-deleted
2101          * subtree is often a degenerate one page subtree consisting only of the
2102          * leafbuf page.  When that happens, the leafbuf page is the final subtree
2103          * root page/top parent page.
2104          */
2105         topparent = leafblkno;
2106         topparentrightsib = leafrightsib;
2107         if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
2108                                                                  &subtreeparent, &poffset,
2109                                                                  &topparent, &topparentrightsib))
2110                 return false;
2111
2112         /*
2113          * Check that the parent-page index items we're about to delete/overwrite
2114          * in subtree parent page contain what we expect.  This can fail if the
2115          * index has become corrupt for some reason.  We want to throw any error
2116          * before entering the critical section --- otherwise it'd be a PANIC.
2117          */
2118         page = BufferGetPage(subtreeparent);
2119         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2120
2121 #ifdef USE_ASSERT_CHECKING
2122
2123         /*
2124          * This is just an assertion because _bt_lock_subtree_parent should have
2125          * guaranteed tuple has the expected contents
2126          */
2127         itemid = PageGetItemId(page, poffset);
2128         itup = (IndexTuple) PageGetItem(page, itemid);
2129         Assert(BTreeTupleGetDownLink(itup) == topparent);
2130 #endif
2131
             /*
              * The pivot tuple right after poffset has to carry the downlink to the
              * top parent's right sibling: the critical section below copies that
              * downlink into poffset's tuple and then deletes the tuple at nextoffset.
              */
2132         nextoffset = OffsetNumberNext(poffset);
2133         itemid = PageGetItemId(page, nextoffset);
2134         itup = (IndexTuple) PageGetItem(page, itemid);
2135         if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2136                 ereport(ERROR,
2137                                 (errcode(ERRCODE_INDEX_CORRUPTED),
2138                                  errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2139                                                                  topparentrightsib, topparent,
2140                                                                  BTreeTupleGetDownLink(itup),
2141                                                                  BufferGetBlockNumber(subtreeparent),
2142                                                                  RelationGetRelationName(rel))));
2143
2144         /*
2145          * Any insert which would have gone on the leaf block will now go to its
2146          * right sibling.  In other words, the key space moves right.
2147          */
2148         PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2149
2150         /* No ereport(ERROR) until changes are logged */
2151         START_CRIT_SECTION();
2152
2153         /*
2154          * Update parent of subtree.  We want to delete the downlink to the top
2155          * parent page/root of the subtree, and the *following* key.  Easiest way
2156          * is to copy the right sibling's downlink over the downlink that points
2157          * to top parent page, and then delete the right sibling's original pivot
2158          * tuple.
2159          *
2160          * Lanin and Shasha make the key space move left when deleting a page,
2161          * whereas the key space moves right here.  That's why we cannot simply
2162          * delete the pivot tuple with the downlink to the top parent page.  See
2163          * nbtree/README.
2164          */
             /* (page has not been reassigned since the cross-checks above) */
2165         page = BufferGetPage(subtreeparent);
2166         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2167
2168         itemid = PageGetItemId(page, poffset);
2169         itup = (IndexTuple) PageGetItem(page, itemid);
2170         BTreeTupleSetDownLink(itup, topparentrightsib);
2171
2172         nextoffset = OffsetNumberNext(poffset);
2173         PageIndexTupleDelete(page, nextoffset);
2174
2175         /*
2176          * Mark the leaf page as half-dead, and stamp it with a link to the top
2177          * parent page.  When the leaf page is also the top parent page, the link
2178          * is set to InvalidBlockNumber.
2179          */
2180         page = BufferGetPage(leafbuf);
2181         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2182         opaque->btpo_flags |= BTP_HALF_DEAD;
2183
2184         Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
             /*
              * Build the replacement high key in trunctuple: a zeroed tuple header
              * whose t_info holds just the header size, carrying only the top parent
              * link that is set next.
              */
2185         MemSet(&trunctuple, 0, sizeof(IndexTupleData));
2186         trunctuple.t_info = sizeof(IndexTupleData);
2187         if (topparent != leafblkno)
2188                 BTreeTupleSetTopParent(&trunctuple, topparent);
2189         else
2190                 BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
2191
             /*
              * NOTE: we are already inside the critical section here, so (as the
              * comment ahead of the cross-checks says) an ERROR raised at this
              * point would be a PANIC.
              */
2192         if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
2193                                                                  IndexTupleSize(&trunctuple)))
2194                 elog(ERROR, "could not overwrite high key in half-dead page");
2195
2196         /* Must mark buffers dirty before XLogInsert */
2197         MarkBufferDirty(subtreeparent);
2198         MarkBufferDirty(leafbuf);
2199
2200         /* XLOG stuff */
2201         if (RelationNeedsWAL(rel))
2202         {
2203                 xl_btree_mark_page_halfdead xlrec;
2204                 XLogRecPtr      recptr;
2205
2206                 xlrec.poffset = poffset;
2207                 xlrec.leafblk = leafblkno;
                     /* topparent is logged with the same encoding as the leaf's new high key */
2208                 if (topparent != leafblkno)
2209                         xlrec.topparent = topparent;
2210                 else
2211                         xlrec.topparent = InvalidBlockNumber;
2212
2213                 XLogBeginInsert();
                     /* block 0 is the leaf page, block 1 is the subtree parent */
2214                 XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
2215                 XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2216
                     /*
                      * Log the leaf's sibling links as well.  NOTE(review): presumably
                      * required because REGBUF_WILL_INIT means the leaf page is rebuilt
                      * during replay -- confirm against the redo routine.
                      */
2217                 page = BufferGetPage(leafbuf);
2218                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2219                 xlrec.leftblk = opaque->btpo_prev;
2220                 xlrec.rightblk = opaque->btpo_next;
2221
2222                 XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);
2223
2224                 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2225
                     /* both modified pages are stamped with this record's LSN */
2226                 page = BufferGetPage(subtreeparent);
2227                 PageSetLSN(page, recptr);
2228                 page = BufferGetPage(leafbuf);
2229                 PageSetLSN(page, recptr);
2230         }
2231
2232         END_CRIT_SECTION();
2233
             /* done with the subtree parent; leafbuf is left for the caller */
2234         _bt_relbuf(rel, subtreeparent);
2235         return true;
2236 }
2237
2238 /*
2239  * Second stage of page deletion.
2240  *
2241  * Unlinks a single page (in the subtree undergoing deletion) from its
2242  * siblings.  Also marks the page deleted.
2243  *
2244  * To get rid of the whole subtree, including the leaf page itself, call here
2245  * until the leaf page is deleted.  The original "top parent" established in
2246  * the first stage of deletion is deleted in the first call here, while the
2247  * leaf page is deleted in the last call here.  Note that the leaf page itself
2248  * is often the initial top parent page.
2249  *
2250  * Returns 'false' if the page could not be unlinked (shouldn't happen).  If
2251  * the right sibling of the current target page is empty, *rightsib_empty is
2252  * set to true, allowing caller to delete the target's right sibling page in
2253  * passing.  Note that *rightsib_empty is only actually used by caller when
2254  * target page is leafbuf, following last call here for leafbuf/the subtree
2255  * containing leafbuf.  (We always set *rightsib_empty for caller, just to be
2256  * consistent.)
2257  *
2258  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
2259  * On success exit, we'll be holding pin and write lock.  On failure exit,
2260  * we'll release both pin and lock before returning (we define it that way
2261  * to avoid having to reacquire a lock we already released).
2262  */
2263 static bool
2264 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
2265                                                  bool *rightsib_empty, BTVacState *vstate)
2266 {
2267         BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2268         IndexBulkDeleteResult *stats = vstate->stats;
2269         BlockNumber leafleftsib;
2270         BlockNumber leafrightsib;
2271         BlockNumber target;
2272         BlockNumber leftsib;
2273         BlockNumber rightsib;
2274         Buffer          lbuf = InvalidBuffer;
2275         Buffer          buf;
2276         Buffer          rbuf;
2277         Buffer          metabuf = InvalidBuffer;
2278         Page            metapg = NULL;
2279         BTMetaPageData *metad = NULL;
2280         ItemId          itemid;
2281         Page            page;
2282         BTPageOpaque opaque;
2283         FullTransactionId safexid;
2284         bool            rightsib_is_rightmost;
2285         uint32          targetlevel;
2286         IndexTuple      leafhikey;
2287         BlockNumber leaftopparent;
2288
2289         page = BufferGetPage(leafbuf);
2290         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2291
2292         Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2293
2294         /*
2295          * Remember some information about the leaf page.
2296          */
2297         itemid = PageGetItemId(page, P_HIKEY);
2298         leafhikey = (IndexTuple) PageGetItem(page, itemid);
2299         target = BTreeTupleGetTopParent(leafhikey);
2300         leafleftsib = opaque->btpo_prev;
2301         leafrightsib = opaque->btpo_next;
2302
2303         _bt_unlockbuf(rel, leafbuf);
2304
2305         /*
2306          * Check here, as calling loops will have locks held, preventing
2307          * interrupts from being processed.
2308          */
2309         CHECK_FOR_INTERRUPTS();
2310
2311         /* Unlink the current top parent of the subtree */
2312         if (!BlockNumberIsValid(target))
2313         {
2314                 /* Target is leaf page (or leaf page is top parent, if you prefer) */
2315                 target = leafblkno;
2316
2317                 buf = leafbuf;
2318                 leftsib = leafleftsib;
2319                 targetlevel = 0;
2320         }
2321         else
2322         {
2323                 /* Target is the internal page taken from leaf's top parent link */
2324                 Assert(target != leafblkno);
2325
2326                 /* Fetch the block number of the target's left sibling */
2327                 buf = _bt_getbuf(rel, target, BT_READ);
2328                 page = BufferGetPage(buf);
2329                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2330                 leftsib = opaque->btpo_prev;
2331                 targetlevel = opaque->btpo_level;
2332                 Assert(targetlevel > 0);
2333
2334                 /*
2335                  * To avoid deadlocks, we'd better drop the target page lock before
2336                  * going further.
2337                  */
2338                 _bt_unlockbuf(rel, buf);
2339         }
2340
2341         /*
2342          * We have to lock the pages we need to modify in the standard order:
2343          * moving right, then up.  Else we will deadlock against other writers.
2344          *
2345          * So, first lock the leaf page, if it's not the target.  Then find and
2346          * write-lock the current left sibling of the target page.  The sibling
2347          * that was current a moment ago could have split, so we may have to move
2348          * right.
2349          */
2350         if (target != leafblkno)
2351                 _bt_lockbuf(rel, leafbuf, BT_WRITE);
2352         if (leftsib != P_NONE)
2353         {
2354                 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2355                 page = BufferGetPage(lbuf);
2356                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2357                 while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2358                 {
2359                         bool    leftsibvalid = true;
2360
2361                         /*
2362                          * Before we follow the link from the page that was the left
2363                          * sibling mere moments ago, validate its right link.  This
2364                          * reduces the opportunities for loop to fail to ever make any
2365                          * progress in the presence of index corruption.
2366                          *
2367                          * Note: we rely on the assumption that there can only be one
2368                          * vacuum process running at a time (against the same index).
2369                          */
2370                         if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2371                                 leftsib == opaque->btpo_next)
2372                                 leftsibvalid = false;
2373
2374                         leftsib = opaque->btpo_next;
2375                         _bt_relbuf(rel, lbuf);
2376
2377                         if (!leftsibvalid)
2378                         {
2379                                 if (target != leafblkno)
2380                                 {
2381                                         /* we have only a pin on target, but pin+lock on leafbuf */
2382                                         ReleaseBuffer(buf);
2383                                         _bt_relbuf(rel, leafbuf);
2384                                 }
2385                                 else
2386                                 {
2387                                         /* we have only a pin on leafbuf */
2388                                         ReleaseBuffer(leafbuf);
2389                                 }
2390
2391                                 ereport(LOG,
2392                                                 (errcode(ERRCODE_INDEX_CORRUPTED),
2393                                                  errmsg_internal("valid left sibling for deletion target could not be located: "
2394                                                                                  "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"",
2395                                                                                  leftsib, target, leafblkno, scanblkno,
2396                                                                                  RelationGetRelationName(rel))));
2397
2398                                 return false;
2399                         }
2400
2401                         CHECK_FOR_INTERRUPTS();
2402
2403                         /* step right one page */
2404                         lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2405                         page = BufferGetPage(lbuf);
2406                         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2407                 }
2408         }
2409         else
2410                 lbuf = InvalidBuffer;
2411
2412         /* Next write-lock the target page itself */
2413         _bt_lockbuf(rel, buf, BT_WRITE);
2414         page = BufferGetPage(buf);
2415         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2416
2417         /*
2418          * Check page is still empty etc, else abandon deletion.  This is just for
2419          * paranoia's sake; a half-dead page cannot resurrect because there can be
2420          * only one vacuum process running at a time.
2421          */
2422         if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2423                 elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2424                          target, RelationGetRelationName(rel));
2425
2426         if (opaque->btpo_prev != leftsib)
2427                 ereport(ERROR,
2428                                 (errcode(ERRCODE_INDEX_CORRUPTED),
2429                                  errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2430                                                                  leftsib, opaque->btpo_prev, target,
2431                                                                  RelationGetRelationName(rel))));
2432
2433         if (target == leafblkno)
2434         {
2435                 if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2436                         !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2437                         elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2438                                  target, RelationGetRelationName(rel));
2439
2440                 /* Leaf page is also target page: don't set leaftopparent */
2441                 leaftopparent = InvalidBlockNumber;
2442         }
2443         else
2444         {
2445                 IndexTuple      finaldataitem;
2446
2447                 if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2448                         P_ISLEAF(opaque))
2449                         elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2450                                  targetlevel, target, RelationGetRelationName(rel));
2451
2452                 /* Target is internal: set leaftopparent for next call here...  */
2453                 itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2454                 finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2455                 leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2456                 /* ...except when it would be a redundant pointer-to-self */
2457                 if (leaftopparent == leafblkno)
2458                         leaftopparent = InvalidBlockNumber;
2459         }
2460
2461         /*
2462          * And next write-lock the (current) right sibling.
2463          */
2464         rightsib = opaque->btpo_next;
2465         rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2466         page = BufferGetPage(rbuf);
2467         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2468         if (opaque->btpo_prev != target)
2469                 ereport(ERROR,
2470                                 (errcode(ERRCODE_INDEX_CORRUPTED),
2471                                  errmsg_internal("right sibling's left-link doesn't match: "
2472                                                                  "block %u links to %u instead of expected %u in index \"%s\"",
2473                                                                  rightsib, opaque->btpo_prev, target,
2474                                                                  RelationGetRelationName(rel))));
2475         rightsib_is_rightmost = P_RIGHTMOST(opaque);
2476         *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2477
2478         /*
2479          * If we are deleting the next-to-last page on the target's level, then
2480          * the rightsib is a candidate to become the new fast root. (In theory, it
2481          * might be possible to push the fast root even further down, but the odds
2482          * of doing so are slim, and the locking considerations daunting.)
2483          *
2484          * We can safely acquire a lock on the metapage here --- see comments for
2485          * _bt_newroot().
2486          */
2487         if (leftsib == P_NONE && rightsib_is_rightmost)
2488         {
2489                 page = BufferGetPage(rbuf);
2490                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2491                 if (P_RIGHTMOST(opaque))
2492                 {
2493                         /* rightsib will be the only one left on the level */
2494                         metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2495                         metapg = BufferGetPage(metabuf);
2496                         metad = BTPageGetMeta(metapg);
2497
2498                         /*
2499                          * The expected case here is btm_fastlevel == targetlevel+1; if
2500                          * the fastlevel is <= targetlevel, something is wrong, and we
2501                          * choose to overwrite it to fix it.
2502                          */
2503                         if (metad->btm_fastlevel > targetlevel + 1)
2504                         {
2505                                 /* no update wanted */
2506                                 _bt_relbuf(rel, metabuf);
2507                                 metabuf = InvalidBuffer;
2508                         }
2509                 }
2510         }
2511
2512         /*
2513          * Here we begin doing the deletion.
2514          */
2515
2516         /* No ereport(ERROR) until changes are logged */
2517         START_CRIT_SECTION();
2518
2519         /*
2520          * Update siblings' side-links.  Note the target page's side-links will
2521          * continue to point to the siblings.  Asserts here are just rechecking
2522          * things we already verified above.
2523          */
2524         if (BufferIsValid(lbuf))
2525         {
2526                 page = BufferGetPage(lbuf);
2527                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2528                 Assert(opaque->btpo_next == target);
2529                 opaque->btpo_next = rightsib;
2530         }
2531         page = BufferGetPage(rbuf);
2532         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2533         Assert(opaque->btpo_prev == target);
2534         opaque->btpo_prev = leftsib;
2535
2536         /*
2537          * If we deleted a parent of the targeted leaf page, instead of the leaf
2538          * itself, update the leaf to point to the next remaining child in the
2539          * subtree.
2540          *
2541          * Note: We rely on the fact that a buffer pin on the leaf page has been
2542          * held since leafhikey was initialized.  This is safe, though only
2543          * because the page was already half-dead at that point.  The leaf page
2544          * cannot have been modified by any other backend during the period when
2545          * no lock was held.
2546          */
2547         if (target != leafblkno)
2548                 BTreeTupleSetTopParent(leafhikey, leaftopparent);
2549
2550         /*
2551          * Mark the page itself deleted.  It can be recycled when all current
2552          * transactions are gone.  Storing GetTopTransactionId() would work, but
2553          * we're in VACUUM and would not otherwise have an XID.  Having already
2554          * updated links to the target, ReadNextFullTransactionId() suffices as an
2555          * upper bound.  Any scan having retained a now-stale link is advertising
2556          * in its PGPROC an xmin less than or equal to the value we read here.  It
2557          * will continue to do so, holding back the xmin horizon, for the duration
2558          * of that scan.
2559          */
2560         page = BufferGetPage(buf);
2561         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2562         Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2563
2564         /*
2565          * Store upper bound XID that's used to determine when deleted page is no
2566          * longer needed as a tombstone
2567          */
2568         safexid = ReadNextFullTransactionId();
2569         BTPageSetDeleted(page, safexid);
2570         opaque->btpo_cycleid = 0;
2571
2572         /* And update the metapage, if needed */
2573         if (BufferIsValid(metabuf))
2574         {
2575                 /* upgrade metapage if needed */
2576                 if (metad->btm_version < BTREE_NOVAC_VERSION)
2577                         _bt_upgrademetapage(metapg);
2578                 metad->btm_fastroot = rightsib;
2579                 metad->btm_fastlevel = targetlevel;
2580                 MarkBufferDirty(metabuf);
2581         }
2582
2583         /* Must mark buffers dirty before XLogInsert */
2584         MarkBufferDirty(rbuf);
2585         MarkBufferDirty(buf);
2586         if (BufferIsValid(lbuf))
2587                 MarkBufferDirty(lbuf);
2588         if (target != leafblkno)
2589                 MarkBufferDirty(leafbuf);
2590
2591         /* XLOG stuff */
2592         if (RelationNeedsWAL(rel))
2593         {
2594                 xl_btree_unlink_page xlrec;
2595                 xl_btree_metadata xlmeta;
2596                 uint8           xlinfo;
2597                 XLogRecPtr      recptr;
2598
2599                 XLogBeginInsert();
2600
2601                 XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2602                 if (BufferIsValid(lbuf))
2603                         XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2604                 XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2605                 if (target != leafblkno)
2606                         XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2607
2608                 /* information stored on the target/to-be-unlinked block */
2609                 xlrec.leftsib = leftsib;
2610                 xlrec.rightsib = rightsib;
2611                 xlrec.level = targetlevel;
2612                 xlrec.safexid = safexid;
2613
2614                 /* information needed to recreate the leaf block (if not the target) */
2615                 xlrec.leafleftsib = leafleftsib;
2616                 xlrec.leafrightsib = leafrightsib;
2617                 xlrec.leaftopparent = leaftopparent;
2618
2619                 XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
2620
2621                 if (BufferIsValid(metabuf))
2622                 {
2623                         XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2624
2625                         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2626                         xlmeta.version = metad->btm_version;
2627                         xlmeta.root = metad->btm_root;
2628                         xlmeta.level = metad->btm_level;
2629                         xlmeta.fastroot = metad->btm_fastroot;
2630                         xlmeta.fastlevel = metad->btm_fastlevel;
2631                         xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2632                         xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
2633                         xlmeta.allequalimage = metad->btm_allequalimage;
2634
2635                         XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
2636                         xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2637                 }
2638                 else
2639                         xlinfo = XLOG_BTREE_UNLINK_PAGE;
2640
2641                 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2642
2643                 if (BufferIsValid(metabuf))
2644                 {
2645                         PageSetLSN(metapg, recptr);
2646                 }
2647                 page = BufferGetPage(rbuf);
2648                 PageSetLSN(page, recptr);
2649                 page = BufferGetPage(buf);
2650                 PageSetLSN(page, recptr);
2651                 if (BufferIsValid(lbuf))
2652                 {
2653                         page = BufferGetPage(lbuf);
2654                         PageSetLSN(page, recptr);
2655                 }
2656                 if (target != leafblkno)
2657                 {
2658                         page = BufferGetPage(leafbuf);
2659                         PageSetLSN(page, recptr);
2660                 }
2661         }
2662
2663         END_CRIT_SECTION();
2664
2665         /* release metapage */
2666         if (BufferIsValid(metabuf))
2667                 _bt_relbuf(rel, metabuf);
2668
2669         /* release siblings */
2670         if (BufferIsValid(lbuf))
2671                 _bt_relbuf(rel, lbuf);
2672         _bt_relbuf(rel, rbuf);
2673
2674         /* If the target is not leafbuf, we're done with it now -- release it */
2675         if (target != leafblkno)
2676                 _bt_relbuf(rel, buf);
2677
2678         /*
2679          * Maintain pages_newly_deleted, which is simply the number of pages
2680          * deleted by the ongoing VACUUM operation.
2681          *
2682          * Maintain pages_deleted in a way that takes into account how
2683          * btvacuumpage() will count deleted pages that have yet to become
2684          * scanblkno -- only count page when it's not going to get that treatment
2685          * later on.
2686          */
2687         stats->pages_newly_deleted++;
2688         if (target <= scanblkno)
2689                 stats->pages_deleted++;
2690
2691         return true;
2692 }
2693
2694 /*
2695  * Establish how tall the to-be-deleted subtree will be during the first stage
2696  * of page deletion.
2697  *
2698  * Caller's child argument is the block number of the page caller wants to
2699  * delete (this is leafbuf's block number, except when we're called
2700  * recursively).  stack is a search stack leading to it.  Note that we will
2701  * update the stack entry(s) to reflect current downlink positions --- this is
2702  * similar to the corresponding point in page split handling.
2703  *
2704  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
2705  * false.  Returns true on success, in which case caller can use certain
2706  * details established here to perform the first stage of deletion.  This
2707  * function is the last point at which page deletion may be deemed unsafe
2708  * (barring index corruption, or unexpected concurrent page deletions).
2709  *
2710  * We write lock the parent of the root of the to-be-deleted subtree for
2711  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
2712  * caller).  Caller will have to remove a downlink from *subtreeparent.  We
2713  * also set a *subtreeparent offset number in *poffset, to indicate the
2714  * location of the pivot tuple that contains the relevant downlink.
2715  *
2716  * The root of the to-be-deleted subtree is called the "top parent".  Note
2717  * that the leafbuf page is often the final "top parent" page (you can think
2718  * of the leafbuf page as a degenerate single page subtree when that happens).
2719  * Caller should initialize *topparent to the target leafbuf page block number
2720  * (while *topparentrightsib should be set to leafbuf's right sibling block
2721  * number).  We will update *topparent (and *topparentrightsib) for caller
2722  * here, though only when it turns out that caller will delete at least one
2723  * internal page (i.e. only when caller needs to store a valid link to the top
2724  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
2725  */
2726 static bool
2727 _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
2728                                                 Buffer *subtreeparent, OffsetNumber *poffset,
2729                                                 BlockNumber *topparent, BlockNumber *topparentrightsib)
2730 {
2731         BlockNumber parent,
2732                                 leftsibparent;
2733         OffsetNumber parentoffset,
2734                                 maxoff;
2735         Buffer          pbuf;
2736         Page            page;
2737         BTPageOpaque opaque;
2738
2739         /*
2740          * Locate the pivot tuple whose downlink points to "child".  Write lock
2741          * the parent page itself.
2742          */
2743         pbuf = _bt_getstackbuf(rel, stack, child);
2744         if (pbuf == InvalidBuffer)
2745                 ereport(ERROR,
2746                                 (errcode(ERRCODE_INDEX_CORRUPTED),
2747                                  errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2748                                                                  RelationGetRelationName(rel), child)));
2749         parent = stack->bts_blkno;
2750         parentoffset = stack->bts_offset;
2751
2752         page = BufferGetPage(pbuf);
2753         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2754         maxoff = PageGetMaxOffsetNumber(page);
2755         leftsibparent = opaque->btpo_prev;
2756
2757         /*
2758          * _bt_getstackbuf() completes page splits on returned parent buffer when
2759          * required.
2760          *
2761          * In general it's a bad idea for VACUUM to use up more disk space, which
2762          * is why page deletion does not finish incomplete page splits most of the
2763          * time.  We allow this limited exception because the risk is much lower,
2764          * and the potential downside of not proceeding is much higher:  A single
2765          * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2766          * prevent us from deleting hundreds of empty leaf pages from one level
2767          * down.
2768          */
2769         Assert(!P_INCOMPLETE_SPLIT(opaque));
2770
2771         if (parentoffset < maxoff)
2772         {
2773                 /*
2774                  * Child is not the rightmost child in parent, so it's safe to delete
2775                  * the subtree whose root/topparent is child page
2776                  */
2777                 *subtreeparent = pbuf;
2778                 *poffset = parentoffset;
2779                 return true;
2780         }
2781
2782         /*
2783          * Child is the rightmost child of parent.
2784          *
2785          * Since it's the rightmost child of parent, deleting the child (or
2786          * deleting the subtree whose root/topparent is the child page) is only
2787          * safe when it's also possible to delete the parent.
2788          */
2789         Assert(parentoffset == maxoff);
2790         if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2791         {
2792                 /*
2793                  * Child isn't parent's only child, or parent is rightmost on its
2794                  * entire level.  Definitely cannot delete any pages.
2795                  */
2796                 _bt_relbuf(rel, pbuf);
2797                 return false;
2798         }
2799
2800         /*
2801          * Now make sure that the parent deletion is itself safe by examining the
2802          * child's grandparent page.  Recurse, passing the parent page as the
2803          * child page (child's grandparent is the parent on the next level up). If
2804          * parent deletion is unsafe, then child deletion must also be unsafe (in
2805          * which case caller cannot delete any pages at all).
2806          */
2807         *topparent = parent;
2808         *topparentrightsib = opaque->btpo_next;
2809
2810         /*
2811          * Release lock on parent before recursing.
2812          *
2813          * It's OK to release page locks on parent before recursive call locks
2814          * grandparent.  An internal page can only acquire an entry if the child
2815          * is split, but that cannot happen as long as we still hold a lock on the
2816          * leafbuf page.
2817          */
2818         _bt_relbuf(rel, pbuf);
2819
2820         /*
2821          * Before recursing, check that the left sibling of parent (if any) is not
2822          * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2823          * parent lock).
2824          *
2825          * Note: We deliberately avoid completing incomplete splits here.
2826          */
2827         if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2828                 return false;
2829
2830         /* Recurse to examine child page's grandparent page */
2831         return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
2832                                                                    subtreeparent, poffset,
2833                                                                    topparent, topparentrightsib);
2834 }