/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *	  BTree-specific page management code for the Postgres btree access
 *	  method.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *	NOTES
 *	   Postgres btree pages look like ordinary relation pages.  The opaque
 *	   data at high addresses includes pointers to left and right siblings
 *	   and flag data describing page state.  The first page in a btree, page
 *	   zero, is special -- it stores meta-information describing the tree.
 *	   Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
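
/*
 * Illustrative sketch (not part of the original sources): a btree page as
 * described above, assuming the standard PageHeaderData and BTPageOpaqueData
 * declarations from bufpage.h and nbtree.h:
 *
 *	+----------------+--------------------+ free +-------+------------------+
 *	| PageHeaderData | item pointer array | space| items | BTPageOpaqueData |
 *	+----------------+--------------------+      +-------+------------------+
 *
 * pd_lower ends the item pointer array, pd_upper begins the items, and the
 * "special space" at the high end holds btpo_prev, btpo_next, btpo.level and
 * btpo_flags.
 */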

#include "postgres.h"

#include "access/nbtree.h"
#include "access/transam.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "utils/inval.h"
#include "utils/snapmgr.h"

/*
 *	_bt_initmetapage() -- Fill a page buffer with a correct metapage image
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
{
	BTMetaPageData *metad;
	BTPageOpaque metaopaque;

	_bt_pageinit(page, BLCKSZ);

	metad = BTPageGetMeta(page);
	metad->btm_magic = BTREE_MAGIC;
	metad->btm_version = BTREE_VERSION;
	metad->btm_root = rootbknum;
	metad->btm_level = level;
	metad->btm_fastroot = rootbknum;
	metad->btm_fastlevel = level;

	metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
	metaopaque->btpo_flags = BTP_META;

	/*
	 * Set pd_lower just past the end of the metadata.  This is not essential
	 * but it makes the page look compressible to xlog.c.
	 */
	((PageHeader) page)->pd_lower =
		((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
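
/*
 * Usage sketch (illustrative only, not from the original file): index build
 * code typically fills a private buffer and writes it out itself, e.g.
 *
 *		metapage = (Page) palloc(BLCKSZ);
 *		_bt_initmetapage(metapage, P_NONE, 0);
 *
 * where P_NONE/0 means "no root yet"; _bt_getroot() creates the real root
 * on first use.
 */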

/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.  The
 *		standard class of race conditions exists here; I think I covered
 *		them all in the Hopi Indian rain dance of lock requests below.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.  For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		Also, if the root page is split while we are "in flight" to it,
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
 *		insist on finding the true root.  We do, however, guarantee to
 *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
	Buffer		metabuf;
	Page		metapg;
	BTPageOpaque metaopaque;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	uint32		rootlevel;
	BTMetaPageData *metad;

	/*
	 * Try to use previously-cached metapage data to find the root.  This
	 * normally saves one buffer access per index search, which is a very
	 * helpful savings in bufmgr traffic and hence contention.
	 */
	if (rel->rd_amcache != NULL)
	{
		metad = (BTMetaPageData *) rel->rd_amcache;
		/* We shouldn't have cached it if any of these fail */
		Assert(metad->btm_magic == BTREE_MAGIC);
		Assert(metad->btm_version == BTREE_VERSION);
		Assert(metad->btm_root != P_NONE);

		rootblkno = metad->btm_fastroot;
		Assert(rootblkno != P_NONE);
		rootlevel = metad->btm_fastlevel;

		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

		/*
		 * Since the cache might be stale, we check the page more carefully
		 * here than normal.  We *must* check that it's not deleted. If it's
		 * not alone on its level, then we reject too --- this may be overly
		 * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
		 * because that's not set in a "fast root".
		 */
		if (!P_IGNORE(rootopaque) &&
			rootopaque->btpo.level == rootlevel &&
			P_LEFTMOST(rootopaque) &&
			P_RIGHTMOST(rootopaque))
		{
			/* OK, accept cached page as the root */
			return rootbuf;
		}
		_bt_relbuf(rel, rootbuf);
		/* Cache is stale, throw it away */
		if (rel->rd_amcache)
			pfree(rel->rd_amcache);
		rel->rd_amcache = NULL;
	}

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	metad = BTPageGetMeta(metapg);

	/* sanity-check the metapage */
	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	/* if no root page initialized yet, do it */
	if (metad->btm_root == P_NONE)
	{
		/* If access = BT_READ, caller doesn't want us to create root yet */
		if (access == BT_READ)
		{
			_bt_relbuf(rel, metabuf);
			return InvalidBuffer;
		}

		/* trade in our read lock for a write lock */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(metabuf, BT_WRITE);

		/*
		 * Race condition:  if someone else initialized the metadata between
		 * the time we released the read lock and acquired the write lock, we
		 * must avoid doing it again.
		 */
		if (metad->btm_root != P_NONE)
		{
			/*
			 * Metadata initialized by someone else.  In order to guarantee
			 * no deadlocks, we have to release the metadata page and start
			 * all over again.  (Is that really true? But it's hardly worth
			 * trying to optimize this case.)
			 */
			_bt_relbuf(rel, metabuf);
			return _bt_getroot(rel, access);
		}

		/*
		 * Get, initialize, write, and leave a lock of the appropriate type on
		 * the new root page.  Since this is the first page in the tree, it's
		 * a leaf as well as the root.
		 */
		rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
		rootblkno = BufferGetBlockNumber(rootbuf);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
		rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
		rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
		rootopaque->btpo.level = 0;
		rootopaque->btpo_cycleid = 0;

		/* NO ELOG(ERROR) till meta is updated */
		START_CRIT_SECTION();

		metad->btm_root = rootblkno;
		metad->btm_level = 0;
		metad->btm_fastroot = rootblkno;
		metad->btm_fastlevel = 0;

		MarkBufferDirty(rootbuf);
		MarkBufferDirty(metabuf);

		/* XLOG stuff */
		if (!rel->rd_istemp)
		{
			xl_btree_newroot xlrec;
			XLogRecPtr	recptr;
			XLogRecData rdata;

			xlrec.node = rel->rd_node;
			xlrec.rootblk = rootblkno;
			xlrec.level = 0;

			rdata.data = (char *) &xlrec;
			rdata.len = SizeOfBtreeNewroot;
			rdata.buffer = InvalidBuffer;
			rdata.next = NULL;

			recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);

			PageSetLSN(rootpage, recptr);
			PageSetTLI(rootpage, ThisTimeLineID);
			PageSetLSN(metapg, recptr);
			PageSetTLI(metapg, ThisTimeLineID);
		}

		END_CRIT_SECTION();

		/*
		 * Send out relcache inval for metapage change (probably unnecessary
		 * here, but let's be safe).
		 */
		CacheInvalidateRelcache(rel);

		/*
		 * swap root write lock for read lock.  There is no danger of anyone
		 * else accessing the new root page while it's unlocked, since no one
		 * else knows where it is yet.
		 */
		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(rootbuf, BT_READ);

		/* okay, metadata is correct, release lock on it */
		_bt_relbuf(rel, metabuf);
	}
	else
	{
		rootblkno = metad->btm_fastroot;
		Assert(rootblkno != P_NONE);
		rootlevel = metad->btm_fastlevel;

		/*
		 * Cache the metapage data for next time
		 */
		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
											 sizeof(BTMetaPageData));
		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

		/*
		 * We are done with the metapage; arrange to release it via first
		 * _bt_relandgetbuf call
		 */
		rootbuf = metabuf;

		for (;;)
		{
			rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
			rootpage = BufferGetPage(rootbuf);
			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

			if (!P_IGNORE(rootopaque))
				break;

			/* it's dead, Jim.  step right one page */
			if (P_RIGHTMOST(rootopaque))
				elog(ERROR, "no live root page found in index \"%s\"",
					 RelationGetRelationName(rel));
			rootblkno = rootopaque->btpo_next;
		}

		/* Note: can't check btpo.level on deleted pages */
		if (rootopaque->btpo.level != rootlevel)
			elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
				 rootblkno, RelationGetRelationName(rel),
				 rootopaque->btpo.level, rootlevel);
	}

	/*
	 * By here, we have a pin and read lock on the root page, and no lock set
	 * on the metadata page.  Return the root page's buffer.
	 */
	return rootbuf;
}
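
/*
 * Caller sketch (illustrative only): a typical read-only caller does
 *
 *		rootbuf = _bt_getroot(rel, BT_READ);
 *		if (!BufferIsValid(rootbuf))
 *			return ...;				/* index is empty, nothing to scan */
 *		... descend from the pinned, read-locked root ...
 *		_bt_relbuf(rel, rootbuf);
 */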

/*
 *	_bt_gettrueroot() -- Get the true root page of the btree.
 *
 *		This is the same as the BT_READ case of _bt_getroot(), except
 *		we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
	Buffer		metabuf;
	Page		metapg;
	BTPageOpaque metaopaque;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	uint32		rootlevel;
	BTMetaPageData *metad;

	/*
	 * We don't try to use cached metapage data here, since (a) this path is
	 * not performance-critical, and (b) if we are here it suggests our cache
	 * is out-of-date anyway.  In light of point (b), it's probably safest to
	 * actively flush any cached metapage info.
	 */
	if (rel->rd_amcache)
		pfree(rel->rd_amcache);
	rel->rd_amcache = NULL;

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	metad = BTPageGetMeta(metapg);

	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	/* if no root page initialized yet, fail */
	if (metad->btm_root == P_NONE)
	{
		_bt_relbuf(rel, metabuf);
		return InvalidBuffer;
	}

	rootblkno = metad->btm_root;
	rootlevel = metad->btm_level;

	/*
	 * We are done with the metapage; arrange to release it via first
	 * _bt_relandgetbuf call
	 */
	rootbuf = metabuf;

	for (;;)
	{
		rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

		if (!P_IGNORE(rootopaque))
			break;

		/* it's dead, Jim.  step right one page */
		if (P_RIGHTMOST(rootopaque))
			elog(ERROR, "no live root page found in index \"%s\"",
				 RelationGetRelationName(rel));
		rootblkno = rootopaque->btpo_next;
	}

	/* Note: can't check btpo.level on deleted pages */
	if (rootopaque->btpo.level != rootlevel)
		elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
			 rootblkno, RelationGetRelationName(rel),
			 rootopaque->btpo.level, rootlevel);

	return rootbuf;
}
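
/*
 * Illustrative note (not in the original sources): callers of
 * _bt_gettrueroot are those that must climb the tree from a known level, so
 * a "fast root" below that level would not do; they treat an InvalidBuffer
 * result as "index is empty".
 */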

/*
 *	_bt_checkpage() -- Verify that a freshly-read page looks sane.
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/*
	 * ReadBuffer verifies that every newly-read page passes
	 * PageHeaderIsValid, which means it either contains a reasonably sane
	 * page header or is all-zero.  We have to defend against the all-zero
	 * case, however.
	 */
	if (PageIsNew(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains unexpected zero page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));

	/*
	 * Additionally check that the special area looks sane.
	 */
	if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" contains corrupted page at block %u",
						RelationGetRelationName(rel),
						BufferGetBlockNumber(buf)),
				 errhint("Please REINDEX it.")));
}
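
/*
 * Usage sketch (illustrative only): callers that read a block without going
 * through _bt_getbuf should re-validate the page themselves:
 *
 *		buf = ReadBuffer(rel, blkno);
 *		LockBuffer(buf, BT_READ);
 *		_bt_checkpage(rel, buf);
 */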

/*
 *	_bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *		blkno == P_NEW means to get an unallocated index page.  The page
 *		will be initialized before returning it.
 *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count has been incremented
 *		(ie, the buffer is "locked and pinned").  Also, we apply
 *		_bt_checkpage to sanity-check the page (except in P_NEW case).
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
	Buffer		buf;

	if (blkno != P_NEW)
	{
		/* Read an existing block of the relation */
		buf = ReadBuffer(rel, blkno);
		LockBuffer(buf, access);
		_bt_checkpage(rel, buf);
	}
	else
	{
		bool		needLock;
		Page		page;

		Assert(access == BT_WRITE);

		/*
		 * First see if the FSM knows of any free pages.
		 *
		 * We can't trust the FSM's report unreservedly; we have to check that
		 * the page is still free.  (For example, an already-free page could
		 * have been re-used between the time the last VACUUM scanned it and
		 * the time the VACUUM made its FSM updates.)
		 *
		 * In fact, it's worse than that: we can't even assume that it's safe
		 * to take a lock on the reported page.  If somebody else has a lock
		 * on it, or even worse our own caller does, we could deadlock.  (The
		 * own-caller scenario is actually not improbable. Consider an index
		 * on a serial or timestamp column.  Nearly all splits will be at the
		 * rightmost page, so it's entirely likely that _bt_split will call us
		 * while holding a lock on the page most recently acquired from FSM. A
		 * VACUUM running concurrently with the previous split could well have
		 * placed that page back in FSM.)
		 *
		 * To get around that, we ask for only a conditional lock on the
		 * reported page.  If we fail, then someone else is using the page,
		 * and we may reasonably assume it's not free.  (If we happen to be
		 * wrong, the worst consequence is the page will be lost to use till
		 * the next VACUUM, which is no big problem.)
		 */
		for (;;)
		{
			blkno = GetFreeIndexPage(rel);
			if (blkno == InvalidBlockNumber)
				break;
			buf = ReadBuffer(rel, blkno);
			if (ConditionalLockBuffer(buf))
			{
				page = BufferGetPage(buf);
				if (_bt_page_recyclable(page))
				{
					/* Okay to use page.  Re-initialize and return it */
					_bt_pageinit(page, BufferGetPageSize(buf));
					return buf;
				}
				elog(DEBUG2, "FSM returned nonrecyclable page");
				_bt_relbuf(rel, buf);
			}
			else
			{
				elog(DEBUG2, "FSM returned nonlockable page");
				/* couldn't get lock, so just drop pin */
				ReleaseBuffer(buf);
			}
		}

		/*
		 * Extend the relation by one page.
		 *
		 * We have to use a lock to ensure no one else is extending the rel at
		 * the same time, else we will both try to initialize the same new
		 * page.  We can skip locking for new or temp relations, however,
		 * since no one else could be accessing them.
		 */
		needLock = !RELATION_IS_LOCAL(rel);

		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);

		buf = ReadBuffer(rel, P_NEW);

		/* Acquire buffer lock on new page */
		LockBuffer(buf, BT_WRITE);

		/*
		 * Release the file-extension lock; it's now OK for someone else to
		 * extend the relation some more.  Note that we cannot release this
		 * lock before we have buffer lock on the new page, or we risk a race
		 * condition against btvacuumscan --- see comments therein.
		 */
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		/* Initialize the new page before returning it */
		page = BufferGetPage(buf);
		Assert(PageIsNew(page));
		_bt_pageinit(page, BufferGetPageSize(buf));
	}

	/* ref count and lock type are correct */
	return buf;
}
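
/*
 * Usage sketch (illustrative only): a page split obtains the new right half
 * with
 *
 *		rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
 *
 * while an ordinary descent passes an existing block number and BT_READ;
 * either way the result is released with _bt_relbuf(rel, buf).
 */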

/*
 *	_bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
	Buffer		buf;

	Assert(blkno != P_NEW);
	if (BufferIsValid(obuf))
		LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
	buf = ReleaseAndReadBuffer(obuf, rel, blkno);
	LockBuffer(buf, access);
	_bt_checkpage(rel, buf);
	return buf;
}
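
/*
 * Usage sketch (illustrative only): stepping right along a level while
 * holding only one buffer at a time:
 *
 *		buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, BT_READ);
 */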

/*
 *	_bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
	UnlockReleaseBuffer(buf);
}

/*
 *	_bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
	PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 *	_bt_page_recyclable() -- Is an existing page recyclable?
 *
 * This exists to make sure _bt_getbuf and btvacuumscan have the same
 * policy about whether a page is safe to re-use.
 */
bool
_bt_page_recyclable(Page page)
{
	BTPageOpaque opaque;

	/*
	 * It's possible to find an all-zeroes page in an index --- for example, a
	 * backend might successfully extend the relation one page and then crash
	 * before it is able to make a WAL entry for adding the page. If we find a
	 * zeroed page then reclaim it.
	 */
	if (PageIsNew(page))
		return true;

	/*
	 * Otherwise, recycle if deleted and too old to have any processes
	 * interested in the page.
	 */
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (P_ISDELETED(opaque) &&
		TransactionIdPrecedesOrEquals(opaque->btpo.xact, RecentXmin))
		return true;
	return false;
}
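
/*
 * Illustrative note (not in the original sources): "too old" here means the
 * deleting transaction's xid stored in btpo.xact precedes RecentXmin, i.e.
 * no scan still in progress can be holding a link to the page.
 */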

/*
 * Delete item(s) from a btree page.
 *
 * This must only be used for deleting leaf items.  Deleting an item on a
 * non-leaf page has to be done as part of an atomic action that includes
 * deleting the page it points to.
 *
 * This routine assumes that the caller has pinned and locked the buffer.
 * Also, the given itemnos *must* appear in increasing order in the array.
 */
void
_bt_delitems(Relation rel, Buffer buf,
			 OffsetNumber *itemnos, int nitems)
{
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque;

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/* Fix the page */
	PageIndexMultiDelete(page, itemnos, nitems);

	/*
	 * We can clear the vacuum cycle ID since this page has certainly been
	 * processed by the current vacuum scan.
	 */
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_cycleid = 0;

	/*
	 * Mark the page as not containing any LP_DEAD items.  This is not
	 * certainly true (there might be some that have recently been marked, but
	 * weren't included in our target-item list), but it will almost always be
	 * true and it doesn't seem worth an additional page scan to check it.
	 * Remember that BTP_HAS_GARBAGE is only a hint anyway.
	 */
	opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

	MarkBufferDirty(buf);

	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		xl_btree_delete xlrec;
		XLogRecPtr	recptr;
		XLogRecData rdata[2];

		xlrec.node = rel->rd_node;
		xlrec.block = BufferGetBlockNumber(buf);

		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfBtreeDelete;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);

		/*
		 * The target-offsets array is not in the buffer, but pretend that it
		 * is.  When XLogInsert stores the whole buffer, the offsets array
		 * need not be stored too.
		 */
		if (nitems > 0)
		{
			rdata[1].data = (char *) itemnos;
			rdata[1].len = nitems * sizeof(OffsetNumber);
		}
		else
		{
			rdata[1].data = NULL;
			rdata[1].len = 0;
		}
		rdata[1].buffer = buf;
		rdata[1].buffer_std = true;
		rdata[1].next = NULL;

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);

		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}

	END_CRIT_SECTION();
}
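
/*
 * Usage sketch (illustrative only): vacuum collects the offsets of dead
 * tuples on a page in ascending order, then deletes them in one WAL-logged
 * action:
 *
 *		OffsetNumber deletable[MaxOffsetNumber];
 *		int			 ndeletable = 0;
 *
 *		... fill deletable[] in increasing offset order ...
 *		if (ndeletable > 0)
 *			_bt_delitems(rel, buf, deletable, ndeletable);
 */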

/*
 * Subroutine to pre-check whether a page deletion is safe, that is, its
 * parent page would be left in a valid or deletable state.
 *
 * "target" is the page we wish to delete, and "stack" is a search stack
 * leading to it (approximately).  Note that we will update the stack
 * entry(s) to reflect current downlink positions --- this is harmless and
 * indeed saves later search effort in _bt_pagedel.
 *
 * Note: it's OK to release page locks after checking, because a safe
 * deletion can't become unsafe due to concurrent activity.  A non-rightmost
 * page cannot become rightmost unless there's a concurrent page deletion,
 * but only VACUUM does page deletion and we only allow one VACUUM on an index
 * at a time.  An only child could acquire a sibling (of the same parent) only
 * by being split ... but that would make it a non-rightmost child so the
 * deletion is still safe.
 */
static bool
_bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
{
	BlockNumber parent;
	OffsetNumber poffset,
				maxoff;
	Buffer		pbuf;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * In recovery mode, assume the deletion being replayed is valid.  We
	 * can't always check it because we won't have a full search stack, and we
	 * should complain if there's a problem, anyway.
	 */
	if (InRecovery)
		return true;

	/* Locate the parent's downlink (updating the stack entry if needed) */
	ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
	pbuf = _bt_getstackbuf(rel, stack, BT_READ);
	if (pbuf == InvalidBuffer)
		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
			 RelationGetRelationName(rel), target);
	parent = stack->bts_blkno;
	poffset = stack->bts_offset;

	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * If the target is the rightmost child of its parent, then we can't
	 * delete, unless it's also the only child.
	 */
	if (poffset >= maxoff)
	{
		/* It's rightmost child... */
		if (poffset == P_FIRSTDATAKEY(opaque))
		{
			/*
			 * It's only child, so safe if parent would itself be removable.
			 * We have to check the parent itself, and then recurse to test
			 * the conditions at the parent's parent.
			 */
			if (P_RIGHTMOST(opaque) || P_ISROOT(opaque))
			{
				_bt_relbuf(rel, pbuf);
				return false;
			}

			_bt_relbuf(rel, pbuf);
			return _bt_parent_deletion_safe(rel, parent, stack->bts_parent);
		}
		else
		{
			/* Unsafe to delete */
			_bt_relbuf(rel, pbuf);
			return false;
		}
	}
	else
	{
		/* Not rightmost child, so safe to delete */
		_bt_relbuf(rel, pbuf);
		return true;
	}
}
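
/*
 * Illustrative note (not in the original sources): for a chain of pages that
 * are each the only child of their parent, this pre-check recurses up the
 * stack and approves the deletion only if the entire chain could be removed;
 * _bt_pagedel then performs the actual deletions one level at a time.
 */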

/*
 * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and locked (either read or write
 * lock is OK).  This lock and pin will be dropped before exiting.
 *
 * The "stack" argument can be a search stack leading (approximately) to the
 * target page, or NULL --- outside callers typically pass NULL since they
 * have not done such a search, but internal recursion cases pass the stack
 * to avoid duplicated search effort.
 *
 * Returns the number of pages successfully deleted (zero if page cannot
 * be deleted now; could be more than one if parent pages were deleted too).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf, BTStack stack, bool vacuum_full)
{
	int			result;
	BlockNumber target,
				leftsib,
				rightsib,
				parent;
	OffsetNumber poffset,
				maxoff;
	uint32		targetlevel,
				ilevel;
	ItemId		itemid;
	IndexTuple	targetkey,
				itup;
	ScanKey		itup_scankey;
	Buffer		lbuf,
				rbuf,
				pbuf;
	bool		parent_half_dead;
	bool		parent_one_child;
	bool		rightsib_empty;
	Buffer		metabuf = InvalidBuffer;
	Page		metapg = NULL;
	BTMetaPageData *metad = NULL;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * We can never delete rightmost pages nor root pages.  While at it, check
	 * that page is not already deleted and is empty.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		/* Should never fail to delete a half-dead page */
		Assert(!P_ISHALFDEAD(opaque));

		_bt_relbuf(rel, buf);
		return 0;
	}

	/*
	 * Save info about page, including a copy of its high key (it must have
	 * one, being non-rightmost).
	 */
	target = BufferGetBlockNumber(buf);
	targetlevel = opaque->btpo.level;
	leftsib = opaque->btpo_prev;
	itemid = PageGetItemId(page, P_HIKEY);
	targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

	/*
	 * To avoid deadlocks, we'd better drop the target page lock before going
	 * further.
	 */
	_bt_relbuf(rel, buf);

	/*
	 * We need an approximate pointer to the page's parent page.  We use the
	 * standard search mechanism to search for the page's high key; this will
	 * give us a link to either the current parent or someplace to its left
	 * (if there are multiple equal high keys).  In recursion cases, the
	 * caller already generated a search stack and we can just re-use that
	 * work.
	 */
	if (stack == NULL)
	{
		if (!InRecovery)
		{
			/* we need an insertion scan key to do our search, so build one */
			itup_scankey = _bt_mkscankey(rel, targetkey);
			/* find the leftmost leaf page containing this key */
			stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
							   &lbuf, BT_READ);
			/* don't need a pin on that either */
			_bt_relbuf(rel, lbuf);

			/*
			 * If we are trying to delete an interior page, _bt_search did
			 * more than we needed.  Locate the stack item pointing to our
			 * parent level.
			 */
			ilevel = 0;
			for (;;)
			{
				if (stack == NULL)
					elog(ERROR, "not enough stack items");
				if (ilevel == targetlevel)
					break;
				stack = stack->bts_parent;
				ilevel++;
			}
		}
		else
		{
			/*
			 * During WAL recovery, we can't use _bt_search (for one reason,
			 * it might invoke user-defined comparison functions that expect
			 * facilities not available in recovery mode).  Instead, just set
			 * up a dummy stack pointing to the left end of the parent tree
			 * level, from which _bt_getstackbuf will walk right to the parent
			 * page.  Painful, but we don't care too much about performance in
			 * this scenario.
			 */
			pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
			stack = (BTStack) palloc(sizeof(BTStackData));
			stack->bts_blkno = BufferGetBlockNumber(pbuf);
			stack->bts_offset = InvalidOffsetNumber;
			/* bts_btentry will be initialized below */
			stack->bts_parent = NULL;
			_bt_relbuf(rel, pbuf);
		}
	}

	/*
	 * We cannot delete a page that is the rightmost child of its immediate
	 * parent, unless it is the only child --- in which case the parent has to
	 * be deleted too, and the same condition applies recursively to it.  We
	 * have to check this condition all the way up before trying to delete. We
	 * don't need to re-test when deleting a non-leaf page, though.
	 */
	if (targetlevel == 0 &&
		!_bt_parent_deletion_safe(rel, target, stack))
		return 0;

	/*
	 * We have to lock the pages we need to modify in the standard order:
	 * moving right, then up.  Else we will deadlock against other writers.
	 *
	 * So, we need to find and write-lock the current left sibling of the
	 * target page.  The sibling that was current a moment ago could have
	 * split, so we may have to move right.  This search could fail if either
	 * the sibling or the target page was deleted by someone else meanwhile;
	 * if so, give up.  (Right now, that should never happen, since page
	 * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
	 * concurrently on the same table.)
	 */
	if (leftsib != P_NONE)
	{
		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		while (P_ISDELETED(opaque) || opaque->btpo_next != target)
		{
			/* step right one page */
			leftsib = opaque->btpo_next;
			_bt_relbuf(rel, lbuf);
			if (leftsib == P_NONE)
			{
				elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
					 RelationGetRelationName(rel));
				return 0;
			}
			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
			page = BufferGetPage(lbuf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}
	}
	else
		lbuf = InvalidBuffer;

	/*
	 * Next write-lock the target page itself.  It should be okay to take just
	 * a write lock not a superexclusive lock, since no scans would stop on an
	 * empty page.
	 */
	buf = _bt_getbuf(rel, target, BT_WRITE);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Check page is still empty etc, else abandon deletion.  The empty check
	 * is necessary since someone else might have inserted into it while we
	 * didn't have it locked; the others are just for paranoia's sake.
	 */
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		_bt_relbuf(rel, buf);
		if (BufferIsValid(lbuf))
			_bt_relbuf(rel, lbuf);
		return 0;
	}
	if (opaque->btpo_prev != leftsib)
		elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
			 target, RelationGetRelationName(rel));

	/*
	 * And next write-lock the (current) right sibling.
	 */
	rightsib = opaque->btpo_next;
	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);

	/*
	 * Next find and write-lock the current parent of the target page. This is
	 * essentially the same as the corresponding step of splitting.
	 */
	ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
	pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
	if (pbuf == InvalidBuffer)
		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
			 RelationGetRelationName(rel), target);
	parent = stack->bts_blkno;
	poffset = stack->bts_offset;

	/*
	 * If the target is the rightmost child of its parent, then we can't
	 * delete, unless it's also the only child --- in which case the parent
	 * changes to half-dead status.  The "can't delete" case should have been
	 * detected by _bt_parent_deletion_safe, so complain if we see it now.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);
	parent_half_dead = false;
	parent_one_child = false;
	if (poffset >= maxoff)
	{
		if (poffset == P_FIRSTDATAKEY(opaque))
			parent_half_dead = true;
		else
			elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
				 target, parent, RelationGetRelationName(rel));
	}
	else
	{
		/* Will there be exactly one child left in this parent? */
		if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
			parent_one_child = true;
	}

	/*
	 * If we are deleting the next-to-last page on the target's level, then
	 * the rightsib is a candidate to become the new fast root. (In theory, it
	 * might be possible to push the fast root even further down, but the odds
	 * of doing so are slim, and the locking considerations daunting.)
	 *
	 * We don't support handling this in the case where the parent is becoming
	 * half-dead, even though it theoretically could occur.
	 *
	 * We can safely acquire a lock on the metapage here --- see comments for
	 * _bt_newroot().
	 */
	if (leftsib == P_NONE && !parent_half_dead)
	{
		page = BufferGetPage(rbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo.level == targetlevel);
		if (P_RIGHTMOST(opaque))
		{
			/* rightsib will be the only one left on the level */
			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			/*
			 * The expected case here is btm_fastlevel == targetlevel+1; if
			 * the fastlevel is <= targetlevel, something is wrong, and we
			 * choose to overwrite it to fix it.
			 */
			if (metad->btm_fastlevel > targetlevel + 1)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}
	}

	/*
	 * Here we begin doing the deletion.
	 */

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/*
	 * Update parent.  The normal case is a tad tricky because we want to
	 * delete the target's downlink and the *following* key.  Easiest way is
	 * to copy the right sibling's downlink over the target downlink, and then
	 * delete the following item.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (parent_half_dead)
	{
		PageIndexTupleDelete(page, poffset);
		opaque->btpo_flags |= BTP_HALF_DEAD;
	}
	else
	{
		OffsetNumber nextoffset;

		itemid = PageGetItemId(page, poffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
		ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);

		nextoffset = OffsetNumberNext(poffset);
		/* This part is just for double-checking */
		itemid = PageGetItemId(page, nextoffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
			elog(PANIC, "right sibling %u of block %u is not next child of %u in index \"%s\"",
				 rightsib, target, BufferGetBlockNumber(pbuf),
				 RelationGetRelationName(rel));
		PageIndexTupleDelete(page, nextoffset);
	}

	/*
	 * Update siblings' side-links.  Note the target page's side-links will
	 * continue to point to the siblings.
	 */
	if (BufferIsValid(lbuf))
	{
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo_next == target);
		opaque->btpo_next = rightsib;
	}
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->btpo_prev == target);
	opaque->btpo_prev = leftsib;
	rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

	/*
	 * Mark the page itself deleted.  It can be recycled when all current
	 * transactions are gone; or immediately if we're doing VACUUM FULL.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags &= ~BTP_HALF_DEAD;
	opaque->btpo_flags |= BTP_DELETED;
	opaque->btpo.xact =
		vacuum_full ? FrozenTransactionId : ReadNewTransactionId();

	/* And update the metapage, if needed */
	if (BufferIsValid(metabuf))
	{
		metad->btm_fastroot = rightsib;
		metad->btm_fastlevel = targetlevel;
		MarkBufferDirty(metabuf);
	}

	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(pbuf);
	MarkBufferDirty(rbuf);
	MarkBufferDirty(buf);
	if (BufferIsValid(lbuf))
		MarkBufferDirty(lbuf);

	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		xl_btree_delete_page xlrec;
		xl_btree_metadata xlmeta;
		uint8		xlinfo;
		XLogRecPtr	recptr;
		XLogRecData rdata[5];
		XLogRecData *nextrdata;

		xlrec.target.node = rel->rd_node;
		ItemPointerSet(&(xlrec.target.tid), parent, poffset);
		xlrec.deadblk = target;
		xlrec.leftblk = leftsib;
		xlrec.rightblk = rightsib;

		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfBtreeDeletePage;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = nextrdata = &(rdata[1]);

		if (BufferIsValid(metabuf))
		{
			xlmeta.root = metad->btm_root;
			xlmeta.level = metad->btm_level;
			xlmeta.fastroot = metad->btm_fastroot;
			xlmeta.fastlevel = metad->btm_fastlevel;

			nextrdata->data = (char *) &xlmeta;
			nextrdata->len = sizeof(xl_btree_metadata);
			nextrdata->buffer = InvalidBuffer;
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			xlinfo = XLOG_BTREE_DELETE_PAGE_META;
		}
		else if (parent_half_dead)
			xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
		else
			xlinfo = XLOG_BTREE_DELETE_PAGE;

		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->next = nextrdata + 1;
		nextrdata->buffer = pbuf;
		nextrdata->buffer_std = true;
		nextrdata++;

		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->buffer = rbuf;
		nextrdata->buffer_std = true;
		nextrdata->next = NULL;

		if (BufferIsValid(lbuf))
		{
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			nextrdata->data = NULL;
			nextrdata->len = 0;
			nextrdata->buffer = lbuf;
			nextrdata->buffer_std = true;
			nextrdata->next = NULL;
		}

		recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

		if (BufferIsValid(metabuf))
		{
			PageSetLSN(metapg, recptr);
			PageSetTLI(metapg, ThisTimeLineID);
		}
		page = BufferGetPage(pbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(rbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(buf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		if (BufferIsValid(lbuf))
		{
			page = BufferGetPage(lbuf);
			PageSetLSN(page, recptr);
			PageSetTLI(page, ThisTimeLineID);
		}
	}

	END_CRIT_SECTION();

	/* release metapage; send out relcache inval if metapage changed */
	if (BufferIsValid(metabuf))
	{
		CacheInvalidateRelcache(rel);
		_bt_relbuf(rel, metabuf);
	}

	/* can always release leftsib immediately */
	if (BufferIsValid(lbuf))
		_bt_relbuf(rel, lbuf);

	/*
	 * If parent became half dead, recurse to delete it. Otherwise, if right
	 * sibling is empty and is now the last child of the parent, recurse to
	 * try to delete it.  (These cases cannot apply at the same time, though
	 * the second case might itself recurse to the first.)
	 *
	 * When recursing to parent, we hold the lock on the target page until
	 * done.  This delays any insertions into the keyspace that was just
	 * effectively reassigned to the parent's right sibling.  If we allowed
	 * that, and there were enough such insertions before we finish deleting
	 * the parent, page splits within that keyspace could lead to inserting
	 * out-of-order keys into the grandparent level.  It is thought that that
	 * wouldn't have any serious consequences, but it still seems like a
	 * pretty bad idea.
	 */
	if (parent_half_dead)
	{
		/* recursive call will release pbuf */
		_bt_relbuf(rel, rbuf);
		result = _bt_pagedel(rel, pbuf, stack->bts_parent, vacuum_full) + 1;
		_bt_relbuf(rel, buf);
	}
	else if (parent_one_child && rightsib_empty)
	{
		_bt_relbuf(rel, pbuf);
		_bt_relbuf(rel, buf);
		/* recursive call will release rbuf */
		result = _bt_pagedel(rel, rbuf, stack, vacuum_full) + 1;
	}
	else
	{
		_bt_relbuf(rel, pbuf);
		_bt_relbuf(rel, buf);
		_bt_relbuf(rel, rbuf);
		result = 1;
	}

	return result;
}
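
/*
 * Usage sketch (illustrative only): vacuum calls this on each empty leaf
 * page it finds and lets the function decide whether deletion is legal:
 *
 *		ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full);
 *		stats->pages_deleted += ndel;
 *
 * The target buffer's lock and pin are dropped in all cases, whether or not
 * any pages were deleted.
 */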