1 /*-------------------------------------------------------------------------
3 * nbtinsert.c
4 * Item insertion in Lehman and Yao btrees for Postgres.
6 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtinsert.c
13 *-------------------------------------------------------------------------
16 #include "postgres.h"
18 #include "access/nbtree.h"
19 #include "access/nbtxlog.h"
20 #include "access/transam.h"
21 #include "access/xloginsert.h"
22 #include "common/pg_prng.h"
23 #include "lib/qunique.h"
24 #include "miscadmin.h"
25 #include "storage/lmgr.h"
26 #include "storage/predicate.h"
27 #include "storage/smgr.h"
29 /* Minimum tree height for application of fastpath optimization */
30 #define BTREE_FASTPATH_MIN_LEVEL 2
33 static BTStack _bt_search_insert(Relation rel, Relation heaprel,
34 BTInsertState insertstate);
35 static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
36 Relation heapRel,
37 IndexUniqueCheck checkUnique, bool *is_unique,
38 uint32 *speculativeToken);
39 static OffsetNumber _bt_findinsertloc(Relation rel,
40 BTInsertState insertstate,
41 bool checkingunique,
42 bool indexUnchanged,
43 BTStack stack,
44 Relation heapRel);
45 static void _bt_stepright(Relation rel, Relation heaprel,
46 BTInsertState insertstate, BTStack stack);
47 static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
48 Buffer buf,
49 Buffer cbuf,
50 BTStack stack,
51 IndexTuple itup,
52 Size itemsz,
53 OffsetNumber newitemoff,
54 int postingoff,
55 bool split_only_page);
56 static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
57 Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
58 Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
59 IndexTuple nposting, uint16 postingoff);
60 static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
61 Buffer rbuf, BTStack stack, bool isroot, bool isonly);
62 static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
63 static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
64 OffsetNumber itup_off, bool newfirstdataitem);
65 static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
66 BTInsertState insertstate,
67 bool simpleonly, bool checkingunique,
68 bool uniquedup, bool indexUnchanged);
69 static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
70 OffsetNumber *deletable, int ndeletable,
71 IndexTuple newitem, OffsetNumber minoff,
72 OffsetNumber maxoff);
73 static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
74 int ndeletable, IndexTuple newitem,
75 int *nblocks);
76 static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
79 * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
81 * This routine is called by the public interface routine, btinsert.
82 * By here, itup is filled in, including the TID.
84 * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
85 * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
86 * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
87 * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
88 * don't actually insert.
90 * indexUnchanged executor hint indicates if itup is from an
91 * UPDATE that didn't logically change the indexed value, but
92 * must nevertheless have a new entry to point to a successor
93 * version.
95 * The result value is only significant for UNIQUE_CHECK_PARTIAL:
96 * it must be true if the entry is known unique, else false.
97 * (In the current implementation we'll also return true after a
98 * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
99 * that's just a coding artifact.)
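/*
 * Editor's illustrative sketch (not part of the original file): the public
 * btinsert() entry point is expected to reach this function roughly as
 * follows, assuming values/isnull/ht_ctid come from the executor; the exact
 * surrounding code in nbtree.c may differ in details.
 *
 *		itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
 *		itup->t_tid = *ht_ctid;
 *		result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
 *		pfree(itup);
 */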
101 bool
102 _bt_doinsert(Relation rel, IndexTuple itup,
103 IndexUniqueCheck checkUnique, bool indexUnchanged,
104 Relation heapRel)
106 bool is_unique = false;
107 BTInsertStateData insertstate;
108 BTScanInsert itup_key;
109 BTStack stack;
110 bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
112 /* we need an insertion scan key to do our search, so build one */
113 itup_key = _bt_mkscankey(rel, itup);
115 if (checkingunique)
117 if (!itup_key->anynullkeys)
119 /* No (heapkeyspace) scantid until uniqueness established */
120 itup_key->scantid = NULL;
122 else
125 * Scan key for new tuple contains NULL key values. Bypass
126 * checkingunique steps. They are unnecessary because core code
127 * considers NULL unequal to every value, including NULL.
129 * This optimization avoids O(N^2) behavior within the
130 * _bt_findinsertloc() heapkeyspace path when a unique index has a
131 * large number of "duplicates" with NULL key values.
133 checkingunique = false;
134 /* Tuple is unique in the sense that core code cares about */
135 Assert(checkUnique != UNIQUE_CHECK_EXISTING);
136 is_unique = true;
141 * Fill in the BTInsertState working area, to track the current page and
142 * position within the page to insert on.
144 * Note that itemsz is passed down to lower level code that deals with
145 * inserting the item. It must be MAXALIGN()'d. This ensures that space
146 * accounting code consistently considers the alignment overhead that we
147 * expect PageAddItem() will add later. (Actually, index_form_tuple() is
148 * already conservative about alignment, but we don't rely on that from
149 * this distance. Besides, preserving the "true" tuple size in index
150 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
151 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
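/*
 * Editor's note (worked example, not part of the original file): on a
 * typical 64-bit build where MAXIMUM_ALIGNOF is 8, a 20-byte index tuple is
 * accounted for here as MAXALIGN(20) == 24 bytes, which matches the space
 * PageAddItem() will actually consume for the tuple itself (the 4-byte line
 * pointer is accounted for separately by PageGetFreeSpace()).
 */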
153 insertstate.itup = itup;
154 insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
155 insertstate.itup_key = itup_key;
156 insertstate.bounds_valid = false;
157 insertstate.buf = InvalidBuffer;
158 insertstate.postingoff = 0;
160 search:
163 * Find and lock the leaf page that the tuple should be added to by
164 * searching from the root page. insertstate.buf will hold a buffer that
165 * is locked in exclusive mode afterwards.
167 stack = _bt_search_insert(rel, heapRel, &insertstate);
170 * checkingunique inserts are not allowed to go ahead when two tuples with
171 * equal key attribute values would be visible to new MVCC snapshots once
172 * the xact commits. Check for conflicts in the locked page/buffer (if
173 * needed) here.
175 * It might be necessary to check a page to the right in _bt_check_unique,
176 * though that should be very rare. In practice the first page the value
177 * could be on (with scantid omitted) is almost always also the only page
178 * that a matching tuple might be found on. This is due to the behavior
179 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
180 * only be allowed to cross a page boundary when there is no candidate
181 * leaf page split point that avoids it. Also, _bt_check_unique can use
182 * the leaf page high key to determine that there will be no duplicates on
183 * the right sibling without actually visiting it (it uses the high key in
184 * cases where the new item happens to belong at the far right of the leaf
185 * page).
187 * NOTE: obviously, _bt_check_unique can only detect keys that are already
188 * in the index; so it cannot defend against concurrent insertions of the
189 * same key. We protect against that by means of holding a write lock on
190 * the first page the value could be on, with omitted/-inf value for the
191 * implicit heap TID tiebreaker attribute. Any other would-be inserter of
192 * the same key must acquire a write lock on the same page, so only one
193 * would-be inserter can be making the check at one time. Furthermore,
194 * once we are past the check we hold write locks continuously until we
195 * have performed our insertion, so no later inserter can fail to see our
196 * insertion. (This requires some care in _bt_findinsertloc.)
198 * If we must wait for another xact, we release the lock while waiting,
199 * and then must perform a new search.
201 * For a partial uniqueness check, we don't wait for the other xact. Just
202 * let the tuple in and return false for possibly non-unique, or true for
203 * definitely unique.
205 if (checkingunique)
207 TransactionId xwait;
208 uint32 speculativeToken;
210 xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
211 &is_unique, &speculativeToken);
213 if (unlikely(TransactionIdIsValid(xwait)))
215 /* Have to wait for the other guy ... */
216 _bt_relbuf(rel, insertstate.buf);
217 insertstate.buf = InvalidBuffer;
220 * If it's a speculative insertion, wait for it to finish (ie. to
221 * go ahead with the insertion, or kill the tuple). Otherwise
222 * wait for the transaction to finish as usual.
224 if (speculativeToken)
225 SpeculativeInsertionWait(xwait, speculativeToken);
226 else
227 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
229 /* start over... */
230 if (stack)
231 _bt_freestack(stack);
232 goto search;
235 /* Uniqueness is established -- restore heap tid as scantid */
236 if (itup_key->heapkeyspace)
237 itup_key->scantid = &itup->t_tid;
240 if (checkUnique != UNIQUE_CHECK_EXISTING)
242 OffsetNumber newitemoff;
245 * The only conflict predicate locking cares about for indexes is when
246 * an index tuple insert conflicts with an existing lock. We don't
247 * know the actual page we're going to insert on for sure just yet in
248 * checkingunique and !heapkeyspace cases, but it's okay to use the
249 * first page the value could be on (with scantid omitted) instead.
251 CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
254 * Do the insertion. Note that insertstate contains cached binary
255 * search bounds established within _bt_check_unique when insertion is
256 * checkingunique.
258 newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
259 indexUnchanged, stack, heapRel);
260 _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
261 stack, itup, insertstate.itemsz, newitemoff,
262 insertstate.postingoff, false);
264 else
266 /* just release the buffer */
267 _bt_relbuf(rel, insertstate.buf);
270 /* be tidy */
271 if (stack)
272 _bt_freestack(stack);
273 pfree(itup_key);
275 return is_unique;
279 * _bt_search_insert() -- _bt_search() wrapper for inserts
281 * Search the tree for a particular scankey, or more precisely for the first
282 * leaf page it could be on. Try to make use of the fastpath optimization's
283 * rightmost leaf page cache before actually searching the tree from the root
284 * page, though.
286 * Return value is a stack of parent-page pointers (though see notes about
287 * fastpath optimization and page splits below). insertstate->buf is set to
288 * the address of the leaf-page buffer, which is write-locked and pinned in
289 * all cases (if necessary by creating a new empty root page for caller).
291 * The fastpath optimization avoids most of the work of searching the tree
292 * repeatedly when a single backend inserts successive new tuples on the
293 * rightmost leaf page of an index. A backend cache of the rightmost leaf
294 * page is maintained within _bt_insertonpg(), and used here. The cache is
295 * invalidated here when an insert of a non-pivot tuple must take place on a
296 * non-rightmost leaf page.
298 * The optimization helps with indexes on an auto-incremented field. It also
299 * helps with indexes on datetime columns, as well as indexes with lots of
300 * NULL values. (NULLs usually get inserted in the rightmost page for single
301 * column indexes, since they usually get treated as coming after everything
302 * else in the key space. Individual NULL tuples will generally be placed on
303 * the rightmost leaf page due to the influence of the heap TID column.)
305 * Note that we avoid applying the optimization when there is insufficient
306 * space on the rightmost page to fit caller's new item. This is necessary
307 * because we'll need to return a real descent stack when a page split is
308 * expected (actually, caller can cope with a leaf page split that uses a NULL
309 * stack, but that's very slow and so must be avoided). Note also that the
310 * fastpath optimization acquires the lock on the page conditionally as a way
311 * of reducing extra contention when there are concurrent insertions into the
312 * rightmost page (we give up if we'd have to wait for the lock). We assume
313 * that it isn't useful to apply the optimization when there is contention,
314 * since each per-backend cache won't stay valid for long.
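/*
 * Editor's illustrative sketch (not part of the original file): the
 * backend-local rightmost-leaf cache referred to above is just the
 * relation's target block, so its lifecycle is roughly
 *
 *		blkno = RelationGetTargetBlock(rel);				consulted here
 *		RelationSetTargetBlock(rel, InvalidBlockNumber);	forgotten on a miss
 *		RelationSetTargetBlock(rel, blkno);					refreshed by _bt_insertonpg()
 */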
316 static BTStack
317 _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
319 Assert(insertstate->buf == InvalidBuffer);
320 Assert(!insertstate->bounds_valid);
321 Assert(insertstate->postingoff == 0);
323 if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
325 /* Simulate a _bt_getbuf() call with conditional locking */
326 insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
327 if (_bt_conditionallockbuf(rel, insertstate->buf))
329 Page page;
330 BTPageOpaque opaque;
332 _bt_checkpage(rel, insertstate->buf);
333 page = BufferGetPage(insertstate->buf);
334 opaque = BTPageGetOpaque(page);
337 * Check if the page is still the rightmost leaf page and has
338 * enough free space to accommodate the new tuple. Also check
339 * that the insertion scan key is strictly greater than the first
340 * non-pivot tuple on the page. (Note that we expect itup_key's
341 * scantid to be unset when our caller is a checkingunique
342 * inserter.)
344 if (P_RIGHTMOST(opaque) &&
345 P_ISLEAF(opaque) &&
346 !P_IGNORE(opaque) &&
347 PageGetFreeSpace(page) > insertstate->itemsz &&
348 PageGetMaxOffsetNumber(page) >= P_HIKEY &&
349 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
352 * Caller can use the fastpath optimization because cached
353 * block is still rightmost leaf page, which can fit caller's
354 * new tuple without splitting. Keep block in local cache for
355 * next insert, and have caller use NULL stack.
357 * Note that _bt_insert_parent() has an assertion that catches
358 * leaf page splits that somehow follow from a fastpath insert
359 * (it should only be passed a NULL stack when it must deal
360 * with a concurrent root page split, and never because a NULL
361 * stack was returned here).
363 return NULL;
366 /* Page unsuitable for caller, drop lock and pin */
367 _bt_relbuf(rel, insertstate->buf);
369 else
371 /* Lock unavailable, drop pin */
372 ReleaseBuffer(insertstate->buf);
375 /* Forget block, since cache doesn't appear to be useful */
376 RelationSetTargetBlock(rel, InvalidBlockNumber);
379 /* Cannot use optimization -- descend tree, return proper descent stack */
380 return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
381 BT_WRITE, NULL);
385 * _bt_check_unique() -- Check for violation of unique index constraint
387 * Returns InvalidTransactionId if there is no conflict, else an xact ID
388 * we must wait for to see if it commits a conflicting tuple. If an actual
389 * conflict is detected, no return --- just ereport(). If an xact ID is
390 * returned, and the conflicting tuple still has a speculative insertion in
391 * progress, *speculativeToken is set to non-zero, and the caller can wait for
392 * the verdict on the insertion using SpeculativeInsertionWait().
394 * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
395 * InvalidTransactionId because we don't want to wait. In this case we
396 * set *is_unique to false if there is a potential conflict, and the
397 * core code must redo the uniqueness check later.
399 * As a side-effect, sets state in insertstate that can later be used by
400 * _bt_findinsertloc() to reuse most of the binary search work we do
401 * here.
403 * This code treats NULLs as equal, unlike the default semantics for unique
404 * indexes. So do not call here when there are NULL values in scan key and
405 * the index uses the default NULLS DISTINCT mode.
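/*
 * Editor's illustrative sketch (not part of the original file): a caller is
 * expected to consume the returned XID along these lines (see _bt_doinsert
 * above for the real code), restarting its search from scratch afterwards:
 *
 *		xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
 *								 &is_unique, &speculativeToken);
 *		if (TransactionIdIsValid(xwait))
 *		{
 *			_bt_relbuf(rel, insertstate.buf);
 *			if (speculativeToken)
 *				SpeculativeInsertionWait(xwait, speculativeToken);
 *			else
 *				XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
 *		}
 */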
407 static TransactionId
408 _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
409 IndexUniqueCheck checkUnique, bool *is_unique,
410 uint32 *speculativeToken)
412 IndexTuple itup = insertstate->itup;
413 IndexTuple curitup = NULL;
414 ItemId curitemid = NULL;
415 BTScanInsert itup_key = insertstate->itup_key;
416 SnapshotData SnapshotDirty;
417 OffsetNumber offset;
418 OffsetNumber maxoff;
419 Page page;
420 BTPageOpaque opaque;
421 Buffer nbuf = InvalidBuffer;
422 bool found = false;
423 bool inposting = false;
424 bool prevalldead = true;
425 int curposti = 0;
427 /* Assume unique until we find a duplicate */
428 *is_unique = true;
430 InitDirtySnapshot(SnapshotDirty);
432 page = BufferGetPage(insertstate->buf);
433 opaque = BTPageGetOpaque(page);
434 maxoff = PageGetMaxOffsetNumber(page);
437 * Find the first tuple with the same key.
439 * This also saves the binary search bounds in insertstate. We use them
440 * in the fastpath below, but also in the _bt_findinsertloc() call later.
442 Assert(!insertstate->bounds_valid);
443 offset = _bt_binsrch_insert(rel, insertstate);
446 * Scan over all equal tuples, looking for live conflicts.
448 Assert(!insertstate->bounds_valid || insertstate->low == offset);
449 Assert(!itup_key->anynullkeys);
450 Assert(itup_key->scantid == NULL);
451 for (;;)
454 * Each iteration of the loop processes one heap TID, not one index
455 * tuple. Current offset number for page isn't usually advanced on
456 * iterations that process heap TIDs from posting list tuples.
458 * "inposting" state is set when _inside_ a posting list --- not when
459 * we're at the start (or end) of a posting list. We advance curposti
460 * at the end of the iteration when inside a posting list tuple. In
461 * general, every loop iteration either advances the page offset or
462 * advances curposti --- an iteration that handles the rightmost/max
463 * heap TID in a posting list finally advances the page offset (and
464 * unsets "inposting").
466 * Make sure the offset points to an actual index tuple before trying
467 * to examine it...
469 if (offset <= maxoff)
472 * Fastpath: In most cases, we can use cached search bounds to
473 * limit our consideration to items that are definitely
474 * duplicates. This fastpath doesn't apply when the original page
475 * is empty, or when initial offset is past the end of the
476 * original page, which may indicate that we need to examine a
477 * second or subsequent page.
479 * Note that this optimization allows us to avoid calling
480 * _bt_compare() directly when there are no duplicates, as long as
481 * the offset where the key will go is not at the end of the page.
483 if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
485 Assert(insertstate->bounds_valid);
486 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
487 Assert(insertstate->low <= insertstate->stricthigh);
488 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
489 break;
493 * We can skip items that are already marked killed.
495 * In the presence of heavy update activity an index may contain
496 * many killed items with the same key; running _bt_compare() on
497 * each killed item gets expensive. Just advance over killed
498 * items as quickly as we can. We only apply _bt_compare() when
499 * we get to a non-killed item. We could reuse the bounds to
500 * avoid _bt_compare() calls for known equal tuples, but it
501 * doesn't seem worth it.
503 if (!inposting)
504 curitemid = PageGetItemId(page, offset);
505 if (inposting || !ItemIdIsDead(curitemid))
507 ItemPointerData htid;
508 bool all_dead = false;
510 if (!inposting)
512 /* Plain tuple, or first TID in posting list tuple */
513 if (_bt_compare(rel, itup_key, page, offset) != 0)
514 break; /* we're past all the equal tuples */
516 /* Advance curitup */
517 curitup = (IndexTuple) PageGetItem(page, curitemid);
518 Assert(!BTreeTupleIsPivot(curitup));
521 /* okay, we gotta fetch the heap tuple using htid ... */
522 if (!BTreeTupleIsPosting(curitup))
524 /* ... htid is from simple non-pivot tuple */
525 Assert(!inposting);
526 htid = curitup->t_tid;
528 else if (!inposting)
530 /* ... htid is first TID in new posting list */
531 inposting = true;
532 prevalldead = true;
533 curposti = 0;
534 htid = *BTreeTupleGetPostingN(curitup, 0);
536 else
538 /* ... htid is second or subsequent TID in posting list */
539 Assert(curposti > 0);
540 htid = *BTreeTupleGetPostingN(curitup, curposti);
544 * If we are doing a recheck, we expect to find the tuple we
545 * are rechecking. It's not a duplicate, but we have to keep
546 * scanning.
548 if (checkUnique == UNIQUE_CHECK_EXISTING &&
549 ItemPointerCompare(&htid, &itup->t_tid) == 0)
551 found = true;
555 * Check if there are any table tuples for this index entry
556 * satisfying SnapshotDirty. This is necessary because for AMs
557 * with optimizations like heap's HOT, we have just a single
558 * index entry for the entire chain.
560 else if (table_index_fetch_tuple_check(heapRel, &htid,
561 &SnapshotDirty,
562 &all_dead))
564 TransactionId xwait;
567 * It is a duplicate. If we are only doing a partial
568 * check, then don't bother checking if the tuple is being
569 * updated in another transaction. Just return the fact
570 * that it is a potential conflict and leave the full
571 * check till later. Don't invalidate binary search
572 * bounds.
574 if (checkUnique == UNIQUE_CHECK_PARTIAL)
576 if (nbuf != InvalidBuffer)
577 _bt_relbuf(rel, nbuf);
578 *is_unique = false;
579 return InvalidTransactionId;
583 * If this tuple is being updated by other transaction
584 * then we have to wait for its commit/abort.
586 xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
587 SnapshotDirty.xmin : SnapshotDirty.xmax;
589 if (TransactionIdIsValid(xwait))
591 if (nbuf != InvalidBuffer)
592 _bt_relbuf(rel, nbuf);
593 /* Tell _bt_doinsert to wait... */
594 *speculativeToken = SnapshotDirty.speculativeToken;
595 /* Caller releases lock on buf immediately */
596 insertstate->bounds_valid = false;
597 return xwait;
601 * Otherwise we have a definite conflict. But before
602 * complaining, look to see if the tuple we want to insert
603 * is itself now committed dead --- if so, don't complain.
604 * This is a waste of time in normal scenarios but we must
605 * do it to support CREATE INDEX CONCURRENTLY.
607 * We must follow HOT-chains here because during
608 * concurrent index build, we insert the root TID though
609 * the actual tuple may be somewhere in the HOT-chain.
610 * While following the chain we might not stop at the
611 * exact tuple which triggered the insert, but that's OK
612 * because if we find a live tuple anywhere in this chain,
613 * we have a unique key conflict. The other live tuple is
614 * not part of this chain because it had a different index
615 * entry.
617 htid = itup->t_tid;
618 if (table_index_fetch_tuple_check(heapRel, &htid,
619 SnapshotSelf, NULL))
621 /* Normal case --- it's still live */
623 else
626 * It's been deleted, so no error, and no need to
627 * continue searching
629 break;
633 * Check for a conflict-in as we would if we were going to
634 * write to this page. We aren't actually going to write,
635 * but we want a chance to report SSI conflicts that would
636 * otherwise be masked by this unique constraint
637 * violation.
639 CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
642 * This is a definite conflict. Break the tuple down into
643 * datums and report the error. But first, make sure we
644 * release the buffer locks we're holding ---
645 * BuildIndexValueDescription could make catalog accesses,
646 * which in the worst case might touch this same index and
647 * cause deadlocks.
649 if (nbuf != InvalidBuffer)
650 _bt_relbuf(rel, nbuf);
651 _bt_relbuf(rel, insertstate->buf);
652 insertstate->buf = InvalidBuffer;
653 insertstate->bounds_valid = false;
656 Datum values[INDEX_MAX_KEYS];
657 bool isnull[INDEX_MAX_KEYS];
658 char *key_desc;
660 index_deform_tuple(itup, RelationGetDescr(rel),
661 values, isnull);
663 key_desc = BuildIndexValueDescription(rel, values,
664 isnull);
666 ereport(ERROR,
667 (errcode(ERRCODE_UNIQUE_VIOLATION),
668 errmsg("duplicate key value violates unique constraint \"%s\"",
669 RelationGetRelationName(rel)),
670 key_desc ? errdetail("Key %s already exists.",
671 key_desc) : 0,
672 errtableconstraint(heapRel,
673 RelationGetRelationName(rel))));
676 else if (all_dead && (!inposting ||
677 (prevalldead &&
678 curposti == BTreeTupleGetNPosting(curitup) - 1)))
681 * The conflicting tuple (or all HOT chains pointed to by
682 * all posting list TIDs) is dead to everyone, so mark the
683 * index entry killed.
685 ItemIdMarkDead(curitemid);
686 opaque->btpo_flags |= BTP_HAS_GARBAGE;
689 * Mark buffer with a dirty hint, since state is not
690 * crucial. Be sure to mark the proper buffer dirty.
692 if (nbuf != InvalidBuffer)
693 MarkBufferDirtyHint(nbuf, true);
694 else
695 MarkBufferDirtyHint(insertstate->buf, true);
699 * Remember if posting list tuple has even a single HOT chain
700 * whose members are not all dead
702 if (!all_dead && inposting)
703 prevalldead = false;
707 if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
709 /* Advance to next TID in same posting list */
710 curposti++;
711 continue;
713 else if (offset < maxoff)
715 /* Advance to next tuple */
716 curposti = 0;
717 inposting = false;
718 offset = OffsetNumberNext(offset);
720 else
722 int highkeycmp;
724 /* If scankey == hikey we gotta check the next page too */
725 if (P_RIGHTMOST(opaque))
726 break;
727 highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
728 Assert(highkeycmp <= 0);
729 if (highkeycmp != 0)
730 break;
731 /* Advance to next non-dead page --- there must be one */
732 for (;;)
734 BlockNumber nblkno = opaque->btpo_next;
736 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
737 page = BufferGetPage(nbuf);
738 opaque = BTPageGetOpaque(page);
739 if (!P_IGNORE(opaque))
740 break;
741 if (P_RIGHTMOST(opaque))
742 elog(ERROR, "fell off the end of index \"%s\"",
743 RelationGetRelationName(rel));
745 /* Will also advance to next tuple */
746 curposti = 0;
747 inposting = false;
748 maxoff = PageGetMaxOffsetNumber(page);
749 offset = P_FIRSTDATAKEY(opaque);
750 /* Don't invalidate binary search bounds */
755 * If we are doing a recheck then we should have found the tuple we are
756 * checking. Otherwise there's something very wrong --- probably, the
757 * index is on a non-immutable expression.
759 if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
760 ereport(ERROR,
761 (errcode(ERRCODE_INTERNAL_ERROR),
762 errmsg("failed to re-find tuple within index \"%s\"",
763 RelationGetRelationName(rel)),
764 errhint("This may be because of a non-immutable index expression."),
765 errtableconstraint(heapRel,
766 RelationGetRelationName(rel))));
768 if (nbuf != InvalidBuffer)
769 _bt_relbuf(rel, nbuf);
771 return InvalidTransactionId;
776 * _bt_findinsertloc() -- Finds an insert location for a tuple
778 * On entry, insertstate buffer contains the page the new tuple belongs
779 * on. It is exclusive-locked and pinned by the caller.
781 * If 'checkingunique' is true, the buffer on entry is the first page
782 * that contains duplicates of the new key. If there are duplicates on
783 * multiple pages, the correct insertion position might be some page to
784 * the right, rather than the first page. In that case, this function
785 * moves right to the correct target page.
787 * (In a !heapkeyspace index, there can be multiple pages with the same
788 * high key that the new tuple could legitimately be placed on. In
789 * that case, the caller passes the first page containing duplicates,
790 * just like when checkingunique=true. If that page doesn't have enough
791 * room for the new tuple, this function moves right, trying to find a
792 * legal page that does.)
794 * If 'indexUnchanged' is true, this is for an UPDATE that didn't
795 * logically change the indexed value, but must nevertheless have a new
796 * entry to point to a successor version. This hint from the executor
797 * will influence our behavior when the page might have to be split and
798 * we must consider our options. Bottom-up index deletion can avoid
799 * pathological version-driven page splits, but we only want to go to the
800 * trouble of trying it when we already have moderate confidence that
801 * it's appropriate. The hint should not significantly affect our
802 * behavior over time unless practically all inserts on to the leaf page
803 * get the hint.
805 * On exit, insertstate buffer contains the chosen insertion page, and
806 * the offset within that page is returned. If _bt_findinsertloc needed
807 * to move right, the lock and pin on the original page are released, and
808 * the new buffer is exclusively locked and pinned instead.
810 * If insertstate contains cached binary search bounds, we will take
811 * advantage of them. This avoids repeating comparisons that we made in
812 * _bt_check_unique() already.
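/*
 * Editor's note (worked example, not part of the original file): the cached
 * bounds bracket the run of keys equal to the new key with scantid omitted.
 * If insertstate->low == 5 and insertstate->stricthigh == 9 after
 * _bt_check_unique(), then offsets 5..8 on the page hold such equal keys,
 * and, as long as stricthigh is not past the end of the page, the final
 * insert position (once scantid is restored) must fall in the range [5, 9]
 * on this same page.
 */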
814 static OffsetNumber
815 _bt_findinsertloc(Relation rel,
816 BTInsertState insertstate,
817 bool checkingunique,
818 bool indexUnchanged,
819 BTStack stack,
820 Relation heapRel)
822 BTScanInsert itup_key = insertstate->itup_key;
823 Page page = BufferGetPage(insertstate->buf);
824 BTPageOpaque opaque;
825 OffsetNumber newitemoff;
827 opaque = BTPageGetOpaque(page);
829 /* Check 1/3 of a page restriction */
830 if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
831 _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
832 insertstate->itup);
834 Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
835 Assert(!insertstate->bounds_valid || checkingunique);
836 Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
837 Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
838 Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
840 if (itup_key->heapkeyspace)
842 /* Keep track of whether checkingunique duplicate seen */
843 bool uniquedup = indexUnchanged;
846 * If we're inserting into a unique index, we may have to walk right
847 * through leaf pages to find the one leaf page that we must insert on
848 * to.
850 * This is needed for checkingunique callers because a scantid was not
851 * used when we called _bt_search(). scantid can only be set after
852 * _bt_check_unique() has checked for duplicates. The buffer
853 * initially stored in insertstate->buf has the page where the first
854 * duplicate key might be found, which isn't always the page that new
855 * tuple belongs on. The heap TID attribute for new tuple (scantid)
856 * could force us to insert on a sibling page, though that should be
857 * very rare in practice.
859 if (checkingunique)
861 if (insertstate->low < insertstate->stricthigh)
863 /* Encountered a duplicate in _bt_check_unique() */
864 Assert(insertstate->bounds_valid);
865 uniquedup = true;
868 for (;;)
871 * Does the new tuple belong on this page?
873 * The earlier _bt_check_unique() call may well have
874 * established a strict upper bound on the offset for the new
875 * item. If it's not the last item of the page (i.e. if there
876 * is at least one tuple on the page that goes after the tuple
877 * we're inserting) then we know that the tuple belongs on
878 * this page. We can skip the high key check.
880 if (insertstate->bounds_valid &&
881 insertstate->low <= insertstate->stricthigh &&
882 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
883 break;
885 /* Test '<=', not '!=', since scantid is set now */
886 if (P_RIGHTMOST(opaque) ||
887 _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
888 break;
890 _bt_stepright(rel, heapRel, insertstate, stack);
891 /* Update local state after stepping right */
892 page = BufferGetPage(insertstate->buf);
893 opaque = BTPageGetOpaque(page);
894 /* Assume duplicates (if checkingunique) */
895 uniquedup = true;
900 * If the target page cannot fit newitem, try to avoid splitting the
901 * page on insert by performing deletion or deduplication now
903 if (PageGetFreeSpace(page) < insertstate->itemsz)
904 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
905 checkingunique, uniquedup,
906 indexUnchanged);
908 else
910 /*----------
911 * This is a !heapkeyspace (version 2 or 3) index. The current page
912 * is the first page that we could insert the new tuple to, but there
913 * may be other pages to the right that we could opt to use instead.
915 * If the new key is equal to one or more existing keys, we can
916 * legitimately place it anywhere in the series of equal keys. In
917 * fact, if the new key is equal to the page's "high key" we can place
918 * it on the next page. If it is equal to the high key, and there's
919 * not room to insert the new tuple on the current page without
920 * splitting, then we move right hoping to find more free space and
921 * avoid a split.
923 * Keep scanning right until we
924 * (a) find a page with enough free space,
925 * (b) reach the last page where the tuple can legally go, or
926 * (c) get tired of searching.
927 * (c) is not flippant; it is important because if there are many
928 * pages' worth of equal keys, it's better to split one of the early
929 * pages than to scan all the way to the end of the run of equal keys
930 * on every insert. We implement "get tired" as a random choice,
931 * since stopping after scanning a fixed number of pages wouldn't work
932 * well (we'd never reach the right-hand side of previously split
933 * pages). The probability of moving right is set at 0.99, which may
934 * seem too high to change the behavior much, but it does an excellent
935 * job of preventing O(N^2) behavior with many equal keys.
936 *----------
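/*
 * Editor's note (worked arithmetic, not part of the original file): the
 * "get tired" test below stops the rightward scan when
 * pg_prng_uint32(&pg_global_prng_state) <= PG_UINT32_MAX / 100, which is
 * true with probability of roughly 1/100.  Equivalently, each iteration
 * moves right with probability ~0.99, so a run of N equal-key pages is
 * walked to its end only about 0.99^N of the time.
 */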
938 while (PageGetFreeSpace(page) < insertstate->itemsz)
941 * Before considering moving right, see if we can obtain enough
942 * space by erasing LP_DEAD items
944 if (P_HAS_GARBAGE(opaque))
946 /* Perform simple deletion */
947 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
948 false, false, false);
950 if (PageGetFreeSpace(page) >= insertstate->itemsz)
951 break; /* OK, now we have enough space */
955 * Nope, so check conditions (b) and (c) enumerated above
957 * The earlier _bt_check_unique() call may well have established a
958 * strict upper bound on the offset for the new item. If it's not
959 * the last item of the page (i.e. if there is at least one tuple
960 * on the page that's greater than the tuple we're inserting to)
961 * then we know that the tuple belongs on this page. We can skip
962 * the high key check.
964 if (insertstate->bounds_valid &&
965 insertstate->low <= insertstate->stricthigh &&
966 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
967 break;
969 if (P_RIGHTMOST(opaque) ||
970 _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
971 pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
972 break;
974 _bt_stepright(rel, heapRel, insertstate, stack);
975 /* Update local state after stepping right */
976 page = BufferGetPage(insertstate->buf);
977 opaque = BTPageGetOpaque(page);
982 * We should now be on the correct page. Find the offset within the page
983 * for the new tuple. (Possibly reusing earlier search bounds.)
985 Assert(P_RIGHTMOST(opaque) ||
986 _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
988 newitemoff = _bt_binsrch_insert(rel, insertstate);
990 if (insertstate->postingoff == -1)
993 * There is an overlapping posting list tuple with its LP_DEAD bit
994 * set. We don't want to unnecessarily unset its LP_DEAD bit while
995 * performing a posting list split, so perform simple index tuple
996 * deletion early.
998 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
999 false, false, false);
1002 * Do new binary search. New insert location cannot overlap with any
1003 * posting list now.
1005 Assert(!insertstate->bounds_valid);
1006 insertstate->postingoff = 0;
1007 newitemoff = _bt_binsrch_insert(rel, insertstate);
1008 Assert(insertstate->postingoff == 0);
1011 return newitemoff;
1015 * Step right to next non-dead page, during insertion.
1017 * This is a bit more complicated than moving right in a search. We must
1018 * write-lock the target page before releasing write lock on current page;
1019 * else someone else's _bt_check_unique scan could fail to see our insertion.
1020 * Write locks on intermediate dead pages won't do because we don't know when
1021 * they will get de-linked from the tree.
1023 * This is more aggressive than it needs to be for non-unique !heapkeyspace
1024 * indexes.
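/*
 * Editor's illustrative sketch (not part of the original file): the ordering
 * that matters below is "write-lock the right sibling, then release the
 * current page", i.e. roughly
 *
 *		rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
 *		...
 *		_bt_relbuf(rel, insertstate->buf);
 *		insertstate->buf = rbuf;
 *
 * so there is no instant at which neither page is write-locked and a
 * concurrent _bt_check_unique() could miss our insertion.
 */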
1026 static void
1027 _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
1028 BTStack stack)
1030 Page page;
1031 BTPageOpaque opaque;
1032 Buffer rbuf;
1033 BlockNumber rblkno;
1035 Assert(heaprel != NULL);
1036 page = BufferGetPage(insertstate->buf);
1037 opaque = BTPageGetOpaque(page);
1039 rbuf = InvalidBuffer;
1040 rblkno = opaque->btpo_next;
1041 for (;;)
1043 rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1044 page = BufferGetPage(rbuf);
1045 opaque = BTPageGetOpaque(page);
1048 * If this page was incompletely split, finish the split now. We do
1049 * this while holding a lock on the left sibling, which is not good
1050 * because finishing the split could be a fairly lengthy operation.
1051 * But this should happen very seldom.
1053 if (P_INCOMPLETE_SPLIT(opaque))
1055 _bt_finish_split(rel, heaprel, rbuf, stack);
1056 rbuf = InvalidBuffer;
1057 continue;
1060 if (!P_IGNORE(opaque))
1061 break;
1062 if (P_RIGHTMOST(opaque))
1063 elog(ERROR, "fell off the end of index \"%s\"",
1064 RelationGetRelationName(rel));
1066 rblkno = opaque->btpo_next;
1068 /* rbuf locked; unlock buf, update state for caller */
1069 _bt_relbuf(rel, insertstate->buf);
1070 insertstate->buf = rbuf;
1071 insertstate->bounds_valid = false;
1074 /*----------
1075 * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1077 * This recursive procedure does the following things:
1079 * + if postingoff != 0, splits existing posting list tuple
1080 * (since it overlaps with new 'itup' tuple).
1081 * + if necessary, splits the target page, using 'itup_key' for
1082 * suffix truncation on leaf pages (caller passes NULL for
1083 * non-leaf pages).
1084 * + inserts the new tuple (might be split from posting list).
1085 * + if the page was split, pops the parent stack, and finds the
1086 * right place to insert the new child pointer (by walking
1087 * right using information stored in the parent stack).
1088 * + invokes itself with the appropriate tuple for the right
1089 * child page on the parent.
1090 * + updates the metapage if a true root or fast root is split.
1092 * On entry, we must have the correct buffer in which to do the
1093 * insertion, and the buffer must be pinned and write-locked. On return,
1094 * we will have dropped both the pin and the lock on the buffer.
1096 * This routine only performs retail tuple insertions. 'itup' should
1097 * always be either a non-highkey leaf item, or a downlink (new high
1098 * key items are created indirectly, when a page is split). When
1099 * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1100 * we're inserting the downlink for. This function will clear the
1101 * INCOMPLETE_SPLIT flag on it, and release the buffer.
1102 *----------
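/*
 * Editor's illustrative sketch (not part of the original file): when the
 * target page is split here, the recursion described above proceeds roughly
 * as
 *
 *		rbuf = _bt_split(...);
 *		_bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
 *			-> builds a pivot tuple from the left page's new high key,
 *			   points its downlink at rbuf's block, and calls
 *			   _bt_insertonpg() again on the parent level with cbuf = buf
 *
 * The details of pivot construction live in _bt_insert_parent(), which is
 * not shown in this excerpt.
 */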
1104 static void
1105 _bt_insertonpg(Relation rel,
1106 Relation heaprel,
1107 BTScanInsert itup_key,
1108 Buffer buf,
1109 Buffer cbuf,
1110 BTStack stack,
1111 IndexTuple itup,
1112 Size itemsz,
1113 OffsetNumber newitemoff,
1114 int postingoff,
1115 bool split_only_page)
1117 Page page;
1118 BTPageOpaque opaque;
1119 bool isleaf,
1120 isroot,
1121 isrightmost,
1122 isonly;
1123 IndexTuple oposting = NULL;
1124 IndexTuple origitup = NULL;
1125 IndexTuple nposting = NULL;
1127 page = BufferGetPage(buf);
1128 opaque = BTPageGetOpaque(page);
1129 isleaf = P_ISLEAF(opaque);
1130 isroot = P_ISROOT(opaque);
1131 isrightmost = P_RIGHTMOST(opaque);
1132 isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1134 /* child buffer must be given iff inserting on an internal page */
1135 Assert(isleaf == !BufferIsValid(cbuf));
1136 /* tuple must have appropriate number of attributes */
1137 Assert(!isleaf ||
1138 BTreeTupleGetNAtts(itup, rel) ==
1139 IndexRelationGetNumberOfAttributes(rel));
1140 Assert(isleaf ||
1141 BTreeTupleGetNAtts(itup, rel) <=
1142 IndexRelationGetNumberOfKeyAttributes(rel));
1143 Assert(!BTreeTupleIsPosting(itup));
1144 Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1145 /* Caller must always finish incomplete split for us */
1146 Assert(!P_INCOMPLETE_SPLIT(opaque));
1149 * Every internal page should have exactly one negative infinity item at
1150 * all times. Only _bt_split() and _bt_newlevel() should add items that
1151 * become negative infinity items through truncation, since they're the
1152 * only routines that allocate new internal pages.
1154 Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1157 * Do we need to split an existing posting list item?
1159 if (postingoff != 0)
1161 ItemId itemid = PageGetItemId(page, newitemoff);
1164 * The new tuple is a duplicate with a heap TID that falls inside the
1165 * range of an existing posting list tuple on a leaf page. Prepare to
1166 * split an existing posting list. Overwriting the posting list with
1167 * its post-split version is treated as an extra step in either the
1168 * insert or page split critical section.
1170 Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
1171 oposting = (IndexTuple) PageGetItem(page, itemid);
1174 * postingoff value comes from earlier call to _bt_binsrch_posting().
1175 * Its binary search might think that a plain tuple must be a posting
1176 * list tuple that needs to be split. This can happen with corruption
1177 * involving an existing plain tuple that is a duplicate of the new
1178 * item, up to and including its table TID. Check for that here in
1179 * passing.
1181 * Also verify that our caller has made sure that the existing posting
1182 * list tuple does not have its LP_DEAD bit set.
1184 if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1185 ereport(ERROR,
1186 (errcode(ERRCODE_INDEX_CORRUPTED),
1187 errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1188 ItemPointerGetBlockNumber(&itup->t_tid),
1189 ItemPointerGetOffsetNumber(&itup->t_tid),
1190 newitemoff, BufferGetBlockNumber(buf),
1191 RelationGetRelationName(rel))));
1193 /* use a mutable copy of itup as our itup from here on */
1194 origitup = itup;
1195 itup = CopyIndexTuple(origitup);
1196 nposting = _bt_swap_posting(itup, oposting, postingoff);
1197 /* itup now contains rightmost/max TID from oposting */
1199 /* Alter offset so that newitem goes after posting list */
1200 newitemoff = OffsetNumberNext(newitemoff);
1204 * Do we need to split the page to fit the item on it?
1206 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1207 * so this comparison is correct even though we appear to be accounting
1208 * only for the item and not for its line pointer.
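/*
 * Editor's note (worked example, not part of the original file): with
 * sizeof(ItemIdData) being 4 bytes, a page whose raw free space is 100 bytes
 * reports PageGetFreeSpace() == 96, so comparing that against the
 * MAXALIGN()'d itemsz implicitly reserves room for the new line pointer too.
 */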
1210 if (PageGetFreeSpace(page) < itemsz)
1212 Buffer rbuf;
1214 Assert(!split_only_page);
1216 /* split the buffer into left and right halves */
1217 rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1218 itup, origitup, nposting, postingoff);
1219 PredicateLockPageSplit(rel,
1220 BufferGetBlockNumber(buf),
1221 BufferGetBlockNumber(rbuf));
1223 /*----------
1224 * By here,
1226 * + our target page has been split;
1227 * + the original tuple has been inserted;
1228 * + we have write locks on both the old (left half)
1229 * and new (right half) buffers, after the split; and
1230 * + we know the key we want to insert into the parent
1231 * (it's the "high key" on the left child page).
1233 * We're ready to do the parent insertion. We need to hold onto the
1234 * locks for the child pages until we locate the parent, but we can
1235 * at least release the lock on the right child before doing the
1236 * actual insertion. The lock on the left child will be released
1237 * last of all by parent insertion, where it is the 'cbuf' of parent
1238 * page.
1239 *----------
1241 _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1243 else
1245 Buffer metabuf = InvalidBuffer;
1246 Page metapg = NULL;
1247 BTMetaPageData *metad = NULL;
1248 BlockNumber blockcache;
1251 * If we are doing this insert because we split a page that was the
1252 * only one on its tree level, but was not the root, it may have been
1253 * the "fast root". We need to ensure that the fast root link points
1254 * at or above the current page. We can safely acquire a lock on the
1255 * metapage here --- see comments for _bt_newlevel().
1257 if (unlikely(split_only_page))
1259 Assert(!isleaf);
1260 Assert(BufferIsValid(cbuf));
1262 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1263 metapg = BufferGetPage(metabuf);
1264 metad = BTPageGetMeta(metapg);
1266 if (metad->btm_fastlevel >= opaque->btpo_level)
1268 /* no update wanted */
1269 _bt_relbuf(rel, metabuf);
1270 metabuf = InvalidBuffer;
1274 /* Do the update. No ereport(ERROR) until changes are logged */
1275 START_CRIT_SECTION();
1277 if (postingoff != 0)
1278 memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1280 if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1281 false) == InvalidOffsetNumber)
1282 elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1283 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1285 MarkBufferDirty(buf);
1287 if (BufferIsValid(metabuf))
1289 /* upgrade meta-page if needed */
1290 if (metad->btm_version < BTREE_NOVAC_VERSION)
1291 _bt_upgrademetapage(metapg);
1292 metad->btm_fastroot = BufferGetBlockNumber(buf);
1293 metad->btm_fastlevel = opaque->btpo_level;
1294 MarkBufferDirty(metabuf);
1298 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1299 * finishes a split
1301 if (!isleaf)
1303 Page cpage = BufferGetPage(cbuf);
1304 BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1306 Assert(P_INCOMPLETE_SPLIT(cpageop));
1307 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1308 MarkBufferDirty(cbuf);
1311 /* XLOG stuff */
1312 if (RelationNeedsWAL(rel))
1314 xl_btree_insert xlrec;
1315 xl_btree_metadata xlmeta;
1316 uint8 xlinfo;
1317 XLogRecPtr recptr;
1318 uint16 upostingoff;
1320 xlrec.offnum = newitemoff;
1322 XLogBeginInsert();
1323 XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
1325 if (isleaf && postingoff == 0)
1327 /* Simple leaf insert */
1328 xlinfo = XLOG_BTREE_INSERT_LEAF;
1330 else if (postingoff != 0)
1333 * Leaf insert with posting list split. Must include
1334 * postingoff field before newitem/orignewitem.
1336 Assert(isleaf);
1337 xlinfo = XLOG_BTREE_INSERT_POST;
1339 else
1341 /* Internal page insert, which finishes a split on cbuf */
1342 xlinfo = XLOG_BTREE_INSERT_UPPER;
1343 XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1345 if (BufferIsValid(metabuf))
1347 /* Actually, it's an internal page insert + meta update */
1348 xlinfo = XLOG_BTREE_INSERT_META;
1350 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1351 xlmeta.version = metad->btm_version;
1352 xlmeta.root = metad->btm_root;
1353 xlmeta.level = metad->btm_level;
1354 xlmeta.fastroot = metad->btm_fastroot;
1355 xlmeta.fastlevel = metad->btm_fastlevel;
1356 xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1357 xlmeta.allequalimage = metad->btm_allequalimage;
1359 XLogRegisterBuffer(2, metabuf,
1360 REGBUF_WILL_INIT | REGBUF_STANDARD);
1361 XLogRegisterBufData(2, (char *) &xlmeta,
1362 sizeof(xl_btree_metadata));
1366 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1367 if (postingoff == 0)
1369 /* Just log itup from caller */
1370 XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1372 else
1375 * Insert with posting list split (XLOG_BTREE_INSERT_POST
1376 * record) case.
1378 * Log postingoff. Also log origitup, not itup. REDO routine
1379 * must reconstruct final itup (as well as nposting) using
1380 * _bt_swap_posting().
1382 upostingoff = postingoff;
1384 XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1385 XLogRegisterBufData(0, (char *) origitup,
1386 IndexTupleSize(origitup));
1389 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1391 if (BufferIsValid(metabuf))
1392 PageSetLSN(metapg, recptr);
1393 if (!isleaf)
1394 PageSetLSN(BufferGetPage(cbuf), recptr);
1396 PageSetLSN(page, recptr);
1399 END_CRIT_SECTION();
1401 /* Release subsidiary buffers */
1402 if (BufferIsValid(metabuf))
1403 _bt_relbuf(rel, metabuf);
1404 if (!isleaf)
1405 _bt_relbuf(rel, cbuf);
1408 * Cache the block number if this is the rightmost leaf page. Cache
1409 * may be used by a future inserter within _bt_search_insert().
1411 blockcache = InvalidBlockNumber;
1412 if (isrightmost && isleaf && !isroot)
1413 blockcache = BufferGetBlockNumber(buf);
1415 /* Release buffer for insertion target block */
1416 _bt_relbuf(rel, buf);
1419 * If we decided to cache the insertion target block before releasing
1420 * its buffer lock, then cache it now. Check the height of the tree
1421 * first, though. We don't go for the optimization with small
1422 * indexes. Defer final check to this point to ensure that we don't
1423 * call _bt_getrootheight while holding a buffer lock.
1425 if (BlockNumberIsValid(blockcache) &&
1426 _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1427 RelationSetTargetBlock(rel, blockcache);
1430 /* be tidy */
1431 if (postingoff != 0)
1433 /* itup is actually a modified copy of caller's original */
1434 pfree(nposting);
1435 pfree(itup);
1440 * _bt_split() -- split a page in the btree.
1442 * On entry, buf is the page to split, and is pinned and write-locked.
1443 * newitemoff etc. tell us about the new item that must be inserted
1444 * along with the data from the original page.
1446 * itup_key is used for suffix truncation on leaf pages (internal
1447 * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1448 * is the left-sibling of the page we're inserting the downlink for.
1449 * This function will clear the INCOMPLETE_SPLIT flag on it, and
1450 * release the buffer.
1452 * orignewitem, nposting, and postingoff are needed when an insert of
1453 * orignewitem results in both a posting list split and a page split.
1454 * These extra posting list split details are used here in the same
1455 * way as they are used in the more common case where a posting list
1456 * split does not coincide with a page split. We need to deal with
1457 * posting list splits directly in order to ensure that everything
1458 * that follows from the insert of orignewitem is handled as a single
1459 * atomic operation (though caller's insert of a new pivot/downlink
1460 * into parent page will still be a separate operation). See
1461 * nbtree/README for details on the design of posting list splits.
1463 * Returns the new right sibling of buf, pinned and write-locked.
1464 * The pin and lock on buf are maintained.
1466 static Buffer
1467 _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1468 Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1469 IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1471 Buffer rbuf;
1472 Page origpage;
1473 Page leftpage,
1474 rightpage;
1475 BlockNumber origpagenumber,
1476 rightpagenumber;
1477 BTPageOpaque ropaque,
1478 lopaque,
1479 oopaque;
1480 Buffer sbuf = InvalidBuffer;
1481 Page spage = NULL;
1482 BTPageOpaque sopaque = NULL;
1483 Size itemsz;
1484 ItemId itemid;
1485 IndexTuple firstright,
1486 lefthighkey;
1487 OffsetNumber firstrightoff;
1488 OffsetNumber afterleftoff,
1489 afterrightoff,
1490 minusinfoff;
1491 OffsetNumber origpagepostingoff;
1492 OffsetNumber maxoff;
1493 OffsetNumber i;
1494 bool newitemonleft,
1495 isleaf,
1496 isrightmost;
1499 * origpage is the original page to be split. leftpage is a temporary
1500 * buffer that receives the left-sibling data, which will be copied back
1501 * into origpage on success. rightpage is the new page that will receive
1502 * the right-sibling data.
1504 * leftpage is allocated after choosing a split point. rightpage's new
1505 * buffer isn't acquired until after leftpage is initialized and has new
1506 * high key, the last point where splitting the page may fail (barring
1507 * corruption). Failing before acquiring new buffer won't have lasting
1508 * consequences, since origpage won't have been modified and leftpage is
1509 * only workspace.
1511 origpage = BufferGetPage(buf);
1512 oopaque = BTPageGetOpaque(origpage);
1513 isleaf = P_ISLEAF(oopaque);
1514 isrightmost = P_RIGHTMOST(oopaque);
1515 maxoff = PageGetMaxOffsetNumber(origpage);
1516 origpagenumber = BufferGetBlockNumber(buf);
1519 * Choose a point to split origpage at.
1521 * A split point can be thought of as a point _between_ two existing data
1522 * items on origpage (the lastleft and firstright tuples), provided you
1523 * pretend that the new item that didn't fit is already on origpage.
1525 * Since origpage does not actually contain newitem, the representation of
1526 * split points needs to work with two boundary cases: splits where
1527 * newitem is lastleft, and splits where newitem is firstright.
1528 * newitemonleft resolves the ambiguity that would otherwise exist when
1529 * newitemoff == firstrightoff. In all other cases it's clear which side
1530 * of the split every tuple goes on from context. newitemonleft is
1531 * usually (but not always) redundant information.
1533 * firstrightoff is supposed to be an origpage offset number, but it's
1534 * possible that its value will be maxoff+1, which is "past the end" of
1535 * origpage. This happens in the rare case where newitem goes after all
1536 * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1537 * origpage at the point that leaves newitem alone on new right page. Any
1538 * "!newitemonleft && newitemoff == firstrightoff" split point makes
1539 * newitem the firstright tuple, though, so this case isn't a special
1540 * case.
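/*
 * Editor's note (worked example, not part of the original file): suppose
 * maxoff == 10 and the new item sorts after every existing item, so
 * newitemoff == 11.  A split point with firstrightoff == 11 and
 * !newitemonleft then makes newitem the firstright tuple, leaving it alone
 * on the new right page -- the "past the end" situation described above,
 * handled by the ordinary rules rather than as a special case.
 */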
1542 firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1543 newitem, &newitemonleft);
1545 /* Allocate temp buffer for leftpage */
1546 leftpage = PageGetTempPage(origpage);
1547 _bt_pageinit(leftpage, BufferGetPageSize(buf));
1548 lopaque = BTPageGetOpaque(leftpage);
1551 * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1552 * and HAS_GARBAGE flags.
1554 lopaque->btpo_flags = oopaque->btpo_flags;
1555 lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1556 /* set flag in leftpage indicating that rightpage has no downlink yet */
1557 lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1558 lopaque->btpo_prev = oopaque->btpo_prev;
1559 /* handle btpo_next after rightpage buffer acquired */
1560 lopaque->btpo_level = oopaque->btpo_level;
1561 /* handle btpo_cycleid after rightpage buffer acquired */
1564 * Copy the original page's LSN into leftpage, which will become the
1565 * updated version of the page. We need this because XLogInsert will
1566 * examine the LSN and possibly dump it in a page image.
1568 PageSetLSN(leftpage, PageGetLSN(origpage));
1571 * Determine page offset number of existing overlapped-with-orignewitem
1572 * posting list when it is necessary to perform a posting list split in
1573 * passing. Note that newitem was already changed by caller (newitem no
1574 * longer has the orignewitem TID).
1576 * This page offset number (origpagepostingoff) will be used to pretend
1577 * that the posting split has already taken place, even though the
1578 * required modifications to origpage won't occur until we reach the
1579 * critical section. The lastleft and firstright tuples of our page split
1580 * point should, in effect, come from an imaginary version of origpage
1581 * that has the nposting tuple instead of the original posting list tuple.
1583 * Note: _bt_findsplitloc() should have compensated for coinciding posting
1584 * list splits in just the same way, at least in theory. It doesn't
1585 * bother with that, though. In practice it won't affect its choice of
1586 * split point.
1588 origpagepostingoff = InvalidOffsetNumber;
1589 if (postingoff != 0)
1591 Assert(isleaf);
1592 Assert(ItemPointerCompare(&orignewitem->t_tid,
1593 &newitem->t_tid) < 0);
1594 Assert(BTreeTupleIsPosting(nposting));
1595 origpagepostingoff = OffsetNumberPrev(newitemoff);
1599 * The high key for the new left page is a possibly-truncated copy of
1600 * firstright on the leaf level (it's "firstright itself" on internal
1601 * pages; see !isleaf comments below). This may seem to be contrary to
1602 * Lehman & Yao's approach of using a copy of lastleft as the new high key
1603 * when splitting on the leaf level. It isn't, though.
1605 * Suffix truncation will leave the left page's high key fully equal to
1606 * lastleft when lastleft and firstright are equal prior to heap TID (that
1607 * is, the tiebreaker TID value comes from lastleft). It isn't actually
1608 * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1609 * "subtree" invariant to hold. It's sufficient to make sure that the new
1610 * leaf high key is strictly less than firstright, and greater than or
1611 * equal to (not necessarily equal to) lastleft. In other words, when
1612 * suffix truncation isn't possible during a leaf page split, we take
1613 * L&Y's exact approach to generating a new high key for the left page.
1614 * (Actually, that is slightly inaccurate. We don't just use a copy of
1615 * lastleft. A tuple with all the keys from firstright but the max heap
1616 * TID from lastleft is used, to avoid introducing a special case.)
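* Worked example (editor's addition, hypothetical key values): for a
* two-attribute index, if lastleft = (5, 7) and firstright = (5, 9), the
* truncated high key keeps both attributes of firstright, giving (5, 9)
* with its heap TID truncated away; it compares greater than lastleft
* and strictly less than firstright. If lastleft = (5, 7) and
* firstright = (6, 2), only the first attribute is kept, giving (6).
* Only when all key attributes are equal does the new high key also
* carry lastleft's (max) heap TID, as described above.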
1618 if (!newitemonleft && newitemoff == firstrightoff)
1620 /* incoming tuple becomes firstright */
1621 itemsz = newitemsz;
1622 firstright = newitem;
1624 else
1626 /* existing item at firstrightoff becomes firstright */
1627 itemid = PageGetItemId(origpage, firstrightoff);
1628 itemsz = ItemIdGetLength(itemid);
1629 firstright = (IndexTuple) PageGetItem(origpage, itemid);
1630 if (firstrightoff == origpagepostingoff)
1631 firstright = nposting;
1634 if (isleaf)
1636 IndexTuple lastleft;
1638 /* Attempt suffix truncation for leaf page splits */
1639 if (newitemonleft && newitemoff == firstrightoff)
1641 /* incoming tuple becomes lastleft */
1642 lastleft = newitem;
1644 else
1646 OffsetNumber lastleftoff;
1648 /* existing item before firstrightoff becomes lastleft */
1649 lastleftoff = OffsetNumberPrev(firstrightoff);
1650 Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1651 itemid = PageGetItemId(origpage, lastleftoff);
1652 lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1653 if (lastleftoff == origpagepostingoff)
1654 lastleft = nposting;
1657 lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1658 itemsz = IndexTupleSize(lefthighkey);
1660 else
1663 * Don't perform suffix truncation on a copy of firstright to make
1664 * left page high key for internal page splits. Must use firstright
1665 * as new high key directly.
1667 * Each distinct separator key value originates as a leaf level high
1668 * key; all other separator keys/pivot tuples are copied from one
1669 * level down. A separator key in a grandparent page must be
1670 * identical to high key in rightmost parent page of the subtree to
1671 * its left, which must itself be identical to high key in rightmost
1672 * child page of that same subtree (this even applies to separator
1673 * from grandparent's high key). There must always be an unbroken
1674 * "seam" of identical separator keys that guide index scans at every
1675 * level, starting from the grandparent. That's why suffix truncation
1676 * is unsafe here.
1678 * Internal page splits will truncate firstright into a "negative
1679 * infinity" data item when it gets inserted on the new right page
1680 * below, though. This happens during the call to _bt_pgaddtup() for
1681 * the new first data item for right page. Do not confuse this
1682 * mechanism with suffix truncation. It is just a convenient way of
1683 * implementing page splits that split the internal page "inside"
1684 * firstright. The lefthighkey separator key cannot appear a second
1685 * time in the right page (only firstright's downlink goes in right
1686 * page).
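* Editor's sketch of the resulting layout (hypothetical pivots): if an
* internal page holding pivots P1..Pn is split at firstright = Pk, the
* left page keeps P1..P(k-1) with Pk as its new high key, while the
* right page begins with a "minus infinity" item carrying only Pk's
* downlink, followed by P(k+1)..Pn. Pk's key value later reappears in
* the parent as the pivot whose downlink points to the new right page
* (see _bt_insert_parent).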
1688 lefthighkey = firstright;
1692 * Add new high key to leftpage
1694 afterleftoff = P_HIKEY;
1696 Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1697 Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1698 IndexRelationGetNumberOfKeyAttributes(rel));
1699 Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1700 if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1701 false) == InvalidOffsetNumber)
1702 elog(ERROR, "failed to add high key to the left sibling"
1703 " while splitting block %u of index \"%s\"",
1704 origpagenumber, RelationGetRelationName(rel));
1705 afterleftoff = OffsetNumberNext(afterleftoff);
1708 * Acquire a new right page to split into, now that left page has a new
1709 * high key. From here on, it's not okay to throw an error without
1710 * zeroing rightpage first. This coding rule ensures that we won't
1711 * confuse future VACUUM operations, which might otherwise try to re-find
1712 * a downlink to a leftover junk page as the page undergoes deletion.
1714 * It would be reasonable to start the critical section just after the new
1715 * rightpage buffer is acquired instead; that would allow us to avoid
1716 * leftover junk pages without bothering to zero rightpage. We do it this
1717 * way because it avoids an unnecessary PANIC when either origpage or its
1718 * existing sibling page is corrupt.
1720 rbuf = _bt_allocbuf(rel, heaprel);
1721 rightpage = BufferGetPage(rbuf);
1722 rightpagenumber = BufferGetBlockNumber(rbuf);
1723 /* rightpage was initialized by _bt_getbuf */
1724 ropaque = BTPageGetOpaque(rightpage);
1727 * Finish off remaining leftpage special area fields. They cannot be set
1728 * before both origpage (leftpage) and rightpage buffers are acquired and
1729 * locked.
1731 * btpo_cycleid is only used with leaf pages, though we set it here in all
1732 * cases just to be consistent.
1734 lopaque->btpo_next = rightpagenumber;
1735 lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1738 * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1739 * and HAS_GARBAGE flags.
1741 ropaque->btpo_flags = oopaque->btpo_flags;
1742 ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1743 ropaque->btpo_prev = origpagenumber;
1744 ropaque->btpo_next = oopaque->btpo_next;
1745 ropaque->btpo_level = oopaque->btpo_level;
1746 ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1749 * Add new high key to rightpage where necessary.
1751 * If the page we're splitting is not the rightmost page at its level in
1752 * the tree, then rightpage's first entry is a copy of origpage's high
1753 * key.
1755 afterrightoff = P_HIKEY;
1757 if (!isrightmost)
1759 IndexTuple righthighkey;
1761 itemid = PageGetItemId(origpage, P_HIKEY);
1762 itemsz = ItemIdGetLength(itemid);
1763 righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1764 Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1765 Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1766 IndexRelationGetNumberOfKeyAttributes(rel));
1767 if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1768 false, false) == InvalidOffsetNumber)
1770 memset(rightpage, 0, BufferGetPageSize(rbuf));
1771 elog(ERROR, "failed to add high key to the right sibling"
1772 " while splitting block %u of index \"%s\"",
1773 origpagenumber, RelationGetRelationName(rel));
1775 afterrightoff = OffsetNumberNext(afterrightoff);
1779 * Internal page splits truncate the first data item on the right page --
1780 * it becomes the "minus infinity" item for the page. Set this up here.
1782 minusinfoff = InvalidOffsetNumber;
1783 if (!isleaf)
1784 minusinfoff = afterrightoff;
1787 * Now transfer all the data items (non-pivot tuples in isleaf case, or
1788 * additional pivot tuples in !isleaf case) to the appropriate page.
1790 * Note: we *must* insert at least the right page's items in item-number
1791 * order, for the benefit of _bt_restore_page().
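* (Editor's note: the WAL record below logs the right page's tuple data
* without its line pointer array; _bt_restore_page() can reconstruct the
* item ordering from the raw tuple layout only because the items were
* added here in ascending item-number order.)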
1793 for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1795 IndexTuple dataitem;
1797 itemid = PageGetItemId(origpage, i);
1798 itemsz = ItemIdGetLength(itemid);
1799 dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1801 /* replace original item with nposting due to posting split? */
1802 if (i == origpagepostingoff)
1804 Assert(BTreeTupleIsPosting(dataitem));
1805 Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1806 dataitem = nposting;
1809 /* does new item belong before this one? */
1810 else if (i == newitemoff)
1812 if (newitemonleft)
1814 Assert(newitemoff <= firstrightoff);
1815 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1816 false))
1818 memset(rightpage, 0, BufferGetPageSize(rbuf));
1819 elog(ERROR, "failed to add new item to the left sibling"
1820 " while splitting block %u of index \"%s\"",
1821 origpagenumber, RelationGetRelationName(rel));
1823 afterleftoff = OffsetNumberNext(afterleftoff);
1825 else
1827 Assert(newitemoff >= firstrightoff);
1828 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1829 afterrightoff == minusinfoff))
1831 memset(rightpage, 0, BufferGetPageSize(rbuf));
1832 elog(ERROR, "failed to add new item to the right sibling"
1833 " while splitting block %u of index \"%s\"",
1834 origpagenumber, RelationGetRelationName(rel));
1836 afterrightoff = OffsetNumberNext(afterrightoff);
1840 /* decide which page to put it on */
1841 if (i < firstrightoff)
1843 if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1845 memset(rightpage, 0, BufferGetPageSize(rbuf));
1846 elog(ERROR, "failed to add old item to the left sibling"
1847 " while splitting block %u of index \"%s\"",
1848 origpagenumber, RelationGetRelationName(rel));
1850 afterleftoff = OffsetNumberNext(afterleftoff);
1852 else
1854 if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1855 afterrightoff == minusinfoff))
1857 memset(rightpage, 0, BufferGetPageSize(rbuf));
1858 elog(ERROR, "failed to add old item to the right sibling"
1859 " while splitting block %u of index \"%s\"",
1860 origpagenumber, RelationGetRelationName(rel));
1862 afterrightoff = OffsetNumberNext(afterrightoff);
1866 /* Handle case where newitem goes at the end of rightpage */
1867 if (i <= newitemoff)
1870 * Can't have newitemonleft here; that would imply we were told to put
1871 * *everything* on the left page, which cannot fit (if it could, we'd
1872 * not be splitting the page).
1874 Assert(!newitemonleft && newitemoff == maxoff + 1);
1875 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1876 afterrightoff == minusinfoff))
1878 memset(rightpage, 0, BufferGetPageSize(rbuf));
1879 elog(ERROR, "failed to add new item to the right sibling"
1880 " while splitting block %u of index \"%s\"",
1881 origpagenumber, RelationGetRelationName(rel));
1883 afterrightoff = OffsetNumberNext(afterrightoff);
1887 * We have to grab the original right sibling (if any) and update its prev
1888 * link. We are guaranteed that this is deadlock-free, since we couple
1889 * the locks in the standard order: left to right.
1891 if (!isrightmost)
1893 sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1894 spage = BufferGetPage(sbuf);
1895 sopaque = BTPageGetOpaque(spage);
1896 if (sopaque->btpo_prev != origpagenumber)
1898 memset(rightpage, 0, BufferGetPageSize(rbuf));
1899 ereport(ERROR,
1900 (errcode(ERRCODE_INDEX_CORRUPTED),
1901 errmsg_internal("right sibling's left-link doesn't match: "
1902 "block %u links to %u instead of expected %u in index \"%s\"",
1903 oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1904 RelationGetRelationName(rel))));
1908 * Check to see if we can set the SPLIT_END flag in the right-hand
1909 * split page; this can save some I/O for vacuum since it need not
1910 * proceed to the right sibling. We can set the flag if the right
1911 * sibling has a different cycleid: that means it could not be part of
1912 * a group of pages that were all split off from the same ancestor
1913 * page. If you're confused, imagine that page A splits to A B and
1914 * then again, yielding A C B, while vacuum is in progress. Tuples
1915 * originally in A could now be in either B or C, hence vacuum must
1916 * examine both pages. But if D, our right sibling, has a different
1917 * cycleid then it could not contain any tuples that were in A when
1918 * the vacuum started.
1920 if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1921 ropaque->btpo_flags |= BTP_SPLIT_END;
1925 * Right sibling is locked, new siblings are prepared, but original page
1926 * is not updated yet.
1928 * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1929 * not starting the critical section till here because we haven't been
1930 * scribbling on the original page yet; see comments above.
1932 START_CRIT_SECTION();
1935 * By here, the original data page has been split into two new halves, and
1936 * these are correct. The algorithm requires that the left page never
1937 * move during a split, so we copy the new left page back on top of the
1938 * original. We need to do this before writing the WAL record, so that
1939 * XLogInsert can WAL log an image of the page if necessary.
1941 PageRestoreTempPage(leftpage, origpage);
1942 /* leftpage, lopaque must not be used below here */
1944 MarkBufferDirty(buf);
1945 MarkBufferDirty(rbuf);
1947 if (!isrightmost)
1949 sopaque->btpo_prev = rightpagenumber;
1950 MarkBufferDirty(sbuf);
1954 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1955 * a split
1957 if (!isleaf)
1959 Page cpage = BufferGetPage(cbuf);
1960 BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1962 cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1963 MarkBufferDirty(cbuf);
1966 /* XLOG stuff */
1967 if (RelationNeedsWAL(rel))
1969 xl_btree_split xlrec;
1970 uint8 xlinfo;
1971 XLogRecPtr recptr;
1973 xlrec.level = ropaque->btpo_level;
1974 /* See comments below on newitem, orignewitem, and posting lists */
1975 xlrec.firstrightoff = firstrightoff;
1976 xlrec.newitemoff = newitemoff;
1977 xlrec.postingoff = 0;
1978 if (postingoff != 0 && origpagepostingoff < firstrightoff)
1979 xlrec.postingoff = postingoff;
1981 XLogBeginInsert();
1982 XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1984 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1985 XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1986 /* Log original right sibling, since we've changed its prev-pointer */
1987 if (!isrightmost)
1988 XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1989 if (!isleaf)
1990 XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1993 * Log the new item, if it was inserted on the left page. (If it was
1994 * put on the right page, we don't need to explicitly WAL log it
1995 * because it's included with all the other items on the right page.)
1996 * Show the new item as belonging to the left page buffer, so that it
1997 * is not stored if XLogInsert decides it needs a full-page image of
1998 * the left page. We always store newitemoff in the record, though.
2000 * The details are sometimes slightly different for page splits that
2001 * coincide with a posting list split. If both the replacement
2002 * posting list and newitem go on the right page, then we don't need
2003 * to log anything extra, just like the simple !newitemonleft
2004 * no-posting-split case (postingoff is set to zero in the WAL record,
2005 * so recovery doesn't need to process a posting list split at all).
2006 * Otherwise, we set postingoff and log orignewitem instead of
2007 * newitem, despite having actually inserted newitem. REDO routine
2008 * must reconstruct nposting and newitem using _bt_swap_posting().
2010 * Note: It's possible that our page split point is the point that
2011 * makes the posting list lastleft and newitem firstright. This is
2012 * the only case where we log orignewitem/newitem despite newitem
2013 * going on the right page. If XLogInsert decides that it can omit
2014 * orignewitem due to logging a full-page image of the left page,
2015 * everything still works out, since recovery only needs orignewitem
2016 * when replaying changes to the left page (just like the regular
2017 * newitem-logged case).
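* In summary (editor's recap of the registration logic just below):
* when newitemonleft is set and no posting split is being logged,
* newitem is registered with the left page buffer; when
* xlrec.postingoff is set, orignewitem is registered instead;
* otherwise nothing extra is registered, because newitem is on the
* right page, whose full contents are logged further down.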
2019 if (newitemonleft && xlrec.postingoff == 0)
2020 XLogRegisterBufData(0, (char *) newitem, newitemsz);
2021 else if (xlrec.postingoff != 0)
2023 Assert(isleaf);
2024 Assert(newitemonleft || firstrightoff == newitemoff);
2025 Assert(newitemsz == IndexTupleSize(orignewitem));
2026 XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
2029 /* Log the left page's new high key */
2030 if (!isleaf)
2032 /* lefthighkey isn't local copy, get current pointer */
2033 itemid = PageGetItemId(origpage, P_HIKEY);
2034 lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2036 XLogRegisterBufData(0, (char *) lefthighkey,
2037 MAXALIGN(IndexTupleSize(lefthighkey)));
2040 * Log the contents of the right page in the format understood by
2041 * _bt_restore_page(). The whole right page will be recreated.
2043 * Direct access to the page is not ideal, but it is faster; ideally the
2044 * page API would provide a function for this. Note we only store the
2045 * tuples themselves, knowing that they were inserted in item-number order
2046 * and so the line pointers can be reconstructed. See comments for
2047 * _bt_restore_page().
2049 XLogRegisterBufData(1,
2050 (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2051 ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2053 xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2054 recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2056 PageSetLSN(origpage, recptr);
2057 PageSetLSN(rightpage, recptr);
2058 if (!isrightmost)
2059 PageSetLSN(spage, recptr);
2060 if (!isleaf)
2061 PageSetLSN(BufferGetPage(cbuf), recptr);
2064 END_CRIT_SECTION();
2066 /* release the old right sibling */
2067 if (!isrightmost)
2068 _bt_relbuf(rel, sbuf);
2070 /* release the child */
2071 if (!isleaf)
2072 _bt_relbuf(rel, cbuf);
2074 /* be tidy */
2075 if (isleaf)
2076 pfree(lefthighkey);
2078 /* split's done */
2079 return rbuf;
2083 * _bt_insert_parent() -- Insert downlink into parent, completing split.
2085 * On entry, buf and rbuf are the left and right split pages, which we
2086 * still hold write locks on. Both locks will be released here. We
2087 * release the rbuf lock once we have a write lock on the page that we
2088 * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2089 * The lock on buf is released at the same point as the lock on the parent
2090 * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2091 * atomic operation that completes the split by inserting a new downlink.
2093 * stack - stack showing how we got here. Will be NULL when splitting the
2094 * true root, or during a concurrent root split, where we can be inefficient.
2095 * isroot - we split the true root
2096 * isonly - we split a page alone on its level (might have been fast root)
2098 static void
2099 _bt_insert_parent(Relation rel,
2100 Relation heaprel,
2101 Buffer buf,
2102 Buffer rbuf,
2103 BTStack stack,
2104 bool isroot,
2105 bool isonly)
2107 Assert(heaprel != NULL);
2110 * Here we have to do something Lehman and Yao don't talk about: deal with
2111 * a root split and construction of a new root. If our stack is empty
2112 * then we have just split a node on what had been the root level when we
2113 * descended the tree. If it was still the root then we perform a
2114 * new-root construction. If it *wasn't* the root anymore, search to find
2115 * the next higher level that someone constructed meanwhile, and find the
2116 * right place to insert as for the normal case.
2118 * If we have to search for the parent level, we do so by re-descending
2119 * from the root. This is not super-efficient, but it's rare enough not
2120 * to matter.
2122 if (isroot)
2124 Buffer rootbuf;
2126 Assert(stack == NULL);
2127 Assert(isonly);
2128 /* create a new root node one level up and update the metapage */
2129 rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2130 /* release the split buffers */
2131 _bt_relbuf(rel, rootbuf);
2132 _bt_relbuf(rel, rbuf);
2133 _bt_relbuf(rel, buf);
2135 else
2137 BlockNumber bknum = BufferGetBlockNumber(buf);
2138 BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2139 Page page = BufferGetPage(buf);
2140 IndexTuple new_item;
2141 BTStackData fakestack;
2142 IndexTuple ritem;
2143 Buffer pbuf;
2145 if (stack == NULL)
2147 BTPageOpaque opaque;
2149 elog(DEBUG2, "concurrent ROOT page split");
2150 opaque = BTPageGetOpaque(page);
2153 * We should never reach here when the insert of newitem was able to
2154 * apply the fastpath optimization and a leaf page split took place
2155 * anyway. Make sure of that with an assertion.
2157 * This is more of a performance issue than a correctness issue.
2158 * The fastpath won't have a descent stack. Using a phony stack
2159 * here works, but never rely on that. The fastpath should be
2160 * rejected within _bt_search_insert() when the rightmost leaf
2161 * page will split, since it's faster to go through _bt_search()
2162 * and get a stack in the usual way.
2164 Assert(!(P_ISLEAF(opaque) &&
2165 BlockNumberIsValid(RelationGetTargetBlock(rel))));
2167 /* Find the leftmost page at the next level up */
2168 pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
2169 /* Set up a phony stack entry pointing there */
2170 stack = &fakestack;
2171 stack->bts_blkno = BufferGetBlockNumber(pbuf);
2172 stack->bts_offset = InvalidOffsetNumber;
2173 stack->bts_parent = NULL;
2174 _bt_relbuf(rel, pbuf);
2177 /* get high key from left, a strict lower bound for new right page */
2178 ritem = (IndexTuple) PageGetItem(page,
2179 PageGetItemId(page, P_HIKEY));
2181 /* form an index tuple that points at the new right page */
2182 new_item = CopyIndexTuple(ritem);
2183 BTreeTupleSetDownLink(new_item, rbknum);
2186 * Re-find and write lock the parent of buf.
2188 * It's possible that the location of buf's downlink has changed since
2189 * our initial _bt_search() descent. _bt_getstackbuf() will detect
2190 * and recover from this, updating the stack, which ensures that the
2191 * new downlink will be inserted at the correct offset. Even buf's
2192 * parent may have changed.
2194 pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2197 * Unlock the right child. The left child will be unlocked in
2198 * _bt_insertonpg().
2200 * Unlocking the right child must be delayed until here to ensure that
2201 * no concurrent VACUUM operation can become confused. Page deletion
2202 * cannot be allowed to fail to re-find a downlink for the rbuf page.
2203 * (Actually, this is just a vestige of how things used to work. The
2204 * page deletion code is expected to check for the INCOMPLETE_SPLIT
2205 * flag on the left child. It won't attempt deletion of the right
2206 * child until the split is complete. Despite all this, we opt to
2207 * conservatively delay unlocking the right child until here.)
2209 _bt_relbuf(rel, rbuf);
2211 if (pbuf == InvalidBuffer)
2212 ereport(ERROR,
2213 (errcode(ERRCODE_INDEX_CORRUPTED),
2214 errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2215 RelationGetRelationName(rel), bknum, rbknum)));
2217 /* Recursively insert into the parent */
2218 _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2219 new_item, MAXALIGN(IndexTupleSize(new_item)),
2220 stack->bts_offset + 1, 0, isonly);
2222 /* be tidy */
2223 pfree(new_item);
2228 * _bt_finish_split() -- Finish an incomplete split
2230 * A crash or other failure can leave a split incomplete. The insertion
2231 * routines won't allow insertion on a page that is incompletely split.
2232 * Before inserting on such a page, call _bt_finish_split().
2234 * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2235 * and unpinned.
2237 * Caller must provide a valid heaprel, since finishing a page split requires
2238 * allocating a new page if and when the parent page splits in turn.
2240 void
2241 _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2243 Page lpage = BufferGetPage(lbuf);
2244 BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2245 Buffer rbuf;
2246 Page rpage;
2247 BTPageOpaque rpageop;
2248 bool wasroot;
2249 bool wasonly;
2251 Assert(P_INCOMPLETE_SPLIT(lpageop));
2252 Assert(heaprel != NULL);
2254 /* Lock right sibling, the one missing the downlink */
2255 rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2256 rpage = BufferGetPage(rbuf);
2257 rpageop = BTPageGetOpaque(rpage);
2259 /* Could this be a root split? */
2260 if (!stack)
2262 Buffer metabuf;
2263 Page metapg;
2264 BTMetaPageData *metad;
2266 /* acquire lock on the metapage */
2267 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2268 metapg = BufferGetPage(metabuf);
2269 metad = BTPageGetMeta(metapg);
2271 wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2273 _bt_relbuf(rel, metabuf);
2275 else
2276 wasroot = false;
2278 /* Was this the only page on the level before split? */
2279 wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2281 elog(DEBUG1, "finishing incomplete split of %u/%u",
2282 BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2284 _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2288 * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2289 * tuple whose downlink points to child page.
2291 * Caller passes child's block number, which is used to identify
2292 * associated pivot tuple in parent page using a linear search that
2293 * matches on pivot's downlink/block number. The expected location of
2294 * the pivot tuple is taken from the stack one level above the child
2295 * page. This is used as a starting point. Insertions into the
2296 * parent level could cause the pivot tuple to move right; deletions
2297 * could cause it to move left, but not left of the page we previously
2298 * found it on.
2300 * Caller can use its stack to relocate the pivot tuple/downlink for
2301 * any same-level page to the right of the page found by its initial
2302 * descent. This is necessary because of the possibility that caller
2303 * moved right to recover from a concurrent page split. It's also
2304 * convenient for certain callers to be able to step right when there
2305 * wasn't a concurrent page split, while still using their original
2306 * stack. For example, the checkingunique _bt_doinsert() case may
2307 * have to step right when there are many physical duplicates, and its
2308 * scantid forces an insertion to the right of the "first page the
2309 * value could be on". (This is also relied on by all of our callers
2310 * when dealing with !heapkeyspace indexes.)
2312 * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2313 * tuple not found (should not happen). Adjusts bts_blkno &
2314 * bts_offset if changed. Page split caller should insert its new
2315 * pivot tuple for its new right sibling page on parent page, at the
2316 * offset number bts_offset + 1.
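* Editor's example (hypothetical block/offset values): if the descent
* stack recorded the pivot at offset 7 of block 42, concurrent inserts
* may since have pushed it to a higher offset on block 42, or onto a
* right sibling reached via btpo_next; the loop below searches right
* from the remembered position, then left, then moves right one page
* at a time.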
2318 Buffer
2319 _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
2321 BlockNumber blkno;
2322 OffsetNumber start;
2324 blkno = stack->bts_blkno;
2325 start = stack->bts_offset;
2327 for (;;)
2329 Buffer buf;
2330 Page page;
2331 BTPageOpaque opaque;
2333 buf = _bt_getbuf(rel, blkno, BT_WRITE);
2334 page = BufferGetPage(buf);
2335 opaque = BTPageGetOpaque(page);
2337 Assert(heaprel != NULL);
2338 if (P_INCOMPLETE_SPLIT(opaque))
2340 _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2341 continue;
2344 if (!P_IGNORE(opaque))
2346 OffsetNumber offnum,
2347 minoff,
2348 maxoff;
2349 ItemId itemid;
2350 IndexTuple item;
2352 minoff = P_FIRSTDATAKEY(opaque);
2353 maxoff = PageGetMaxOffsetNumber(page);
2356 * start = InvalidOffsetNumber means "search the whole page". We
2357 * need this test anyway due to the possibility that the page has a
2358 * high key now when it didn't before.
2360 if (start < minoff)
2361 start = minoff;
2364 * Need this check too, to guard against the possibility that the page
2365 * has split since we visited it originally.
2367 if (start > maxoff)
2368 start = OffsetNumberNext(maxoff);
2371 * These loops will check every item on the page --- but in an
2372 * order that's attuned to the probability of where it actually
2373 * is. Scan to the right first, then to the left.
2375 for (offnum = start;
2376 offnum <= maxoff;
2377 offnum = OffsetNumberNext(offnum))
2379 itemid = PageGetItemId(page, offnum);
2380 item = (IndexTuple) PageGetItem(page, itemid);
2382 if (BTreeTupleGetDownLink(item) == child)
2384 /* Return accurate pointer to where link is now */
2385 stack->bts_blkno = blkno;
2386 stack->bts_offset = offnum;
2387 return buf;
2391 for (offnum = OffsetNumberPrev(start);
2392 offnum >= minoff;
2393 offnum = OffsetNumberPrev(offnum))
2395 itemid = PageGetItemId(page, offnum);
2396 item = (IndexTuple) PageGetItem(page, itemid);
2398 if (BTreeTupleGetDownLink(item) == child)
2400 /* Return accurate pointer to where link is now */
2401 stack->bts_blkno = blkno;
2402 stack->bts_offset = offnum;
2403 return buf;
2409 * The item we're looking for moved right at least one page.
2411 * Lehman and Yao couple/chain locks when moving right here, which we
2412 * can avoid. See nbtree/README.
2414 if (P_RIGHTMOST(opaque))
2416 _bt_relbuf(rel, buf);
2417 return InvalidBuffer;
2419 blkno = opaque->btpo_next;
2420 start = InvalidOffsetNumber;
2421 _bt_relbuf(rel, buf);
2426 * _bt_newlevel() -- Create a new level above root page.
2428 * We've just split the old root page and need to create a new one.
2429 * In order to do this, we add a new root page to the file, then lock
2430 * the metadata page and update it. This is guaranteed to be deadlock-
2431 * free, because all readers release their locks on the metadata page
2432 * before trying to lock the root, and all writers lock the root before
2433 * trying to lock the metadata page. We have a write lock on the old
2434 * root page, so we have not introduced any cycles into the waits-for
2435 * graph.
2437 * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2438 * locked. On exit, a new root page exists with entries for the
2439 * two new children, metapage is updated and unlocked/unpinned.
2440 * The new root buffer is returned to the caller, which must unlock/unpin
2441 * lbuf, rbuf and rootbuf.
2443 static Buffer
2444 _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
2446 Buffer rootbuf;
2447 Page lpage,
2448 rootpage;
2449 BlockNumber lbkno,
2450 rbkno;
2451 BlockNumber rootblknum;
2452 BTPageOpaque rootopaque;
2453 BTPageOpaque lopaque;
2454 ItemId itemid;
2455 IndexTuple item;
2456 IndexTuple left_item;
2457 Size left_item_sz;
2458 IndexTuple right_item;
2459 Size right_item_sz;
2460 Buffer metabuf;
2461 Page metapg;
2462 BTMetaPageData *metad;
2464 lbkno = BufferGetBlockNumber(lbuf);
2465 rbkno = BufferGetBlockNumber(rbuf);
2466 lpage = BufferGetPage(lbuf);
2467 lopaque = BTPageGetOpaque(lpage);
2469 /* get a new root page */
2470 rootbuf = _bt_allocbuf(rel, heaprel);
2471 rootpage = BufferGetPage(rootbuf);
2472 rootblknum = BufferGetBlockNumber(rootbuf);
2474 /* acquire lock on the metapage */
2475 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2476 metapg = BufferGetPage(metabuf);
2477 metad = BTPageGetMeta(metapg);
2480 * Create downlink item for left page (old root). The key value used is
2481 * "minus infinity", a sentinel value that's reliably less than any real
2482 * key value that could appear in the left page.
2484 left_item_sz = sizeof(IndexTupleData);
2485 left_item = (IndexTuple) palloc(left_item_sz);
2486 left_item->t_info = left_item_sz;
2487 BTreeTupleSetDownLink(left_item, lbkno);
2488 BTreeTupleSetNAtts(left_item, 0, false);
2491 * Create downlink item for right page. The key for it is obtained from
2492 * the "high key" position in the left page.
2494 itemid = PageGetItemId(lpage, P_HIKEY);
2495 right_item_sz = ItemIdGetLength(itemid);
2496 item = (IndexTuple) PageGetItem(lpage, itemid);
2497 right_item = CopyIndexTuple(item);
2498 BTreeTupleSetDownLink(right_item, rbkno);
2500 /* NO EREPORT(ERROR) from here till newroot op is logged */
2501 START_CRIT_SECTION();
2503 /* upgrade metapage if needed */
2504 if (metad->btm_version < BTREE_NOVAC_VERSION)
2505 _bt_upgrademetapage(metapg);
2507 /* set btree special data */
2508 rootopaque = BTPageGetOpaque(rootpage);
2509 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2510 rootopaque->btpo_flags = BTP_ROOT;
2511 rootopaque->btpo_level =
2512 (BTPageGetOpaque(lpage))->btpo_level + 1;
2513 rootopaque->btpo_cycleid = 0;
2515 /* update metapage data */
2516 metad->btm_root = rootblknum;
2517 metad->btm_level = rootopaque->btpo_level;
2518 metad->btm_fastroot = rootblknum;
2519 metad->btm_fastlevel = rootopaque->btpo_level;
2522 * Insert the left page pointer into the new root page. The root page is
2523 * the rightmost page on its level so there is no "high key" in it; the
2524 * two items will go into positions P_HIKEY and P_FIRSTKEY.
2526 * Note: we *must* insert the two items in item-number order, for the
2527 * benefit of _bt_restore_page().
2529 Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2530 if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2531 false, false) == InvalidOffsetNumber)
2532 elog(PANIC, "failed to add leftkey to new root page"
2533 " while splitting block %u of index \"%s\"",
2534 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2537 * insert the right page pointer into the new root page.
2539 Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2540 Assert(BTreeTupleGetNAtts(right_item, rel) <=
2541 IndexRelationGetNumberOfKeyAttributes(rel));
2542 if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2543 false, false) == InvalidOffsetNumber)
2544 elog(PANIC, "failed to add rightkey to new root page"
2545 " while splitting block %u of index \"%s\"",
2546 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2548 /* Clear the incomplete-split flag in the left child */
2549 Assert(P_INCOMPLETE_SPLIT(lopaque));
2550 lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2551 MarkBufferDirty(lbuf);
2553 MarkBufferDirty(rootbuf);
2554 MarkBufferDirty(metabuf);
2556 /* XLOG stuff */
2557 if (RelationNeedsWAL(rel))
2559 xl_btree_newroot xlrec;
2560 XLogRecPtr recptr;
2561 xl_btree_metadata md;
2563 xlrec.rootblk = rootblknum;
2564 xlrec.level = metad->btm_level;
2566 XLogBeginInsert();
2567 XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2569 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2570 XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2571 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2573 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2574 md.version = metad->btm_version;
2575 md.root = rootblknum;
2576 md.level = metad->btm_level;
2577 md.fastroot = rootblknum;
2578 md.fastlevel = metad->btm_level;
2579 md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2580 md.allequalimage = metad->btm_allequalimage;
2582 XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2585 * Direct access to the page is not ideal, but it is faster; ideally the
2586 * page API would provide a function for this.
2588 XLogRegisterBufData(0,
2589 (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2590 ((PageHeader) rootpage)->pd_special -
2591 ((PageHeader) rootpage)->pd_upper);
2593 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2595 PageSetLSN(lpage, recptr);
2596 PageSetLSN(rootpage, recptr);
2597 PageSetLSN(metapg, recptr);
2600 END_CRIT_SECTION();
2602 /* done with metapage */
2603 _bt_relbuf(rel, metabuf);
2605 pfree(left_item);
2606 pfree(right_item);
2608 return rootbuf;
2612 * _bt_pgaddtup() -- add a data item to a particular page during split.
2614 * The difference between this routine and a bare PageAddItem call is
2615 * that this code can deal with the first data item on an internal btree
2616 * page in passing. This data item (which is called "firstright" within
2617 * _bt_split()) has a key that must be treated as minus infinity after
2618 * the split. Therefore, we truncate away all attributes when caller
2619 * specifies it's the first data item on page (downlink is not changed,
2620 * though). This extra step is only needed for the right page of an
2621 * internal page split. There is no need to do this for the first data
2622 * item on the existing/left page, since that will already have been
2623 * truncated during an earlier page split.
2625 * See _bt_split() for a high level explanation of why we truncate here.
2626 * Note that this routine has nothing to do with suffix truncation,
2627 * despite using some of the same infrastructure.
2629 static inline bool
2630 _bt_pgaddtup(Page page,
2631 Size itemsize,
2632 IndexTuple itup,
2633 OffsetNumber itup_off,
2634 bool newfirstdataitem)
2636 IndexTupleData trunctuple;
2638 if (newfirstdataitem)
2640 trunctuple = *itup;
2641 trunctuple.t_info = sizeof(IndexTupleData);
2642 BTreeTupleSetNAtts(&trunctuple, 0, false);
2643 itup = &trunctuple;
2644 itemsize = sizeof(IndexTupleData);
2647 if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2648 false) == InvalidOffsetNumber))
2649 return false;
2651 return true;
2655 * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2657 * There are three operations performed here: simple index deletion, bottom-up
2658 * index deletion, and deduplication. If all three operations fail to free
2659 * enough space for the incoming item then caller will go on to split the
2660 * page. We always consider simple deletion first. If that doesn't work out
2661 * we consider alternatives. Callers that only want us to consider simple
2662 * deletion (without any fallback) ask for that using the 'simpleonly'
2663 * argument.
2665 * We usually pick only one alternative "complex" operation when simple
2666 * deletion alone won't prevent a page split. The 'checkingunique',
2667 * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2669 * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2670 * level flag was found set. The flag was useful back when there wasn't
2671 * necessarily one single page for a duplicate tuple to go on (before heap TID
2672 * became a part of the key space in version 4 indexes). But we don't
2673 * actually look at the flag anymore (it's not a gating condition for our
2674 * caller). That would cause us to miss tuples that are safe to delete,
2675 * without getting any benefit in return. We know that the alternative is to
2676 * split the page; scanning the line pointer array in passing won't have
2677 * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2678 * all this because !heapkeyspace indexes must still do a "getting tired"
2679 * linear search, and so are likely to get some benefit from using it as a
2680 * gating condition.)
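* Editor's recap of the control flow implemented below: an LP_DEAD
* simple deletion pass always runs first; if that frees enough space,
* or the caller asked for simpleonly, or this is a checkingunique
* caller with no apparent duplicates, we return. Otherwise a bottom-up
* deletion pass runs when indexUnchanged or uniquedup is set, and
* finally a deduplication pass runs when deduplication is enabled and
* the index is allequalimage.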
2682 static void
2683 _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2684 BTInsertState insertstate,
2685 bool simpleonly, bool checkingunique,
2686 bool uniquedup, bool indexUnchanged)
2688 OffsetNumber deletable[MaxIndexTuplesPerPage];
2689 int ndeletable = 0;
2690 OffsetNumber offnum,
2691 minoff,
2692 maxoff;
2693 Buffer buffer = insertstate->buf;
2694 BTScanInsert itup_key = insertstate->itup_key;
2695 Page page = BufferGetPage(buffer);
2696 BTPageOpaque opaque = BTPageGetOpaque(page);
2698 Assert(P_ISLEAF(opaque));
2699 Assert(simpleonly || itup_key->heapkeyspace);
2700 Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
2703 * Scan over all items to see which ones need to be deleted according to
2704 * LP_DEAD flags. We'll usually manage to delete a few extra items that
2705 * are not marked LP_DEAD in passing. Often the extra items that actually
2706 * end up getting deleted are items that would have had their LP_DEAD bit
2707 * set before long anyway (if we opted not to include them as extras).
2709 minoff = P_FIRSTDATAKEY(opaque);
2710 maxoff = PageGetMaxOffsetNumber(page);
2711 for (offnum = minoff;
2712 offnum <= maxoff;
2713 offnum = OffsetNumberNext(offnum))
2715 ItemId itemId = PageGetItemId(page, offnum);
2717 if (ItemIdIsDead(itemId))
2718 deletable[ndeletable++] = offnum;
2721 if (ndeletable > 0)
2723 _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2724 insertstate->itup, minoff, maxoff);
2725 insertstate->bounds_valid = false;
2727 /* Return when a page split has already been avoided */
2728 if (PageGetFreeSpace(page) >= insertstate->itemsz)
2729 return;
2731 /* Might as well assume duplicates (if checkingunique) */
2732 uniquedup = true;
2736 * We're done with simple deletion. Return early for callers that only
2737 * want simple deletion to be considered. This includes callers that
2738 * explicitly ask for that (simpleonly) and checkingunique callers that
2739 * probably don't have any version churn duplicates on the page.
2741 * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2742 * return at this point (or when we go on to try either or both of our
2743 * other strategies and they also fail). We do not bother expending a
2744 * separate write to clear it, however. Caller will definitely clear it
2745 * when it goes on to split the page (note also that the deduplication
2746 * process will clear the flag in passing, just to keep things tidy).
2748 if (simpleonly || (checkingunique && !uniquedup))
2750 Assert(!indexUnchanged);
2751 return;
2754 /* Assume bounds about to be invalidated (this is almost certain now) */
2755 insertstate->bounds_valid = false;
2758 * Perform bottom-up index deletion pass when executor hint indicated that
2759 * incoming item is logically unchanged, or for a unique index that is
2760 * known to have physical duplicates for some other reason. (There is a
2761 * large overlap between these two cases for a unique index. It's worth
2762 * having both triggering conditions in order to apply the optimization in
2763 * the event of successive related INSERT and DELETE statements.)
2765 * We'll go on to do a deduplication pass when a bottom-up pass fails to
2766 * delete an acceptable amount of free space (a significant fraction of
2767 * the page, or space for the new item, whichever is greater).
2769 * Note: Bottom-up index deletion uses the same equality/equivalence
2770 * routines as deduplication internally. However, it does not merge
2771 * together index tuples, so the same correctness considerations do not
2772 * apply. We deliberately omit an index-is-allequalimage test here.
2774 if ((indexUnchanged || uniquedup) &&
2775 _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2776 return;
2778 /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2779 if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2780 _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2781 (indexUnchanged || uniquedup));
2785 * _bt_simpledel_pass - Simple index tuple deletion pass.
2787 * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2788 * of all such tuples are determined by caller (caller passes these to us as
2789 * its 'deletable' argument).
2791 * We might also delete extra index tuples that turn out to be safe to delete
2792 * in passing (though they must be cheap to check in passing to begin with).
2793 * There is no certainty that any extra tuples will be deleted, though. The
2794 * high level goal of the approach we take is to get the most out of each call
2795 * here (without noticeably increasing the per-call overhead compared to what
2796 * we need to do just to be able to delete the page's LP_DEAD-marked index
2797 * tuples).
2799 * The number of extra index tuples that turn out to be deletable might
2800 * greatly exceed the number of LP_DEAD-marked index tuples due to various
2801 * locality-related effects. For example, it's possible that the total number
2802 * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2803 * quite low, in which case we might end up checking if it's possible to
2804 * delete _most_ index tuples on the page (without the tableam needing to
2805 * access additional table blocks). The tableam will sometimes stumble upon
2806 * _many_ extra deletable index tuples in indexes where this pattern is
2807 * common.
2809 * See nbtree/README for further details on simple index tuple deletion.
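* Editor's illustration (hypothetical block numbers): if the LP_DEAD
* tuples on the page all point into table blocks 17 and 42, then every
* other index tuple whose TID also lands in block 17 or 42 becomes a
* cheap extra candidate, since the tableam will visit those table
* blocks anyway.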
2811 static void
2812 _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2813 OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2814 OffsetNumber minoff, OffsetNumber maxoff)
2816 Page page = BufferGetPage(buffer);
2817 BlockNumber *deadblocks;
2818 int ndeadblocks;
2819 TM_IndexDeleteOp delstate;
2820 OffsetNumber offnum;
2822 /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2823 deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2824 &ndeadblocks);
2826 /* Initialize tableam state that describes index deletion operation */
2827 delstate.irel = rel;
2828 delstate.iblknum = BufferGetBlockNumber(buffer);
2829 delstate.bottomup = false;
2830 delstate.bottomupfreespace = 0;
2831 delstate.ndeltids = 0;
2832 delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2833 delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2835 for (offnum = minoff;
2836 offnum <= maxoff;
2837 offnum = OffsetNumberNext(offnum))
2839 ItemId itemid = PageGetItemId(page, offnum);
2840 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2841 TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2842 TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2843 BlockNumber tidblock;
2844 void *match;
2846 if (!BTreeTupleIsPosting(itup))
2848 tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2849 match = bsearch(&tidblock, deadblocks, ndeadblocks,
2850 sizeof(BlockNumber), _bt_blk_cmp);
2852 if (!match)
2854 Assert(!ItemIdIsDead(itemid));
2855 continue;
2859 * TID's table block is among those pointed to by the TIDs from
2860 * LP_DEAD-bit set tuples on page -- add TID to deltids
2862 odeltid->tid = itup->t_tid;
2863 odeltid->id = delstate.ndeltids;
2864 ostatus->idxoffnum = offnum;
2865 ostatus->knowndeletable = ItemIdIsDead(itemid);
2866 ostatus->promising = false; /* unused */
2867 ostatus->freespace = 0; /* unused */
2869 delstate.ndeltids++;
2871 else
2873 int nitem = BTreeTupleGetNPosting(itup);
2875 for (int p = 0; p < nitem; p++)
2877 ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2879 tidblock = ItemPointerGetBlockNumber(tid);
2880 match = bsearch(&tidblock, deadblocks, ndeadblocks,
2881 sizeof(BlockNumber), _bt_blk_cmp);
2883 if (!match)
2885 Assert(!ItemIdIsDead(itemid));
2886 continue;
2890 * TID's table block is among those pointed to by the TIDs
2891 * from LP_DEAD-bit set tuples on page -- add TID to deltids
2893 odeltid->tid = *tid;
2894 odeltid->id = delstate.ndeltids;
2895 ostatus->idxoffnum = offnum;
2896 ostatus->knowndeletable = ItemIdIsDead(itemid);
2897 ostatus->promising = false; /* unused */
2898 ostatus->freespace = 0; /* unused */
2900 odeltid++;
2901 ostatus++;
2902 delstate.ndeltids++;
2907 pfree(deadblocks);
2909 Assert(delstate.ndeltids >= ndeletable);
2911 /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2912 _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2914 pfree(delstate.deltids);
2915 pfree(delstate.status);
2919 * _bt_deadblocks() -- Get LP_DEAD-related table blocks.
2921 * Builds sorted and unique-ified array of table block numbers from index
2922 * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2923 * block from incoming newitem just in case it isn't among the LP_DEAD-related
2924 * table blocks.
2926 * Always counting the newitem's table block as an LP_DEAD-related block makes
2927 * sense because the cost is consistently low; it is practically certain that
2928 * the table block will not incur a buffer miss in tableam. On the other hand
2929 * the benefit is often quite high. There is a decent chance that there will
2930 * be some deletable items from this block, since in general most garbage
2931 * tuples became garbage in the recent past (in many cases this won't be the
2932 * first logical row that core code recently added to or modified in the
2933 * table block).
2935 * Returns final array, and sets *nblocks to its final size for caller.
2937 static BlockNumber *
2938 _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2939 IndexTuple newitem, int *nblocks)
2941 int spacentids,
2942 ntids;
2943 BlockNumber *tidblocks;
2946 * Accumulate each TID's block in array whose initial size has space for
2947 * one table block per LP_DEAD-set tuple (plus space for the newitem table
2948 * block). Array will only need to grow when there are LP_DEAD-marked
2949 * posting list tuples (which is not that common).
2951 spacentids = ndeletable + 1;
2952 ntids = 0;
2953 tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2956 * First add the table block for the incoming newitem. This is the one
2957 * case where simple deletion can visit a table block that doesn't have
2958 * any known deletable items.
2960 Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2961 tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2963 for (int i = 0; i < ndeletable; i++)
2965 ItemId itemid = PageGetItemId(page, deletable[i]);
2966 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2968 Assert(ItemIdIsDead(itemid));
2970 if (!BTreeTupleIsPosting(itup))
2972 if (ntids + 1 > spacentids)
2974 spacentids *= 2;
2975 tidblocks = (BlockNumber *)
2976 repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2979 tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2981 else
2983 int nposting = BTreeTupleGetNPosting(itup);
2985 if (ntids + nposting > spacentids)
2987 spacentids = Max(spacentids * 2, ntids + nposting);
2988 tidblocks = (BlockNumber *)
2989 repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2992 for (int j = 0; j < nposting; j++)
2994 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
2996 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
3001 qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3002 *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3004 return tidblocks;
3008 * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3010 static inline int
3011 _bt_blk_cmp(const void *arg1, const void *arg2)
3013 BlockNumber b1 = *((BlockNumber *) arg1);
3014 BlockNumber b2 = *((BlockNumber *) arg2);
3016 if (b1 < b2)
3017 return -1;
3018 else if (b1 > b2)
3019 return 1;
3021 return 0;
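#if 0
/*
 * Editor's sketch (hypothetical helper, not part of the original file):
 * shows how the comparator above is combined with qsort(), qunique() and
 * bsearch() by _bt_deadblocks() and _bt_simpledel_pass().
 */
static bool
example_probe_dead_blocks(BlockNumber *blocks, int nblocks, BlockNumber key)
{
	/* sort and de-duplicate, as _bt_deadblocks() does */
	qsort(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);
	nblocks = (int) qunique(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);

	/* probe with the same comparator, as _bt_simpledel_pass() does */
	return bsearch(&key, blocks, nblocks,
				   sizeof(BlockNumber), _bt_blk_cmp) != NULL;
}
#endif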