/*-------------------------------------------------------------------------
 *
 * nbtinsert.c
 *	  Item insertion in Lehman and Yao btrees for Postgres.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "common/pg_prng.h"
#include "lib/qunique.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"

/* Minimum tree height for application of fastpath optimization */
#define BTREE_FASTPATH_MIN_LEVEL	2

static BTStack _bt_search_insert(Relation rel, Relation heaprel,
								 BTInsertState insertstate);
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
									  Relation heapRel,
									  IndexUniqueCheck checkUnique, bool *is_unique,
									  uint32 *speculativeToken);
static OffsetNumber _bt_findinsertloc(Relation rel,
									  BTInsertState insertstate,
									  bool checkingunique,
									  bool indexUnchanged,
									  BTStack stack,
									  Relation heapRel);
static void _bt_stepright(Relation rel, Relation heaprel,
						  BTInsertState insertstate, BTStack stack);
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
						   Buffer buf,
						   Buffer cbuf,
						   BTStack stack,
						   IndexTuple itup,
						   Size itemsz,
						   OffsetNumber newitemoff,
						   int postingoff,
						   bool split_only_page);
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
						Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
						Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
						IndexTuple nposting, uint16 postingoff);
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
							  Buffer rbuf, BTStack stack, bool isroot, bool isonly);
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
								OffsetNumber itup_off, bool newfirstdataitem);
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
										 BTInsertState insertstate,
										 bool simpleonly, bool checkingunique,
										 bool uniquedup, bool indexUnchanged);
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
							   OffsetNumber *deletable, int ndeletable,
							   IndexTuple newitem, OffsetNumber minoff,
							   OffsetNumber maxoff);
static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
								   int ndeletable, IndexTuple newitem,
								   int *nblocks);
static inline int _bt_blk_cmp(const void *arg1, const void *arg2);

/*
 *	_bt_doinsert() -- Handle insertion of a single index tuple in the tree.
 *
 *		This routine is called by the public interface routine, btinsert.
 *		By here, itup is filled in, including the TID.
 *
 *		If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
 *		will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
 *		UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
 *		For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
 *		don't actually insert.
 *
 *		indexUnchanged executor hint indicates if itup is from an
 *		UPDATE that didn't logically change the indexed value, but
 *		must nevertheless have a new entry to point to a successor
 *		version.
 *
 *		The result value is only significant for UNIQUE_CHECK_PARTIAL:
 *		it must be true if the entry is known unique, else false.
 *		(In the current implementation we'll also return true after a
 *		successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
 *		that's just a coding artifact.)
 */
bool
_bt_doinsert(Relation rel, IndexTuple itup,
			 IndexUniqueCheck checkUnique, bool indexUnchanged,
			 Relation heapRel)
{
	bool		is_unique = false;
	BTInsertStateData insertstate;
	BTScanInsert itup_key;
	BTStack		stack;
	bool		checkingunique = (checkUnique != UNIQUE_CHECK_NO);

	/* we need an insertion scan key to do our search, so build one */
	itup_key = _bt_mkscankey(rel, itup);

	if (checkingunique)
	{
		if (!itup_key->anynullkeys)
		{
			/* No (heapkeyspace) scantid until uniqueness established */
			itup_key->scantid = NULL;
		}
		else
		{
			/*
			 * Scan key for new tuple contains NULL key values.  Bypass
			 * checkingunique steps.  They are unnecessary because core code
			 * considers NULL unequal to every value, including NULL.
			 *
			 * This optimization avoids O(N^2) behavior within the
			 * _bt_findinsertloc() heapkeyspace path when a unique index has a
			 * large number of "duplicates" with NULL key values.
			 */
			checkingunique = false;
			/* Tuple is unique in the sense that core code cares about */
			Assert(checkUnique != UNIQUE_CHECK_EXISTING);
			is_unique = true;
		}
	}

	/*
	 * Fill in the BTInsertState working area, to track the current page and
	 * position within the page to insert on.
	 *
	 * Note that itemsz is passed down to lower level code that deals with
	 * inserting the item.  It must be MAXALIGN()'d.  This ensures that space
	 * accounting code consistently considers the alignment overhead that we
	 * expect PageAddItem() will add later.  (Actually, index_form_tuple() is
	 * already conservative about alignment, but we don't rely on that from
	 * this distance.  Besides, preserving the "true" tuple size in index
	 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
	 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
	 */
	insertstate.itup = itup;
	insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
	insertstate.itup_key = itup_key;
	insertstate.bounds_valid = false;
	insertstate.buf = InvalidBuffer;
	insertstate.postingoff = 0;

search:

	/*
	 * Find and lock the leaf page that the tuple should be added to by
	 * searching from the root page.  insertstate.buf will hold a buffer that
	 * is locked in exclusive mode afterwards.
	 */
	stack = _bt_search_insert(rel, heapRel, &insertstate);

	/*
	 * checkingunique inserts are not allowed to go ahead when two tuples with
	 * equal key attribute values would be visible to new MVCC snapshots once
	 * the xact commits.  Check for conflicts in the locked page/buffer (if
	 * needed) here.
	 *
	 * It might be necessary to check a page to the right in _bt_check_unique,
	 * though that should be very rare.  In practice the first page the value
	 * could be on (with scantid omitted) is almost always also the only page
	 * that a matching tuple might be found on.  This is due to the behavior
	 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
	 * only be allowed to cross a page boundary when there is no candidate
	 * leaf page split point that avoids it.  Also, _bt_check_unique can use
	 * the leaf page high key to determine that there will be no duplicates on
	 * the right sibling without actually visiting it (it uses the high key in
	 * cases where the new item happens to belong at the far right of the leaf
	 * page).
	 *
	 * NOTE: obviously, _bt_check_unique can only detect keys that are already
	 * in the index; so it cannot defend against concurrent insertions of the
	 * same key.  We protect against that by means of holding a write lock on
	 * the first page the value could be on, with omitted/-inf value for the
	 * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
	 * the same key must acquire a write lock on the same page, so only one
	 * would-be inserter can be making the check at one time.  Furthermore,
	 * once we are past the check we hold write locks continuously until we
	 * have performed our insertion, so no later inserter can fail to see our
	 * insertion.  (This requires some care in _bt_findinsertloc.)
	 *
	 * If we must wait for another xact, we release the lock while waiting,
	 * and then must perform a new search.
	 *
	 * For a partial uniqueness check, we don't wait for the other xact. Just
	 * let the tuple in and return false for possibly non-unique, or true for
	 * definitely unique.
	 */
	if (checkingunique)
	{
		TransactionId xwait;
		uint32		speculativeToken;

		xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
								 &is_unique, &speculativeToken);

		if (unlikely(TransactionIdIsValid(xwait)))
		{
			/* Have to wait for the other guy ... */
			_bt_relbuf(rel, insertstate.buf);
			insertstate.buf = InvalidBuffer;

			/*
			 * If it's a speculative insertion, wait for it to finish (ie. to
			 * go ahead with the insertion, or kill the tuple).  Otherwise
			 * wait for the transaction to finish as usual.
			 */
			if (speculativeToken)
				SpeculativeInsertionWait(xwait, speculativeToken);
			else
				XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);

			/* start over... */
			if (stack)
				_bt_freestack(stack);
			goto search;
		}

		/* Uniqueness is established -- restore heap tid as scantid */
		if (itup_key->heapkeyspace)
			itup_key->scantid = &itup->t_tid;
	}

	if (checkUnique != UNIQUE_CHECK_EXISTING)
	{
		OffsetNumber newitemoff;

		/*
		 * The only conflict predicate locking cares about for indexes is when
		 * an index tuple insert conflicts with an existing lock.  We don't
		 * know the actual page we're going to insert on for sure just yet in
		 * checkingunique and !heapkeyspace cases, but it's okay to use the
		 * first page the value could be on (with scantid omitted) instead.
		 */
		CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));

		/*
		 * Do the insertion.  Note that insertstate contains cached binary
		 * search bounds established within _bt_check_unique when insertion is
		 * checkingunique.
		 */
		newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
									   indexUnchanged, stack, heapRel);
		_bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
					   stack, itup, insertstate.itemsz, newitemoff,
					   insertstate.postingoff, false);
	}
	else
	{
		/* just release the buffer */
		_bt_relbuf(rel, insertstate.buf);
	}

	/* be tidy */
	if (stack)
		_bt_freestack(stack);
	pfree(itup_key);

	return is_unique;
}
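
/*
 * Illustrative sketch (not part of the original file): btinsert(), the
 * public entry point mentioned above, is assumed to prepare itup and call
 * _bt_doinsert() roughly like this:
 *
 *		itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
 *		itup->t_tid = *ht_ctid;
 *		result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
 *		pfree(itup);
 *
 * Only the UNIQUE_CHECK_PARTIAL case makes the returned value significant
 * to that caller.
 */
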
/*
 *	_bt_search_insert() -- _bt_search() wrapper for inserts
 *
 * Search the tree for a particular scankey, or more precisely for the first
 * leaf page it could be on.  Try to make use of the fastpath optimization's
 * rightmost leaf page cache before actually searching the tree from the root
 * page.
 *
 * Return value is a stack of parent-page pointers (though see notes about
 * fastpath optimization and page splits below).  insertstate->buf is set to
 * the address of the leaf-page buffer, which is write-locked and pinned in
 * all cases (if necessary by creating a new empty root page for caller).
 *
 * The fastpath optimization avoids most of the work of searching the tree
 * repeatedly when a single backend inserts successive new tuples on the
 * rightmost leaf page of an index.  A backend cache of the rightmost leaf
 * page is maintained within _bt_insertonpg(), and used here.  The cache is
 * invalidated here when an insert of a non-pivot tuple must take place on a
 * non-rightmost leaf page.
 *
 * The optimization helps with indexes on an auto-incremented field.  It also
 * helps with indexes on datetime columns, as well as indexes with lots of
 * NULL values.  (NULLs usually get inserted in the rightmost page for single
 * column indexes, since they usually get treated as coming after everything
 * else in the key space.  Individual NULL tuples will generally be placed on
 * the rightmost leaf page due to the influence of the heap TID column.)
 *
 * Note that we avoid applying the optimization when there is insufficient
 * space on the rightmost page to fit caller's new item.  This is necessary
 * because we'll need to return a real descent stack when a page split is
 * expected (actually, caller can cope with a leaf page split that uses a NULL
 * stack, but that's very slow and so must be avoided).  Note also that the
 * fastpath optimization acquires the lock on the page conditionally as a way
 * of reducing extra contention when there are concurrent insertions into the
 * rightmost page (we give up if we'd have to wait for the lock).  We assume
 * that it isn't useful to apply the optimization when there is contention,
 * since each per-backend cache won't stay valid for long.
 */
static BTStack
_bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
{
	Assert(insertstate->buf == InvalidBuffer);
	Assert(!insertstate->bounds_valid);
	Assert(insertstate->postingoff == 0);

	if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
	{
		/* Simulate a _bt_getbuf() call with conditional locking */
		insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
		if (_bt_conditionallockbuf(rel, insertstate->buf))
		{
			Page		page;
			BTPageOpaque opaque;

			_bt_checkpage(rel, insertstate->buf);
			page = BufferGetPage(insertstate->buf);
			opaque = BTPageGetOpaque(page);

			/*
			 * Check if the page is still the rightmost leaf page and has
			 * enough free space to accommodate the new tuple.  Also check
			 * that the insertion scan key is strictly greater than the first
			 * non-pivot tuple on the page.  (Note that we expect itup_key's
			 * scantid to be unset when our caller is a checkingunique
			 * inserter.)
			 */
			if (P_RIGHTMOST(opaque) &&
				P_ISLEAF(opaque) &&
				!P_IGNORE(opaque) &&
				PageGetFreeSpace(page) > insertstate->itemsz &&
				PageGetMaxOffsetNumber(page) >= P_HIKEY &&
				_bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
			{
				/*
				 * Caller can use the fastpath optimization because cached
				 * block is still rightmost leaf page, which can fit caller's
				 * new tuple without splitting.  Keep block in local cache for
				 * next insert, and have caller use NULL stack.
				 *
				 * Note that _bt_insert_parent() has an assertion that catches
				 * leaf page splits that somehow follow from a fastpath insert
				 * (it should only be passed a NULL stack when it must deal
				 * with a concurrent root page split, and never because a NULL
				 * stack was returned here).
				 */
				return NULL;
			}

			/* Page unsuitable for caller, drop lock and pin */
			_bt_relbuf(rel, insertstate->buf);
		}
		else
		{
			/* Lock unavailable, drop pin */
			ReleaseBuffer(insertstate->buf);
		}

		/* Forget block, since cache doesn't appear to be useful */
		RelationSetTargetBlock(rel, InvalidBlockNumber);
	}

	/* Cannot use optimization -- descend tree, return proper descent stack */
	return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
					  BT_WRITE);
}

/*
 *	_bt_check_unique() -- Check for violation of unique index constraint
 *
 * Returns InvalidTransactionId if there is no conflict, else an xact ID
 * we must wait for to see if it commits a conflicting tuple.   If an actual
 * conflict is detected, no return --- just ereport().  If an xact ID is
 * returned, and the conflicting tuple still has a speculative insertion in
 * progress, *speculativeToken is set to non-zero, and the caller can wait for
 * the verdict on the insertion using SpeculativeInsertionWait().
 *
 * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
 * InvalidTransactionId because we don't want to wait.  In this case we
 * set *is_unique to false if there is a potential conflict, and the
 * core code must redo the uniqueness check later.
 *
 * As a side-effect, sets state in insertstate that can later be used by
 * _bt_findinsertloc() to reuse most of the binary search work we do
 * here.
 *
 * This code treats NULLs as equal, unlike the default semantics for unique
 * indexes.  So do not call here when there are NULL values in scan key and
 * the index uses the default NULLS DISTINCT mode.
 */
static TransactionId
_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
				 IndexUniqueCheck checkUnique, bool *is_unique,
				 uint32 *speculativeToken)
{
	IndexTuple	itup = insertstate->itup;
	IndexTuple	curitup = NULL;
	ItemId		curitemid = NULL;
	BTScanInsert itup_key = insertstate->itup_key;
	SnapshotData SnapshotDirty;
	OffsetNumber offset;
	OffsetNumber maxoff;
	Page		page;
	BTPageOpaque opaque;
	Buffer		nbuf = InvalidBuffer;
	bool		found = false;
	bool		inposting = false;
	bool		prevalldead = true;
	int			curposti = 0;

	/* Assume unique until we find a duplicate */
	*is_unique = true;

	InitDirtySnapshot(SnapshotDirty);

	page = BufferGetPage(insertstate->buf);
	opaque = BTPageGetOpaque(page);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * Find the first tuple with the same key.
	 *
	 * This also saves the binary search bounds in insertstate.  We use them
	 * in the fastpath below, but also in the _bt_findinsertloc() call later.
	 */
	Assert(!insertstate->bounds_valid);
	offset = _bt_binsrch_insert(rel, insertstate);

	/*
	 * Scan over all equal tuples, looking for live conflicts.
	 */
	Assert(!insertstate->bounds_valid || insertstate->low == offset);
	Assert(!itup_key->anynullkeys);
	Assert(itup_key->scantid == NULL);
	for (;;)
	{
		/*
		 * Each iteration of the loop processes one heap TID, not one index
		 * tuple.  Current offset number for page isn't usually advanced on
		 * iterations that process heap TIDs from posting list tuples.
		 *
		 * "inposting" state is set when _inside_ a posting list --- not when
		 * we're at the start (or end) of a posting list.  We advance curposti
		 * at the end of the iteration when inside a posting list tuple.  In
		 * general, every loop iteration either advances the page offset or
		 * advances curposti --- an iteration that handles the rightmost/max
		 * heap TID in a posting list finally advances the page offset (and
		 * unsets "inposting").
		 *
		 * Make sure the offset points to an actual index tuple before trying
		 * to examine it...
		 */
		if (offset <= maxoff)
		{
			/*
			 * Fastpath: In most cases, we can use cached search bounds to
			 * limit our consideration to items that are definitely
			 * duplicates.  This fastpath doesn't apply when the original page
			 * is empty, or when initial offset is past the end of the
			 * original page, which may indicate that we need to examine a
			 * second or subsequent page.
			 *
			 * Note that this optimization allows us to avoid calling
			 * _bt_compare() directly when there are no duplicates, as long as
			 * the offset where the key will go is not at the end of the page.
			 */
			if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
			{
				Assert(insertstate->bounds_valid);
				Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
				Assert(insertstate->low <= insertstate->stricthigh);
				Assert(_bt_compare(rel, itup_key, page, offset) < 0);
				break;
			}

			/*
			 * We can skip items that are already marked killed.
			 *
			 * In the presence of heavy update activity an index may contain
			 * many killed items with the same key; running _bt_compare() on
			 * each killed item gets expensive.  Just advance over killed
			 * items as quickly as we can.  We only apply _bt_compare() when
			 * we get to a non-killed item.  We could reuse the bounds to
			 * avoid _bt_compare() calls for known equal tuples, but it
			 * doesn't seem worth it.
			 */
			if (!inposting)
				curitemid = PageGetItemId(page, offset);
			if (inposting || !ItemIdIsDead(curitemid))
			{
				ItemPointerData htid;
				bool		all_dead = false;

				if (!inposting)
				{
					/* Plain tuple, or first TID in posting list tuple */
					if (_bt_compare(rel, itup_key, page, offset) != 0)
						break;	/* we're past all the equal tuples */

					/* Advanced curitup */
					curitup = (IndexTuple) PageGetItem(page, curitemid);
					Assert(!BTreeTupleIsPivot(curitup));
				}

				/* okay, we gotta fetch the heap tuple using htid ... */
				if (!BTreeTupleIsPosting(curitup))
				{
					/* ... htid is from simple non-pivot tuple */
					Assert(!inposting);
					htid = curitup->t_tid;
				}
				else if (!inposting)
				{
					/* ... htid is first TID in new posting list */
					inposting = true;
					prevalldead = true;
					curposti = 0;
					htid = *BTreeTupleGetPostingN(curitup, 0);
				}
				else
				{
					/* ... htid is second or subsequent TID in posting list */
					Assert(curposti > 0);
					htid = *BTreeTupleGetPostingN(curitup, curposti);
				}

				/*
				 * If we are doing a recheck, we expect to find the tuple we
				 * are rechecking.  It's not a duplicate, but we have to keep
				 * scanning.
				 */
				if (checkUnique == UNIQUE_CHECK_EXISTING &&
					ItemPointerCompare(&htid, &itup->t_tid) == 0)
				{
					found = true;
				}

				/*
				 * Check if there's any table tuples for this index entry
				 * satisfying SnapshotDirty. This is necessary because for AMs
				 * with optimizations like heap's HOT, we have just a single
				 * index entry for the entire chain.
				 */
				else if (table_index_fetch_tuple_check(heapRel, &htid,
													   &SnapshotDirty,
													   &all_dead))
				{
					TransactionId xwait;

					/*
					 * It is a duplicate. If we are only doing a partial
					 * check, then don't bother checking if the tuple is being
					 * updated in another transaction. Just return the fact
					 * that it is a potential conflict and leave the full
					 * check till later. Don't invalidate binary search
					 * bounds.
					 */
					if (checkUnique == UNIQUE_CHECK_PARTIAL)
					{
						if (nbuf != InvalidBuffer)
							_bt_relbuf(rel, nbuf);
						*is_unique = false;
						return InvalidTransactionId;
					}

					/*
					 * If this tuple is being updated by other transaction
					 * then we have to wait for its commit/abort.
					 */
					xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
						SnapshotDirty.xmin : SnapshotDirty.xmax;

					if (TransactionIdIsValid(xwait))
					{
						if (nbuf != InvalidBuffer)
							_bt_relbuf(rel, nbuf);
						/* Tell _bt_doinsert to wait... */
						*speculativeToken = SnapshotDirty.speculativeToken;
						/* Caller releases lock on buf immediately */
						insertstate->bounds_valid = false;
						return xwait;
					}

					/*
					 * Otherwise we have a definite conflict.  But before
					 * complaining, look to see if the tuple we want to insert
					 * is itself now committed dead --- if so, don't complain.
					 * This is a waste of time in normal scenarios but we must
					 * do it to support CREATE INDEX CONCURRENTLY.
					 *
					 * We must follow HOT-chains here because during
					 * concurrent index build, we insert the root TID though
					 * the actual tuple may be somewhere in the HOT-chain.
					 * While following the chain we might not stop at the
					 * exact tuple which triggered the insert, but that's OK
					 * because if we find a live tuple anywhere in this chain,
					 * we have a unique key conflict.  The other live tuple is
					 * not part of this chain because it had a different index
					 * entry.
					 */
					htid = itup->t_tid;
					if (table_index_fetch_tuple_check(heapRel, &htid,
													  SnapshotSelf, NULL))
					{
						/* Normal case --- it's still live */
					}
					else
					{
						/*
						 * It's been deleted, so no error, and no need to
						 * continue searching
						 */
						break;
					}

					/*
					 * Check for a conflict-in as we would if we were going to
					 * write to this page.  We aren't actually going to write,
					 * but we want a chance to report SSI conflicts that would
					 * otherwise be masked by this unique constraint
					 * violation.
					 */
					CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));

					/*
					 * This is a definite conflict.  Break the tuple down into
					 * datums and report the error.  But first, make sure we
					 * release the buffer locks we're holding ---
					 * BuildIndexValueDescription could make catalog accesses,
					 * which in the worst case might touch this same index and
					 * cause deadlocks.
					 */
					if (nbuf != InvalidBuffer)
						_bt_relbuf(rel, nbuf);
					_bt_relbuf(rel, insertstate->buf);
					insertstate->buf = InvalidBuffer;
					insertstate->bounds_valid = false;

					{
						Datum		values[INDEX_MAX_KEYS];
						bool		isnull[INDEX_MAX_KEYS];
						char	   *key_desc;

						index_deform_tuple(itup, RelationGetDescr(rel),
										   values, isnull);

						key_desc = BuildIndexValueDescription(rel, values,
															  isnull);

						ereport(ERROR,
								(errcode(ERRCODE_UNIQUE_VIOLATION),
								 errmsg("duplicate key value violates unique constraint \"%s\"",
										RelationGetRelationName(rel)),
								 key_desc ? errdetail("Key %s already exists.",
													  key_desc) : 0,
								 errtableconstraint(heapRel,
													RelationGetRelationName(rel))));
					}
				}
				else if (all_dead && (!inposting ||
									  (prevalldead &&
									   curposti == BTreeTupleGetNPosting(curitup) - 1)))
				{
					/*
					 * The conflicting tuple (or all HOT chains pointed to by
					 * all posting list TIDs) is dead to everyone, so mark the
					 * index entry killed.
					 */
					ItemIdMarkDead(curitemid);
					opaque->btpo_flags |= BTP_HAS_GARBAGE;

					/*
					 * Mark buffer with a dirty hint, since state is not
					 * crucial. Be sure to mark the proper buffer dirty.
					 */
					if (nbuf != InvalidBuffer)
						MarkBufferDirtyHint(nbuf, true);
					else
						MarkBufferDirtyHint(insertstate->buf, true);
				}

				/*
				 * Remember if posting list tuple has even a single HOT chain
				 * whose members are not all dead
				 */
				if (!all_dead && inposting)
					prevalldead = false;
			}
		}

		if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
		{
			/* Advance to next TID in same posting list */
			curposti++;
			continue;
		}
		else if (offset < maxoff)
		{
			/* Advance to next tuple */
			curposti = 0;
			inposting = false;
			offset = OffsetNumberNext(offset);
		}
		else
		{
			int			highkeycmp;

			/* If scankey == hikey we gotta check the next page too */
			if (P_RIGHTMOST(opaque))
				break;
			highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
			Assert(highkeycmp <= 0);
			if (highkeycmp != 0)
				break;
			/* Advance to next non-dead page --- there must be one */
			for (;;)
			{
				BlockNumber nblkno = opaque->btpo_next;

				nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
				page = BufferGetPage(nbuf);
				opaque = BTPageGetOpaque(page);
				if (!P_IGNORE(opaque))
					break;
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
			}
			/* Will also advance to next tuple */
			curposti = 0;
			inposting = false;
			maxoff = PageGetMaxOffsetNumber(page);
			offset = P_FIRSTDATAKEY(opaque);
			/* Don't invalidate binary search bounds */
		}
	}

	/*
	 * If we are doing a recheck then we should have found the tuple we are
	 * checking.  Otherwise there's something very wrong --- probably, the
	 * index is on a non-immutable expression.
	 */
	if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("failed to re-find tuple within index \"%s\"",
						RelationGetRelationName(rel)),
				 errhint("This may be because of a non-immutable index expression."),
				 errtableconstraint(heapRel,
									RelationGetRelationName(rel))));

	if (nbuf != InvalidBuffer)
		_bt_relbuf(rel, nbuf);

	return InvalidTransactionId;
}
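
/*
 * Illustrative usage sketch (not part of the original file): per the
 * contract above, a caller such as _bt_doinsert() is expected to act on the
 * outputs roughly as follows, with all waiting done by the caller:
 *
 *		xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
 *								 &is_unique, &speculativeToken);
 *		if (TransactionIdIsValid(xwait))
 *			... release insertstate.buf, then SpeculativeInsertionWait() or
 *			... XactLockTableWait(), and retry the whole insertion
 *
 * For UNIQUE_CHECK_PARTIAL the return value is always InvalidTransactionId;
 * a possible conflict is reported only through *is_unique, and the executor
 * rechecks uniqueness later.
 */
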
/*
 *	_bt_findinsertloc() -- Finds an insert location for a tuple
 *
 *		On entry, insertstate buffer contains the page the new tuple belongs
 *		on.  It is exclusive-locked and pinned by the caller.
 *
 *		If 'checkingunique' is true, the buffer on entry is the first page
 *		that contains duplicates of the new key.  If there are duplicates on
 *		multiple pages, the correct insertion position might be some page to
 *		the right, rather than the first page.  In that case, this function
 *		moves right to the correct target page.
 *
 *		(In a !heapkeyspace index, there can be multiple pages with the same
 *		high key, where the new tuple could legitimately be placed on.  In
 *		that case, the caller passes the first page containing duplicates,
 *		just like when checkingunique=true.  If that page doesn't have enough
 *		room for the new tuple, this function moves right, trying to find a
 *		legal page that does.)
 *
 *		If 'indexUnchanged' is true, this is for an UPDATE that didn't
 *		logically change the indexed value, but must nevertheless have a new
 *		entry to point to a successor version.  This hint from the executor
 *		will influence our behavior when the page might have to be split and
 *		we must consider our options.  Bottom-up index deletion can avoid
 *		pathological version-driven page splits, but we only want to go to the
 *		trouble of trying it when we already have moderate confidence that
 *		it's appropriate.  The hint should not significantly affect our
 *		behavior over time unless practically all inserts on to the leaf page
 *		get the hint.
 *
 *		On exit, insertstate buffer contains the chosen insertion page, and
 *		the offset within that page is returned.  If _bt_findinsertloc needed
 *		to move right, the lock and pin on the original page are released, and
 *		the new buffer is exclusively locked and pinned instead.
 *
 *		If insertstate contains cached binary search bounds, we will take
 *		advantage of them.  This avoids repeating comparisons that we made in
 *		_bt_check_unique() already.
 */
static OffsetNumber
_bt_findinsertloc(Relation rel,
				  BTInsertState insertstate,
				  bool checkingunique,
				  bool indexUnchanged,
				  BTStack stack,
				  Relation heapRel)
{
	BTScanInsert itup_key = insertstate->itup_key;
	Page		page = BufferGetPage(insertstate->buf);
	BTPageOpaque opaque;
	OffsetNumber newitemoff;

	opaque = BTPageGetOpaque(page);

	/* Check 1/3 of a page restriction */
	if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
		_bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
							 insertstate->itup);

	Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
	Assert(!insertstate->bounds_valid || checkingunique);
	Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
	Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
	Assert(!itup_key->allequalimage || itup_key->heapkeyspace);

	if (itup_key->heapkeyspace)
	{
		/* Keep track of whether checkingunique duplicate seen */
		bool		uniquedup = indexUnchanged;

		/*
		 * If we're inserting into a unique index, we may have to walk right
		 * through leaf pages to find the one leaf page that we must insert on
		 * to.
		 *
		 * This is needed for checkingunique callers because a scantid was not
		 * used when we called _bt_search().  scantid can only be set after
		 * _bt_check_unique() has checked for duplicates.  The buffer
		 * initially stored in insertstate->buf has the page where the first
		 * duplicate key might be found, which isn't always the page that new
		 * tuple belongs on.  The heap TID attribute for new tuple (scantid)
		 * could force us to insert on a sibling page, though that should be
		 * very rare in practice.
		 */
		if (checkingunique)
		{
			if (insertstate->low < insertstate->stricthigh)
			{
				/* Encountered a duplicate in _bt_check_unique() */
				Assert(insertstate->bounds_valid);
				uniquedup = true;
			}

			for (;;)
			{
				/*
				 * Does the new tuple belong on this page?
				 *
				 * The earlier _bt_check_unique() call may well have
				 * established a strict upper bound on the offset for the new
				 * item.  If it's not the last item of the page (i.e. if there
				 * is at least one tuple on the page that goes after the tuple
				 * we're inserting) then we know that the tuple belongs on
				 * this page.  We can skip the high key check.
				 */
				if (insertstate->bounds_valid &&
					insertstate->low <= insertstate->stricthigh &&
					insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
					break;

				/* Test '<=', not '!=', since scantid is set now */
				if (P_RIGHTMOST(opaque) ||
					_bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
					break;

				_bt_stepright(rel, heapRel, insertstate, stack);
				/* Update local state after stepping right */
				page = BufferGetPage(insertstate->buf);
				opaque = BTPageGetOpaque(page);
				/* Assume duplicates (if checkingunique) */
				uniquedup = true;
			}
		}

		/*
		 * If the target page cannot fit newitem, try to avoid splitting the
		 * page on insert by performing deletion or deduplication now
		 */
		if (PageGetFreeSpace(page) < insertstate->itemsz)
			_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
										 checkingunique, uniquedup,
										 indexUnchanged);
	}
	else
	{
		/*----------
		 * This is a !heapkeyspace (version 2 or 3) index.  The current page
		 * is the first page that we could insert the new tuple to, but there
		 * may be other pages to the right that we could opt to use instead.
		 *
		 * If the new key is equal to one or more existing keys, we can
		 * legitimately place it anywhere in the series of equal keys.  In
		 * fact, if the new key is equal to the page's "high key" we can place
		 * it on the next page.  If it is equal to the high key, and there's
		 * not room to insert the new tuple on the current page without
		 * splitting, then we move right hoping to find more free space and
		 * avoid a split.
		 *
		 * Keep scanning right until we
		 *		(a) find a page with enough free space,
		 *		(b) reach the last page where the tuple can legally go, or
		 *		(c) get tired of searching.
		 * (c) is not flippant; it is important because if there are many
		 * pages' worth of equal keys, it's better to split one of the early
		 * pages than to scan all the way to the end of the run of equal keys
		 * on every insert.  We implement "get tired" as a random choice,
		 * since stopping after scanning a fixed number of pages wouldn't work
		 * well (we'd never reach the right-hand side of previously split
		 * pages).  The probability of moving right is set at 0.99, which may
		 * seem too high to change the behavior much, but it does an excellent
		 * job of preventing O(N^2) behavior with many equal keys.
		 *----------
		 */
		while (PageGetFreeSpace(page) < insertstate->itemsz)
		{
			/*
			 * Before considering moving right, see if we can obtain enough
			 * space by erasing LP_DEAD items
			 */
			if (P_HAS_GARBAGE(opaque))
			{
				/* Perform simple deletion */
				_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
											 false, false, false);

				if (PageGetFreeSpace(page) >= insertstate->itemsz)
					break;		/* OK, now we have enough space */
			}

			/*
			 * Nope, so check conditions (b) and (c) enumerated above
			 *
			 * The earlier _bt_check_unique() call may well have established a
			 * strict upper bound on the offset for the new item.  If it's not
			 * the last item of the page (i.e. if there is at least one tuple
			 * on the page that's greater than the tuple we're inserting to)
			 * then we know that the tuple belongs on this page.  We can skip
			 * the high key check.
			 */
			if (insertstate->bounds_valid &&
				insertstate->low <= insertstate->stricthigh &&
				insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
				break;

			if (P_RIGHTMOST(opaque) ||
				_bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
				pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
				break;

			_bt_stepright(rel, heapRel, insertstate, stack);
			/* Update local state after stepping right */
			page = BufferGetPage(insertstate->buf);
			opaque = BTPageGetOpaque(page);
		}
	}

	/*
	 * We should now be on the correct page.  Find the offset within the page
	 * for the new tuple. (Possibly reusing earlier search bounds.)
	 */
	Assert(P_RIGHTMOST(opaque) ||
		   _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);

	newitemoff = _bt_binsrch_insert(rel, insertstate);

	if (insertstate->postingoff == -1)
	{
		/*
		 * There is an overlapping posting list tuple with its LP_DEAD bit
		 * set.  We don't want to unnecessarily unset its LP_DEAD bit while
		 * performing a posting list split, so perform simple index tuple
		 * deletion early.
		 */
		_bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
									 false, false, false);

		/*
		 * Do new binary search.  New insert location cannot overlap with any
		 * posting list now.
		 */
		Assert(!insertstate->bounds_valid);
		insertstate->postingoff = 0;
		newitemoff = _bt_binsrch_insert(rel, insertstate);
		Assert(insertstate->postingoff == 0);
	}

	return newitemoff;
}
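
/*
 * Note on the "get tired" stop condition in _bt_findinsertloc() above
 * (added for clarity; not part of the original file): pg_prng_uint32()
 * returns a uniform draw from [0, PG_UINT32_MAX], so the test
 * "draw <= PG_UINT32_MAX / 100" gives up on moving right with probability
 * of roughly 1/100 per step, which is where the documented 0.99 probability
 * of moving right comes from.
 */
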
/*
 * Step right to next non-dead page, during insertion.
 *
 * This is a bit more complicated than moving right in a search.  We must
 * write-lock the target page before releasing write lock on current page;
 * else someone else's _bt_check_unique scan could fail to see our insertion.
 * Write locks on intermediate dead pages won't do because we don't know when
 * they will get de-linked from the tree.
 *
 * This is more aggressive than it needs to be for non-unique !heapkeyspace
 * indexes.
 */
static void
_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
			  BTStack stack)
{
	Page		page;
	BTPageOpaque opaque;
	Buffer		rbuf;
	BlockNumber rblkno;

	Assert(heaprel != NULL);
	page = BufferGetPage(insertstate->buf);
	opaque = BTPageGetOpaque(page);

	rbuf = InvalidBuffer;
	rblkno = opaque->btpo_next;
	for (;;)
	{
		rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
		page = BufferGetPage(rbuf);
		opaque = BTPageGetOpaque(page);

		/*
		 * If this page was incompletely split, finish the split now.  We do
		 * this while holding a lock on the left sibling, which is not good
		 * because finishing the split could be a fairly lengthy operation.
		 * But this should happen very seldom.
		 */
		if (P_INCOMPLETE_SPLIT(opaque))
		{
			_bt_finish_split(rel, heaprel, rbuf, stack);
			rbuf = InvalidBuffer;
			continue;
		}

		if (!P_IGNORE(opaque))
			break;
		if (P_RIGHTMOST(opaque))
			elog(ERROR, "fell off the end of index \"%s\"",
				 RelationGetRelationName(rel));

		rblkno = opaque->btpo_next;
	}
	/* rbuf locked; unlock buf, update state for caller */
	_bt_relbuf(rel, insertstate->buf);
	insertstate->buf = rbuf;
	insertstate->bounds_valid = false;
}

/*----------
 *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
 *
 *		This recursive procedure does the following things:
 *
 *			+  if postingoff != 0, splits existing posting list tuple
 *			   (since it overlaps with new 'itup' tuple).
 *			+  if necessary, splits the target page, using 'itup_key' for
 *			   suffix truncation on leaf pages (caller passes NULL for
 *			   non-leaf pages).
 *			+  inserts the new tuple (might be split from posting list).
 *			+  if the page was split, pops the parent stack, and finds the
 *			   right place to insert the new child pointer (by walking
 *			   right using information stored in the parent stack).
 *			+  invokes itself with the appropriate tuple for the right
 *			   child page on the parent.
 *			+  updates the metapage if a true root or fast root is split.
 *
 *		On entry, we must have the correct buffer in which to do the
 *		insertion, and the buffer must be pinned and write-locked.  On return,
 *		we will have dropped both the pin and the lock on the buffer.
 *
 *		This routine only performs retail tuple insertions.  'itup' should
 *		always be either a non-highkey leaf item, or a downlink (new high
 *		key items are created indirectly, when a page is split).  When
 *		inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
 *		we're inserting the downlink for.  This function will clear the
 *		INCOMPLETE_SPLIT flag on it, and release the buffer.
 *----------
 */
static void
_bt_insertonpg(Relation rel,
			   Relation heaprel,
			   BTScanInsert itup_key,
			   Buffer buf,
			   Buffer cbuf,
			   BTStack stack,
			   IndexTuple itup,
			   Size itemsz,
			   OffsetNumber newitemoff,
			   int postingoff,
			   bool split_only_page)
{
	Page		page;
	BTPageOpaque opaque;
	bool		isleaf,
				isroot,
				isrightmost,
				isonly;
	IndexTuple	oposting = NULL;
	IndexTuple	origitup = NULL;
	IndexTuple	nposting = NULL;

	page = BufferGetPage(buf);
	opaque = BTPageGetOpaque(page);
	isleaf = P_ISLEAF(opaque);
	isroot = P_ISROOT(opaque);
	isrightmost = P_RIGHTMOST(opaque);
	isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);

	/* child buffer must be given iff inserting on an internal page */
	Assert(isleaf == !BufferIsValid(cbuf));
	/* tuple must have appropriate number of attributes */
	Assert(!isleaf ||
		   BTreeTupleGetNAtts(itup, rel) ==
		   IndexRelationGetNumberOfAttributes(rel));
	Assert(isleaf ||
		   BTreeTupleGetNAtts(itup, rel) <=
		   IndexRelationGetNumberOfKeyAttributes(rel));
	Assert(!BTreeTupleIsPosting(itup));
	Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
	/* Caller must always finish incomplete split for us */
	Assert(!P_INCOMPLETE_SPLIT(opaque));

	/*
	 * Every internal page should have exactly one negative infinity item at
	 * all times.  Only _bt_split() and _bt_newlevel() should add items that
	 * become negative infinity items through truncation, since they're the
	 * only routines that allocate new internal pages.
	 */
	Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));

	/*
	 * Do we need to split an existing posting list item?
	 */
	if (postingoff != 0)
	{
		ItemId		itemid = PageGetItemId(page, newitemoff);

		/*
		 * The new tuple is a duplicate with a heap TID that falls inside the
		 * range of an existing posting list tuple on a leaf page.  Prepare to
		 * split an existing posting list.  Overwriting the posting list with
		 * its post-split version is treated as an extra step in either the
		 * insert or page split critical section.
		 */
		Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
		oposting = (IndexTuple) PageGetItem(page, itemid);

		/*
		 * postingoff value comes from earlier call to _bt_binsrch_posting().
		 * Its binary search might think that a plain tuple must be a posting
		 * list tuple that needs to be split.  This can happen with corruption
		 * involving an existing plain tuple that is a duplicate of the new
		 * item, up to and including its table TID.  Check for that here in
		 * passing.
		 *
		 * Also verify that our caller has made sure that the existing posting
		 * list tuple does not have its LP_DEAD bit set.
		 */
		if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
									 ItemPointerGetBlockNumber(&itup->t_tid),
									 ItemPointerGetOffsetNumber(&itup->t_tid),
									 newitemoff, BufferGetBlockNumber(buf),
									 RelationGetRelationName(rel))));

		/* use a mutable copy of itup as our itup from here on */
		origitup = itup;
		itup = CopyIndexTuple(origitup);
		nposting = _bt_swap_posting(itup, oposting, postingoff);
		/* itup now contains rightmost/max TID from oposting */

		/* Alter offset so that newitem goes after posting list */
		newitemoff = OffsetNumberNext(newitemoff);
	}

	/*
	 * Do we need to split the page to fit the item on it?
	 *
	 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
	 * so this comparison is correct even though we appear to be accounting
	 * only for the item and not for its line pointer.
	 */
	if (PageGetFreeSpace(page) < itemsz)
	{
		Buffer		rbuf;

		Assert(!split_only_page);

		/* split the buffer into left and right halves */
		rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
						 itup, origitup, nposting, postingoff);
		PredicateLockPageSplit(rel,
							   BufferGetBlockNumber(buf),
							   BufferGetBlockNumber(rbuf));

		/*----------
		 * By here,
		 *
		 *		+  our target page has been split;
		 *		+  the original tuple has been inserted;
		 *		+  we have write locks on both the old (left half)
		 *		   and new (right half) buffers, after the split; and
		 *		+  we know the key we want to insert into the parent
		 *		   (it's the "high key" on the left child page).
		 *
		 * We're ready to do the parent insertion.  We need to hold onto the
		 * locks for the child pages until we locate the parent, but we can
		 * at least release the lock on the right child before doing the
		 * actual insertion.  The lock on the left child will be released
		 * last of all by parent insertion, where it is the 'cbuf' of parent
		 * page.
		 *----------
		 */
		_bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
	}
	else
	{
		Buffer		metabuf = InvalidBuffer;
		Page		metapg = NULL;
		BTMetaPageData *metad = NULL;
		BlockNumber blockcache;

		/*
		 * If we are doing this insert because we split a page that was the
		 * only one on its tree level, but was not the root, it may have been
		 * the "fast root".  We need to ensure that the fast root link points
		 * at or above the current page.  We can safely acquire a lock on the
		 * metapage here --- see comments for _bt_newlevel().
		 */
		if (unlikely(split_only_page))
		{
			Assert(!isleaf);
			Assert(BufferIsValid(cbuf));

			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			if (metad->btm_fastlevel >= opaque->btpo_level)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}

		/* Do the update.  No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		if (postingoff != 0)
			memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));

		if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
						false) == InvalidOffsetNumber)
			elog(PANIC, "failed to add new item to block %u in index \"%s\"",
				 BufferGetBlockNumber(buf), RelationGetRelationName(rel));

		MarkBufferDirty(buf);

		if (BufferIsValid(metabuf))
		{
			/* upgrade meta-page if needed */
			if (metad->btm_version < BTREE_NOVAC_VERSION)
				_bt_upgrademetapage(metapg);
			metad->btm_fastroot = BufferGetBlockNumber(buf);
			metad->btm_fastlevel = opaque->btpo_level;
			MarkBufferDirty(metabuf);
		}

		/*
		 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
		 * finishes a split
		 */
		if (!isleaf)
		{
			Page		cpage = BufferGetPage(cbuf);
			BTPageOpaque cpageop = BTPageGetOpaque(cpage);

			Assert(P_INCOMPLETE_SPLIT(cpageop));
			cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
			MarkBufferDirty(cbuf);
		}

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			xl_btree_insert xlrec;
			xl_btree_metadata xlmeta;
			uint8		xlinfo;
			XLogRecPtr	recptr;
			uint16		upostingoff;

			xlrec.offnum = newitemoff;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);

			if (isleaf && postingoff == 0)
			{
				/* Simple leaf insert */
				xlinfo = XLOG_BTREE_INSERT_LEAF;
			}
			else if (postingoff != 0)
			{
				/*
				 * Leaf insert with posting list split.  Must include
				 * postingoff field before newitem/orignewitem.
				 */
				Assert(isleaf);
				xlinfo = XLOG_BTREE_INSERT_POST;
			}
			else
			{
				/* Internal page insert, which finishes a split on cbuf */
				xlinfo = XLOG_BTREE_INSERT_UPPER;
				XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);

				if (BufferIsValid(metabuf))
				{
					/* Actually, it's an internal page insert + meta update */
					xlinfo = XLOG_BTREE_INSERT_META;

					Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
					xlmeta.version = metad->btm_version;
					xlmeta.root = metad->btm_root;
					xlmeta.level = metad->btm_level;
					xlmeta.fastroot = metad->btm_fastroot;
					xlmeta.fastlevel = metad->btm_fastlevel;
					xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
					xlmeta.allequalimage = metad->btm_allequalimage;

					XLogRegisterBuffer(2, metabuf,
									   REGBUF_WILL_INIT | REGBUF_STANDARD);
					XLogRegisterBufData(2, (char *) &xlmeta,
										sizeof(xl_btree_metadata));
				}
			}

			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
			if (postingoff == 0)
			{
				/* Just log itup from caller */
				XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
			}
			else
			{
				/*
				 * Insert with posting list split (XLOG_BTREE_INSERT_POST
				 * record) case.
				 *
				 * Log postingoff.  Also log origitup, not itup.  REDO routine
				 * must reconstruct final itup (as well as nposting) using
				 * _bt_swap_posting().
				 */
				upostingoff = postingoff;

				XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
				XLogRegisterBufData(0, (char *) origitup,
									IndexTupleSize(origitup));
			}

			recptr = XLogInsert(RM_BTREE_ID, xlinfo);

			if (BufferIsValid(metabuf))
				PageSetLSN(metapg, recptr);
			if (!isleaf)
				PageSetLSN(BufferGetPage(cbuf), recptr);

			PageSetLSN(page, recptr);
		}

		END_CRIT_SECTION();

		/* Release subsidiary buffers */
		if (BufferIsValid(metabuf))
			_bt_relbuf(rel, metabuf);
		if (!isleaf)
			_bt_relbuf(rel, cbuf);

		/*
		 * Cache the block number if this is the rightmost leaf page.  Cache
		 * may be used by a future inserter within _bt_search_insert().
		 */
		blockcache = InvalidBlockNumber;
		if (isrightmost && isleaf && !isroot)
			blockcache = BufferGetBlockNumber(buf);

		/* Release buffer for insertion target block */
		_bt_relbuf(rel, buf);

		/*
		 * If we decided to cache the insertion target block before releasing
		 * its buffer lock, then cache it now.  Check the height of the tree
		 * first, though.  We don't go for the optimization with small
		 * indexes. Defer final check to this point to ensure that we don't
		 * call _bt_getrootheight while holding a buffer lock.
		 */
		if (BlockNumberIsValid(blockcache) &&
			_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
			RelationSetTargetBlock(rel, blockcache);
	}

	/* be tidy */
	if (postingoff != 0)
	{
		/* itup is actually a modified copy of caller's original */
		pfree(nposting);
		pfree(itup);
	}
}
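
/*
 * Illustrative recap (added for clarity; not part of the original file): the
 * fastpath caching at the end of _bt_insertonpg() is logically equivalent to
 *
 *		if (isrightmost && isleaf && !isroot &&
 *			_bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
 *			RelationSetTargetBlock(rel, BufferGetBlockNumber(buf));
 *
 * except that the block number is captured before _bt_relbuf() drops the
 * buffer lock, and _bt_getrootheight() is deliberately only called after
 * that lock has been released.  _bt_search_insert() is the consumer of the
 * cached block.
 */
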
/*
 *	_bt_split() -- split a page in the btree.
 *
 *		On entry, buf is the page to split, and is pinned and write-locked.
 *		newitemoff etc. tell us about the new item that must be inserted
 *		along with the data from the original page.
 *
 *		itup_key is used for suffix truncation on leaf pages (internal
 *		page callers pass NULL).  When splitting a non-leaf page, 'cbuf'
 *		is the left-sibling of the page we're inserting the downlink for.
 *		This function will clear the INCOMPLETE_SPLIT flag on it, and
 *		release the buffer.
 *
 *		orignewitem, nposting, and postingoff are needed when an insert of
 *		orignewitem results in both a posting list split and a page split.
 *		These extra posting list split details are used here in the same
 *		way as they are used in the more common case where a posting list
 *		split does not coincide with a page split.  We need to deal with
 *		posting list splits directly in order to ensure that everything
 *		that follows from the insert of orignewitem is handled as a single
 *		atomic operation (though caller's insert of a new pivot/downlink
 *		into parent page will still be a separate operation).  See
 *		nbtree/README for details on the design of posting list splits.
 *
 *		Returns the new right sibling of buf, pinned and write-locked.
 *		The pin and lock on buf are maintained.
 */
static Buffer
_bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
		  Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
		  IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
{
	Buffer		rbuf;
	Page		origpage;
	Page		leftpage,
				rightpage;
	BlockNumber origpagenumber,
				rightpagenumber;
	BTPageOpaque ropaque,
				lopaque,
				oopaque;
	Buffer		sbuf = InvalidBuffer;
	Page		spage = NULL;
	BTPageOpaque sopaque = NULL;
	Size		itemsz;
	ItemId		itemid;
	IndexTuple	firstright,
				lefthighkey;
	OffsetNumber firstrightoff;
	OffsetNumber afterleftoff,
				afterrightoff,
				minusinfoff;
	OffsetNumber origpagepostingoff;
	OffsetNumber maxoff;
	OffsetNumber i;
	bool		newitemonleft,
				isleaf,
				isrightmost;

	/*
	 * origpage is the original page to be split.  leftpage is a temporary
	 * buffer that receives the left-sibling data, which will be copied back
	 * into origpage on success.  rightpage is the new page that will receive
	 * the right-sibling data.
	 *
	 * leftpage is allocated after choosing a split point.  rightpage's new
	 * buffer isn't acquired until after leftpage is initialized and has new
	 * high key, the last point where splitting the page may fail (barring
	 * corruption).  Failing before acquiring new buffer won't have lasting
	 * consequences, since origpage won't have been modified and leftpage is
	 * only workspace.
	 */
	origpage = BufferGetPage(buf);
	oopaque = BTPageGetOpaque(origpage);
	isleaf = P_ISLEAF(oopaque);
	isrightmost = P_RIGHTMOST(oopaque);
	maxoff = PageGetMaxOffsetNumber(origpage);
	origpagenumber = BufferGetBlockNumber(buf);

	/*
	 * Choose a point to split origpage at.
	 *
	 * A split point can be thought of as a point _between_ two existing data
	 * items on origpage (the lastleft and firstright tuples), provided you
	 * pretend that the new item that didn't fit is already on origpage.
	 *
	 * Since origpage does not actually contain newitem, the representation of
	 * split points needs to work with two boundary cases: splits where
	 * newitem is lastleft, and splits where newitem is firstright.
	 * newitemonleft resolves the ambiguity that would otherwise exist when
	 * newitemoff == firstrightoff.  In all other cases it's clear which side
	 * of the split every tuple goes on from context.  newitemonleft is
	 * usually (but not always) redundant information.
	 *
	 * firstrightoff is supposed to be an origpage offset number, but it's
	 * possible that its value will be maxoff+1, which is "past the end" of
	 * origpage.  This happens in the rare case where newitem goes after all
	 * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
	 * origpage at the point that leaves newitem alone on new right page.  Any
	 * "!newitemonleft && newitemoff == firstrightoff" split point makes
	 * newitem the firstright tuple, though, so this case isn't a special
	 * case.
	 */
	firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
									 newitem, &newitemonleft);

	/* Allocate temp buffer for leftpage */
	leftpage = PageGetTempPage(origpage);
	_bt_pageinit(leftpage, BufferGetPageSize(buf));
	lopaque = BTPageGetOpaque(leftpage);

	/*
	 * leftpage won't be the root when we're done.  Also, clear the SPLIT_END
	 * and HAS_GARBAGE flags.
	 */
	lopaque->btpo_flags = oopaque->btpo_flags;
	lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
	/* set flag in leftpage indicating that rightpage has no downlink yet */
	lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
	lopaque->btpo_prev = oopaque->btpo_prev;
	/* handle btpo_next after rightpage buffer acquired */
	lopaque->btpo_level = oopaque->btpo_level;
	/* handle btpo_cycleid after rightpage buffer acquired */

	/*
	 * Copy the original page's LSN into leftpage, which will become the
	 * updated version of the page.  We need this because XLogInsert will
	 * examine the LSN and possibly dump it in a page image.
	 */
	PageSetLSN(leftpage, PageGetLSN(origpage));

	/*
	 * Determine page offset number of existing overlapped-with-orignewitem
	 * posting list when it is necessary to perform a posting list split in
	 * passing.  Note that newitem was already changed by caller (newitem no
	 * longer has the orignewitem TID).
	 *
	 * This page offset number (origpagepostingoff) will be used to pretend
	 * that the posting split has already taken place, even though the
	 * required modifications to origpage won't occur until we reach the
	 * critical section.  The lastleft and firstright tuples of our page split
	 * point should, in effect, come from an imaginary version of origpage
	 * that has the nposting tuple instead of the original posting list tuple.
	 *
	 * Note: _bt_findsplitloc() should have compensated for coinciding posting
	 * list splits in just the same way, at least in theory.  It doesn't
	 * bother with that, though.  In practice it won't affect its choice of
	 * split point.
	 */
	origpagepostingoff = InvalidOffsetNumber;
	if (postingoff != 0)
	{
		Assert(isleaf);
		Assert(ItemPointerCompare(&orignewitem->t_tid,
								  &newitem->t_tid) < 0);
		Assert(BTreeTupleIsPosting(nposting));
		origpagepostingoff = OffsetNumberPrev(newitemoff);
1599 * The high key for the new left page is a possibly-truncated copy of
1600 * firstright on the leaf level (it's "firstright itself" on internal
1601 * pages; see !isleaf comments below). This may seem to be contrary to
1602 * Lehman & Yao's approach of using a copy of lastleft as the new high key
1603 * when splitting on the leaf level. It isn't, though.
1605 * Suffix truncation will leave the left page's high key fully equal to
1606 * lastleft when lastleft and firstright are equal prior to heap TID (that
1607 * is, the tiebreaker TID value comes from lastleft). It isn't actually
1608 * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1609 * "subtree" invariant to hold. It's sufficient to make sure that the new
1610 * leaf high key is strictly less than firstright, and greater than or
1611 * equal to (not necessarily equal to) lastleft. In other words, when
1612 * suffix truncation isn't possible during a leaf page split, we take
1613 * L&Y's exact approach to generating a new high key for the left page.
1614 * (Actually, that is slightly inaccurate. We don't just use a copy of
1615 * lastleft. A tuple with all the keys from firstright but the max heap
1616 * TID from lastleft is used, to avoid introducing a special case.)
1618 if (!newitemonleft
&& newitemoff
== firstrightoff
)
1620 /* incoming tuple becomes firstright */
1622 firstright
= newitem
;
1626 /* existing item at firstrightoff becomes firstright */
1627 itemid
= PageGetItemId(origpage
, firstrightoff
);
1628 itemsz
= ItemIdGetLength(itemid
);
1629 firstright
= (IndexTuple
) PageGetItem(origpage
, itemid
);
1630 if (firstrightoff
== origpagepostingoff
)
1631 firstright
= nposting
;
	if (isleaf)
	{
		IndexTuple	lastleft;

		/* Attempt suffix truncation for leaf page splits */
		if (newitemonleft && newitemoff == firstrightoff)
		{
			/* incoming tuple becomes lastleft */
			lastleft = newitem;
		}
		else
		{
			OffsetNumber lastleftoff;

			/* existing item before firstrightoff becomes lastleft */
			lastleftoff = OffsetNumberPrev(firstrightoff);
			Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
			itemid = PageGetItemId(origpage, lastleftoff);
			lastleft = (IndexTuple) PageGetItem(origpage, itemid);
			if (lastleftoff == origpagepostingoff)
				lastleft = nposting;
		}

		lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
		itemsz = IndexTupleSize(lefthighkey);
	}
	else
	{
		/*
		 * Don't perform suffix truncation on a copy of firstright to make
		 * left page high key for internal page splits.  Must use firstright
		 * as new high key directly.
		 *
		 * Each distinct separator key value originates as a leaf level high
		 * key; all other separator keys/pivot tuples are copied from one
		 * level down.  A separator key in a grandparent page must be
		 * identical to high key in rightmost parent page of the subtree to
		 * its left, which must itself be identical to high key in rightmost
		 * child page of that same subtree (this even applies to separator
		 * from grandparent's high key).  There must always be an unbroken
		 * "seam" of identical separator keys that guide index scans at every
		 * level, starting from the grandparent.  That's why suffix truncation
		 * isn't safe here.
		 *
		 * Internal page splits will truncate firstright into a "negative
		 * infinity" data item when it gets inserted on the new right page
		 * below, though.  This happens during the call to _bt_pgaddtup() for
		 * the new first data item for right page.  Do not confuse this
		 * mechanism with suffix truncation.  It is just a convenient way of
		 * implementing page splits that split the internal page "inside"
		 * firstright.  The lefthighkey separator key cannot appear a second
		 * time in the right page (only firstright's downlink goes in right
		 * page).
		 */
		lefthighkey = firstright;
	}
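	/*
	 * To illustrate the leaf case above: with an index on (city, street), if
	 * lastleft is ('Boston', 'Oak St') and firstright is ('Chicago',
	 * 'Ash St'), the tuples already differ on the first attribute, so
	 * _bt_truncate() can build the new leaf high key as a single-attribute
	 * pivot ('Chicago'), truncating away the street attribute and the heap
	 * TID entirely.  (Illustrative values only.)
	 */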
	/*
	 * Add new high key to leftpage
	 */
	afterleftoff = P_HIKEY;

	Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
	Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
		   IndexRelationGetNumberOfKeyAttributes(rel));
	Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
	if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
					false) == InvalidOffsetNumber)
		elog(ERROR, "failed to add high key to the left sibling"
			 " while splitting block %u of index \"%s\"",
			 origpagenumber, RelationGetRelationName(rel));
	afterleftoff = OffsetNumberNext(afterleftoff);
	/*
	 * Acquire a new right page to split into, now that left page has a new
	 * high key.  From here on, it's not okay to throw an error without
	 * zeroing rightpage first.  This coding rule ensures that we won't
	 * confuse future VACUUM operations, which might otherwise try to re-find
	 * a downlink to a leftover junk page as the page undergoes deletion.
	 *
	 * It would be reasonable to start the critical section just after the
	 * new rightpage buffer is acquired instead; that would allow us to avoid
	 * leftover junk pages without bothering to zero rightpage.  We do it
	 * this way because it avoids an unnecessary PANIC when either origpage
	 * or its existing sibling page are corrupt.
	 */
	rbuf = _bt_allocbuf(rel, heaprel);
	rightpage = BufferGetPage(rbuf);
	rightpagenumber = BufferGetBlockNumber(rbuf);
	/* rightpage was initialized by _bt_allocbuf */
	ropaque = BTPageGetOpaque(rightpage);
	/*
	 * Finish off remaining leftpage special area fields.  They cannot be set
	 * before both origpage (leftpage) and rightpage buffers are acquired and
	 * locked.
	 *
	 * btpo_cycleid is only used with leaf pages, though we set it here in
	 * all cases just to be consistent.
	 */
	lopaque->btpo_next = rightpagenumber;
	lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);

	/*
	 * rightpage won't be the root when we're done.  Also, clear the
	 * SPLIT_END and HAS_GARBAGE flags.
	 */
	ropaque->btpo_flags = oopaque->btpo_flags;
	ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
	ropaque->btpo_prev = origpagenumber;
	ropaque->btpo_next = oopaque->btpo_next;
	ropaque->btpo_level = oopaque->btpo_level;
	ropaque->btpo_cycleid = lopaque->btpo_cycleid;
	/*
	 * Add new high key to rightpage where necessary.
	 *
	 * If the page we're splitting is not the rightmost page at its level in
	 * the tree, then the first entry on the page is the high key from
	 * origpage.
	 */
	afterrightoff = P_HIKEY;

	if (!isrightmost)
	{
		IndexTuple	righthighkey;

		itemid = PageGetItemId(origpage, P_HIKEY);
		itemsz = ItemIdGetLength(itemid);
		righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
		Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
		Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
			   IndexRelationGetNumberOfKeyAttributes(rel));
		if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
						false, false) == InvalidOffsetNumber)
		{
			memset(rightpage, 0, BufferGetPageSize(rbuf));
			elog(ERROR, "failed to add high key to the right sibling"
				 " while splitting block %u of index \"%s\"",
				 origpagenumber, RelationGetRelationName(rel));
		}
		afterrightoff = OffsetNumberNext(afterrightoff);
	}

	/*
	 * Internal page splits truncate first data item on right page -- it
	 * becomes "minus infinity" item for the page.  Set this up here.
	 */
	minusinfoff = InvalidOffsetNumber;
	if (!isleaf)
		minusinfoff = afterrightoff;
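	/*
	 * The loop below walks origpage in offset order: items before
	 * firstrightoff are appended to leftpage, the rest to rightpage, and
	 * newitem is slotted in when the walk reaches newitemoff.  For example,
	 * with firstrightoff = 6 and newitemoff = 8, the items up to offset 5
	 * land on the left page, offsets 6 and 7 start the right page, then
	 * newitem, then offset 8 and the remaining items.  (Illustrative offsets
	 * only.)
	 */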
	/*
	 * Now transfer all the data items (non-pivot tuples in isleaf case, or
	 * additional pivot tuples in !isleaf case) to the appropriate page.
	 *
	 * Note: we *must* insert at least the right page's items in item-number
	 * order, for the benefit of _bt_restore_page().
	 */
	for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
	{
		IndexTuple	dataitem;

		itemid = PageGetItemId(origpage, i);
		itemsz = ItemIdGetLength(itemid);
		dataitem = (IndexTuple) PageGetItem(origpage, itemid);

		/* replace original item with nposting due to posting split? */
		if (i == origpagepostingoff)
		{
			Assert(BTreeTupleIsPosting(dataitem));
			Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
			dataitem = nposting;
		}

		/* does new item belong before this one? */
		else if (i == newitemoff)
		{
			if (newitemonleft)
			{
				Assert(newitemoff <= firstrightoff);
				if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
								  false))
				{
					memset(rightpage, 0, BufferGetPageSize(rbuf));
					elog(ERROR, "failed to add new item to the left sibling"
						 " while splitting block %u of index \"%s\"",
						 origpagenumber, RelationGetRelationName(rel));
				}
				afterleftoff = OffsetNumberNext(afterleftoff);
			}
			else
			{
				Assert(newitemoff >= firstrightoff);
				if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
								  afterrightoff == minusinfoff))
				{
					memset(rightpage, 0, BufferGetPageSize(rbuf));
					elog(ERROR, "failed to add new item to the right sibling"
						 " while splitting block %u of index \"%s\"",
						 origpagenumber, RelationGetRelationName(rel));
				}
				afterrightoff = OffsetNumberNext(afterrightoff);
			}
		}

		/* decide which page to put it on */
		if (i < firstrightoff)
		{
			if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
			{
				memset(rightpage, 0, BufferGetPageSize(rbuf));
				elog(ERROR, "failed to add old item to the left sibling"
					 " while splitting block %u of index \"%s\"",
					 origpagenumber, RelationGetRelationName(rel));
			}
			afterleftoff = OffsetNumberNext(afterleftoff);
		}
		else
		{
			if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
							  afterrightoff == minusinfoff))
			{
				memset(rightpage, 0, BufferGetPageSize(rbuf));
				elog(ERROR, "failed to add old item to the right sibling"
					 " while splitting block %u of index \"%s\"",
					 origpagenumber, RelationGetRelationName(rel));
			}
			afterrightoff = OffsetNumberNext(afterrightoff);
		}
	}

	/* Handle case where newitem goes at the end of rightpage */
	if (i <= newitemoff)
	{
		/*
		 * Can't have newitemonleft here; that would imply we were told to put
		 * *everything* on the left page, which cannot fit (if it could, we'd
		 * not be splitting the page).
		 */
		Assert(!newitemonleft && newitemoff == maxoff + 1);
		if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
						  afterrightoff == minusinfoff))
		{
			memset(rightpage, 0, BufferGetPageSize(rbuf));
			elog(ERROR, "failed to add new item to the right sibling"
				 " while splitting block %u of index \"%s\"",
				 origpagenumber, RelationGetRelationName(rel));
		}
		afterrightoff = OffsetNumberNext(afterrightoff);
	}
	/*
	 * We have to grab the original right sibling (if any) and update its
	 * prev link.  We are guaranteed that this is deadlock-free, since we
	 * couple the locks in the standard order: left to right.
	 */
	if (!isrightmost)
	{
		sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
		spage = BufferGetPage(sbuf);
		sopaque = BTPageGetOpaque(spage);
		if (sopaque->btpo_prev != origpagenumber)
		{
			memset(rightpage, 0, BufferGetPageSize(rbuf));
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("right sibling's left-link doesn't match: "
									 "block %u links to %u instead of expected %u in index \"%s\"",
									 oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
									 RelationGetRelationName(rel))));
		}
		/*
		 * Check to see if we can set the SPLIT_END flag in the right-hand
		 * split page; this can save some I/O for vacuum since it need not
		 * proceed to the right sibling.  We can set the flag if the right
		 * sibling has a different cycleid: that means it could not be part
		 * of a group of pages that were all split off from the same ancestor
		 * page.  If you're confused, imagine that page A splits to A B and
		 * then again, yielding A C B, while vacuum is in progress.  Tuples
		 * originally in A could now be in either B or C, hence vacuum must
		 * examine both pages.  But if D, our right sibling, has a different
		 * cycleid then it could not contain any tuples that were in A when
		 * the vacuum started.
		 */
		if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
			ropaque->btpo_flags |= BTP_SPLIT_END;
	}

	/*
	 * Right sibling is locked, new siblings are prepared, but original page
	 * is not updated yet.
	 *
	 * NO EREPORT(ERROR) till right sibling is updated.  We can get away with
	 * not starting the critical section till here because we haven't been
	 * scribbling on the original page yet; see comments above.
	 */
	START_CRIT_SECTION();
	/*
	 * By here, the original data page has been split into two new halves,
	 * and these are correct.  The algorithm requires that the left page
	 * never move during a split, so we copy the new left page back on top of
	 * the original.  We need to do this before writing the WAL record, so
	 * that XLogInsert can WAL log an image of the page if necessary.
	 */
	PageRestoreTempPage(leftpage, origpage);
	/* leftpage, lopaque must not be used below here */

	MarkBufferDirty(buf);
	MarkBufferDirty(rbuf);

	if (!isrightmost)
	{
		sopaque->btpo_prev = rightpagenumber;
		MarkBufferDirty(sbuf);
	}

	/*
	 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
	 * finishes a split of the child.
	 */
	if (!isleaf)
	{
		Page		cpage = BufferGetPage(cbuf);
		BTPageOpaque cpageop = BTPageGetOpaque(cpage);

		cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
		MarkBufferDirty(cbuf);
	}
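	/*
	 * Note that cbuf (when provided) is the left half of an earlier
	 * child-level split: newitem is the downlink for that child's new right
	 * sibling, so installing newitem here is what completes the child's
	 * split.  That's why clearing the child's INCOMPLETE_SPLIT flag belongs
	 * inside this same critical section.
	 */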
	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_split xlrec;
		uint8		xlinfo;
		XLogRecPtr	recptr;

		xlrec.level = ropaque->btpo_level;
		/* See comments below on newitem, orignewitem, and posting lists */
		xlrec.firstrightoff = firstrightoff;
		xlrec.newitemoff = newitemoff;
		xlrec.postingoff = 0;
		if (postingoff != 0 && origpagepostingoff < firstrightoff)
			xlrec.postingoff = postingoff;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);

		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
		XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
		/* Log original right sibling, since we've changed its prev-pointer */
		if (!isrightmost)
			XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
		if (!isleaf)
			XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);

		/*
		 * Log the new item, if it was inserted on the left page.  (If it was
		 * put on the right page, we don't need to explicitly WAL log it
		 * because it's included with all the other items on the right page.)
		 * Show the new item as belonging to the left page buffer, so that it
		 * is not stored if XLogInsert decides it needs a full-page image of
		 * the left page.  We always store newitemoff in the record, though.
		 *
		 * The details are sometimes slightly different for page splits that
		 * coincide with a posting list split.  If both the replacement
		 * posting list and newitem go on the right page, then we don't need
		 * to log anything extra, just like the simple !newitemonleft
		 * no-posting-split case (postingoff is set to zero in the WAL
		 * record, so recovery doesn't need to process a posting list split
		 * at all).  Otherwise, we set postingoff and log orignewitem instead
		 * of newitem, despite having actually inserted newitem.  REDO routine
		 * must reconstruct nposting and newitem using _bt_swap_posting().
		 *
		 * Note: It's possible that our page split point is the point that
		 * makes the posting list lastleft and newitem firstright.  This is
		 * the only case where we log orignewitem/newitem despite newitem
		 * going on the right page.  If XLogInsert decides that it can omit
		 * orignewitem due to logging a full-page image of the left page,
		 * everything still works out, since recovery only needs to log
		 * orignewitem for items on the left page (just like the regular
		 * newitem-logged case).
		 */
		if (newitemonleft && xlrec.postingoff == 0)
			XLogRegisterBufData(0, (char *) newitem, newitemsz);
		else if (xlrec.postingoff != 0)
		{
			Assert(isleaf);
			Assert(newitemonleft || firstrightoff == newitemoff);
			Assert(newitemsz == IndexTupleSize(orignewitem));
			XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
		}
		/* Log the left page's new high key */
		if (!isleaf)
		{
			/* lefthighkey isn't local copy, get current pointer */
			itemid = PageGetItemId(origpage, P_HIKEY);
			lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
		}
		XLogRegisterBufData(0, (char *) lefthighkey,
							MAXALIGN(IndexTupleSize(lefthighkey)));

		/*
		 * Log the contents of the right page in the format understood by
		 * _bt_restore_page().  The whole right page will be recreated.
		 *
		 * Direct access to page is not good but faster - we should implement
		 * some new func in page API.  Note we only store the tuples
		 * themselves, knowing that they were inserted in item-number order
		 * and so the line pointers can be reconstructed.  See comments for
		 * _bt_restore_page().
		 */
		XLogRegisterBufData(1,
							(char *) rightpage + ((PageHeader) rightpage)->pd_upper,
							((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);

		xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
		recptr = XLogInsert(RM_BTREE_ID, xlinfo);

		PageSetLSN(origpage, recptr);
		PageSetLSN(rightpage, recptr);
		if (!isrightmost)
			PageSetLSN(spage, recptr);
		if (!isleaf)
			PageSetLSN(BufferGetPage(cbuf), recptr);
	}

	END_CRIT_SECTION();

	/* release the old right sibling */
	if (!isrightmost)
		_bt_relbuf(rel, sbuf);

	/* release the child */
	if (!isleaf)
		_bt_relbuf(rel, cbuf);

	/* be tidy */
	if (isleaf)
		pfree(lefthighkey);

	/* split's done */
	return rbuf;
}

/*
 * _bt_insert_parent() -- Insert downlink into parent, completing split.
 *
 * On entry, buf and rbuf are the left and right split pages, which we
 * still hold write locks on.  Both locks will be released here.  We
 * release the rbuf lock once we have a write lock on the page that we
 * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
 * The lock on buf is released at the same point as the lock on the parent
 * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
 * atomic operation that completes the split by inserting a new downlink.
 *
 * stack - stack showing how we got here.  Will be NULL when splitting true
 *			root, or during concurrent root split, where we can be inefficient
 * isroot - we split the true root
 * isonly - we split a page alone on its level (might have been fast root)
 */
static void
_bt_insert_parent(Relation rel,
				  Relation heaprel,
				  Buffer buf,
				  Buffer rbuf,
				  BTStack stack,
				  bool isroot,
				  bool isonly)
{
	Assert(heaprel != NULL);
	/*
	 * Here we have to do something Lehman and Yao don't talk about: deal with
	 * a root split and construction of a new root.  If our stack is empty
	 * then we have just split a node on what had been the root level when we
	 * descended the tree.  If it was still the root then we perform a
	 * new-root construction.  If it *wasn't* the root anymore, search to find
	 * the next higher level that someone constructed meanwhile, and find the
	 * right place to insert as for the normal case.
	 *
	 * If we have to search for the parent level, we do so by re-descending
	 * from the root.  This is not super-efficient, but it's rare enough not
	 * to matter.
	 */
	if (isroot)
	{
		Buffer		rootbuf;

		Assert(stack == NULL);
		Assert(isonly);
		/* create a new root node one level up and update the metapage */
		rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
		/* release the split buffers */
		_bt_relbuf(rel, rootbuf);
		_bt_relbuf(rel, rbuf);
		_bt_relbuf(rel, buf);
	}
	else
	{
		BlockNumber bknum = BufferGetBlockNumber(buf);
		BlockNumber rbknum = BufferGetBlockNumber(rbuf);
		Page		page = BufferGetPage(buf);
		IndexTuple	new_item;
		BTStackData fakestack;
		IndexTuple	ritem;
		Buffer		pbuf;

		if (stack == NULL)
		{
			BTPageOpaque opaque;

			elog(DEBUG2, "concurrent ROOT page split");
			opaque = BTPageGetOpaque(page);

			/*
			 * We should never reach here when a leaf page split takes place
			 * despite the insert of newitem being able to apply the fastpath
			 * optimization.  Make sure of that with an assertion.
			 *
			 * This is more of a performance issue than a correctness issue.
			 * The fastpath won't have a descent stack.  Using a phony stack
			 * here works, but never rely on that.  The fastpath should be
			 * rejected within _bt_search_insert() when the rightmost leaf
			 * page will split, since it's faster to go through _bt_search()
			 * and get a stack in the usual way.
			 */
			Assert(!(P_ISLEAF(opaque) &&
					 BlockNumberIsValid(RelationGetTargetBlock(rel))));

			/* Find the leftmost page at the next level up */
			pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
			/* Set up a phony stack entry pointing there */
			stack = &fakestack;
			stack->bts_blkno = BufferGetBlockNumber(pbuf);
			stack->bts_offset = InvalidOffsetNumber;
			stack->bts_parent = NULL;
			_bt_relbuf(rel, pbuf);
		}
		/* get high key from left, a strict lower bound for new right page */
		ritem = (IndexTuple) PageGetItem(page,
										 PageGetItemId(page, P_HIKEY));

		/* form an index tuple that points at the new right page */
		new_item = CopyIndexTuple(ritem);
		BTreeTupleSetDownLink(new_item, rbknum);

		/*
		 * Re-find and write lock the parent of buf.
		 *
		 * It's possible that the location of buf's downlink has changed since
		 * our initial _bt_search() descent.  _bt_getstackbuf() will detect
		 * and recover from this, updating the stack, which ensures that the
		 * new downlink will be inserted at the correct offset.  Even buf's
		 * parent may have changed.
		 */
		pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);

		/*
		 * Unlock the right child.  The left child will be unlocked in
		 * _bt_insertonpg().
		 *
		 * Unlocking the right child must be delayed until here to ensure that
		 * no concurrent VACUUM operation can become confused.  Page deletion
		 * cannot be allowed to fail to re-find a downlink for the rbuf page.
		 * (Actually, this is just a vestige of how things used to work.  The
		 * page deletion code is expected to check for the INCOMPLETE_SPLIT
		 * flag on the left child.  It won't attempt deletion of the right
		 * child until the split is complete.  Despite all this, we opt to
		 * conservatively delay unlocking the right child until here.)
		 */
		_bt_relbuf(rel, rbuf);

		if (pbuf == InvalidBuffer)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
									 RelationGetRelationName(rel), bknum, rbknum)));

		/* Recursively insert into the parent */
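		/*
		 * If the parent page also has to split to fit new_item, then
		 * _bt_insertonpg() will call _bt_split() and _bt_insert_parent()
		 * again for the next level up; the cascade stops at the first level
		 * with enough free space, or at the root, where _bt_newlevel() adds
		 * a new level instead.
		 */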
		_bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
					   new_item, MAXALIGN(IndexTupleSize(new_item)),
					   stack->bts_offset + 1, 0, isonly);

		/* be tidy */
		pfree(new_item);
	}
}

/*
 * _bt_finish_split() -- Finish an incomplete split
 *
 * A crash or other failure can leave a split incomplete.  The insertion
 * routines won't allow insertion on a page that is incompletely split.
 * Before inserting on such a page, call _bt_finish_split().
 *
 * On entry, 'lbuf' must be locked in write-mode.  On exit, it is unlocked
 * and unpinned.
 *
 * Caller must provide a valid heaprel, since finishing a page split requires
 * allocating a new page if and when the parent page splits in turn.
 */
void
_bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
{
	Page		lpage = BufferGetPage(lbuf);
	BTPageOpaque lpageop = BTPageGetOpaque(lpage);
	Buffer		rbuf;
	Page		rpage;
	BTPageOpaque rpageop;
	bool		wasroot;
	bool		wasonly;

	Assert(P_INCOMPLETE_SPLIT(lpageop));
	Assert(heaprel != NULL);

	/* Lock right sibling, the one missing the downlink */
	rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
	rpage = BufferGetPage(rbuf);
	rpageop = BTPageGetOpaque(rpage);

	/* Could this be a root split? */
	if (!stack)
	{
		Buffer		metabuf;
		Page		metapg;
		BTMetaPageData *metad;

		/* acquire lock on the metapage */
		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
		metapg = BufferGetPage(metabuf);
		metad = BTPageGetMeta(metapg);

		wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));

		_bt_relbuf(rel, metabuf);
	}
	else
		wasroot = false;

	/* Was this the only page on the level before split? */
	wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));

	elog(DEBUG1, "finishing incomplete split of %u/%u",
		 BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));

	_bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
}

/*
 *	_bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
 *						 tuple whose downlink points to child page.
 *
 *		Caller passes child's block number, which is used to identify
 *		associated pivot tuple in parent page using a linear search that
 *		matches on pivot's downlink/block number.  The expected location of
 *		the pivot tuple is taken from the stack one level above the child
 *		page.  This is used as a starting point.  Insertions into the
 *		parent level could cause the pivot tuple to move right; deletions
 *		could cause it to move left, but not left of the page we previously
 *		found it on.
 *
 *		Caller can use its stack to relocate the pivot tuple/downlink for
 *		any same-level page to the right of the page found by its initial
 *		descent.  This is necessary because of the possibility that caller
 *		moved right to recover from a concurrent page split.  It's also
 *		convenient for certain callers to be able to step right when there
 *		wasn't a concurrent page split, while still using their original
 *		stack.  For example, the checkingunique _bt_doinsert() case may
 *		have to step right when there are many physical duplicates, and its
 *		scantid forces an insertion to the right of the "first page the
 *		value could be on".  (This is also relied on by all of our callers
 *		when dealing with !heapkeyspace indexes.)
 *
 *		Returns write-locked parent page buffer, or InvalidBuffer if pivot
 *		tuple not found (should not happen).  Adjusts bts_blkno &
 *		bts_offset if changed.  Page split caller should insert its new
 *		pivot tuple for its new right sibling page on parent page, at the
 *		offset number bts_offset + 1.
 */
Buffer
_bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
{
	BlockNumber blkno;
	OffsetNumber start;

	blkno = stack->bts_blkno;
	start = stack->bts_offset;

	for (;;)
	{
		Buffer		buf;
		Page		page;
		BTPageOpaque opaque;

		buf = _bt_getbuf(rel, blkno, BT_WRITE);
		page = BufferGetPage(buf);
		opaque = BTPageGetOpaque(page);

		Assert(heaprel != NULL);
		if (P_INCOMPLETE_SPLIT(opaque))
		{
			_bt_finish_split(rel, heaprel, buf, stack->bts_parent);
			continue;
		}

		if (!P_IGNORE(opaque))
		{
			OffsetNumber offnum,
						minoff,
						maxoff;
			ItemId		itemid;
			IndexTuple	item;

			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(page);

			/*
			 * start = InvalidOffsetNumber means "search the whole page".  We
			 * need this test anyway due to possibility that page has a high
			 * key now when it didn't before.
			 */
			if (start < minoff)
				start = minoff;

			/*
			 * Need this check too, to guard against possibility that page
			 * split since we visited it originally.
			 */
			if (start > maxoff)
				start = OffsetNumberNext(maxoff);
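			/*
			 * For example, if the downlink was at offset 5 when the stack
			 * was built but two pivot tuples have since been inserted in
			 * front of it, the rightward scan below finds it at offset 7; if
			 * a deletion shifted it left instead, the leftward scan picks it
			 * up.  (Illustrative offsets only.)
			 */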
			/*
			 * These loops will check every item on the page --- but in an
			 * order that's attuned to the probability of where it actually
			 * is.  Scan to the right first, then to the left.
			 */
			for (offnum = start;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				item = (IndexTuple) PageGetItem(page, itemid);

				if (BTreeTupleGetDownLink(item) == child)
				{
					/* Return accurate pointer to where link is now */
					stack->bts_blkno = blkno;
					stack->bts_offset = offnum;
					return buf;
				}
			}

			for (offnum = OffsetNumberPrev(start);
				 offnum >= minoff;
				 offnum = OffsetNumberPrev(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				item = (IndexTuple) PageGetItem(page, itemid);

				if (BTreeTupleGetDownLink(item) == child)
				{
					/* Return accurate pointer to where link is now */
					stack->bts_blkno = blkno;
					stack->bts_offset = offnum;
					return buf;
				}
			}
		}

		/*
		 * The item we're looking for moved right at least one page.
		 *
		 * Lehman and Yao couple/chain locks when moving right here, which we
		 * can avoid.  See nbtree/README.
		 */
		if (P_RIGHTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			return InvalidBuffer;
		}
		blkno = opaque->btpo_next;
		start = InvalidOffsetNumber;
		_bt_relbuf(rel, buf);
	}
}

/*
 *	_bt_newlevel() -- Create a new level above root page.
 *
 *		We've just split the old root page and need to create a new one.
 *		In order to do this, we add a new root page to the file, then lock
 *		the metadata page and update it.  This is guaranteed to be deadlock-
 *		free, because all readers release their locks on the metadata page
 *		before trying to lock the root, and all writers lock the root before
 *		trying to lock the metadata page.  We have a write lock on the old
 *		root page, so we have not introduced any cycles into the waits-for
 *		graph.
 *
 *		On entry, lbuf (the old root) and rbuf (its new peer) are write-
 *		locked.  On exit, a new root page exists with entries for the
 *		two new children, metapage is updated and unlocked/unpinned.
 *		The new root buffer is returned to caller which has to unlock/unpin
 *		lbuf, rbuf & rootbuf.
 */
static Buffer
_bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
{
	Buffer		rootbuf;
	Page		lpage,
				rootpage;
	BlockNumber lbkno,
				rbkno;
	BlockNumber rootblknum;
	BTPageOpaque rootopaque;
	BTPageOpaque lopaque;
	ItemId		itemid;
	IndexTuple	item;
	IndexTuple	left_item;
	Size		left_item_sz;
	IndexTuple	right_item;
	Size		right_item_sz;
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;

	lbkno = BufferGetBlockNumber(lbuf);
	rbkno = BufferGetBlockNumber(rbuf);
	lpage = BufferGetPage(lbuf);
	lopaque = BTPageGetOpaque(lpage);

	/* get a new root page */
	rootbuf = _bt_allocbuf(rel, heaprel);
	rootpage = BufferGetPage(rootbuf);
	rootblknum = BufferGetBlockNumber(rootbuf);

	/* acquire lock on the metapage */
	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);
	/*
	 * Create downlink item for left page (old root).  The key value used is
	 * "minus infinity", a sentinel value that's reliably less than any real
	 * key value that could appear in the left page.
	 */
	left_item_sz = sizeof(IndexTupleData);
	left_item = (IndexTuple) palloc(left_item_sz);
	left_item->t_info = left_item_sz;
	BTreeTupleSetDownLink(left_item, lbkno);
	BTreeTupleSetNAtts(left_item, 0, false);
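	/*
	 * A zero-attribute pivot is how nbtree represents "minus infinity": the
	 * downlink stored in t_tid still points at the old root, while every key
	 * attribute is omitted, so any search key compares as greater than this
	 * item and can descend through it.
	 */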
	/*
	 * Create downlink item for right page.  The key for it is obtained from
	 * the "high key" position in the left page.
	 */
	itemid = PageGetItemId(lpage, P_HIKEY);
	right_item_sz = ItemIdGetLength(itemid);
	item = (IndexTuple) PageGetItem(lpage, itemid);
	right_item = CopyIndexTuple(item);
	BTreeTupleSetDownLink(right_item, rbkno);
	/* NO EREPORT(ERROR) from here till newroot op is logged */
	START_CRIT_SECTION();

	/* upgrade metapage if needed */
	if (metad->btm_version < BTREE_NOVAC_VERSION)
		_bt_upgrademetapage(metapg);

	/* set btree special data */
	rootopaque = BTPageGetOpaque(rootpage);
	rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
	rootopaque->btpo_flags = BTP_ROOT;
	rootopaque->btpo_level =
		(BTPageGetOpaque(lpage))->btpo_level + 1;
	rootopaque->btpo_cycleid = 0;

	/* update metapage data */
	metad->btm_root = rootblknum;
	metad->btm_level = rootopaque->btpo_level;
	metad->btm_fastroot = rootblknum;
	metad->btm_fastlevel = rootopaque->btpo_level;
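	/*
	 * The new root is the only page on its level, so it is also trivially
	 * the best "fast root" candidate; that's why fastroot/fastlevel are
	 * simply set to the true root here.
	 */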
	/*
	 * Insert the left page pointer into the new root page.  The root page is
	 * the rightmost page on its level so there is no "high key" in it; the
	 * two items will go into positions P_HIKEY and P_FIRSTKEY.
	 *
	 * Note: we *must* insert the two items in item-number order, for the
	 * benefit of _bt_restore_page().
	 */
	Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
	if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
					false, false) == InvalidOffsetNumber)
		elog(PANIC, "failed to add leftkey to new root page"
			 " while splitting block %u of index \"%s\"",
			 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));

	/*
	 * insert the right page pointer into the new root page.
	 */
	Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
	Assert(BTreeTupleGetNAtts(right_item, rel) <=
		   IndexRelationGetNumberOfKeyAttributes(rel));
	if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
					false, false) == InvalidOffsetNumber)
		elog(PANIC, "failed to add rightkey to new root page"
			 " while splitting block %u of index \"%s\"",
			 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));

	/* Clear the incomplete-split flag in the left child */
	Assert(P_INCOMPLETE_SPLIT(lopaque));
	lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
	MarkBufferDirty(lbuf);

	MarkBufferDirty(rootbuf);
	MarkBufferDirty(metabuf);
	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_newroot xlrec;
		XLogRecPtr	recptr;
		xl_btree_metadata md;

		xlrec.rootblk = rootblknum;
		xlrec.level = metad->btm_level;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);

		XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
		XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
		XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

		Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
		md.version = metad->btm_version;
		md.root = rootblknum;
		md.level = metad->btm_level;
		md.fastroot = rootblknum;
		md.fastlevel = metad->btm_level;
		md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
		md.allequalimage = metad->btm_allequalimage;

		XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

		/*
		 * Direct access to page is not good but faster - we should implement
		 * some new func in page API.
		 */
		XLogRegisterBufData(0,
							(char *) rootpage + ((PageHeader) rootpage)->pd_upper,
							((PageHeader) rootpage)->pd_special -
							((PageHeader) rootpage)->pd_upper);

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

		PageSetLSN(lpage, recptr);
		PageSetLSN(rootpage, recptr);
		PageSetLSN(metapg, recptr);
	}

	END_CRIT_SECTION();

	/* done with metapage */
	_bt_relbuf(rel, metabuf);

	pfree(left_item);
	pfree(right_item);

	return rootbuf;
}

/*
 * _bt_pgaddtup() -- add a data item to a particular page during split.
 *
 *		The difference between this routine and a bare PageAddItem call is
 *		that this code can deal with the first data item on an internal btree
 *		page in passing.  This data item (which is called "firstright" within
 *		_bt_split()) has a key that must be treated as minus infinity after
 *		the split.  Therefore, we truncate away all attributes when caller
 *		specifies it's the first data item on page (downlink is not changed,
 *		though).  This extra step is only needed for the right page of an
 *		internal page split.  There is no need to do this for the first data
 *		item on the existing/left page, since that will already have been
 *		truncated during an earlier page split.
 *
 *		See _bt_split() for a high level explanation of why we truncate here.
 *		Note that this routine has nothing to do with suffix truncation,
 *		despite using some of the same infrastructure.
 */
static inline bool
_bt_pgaddtup(Page page,
			 Size itemsize,
			 IndexTuple itup,
			 OffsetNumber itup_off,
			 bool newfirstdataitem)
{
	IndexTupleData trunctuple;

	if (newfirstdataitem)
	{
		trunctuple = *itup;
		trunctuple.t_info = sizeof(IndexTupleData);
		BTreeTupleSetNAtts(&trunctuple, 0, false);
		itup = &trunctuple;
		itemsize = sizeof(IndexTupleData);
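		/*
		 * trunctuple now consists of just the IndexTupleData header: the
		 * downlink carried in t_tid is preserved, while natts has been set
		 * to zero, which is how the "minus infinity" item is represented.
		 */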
	}

	if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
							 false) == InvalidOffsetNumber))
		return false;

	return true;
}

/*
 * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
 *
 * There are three operations performed here: simple index deletion, bottom-up
 * index deletion, and deduplication.  If all three operations fail to free
 * enough space for the incoming item then caller will go on to split the
 * page.  We always consider simple deletion first.  If that doesn't work out
 * we consider alternatives.  Callers that only want us to consider simple
 * deletion (without any fallback) ask for that using the 'simpleonly'
 * argument.
 *
 * We usually pick only one alternative "complex" operation when simple
 * deletion alone won't prevent a page split.  The 'checkingunique',
 * 'uniquedup', and 'indexUnchanged' arguments are used for that.
 *
 * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
 * level flag was found set.  The flag was useful back when there wasn't
 * necessarily one single page for a duplicate tuple to go on (before heap TID
 * became a part of the key space in version 4 indexes).  But we don't
 * actually look at the flag anymore (it's not a gating condition for our
 * caller).  That would cause us to miss tuples that are safe to delete,
 * without getting any benefit in return.  We know that the alternative is to
 * split the page; scanning the line pointer array in passing won't have
 * noticeable overhead.  (We still maintain the BTP_HAS_GARBAGE flag despite
 * all this because !heapkeyspace indexes must still do a "getting tired"
 * linear search, and so are likely to get some benefit from using it as a
 * gating condition.)
 */
static void
_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
							 BTInsertState insertstate,
							 bool simpleonly, bool checkingunique,
							 bool uniquedup, bool indexUnchanged)
{
	OffsetNumber deletable[MaxIndexTuplesPerPage];
	int			ndeletable = 0;
	OffsetNumber offnum,
				minoff,
				maxoff;
	Buffer		buffer = insertstate->buf;
	BTScanInsert itup_key = insertstate->itup_key;
	Page		page = BufferGetPage(buffer);
	BTPageOpaque opaque = BTPageGetOpaque(page);

	Assert(P_ISLEAF(opaque));
	Assert(simpleonly || itup_key->heapkeyspace);
	Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
	/*
	 * Scan over all items to see which ones need to be deleted according to
	 * LP_DEAD flags.  We'll usually manage to delete a few extra items that
	 * are not marked LP_DEAD in passing.  Often the extra items that actually
	 * end up getting deleted are items that would have had their LP_DEAD bit
	 * set before long anyway (if we opted not to include them as extras).
	 */
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
	{
		_bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
						   insertstate->itup, minoff, maxoff);
		insertstate->bounds_valid = false;

		/* Return when a page split has already been avoided */
		if (PageGetFreeSpace(page) >= insertstate->itemsz)
			return;

		/* Might as well assume duplicates (if checkingunique) */
		uniquedup = true;
	}
	/*
	 * We're done with simple deletion.  Return early with callers that only
	 * call here so that simple deletion can be considered.  This includes
	 * callers that explicitly ask for this and checkingunique callers that
	 * probably don't have any version churn duplicates on the page.
	 *
	 * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
	 * return at this point (or when we go on to try either or both of our
	 * other strategies and they also fail).  We do not bother expending a
	 * separate write to clear it, however.  Caller will definitely clear it
	 * when it goes on to split the page (note also that the deduplication
	 * process will clear the flag in passing, just to keep things tidy).
	 */
	if (simpleonly || (checkingunique && !uniquedup))
	{
		Assert(!indexUnchanged);
		return;
	}

	/* Assume bounds about to be invalidated (this is almost certain now) */
	insertstate->bounds_valid = false;

	/*
	 * Perform bottom-up index deletion pass when executor hint indicated that
	 * incoming item is logically unchanged, or for a unique index that is
	 * known to have physical duplicates for some other reason.  (There is a
	 * large overlap between these two cases for a unique index.  It's worth
	 * having both triggering conditions in order to apply the optimization in
	 * the event of successive related INSERT and DELETE statements.)
	 *
	 * We'll go on to do a deduplication pass when a bottom-up pass fails to
	 * delete an acceptable amount of free space (a significant fraction of
	 * the page, or space for the new item, whichever is greater).
	 *
	 * Note: Bottom-up index deletion uses the same equality/equivalence
	 * routines as deduplication internally.  However, it does not merge
	 * together index tuples, so the same correctness considerations do not
	 * apply.  We deliberately omit an index-is-allequalimage test here.
	 */
	if ((indexUnchanged || uniquedup) &&
		_bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
		return;

	/* Perform deduplication pass (when enabled and index-is-allequalimage) */
	if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
		_bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
					   (indexUnchanged || uniquedup));
}

/*
 * _bt_simpledel_pass - Simple index tuple deletion pass.
 *
 * We delete all LP_DEAD-set index tuples on a leaf page.  The offset numbers
 * of all such tuples are determined by caller (caller passes these to us as
 * its 'deletable' argument).
 *
 * We might also delete extra index tuples that turn out to be safe to delete
 * in passing (though they must be cheap to check in passing to begin with).
 * There is no certainty that any extra tuples will be deleted, though.  The
 * high level goal of the approach we take is to get the most out of each call
 * here (without noticeably increasing the per-call overhead compared to what
 * we need to do just to be able to delete the page's LP_DEAD-marked index
 * tuples).
 *
 * The number of extra index tuples that turn out to be deletable might
 * greatly exceed the number of LP_DEAD-marked index tuples due to various
 * locality related effects.  For example, it's possible that the total number
 * of table blocks (pointed to by all TIDs on the leaf page) is naturally
 * quite low, in which case we might end up checking if it's possible to
 * delete _most_ index tuples on the page (without the tableam needing to
 * access additional table blocks).  The tableam will sometimes stumble upon
 * _many_ extra deletable index tuples in indexes where this pattern is
 * common.
 *
 * See nbtree/README for further details on simple index tuple deletion.
 */
static void
_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
				   OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
				   OffsetNumber minoff, OffsetNumber maxoff)
{
	Page		page = BufferGetPage(buffer);
	BlockNumber *deadblocks;
	int			ndeadblocks;
	TM_IndexDeleteOp delstate;
	OffsetNumber offnum;

	/* Get array of table blocks pointed to by LP_DEAD-set tuples */
	deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
								&ndeadblocks);

	/* Initialize tableam state that describes index deletion operation */
	delstate.irel = rel;
	delstate.iblknum = BufferGetBlockNumber(buffer);
	delstate.bottomup = false;
	delstate.bottomupfreespace = 0;
	delstate.ndeltids = 0;
	delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
	delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
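	/*
	 * deltids and status are parallel arrays: entry deltids[i] names a TID
	 * that we would like the tableam to check, while status[i] records its
	 * index offset and whether it is already known deletable.  The tableam
	 * decides which of the entries are actually safe to delete.
	 */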
	for (offnum = minoff;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
		TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
		TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
		BlockNumber tidblock;
		void	   *match;

		if (!BTreeTupleIsPosting(itup))
		{
			tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
			match = bsearch(&tidblock, deadblocks, ndeadblocks,
							sizeof(BlockNumber), _bt_blk_cmp);

			if (!match)
			{
				Assert(!ItemIdIsDead(itemid));
				continue;
			}

			/*
			 * TID's table block is among those pointed to by the TIDs from
			 * LP_DEAD-bit set tuples on page -- add TID to deltids
			 */
			odeltid->tid = itup->t_tid;
			odeltid->id = delstate.ndeltids;
			ostatus->idxoffnum = offnum;
			ostatus->knowndeletable = ItemIdIsDead(itemid);
			ostatus->promising = false; /* unused */
			ostatus->freespace = 0; /* unused */

			delstate.ndeltids++;
		}
		else
		{
			int			nitem = BTreeTupleGetNPosting(itup);

			for (int p = 0; p < nitem; p++)
			{
				ItemPointer tid = BTreeTupleGetPostingN(itup, p);

				tidblock = ItemPointerGetBlockNumber(tid);
				match = bsearch(&tidblock, deadblocks, ndeadblocks,
								sizeof(BlockNumber), _bt_blk_cmp);

				if (!match)
				{
					Assert(!ItemIdIsDead(itemid));
					continue;
				}

				/*
				 * TID's table block is among those pointed to by the TIDs
				 * from LP_DEAD-bit set tuples on page -- add TID to deltids
				 */
				odeltid->tid = *tid;
				odeltid->id = delstate.ndeltids;
				ostatus->idxoffnum = offnum;
				ostatus->knowndeletable = ItemIdIsDead(itemid);
				ostatus->promising = false; /* unused */
				ostatus->freespace = 0; /* unused */

				odeltid++;
				ostatus++;
				delstate.ndeltids++;
			}
		}
	}
	pfree(deadblocks);

	Assert(delstate.ndeltids >= ndeletable);

	/* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
	_bt_delitems_delete_check(rel, buffer, heapRel, &delstate);

	pfree(delstate.deltids);
	pfree(delstate.status);
}

/*
 * _bt_deadblocks() -- Get LP_DEAD related table blocks.
 *
 * Builds sorted and unique-ified array of table block numbers from index
 * tuple TIDs whose line pointers are marked LP_DEAD.  Also adds the table
 * block from incoming newitem just in case it isn't among the LP_DEAD-related
 * table blocks.
 *
 * Always counting the newitem's table block as an LP_DEAD related block makes
 * sense because the cost is consistently low; it is practically certain that
 * the table block will not incur a buffer miss in tableam.  On the other hand
 * the benefit is often quite high.  There is a decent chance that there will
 * be some deletable items from this block, since in general most garbage
 * tuples became garbage in the recent past (in many cases this won't be the
 * first logical row that core code added to/modified in table block
 * recently).
 *
 * Returns final array, and sets *nblocks to its final size for caller.
 */
static BlockNumber *
_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
			   IndexTuple newitem, int *nblocks)
{
	int			spacentids,
				ntids;
	BlockNumber *tidblocks;

	/*
	 * Accumulate each TID's block in array whose initial size has space for
	 * one table block per LP_DEAD-set tuple (plus space for the newitem table
	 * block).  Array will only need to grow when there are LP_DEAD-marked
	 * posting list tuples (which is not that common).
	 */
	spacentids = ndeletable + 1;
	ntids = 0;
	tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);

	/*
	 * First add the table block for the incoming newitem.  This is the one
	 * case where simple deletion can visit a table block that doesn't have
	 * any known deletable items.
	 */
	Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
	tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);

	for (int i = 0; i < ndeletable; i++)
	{
		ItemId		itemid = PageGetItemId(page, deletable[i]);
		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);

		Assert(ItemIdIsDead(itemid));

		if (!BTreeTupleIsPosting(itup))
		{
			if (ntids + 1 > spacentids)
			{
				spacentids *= 2;
				tidblocks = (BlockNumber *)
					repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
			}

			tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
		}
		else
		{
			int			nposting = BTreeTupleGetNPosting(itup);

			if (ntids + nposting > spacentids)
			{
				spacentids = Max(spacentids * 2, ntids + nposting);
				tidblocks = (BlockNumber *)
					repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
			}

			for (int j = 0; j < nposting; j++)
			{
				ItemPointer tid = BTreeTupleGetPostingN(itup, j);

				tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
			}
		}
	}
, ntids
, sizeof(BlockNumber
), _bt_blk_cmp
);
3002 *nblocks
= qunique(tidblocks
, ntids
, sizeof(BlockNumber
), _bt_blk_cmp
);

/*
 * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
 */
static inline int
_bt_blk_cmp(const void *arg1, const void *arg2)
{
	BlockNumber b1 = *((BlockNumber *) arg1);
	BlockNumber b2 = *((BlockNumber *) arg2);

	if (b1 < b2)
		return -1;
	else if (b1 > b2)
		return 1;

	return 0;
}