/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *	  BTree-specific page management code for the Postgres btree access
 *	  method.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtpage.c
 *
 *	NOTES
 *	   Postgres btree pages look like ordinary relation pages.  The opaque
 *	   data at high addresses includes pointers to left and right siblings
 *	   and flag data describing page state.  The first page in a btree, page
 *	   zero, is special -- it stores meta-information describing the tree.
 *	   Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/snapmgr.h"

static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
                                     bool *rightsib_empty);
static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
                                     OffsetNumber *deletable, int ndeletable);
static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
                                   BTStack stack, Buffer *topparent, OffsetNumber *topoff,
                                   BlockNumber *target, BlockNumber *rightsib);
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
                               TransactionId latestRemovedXid);

/*
 *	_bt_initmetapage() -- Fill a page buffer with a correct metapage image
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
                 bool allequalimage)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    metad->btm_allequalimage = allequalimage;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is essential,
     * because without doing so, metadata will be lost if xlog.c compresses
     * the page.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
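
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * _bt_initmetapage() only fills a caller-supplied page image; it does no
 * buffer management or WAL logging of its own.  A caller along the lines of
 * btbuildempty() (name assumed, lives elsewhere in the tree) would use it
 * roughly like this:
 *
 *		Page	metapage = (Page) palloc(BLCKSZ);
 *
 *		_bt_initmetapage(metapage, P_NONE, 0, allequalimage);
 *		... caller writes out and/or WAL-logs the page image itself ...
 */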

/*
 *	_bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
 *		3, the last version that can be updated without broadly affecting
 *		on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
 *
 *		This routine does purely in-memory image upgrade.  Caller is
 *		responsible for locking, WAL-logging etc.
 */
void
_bt_upgrademetapage(Page page)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;

    metad = BTPageGetMeta(page);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* It must be really a meta page of upgradable version */
    Assert(metaopaque->btpo_flags & BTP_META);
    Assert(metad->btm_version < BTREE_NOVAC_VERSION);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);

    /* Set version number and fill extra fields added into version 3 */
    metad->btm_version = BTREE_NOVAC_VERSION;
    metad->btm_oldest_btpo_xact = InvalidTransactionId;
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
    /* Only a REINDEX can set this field */
    Assert(!metad->btm_allequalimage);
    metad->btm_allequalimage = false;

    /* Adjust pd_lower (see _bt_initmetapage() for details) */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

/*
 * Get metadata from share-locked buffer containing metapage, while performing
 * standard sanity checks.
 *
 * Callers that cache data returned here in local cache should note that an
 * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
 * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
 */
static BTMetaPageData *
_bt_getmeta(Relation rel, Buffer metabuf)
{
    Page        metapg;
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    return metad;
}

/*
 *	_bt_update_meta_cleanup_info() -- Update cleanup-related information in
 *			the metapage.
 *
 *		This routine checks if provided cleanup-related information is matching
 *		to those written in the metapage.  On mismatch, metapage is overwritten.
 */
void
_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
                             float8 numHeapTuples)
{
    Buffer      metabuf;
    Page        metapg;
    BTMetaPageData *metad;
    bool        needsRewrite = false;
    XLogRecPtr  recptr;

    /* read the metapage and check if it needs rewrite */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metad = BTPageGetMeta(metapg);

    /* outdated version of metapage always needs rewrite */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        needsRewrite = true;
    else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
             metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
        needsRewrite = true;

    if (!needsRewrite)
    {
        _bt_relbuf(rel, metabuf);
        return;
    }

    /* trade in our read lock for a write lock */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    LockBuffer(metabuf, BT_WRITE);

    START_CRIT_SECTION();

    /* upgrade meta-page if needed */
    if (metad->btm_version < BTREE_NOVAC_VERSION)
        _bt_upgrademetapage(metapg);

    /* update cleanup-related information */
    metad->btm_oldest_btpo_xact = oldestBtpoXact;
    metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
    MarkBufferDirty(metabuf);

    /* write wal record if needed */
    if (RelationNeedsWAL(rel))
    {
        xl_btree_metadata md;

        XLogBeginInsert();
        XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

        Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
        md.version = metad->btm_version;
        md.root = metad->btm_root;
        md.level = metad->btm_level;
        md.fastroot = metad->btm_fastroot;
        md.fastlevel = metad->btm_fastlevel;
        md.oldest_btpo_xact = oldestBtpoXact;
        md.last_cleanup_num_heap_tuples = numHeapTuples;
        md.allequalimage = metad->btm_allequalimage;

        XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

        PageSetLSN(metapg, recptr);
    }

    END_CRIT_SECTION();

    _bt_relbuf(rel, metabuf);
}

/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.  For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		Also, if the root page is split while we are "in flight" to it,
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
 *		insist on finding the true root.  We do, however, guarantee to
 *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer      metabuf;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version >= BTREE_MIN_VERSION);
        Assert(metad->btm_version <= BTREE_VERSION);
        Assert(!metad->btm_allequalimage ||
               metad->btm_version > BTREE_NOVAC_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted. If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = _bt_getmeta(rel, metabuf);

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        Page        metapg;

        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);

        /*
         * Race condition:	if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee no
             * deadlocks, we have to release the metadata page and start all
             * over again.  (Is that really true? But it's hardly worth trying
             * to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page.  Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;
        /* Get raw page pointer for metapage */
        metapg = BufferGetPage(metabuf);

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        /* upgrade metapage if needed */
        if (metad->btm_version < BTREE_NOVAC_VERSION)
            _bt_upgrademetapage(metapg);

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;
        metad->btm_oldest_btpo_xact = InvalidTransactionId;
        metad->btm_last_cleanup_num_heap_tuples = -1.0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            xl_btree_metadata md;

            XLogBeginInsert();
            XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
            XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

            Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
            md.version = metad->btm_version;
            md.root = rootblkno;
            md.level = 0;
            md.fastroot = rootblkno;
            md.fastlevel = 0;
            md.oldest_btpo_xact = InvalidTransactionId;
            md.last_cleanup_num_heap_tuples = -1.0;
            md.allequalimage = metad->btm_allequalimage;

            XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it without caching */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}
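
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a read-only caller such as _bt_search() (defined elsewhere in the tree)
 * handles the no-root case by checking for InvalidBuffer, and eventually
 * releases the read-locked root with _bt_relbuf():
 *
 *		Buffer	rootbuf = _bt_getroot(rel, BT_READ);
 *
 *		if (!BufferIsValid(rootbuf))
 *			... index is completely empty, nothing to descend into ...
 *		... descend from BufferGetPage(rootbuf), then _bt_relbuf(rel, ...) ...
 */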

/*
 *	_bt_gettrueroot() -- Get the true root page of the btree.
 *
 *		This is the same as the BT_READ case of _bt_getroot(), except
 *		we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure. (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!P_ISMETA(metaopaque) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version < BTREE_MIN_VERSION ||
        metad->btm_version > BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, "
                        "current version %d, minimal supported version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

/*
 *	_bt_getrootheight() -- Get the height of the btree search tree.
 *
 *		We return the level (counting from zero) of the current fast root.
 *		This represents the number of tree levels we'd have to descend through
 *		to start any btree index search.
 *
 *		This is used by the planner for cost-estimation purposes.  Since it's
 *		only an estimate, slightly-stale data is fine, hence we don't worry
 *		about updating previously cached data.
 */
int
_bt_getrootheight(Relation rel)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}
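
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * planner code such as get_relation_info() (name assumed, lives in the
 * planner) simply treats the result as an estimate of descent cost:
 *
 *		info->tree_height = _bt_getrootheight(indexRelation);
 *
 * Slightly stale cached metapage data is acceptable for this purpose.
 */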

/*
 *	_bt_metaversion() -- Get version/status info from metapage.
 *
 *		Sets caller's *heapkeyspace and *allequalimage arguments using data
 *		from the B-Tree metapage (could be locally-cached version).  This
 *		information needs to be stashed in insertion scankey, so we provide a
 *		single function that fetches both at once.
 *
 *		This is used to determine the rules that must be used to descend a
 *		btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
 *		pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
 *		performance when inserting a new BTScanInsert-wise duplicate tuple
 *		among many leaf pages already full of such duplicates.
 *
 *		Also sets allequalimage field, which indicates whether or not it is
 *		safe to apply deduplication.  We rely on the assumption that
 *		btm_allequalimage will be zero'ed on heapkeyspace indexes that were
 *		pg_upgrade'd from Postgres 12.
 */
void
_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
{
    BTMetaPageData *metad;

    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metad = _bt_getmeta(rel, metabuf);

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
         * be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
            *allequalimage = metad->btm_allequalimage;

            _bt_relbuf(rel, metabuf);
            return;
        }

        /*
         * Cache the metapage data for next time
         *
         * An on-the-fly version upgrade performed by _bt_upgrademetapage()
         * can change the nbtree version for an index without invalidating any
         * local cache.  This is okay because it can only happen when moving
         * from version 2 to version 3, both of which are !heapkeyspace
         * versions.
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
        _bt_relbuf(rel, metabuf);
    }

    /* Get cached page */
    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
    Assert(metad->btm_version <= BTREE_VERSION);
    Assert(!metad->btm_allequalimage ||
           metad->btm_version > BTREE_NOVAC_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
    *allequalimage = metad->btm_allequalimage;
}

/*
 *	_bt_checkpage() -- Verify that a freshly-read page looks sane.
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

/*
 * Log the reuse of a page from the FSM.
 */
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
{
    xl_btree_reuse_page xlrec_reuse;

    /*
     * Note that we don't register the buffer with the record, because this
     * operation doesn't modify the page. This record only exists to provide a
     * conflict point for Hot Standby.
     */

    /* XLOG stuff */
    xlrec_reuse.node = rel->rd_node;
    xlrec_reuse.block = blkno;
    xlrec_reuse.latestRemovedXid = latestRemovedXid;

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);

    XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
}

/*
 *	_bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *		blkno == P_NEW means to get an unallocated index page.  The page
 *		will be initialized before returning it.
 *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count has been incremented
 *		(ie, the buffer is "locked and pinned").  Also, we apply
 *		_bt_checkpage to sanity-check the page (except in P_NEW case).
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        LockBuffer(buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool        needLock;
        Page        page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free.  (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page.  If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock.  (The
         * own-caller scenario is actually not improbable. Consider an index
         * on a serial or timestamp column.  Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM. A
         * VACUUM running concurrently with the previous split could well have
         * placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (ConditionalLockBuffer(buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby, in case they have snapshots older
                     * than btpo.xact.  This can only apply if the page does
                     * have a valid btpo.xact value, ie not if it's new.  (We
                     * must check that because an all-zero page has no special
                     * space.)
                     */
                    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
                        !PageIsNew(page))
                    {
                        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                        _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
                    }

                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page.  We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        LockBuffer(buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}
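
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the usual calling pattern pairs _bt_getbuf() with _bt_relbuf(), e.g. to
 * briefly inspect a sibling page while holding the lock:
 *
 *		Buffer	buf = _bt_getbuf(rel, blkno, BT_READ);
 *		Page	page = BufferGetPage(buf);
 *		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 *
 *		... examine page / opaque while locked and pinned ...
 *		_bt_relbuf(rel, buf);
 */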

/*
 *	_bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer      buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    LockBuffer(buf, access);
    _bt_checkpage(rel, buf);
    return buf;
}

/*
 *	_bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    UnlockReleaseBuffer(buf);
}

/*
 *	_bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 *	_bt_page_recyclable() -- Is an existing page recyclable?
 *
 * This exists to make sure _bt_getbuf and btvacuumscan have the same
 * policy about whether a page is safe to re-use.  But note that _bt_getbuf
 * knows enough to distinguish the PageIsNew condition from the other one.
 * At some point it might be appropriate to redesign this to have a three-way
 * result value.
 */
bool
_bt_page_recyclable(Page page)
{
    BTPageOpaque opaque;

    /*
     * It's possible to find an all-zeroes page in an index --- for example, a
     * backend might successfully extend the relation one page and then crash
     * before it is able to make a WAL entry for adding the page. If we find a
     * zeroed page then reclaim it.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recycle if deleted and too old to have any processes
     * interested in the page.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_ISDELETED(opaque) &&
        TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
        return true;
    return false;
}

/*
 * Delete item(s) from a btree leaf page during VACUUM.
 *
 * This routine assumes that the caller has a super-exclusive write lock on
 * the buffer.  Also, the given deletable and updatable arrays *must* be
 * sorted in ascending order.
 *
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
 * in an existing posting list item are to be removed by VACUUM.  This works
 * by updating/overwriting an existing item with caller's new version of the
 * item (a version that lacks the TIDs that are to be deleted).
 *
 * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
 * generate their own latestRemovedXid by accessing the heap directly, whereas
 * VACUUMs rely on the initial heap scan taking care of it indirectly.  Also,
 * only VACUUM can perform granular deletes of individual TIDs in posting list
 * tuples.
 */
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    BTVacuumPosting *updatable, int nupdatable)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    Size        itemsz;
    char       *updatedbuf = NULL;
    Size        updatedbuflen = 0;
    OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0 || nupdatable > 0);

    for (int i = 0; i < nupdatable; i++)
    {
        /* Replace work area IndexTuple with updated version */
        _bt_update_posting(updatable[i]);

        /* Maintain array of updatable page offsets for WAL record */
        updatedoffsets[i] = updatable[i]->updatedoffset;
    }

    /* XLOG stuff -- allocate and fill buffer before critical section */
    if (nupdatable > 0 && RelationNeedsWAL(rel))
    {
        Size        offset = 0;

        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];

            itemsz = SizeOfBtreeUpdate +
                vacposting->ndeletedtids * sizeof(uint16);
            updatedbuflen += itemsz;
        }

        updatedbuf = palloc(updatedbuflen);
        for (int i = 0; i < nupdatable; i++)
        {
            BTVacuumPosting vacposting = updatable[i];
            xl_btree_update update;

            update.ndeletedtids = vacposting->ndeletedtids;
            memcpy(updatedbuf + offset, &update.ndeletedtids,
                   SizeOfBtreeUpdate);
            offset += SizeOfBtreeUpdate;

            itemsz = update.ndeletedtids * sizeof(uint16);
            memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
            offset += itemsz;
        }
    }

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Handle posting tuple updates.
     *
     * Deliberately do this before handling simple deletes.  If we did it the
     * other way around (i.e. WAL record order -- simple deletes before
     * updates) then we'd have to make compensating changes to the 'updatable'
     * array of offset numbers.
     *
     * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
     * happens to already be set.  Although we unset the BTP_HAS_GARBAGE page
     * level flag, unsetting individual LP_DEAD bits should still be avoided.
     */
    for (int i = 0; i < nupdatable; i++)
    {
        OffsetNumber updatedoffset = updatedoffsets[i];
        IndexTuple  itup;

        itup = updatable[i]->itup;
        itemsz = MAXALIGN(IndexTupleSize(itup));
        if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
                                     itemsz))
            elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    }

    /* Now handle simple deletes of entire tuples */
    if (ndeletable > 0)
        PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't targeted by VACUUM's heap scan), but it will be true often
     * enough.  VACUUM does not delete items purely because they have their
     * LP_DEAD bit set, since doing so would necessitate explicitly logging a
     * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
     *
     * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
     * limited, since we never falsely unset an LP_DEAD bit.  Workloads that
     * are particularly dependent on LP_DEAD bits being set quickly will
     * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
     * again anyway.  Furthermore, attempting a deduplication pass will remove
     * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
     * is set or not.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.ndeleted = ndeletable;
        xlrec_vacuum.nupdated = nupdatable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);

        if (ndeletable > 0)
            XLogRegisterBufData(0, (char *) deletable,
                                ndeletable * sizeof(OffsetNumber));

        if (nupdatable > 0)
        {
            XLogRegisterBufData(0, (char *) updatedoffsets,
                                nupdatable * sizeof(OffsetNumber));
            XLogRegisterBufData(0, updatedbuf, updatedbuflen);
        }

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    /* can't leak memory here */
    if (updatedbuf != NULL)
        pfree(updatedbuf);
    /* free tuples generated by calling _bt_update_posting() */
    for (int i = 0; i < nupdatable; i++)
        pfree(updatable[i]->itup);
}

/*
 * Delete item(s) from a btree leaf page during single-page cleanup.
 *
 * This routine assumes that the caller has pinned and write locked the
 * buffer.  Also, the given deletable array *must* be sorted in ascending
 * order.
 *
 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
 * the page, but it needs to generate its own latestRemovedXid by accessing
 * the heap.  This is used by the REDO routine to generate recovery conflicts.
 * Also, it doesn't handle posting list tuples unless the entire tuple can be
 * deleted as a whole (since there is only one LP_DEAD bit per line pointer).
 */
void
_bt_delitems_delete(Relation rel, Buffer buf,
                    OffsetNumber *deletable, int ndeletable,
                    Relation heapRel)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;
    TransactionId latestRemovedXid = InvalidTransactionId;

    /* Shouldn't be called unless there's something to do */
    Assert(ndeletable > 0);

    if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
        latestRemovedXid =
            _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    PageIndexMultiDelete(page, deletable, ndeletable);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
     * because this is not called by VACUUM.  Just clear the BTP_HAS_GARBAGE
     * page flag, since we deleted all items with their LP_DEAD bit set.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        xl_btree_delete xlrec_delete;

        xlrec_delete.latestRemovedXid = latestRemovedXid;
        xlrec_delete.ndeleted = ndeletable;

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);

        /*
         * The deletable array is not in the buffer, but pretend that it is.
         * When XLogInsert stores the whole buffer, the array need not be
         * stored too.
         */
        XLogRegisterBufData(0, (char *) deletable,
                            ndeletable * sizeof(OffsetNumber));

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Get the latestRemovedXid from the table entries pointed to by the non-pivot
 * tuples being deleted.
 *
 * This is a specialized version of index_compute_xid_horizon_for_tuples().
 * It's needed because btree tuples don't always store table TID using the
 * standard index tuple header field.
 */
static TransactionId
_bt_xid_horizon(Relation rel, Relation heapRel, Page page,
                OffsetNumber *deletable, int ndeletable)
{
    TransactionId latestRemovedXid = InvalidTransactionId;
    int         spacenhtids;
    int         nhtids;
    ItemPointer htids;

    /* Array will grow iff there are posting list tuples to consider */
    spacenhtids = ndeletable;
    nhtids = 0;
    htids = (ItemPointer) palloc(sizeof(ItemPointerData) * spacenhtids);
    for (int i = 0; i < ndeletable; i++)
    {
        ItemId      itemid;
        IndexTuple  itup;

        itemid = PageGetItemId(page, deletable[i]);
        itup = (IndexTuple) PageGetItem(page, itemid);

        Assert(ItemIdIsDead(itemid));
        Assert(!BTreeTupleIsPivot(itup));

        if (!BTreeTupleIsPosting(itup))
        {
            if (nhtids + 1 > spacenhtids)
            {
                spacenhtids *= 2;
                htids = (ItemPointer)
                    repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
            }

            Assert(ItemPointerIsValid(&itup->t_tid));
            ItemPointerCopy(&itup->t_tid, &htids[nhtids]);
            nhtids++;
        }
        else
        {
            int         nposting = BTreeTupleGetNPosting(itup);

            if (nhtids + nposting > spacenhtids)
            {
                spacenhtids = Max(spacenhtids * 2, nhtids + nposting);
                htids = (ItemPointer)
                    repalloc(htids, sizeof(ItemPointerData) * spacenhtids);
            }

            for (int j = 0; j < nposting; j++)
            {
                ItemPointer htid = BTreeTupleGetPostingN(itup, j);

                Assert(ItemPointerIsValid(htid));
                ItemPointerCopy(htid, &htids[nhtids]);
                nhtids++;
            }
        }
    }

    Assert(nhtids >= ndeletable);

    latestRemovedXid =
        table_compute_xid_horizon_for_tuples(heapRel, htids, nhtids);

    pfree(htids);

    return latestRemovedXid;
}

/*
 * Returns true, if the given block has the half-dead flag set.
 */
static bool
_bt_is_page_halfdead(Relation rel, BlockNumber blk)
{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    bool        result;

    buf = _bt_getbuf(rel, blk, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    result = P_ISHALFDEAD(opaque);
    _bt_relbuf(rel, buf);

    return result;
}

/*
 * Subroutine to find the parent of the branch we're deleting.  This climbs
 * up the tree until it finds a page with more than one child, i.e. a page
 * that will not be totally emptied by the deletion.  The chain of pages below
 * it, with one downlink each, will form the branch that we need to delete.
 *
 * If we cannot remove the downlink from the parent, because it's the
 * rightmost entry, returns false.  On success, *topparent and *topoff are set
 * to the buffer holding the parent, and the offset of the downlink in it.
 * *topparent is write-locked, the caller is responsible for releasing it when
 * done.  *target is set to the topmost page in the branch to-be-deleted, i.e.
 * the page whose downlink *topparent / *topoff point to, and *rightsib to its
 * right sibling.
 *
 * "child" is the leaf page we wish to delete, and "stack" is a search stack
 * leading to it (it actually leads to the leftmost leaf page with a high key
 * matching that of the page to be deleted in !heapkeyspace indexes).  Note
 * that we will update the stack entry(s) to reflect current downlink
 * positions --- this is essentially the same as the corresponding step of
 * splitting, and is not expected to affect caller.  The caller should
 * initialize *target and *rightsib to the leaf page and its right sibling.
 *
 * Note: it's OK to release page locks on any internal pages between the leaf
 * and *topparent, because a safe deletion can't become unsafe due to
 * concurrent activity.  An internal page can only acquire an entry if the
 * child is split, but that cannot happen as long as we hold a lock on the
 * leaf.
 */
static bool
_bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
                       Buffer *topparent, OffsetNumber *topoff,
                       BlockNumber *target, BlockNumber *rightsib)
{
    BlockNumber parent;
    OffsetNumber poffset,
                maxoff;
    Buffer      pbuf;
    Page        page;
    BTPageOpaque opaque;
    BlockNumber leftsib;

    /*
     * Locate the downlink of "child" in the parent, updating the stack entry
     * if needed.  This is how !heapkeyspace indexes deal with having
     * non-unique high keys in leaf level pages.  Even heapkeyspace indexes
     * can have a stale stack due to insertions into the parent.
     */
    pbuf = _bt_getstackbuf(rel, stack, child);
    if (pbuf == InvalidBuffer)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
                                 RelationGetRelationName(rel), child)));
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child.
     */
    if (poffset >= maxoff)
    {
        /* It's rightmost child... */
        if (poffset == P_FIRSTDATAKEY(opaque))
        {
            /*
             * It's only child, so safe if parent would itself be removable.
             * We have to check the parent itself, and then recurse to test
             * the conditions at the parent's parent.
             */
            if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
                P_INCOMPLETE_SPLIT(opaque))
            {
                _bt_relbuf(rel, pbuf);
                return false;
            }

            *target = parent;
            *rightsib = opaque->btpo_next;
            leftsib = opaque->btpo_prev;

            _bt_relbuf(rel, pbuf);

            /*
             * Like in _bt_pagedel, check that the left sibling is not marked
             * with INCOMPLETE_SPLIT flag.  That would mean that there is no
             * downlink to the page to be deleted, and the page deletion
             * algorithm isn't prepared to handle that.
             */
            if (leftsib != P_NONE)
            {
                Buffer      lbuf;
                Page        lpage;
                BTPageOpaque lopaque;

                lbuf = _bt_getbuf(rel, leftsib, BT_READ);
                lpage = BufferGetPage(lbuf);
                lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

                /*
                 * If the left sibling was concurrently split, so that its
                 * next-pointer doesn't point to the current page anymore, the
                 * split that created the current page must be completed. (We
                 * don't allow splitting an incompletely split page again
                 * until the previous split has been completed)
                 */
                if (lopaque->btpo_next == parent &&
                    P_INCOMPLETE_SPLIT(lopaque))
                {
                    _bt_relbuf(rel, lbuf);
                    return false;
                }
                _bt_relbuf(rel, lbuf);
            }

            return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
                                          topparent, topoff, target, rightsib);
        }
        else
        {
            /* Unsafe to delete */
            _bt_relbuf(rel, pbuf);
            return false;
        }
    }
    else
    {
        /* Not rightmost child, so safe to delete */
        *topparent = pbuf;
        *topoff = poffset;
        return true;
    }
}

/*
 * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and locked (either read or write
 * lock is OK).  This lock and pin will be dropped before exiting.
 *
 * Returns the number of pages successfully deleted (zero if page cannot
 * be deleted now; could be more than one if parent or sibling pages were
 * deleted too).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf)
{
    int         ndeleted = 0;
    BlockNumber rightsib;
    bool        rightsib_empty;
    Page        page;
    BTPageOpaque opaque;

    /*
     * "stack" is a search stack leading (approximately) to the target page.
     * It is initially NULL, but when iterating, we keep it to avoid
     * duplicated search effort.
     *
     * Also, when "stack" is not NULL, we have already checked that the
     * current page is not the right half of an incomplete split, i.e. the
     * left sibling does not have its INCOMPLETE_SPLIT flag set.
     */
    BTStack     stack = NULL;

    for (;;)
    {
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        /*
         * Internal pages are never deleted directly, only as part of deleting
         * the whole branch all the way down to leaf level.
         */
        if (!P_ISLEAF(opaque))
        {
            /*
             * Pre-9.4 page deletion only marked internal pages as half-dead,
             * but now we only use that flag on leaf pages. The old algorithm
             * was never supposed to leave half-dead pages in the tree, it was
             * just a transient state, but it was nevertheless possible in
             * error scenarios. We don't know how to deal with them here. They
             * are harmless as far as searches are considered, but inserts
             * into the deleted keyspace could add out-of-order downlinks in
             * the upper levels. Log a notice, hopefully the admin will notice
             * and reindex.
             */
            if (P_ISHALFDEAD(opaque))
                ereport(LOG,
                        (errcode(ERRCODE_INDEX_CORRUPTED),
                         errmsg("index \"%s\" contains a half-dead internal page",
                                RelationGetRelationName(rel)),
                         errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
            _bt_relbuf(rel, buf);
            return ndeleted;
        }

        /*
         * We can never delete rightmost pages nor root pages.  While at it,
         * check that page is not already deleted and is empty.
         *
         * To keep the algorithm simple, we also never delete an incompletely
         * split page (they should be rare enough that this doesn't make any
         * meaningful difference to disk usage):
         *
         * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
         * left half of an incomplete split, but ensuring that it's not the
         * right half is more complicated.  For that, we have to check that
         * the left sibling doesn't have its INCOMPLETE_SPLIT flag set.  On
         * the first iteration, we temporarily release the lock on the current
         * page, and check the left sibling and also construct a search stack
         * to.  On subsequent iterations, we know we stepped right from a page
         * that passed these tests, so it's OK.
         */
        if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
            P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
            P_INCOMPLETE_SPLIT(opaque))
        {
            /* Should never fail to delete a half-dead page */
            Assert(!P_ISHALFDEAD(opaque));

            _bt_relbuf(rel, buf);
            return ndeleted;
        }

        /*
         * First, remove downlink pointing to the page (or a parent of the
         * page, if we are going to delete a taller branch), and mark the page
         * as half-dead.
         */
        if (!P_ISHALFDEAD(opaque))
        {
            /*
             * We need an approximate pointer to the page's parent page.  We
             * use a variant of the standard search mechanism to search for
             * the page's high key; this will give us a link to either the
             * current parent or someplace to its left (if there are multiple
             * equal high keys, which is possible with !heapkeyspace indexes).
             *
             * Also check if this is the right-half of an incomplete split
             * (see comment above).
             */
            if (!stack)
            {
                BTScanInsert itup_key;
                ItemId      itemid;
                IndexTuple  targetkey;
                Buffer      lbuf;
                BlockNumber leftsib;

                itemid = PageGetItemId(page, P_HIKEY);
                targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

                leftsib = opaque->btpo_prev;

                /*
                 * To avoid deadlocks, we'd better drop the leaf page lock
                 * before going further.
                 */
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);

                /*
                 * Fetch the left sibling, to check that it's not marked with
                 * INCOMPLETE_SPLIT flag.  That would mean that the page
                 * to-be-deleted doesn't have a downlink, and the page
                 * deletion algorithm isn't prepared to handle that.
                 */
                if (leftsib != P_NONE)
                {
                    BTPageOpaque lopaque;
                    Page        lpage;

                    lbuf = _bt_getbuf(rel, leftsib, BT_READ);
                    lpage = BufferGetPage(lbuf);
                    lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);

                    /*
                     * If the left sibling is split again by another backend,
                     * after we released the lock, we know that the first
                     * split must have finished, because we don't allow an
                     * incompletely-split page to be split again.  So we don't
                     * need to walk right here.
                     */
                    if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
                        P_INCOMPLETE_SPLIT(lopaque))
                    {
                        ReleaseBuffer(buf);
                        _bt_relbuf(rel, lbuf);
                        return ndeleted;
                    }
                    _bt_relbuf(rel, lbuf);
                }

                /* we need an insertion scan key for the search, so build one */
                itup_key = _bt_mkscankey(rel, targetkey);
                /* find the leftmost leaf page with matching pivot/high key */
                itup_key->pivotsearch = true;
                stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
                /* don't need a lock or second pin on the page */
                _bt_relbuf(rel, lbuf);

                /*
                 * Re-lock the leaf page, and start over, to re-check that the
                 * page can still be deleted.
                 */
                LockBuffer(buf, BT_WRITE);
                continue;
            }

            if (!_bt_mark_page_halfdead(rel, buf, stack))
            {
                _bt_relbuf(rel, buf);
                return ndeleted;
            }
        }

        /*
         * Then unlink it from its siblings.  Each call to
         * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
         * making it shallower.  Iterate until the leaf page is gone.
         */
        rightsib_empty = false;
        while (P_ISHALFDEAD(opaque))
        {
            /* will check for interrupts, once lock is released */
            if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
            {
                /* _bt_unlink_halfdead_page already released buffer */
                return ndeleted;
            }
            ndeleted++;
        }

        rightsib = opaque->btpo_next;

        _bt_relbuf(rel, buf);

        /*
         * Check here, as calling loops will have locks held, preventing
         * interrupts from being processed.
         */
        CHECK_FOR_INTERRUPTS();

        /*
         * The page has now been deleted. If its right sibling is completely
         * empty, it's possible that the reason we haven't deleted it earlier
         * is that it was the rightmost child of the parent. Now that we
         * removed the downlink for this page, the right sibling might now be
         * the only child of the parent, and could be removed. It would be
         * picked up by the next vacuum anyway, but might as well try to
         * remove it now, so loop back to process the right sibling.
         *
         * Note: This relies on the assumption that _bt_getstackbuf() will be
         * able to reuse our original descent stack with a different child
         * block (provided that the child block is to the right of the
         * original leaf page reached by _bt_search()).  It will even update
         * the descent stack each time we loop around, avoiding repeated work.
         */
        if (!rightsib_empty)
            break;

        buf = _bt_getbuf(rel, rightsib, BT_WRITE);
    }

    return ndeleted;
}
1715 * First stage of page deletion. Remove the downlink to the top of the
1716 * branch being deleted, and mark the leaf page as half-dead.
1719 _bt_mark_page_halfdead(Relation rel
, Buffer leafbuf
, BTStack stack
)
1721 BlockNumber leafblkno
;
1722 BlockNumber leafrightsib
;
1724 BlockNumber rightsib
;
1727 BTPageOpaque opaque
;
1729 OffsetNumber topoff
;
1730 OffsetNumber nextoffset
;
1732 IndexTupleData trunctuple
;
1734 page
= BufferGetPage(leafbuf
);
1735 opaque
= (BTPageOpaque
) PageGetSpecialPointer(page
);
1737 Assert(!P_RIGHTMOST(opaque
) && !P_ISROOT(opaque
) && !P_ISDELETED(opaque
) &&
1738 !P_ISHALFDEAD(opaque
) && P_ISLEAF(opaque
) &&
1739 P_FIRSTDATAKEY(opaque
) > PageGetMaxOffsetNumber(page
));
1742 * Save info about the leaf page.
1744 leafblkno
= BufferGetBlockNumber(leafbuf
);
1745 leafrightsib
= opaque
->btpo_next
;
1748 * Before attempting to lock the parent page, check that the right sibling
1749 * is not in half-dead state. A half-dead right sibling would have no
1750 * downlink in the parent, which would be highly confusing later when we
1751 * delete the downlink that follows the current page's downlink. (I
1752 * believe the deletion would work correctly, but it would fail the
1753 * cross-check we make that the following downlink points to the right
1754 * sibling of the delete page.)
1756 if (_bt_is_page_halfdead(rel
, leafrightsib
))
1758 elog(DEBUG1
, "could not delete page %u because its right sibling %u is half-dead",
1759 leafblkno
, leafrightsib
);
1764 * We cannot delete a page that is the rightmost child of its immediate
1765 * parent, unless it is the only child --- in which case the parent has to
1766 * be deleted too, and the same condition applies recursively to it. We
1767 * have to check this condition all the way up before trying to delete,
1768 * and lock the final parent of the to-be-deleted subtree.
1770 * However, we won't need to repeat the above _bt_is_page_halfdead() check
1771 * for parent/ancestor pages because of the rightmost restriction. The
1772 * leaf check will apply to a right "cousin" leaf page rather than a
1773 * simple right sibling leaf page in cases where we actually go on to
1774 * perform internal page deletion. The right cousin leaf page is
1775 * representative of the left edge of the subtree to the right of the
1776 * to-be-deleted subtree as a whole. (Besides, internal pages are never
1777 * marked half-dead, so it isn't even possible to directly assess if an
1778 * internal page is part of some other to-be-deleted subtree.)
1780 rightsib
= leafrightsib
;
1782 if (!_bt_lock_branch_parent(rel
, leafblkno
, stack
,
1783 &topparent
, &topoff
, &target
, &rightsib
))
1787 * Check that the parent-page index items we're about to delete/overwrite
1788 * contain what we expect. This can fail if the index has become corrupt
1789 * for some reason. We want to throw any error before entering the
1790 * critical section --- otherwise it'd be a PANIC.
1792 * The test on the target item is just an Assert because
1793 * _bt_lock_branch_parent should have guaranteed it has the expected
1794 * contents. The test on the next-child downlink is known to sometimes
1795 * fail in the field, though.
1797 page
= BufferGetPage(topparent
);
1798 opaque
= (BTPageOpaque
) PageGetSpecialPointer(page
);
1800 #ifdef USE_ASSERT_CHECKING
1801 itemid
= PageGetItemId(page
, topoff
);
1802 itup
= (IndexTuple
) PageGetItem(page
, itemid
);
1803 Assert(BTreeTupleGetDownLink(itup
) == target
);
1806 nextoffset
= OffsetNumberNext(topoff
);
1807 itemid
= PageGetItemId(page
, nextoffset
);
1808 itup
= (IndexTuple
) PageGetItem(page
, itemid
);
1809 if (BTreeTupleGetDownLink(itup
) != rightsib
)
1811 (errcode(ERRCODE_INDEX_CORRUPTED
),
1812 errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
1813 rightsib
, target
, BTreeTupleGetDownLink(itup
),
1814 BufferGetBlockNumber(topparent
), RelationGetRelationName(rel
))));
1817 * Any insert which would have gone on the leaf block will now go to its
1820 PredicateLockPageCombine(rel
, leafblkno
, leafrightsib
);
1822 /* No ereport(ERROR) until changes are logged */
1823 START_CRIT_SECTION();
1826 * Update parent. The normal case is a tad tricky because we want to
1827 * delete the target's downlink and the *following* key. Easiest way is
1828 * to copy the right sibling's downlink over the target downlink, and then
1829 * delete the following item.
1831 page
= BufferGetPage(topparent
);
1832 opaque
= (BTPageOpaque
) PageGetSpecialPointer(page
);
1834 itemid
= PageGetItemId(page
, topoff
);
1835 itup
= (IndexTuple
) PageGetItem(page
, itemid
);
1836 BTreeTupleSetDownLink(itup
, rightsib
);
1838 nextoffset
= OffsetNumberNext(topoff
);
1839 PageIndexTupleDelete(page
, nextoffset
);
	/*
	 * Mark the leaf page as half-dead, and stamp it with a pointer to the
	 * highest internal page in the branch we're deleting.  We use the tid of
	 * the high key to store it.
	 */
	page = BufferGetPage(leafbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags |= BTP_HALF_DEAD;

	Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
	MemSet(&trunctuple, 0, sizeof(IndexTupleData));
	trunctuple.t_info = sizeof(IndexTupleData);
	if (target != leafblkno)
		BTreeTupleSetTopParent(&trunctuple, target);
	else
		BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);

	if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
								 IndexTupleSize(&trunctuple)))
		elog(ERROR, "could not overwrite high key in half-dead page");
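	/*
	 * The leaf's high key is now a minimal "dummy" tuple: a bare header with
	 * no key data, whose t_tid (set by BTreeTupleSetTopParent above) records
	 * the topmost internal page of the to-be-deleted branch, or
	 * InvalidBlockNumber when the branch consists of the leaf alone.
	 */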
	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(topparent);
	MarkBufferDirty(leafbuf);
	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_mark_page_halfdead xlrec;
		XLogRecPtr	recptr;

		xlrec.poffset = topoff;
		xlrec.leafblk = leafblkno;
		if (target != leafblkno)
			xlrec.topparent = target;
		else
			xlrec.topparent = InvalidBlockNumber;

		XLogBeginInsert();
		XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
		XLogRegisterBuffer(1, topparent, REGBUF_STANDARD);
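		/*
		 * Registering leafbuf with REGBUF_WILL_INIT means replay rebuilds the
		 * half-dead leaf page from the record itself rather than restoring a
		 * full-page image; topparent is registered as a standard-layout page,
		 * so any full-page image taken of it can omit the free-space hole.
		 */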
		page = BufferGetPage(leafbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		xlrec.leftblk = opaque->btpo_prev;
		xlrec.rightblk = opaque->btpo_next;

		XLogRegisterData((char *) &xlrec, SizeOfBtreeMarkPageHalfDead);

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);

		page = BufferGetPage(topparent);
		PageSetLSN(page, recptr);
		page = BufferGetPage(leafbuf);
		PageSetLSN(page, recptr);
	}
	END_CRIT_SECTION();

	_bt_relbuf(rel, topparent);

	return true;
}
/*
 * Unlink a page in a branch of half-dead pages from its siblings.
 *
 * If the leaf page still has a downlink pointing to it, unlinks the highest
 * parent in the to-be-deleted branch instead of the leaf page.  To get rid
 * of the whole branch, including the leaf page itself, iterate until the
 * leaf page is deleted.
 *
 * Returns 'false' if the page could not be unlinked (shouldn't happen).
 * If the (current) right sibling of the page is empty, *rightsib_empty is
 * set to true.
 *
 * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
 * On success exit, we'll be holding pin and write lock.  On failure exit,
 * we'll release both pin and lock before returning (we define it that way
 * to avoid having to reacquire a lock we already released).
 */
static bool
_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
{
	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
	BlockNumber leafleftsib;
	BlockNumber leafrightsib;
	BlockNumber target;
	BlockNumber leftsib;
	BlockNumber rightsib;
	Buffer		lbuf = InvalidBuffer;
	Buffer		buf;
	Buffer		rbuf;
	Buffer		metabuf = InvalidBuffer;
	Page		metapg = NULL;
	BTMetaPageData *metad = NULL;
	ItemId		itemid;
	Page		page;
	BTPageOpaque opaque;
	bool		rightsib_is_rightmost;
	int			targetlevel;
	IndexTuple	leafhikey;
	BlockNumber nextchild;
	page = BufferGetPage(leafbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));

	/*
	 * Remember some information about the leaf page.
	 */
	itemid = PageGetItemId(page, P_HIKEY);
	leafhikey = (IndexTuple) PageGetItem(page, itemid);
	leafleftsib = opaque->btpo_prev;
	leafrightsib = opaque->btpo_next;

	LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
	/*
	 * Check here, as calling loops will have locks held, preventing
	 * interrupts from being processed.
	 */
	CHECK_FOR_INTERRUPTS();
	/*
	 * If the leaf page still has a parent pointing to it (or a chain of
	 * parents), we don't unlink the leaf page yet, but the topmost remaining
	 * parent in the branch.  Set 'target' and 'buf' to reference the page
	 * actually being unlinked.
	 */
	target = BTreeTupleGetTopParent(leafhikey);

	if (target != InvalidBlockNumber)
	{
		Assert(target != leafblkno);

		/* fetch the block number of the topmost parent's left sibling */
		buf = _bt_getbuf(rel, target, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		leftsib = opaque->btpo_prev;
		targetlevel = opaque->btpo.level;

		/*
		 * To avoid deadlocks, we'd better drop the target page lock before
		 * going further.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	else
	{
		target = leafblkno;

		buf = leafbuf;
		leftsib = leafleftsib;
		targetlevel = 0;
	}
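	/*
	 * At this point 'target' and 'buf' refer to the page actually being
	 * unlinked, and 'leftsib' to its left sibling as of the read above; the
	 * left link is re-verified once write locks are taken below.
	 */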
	/*
	 * We have to lock the pages we need to modify in the standard order:
	 * moving right, then up.  Else we will deadlock against other writers.
	 *
	 * So, first lock the leaf page, if it's not the target.  Then find and
	 * write-lock the current left sibling of the target page.  The sibling
	 * that was current a moment ago could have split, so we may have to move
	 * right.  This search could fail if either the sibling or the target page
	 * was deleted by someone else meanwhile; if so, give up.  (Right now,
	 * that should never happen, since page deletion is only done in VACUUM
	 * and there shouldn't be multiple VACUUMs concurrently on the same
	 * table.)
	 */
	if (target != leafblkno)
		LockBuffer(leafbuf, BT_WRITE);
	if (leftsib != P_NONE)
	{
		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		while (P_ISDELETED(opaque) || opaque->btpo_next != target)
		{
			/* step right one page */
			leftsib = opaque->btpo_next;
			_bt_relbuf(rel, lbuf);

			/*
			 * It'd be good to check for interrupts here, but it's not easy to
			 * do so because a lock is always held.  This block isn't
			 * frequently reached, so hopefully the consequences of not
			 * checking interrupts aren't too bad.
			 */

			if (leftsib == P_NONE)
			{
				elog(LOG, "no left sibling (concurrent deletion?) of block %u in \"%s\"",
					 target,
					 RelationGetRelationName(rel));
				if (target != leafblkno)
				{
					/* we have only a pin on target, but pin+lock on leafbuf */
					ReleaseBuffer(buf);
					_bt_relbuf(rel, leafbuf);
				}
				else
				{
					/* we have only a pin on leafbuf */
					ReleaseBuffer(leafbuf);
				}
				return false;
			}
			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
			page = BufferGetPage(lbuf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}
	}
	else
		lbuf = InvalidBuffer;
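	/*
	 * lbuf now holds a write lock on the target's current left sibling, or is
	 * InvalidBuffer if the target is leftmost on its level.
	 */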
	/*
	 * Next write-lock the target page itself.  It should be okay to take just
	 * a write lock not a superexclusive lock, since no scans would stop on an
	 * empty page.
	 */
	LockBuffer(buf, BT_WRITE);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	/*
	 * Check page is still empty etc, else abandon deletion.  This is just for
	 * paranoia's sake; a half-dead page cannot resurrect because there can be
	 * only one vacuum process running at a time.
	 */
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
		elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
			 target, RelationGetRelationName(rel));

	if (opaque->btpo_prev != leftsib)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("left link changed unexpectedly in block %u of index \"%s\"",
								 target, RelationGetRelationName(rel))));

	if (target == leafblkno)
	{
		if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
			!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
				 target, RelationGetRelationName(rel));
		nextchild = InvalidBlockNumber;
	}
	else
	{
		if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
			P_ISLEAF(opaque))
			elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
				 target, RelationGetRelationName(rel));

		/* remember the next non-leaf child down in the branch. */
		itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
		nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
		if (nextchild == leafblkno)
			nextchild = InvalidBlockNumber;
	}
	/*
	 * And next write-lock the (current) right sibling.
	 */
	rightsib = opaque->btpo_next;
	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (opaque->btpo_prev != target)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg_internal("right sibling's left-link doesn't match: "
								 "block %u links to %u instead of expected %u in index \"%s\"",
								 rightsib, opaque->btpo_prev, target,
								 RelationGetRelationName(rel))));
	rightsib_is_rightmost = P_RIGHTMOST(opaque);
	*rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
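	/*
	 * Recording *rightsib_empty here lets the caller decide, after this page
	 * has been unlinked, whether the right sibling is worth visiting as the
	 * next deletion candidate.
	 */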
	/*
	 * If we are deleting the next-to-last page on the target's level, then
	 * the rightsib is a candidate to become the new fast root.  (In theory, it
	 * might be possible to push the fast root even further down, but the odds
	 * of doing so are slim, and the locking considerations daunting.)
	 *
	 * We can safely acquire a lock on the metapage here --- see comments for
	 * _bt_newroot().
	 */
	if (leftsib == P_NONE && rightsib_is_rightmost)
	{
		page = BufferGetPage(rbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_RIGHTMOST(opaque))
		{
			/* rightsib will be the only one left on the level */
			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			/*
			 * The expected case here is btm_fastlevel == targetlevel+1; if
			 * the fastlevel is <= targetlevel, something is wrong, and we
			 * choose to overwrite it to fix it.
			 */
			if (metad->btm_fastlevel > targetlevel + 1)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}
	}
	/*
	 * Here we begin doing the deletion.
	 */

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();
	/*
	 * Update siblings' side-links.  Note the target page's side-links will
	 * continue to point to the siblings.  Asserts here are just rechecking
	 * things we already verified above.
	 */
	if (BufferIsValid(lbuf))
	{
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo_next == target);
		opaque->btpo_next = rightsib;
	}
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->btpo_prev == target);
	opaque->btpo_prev = leftsib;
	/*
	 * If we deleted a parent of the targeted leaf page, instead of the leaf
	 * itself, update the leaf to point to the next remaining child in the
	 * branch.
	 */
	if (target != leafblkno)
		BTreeTupleSetTopParent(leafhikey, nextchild);
	/*
	 * Mark the page itself deleted.  It can be recycled when all current
	 * transactions are gone.  Storing GetTopTransactionId() would work, but
	 * we're in VACUUM and would not otherwise have an XID.  Having already
	 * updated links to the target, ReadNewTransactionId() suffices as an
	 * upper bound.  Any scan having retained a now-stale link is advertising
	 * in its PGXACT an xmin less than or equal to the value we read here.  It
	 * will continue to do so, holding back RecentGlobalXmin, for the duration
	 * of that scan.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags &= ~BTP_HALF_DEAD;
	opaque->btpo_flags |= BTP_DELETED;
	opaque->btpo.xact = ReadNewTransactionId();
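	/*
	 * The XID stored in btpo.xact is what later recycling checks compare
	 * against RecentGlobalXmin: only once every snapshot that could hold a
	 * stale link to this page has gone away can the page be returned to the
	 * free space map.
	 */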
	/* And update the metapage, if needed */
	if (BufferIsValid(metabuf))
	{
		/* upgrade metapage if needed */
		if (metad->btm_version < BTREE_NOVAC_VERSION)
			_bt_upgrademetapage(metapg);
		metad->btm_fastroot = rightsib;
		metad->btm_fastlevel = targetlevel;
		MarkBufferDirty(metabuf);
	}
	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(rbuf);
	MarkBufferDirty(buf);
	if (BufferIsValid(lbuf))
		MarkBufferDirty(lbuf);
	if (target != leafblkno)
		MarkBufferDirty(leafbuf);
	/* XLOG stuff */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_unlink_page xlrec;
		xl_btree_metadata xlmeta;
		uint8		xlinfo;
		XLogRecPtr	recptr;

		XLogBeginInsert();

		XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
		if (BufferIsValid(lbuf))
			XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
		XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
		if (target != leafblkno)
			XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
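		/*
		 * Block references 0-3 are the page being unlinked, its left and
		 * right siblings, and (when a parent page is the target) the
		 * half-dead leaf.  The WILL_INIT pages are reconstructed from the
		 * record during replay instead of being restored from full-page
		 * images.
		 */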
		/* information on the unlinked block */
		xlrec.leftsib = leftsib;
		xlrec.rightsib = rightsib;
		xlrec.btpo_xact = opaque->btpo.xact;

		/* information needed to recreate the leaf block (if not the target) */
		xlrec.leafleftsib = leafleftsib;
		xlrec.leafrightsib = leafrightsib;
		xlrec.topparent = nextchild;

		XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
		if (BufferIsValid(metabuf))
		{
			XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

			Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
			xlmeta.version = metad->btm_version;
			xlmeta.root = metad->btm_root;
			xlmeta.level = metad->btm_level;
			xlmeta.fastroot = metad->btm_fastroot;
			xlmeta.fastlevel = metad->btm_fastlevel;
			xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
			xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
			xlmeta.allequalimage = metad->btm_allequalimage;

			XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
			xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
		}
		else
			xlinfo = XLOG_BTREE_UNLINK_PAGE;
		recptr = XLogInsert(RM_BTREE_ID, xlinfo);

		if (BufferIsValid(metabuf))
		{
			PageSetLSN(metapg, recptr);
		}
		page = BufferGetPage(rbuf);
		PageSetLSN(page, recptr);
		page = BufferGetPage(buf);
		PageSetLSN(page, recptr);
		if (BufferIsValid(lbuf))
		{
			page = BufferGetPage(lbuf);
			PageSetLSN(page, recptr);
		}
		if (target != leafblkno)
		{
			page = BufferGetPage(leafbuf);
			PageSetLSN(page, recptr);
		}
	}
	END_CRIT_SECTION();

	/* release metapage */
	if (BufferIsValid(metabuf))
		_bt_relbuf(rel, metabuf);

	/* release siblings */
	if (BufferIsValid(lbuf))
		_bt_relbuf(rel, lbuf);
	_bt_relbuf(rel, rbuf);

	/*
	 * Release the target, if it was not the leaf block.  The leaf is always
	 * kept locked.
	 */
	if (target != leafblkno)
		_bt_relbuf(rel, buf);

	return true;
}