/*-------------------------------------------------------------------------
 *
 * spgvacuum.c
 *      vacuum for SP-GiST
 *
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *          src/backend/access/spgist/spgvacuum.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/genam.h"
#include "access/spgist_private.h"
#include "access/spgxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "catalog/storage_xlog.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "utils/snapmgr.h"

/* Entry in pending-list of TIDs we need to revisit */
typedef struct spgVacPendingItem
{
    ItemPointerData tid;        /* redirection target to visit */
    bool        done;           /* have we dealt with this? */
    struct spgVacPendingItem *next; /* list link */
} spgVacPendingItem;

/* Local state for vacuum operations */
typedef struct spgBulkDeleteState
{
    /* Parameters passed in to spgvacuumscan */
    IndexVacuumInfo *info;
    IndexBulkDeleteResult *stats;
    IndexBulkDeleteCallback callback;
    void       *callback_state;

    /* Additional working state */
    SpGistState spgstate;       /* for SPGiST operations that need one */
    spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */
    TransactionId myXmin;       /* for detecting newly-added redirects */
    BlockNumber lastFilledBlock;    /* last non-deletable block */
} spgBulkDeleteState;

/*
 * Add TID to pendingList, but only if not already present.
 *
 * Note that new items are always appended at the end of the list; this
 * ensures that scans of the list don't miss items added during the scan.
 */
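/*
 * (For instance, spgprocesspending() walks pendingList while calls made
 * during that walk append new entries behind the traversal point, so the
 * scan picks them up rather than skipping them.)
 */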
static void
spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid)
{
    spgVacPendingItem *pitem;
    spgVacPendingItem **listLink;

    /* search the list for pre-existing entry */
    listLink = &bds->pendingList;
    while (*listLink != NULL)
    {
        pitem = *listLink;
        if (ItemPointerEquals(tid, &pitem->tid))
            return;             /* already in list, do nothing */
        listLink = &pitem->next;
    }
    /* not there, so append new entry */
    pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem));
    pitem->tid = *tid;
    pitem->done = false;
    pitem->next = NULL;
    *listLink = pitem;
}

/*
 * Clear pendingList
 */
static void
spgClearPendingList(spgBulkDeleteState *bds)
{
    spgVacPendingItem *pitem;
    spgVacPendingItem *nitem;

    for (pitem = bds->pendingList; pitem != NULL; pitem = nitem)
    {
        nitem = pitem->next;
        /* All items in list should have been dealt with */
        Assert(pitem->done);
        pfree(pitem);
    }
    bds->pendingList = NULL;
}

/*
 * Vacuum a regular (non-root) leaf page
 *
 * We must delete tuples that are targeted for deletion by the VACUUM,
 * but not move any tuples that are referenced by outside links; we assume
 * those are the ones that are heads of chains.
 *
 * If we find a REDIRECT that was made by a concurrently-running transaction,
 * we must add its target TID to pendingList.  (We don't try to visit the
 * target immediately, first because we don't want VACUUM locking more than
 * one buffer at a time, and second because the duplicate-filtering logic
 * in spgAddPendingTID is useful to ensure we can't get caught in an infinite
 * loop in the face of continuous concurrent insertions.)
 *
 * If forPending is true, we are examining the page as a consequence of
 * chasing a redirect link, not as part of the normal sequential scan.
 * We still vacuum the page normally, but we don't increment the stats
 * about live tuples; else we'd double-count those tuples, since the page
 * has been or will be visited in the sequential scan as well.
 */
static void
vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
               bool forPending)
{
    Page        page = BufferGetPage(buffer);
    spgxlogVacuumLeaf xlrec;
    OffsetNumber toDead[MaxIndexTuplesPerPage];
    OffsetNumber toPlaceholder[MaxIndexTuplesPerPage];
    OffsetNumber moveSrc[MaxIndexTuplesPerPage];
    OffsetNumber moveDest[MaxIndexTuplesPerPage];
    OffsetNumber chainSrc[MaxIndexTuplesPerPage];
    OffsetNumber chainDest[MaxIndexTuplesPerPage];
    OffsetNumber predecessor[MaxIndexTuplesPerPage + 1];
    bool        deletable[MaxIndexTuplesPerPage + 1];
    int         nDeletable;
    OffsetNumber i,
                max = PageGetMaxOffsetNumber(page);

    memset(predecessor, 0, sizeof(predecessor));
    memset(deletable, 0, sizeof(deletable));
    nDeletable = 0;

    /* Scan page, identify tuples to delete, accumulate stats */
    for (i = FirstOffsetNumber; i <= max; i++)
    {
        SpGistLeafTuple lt;

        lt = (SpGistLeafTuple) PageGetItem(page,
                                           PageGetItemId(page, i));
        if (lt->tupstate == SPGIST_LIVE)
        {
            Assert(ItemPointerIsValid(&lt->heapPtr));

            if (bds->callback(&lt->heapPtr, bds->callback_state))
            {
                bds->stats->tuples_removed += 1;
                deletable[i] = true;
                nDeletable++;
            }
            else
            {
                if (!forPending)
                    bds->stats->num_index_tuples += 1;
            }

            /* Form predecessor map, too */
            if (SGLT_GET_NEXTOFFSET(lt) != InvalidOffsetNumber)
            {
                /* paranoia about corrupted chain links */
                if (SGLT_GET_NEXTOFFSET(lt) < FirstOffsetNumber ||
                    SGLT_GET_NEXTOFFSET(lt) > max ||
                    predecessor[SGLT_GET_NEXTOFFSET(lt)] != InvalidOffsetNumber)
                    elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"",
                         BufferGetBlockNumber(buffer),
                         RelationGetRelationName(index));
                predecessor[SGLT_GET_NEXTOFFSET(lt)] = i;
            }
        }
        else if (lt->tupstate == SPGIST_REDIRECT)
        {
            SpGistDeadTuple dt = (SpGistDeadTuple) lt;

            Assert(SGLT_GET_NEXTOFFSET(dt) == InvalidOffsetNumber);
            Assert(ItemPointerIsValid(&dt->pointer));

            /*
             * Add target TID to pending list if the redirection could have
             * happened since VACUUM started.
             *
             * Note: we could make a tighter test by seeing if the xid is
             * "running" according to the active snapshot; but snapmgr.c
             * doesn't currently export a suitable API, and it's not entirely
             * clear that a tighter test is worth the cycles anyway.
             */
            if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
                spgAddPendingTID(bds, &dt->pointer);
        }
        else
        {
            Assert(SGLT_GET_NEXTOFFSET(lt) == InvalidOffsetNumber);
        }
    }

    if (nDeletable == 0)
        return;                 /* nothing more to do */

    /*----------
     * Figure out exactly what we have to do.  We do this separately from
     * actually modifying the page, mainly so that we have a representation
     * that can be dumped into WAL and then the replay code can do exactly
     * the same thing.  The output of this step consists of six arrays
     * describing four kinds of operations, to be performed in this order:
     *
     * toDead[]: tuple numbers to be replaced with DEAD tuples
     * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples
     * moveSrc[]: tuple numbers that need to be relocated to another offset
     * (replacing the tuple there) and then replaced with PLACEHOLDER tuples
     * moveDest[]: new locations for moveSrc tuples
     * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates
     * chainDest[]: new values of nextOffset for chainSrc members
     *
     * It's easiest to figure out what we have to do by processing tuple
     * chains, so we iterate over all the tuples (not just the deletable
     * ones!) to identify chain heads, then chase down each chain and make
     * work item entries for deletable tuples within the chain.
     *----------
     */
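    /*
     * Worked example (hypothetical offsets): for a chain 2 -> 5 -> 7 -> 9
     * where tuples 2 and 7 are deletable, we emit moveSrc = {5},
     * moveDest = {2} (the first surviving tuple takes over the head slot, so
     * outside links to offset 2 stay valid), toPlaceholder = {7}, and
     * chainSrc = {2}, chainDest = {9} to splice out the gap left by 7.
     * Offsets 5 and 7 end up as placeholders.
     */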
    xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0;

    for (i = FirstOffsetNumber; i <= max; i++)
    {
        SpGistLeafTuple head;
        bool        interveningDeletable;
        OffsetNumber prevLive;
        OffsetNumber j;

        head = (SpGistLeafTuple) PageGetItem(page,
                                             PageGetItemId(page, i));
        if (head->tupstate != SPGIST_LIVE)
            continue;           /* can't be a chain member */
        if (predecessor[i] != 0)
            continue;           /* not a chain head */

        /* initialize ... */
        interveningDeletable = false;
        prevLive = deletable[i] ? InvalidOffsetNumber : i;

        /* scan down the chain ... */
        j = SGLT_GET_NEXTOFFSET(head);
        while (j != InvalidOffsetNumber)
        {
            SpGistLeafTuple lt;

            lt = (SpGistLeafTuple) PageGetItem(page,
                                               PageGetItemId(page, j));
            if (lt->tupstate != SPGIST_LIVE)
            {
                /* all tuples in chain should be live */
                elog(ERROR, "unexpected SPGiST tuple state: %d",
                     lt->tupstate);
            }

            if (deletable[j])
            {
                /* This tuple should be replaced by a placeholder */
                toPlaceholder[xlrec.nPlaceholder] = j;
                xlrec.nPlaceholder++;
                /* previous live tuple's chain link will need an update */
                interveningDeletable = true;
            }
            else if (prevLive == InvalidOffsetNumber)
            {
                /*
                 * This is the first live tuple in the chain.  It has to move
                 * to the head position.
                 */
                moveSrc[xlrec.nMove] = j;
                moveDest[xlrec.nMove] = i;
                xlrec.nMove++;
                /* Chain updates will be applied after the move */
                prevLive = i;
                interveningDeletable = false;
            }
            else
            {
                /*
                 * Second or later live tuple.  Arrange to re-chain it to the
                 * previous live one, if there was a gap.
                 */
                if (interveningDeletable)
                {
                    chainSrc[xlrec.nChain] = prevLive;
                    chainDest[xlrec.nChain] = j;
                    xlrec.nChain++;
                }
                prevLive = j;
                interveningDeletable = false;
            }

            j = SGLT_GET_NEXTOFFSET(lt);
        }

        if (prevLive == InvalidOffsetNumber)
        {
            /* The chain is entirely removable, so we need a DEAD tuple */
            toDead[xlrec.nDead] = i;
            xlrec.nDead++;
        }
        else if (interveningDeletable)
        {
            /* One or more deletions at end of chain, so close it off */
            chainSrc[xlrec.nChain] = prevLive;
            chainDest[xlrec.nChain] = InvalidOffsetNumber;
            xlrec.nChain++;
        }
    }

    /* sanity check ... */
    if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove)
        elog(ERROR, "inconsistent counts of deletable tuples");

    /* Do the updates */
    START_CRIT_SECTION();

    spgPageIndexMultiDelete(&bds->spgstate, page,
                            toDead, xlrec.nDead,
                            SPGIST_DEAD, SPGIST_DEAD,
                            InvalidBlockNumber, InvalidOffsetNumber);

    spgPageIndexMultiDelete(&bds->spgstate, page,
                            toPlaceholder, xlrec.nPlaceholder,
                            SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
                            InvalidBlockNumber, InvalidOffsetNumber);

    /*
     * We implement the move step by swapping the line pointers of the source
     * and target tuples, then replacing the newly-source tuples with
     * placeholders.  This is perhaps unduly friendly with the page data
     * representation, but it's fast and doesn't risk page overflow when a
     * tuple to be relocated is large.
     */
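    /*
     * (E.g., swapping the line pointers of offsets 5 and 2 makes the tuple
     * data formerly addressed by offset 5 addressable at offset 2, and vice
     * versa, without copying any tuple bytes; offset 5 is then overwritten
     * with a placeholder just below.)
     */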
    for (i = 0; i < xlrec.nMove; i++)
    {
        ItemId      idSrc = PageGetItemId(page, moveSrc[i]);
        ItemId      idDest = PageGetItemId(page, moveDest[i]);
        ItemIdData  tmp;

        tmp = *idSrc;
        *idSrc = *idDest;
        *idDest = tmp;
    }

    spgPageIndexMultiDelete(&bds->spgstate, page,
                            moveSrc, xlrec.nMove,
                            SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
                            InvalidBlockNumber, InvalidOffsetNumber);

    for (i = 0; i < xlrec.nChain; i++)
    {
        SpGistLeafTuple lt;

        lt = (SpGistLeafTuple) PageGetItem(page,
                                           PageGetItemId(page, chainSrc[i]));
        Assert(lt->tupstate == SPGIST_LIVE);
        SGLT_SET_NEXTOFFSET(lt, chainDest[i]);
    }

    MarkBufferDirty(buffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();

        STORE_STATE(&bds->spgstate, xlrec.stateSrc);

        XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumLeaf);
        /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
        XLogRegisterData((char *) toDead, sizeof(OffsetNumber) * xlrec.nDead);
        XLogRegisterData((char *) toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder);
        XLogRegisterData((char *) moveSrc, sizeof(OffsetNumber) * xlrec.nMove);
        XLogRegisterData((char *) moveDest, sizeof(OffsetNumber) * xlrec.nMove);
        XLogRegisterData((char *) chainSrc, sizeof(OffsetNumber) * xlrec.nChain);
        XLogRegisterData((char *) chainDest, sizeof(OffsetNumber) * xlrec.nChain);

        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

        recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Vacuum a root page when it is also a leaf
 *
 * On the root, we just delete any dead leaf tuples; no fancy business
 */
static void
vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer)
{
    Page        page = BufferGetPage(buffer);
    spgxlogVacuumRoot xlrec;
    OffsetNumber toDelete[MaxIndexTuplesPerPage];
    OffsetNumber i,
                max = PageGetMaxOffsetNumber(page);

    xlrec.nDelete = 0;

    /* Scan page, identify tuples to delete, accumulate stats */
    for (i = FirstOffsetNumber; i <= max; i++)
    {
        SpGistLeafTuple lt;

        lt = (SpGistLeafTuple) PageGetItem(page,
                                           PageGetItemId(page, i));
        if (lt->tupstate == SPGIST_LIVE)
        {
            Assert(ItemPointerIsValid(&lt->heapPtr));

            if (bds->callback(&lt->heapPtr, bds->callback_state))
            {
                bds->stats->tuples_removed += 1;
                toDelete[xlrec.nDelete] = i;
                xlrec.nDelete++;
            }
            else
            {
                bds->stats->num_index_tuples += 1;
            }
        }
        else
        {
            /* all tuples on root should be live */
            elog(ERROR, "unexpected SPGiST tuple state: %d",
                 lt->tupstate);
        }
    }

    if (xlrec.nDelete == 0)
        return;                 /* nothing more to do */

    /* Do the update */
    START_CRIT_SECTION();

    /* The tuple numbers are in order, so we can use PageIndexMultiDelete */
    PageIndexMultiDelete(page, toDelete, xlrec.nDelete);

    MarkBufferDirty(buffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();

        /* Prepare WAL record */
        STORE_STATE(&bds->spgstate, xlrec.stateSrc);

        XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRoot);
        /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
        XLogRegisterData((char *) toDelete,
                         sizeof(OffsetNumber) * xlrec.nDelete);

        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

        recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Clean up redirect and placeholder tuples on the given page
 *
 * Redirect tuples can be marked placeholder once they're old enough.
 * Placeholder tuples can be removed if it won't change the offsets of
 * non-placeholder ones.
 *
 * Unlike the routines above, this works on both leaf and inner pages.
 */
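/*
 * (Lifecycle recap: a REDIRECT tuple carries the XID of the transaction that
 * created it; once that XID is old enough that no snapshot could still need
 * to follow the link, the tuple is downgraded to a PLACEHOLDER here, and
 * placeholders at the end of the page are physically deleted so their
 * offsets can be reused.)
 */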
static void
vacuumRedirectAndPlaceholder(Relation index, Buffer buffer)
{
    Page        page = BufferGetPage(buffer);
    SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
    OffsetNumber i,
                max = PageGetMaxOffsetNumber(page),
                firstPlaceholder = InvalidOffsetNumber;
    bool        hasNonPlaceholder = false;
    bool        hasUpdate = false;
    OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage];
    OffsetNumber itemnos[MaxIndexTuplesPerPage];
    spgxlogVacuumRedirect xlrec;
    GlobalVisState *vistest;

    xlrec.nToPlaceholder = 0;
    xlrec.snapshotConflictHorizon = InvalidTransactionId;

    /* XXX: providing heap relation would allow more pruning */
    vistest = GlobalVisTestFor(NULL);

    START_CRIT_SECTION();

    /*
     * Scan backwards to convert old redirection tuples to placeholder tuples,
     * and identify location of last non-placeholder tuple while at it.
     */
    for (i = max;
         i >= FirstOffsetNumber &&
         (opaque->nRedirection > 0 || !hasNonPlaceholder);
         i--)
    {
        SpGistDeadTuple dt;

        dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i));

        if (dt->tupstate == SPGIST_REDIRECT &&
            GlobalVisTestIsRemovableXid(vistest, dt->xid))
        {
            dt->tupstate = SPGIST_PLACEHOLDER;
            Assert(opaque->nRedirection > 0);
            opaque->nRedirection--;
            opaque->nPlaceholder++;

            /* remember newest XID among the removed redirects */
            if (!TransactionIdIsValid(xlrec.snapshotConflictHorizon) ||
                TransactionIdPrecedes(xlrec.snapshotConflictHorizon, dt->xid))
                xlrec.snapshotConflictHorizon = dt->xid;
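
            /*
             * (During hot-standby replay, this conflict horizon is used to
             * cancel queries whose snapshots might still need to follow the
             * removed redirects.)
             */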

            ItemPointerSetInvalid(&dt->pointer);

            itemToPlaceholder[xlrec.nToPlaceholder] = i;
            xlrec.nToPlaceholder++;

            hasUpdate = true;
        }

        if (dt->tupstate == SPGIST_PLACEHOLDER)
        {
            if (!hasNonPlaceholder)
                firstPlaceholder = i;
        }
        else
        {
            hasNonPlaceholder = true;
        }
    }

    /*
     * Any placeholder tuples at the end of page can safely be removed.  We
     * can't remove ones before the last non-placeholder, though, because we
     * can't alter the offset numbers of non-placeholder tuples.
     */
    if (firstPlaceholder != InvalidOffsetNumber)
    {
        /*
         * We do not store this array to rdata because it's easy to recreate.
         */
        for (i = firstPlaceholder; i <= max; i++)
            itemnos[i - firstPlaceholder] = i;

        i = max - firstPlaceholder + 1;
        Assert(opaque->nPlaceholder >= i);
        opaque->nPlaceholder -= i;

        /* The array is surely sorted, so can use PageIndexMultiDelete */
        PageIndexMultiDelete(page, itemnos, i);

        hasUpdate = true;
    }

    xlrec.firstPlaceholder = firstPlaceholder;

    if (hasUpdate)
        MarkBufferDirty(buffer);

    if (hasUpdate && RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        XLogBeginInsert();

        XLogRegisterData((char *) &xlrec, SizeOfSpgxlogVacuumRedirect);
        XLogRegisterData((char *) itemToPlaceholder,
                         sizeof(OffsetNumber) * xlrec.nToPlaceholder);

        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

        recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

/*
 * Process one page during a bulkdelete scan
 */
static void
spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
{
    Relation    index = bds->info->index;
    Buffer      buffer;
    Page        page;

    /* call vacuum_delay_point while not holding any buffer lock */
    vacuum_delay_point();

    buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
                                RBM_NORMAL, bds->info->strategy);
    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    page = (Page) BufferGetPage(buffer);

    if (PageIsNew(page))
    {
        /*
         * We found an all-zero page, which could happen if the database
         * crashed just after extending the file.  Recycle it.
         */
    }
    else if (PageIsEmpty(page))
    {
        /* nothing to do */
    }
    else if (SpGistPageIsLeaf(page))
    {
        if (SpGistBlockIsRoot(blkno))
        {
            vacuumLeafRoot(bds, index, buffer);
            /* no need for vacuumRedirectAndPlaceholder */
        }
        else
        {
            vacuumLeafPage(bds, index, buffer, false);
            vacuumRedirectAndPlaceholder(index, buffer);
        }
    }
    else
    {
        /* inner page */
        vacuumRedirectAndPlaceholder(index, buffer);
    }

    /*
     * The root pages must never be deleted, nor marked as available in FSM,
     * because we don't want them ever returned by a search for a place to put
     * a new tuple.  Otherwise, check for empty page, and make sure the FSM
     * knows about it.
     */
    if (!SpGistBlockIsRoot(blkno))
    {
        if (PageIsNew(page) || PageIsEmpty(page))
        {
            RecordFreeIndexPage(index, blkno);
            bds->stats->pages_deleted++;
        }
        else
        {
            SpGistSetLastUsedPage(index, buffer);
            bds->lastFilledBlock = blkno;
        }
    }

    UnlockReleaseBuffer(buffer);
}

/*
 * Process the pending-TID list between pages of the main scan
 */
static void
spgprocesspending(spgBulkDeleteState *bds)
{
    Relation    index = bds->info->index;
    spgVacPendingItem *pitem;
    spgVacPendingItem *nitem;
    BlockNumber blkno;
    Buffer      buffer;
    Page        page;

    for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next)
    {
        if (pitem->done)
            continue;           /* ignore already-done items */

        /* call vacuum_delay_point while not holding any buffer lock */
        vacuum_delay_point();

        /* examine the referenced page */
        blkno = ItemPointerGetBlockNumber(&pitem->tid);
        buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
                                    RBM_NORMAL, bds->info->strategy);
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        page = (Page) BufferGetPage(buffer);

        if (PageIsNew(page) || SpGistPageIsDeleted(page))
        {
            /* Probably shouldn't happen, but ignore it */
        }
        else if (SpGistPageIsLeaf(page))
        {
            if (SpGistBlockIsRoot(blkno))
            {
                /* this should definitely not happen */
                elog(ERROR, "redirection leads to root page of index \"%s\"",
                     RelationGetRelationName(index));
            }

            /* deal with any deletable tuples */
            vacuumLeafPage(bds, index, buffer, true);
            /* might as well do this while we are here */
            vacuumRedirectAndPlaceholder(index, buffer);

            SpGistSetLastUsedPage(index, buffer);

            /*
             * We can mark as done not only this item, but any later ones
             * pointing at the same page, since we vacuumed the whole page.
             */
            pitem->done = true;
            for (nitem = pitem->next; nitem != NULL; nitem = nitem->next)
            {
                if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
                    nitem->done = true;
            }
        }
        else
        {
            /*
             * On an inner page, visit the referenced inner tuple and add all
             * its downlinks to the pending list.  We might have pending items
             * for more than one inner tuple on the same page (in fact this is
             * pretty likely given the way space allocation works), so get
             * them all while we are here.
             */
            for (nitem = pitem; nitem != NULL; nitem = nitem->next)
            {
                if (nitem->done)
                    continue;
                if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
                {
                    OffsetNumber offset;
                    SpGistInnerTuple innerTuple;

                    offset = ItemPointerGetOffsetNumber(&nitem->tid);
                    innerTuple = (SpGistInnerTuple) PageGetItem(page,
                                                                PageGetItemId(page, offset));
                    if (innerTuple->tupstate == SPGIST_LIVE)
                    {
                        SpGistNodeTuple node;
                        int         i;

                        SGITITERATE(innerTuple, i, node)
                        {
                            if (ItemPointerIsValid(&node->t_tid))
                                spgAddPendingTID(bds, &node->t_tid);
                        }
                    }
                    else if (innerTuple->tupstate == SPGIST_REDIRECT)
                    {
                        /* transfer attention to redirect point */
                        spgAddPendingTID(bds,
                                         &((SpGistDeadTuple) innerTuple)->pointer);
                    }
                    else
                        elog(ERROR, "unexpected SPGiST tuple state: %d",
                             innerTuple->tupstate);

                    nitem->done = true;
                }
            }
        }

        UnlockReleaseBuffer(buffer);
    }

    spgClearPendingList(bds);
}

/*
 * Perform a bulkdelete scan
 */
static void
spgvacuumscan(spgBulkDeleteState *bds)
{
    Relation    index = bds->info->index;
    bool        needLock;
    BlockNumber num_pages,
                blkno;

    /* Finish setting up spgBulkDeleteState */
    initSpGistState(&bds->spgstate, index);
    bds->pendingList = NULL;
    bds->myXmin = GetActiveSnapshot()->xmin;
    bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;

    /*
     * Reset counts that will be incremented during the scan; needed in case
     * of multiple scans during a single VACUUM command
     */
    bds->stats->estimated_count = false;
    bds->stats->num_index_tuples = 0;
    bds->stats->pages_deleted = 0;

    /* We can skip locking for new or temp relations */
    needLock = !RELATION_IS_LOCAL(index);

    /*
     * The outer loop iterates over all index pages except the metapage, in
     * physical order (we hope the kernel will cooperate in providing
     * read-ahead for speed).  It is critical that we visit all leaf pages,
     * including ones added after we start the scan, else we might fail to
     * delete some deletable tuples.  See more extensive comments about this
     * in btvacuumscan().
     */
    blkno = SPGIST_METAPAGE_BLKNO + 1;
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(index, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(index);
        if (needLock)
            UnlockRelationForExtension(index, ExclusiveLock);

        /* Quit if we've scanned the whole relation */
        if (blkno >= num_pages)
            break;
        /* Iterate over pages, then loop back to recheck length */
        for (; blkno < num_pages; blkno++)
        {
            spgvacuumpage(bds, blkno);
            /* empty the pending-list after each page */
            if (bds->pendingList != NULL)
                spgprocesspending(bds);
        }
    }

    /* Propagate local lastUsedPages cache to metablock */
    SpGistUpdateMetaPage(index);

    /*
     * If we found any empty pages (and recorded them in the FSM), then
     * forcibly update the upper-level FSM pages to ensure that searchers can
     * find them.  It's possible that the pages were also found during
     * previous scans and so this is a waste of time, but it's cheap enough
     * relative to scanning the index that it shouldn't matter much, and
     * making sure that free pages are available sooner not later seems
     * worthwhile.
     *
     * Note that if no empty pages exist, we don't bother vacuuming the FSM at
     * all.
     */
    if (bds->stats->pages_deleted > 0)
        IndexFreeSpaceMapVacuum(index);

    /*
     * Truncate index if possible
     *
     * XXX disabled because it's unsafe due to possible concurrent inserts.
     * We'd have to rescan the pages to make sure they're still empty, and it
     * doesn't seem worth it.  Note that btree doesn't do this either.
     *
     * Another reason not to truncate is that it could invalidate the cached
     * pages-with-freespace pointers in the metapage and other backends'
     * relation caches, that is leave them pointing to nonexistent pages.
     * Adding RelationGetNumberOfBlocks calls to protect the places that use
     * those pointers would be unduly expensive.
     */
#ifdef NOT_USED
    if (num_pages > bds->lastFilledBlock + 1)
    {
        BlockNumber lastBlock = num_pages - 1;

        num_pages = bds->lastFilledBlock + 1;
        RelationTruncate(index, num_pages);
        bds->stats->pages_removed += lastBlock - bds->lastFilledBlock;
        bds->stats->pages_deleted -= lastBlock - bds->lastFilledBlock;
    }
#endif

    /* Report final stats */
    bds->stats->num_pages = num_pages;
    bds->stats->pages_newly_deleted = bds->stats->pages_deleted;
    bds->stats->pages_free = bds->stats->pages_deleted;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
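/*
 * (This is reached via the index AM's ambulkdelete entry point; spghandler()
 * in spgutils.c installs spgbulkdelete in the IndexAmRoutine.)
 */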
IndexBulkDeleteResult *
spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
              IndexBulkDeleteCallback callback, void *callback_state)
{
    spgBulkDeleteState bds;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    bds.info = info;
    bds.stats = stats;
    bds.callback = callback;
    bds.callback_state = callback_state;

    spgvacuumscan(&bds);

    return stats;
}

/* Dummy callback to delete no tuples during spgvacuumcleanup */
static bool
dummy_callback(ItemPointer itemptr, void *state)
{
    return false;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    spgBulkDeleteState bds;

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        return stats;

    /*
     * We don't need to scan the index if there was a preceding bulkdelete
     * pass.  Otherwise, make a pass that won't delete any live tuples, but
     * might still accomplish useful stuff with redirect/placeholder cleanup
     * and/or FSM housekeeping, and in any case will provide stats.
     */
    if (stats == NULL)
    {
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        bds.info = info;
        bds.stats = stats;
        bds.callback = dummy_callback;
        bds.callback_state = NULL;

        spgvacuumscan(&bds);
    }

    /*
     * It's quite possible for us to be fooled by concurrent tuple moves into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
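    /*
     * (E.g., a tuple relocated from a page we already scanned to one we have
     * not reached yet would be counted twice.)
     */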
    if (!info->estimated_count)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    return stats;
}