contrib/amcheck/verify_heapam.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * verify_heapam.c
   4  *        Functions to check PostgreSQL heap relations for corruption
   5  *
   6  * Copyright (c) 2016-2024, PostgreSQL Global Development Group
   7  *
   8  *        contrib/amcheck/verify_heapam.c
   9  *-------------------------------------------------------------------------
  10  */
  11 #include "postgres.h"
  12
  13 #include "access/detoast.h"
  14 #include "access/genam.h"
  15 #include "access/heapam.h"
  16 #include "access/heaptoast.h"
  17 #include "access/multixact.h"
  18 #include "access/toast_internals.h"
  19 #include "access/visibilitymap.h"
  20 #include "catalog/pg_am.h"
  21 #include "funcapi.h"
  22 #include "miscadmin.h"
  23 #include "storage/bufmgr.h"
  24 #include "storage/procarray.h"
  25 #include "utils/builtins.h"
  26 #include "utils/fmgroids.h"
  27
  28 PG_FUNCTION_INFO_V1(verify_heapam);        /* fmgr declaration for the SQL-callable entry point */
  29
  30 /* The number of columns in each result tuple returned by verify_heapam */
  31 #define HEAPCHECK_RELATION_COLS 4
  32
  33 /* The largest valid toast va_rawsize (0x3FFFFFFF == 2^30 - 1) */
  34 #define VARLENA_SIZE_LIMIT 0x3FFFFFFF
  35
  36 /*
  37  * Result of range-checking an ID.  Despite the name, we use this for
  38  * reporting problems with both XIDs and MXIDs.
  39  */
  40 typedef enum XidBoundsViolation
  41 {
  42         XID_INVALID,
  43         XID_IN_FUTURE,
  44         XID_PRECEDES_CLUSTERMIN,        /* presumably vs. ctx->oldest_xid / oldest_mxact -- TODO confirm */
  45         XID_PRECEDES_RELMIN,            /* presumably vs. ctx->relfrozenxid / relminmxid -- TODO confirm */
  46         XID_BOUNDS_OK,                  /* judging by the name, the only non-violation result */
  47 } XidBoundsViolation;                   /* returned by check_mxid_in_range(), check_mxid_valid_in_rel(), get_xid_status() */
  48
  49 typedef enum XidCommitStatus            /* commit state of an XID; cached in HeapCheckContext.cached_status */
  50 {
  51         XID_COMMITTED,
  52         XID_IS_CURRENT_XID,             /* presumably: XID belongs to our own transaction -- TODO confirm */
  53         XID_IN_PROGRESS,
  54         XID_ABORTED,
  55 } XidCommitStatus;                      /* out-parameter type of get_xid_status(), check_tuple(), check_tuple_visibility() */
  56
  57 typedef enum SkipPages                  /* parsed form of verify_heapam's "skip" argument */
  58 {
  59         SKIP_PAGES_ALL_FROZEN,          /* "all-frozen": skip blocks whose VM status has VISIBILITYMAP_ALL_FROZEN */
  60         SKIP_PAGES_ALL_VISIBLE,         /* "all-visible": skip blocks whose VM status has VISIBILITYMAP_ALL_VISIBLE */
  61         SKIP_PAGES_NONE,                /* "none": check every block; the visibility map is not consulted */
  62 } SkipPages;
  63
  64 /*
  65  * Struct holding information about a toasted attribute sufficient to both
  66  * check the toasted attribute and, if found to be corrupt, to report where it
  67  * was encountered in the main table.
  68  */
  69 typedef struct ToastedAttribute
  70 {
  71         struct varatt_external toast_pointer;   /* external TOAST pointer of the attribute */
  72         BlockNumber blkno;                      /* block in main table */
  73         OffsetNumber offnum;            /* offset in main table */
  74         AttrNumber      attnum;                 /* attribute in main table */
  75 } ToastedAttribute;                     /* element type of HeapCheckContext.toasted_attributes */
  76
  77 /*
  78  * Struct holding the running context information for the lifetime of a
  79  * single verify_heapam execution.
  80  */
  81 typedef struct HeapCheckContext
  82 {
  83         /*
  84          * Cached copies of values from TransamVariables and computed values from
  85          * them.
  86          */
  87         FullTransactionId next_fxid;    /* TransamVariables->nextXid */
  88         TransactionId next_xid;         /* 32-bit version of next_fxid */
  89         TransactionId oldest_xid;       /* TransamVariables->oldestXid; verify_heapam replaces it with relfrozenxid when that is a normal XID */
  90         FullTransactionId oldest_fxid;  /* 64-bit version of oldest_xid, computed
  91                                                                          * relative to next_fxid */
  92         TransactionId safe_xmin;        /* this XID and newer ones can't become
  93                                                                  * all-visible while we're running */
  94
  95         /*
  96          * Cached copies of values from MultiXactState
  97          */
  98         MultiXactId next_mxact;         /* MultiXactState->nextMXact */
  99         MultiXactId oldest_mxact;       /* MultiXactState->oldestMultiXactId */
 100
 101         /*
 102          * Cached copies of the most recently checked xid and its status.
 103          */
 104         TransactionId cached_xid;       /* initialized to InvalidTransactionId */
 105         XidCommitStatus cached_status;  /* status that goes with cached_xid */
 106
 107         /* Values concerning the heap relation being checked */
 108         Relation        rel;            /* opened by verify_heapam with AccessShareLock */
 109         TransactionId relfrozenxid;     /* rel->rd_rel->relfrozenxid */
 110         FullTransactionId relfrozenfxid;        /* 64-bit version of relfrozenxid */
 111         TransactionId relminmxid;       /* rel->rd_rel->relminmxid */
 112         Relation        toast_rel;      /* NULL if there is no toast table or check_toast is false */
 113         Relation   *toast_indexes;      /* array filled in by toast_open_indexes(), else NULL */
 114         Relation        valid_toast_index;      /* toast_indexes[] entry selected by toast_open_indexes() */
 115         int                     num_toast_indexes;      /* count set by toast_open_indexes(), else 0 */
 116
 117         /* Values for iterating over pages in the relation */
 118         BlockNumber blkno;              /* block currently being checked */
 119         BufferAccessStrategy bstrategy; /* BAS_BULKREAD strategy used for page reads */
 120         Buffer          buffer;         /* buffer read for block blkno */
 121         Page            page;           /* BufferGetPage(buffer) */
 122
 123         /* Values for iterating over tuples within a page */
 124         OffsetNumber offnum;            /* line pointer currently being checked */
 125         ItemId          itemid;         /* PageGetItemId(page, offnum) */
 126         uint16          lp_len;         /* ItemIdGetLength(itemid) */
 127         uint16          lp_off;         /* ItemIdGetOffset(itemid) */
 128         HeapTupleHeader tuphdr;         /* PageGetItem(page, itemid) */
 129         int                     natts;  /* HeapTupleHeaderGetNatts(tuphdr) */
 130
 131         /* Values for iterating over attributes within the tuple */
 132         uint32          offset;                 /* offset in tuple data */
 133         AttrNumber      attnum;         /* -1 means corruption is reported with a NULL attnum */
 134
 135         /* True if tuple's xmax makes it eligible for pruning */
 136         bool            tuple_could_be_pruned;
 137
 138         /*
 139          * List of ToastedAttribute structs for toasted attributes which are not
 140          * eligible for pruning and should be checked
 141          */
 142         List       *toasted_attributes; /* starts out as NIL */
 143
 144         /* Whether verify_heapam has yet encountered any corrupt tuples */
 145         bool            is_corrupt;
 146
 147         /* The descriptor and tuplestore for verify_heapam's result tuples */
 148         TupleDesc       tupdesc;        /* rsinfo->setDesc */
 149         Tuplestorestate *tupstore;      /* rsinfo->setResult */
 150 } HeapCheckContext;
 151
 152 /* Internal implementation: forward declarations of file-local (static) helpers */
 153 static void check_tuple(HeapCheckContext *ctx,
 154                                                 bool *xmin_commit_status_ok,
 155                                                 XidCommitStatus *xmin_commit_status);  /* run by verify_heapam on each tuple whose line pointer passed sanity checks */
 156 static void check_toast_tuple(HeapTuple toasttup, HeapCheckContext *ctx,
 157                                                           ToastedAttribute *ta, int32 *expected_chunk_seq,
 158                                                           uint32 extsize);
 159
 160 static bool check_tuple_attribute(HeapCheckContext *ctx);
 161 static void check_toasted_attribute(HeapCheckContext *ctx,
 162                                                                         ToastedAttribute *ta);
 163
 164 static bool check_tuple_header(HeapCheckContext *ctx);
 165 static bool check_tuple_visibility(HeapCheckContext *ctx,
 166                                                                    bool *xmin_commit_status_ok,
 167                                                                    XidCommitStatus *xmin_commit_status);
 168
 169 static void report_corruption(HeapCheckContext *ctx, char *msg);       /* callers build msg with psprintf() */
 170 static void report_toast_corruption(HeapCheckContext *ctx,
 171                                                                         ToastedAttribute *ta, char *msg);
 172 static FullTransactionId FullTransactionIdFromXidAndCtx(TransactionId xid,
 173                                                                                                                 const HeapCheckContext *ctx);  /* used to derive ctx->relfrozenfxid */
 174 static void update_cached_xid_range(HeapCheckContext *ctx);    /* called by verify_heapam before the block scan */
 175 static void update_cached_mxid_range(HeapCheckContext *ctx);   /* called by verify_heapam before the block scan */
 176 static XidBoundsViolation check_mxid_in_range(MultiXactId mxid,
 177                                                                                           HeapCheckContext *ctx);
 178 static XidBoundsViolation check_mxid_valid_in_rel(MultiXactId mxid,
 179                                                                                                   HeapCheckContext *ctx);
 180 static XidBoundsViolation get_xid_status(TransactionId xid,
 181                                                                                  HeapCheckContext *ctx,
 182                                                                                  XidCommitStatus *status);
 183
 184 /*
 185  * Scan and report corruption in heap pages, optionally reconciling toasted
 186  * attributes with entries in the associated toast table.  Intended to be
 187  * called from SQL with the following parameters:
 188  *
 189  *   relation:
 190  *     The Oid of the heap relation to be checked.
 191  *
 192  *   on_error_stop:
 193  *     Whether to stop at the end of the first page for which errors are
 194  *     detected.  Note that multiple rows may be returned.
 195  *
 196  *   check_toast:
 197  *     Whether to check each toasted attribute against the toast table to
 198  *     verify that it can be found there.
 199  *
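 200  *   skip:
 201  *     What kinds of pages in the heap relation should be skipped.  Valid
 202  *     options are "all-visible", "all-frozen", and "none".
      *
      *   starting block number (fifth argument):
      *     First block to check.  NULL means block 0; otherwise the value
      *     must be between 0 and the relation's last block number.
      *
      *   ending block number (sixth argument):
      *     Last block to check, inclusive.  NULL means the relation's last
      *     block; otherwise the value must be between 0 and the relation's
      *     last block number.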
 203  *
 204  * Returns to the SQL caller a set of tuples, each containing the location
 205  * and a description of a corruption found in the heap.
 206  *
 207  * This code goes to some trouble to avoid crashing the server even if the
 208  * table pages are badly corrupted, but it's probably not perfect. If
 209  * check_toast is true, we'll use regular index lookups to try to fetch TOAST
 210  * tuples, which can certainly cause crashes if the right kind of corruption
 211  * exists in the toast table or index. No matter what parameters you pass,
 212  * we can't protect against crashes that might occur trying to look up the
 213  * commit status of transaction IDs (though we avoid trying to do such lookups
 214  * for transaction IDs that can't legally appear in the table).
 215  */
 216 Datum
 217 verify_heapam(PG_FUNCTION_ARGS)
 218 {
 219         ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
 220         HeapCheckContext ctx;
 221         Buffer          vmbuffer = InvalidBuffer;
 222         Oid                     relid;
 223         bool            on_error_stop;
 224         bool            check_toast;
 225         SkipPages       skip_option = SKIP_PAGES_NONE;
 226         BlockNumber first_block;
 227         BlockNumber last_block;
 228         BlockNumber nblocks;
 229         const char *skip;
 230
 231         /* Check supplied arguments */
 232         if (PG_ARGISNULL(0))
 233                 ereport(ERROR,
 234                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 235                                  errmsg("relation cannot be null")));
 236         relid = PG_GETARG_OID(0);
 237
 238         if (PG_ARGISNULL(1))
 239                 ereport(ERROR,
 240                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 241                                  errmsg("on_error_stop cannot be null")));
 242         on_error_stop = PG_GETARG_BOOL(1);
 243
 244         if (PG_ARGISNULL(2))
 245                 ereport(ERROR,
 246                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 247                                  errmsg("check_toast cannot be null")));
 248         check_toast = PG_GETARG_BOOL(2);
 249
 250         if (PG_ARGISNULL(3))
 251                 ereport(ERROR,
 252                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 253                                  errmsg("skip cannot be null")));
 254         skip = text_to_cstring(PG_GETARG_TEXT_PP(3));
 255         if (pg_strcasecmp(skip, "all-visible") == 0)
 256                 skip_option = SKIP_PAGES_ALL_VISIBLE;
 257         else if (pg_strcasecmp(skip, "all-frozen") == 0)
 258                 skip_option = SKIP_PAGES_ALL_FROZEN;
 259         else if (pg_strcasecmp(skip, "none") == 0)
 260                 skip_option = SKIP_PAGES_NONE;
 261         else
 262                 ereport(ERROR,
 263                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 264                                  errmsg("invalid skip option"),
 265                                  errhint("Valid skip options are \"all-visible\", \"all-frozen\", and \"none\".")));
 266
 267         memset(&ctx, 0, sizeof(HeapCheckContext));
 268         ctx.cached_xid = InvalidTransactionId;
 269         ctx.toasted_attributes = NIL;
 270
 271         /*
 272          * Any xmin newer than the xmin of our snapshot can't become all-visible
 273          * while we're running.
 274          */
 275         ctx.safe_xmin = GetTransactionSnapshot()->xmin;
 276
 277         /*
 278          * If we report corruption when not examining some individual attribute,
 279          * we need attnum to be reported as NULL.  Set that up before any
 280          * corruption reporting might happen.
 281          */
 282         ctx.attnum = -1;
 283
 284         /* Construct the tuplestore and tuple descriptor */
 285         InitMaterializedSRF(fcinfo, 0);
 286         ctx.tupdesc = rsinfo->setDesc;
 287         ctx.tupstore = rsinfo->setResult;
 288
 289         /* Open relation, check relkind and access method */
 290         ctx.rel = relation_open(relid, AccessShareLock);
 291
 292         /*
 293          * Check that a relation's relkind and access method are both supported.
 294          */
 295         if (!RELKIND_HAS_TABLE_AM(ctx.rel->rd_rel->relkind) &&
 296                 ctx.rel->rd_rel->relkind != RELKIND_SEQUENCE)
 297                 ereport(ERROR,
 298                                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
 299                                  errmsg("cannot check relation \"%s\"",
 300                                                 RelationGetRelationName(ctx.rel)),
 301                                  errdetail_relkind_not_supported(ctx.rel->rd_rel->relkind)));
 302
 303         /*
 304          * Sequences always use heap AM, but they don't show that in the catalogs.
 305          * Other relkinds might be using a different AM, so check.
 306          */
 307         if (ctx.rel->rd_rel->relkind != RELKIND_SEQUENCE &&
 308                 ctx.rel->rd_rel->relam != HEAP_TABLE_AM_OID)
 309                 ereport(ERROR,
 310                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 311                                  errmsg("only heap AM is supported")));
 312
 313         /*
 314          * Early exit for unlogged relations during recovery.  These will have no
 315          * relation fork, so there won't be anything to check.  We behave as if
 316          * the relation is empty.
 317          */
 318         if (ctx.rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
 319                 RecoveryInProgress())
 320         {
 321                 ereport(DEBUG1,
 322                                 (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
 323                                  errmsg("cannot verify unlogged relation \"%s\" during recovery, skipping",
 324                                                 RelationGetRelationName(ctx.rel))));
 325                 relation_close(ctx.rel, AccessShareLock);
 326                 PG_RETURN_NULL();
 327         }
 328
 329         /* Early exit if the relation is empty */
 330         nblocks = RelationGetNumberOfBlocks(ctx.rel);
 331         if (!nblocks)
 332         {
 333                 relation_close(ctx.rel, AccessShareLock);
 334                 PG_RETURN_NULL();
 335         }
 336
 337         ctx.bstrategy = GetAccessStrategy(BAS_BULKREAD);
 338         ctx.buffer = InvalidBuffer;
 339         ctx.page = NULL;
 340
 341         /* Validate block numbers, or handle nulls. */
 342         if (PG_ARGISNULL(4))
 343                 first_block = 0;
 344         else
 345         {
 346                 int64           fb = PG_GETARG_INT64(4);
 347
 348                 if (fb < 0 || fb >= nblocks)
 349                         ereport(ERROR,
 350                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 351                                          errmsg("starting block number must be between 0 and %u",
 352                                                         nblocks - 1)));
 353                 first_block = (BlockNumber) fb;
 354         }
 355         if (PG_ARGISNULL(5))
 356                 last_block = nblocks - 1;
 357         else
 358         {
 359                 int64           lb = PG_GETARG_INT64(5);
 360
 361                 if (lb < 0 || lb >= nblocks)
 362                         ereport(ERROR,
 363                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 364                                          errmsg("ending block number must be between 0 and %u",
 365                                                         nblocks - 1)));
 366                 last_block = (BlockNumber) lb;
 367         }
 368
 369         /* Optionally open the toast relation, if any. */
 370         if (ctx.rel->rd_rel->reltoastrelid && check_toast)
 371         {
 372                 int                     offset;
 373
 374                 /* Main relation has associated toast relation */
 375                 ctx.toast_rel = table_open(ctx.rel->rd_rel->reltoastrelid,
 376                                                                    AccessShareLock);
 377                 offset = toast_open_indexes(ctx.toast_rel,
 378                                                                         AccessShareLock,
 379                                                                         &(ctx.toast_indexes),
 380                                                                         &(ctx.num_toast_indexes));
 381                 ctx.valid_toast_index = ctx.toast_indexes[offset];
 382         }
 383         else
 384         {
 385                 /*
 386                  * Main relation has no associated toast relation, or we're
 387                  * intentionally skipping it.
 388                  */
 389                 ctx.toast_rel = NULL;
 390                 ctx.toast_indexes = NULL;
 391                 ctx.num_toast_indexes = 0;
 392         }
 393
 394         update_cached_xid_range(&ctx);
 395         update_cached_mxid_range(&ctx);
 396         ctx.relfrozenxid = ctx.rel->rd_rel->relfrozenxid;
 397         ctx.relfrozenfxid = FullTransactionIdFromXidAndCtx(ctx.relfrozenxid, &ctx);
 398         ctx.relminmxid = ctx.rel->rd_rel->relminmxid;
 399
 400         if (TransactionIdIsNormal(ctx.relfrozenxid))
 401                 ctx.oldest_xid = ctx.relfrozenxid;
 402
 403         for (ctx.blkno = first_block; ctx.blkno <= last_block; ctx.blkno++)
 404         {
 405                 OffsetNumber maxoff;
 406                 OffsetNumber predecessor[MaxOffsetNumber];
 407                 OffsetNumber successor[MaxOffsetNumber];
 408                 bool            lp_valid[MaxOffsetNumber];
 409                 bool            xmin_commit_status_ok[MaxOffsetNumber];
 410                 XidCommitStatus xmin_commit_status[MaxOffsetNumber];
 411
 412                 CHECK_FOR_INTERRUPTS();
 413
 414                 memset(predecessor, 0, sizeof(OffsetNumber) * MaxOffsetNumber);
 415
 416                 /* Optionally skip over all-frozen or all-visible blocks */
 417                 if (skip_option != SKIP_PAGES_NONE)
 418                 {
 419                         int32           mapbits;
 420
 421                         mapbits = (int32) visibilitymap_get_status(ctx.rel, ctx.blkno,
 422                                                                                                            &vmbuffer);
 423                         if (skip_option == SKIP_PAGES_ALL_FROZEN)
 424                         {
 425                                 if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
 426                                         continue;
 427                         }
 428
 429                         if (skip_option == SKIP_PAGES_ALL_VISIBLE)
 430                         {
 431                                 if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
 432                                         continue;
 433                         }
 434                 }
 435
 436                 /* Read and lock the next page. */
 437                 ctx.buffer = ReadBufferExtended(ctx.rel, MAIN_FORKNUM, ctx.blkno,
 438                                                                                 RBM_NORMAL, ctx.bstrategy);
 439                 LockBuffer(ctx.buffer, BUFFER_LOCK_SHARE);
 440                 ctx.page = BufferGetPage(ctx.buffer);
 441
 442                 /* Perform tuple checks */
 443                 maxoff = PageGetMaxOffsetNumber(ctx.page);
 444                 for (ctx.offnum = FirstOffsetNumber; ctx.offnum <= maxoff;
 445                          ctx.offnum = OffsetNumberNext(ctx.offnum))
 446                 {
 447                         BlockNumber nextblkno;
 448                         OffsetNumber nextoffnum;
 449
 450                         successor[ctx.offnum] = InvalidOffsetNumber;
 451                         lp_valid[ctx.offnum] = false;
 452                         xmin_commit_status_ok[ctx.offnum] = false;
 453                         ctx.itemid = PageGetItemId(ctx.page, ctx.offnum);
 454
 455                         /* Skip over unused/dead line pointers */
 456                         if (!ItemIdIsUsed(ctx.itemid) || ItemIdIsDead(ctx.itemid))
 457                                 continue;
 458
 459                         /*
 460                          * If this line pointer has been redirected, check that it
 461                          * redirects to a valid offset within the line pointer array
 462                          */
 463                         if (ItemIdIsRedirected(ctx.itemid))
 464                         {
 465                                 OffsetNumber rdoffnum = ItemIdGetRedirect(ctx.itemid);
 466                                 ItemId          rditem;
 467
 468                                 if (rdoffnum < FirstOffsetNumber)
 469                                 {
 470                                         report_corruption(&ctx,
 471                                                                           psprintf("line pointer redirection to item at offset %u precedes minimum offset %u",
 472                                                                                            (unsigned) rdoffnum,
 473                                                                                            (unsigned) FirstOffsetNumber));
 474                                         continue;
 475                                 }
 476                                 if (rdoffnum > maxoff)
 477                                 {
 478                                         report_corruption(&ctx,
 479                                                                           psprintf("line pointer redirection to item at offset %u exceeds maximum offset %u",
 480                                                                                            (unsigned) rdoffnum,
 481                                                                                            (unsigned) maxoff));
 482                                         continue;
 483                                 }
 484
 485                                 /*
 486                                  * Since we've checked that this redirect points to a line
 487                                  * pointer between FirstOffsetNumber and maxoff, it should now
 488                                  * be safe to fetch the referenced line pointer. We expect it
 489                                  * to be LP_NORMAL; if not, that's corruption.
 490                                  */
 491                                 rditem = PageGetItemId(ctx.page, rdoffnum);
 492                                 if (!ItemIdIsUsed(rditem))
 493                                 {
 494                                         report_corruption(&ctx,
 495                                                                           psprintf("redirected line pointer points to an unused item at offset %u",
 496                                                                                            (unsigned) rdoffnum));
 497                                         continue;
 498                                 }
 499                                 else if (ItemIdIsDead(rditem))
 500                                 {
 501                                         report_corruption(&ctx,
 502                                                                           psprintf("redirected line pointer points to a dead item at offset %u",
 503                                                                                            (unsigned) rdoffnum));
 504                                         continue;
 505                                 }
 506                                 else if (ItemIdIsRedirected(rditem))
 507                                 {
 508                                         report_corruption(&ctx,
 509                                                                           psprintf("redirected line pointer points to another redirected line pointer at offset %u",
 510                                                                                            (unsigned) rdoffnum));
 511                                         continue;
 512                                 }
 513
 514                                 /*
 515                                  * Record the fact that this line pointer has passed basic
 516                                  * sanity checking, and also the offset number to which it
 517                                  * points.
 518                                  */
 519                                 lp_valid[ctx.offnum] = true;
 520                                 successor[ctx.offnum] = rdoffnum;
 521                                 continue;
 522                         }
 523
 524                         /* Sanity-check the line pointer's offset and length values */
 525                         ctx.lp_len = ItemIdGetLength(ctx.itemid);
 526                         ctx.lp_off = ItemIdGetOffset(ctx.itemid);
 527
 528                         if (ctx.lp_off != MAXALIGN(ctx.lp_off))
 529                         {
 530                                 report_corruption(&ctx,
 531                                                                   psprintf("line pointer to page offset %u is not maximally aligned",
 532                                                                                    ctx.lp_off));
 533                                 continue;
 534                         }
 535                         if (ctx.lp_len < MAXALIGN(SizeofHeapTupleHeader))
 536                         {
 537                                 report_corruption(&ctx,
 538                                                                   psprintf("line pointer length %u is less than the minimum tuple header size %u",
 539                                                                                    ctx.lp_len,
 540                                                                                    (unsigned) MAXALIGN(SizeofHeapTupleHeader)));
 541                                 continue;
 542                         }
 543                         if (ctx.lp_off + ctx.lp_len > BLCKSZ)
 544                         {
 545                                 report_corruption(&ctx,
 546                                                                   psprintf("line pointer to page offset %u with length %u ends beyond maximum page offset %u",
 547                                                                                    ctx.lp_off,
 548                                                                                    ctx.lp_len,
 549                                                                                    (unsigned) BLCKSZ));
 550                                 continue;
 551                         }
 552
 553                         /* It should be safe to examine the tuple's header, at least */
 554                         lp_valid[ctx.offnum] = true;
 555                         ctx.tuphdr = (HeapTupleHeader) PageGetItem(ctx.page, ctx.itemid);
 556                         ctx.natts = HeapTupleHeaderGetNatts(ctx.tuphdr);
 557
 558                         /* Ok, ready to check this next tuple */
 559                         check_tuple(&ctx,
 560                                                 &xmin_commit_status_ok[ctx.offnum],
 561                                                 &xmin_commit_status[ctx.offnum]);
 562
 563                         /*
 564                          * If the CTID field of this tuple seems to point to another tuple
 565                          * on the same page, record that tuple as the successor of this
 566                          * one.
 567                          */
 568                         nextblkno = ItemPointerGetBlockNumber(&(ctx.tuphdr)->t_ctid);
 569                         nextoffnum = ItemPointerGetOffsetNumber(&(ctx.tuphdr)->t_ctid);
 570                         if (nextblkno == ctx.blkno && nextoffnum != ctx.offnum &&
 571                                 nextoffnum >= FirstOffsetNumber && nextoffnum <= maxoff)
 572                                 successor[ctx.offnum] = nextoffnum;
 573                 }
 574
 575                 /*
 576                  * Update chain validation. Check each line pointer that's got a valid
 577                  * successor against that successor.
 578                  */
 579                 ctx.attnum = -1;
 580                 for (ctx.offnum = FirstOffsetNumber; ctx.offnum <= maxoff;
 581                          ctx.offnum = OffsetNumberNext(ctx.offnum))
 582                 {
 583                         ItemId          curr_lp;
 584                         ItemId          next_lp;
 585                         HeapTupleHeader curr_htup;
 586                         HeapTupleHeader next_htup;
 587                         TransactionId curr_xmin;
 588                         TransactionId curr_xmax;
 589                         TransactionId next_xmin;
 590                         OffsetNumber nextoffnum = successor[ctx.offnum];
 591
 592                         /*
 593                          * The current line pointer may not have a successor, either
 594                          * because it's not valid or because it didn't point to anything.
 595                          * In either case, we have to give up.
 596                          *
 597                          * If the current line pointer does point to something, it's
 598                          * possible that the target line pointer isn't valid. We have to
 599                          * give up in that case, too.
 600                          */
 601                         if (nextoffnum == InvalidOffsetNumber || !lp_valid[nextoffnum])
 602                                 continue;
 603
 604                         /* We have two valid line pointers that we can examine. */
 605                         curr_lp = PageGetItemId(ctx.page, ctx.offnum);
 606                         next_lp = PageGetItemId(ctx.page, nextoffnum);
 607
 608                         /* Handle the cases where the current line pointer is a redirect. */
 609                         if (ItemIdIsRedirected(curr_lp))
 610                         {
 611                                 /*
 612                                  * We should not have set successor[ctx.offnum] to a value
 613                                  * other than InvalidOffsetNumber unless that line pointer is
 614                                  * LP_NORMAL.
 615                                  */
 616                                 Assert(ItemIdIsNormal(next_lp));
 617
 618                                 /* Can only redirect to a HOT tuple. */
 619                                 next_htup = (HeapTupleHeader) PageGetItem(ctx.page, next_lp);
 620                                 if (!HeapTupleHeaderIsHeapOnly(next_htup))
 621                                 {
 622                                         report_corruption(&ctx,
 623                                                                           psprintf("redirected line pointer points to a non-heap-only tuple at offset %u",
 624                                                                                            (unsigned) nextoffnum));
 625                                 }
 626
 627                                 /* HOT chains should not intersect. */
 628                                 if (predecessor[nextoffnum] != InvalidOffsetNumber)
 629                                 {
 630                                         report_corruption(&ctx,
 631                                                                           psprintf("redirect line pointer points to offset %u, but offset %u also points there",
 632                                                                                            (unsigned) nextoffnum, (unsigned) predecessor[nextoffnum]));
 633                                         continue;
 634                                 }
 635
 636                                 /*
 637                                  * This redirect and the tuple to which it points seem to be
 638                                  * part of an update chain.
 639                                  */
 640                                 predecessor[nextoffnum] = ctx.offnum;
 641                                 continue;
 642                         }
 643
 644                         /*
 645                          * If the next line pointer is a redirect, or if it's a tuple but
 646                          * the XMAX of this tuple doesn't match the XMIN of the next
 647                          * tuple, then the two aren't part of the same update chain and
 648                          * there is nothing more to do.
 649                          */
 650                         if (ItemIdIsRedirected(next_lp))
 651                                 continue;
 652                         curr_htup = (HeapTupleHeader) PageGetItem(ctx.page, curr_lp);
 653                         curr_xmax = HeapTupleHeaderGetUpdateXid(curr_htup);
 654                         next_htup = (HeapTupleHeader) PageGetItem(ctx.page, next_lp);
 655                         next_xmin = HeapTupleHeaderGetXmin(next_htup);
 656                         if (!TransactionIdIsValid(curr_xmax) ||
 657                                 !TransactionIdEquals(curr_xmax, next_xmin))
 658                                 continue;
 659
 660                         /* HOT chains should not intersect. */
 661                         if (predecessor[nextoffnum] != InvalidOffsetNumber)
 662                         {
 663                                 report_corruption(&ctx,
 664                                                                   psprintf("tuple points to new version at offset %u, but offset %u also points there",
 665                                                                                    (unsigned) nextoffnum, (unsigned) predecessor[nextoffnum]));
 666                                 continue;
 667                         }
 668
 669                         /*
 670                          * This tuple and the tuple to which it points seem to be part of
 671                          * an update chain.
 672                          */
 673                         predecessor[nextoffnum] = ctx.offnum;
 674
 675                         /*
 676                          * If the current tuple is marked as HOT-updated, then the next
 677                          * tuple should be marked as a heap-only tuple. Conversely, if the
 678                          * current tuple isn't marked as HOT-updated, then the next tuple
 679                          * shouldn't be marked as a heap-only tuple.
 680                          *
 681                          * NB: Can't use HeapTupleHeaderIsHotUpdated() as it checks if
 682                          * hint bits indicate xmin/xmax aborted.
 683                          */
 684                         if (!(curr_htup->t_infomask2 & HEAP_HOT_UPDATED) &&
 685                                 HeapTupleHeaderIsHeapOnly(next_htup))
 686                         {
 687                                 report_corruption(&ctx,
 688                                                                   psprintf("non-heap-only update produced a heap-only tuple at offset %u",
 689                                                                                    (unsigned) nextoffnum));
 690                         }
 691                         if ((curr_htup->t_infomask2 & HEAP_HOT_UPDATED) &&
 692                                 !HeapTupleHeaderIsHeapOnly(next_htup))
 693                         {
 694                                 report_corruption(&ctx,
 695                                                                   psprintf("heap-only update produced a non-heap only tuple at offset %u",
 696                                                                                    (unsigned) nextoffnum));
 697                         }
 698
 699                         /*
 700                          * If the current tuple's xmin is still in progress but the
 701                          * successor tuple's xmin is committed, that's corruption.
 702                          *
 703                          * NB: We recheck the commit status of the current tuple's xmin
 704                          * here, because it might have committed after we checked it and
 705                          * before we checked the commit status of the successor tuple's
 706                          * xmin. This should be safe because the xmin itself can't have
 707                          * changed, only its commit status.
 708                          */
 709                         curr_xmin = HeapTupleHeaderGetXmin(curr_htup);
 710                         if (xmin_commit_status_ok[ctx.offnum] &&
 711                                 xmin_commit_status[ctx.offnum] == XID_IN_PROGRESS &&
 712                                 xmin_commit_status_ok[nextoffnum] &&
 713                                 xmin_commit_status[nextoffnum] == XID_COMMITTED &&
 714                                 TransactionIdIsInProgress(curr_xmin))
 715                         {
 716                                 report_corruption(&ctx,
 717                                                                   psprintf("tuple with in-progress xmin %u was updated to produce a tuple at offset %u with committed xmin %u",
 718                                                                                    (unsigned) curr_xmin,
 719                                                                                    (unsigned) ctx.offnum,
 720                                                                                    (unsigned) next_xmin));
 721                         }
 722
 723                         /*
 724                          * If the current tuple's xmin is aborted but the successor
 725                          * tuple's xmin is in-progress or committed, that's corruption.
 726                          */
 727                         if (xmin_commit_status_ok[ctx.offnum] &&
 728                                 xmin_commit_status[ctx.offnum] == XID_ABORTED &&
 729                                 xmin_commit_status_ok[nextoffnum])
 730                         {
 731                                 if (xmin_commit_status[nextoffnum] == XID_IN_PROGRESS)
 732                                         report_corruption(&ctx,
 733                                                                           psprintf("tuple with aborted xmin %u was updated to produce a tuple at offset %u with in-progress xmin %u",
 734                                                                                            (unsigned) curr_xmin,
 735                                                                                            (unsigned) ctx.offnum,
 736                                                                                            (unsigned) next_xmin));
 737                                 else if (xmin_commit_status[nextoffnum] == XID_COMMITTED)
 738                                         report_corruption(&ctx,
 739                                                                           psprintf("tuple with aborted xmin %u was updated to produce a tuple at offset %u with committed xmin %u",
 740                                                                                            (unsigned) curr_xmin,
 741                                                                                            (unsigned) ctx.offnum,
 742                                                                                            (unsigned) next_xmin));
 743                         }
 744                 }
 745
 746                 /*
 747                  * An update chain can start either with a non-heap-only tuple or with
 748                  * a redirect line pointer, but not with a heap-only tuple.
 749                  *
 750                  * (This check is in a separate loop because we need the predecessor
 751                  * array to be fully populated before we can perform it.)
 752                  */
 753                 for (ctx.offnum = FirstOffsetNumber;
 754                          ctx.offnum <= maxoff;
 755                          ctx.offnum = OffsetNumberNext(ctx.offnum))
 756                 {
 757                         if (xmin_commit_status_ok[ctx.offnum] &&
 758                                 (xmin_commit_status[ctx.offnum] == XID_COMMITTED ||
 759                                  xmin_commit_status[ctx.offnum] == XID_IN_PROGRESS) &&
 760                                 predecessor[ctx.offnum] == InvalidOffsetNumber)
 761                         {
 762                                 ItemId          curr_lp;
 763
 764                                 curr_lp = PageGetItemId(ctx.page, ctx.offnum);
 765                                 if (!ItemIdIsRedirected(curr_lp))
 766                                 {
 767                                         HeapTupleHeader curr_htup;
 768
 769                                         curr_htup = (HeapTupleHeader)
 770                                                 PageGetItem(ctx.page, curr_lp);
 771                                         if (HeapTupleHeaderIsHeapOnly(curr_htup))
 772                                                 report_corruption(&ctx,
 773                                                                                   psprintf("tuple is root of chain but is marked as heap-only tuple"));
 774                                 }
 775                         }
 776                 }
 777
 778                 /* clean up */
 779                 UnlockReleaseBuffer(ctx.buffer);
 780
 781                 /*
 782                  * Check any toast pointers from the page whose lock we just released
 783                  */
 784                 if (ctx.toasted_attributes != NIL)
 785                 {
 786                         ListCell   *cell;
 787
 788                         foreach(cell, ctx.toasted_attributes)
 789                                 check_toasted_attribute(&ctx, lfirst(cell));
 790                         list_free_deep(ctx.toasted_attributes);
 791                         ctx.toasted_attributes = NIL;
 792                 }
 793
 794                 if (on_error_stop && ctx.is_corrupt)
 795                         break;
 796         }
 797
 798         if (vmbuffer != InvalidBuffer)
 799                 ReleaseBuffer(vmbuffer);
 800
 801         /* Close the associated toast table and indexes, if any. */
 802         if (ctx.toast_indexes)
 803                 toast_close_indexes(ctx.toast_indexes, ctx.num_toast_indexes,
 804                                                         AccessShareLock);
 805         if (ctx.toast_rel)
 806                 table_close(ctx.toast_rel, AccessShareLock);
 807
 808         /* Close the main relation */
 809         relation_close(ctx.rel, AccessShareLock);
 810
 811         PG_RETURN_NULL();
 812 }
 813
 814 /*
 815  * Shared internal implementation for report_corruption and
 816  * report_toast_corruption.
 817  */
 818 static void
 819 report_corruption_internal(Tuplestorestate *tupstore, TupleDesc tupdesc,
 820                                                    BlockNumber blkno, OffsetNumber offnum,
 821                                                    AttrNumber attnum, char *msg)
 822 {
 823         Datum           values[HEAPCHECK_RELATION_COLS] = {0};
 824         bool            nulls[HEAPCHECK_RELATION_COLS] = {0};
 825         HeapTuple       tuple;
 826
 827         values[0] = Int64GetDatum(blkno);
 828         values[1] = Int32GetDatum(offnum);
 829         values[2] = Int32GetDatum(attnum);
 830         nulls[2] = (attnum < 0);
 831         values[3] = CStringGetTextDatum(msg);
 832
 833         /*
 834          * In principle, there is nothing to prevent a scan over a large, highly
 835          * corrupted table from using work_mem worth of memory building up the
 836          * tuplestore.  That's ok, but if we also leak the msg argument memory
 837          * until the end of the query, we could exceed work_mem by more than a
 838          * trivial amount.  Therefore, free the msg argument each time we are
 839          * called rather than waiting for our current memory context to be freed.
 840          */
 841         pfree(msg);
 842
 843         tuple = heap_form_tuple(tupdesc, values, nulls);
 844         tuplestore_puttuple(tupstore, tuple);
 845 }
 846
 847 /*
 848  * Record a single corruption found in the main table.  The values in ctx should
 849  * indicate the location of the corruption, and the msg argument should contain
 850  * a human-readable description of the corruption.
 851  *
 852  * The msg argument is pfree'd by this function.
 853  */
 854 static void
 855 report_corruption(HeapCheckContext *ctx, char *msg)
 856 {
 857         report_corruption_internal(ctx->tupstore, ctx->tupdesc, ctx->blkno,
 858                                                            ctx->offnum, ctx->attnum, msg);
 859         ctx->is_corrupt = true;
 860 }
 861
 862 /*
 863  * Record corruption found in the toast table.  The values in ta should
 864  * indicate the location in the main table where the toast pointer was
 865  * encountered, and the msg argument should contain a human-readable
 866  * description of the toast table corruption.
 867  *
 868  * As above, the msg argument is pfree'd by this function.
 869  */
 870 static void
 871 report_toast_corruption(HeapCheckContext *ctx, ToastedAttribute *ta,
 872                                                 char *msg)
 873 {
 874         report_corruption_internal(ctx->tupstore, ctx->tupdesc, ta->blkno,
 875                                                            ta->offnum, ta->attnum, msg);
 876         ctx->is_corrupt = true;
 877 }
 878
 879 /*
 880  * Check for tuple header corruption.
 881  *
 882  * Some kinds of corruption make it unsafe to check the tuple attributes, for
 883  * example when the line pointer refers to a range of bytes outside the page.
 884  * In such cases, we return false (not checkable) after recording appropriate
 885  * corruption messages.
 886  *
 887  * Some other kinds of tuple header corruption confuse the question of where
 888  * the tuple attributes begin, or how long the nulls bitmap is, etc., making it
 889  * unreasonable to attempt to check attributes, even if all candidate answers
 890  * to those questions would not result in reading past the end of the line
 891  * pointer or page.  In such cases, like above, we record corruption messages
 892  * about the header and then return false.
 893  *
 894  * Other kinds of tuple header corruption do not bear on the question of
 895  * whether the tuple attributes can be checked, so we record corruption
 896  * messages for them but we do not return false merely because we detected
 897  * them.
 898  *
 899  * Returns whether the tuple is sufficiently sensible to undergo visibility and
 900  * attribute checks.
 901  */
 902 static bool
 903 check_tuple_header(HeapCheckContext *ctx)
 904 {
 905         HeapTupleHeader tuphdr = ctx->tuphdr;
 906         uint16          infomask = tuphdr->t_infomask;
 907         TransactionId curr_xmax = HeapTupleHeaderGetUpdateXid(tuphdr);
 908         bool            result = true;
 909         unsigned        expected_hoff;
 910
 911         if (ctx->tuphdr->t_hoff > ctx->lp_len)
 912         {
 913                 report_corruption(ctx,
 914                                                   psprintf("data begins at offset %u beyond the tuple length %u",
 915                                                                    ctx->tuphdr->t_hoff, ctx->lp_len));
 916                 result = false;
 917         }
 918
 919         if ((ctx->tuphdr->t_infomask & HEAP_XMAX_COMMITTED) &&
 920                 (ctx->tuphdr->t_infomask & HEAP_XMAX_IS_MULTI))
 921         {
 922                 report_corruption(ctx,
 923                                                   pstrdup("multixact should not be marked committed"));
 924
 925                 /*
 926                  * This condition is clearly wrong, but it's not enough to justify
 927                  * skipping further checks, because we don't rely on this to determine
 928                  * whether the tuple is visible or to interpret other relevant header
 929                  * fields.
 930                  */
 931         }
 932
 933         if (!TransactionIdIsValid(curr_xmax) &&
 934                 HeapTupleHeaderIsHotUpdated(tuphdr))
 935         {
 936                 report_corruption(ctx,
 937                                                   psprintf("tuple has been HOT updated, but xmax is 0"));
 938
 939                 /*
 940                  * As above, even though this shouldn't happen, it's not sufficient
 941                  * justification for skipping further checks, we should still be able
 942                  * to perform sensibly.
 943                  */
 944         }
 945
 946         if (HeapTupleHeaderIsHeapOnly(tuphdr) &&
 947                 ((tuphdr->t_infomask & HEAP_UPDATED) == 0))
 948         {
 949                 report_corruption(ctx,
 950                                                   psprintf("tuple is heap only, but not the result of an update"));
 951
 952                 /* Here again, we can still perform further checks. */
 953         }
 954
 955         if (infomask & HEAP_HASNULL)
 956                 expected_hoff = MAXALIGN(SizeofHeapTupleHeader + BITMAPLEN(ctx->natts));
 957         else
 958                 expected_hoff = MAXALIGN(SizeofHeapTupleHeader);
 959         if (ctx->tuphdr->t_hoff != expected_hoff)
 960         {
 961                 if ((infomask & HEAP_HASNULL) && ctx->natts == 1)
 962                         report_corruption(ctx,
 963                                                           psprintf("tuple data should begin at byte %u, but actually begins at byte %u (1 attribute, has nulls)",
 964                                                                            expected_hoff, ctx->tuphdr->t_hoff));
 965                 else if ((infomask & HEAP_HASNULL))
 966                         report_corruption(ctx,
 967                                                           psprintf("tuple data should begin at byte %u, but actually begins at byte %u (%u attributes, has nulls)",
 968                                                                            expected_hoff, ctx->tuphdr->t_hoff, ctx->natts));
 969                 else if (ctx->natts == 1)
 970                         report_corruption(ctx,
 971                                                           psprintf("tuple data should begin at byte %u, but actually begins at byte %u (1 attribute, no nulls)",
 972                                                                            expected_hoff, ctx->tuphdr->t_hoff));
 973                 else
 974                         report_corruption(ctx,
 975                                                           psprintf("tuple data should begin at byte %u, but actually begins at byte %u (%u attributes, no nulls)",
 976                                                                            expected_hoff, ctx->tuphdr->t_hoff, ctx->natts));
 977                 result = false;
 978         }
 979
 980         return result;
 981 }
 982
 983 /*
 984  * Checks tuple visibility so we know which further checks are safe to
 985  * perform.
 986  *
 987  * If a tuple could have been inserted by a transaction that also added a
 988  * column to the table, but which ultimately did not commit, or which has not
 989  * yet committed, then the table's current TupleDesc might differ from the one
 990  * used to construct this tuple, so we must not check it.
 991  *
 992  * As a special case, if our own transaction inserted the tuple, even if we
 993  * added a column to the table, our TupleDesc should match.  We could check the
 994  * tuple, but choose not to do so.
 995  *
 996  * If a tuple has been updated or deleted, we can still read the old tuple for
 997  * corruption checking purposes, as long as we are careful about concurrent
 998  * vacuums.  The main table tuple itself cannot be vacuumed away because we
 999  * hold a buffer lock on the page, but if the deleting transaction is older
1000  * than our transaction snapshot's xmin, then vacuum could remove the toast at
1001  * any time, so we must not try to follow TOAST pointers.
1002  *
1003  * If xmin or xmax values are older than can be checked against clog, or appear
1004  * to be in the future (possibly due to wrap-around), then we cannot make a
1005  * determination about the visibility of the tuple, so we skip further checks.
1006  *
1007  * Returns true if the tuple itself should be checked, false otherwise.  Sets
1008  * ctx->tuple_could_be_pruned if the tuple -- and thus also any associated
1009  * TOAST tuples -- are eligible for pruning.
1010  *
1011  * Sets *xmin_commit_status_ok to true if the commit status of xmin is known
1012  * and false otherwise. If it's set to true, then also set *xmin_commit_status
1013  * to the actual commit status.
1014  */
1015 static bool
1016 check_tuple_visibility(HeapCheckContext *ctx, bool *xmin_commit_status_ok,
1017                                            XidCommitStatus *xmin_commit_status)
1018 {
1019         TransactionId xmin;
1020         TransactionId xvac;
1021         TransactionId xmax;
1022         XidCommitStatus xmin_status;
1023         XidCommitStatus xvac_status;
1024         XidCommitStatus xmax_status;
1025         HeapTupleHeader tuphdr = ctx->tuphdr;
1026
1027         ctx->tuple_could_be_pruned = true;      /* have not yet proven otherwise */
1028         *xmin_commit_status_ok = false; /* have not yet proven otherwise */
1029
1030         /* If xmin is normal, it should be within valid range */
1031         xmin = HeapTupleHeaderGetXmin(tuphdr);
1032         switch (get_xid_status(xmin, ctx, &xmin_status))
1033         {
1034                 case XID_INVALID:
1035                         /* Could be the result of a speculative insertion that aborted. */
1036                         return false;
1037                 case XID_BOUNDS_OK:
1038                         *xmin_commit_status_ok = true;
1039                         *xmin_commit_status = xmin_status;
1040                         break;
1041                 case XID_IN_FUTURE:
1042                         report_corruption(ctx,
1043                                                           psprintf("xmin %u equals or exceeds next valid transaction ID %u:%u",
1044                                                                            xmin,
1045                                                                            EpochFromFullTransactionId(ctx->next_fxid),
1046                                                                            XidFromFullTransactionId(ctx->next_fxid)));
1047                         return false;
1048                 case XID_PRECEDES_CLUSTERMIN:
1049                         report_corruption(ctx,
1050                                                           psprintf("xmin %u precedes oldest valid transaction ID %u:%u",
1051                                                                            xmin,
1052                                                                            EpochFromFullTransactionId(ctx->oldest_fxid),
1053                                                                            XidFromFullTransactionId(ctx->oldest_fxid)));
1054                         return false;
1055                 case XID_PRECEDES_RELMIN:
1056                         report_corruption(ctx,
1057                                                           psprintf("xmin %u precedes relation freeze threshold %u:%u",
1058                                                                            xmin,
1059                                                                            EpochFromFullTransactionId(ctx->relfrozenfxid),
1060                                                                            XidFromFullTransactionId(ctx->relfrozenfxid)));
1061                         return false;
1062         }
1063
1064         /*
1065          * Has inserting transaction committed?
1066          */
1067         if (!HeapTupleHeaderXminCommitted(tuphdr))
1068         {
1069                 if (HeapTupleHeaderXminInvalid(tuphdr))
1070                         return false;           /* inserter aborted, don't check */
1071                 /* Used by pre-9.0 binary upgrades */
1072                 else if (tuphdr->t_infomask & HEAP_MOVED_OFF)
1073                 {
1074                         xvac = HeapTupleHeaderGetXvac(tuphdr);
1075
1076                         switch (get_xid_status(xvac, ctx, &xvac_status))
1077                         {
1078                                 case XID_INVALID:
1079                                         report_corruption(ctx,
1080                                                                           pstrdup("old-style VACUUM FULL transaction ID for moved off tuple is invalid"));
1081                                         return false;
1082                                 case XID_IN_FUTURE:
1083                                         report_corruption(ctx,
1084                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple equals or exceeds next valid transaction ID %u:%u",
1085                                                                                            xvac,
1086                                                                                            EpochFromFullTransactionId(ctx->next_fxid),
1087                                                                                            XidFromFullTransactionId(ctx->next_fxid)));
1088                                         return false;
1089                                 case XID_PRECEDES_RELMIN:
1090                                         report_corruption(ctx,
1091                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple precedes relation freeze threshold %u:%u",
1092                                                                                            xvac,
1093                                                                                            EpochFromFullTransactionId(ctx->relfrozenfxid),
1094                                                                                            XidFromFullTransactionId(ctx->relfrozenfxid)));
1095                                         return false;
1096                                 case XID_PRECEDES_CLUSTERMIN:
1097                                         report_corruption(ctx,
1098                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple precedes oldest valid transaction ID %u:%u",
1099                                                                                            xvac,
1100                                                                                            EpochFromFullTransactionId(ctx->oldest_fxid),
1101                                                                                            XidFromFullTransactionId(ctx->oldest_fxid)));
1102                                         return false;
1103                                 case XID_BOUNDS_OK:
1104                                         break;
1105                         }
1106
1107                         switch (xvac_status)
1108                         {
1109                                 case XID_IS_CURRENT_XID:
1110                                         report_corruption(ctx,
1111                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple matches our current transaction ID",
1112                                                                                            xvac));
1113                                         return false;
1114                                 case XID_IN_PROGRESS:
1115                                         report_corruption(ctx,
1116                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved off tuple appears to be in progress",
1117                                                                                            xvac));
1118                                         return false;
1119
1120                                 case XID_COMMITTED:
1121
1122                                         /*
1123                                          * The tuple is dead, because the xvac transaction moved
1124                                          * it off and committed. It's checkable, but also
1125                                          * prunable.
1126                                          */
1127                                         return true;
1128
1129                                 case XID_ABORTED:
1130
1131                                         /*
1132                                          * The original xmin must have committed, because the xvac
1133                                          * transaction tried to move it later. Since xvac is
1134                                          * aborted, whether it's still alive now depends on the
1135                                          * status of xmax.
1136                                          */
1137                                         break;
1138                         }
1139                 }
1140                 /* Used by pre-9.0 binary upgrades */
1141                 else if (tuphdr->t_infomask & HEAP_MOVED_IN)
1142                 {
1143                         xvac = HeapTupleHeaderGetXvac(tuphdr);
1144
1145                         switch (get_xid_status(xvac, ctx, &xvac_status))
1146                         {
1147                                 case XID_INVALID:
1148                                         report_corruption(ctx,
1149                                                                           pstrdup("old-style VACUUM FULL transaction ID for moved in tuple is invalid"));
1150                                         return false;
1151                                 case XID_IN_FUTURE:
1152                                         report_corruption(ctx,
1153                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple equals or exceeds next valid transaction ID %u:%u",
1154                                                                                            xvac,
1155                                                                                            EpochFromFullTransactionId(ctx->next_fxid),
1156                                                                                            XidFromFullTransactionId(ctx->next_fxid)));
1157                                         return false;
1158                                 case XID_PRECEDES_RELMIN:
1159                                         report_corruption(ctx,
1160                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple precedes relation freeze threshold %u:%u",
1161                                                                                            xvac,
1162                                                                                            EpochFromFullTransactionId(ctx->relfrozenfxid),
1163                                                                                            XidFromFullTransactionId(ctx->relfrozenfxid)));
1164                                         return false;
1165                                 case XID_PRECEDES_CLUSTERMIN:
1166                                         report_corruption(ctx,
1167                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple precedes oldest valid transaction ID %u:%u",
1168                                                                                            xvac,
1169                                                                                            EpochFromFullTransactionId(ctx->oldest_fxid),
1170                                                                                            XidFromFullTransactionId(ctx->oldest_fxid)));
1171                                         return false;
1172                                 case XID_BOUNDS_OK:
1173                                         break;
1174                         }
1175
1176                         switch (xvac_status)
1177                         {
1178                                 case XID_IS_CURRENT_XID:
1179                                         report_corruption(ctx,
1180                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple matches our current transaction ID",
1181                                                                                            xvac));
1182                                         return false;
1183                                 case XID_IN_PROGRESS:
1184                                         report_corruption(ctx,
1185                                                                           psprintf("old-style VACUUM FULL transaction ID %u for moved in tuple appears to be in progress",
1186                                                                                            xvac));
1187                                         return false;
1188
1189                                 case XID_COMMITTED:
1190
1191                                         /*
1192                                          * The original xmin must have committed, because the xvac
1193                                          * transaction moved it later. Whether it's still alive
1194                                          * now depends on the status of xmax.
1195                                          */
1196                                         break;
1197
1198                                 case XID_ABORTED:
1199
1200                                         /*
1201                                          * The tuple is dead, because the xvac transaction moved
1202                                          * it off and committed. It's checkable, but also
1203                                          * prunable.
1204                                          */
1205                                         return true;
1206                         }
1207                 }
1208                 else if (xmin_status != XID_COMMITTED)
1209                 {
1210                         /*
1211                          * Inserting transaction is not in progress, and not committed, so
1212                          * it might have changed the TupleDesc in ways we don't know
1213                          * about. Thus, don't try to check the tuple structure.
1214                          *
1215                          * If xmin_status happens to be XID_IS_CURRENT_XID, then in theory
1216                          * any such DDL changes ought to be visible to us, so perhaps we
1217                          * could check anyway in that case. But, for now, let's be
1218                          * conservative and treat this like any other uncommitted insert.
1219                          */
1220                         return false;
1221                 }
1222         }
1223
1224         /*
1225          * Okay, the inserter committed, so it was good at some point.  Now what
1226          * about the deleting transaction?
1227          */
1228
1229         if (tuphdr->t_infomask & HEAP_XMAX_IS_MULTI)
1230         {
1231                 /*
1232                  * xmax is a multixact, so sanity-check the MXID. Note that we do this
1233                  * prior to checking for HEAP_XMAX_INVALID or
1234                  * HEAP_XMAX_IS_LOCKED_ONLY. This might therefore complain about
1235                  * things that wouldn't actually be a problem during a normal scan,
1236                  * but eventually we're going to have to freeze, and that process will
1237                  * ignore hint bits.
1238                  *
1239                  * Even if the MXID is out of range, we still know that the original
1240                  * insert committed, so we can check the tuple itself. However, we
1241                  * can't rule out the possibility that this tuple is dead, so don't
1242                  * clear ctx->tuple_could_be_pruned. Possibly we should go ahead and
1243                  * clear that flag anyway if HEAP_XMAX_INVALID is set or if
1244                  * HEAP_XMAX_IS_LOCKED_ONLY is true, but for now we err on the side of
1245                  * avoiding possibly-bogus complaints about missing TOAST entries.
1246                  */
1247                 xmax = HeapTupleHeaderGetRawXmax(tuphdr);
1248                 switch (check_mxid_valid_in_rel(xmax, ctx))
1249                 {
1250                         case XID_INVALID:
1251                                 report_corruption(ctx,
1252                                                                   pstrdup("multitransaction ID is invalid"));
1253                                 return true;
1254                         case XID_PRECEDES_RELMIN:
1255                                 report_corruption(ctx,
1256                                                                   psprintf("multitransaction ID %u precedes relation minimum multitransaction ID threshold %u",
1257                                                                                    xmax, ctx->relminmxid));
1258                                 return true;
1259                         case XID_PRECEDES_CLUSTERMIN:
1260                                 report_corruption(ctx,
1261                                                                   psprintf("multitransaction ID %u precedes oldest valid multitransaction ID threshold %u",
1262                                                                                    xmax, ctx->oldest_mxact));
1263                                 return true;
1264                         case XID_IN_FUTURE:
1265                                 report_corruption(ctx,
1266                                                                   psprintf("multitransaction ID %u equals or exceeds next valid multitransaction ID %u",
1267                                                                                    xmax,
1268                                                                                    ctx->next_mxact));
1269                                 return true;
1270                         case XID_BOUNDS_OK:
1271                                 break;
1272                 }
1273         }
1274
1275         if (tuphdr->t_infomask & HEAP_XMAX_INVALID)
1276         {
1277                 /*
1278                  * This tuple is live.  A concurrently running transaction could
1279                  * delete it before we get around to checking the toast, but any such
1280                  * running transaction is surely not less than our safe_xmin, so the
1281                  * toast cannot be vacuumed out from under us.
1282                  */
1283                 ctx->tuple_could_be_pruned = false;
1284                 return true;
1285         }
1286
1287         if (HEAP_XMAX_IS_LOCKED_ONLY(tuphdr->t_infomask))
1288         {
1289                 /*
1290                  * "Deleting" xact really only locked it, so the tuple is live in any
1291                  * case.  As above, a concurrently running transaction could delete
1292                  * it, but it cannot be vacuumed out from under us.
1293                  */
1294                 ctx->tuple_could_be_pruned = false;
1295                 return true;
1296         }
1297
1298         if (tuphdr->t_infomask & HEAP_XMAX_IS_MULTI)
1299         {
1300                 /*
1301                  * We already checked above that this multixact is within limits for
1302                  * this table.  Now check the update xid from this multixact.
1303                  */
1304                 xmax = HeapTupleGetUpdateXid(tuphdr);
1305                 switch (get_xid_status(xmax, ctx, &xmax_status))
1306                 {
1307                         case XID_INVALID:
1308                                 /* not LOCKED_ONLY, so it has to have an xmax */
1309                                 report_corruption(ctx,
1310                                                                   pstrdup("update xid is invalid"));
1311                                 return true;
1312                         case XID_IN_FUTURE:
1313                                 report_corruption(ctx,
1314                                                                   psprintf("update xid %u equals or exceeds next valid transaction ID %u:%u",
1315                                                                                    xmax,
1316                                                                                    EpochFromFullTransactionId(ctx->next_fxid),
1317                                                                                    XidFromFullTransactionId(ctx->next_fxid)));
1318                                 return true;
1319                         case XID_PRECEDES_RELMIN:
1320                                 report_corruption(ctx,
1321                                                                   psprintf("update xid %u precedes relation freeze threshold %u:%u",
1322                                                                                    xmax,
1323                                                                                    EpochFromFullTransactionId(ctx->relfrozenfxid),
1324                                                                                    XidFromFullTransactionId(ctx->relfrozenfxid)));
1325                                 return true;
1326                         case XID_PRECEDES_CLUSTERMIN:
1327                                 report_corruption(ctx,
1328                                                                   psprintf("update xid %u precedes oldest valid transaction ID %u:%u",
1329                                                                                    xmax,
1330                                                                                    EpochFromFullTransactionId(ctx->oldest_fxid),
1331                                                                                    XidFromFullTransactionId(ctx->oldest_fxid)));
1332                                 return true;
1333                         case XID_BOUNDS_OK:
1334                                 break;
1335                 }
1336
1337                 switch (xmax_status)
1338                 {
1339                         case XID_IS_CURRENT_XID:
1340                         case XID_IN_PROGRESS:
1341
1342                                 /*
1343                                  * The delete is in progress, so it cannot be visible to our
1344                                  * snapshot.
1345                                  */
1346                                 ctx->tuple_could_be_pruned = false;
1347                                 break;
1348                         case XID_COMMITTED:
1349
1350                                 /*
1351                                  * The delete committed.  Whether the toast can be vacuumed
1352                                  * away depends on how old the deleting transaction is.
1353                                  */
1354                                 ctx->tuple_could_be_pruned = TransactionIdPrecedes(xmax,
1355                                                                                                                                    ctx->safe_xmin);
1356                                 break;
1357                         case XID_ABORTED:
1358
1359                                 /*
1360                                  * The delete aborted or crashed.  The tuple is still live.
1361                                  */
1362                                 ctx->tuple_could_be_pruned = false;
1363                                 break;
1364                 }
1365
1366                 /* Tuple itself is checkable even if it's dead. */
1367                 return true;
1368         }
1369
1370         /* xmax is an XID, not a MXID. Sanity check it. */
1371         xmax = HeapTupleHeaderGetRawXmax(tuphdr);
1372         switch (get_xid_status(xmax, ctx, &xmax_status))
1373         {
1374                 case XID_INVALID:
1375                         ctx->tuple_could_be_pruned = false;
1376                         return true;
1377                 case XID_IN_FUTURE:
1378                         report_corruption(ctx,
1379                                                           psprintf("xmax %u equals or exceeds next valid transaction ID %u:%u",
1380                                                                            xmax,
1381                                                                            EpochFromFullTransactionId(ctx->next_fxid),
1382                                                                            XidFromFullTransactionId(ctx->next_fxid)));
1383                         return false;           /* corrupt */
1384                 case XID_PRECEDES_RELMIN:
1385                         report_corruption(ctx,
1386                                                           psprintf("xmax %u precedes relation freeze threshold %u:%u",
1387                                                                            xmax,
1388                                                                            EpochFromFullTransactionId(ctx->relfrozenfxid),
1389                                                                            XidFromFullTransactionId(ctx->relfrozenfxid)));
1390                         return false;           /* corrupt */
1391                 case XID_PRECEDES_CLUSTERMIN:
1392                         report_corruption(ctx,
1393                                                           psprintf("xmax %u precedes oldest valid transaction ID %u:%u",
1394                                                                            xmax,
1395                                                                            EpochFromFullTransactionId(ctx->oldest_fxid),
1396                                                                            XidFromFullTransactionId(ctx->oldest_fxid)));
1397                         return false;           /* corrupt */
1398                 case XID_BOUNDS_OK:
1399                         break;
1400         }
1401
1402         /*
1403          * Whether the toast can be vacuumed away depends on how old the deleting
1404          * transaction is.
1405          */
1406         switch (xmax_status)
1407         {
1408                 case XID_IS_CURRENT_XID:
1409                 case XID_IN_PROGRESS:
1410
1411                         /*
1412                          * The delete is in progress, so it cannot be visible to our
1413                          * snapshot.
1414                          */
1415                         ctx->tuple_could_be_pruned = false;
1416                         break;
1417
1418                 case XID_COMMITTED:
1419
1420                         /*
1421                          * The delete committed.  Whether the toast can be vacuumed away
1422                          * depends on how old the deleting transaction is.
1423                          */
1424                         ctx->tuple_could_be_pruned = TransactionIdPrecedes(xmax,
1425                                                                                                                            ctx->safe_xmin);
1426                         break;
1427
1428                 case XID_ABORTED:
1429
1430                         /*
1431                          * The delete aborted or crashed.  The tuple is still live.
1432                          */
1433                         ctx->tuple_could_be_pruned = false;
1434                         break;
1435         }
1436
1437         /* Tuple itself is checkable even if it's dead. */
1438         return true;
1439 }
1440
1441
1442 /*
1443  * Check the current toast tuple against the state tracked in ctx, recording
1444  * any corruption found in ctx->tupstore.
1445  *
1446  * This is not equivalent to running verify_heapam on the toast table itself,
1447  * and is not hardened against corruption of the toast table.  Rather, when
1448  * validating a toasted attribute in the main table, the sequence of toast
1449  * tuples that store the toasted value are retrieved and checked in order, with
1450  * each toast tuple being checked against where we are in the sequence, as well
1451  * as each toast tuple having its varlena structure sanity checked.
1452  *
1453  * On entry, *expected_chunk_seq should be the chunk_seq value that we expect
1454  * to find in toasttup. On exit, it will be updated to the value the next call
1455  * to this function should expect to see.
1456  */
1457 static void
1458 check_toast_tuple(HeapTuple toasttup, HeapCheckContext *ctx,
1459                                   ToastedAttribute *ta, int32 *expected_chunk_seq,
1460                                   uint32 extsize)
1461 {
1462         int32           chunk_seq;
1463         int32           last_chunk_seq = (extsize - 1) / TOAST_MAX_CHUNK_SIZE;
1464         Pointer         chunk;
1465         bool            isnull;
1466         int32           chunksize;
1467         int32           expected_size;
1468
1469         /* Sanity-check the sequence number. */
1470         chunk_seq = DatumGetInt32(fastgetattr(toasttup, 2,
1471                                                                                   ctx->toast_rel->rd_att, &isnull));
1472         if (isnull)
1473         {
1474                 report_toast_corruption(ctx, ta,
1475                                                                 psprintf("toast value %u has toast chunk with null sequence number",
1476                                                                                  ta->toast_pointer.va_valueid));
1477                 return;
1478         }
1479         if (chunk_seq != *expected_chunk_seq)
1480         {
1481                 /* Either the TOAST index is corrupt, or we don't have all chunks. */
1482                 report_toast_corruption(ctx, ta,
1483                                                                 psprintf("toast value %u index scan returned chunk %d when expecting chunk %d",
1484                                                                                  ta->toast_pointer.va_valueid,
1485                                                                                  chunk_seq, *expected_chunk_seq));
1486         }
1487         *expected_chunk_seq = chunk_seq + 1;
1488
1489         /* Sanity-check the chunk data. */
1490         chunk = DatumGetPointer(fastgetattr(toasttup, 3,
1491                                                                                 ctx->toast_rel->rd_att, &isnull));
1492         if (isnull)
1493         {
1494                 report_toast_corruption(ctx, ta,
1495                                                                 psprintf("toast value %u chunk %d has null data",
1496                                                                                  ta->toast_pointer.va_valueid,
1497                                                                                  chunk_seq));
1498                 return;
1499         }
1500         if (!VARATT_IS_EXTENDED(chunk))
1501                 chunksize = VARSIZE(chunk) - VARHDRSZ;
1502         else if (VARATT_IS_SHORT(chunk))
1503         {
1504                 /*
1505                  * could happen due to heap_form_tuple doing its thing
1506                  */
1507                 chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
1508         }
1509         else
1510         {
1511                 /* should never happen */
1512                 uint32          header = ((varattrib_4b *) chunk)->va_4byte.va_header;
1513
1514                 report_toast_corruption(ctx, ta,
1515                                                                 psprintf("toast value %u chunk %d has invalid varlena header %0x",
1516                                                                                  ta->toast_pointer.va_valueid,
1517                                                                                  chunk_seq, header));
1518                 return;
1519         }
1520
1521         /*
1522          * Some checks on the data we've found
1523          */
1524         if (chunk_seq > last_chunk_seq)
1525         {
1526                 report_toast_corruption(ctx, ta,
1527                                                                 psprintf("toast value %u chunk %d follows last expected chunk %d",
1528                                                                                  ta->toast_pointer.va_valueid,
1529                                                                                  chunk_seq, last_chunk_seq));
1530                 return;
1531         }
1532
1533         expected_size = chunk_seq < last_chunk_seq ? TOAST_MAX_CHUNK_SIZE
1534                 : extsize - (last_chunk_seq * TOAST_MAX_CHUNK_SIZE);
1535
1536         if (chunksize != expected_size)
1537                 report_toast_corruption(ctx, ta,
1538                                                                 psprintf("toast value %u chunk %d has size %u, but expected size %u",
1539                                                                                  ta->toast_pointer.va_valueid,
1540                                                                                  chunk_seq, chunksize, expected_size));
1541 }
1542
1543 /*
1544  * Check the current attribute as tracked in ctx, recording any corruption
1545  * found in ctx->tupstore.
1546  *
1547  * This function follows the logic performed by heap_deform_tuple(), and in the
1548  * case of a toasted value, optionally stores the toast pointer so later it can
1549  * be checked following the logic of detoast_external_attr(), checking for any
1550  * conditions that would result in either of those functions Asserting or
1551  * crashing the backend.  The checks performed by Asserts present in those two
1552  * functions are also performed here and in check_toasted_attribute.  In cases
1553  * where those two functions are a bit cavalier in their assumptions about data
1554  * being correct, we perform additional checks not present in either of those
1555  * two functions.  Where some condition is checked in both of those functions,
1556  * we perform it here twice, as we parallel the logical flow of those two
1557  * functions.  The presence of duplicate checks seems a reasonable price to pay
1558  * for keeping this code tightly coupled with the code it protects.
1559  *
1560  * Returns true if the tuple attribute is sane enough for processing to
1561  * continue on to the next attribute, false otherwise.
1562  */
1563 static bool
1564 check_tuple_attribute(HeapCheckContext *ctx)
1565 {
1566         Datum           attdatum;
1567         struct varlena *attr;
1568         char       *tp;                         /* pointer to the tuple data */
1569         uint16          infomask;
1570         Form_pg_attribute thisatt;
1571         struct varatt_external toast_pointer;
1572
1573         infomask = ctx->tuphdr->t_infomask;
1574         thisatt = TupleDescAttr(RelationGetDescr(ctx->rel), ctx->attnum);
1575
1576         tp = (char *) ctx->tuphdr + ctx->tuphdr->t_hoff;
1577
1578         if (ctx->tuphdr->t_hoff + ctx->offset > ctx->lp_len)
1579         {
1580                 report_corruption(ctx,
1581                                                   psprintf("attribute with length %u starts at offset %u beyond total tuple length %u",
1582                                                                    thisatt->attlen,
1583                                                                    ctx->tuphdr->t_hoff + ctx->offset,
1584                                                                    ctx->lp_len));
1585                 return false;
1586         }
1587
1588         /* Skip null values */
1589         if (infomask & HEAP_HASNULL && att_isnull(ctx->attnum, ctx->tuphdr->t_bits))
1590                 return true;
1591
1592         /* Skip non-varlena values, but update offset first */
1593         if (thisatt->attlen != -1)
1594         {
1595                 ctx->offset = att_align_nominal(ctx->offset, thisatt->attalign);
1596                 ctx->offset = att_addlength_pointer(ctx->offset, thisatt->attlen,
1597                                                                                         tp + ctx->offset);
1598                 if (ctx->tuphdr->t_hoff + ctx->offset > ctx->lp_len)
1599                 {
1600                         report_corruption(ctx,
1601                                                           psprintf("attribute with length %u ends at offset %u beyond total tuple length %u",
1602                                                                            thisatt->attlen,
1603                                                                            ctx->tuphdr->t_hoff + ctx->offset,
1604                                                                            ctx->lp_len));
1605                         return false;
1606                 }
1607                 return true;
1608         }
1609
1610         /* Ok, we're looking at a varlena attribute. */
1611         ctx->offset = att_align_pointer(ctx->offset, thisatt->attalign, -1,
1612                                                                         tp + ctx->offset);
1613
1614         /* Get the (possibly corrupt) varlena datum */
1615         attdatum = fetchatt(thisatt, tp + ctx->offset);
1616
1617         /*
1618          * We have the datum, but we cannot decode it carelessly, as it may still
1619          * be corrupt.
1620          */
1621
1622         /*
1623          * Check that VARTAG_SIZE won't hit an Assert on a corrupt va_tag before
1624          * risking a call into att_addlength_pointer
1625          */
1626         if (VARATT_IS_EXTERNAL(tp + ctx->offset))
1627         {
1628                 uint8           va_tag = VARTAG_EXTERNAL(tp + ctx->offset);
1629
1630                 if (va_tag != VARTAG_ONDISK)
1631                 {
1632                         report_corruption(ctx,
1633                                                           psprintf("toasted attribute has unexpected TOAST tag %u",
1634                                                                            va_tag));
1635                         /* We can't know where the next attribute begins */
1636                         return false;
1637                 }
1638         }
1639
1640         /* Ok, should be safe now */
1641         ctx->offset = att_addlength_pointer(ctx->offset, thisatt->attlen,
1642                                                                                 tp + ctx->offset);
1643
1644         if (ctx->tuphdr->t_hoff + ctx->offset > ctx->lp_len)
1645         {
1646                 report_corruption(ctx,
1647                                                   psprintf("attribute with length %u ends at offset %u beyond total tuple length %u",
1648                                                                    thisatt->attlen,
1649                                                                    ctx->tuphdr->t_hoff + ctx->offset,
1650                                                                    ctx->lp_len));
1651
1652                 return false;
1653         }
1654
1655         /*
1656          * heap_deform_tuple would be done with this attribute at this point,
1657          * having stored it in values[], and would continue to the next attribute.
1658          * We go further, because we need to check if the toast datum is corrupt.
1659          */
1660
1661         attr = (struct varlena *) DatumGetPointer(attdatum);
1662
1663         /*
1664          * Now we follow the logic of detoast_external_attr(), with the same
1665          * caveats about being paranoid about corruption.
1666          */
1667
1668         /* Skip values that are not external */
1669         if (!VARATT_IS_EXTERNAL(attr))
1670                 return true;
1671
1672         /* It is external, and we're looking at a page on disk */
1673
1674         /*
1675          * Must copy attr into toast_pointer for alignment considerations
1676          */
1677         VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
1678
1679         /* Toasted attributes too large to be untoasted should never be stored */
1680         if (toast_pointer.va_rawsize > VARLENA_SIZE_LIMIT)
1681                 report_corruption(ctx,
1682                                                   psprintf("toast value %u rawsize %d exceeds limit %d",
1683                                                                    toast_pointer.va_valueid,
1684                                                                    toast_pointer.va_rawsize,
1685                                                                    VARLENA_SIZE_LIMIT));
1686
1687         if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
1688         {
1689                 ToastCompressionId cmid;
1690                 bool            valid = false;
1691
1692                 /* Compressed attributes should have a valid compression method */
1693                 cmid = TOAST_COMPRESS_METHOD(&toast_pointer);
1694                 switch (cmid)
1695                 {
1696                                 /* List of all valid compression method IDs */
1697                         case TOAST_PGLZ_COMPRESSION_ID:
1698                         case TOAST_LZ4_COMPRESSION_ID:
1699                                 valid = true;
1700                                 break;
1701
1702                                 /* Recognized but invalid compression method ID */
1703                         case TOAST_INVALID_COMPRESSION_ID:
1704                                 break;
1705
1706                                 /* Intentionally no default here */
1707                 }
1708                 if (!valid)
1709                         report_corruption(ctx,
1710                                                           psprintf("toast value %u has invalid compression method id %d",
1711                                                                            toast_pointer.va_valueid, cmid));
1712         }
1713
1714         /* The tuple header better claim to contain toasted values */
1715         if (!(infomask & HEAP_HASEXTERNAL))
1716         {
1717                 report_corruption(ctx,
1718                                                   psprintf("toast value %u is external but tuple header flag HEAP_HASEXTERNAL not set",
1719                                                                    toast_pointer.va_valueid));
1720                 return true;
1721         }
1722
1723         /* The relation better have a toast table */
1724         if (!ctx->rel->rd_rel->reltoastrelid)
1725         {
1726                 report_corruption(ctx,
1727                                                   psprintf("toast value %u is external but relation has no toast relation",
1728                                                                    toast_pointer.va_valueid));
1729                 return true;
1730         }
1731
1732         /* If we were told to skip toast checking, then we're done. */
1733         if (ctx->toast_rel == NULL)
1734                 return true;
1735
1736         /*
1737          * If this tuple is eligible to be pruned, we cannot check the toast.
1738          * Otherwise, we push a copy of the toast tuple so we can check it after
1739          * releasing the main table buffer lock.
1740          */
1741         if (!ctx->tuple_could_be_pruned)
1742         {
1743                 ToastedAttribute *ta;
1744
1745                 ta = (ToastedAttribute *) palloc0(sizeof(ToastedAttribute));
1746
1747                 VARATT_EXTERNAL_GET_POINTER(ta->toast_pointer, attr);
1748                 ta->blkno = ctx->blkno;
1749                 ta->offnum = ctx->offnum;
1750                 ta->attnum = ctx->attnum;
1751                 ctx->toasted_attributes = lappend(ctx->toasted_attributes, ta);
1752         }
1753
1754         return true;
1755 }
1756
1757 /*
1758  * For each attribute collected in ctx->toasted_attributes, look up the value
1759  * in the toast table and perform checks on it.  This function should only be
1760  * called on toast pointers which cannot be vacuumed away during our
1761  * processing.
1762  */
1763 static void
1764 check_toasted_attribute(HeapCheckContext *ctx, ToastedAttribute *ta)
1765 {
1766         SnapshotData SnapshotToast;
1767         ScanKeyData toastkey;
1768         SysScanDesc toastscan;
1769         bool            found_toasttup;
1770         HeapTuple       toasttup;
1771         uint32          extsize;
1772         int32           expected_chunk_seq = 0;
1773         int32           last_chunk_seq;
1774
1775         extsize = VARATT_EXTERNAL_GET_EXTSIZE(ta->toast_pointer);
1776         last_chunk_seq = (extsize - 1) / TOAST_MAX_CHUNK_SIZE;
1777
1778         /*
1779          * Setup a scan key to find chunks in toast table with matching va_valueid
1780          */
1781         ScanKeyInit(&toastkey,
1782                                 (AttrNumber) 1,
1783                                 BTEqualStrategyNumber, F_OIDEQ,
1784                                 ObjectIdGetDatum(ta->toast_pointer.va_valueid));
1785
1786         /*
1787          * Check if any chunks for this toasted object exist in the toast table,
1788          * accessible via the index.
1789          */
1790         init_toast_snapshot(&SnapshotToast);
1791         toastscan = systable_beginscan_ordered(ctx->toast_rel,
1792                                                                                    ctx->valid_toast_index,
1793                                                                                    &SnapshotToast, 1,
1794                                                                                    &toastkey);
1795         found_toasttup = false;
1796         while ((toasttup =
1797                         systable_getnext_ordered(toastscan,
1798                                                                          ForwardScanDirection)) != NULL)
1799         {
1800                 found_toasttup = true;
1801                 check_toast_tuple(toasttup, ctx, ta, &expected_chunk_seq, extsize);
1802         }
1803         systable_endscan_ordered(toastscan);
1804
1805         if (!found_toasttup)
1806                 report_toast_corruption(ctx, ta,
1807                                                                 psprintf("toast value %u not found in toast table",
1808                                                                                  ta->toast_pointer.va_valueid));
1809         else if (expected_chunk_seq <= last_chunk_seq)
1810                 report_toast_corruption(ctx, ta,
1811                                                                 psprintf("toast value %u was expected to end at chunk %d, but ended while expecting chunk %d",
1812                                                                                  ta->toast_pointer.va_valueid,
1813                                                                                  last_chunk_seq, expected_chunk_seq));
1814 }
1815
1816 /*
1817  * Check the current tuple as tracked in ctx, recording any corruption found in
1818  * ctx->tupstore.
1819  *
1820  * We return some information about the status of xmin to aid in validating
1821  * update chains.
1822  */
1823 static void
1824 check_tuple(HeapCheckContext *ctx, bool *xmin_commit_status_ok,
1825                         XidCommitStatus *xmin_commit_status)
1826 {
1827         /*
1828          * Check various forms of tuple header corruption, and if the header is
1829          * too corrupt, do not continue with other checks.
1830          */
1831         if (!check_tuple_header(ctx))
1832                 return;
1833
1834         /*
1835          * Check tuple visibility.  If the inserting transaction aborted, we
1836          * cannot assume our relation description matches the tuple structure, and
1837          * therefore cannot check it.
1838          */
1839         if (!check_tuple_visibility(ctx, xmin_commit_status_ok,
1840                                                                 xmin_commit_status))
1841                 return;
1842
1843         /*
1844          * The tuple is visible, so it must be compatible with the current version
1845          * of the relation descriptor. It might have fewer columns than are
1846          * present in the relation descriptor, but it cannot have more.
1847          */
1848         if (RelationGetDescr(ctx->rel)->natts < ctx->natts)
1849         {
1850                 report_corruption(ctx,
1851                                                   psprintf("number of attributes %u exceeds maximum expected for table %u",
1852                                                                    ctx->natts,
1853                                                                    RelationGetDescr(ctx->rel)->natts));
1854                 return;
1855         }
1856
1857         /*
1858          * Check each attribute unless we hit corruption that confuses what to do
1859          * next, at which point we abort further attribute checks for this tuple.
1860          * Note that we don't abort for all types of corruption, only for those
1861          * types where we don't know how to continue.  We also don't abort the
1862          * checking of toasted attributes collected from the tuple prior to
1863          * aborting.  Those will still be checked later along with other toasted
1864          * attributes collected from the page.
1865          */
1866         ctx->offset = 0;
1867         for (ctx->attnum = 0; ctx->attnum < ctx->natts; ctx->attnum++)
1868                 if (!check_tuple_attribute(ctx))
1869                         break;                          /* cannot continue */
1870
1871         /* revert attnum to -1 until we again examine individual attributes */
1872         ctx->attnum = -1;
1873 }
1874
1875 /*
1876  * Convert a TransactionId into a FullTransactionId using our cached values of
1877  * the valid transaction ID range.  It is the caller's responsibility to have
1878  * already updated the cached values, if necessary.
1879  */
1880 static FullTransactionId
1881 FullTransactionIdFromXidAndCtx(TransactionId xid, const HeapCheckContext *ctx)
1882 {
1883         uint64          nextfxid_i;
1884         int32           diff;
1885         FullTransactionId fxid;
1886
1887         Assert(TransactionIdIsNormal(ctx->next_xid));
1888         Assert(FullTransactionIdIsNormal(ctx->next_fxid));
1889         Assert(XidFromFullTransactionId(ctx->next_fxid) == ctx->next_xid);
1890
1891         if (!TransactionIdIsNormal(xid))
1892                 return FullTransactionIdFromEpochAndXid(0, xid);
1893
1894         nextfxid_i = U64FromFullTransactionId(ctx->next_fxid);
1895
1896         /* compute the 32bit modulo difference */
1897         diff = (int32) (ctx->next_xid - xid);
1898
1899         /*
1900          * In cases of corruption we might see a 32bit xid that is before epoch 0.
1901          * We can't represent that as a 64bit xid, due to 64bit xids being
1902          * unsigned integers, without the modulo arithmetic of 32bit xid. There's
1903          * no really nice way to deal with that, but it works ok enough to use
1904          * FirstNormalFullTransactionId in that case, as a freshly initdb'd
1905          * cluster already has a newer horizon.
1906          */
1907         if (diff > 0 && (nextfxid_i - FirstNormalTransactionId) < (int64) diff)
1908         {
1909                 Assert(EpochFromFullTransactionId(ctx->next_fxid) == 0);
1910                 fxid = FirstNormalFullTransactionId;
1911         }
1912         else
1913                 fxid = FullTransactionIdFromU64(nextfxid_i - diff);
1914
1915         Assert(FullTransactionIdIsNormal(fxid));
1916         return fxid;
1917 }
1918
1919 /*
1920  * Update our cached range of valid transaction IDs.
1921  */
1922 static void
1923 update_cached_xid_range(HeapCheckContext *ctx)
1924 {
1925         /* Make cached copies */
1926         LWLockAcquire(XidGenLock, LW_SHARED);
1927         ctx->next_fxid = TransamVariables->nextXid;
1928         ctx->oldest_xid = TransamVariables->oldestXid;
1929         LWLockRelease(XidGenLock);
1930
1931         /* And compute alternate versions of the same */
1932         ctx->next_xid = XidFromFullTransactionId(ctx->next_fxid);
1933         ctx->oldest_fxid = FullTransactionIdFromXidAndCtx(ctx->oldest_xid, ctx);
1934 }
1935
1936 /*
1937  * Update our cached range of valid multitransaction IDs.
1938  */
1939 static void
1940 update_cached_mxid_range(HeapCheckContext *ctx)
1941 {
1942         ReadMultiXactIdRange(&ctx->oldest_mxact, &ctx->next_mxact);
1943 }
1944
1945 /*
1946  * Return whether the given FullTransactionId is within our cached valid
1947  * transaction ID range.
1948  */
1949 static inline bool
1950 fxid_in_cached_range(FullTransactionId fxid, const HeapCheckContext *ctx)
1951 {
1952         return (FullTransactionIdPrecedesOrEquals(ctx->oldest_fxid, fxid) &&
1953                         FullTransactionIdPrecedes(fxid, ctx->next_fxid));
1954 }
1955
1956 /*
1957  * Checks whether a multitransaction ID is in the cached valid range, returning
1958  * the nature of the range violation, if any.
1959  */
1960 static XidBoundsViolation
1961 check_mxid_in_range(MultiXactId mxid, HeapCheckContext *ctx)
1962 {
1963         if (!TransactionIdIsValid(mxid))
1964                 return XID_INVALID;
1965         if (MultiXactIdPrecedes(mxid, ctx->relminmxid))
1966                 return XID_PRECEDES_RELMIN;
1967         if (MultiXactIdPrecedes(mxid, ctx->oldest_mxact))
1968                 return XID_PRECEDES_CLUSTERMIN;
1969         if (MultiXactIdPrecedesOrEquals(ctx->next_mxact, mxid))
1970                 return XID_IN_FUTURE;
1971         return XID_BOUNDS_OK;
1972 }
1973
1974 /*
1975  * Checks whether the given mxid is valid to appear in the heap being checked,
1976  * returning the nature of the range violation, if any.
1977  *
1978  * This function attempts to return quickly by caching the known valid mxid
1979  * range in ctx.  Callers should already have performed the initial setup of
1980  * the cache prior to the first call to this function.
1981  */
1982 static XidBoundsViolation
1983 check_mxid_valid_in_rel(MultiXactId mxid, HeapCheckContext *ctx)
1984 {
1985         XidBoundsViolation result;
1986
1987         result = check_mxid_in_range(mxid, ctx);
1988         if (result == XID_BOUNDS_OK)
1989                 return XID_BOUNDS_OK;
1990
1991         /* The range may have advanced.  Recheck. */
1992         update_cached_mxid_range(ctx);
1993         return check_mxid_in_range(mxid, ctx);
1994 }
1995
1996 /*
1997  * Checks whether the given transaction ID is (or was recently) valid to appear
1998  * in the heap being checked, or whether it is too old or too new to appear in
1999  * the relation, returning information about the nature of the bounds violation.
2000  *
2001  * We cache the range of valid transaction IDs.  If xid is in that range, we
2002  * conclude that it is valid, even though concurrent changes to the table might
2003  * invalidate it under certain corrupt conditions.  (For example, if the table
2004  * contains corrupt all-frozen bits, a concurrent vacuum might skip the page(s)
2005  * containing the xid and then truncate clog and advance the relfrozenxid
2006  * beyond xid.) Reporting the xid as valid under such conditions seems
2007  * acceptable, since if we had checked it earlier in our scan it would have
2008  * truly been valid at that time.
2009  *
2010  * If the status argument is not NULL, and if and only if the transaction ID
2011  * appears to be valid in this relation, the status argument will be set with
2012  * the commit status of the transaction ID.
2013  */
2014 static XidBoundsViolation
2015 get_xid_status(TransactionId xid, HeapCheckContext *ctx,
2016                            XidCommitStatus *status)
2017 {
2018         FullTransactionId fxid;
2019         FullTransactionId clog_horizon;
2020
2021         /* Quick check for special xids */
2022         if (!TransactionIdIsValid(xid))
2023                 return XID_INVALID;
2024         else if (xid == BootstrapTransactionId || xid == FrozenTransactionId)
2025         {
2026                 if (status != NULL)
2027                         *status = XID_COMMITTED;
2028                 return XID_BOUNDS_OK;
2029         }
2030
2031         /* Check if the xid is within bounds */
2032         fxid = FullTransactionIdFromXidAndCtx(xid, ctx);
2033         if (!fxid_in_cached_range(fxid, ctx))
2034         {
2035                 /*
2036                  * We may have been checking against stale values.  Update the cached
2037                  * range to be sure, and since we relied on the cached range when we
2038                  * performed the full xid conversion, reconvert.
2039                  */
2040                 update_cached_xid_range(ctx);
2041                 fxid = FullTransactionIdFromXidAndCtx(xid, ctx);
2042         }
2043
2044         if (FullTransactionIdPrecedesOrEquals(ctx->next_fxid, fxid))
2045                 return XID_IN_FUTURE;
2046         if (FullTransactionIdPrecedes(fxid, ctx->oldest_fxid))
2047                 return XID_PRECEDES_CLUSTERMIN;
2048         if (FullTransactionIdPrecedes(fxid, ctx->relfrozenfxid))
2049                 return XID_PRECEDES_RELMIN;
2050
2051         /* Early return if the caller does not request clog checking */
2052         if (status == NULL)
2053                 return XID_BOUNDS_OK;
2054
2055         /* Early return if we just checked this xid in a prior call */
2056         if (xid == ctx->cached_xid)
2057         {
2058                 *status = ctx->cached_status;
2059                 return XID_BOUNDS_OK;
2060         }
2061
2062         *status = XID_COMMITTED;
2063         LWLockAcquire(XactTruncationLock, LW_SHARED);
2064         clog_horizon =
2065                 FullTransactionIdFromXidAndCtx(TransamVariables->oldestClogXid,
2066                                                                            ctx);
2067         if (FullTransactionIdPrecedesOrEquals(clog_horizon, fxid))
2068         {
2069                 if (TransactionIdIsCurrentTransactionId(xid))
2070                         *status = XID_IS_CURRENT_XID;
2071                 else if (TransactionIdIsInProgress(xid))
2072                         *status = XID_IN_PROGRESS;
2073                 else if (TransactionIdDidCommit(xid))
2074                         *status = XID_COMMITTED;
2075                 else
2076                         *status = XID_ABORTED;
2077         }
2078         LWLockRelease(XactTruncationLock);
2079         ctx->cached_xid = xid;
2080         ctx->cached_status = *status;
2081         return XID_BOUNDS_OK;
2082 }