/*-------------------------------------------------------------------------
 *
 * index.c
 *	  code to create and destroy POSTGRES index relations
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * INTERFACE ROUTINES
 *		index_create()			Create a cataloged index relation
 *		index_drop()			Removes index relation from catalogs
 *		BuildIndexInfo()		Prepare to insert index tuples
 *		FormIndexDatum()		Construct datum vector for one index tuple
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/relscan.h"
#include "access/sysattr.h"
#include "access/transam.h"
#include "access/xact.h"
#include "bootstrap/bootstrap.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/heap.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_constraint.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_tablespace.h"
#include "catalog/pg_type.h"
#include "commands/tablecmds.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/var.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
#include "utils/tuplesort.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
/* state info for validate_index bulkdelete callback */
typedef struct
{
    Tuplesortstate *tuplesort;  /* for sorting the index TIDs */
    /* statistics (for debug purposes only): */
    double      htups,
                itups,
                tups_inserted;
} v_i_state;
/* non-export function prototypes */
static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
                         IndexInfo *indexInfo,
                         Oid accessMethodObjectId,
                         Oid *classObjectId);
static void InitializeAttributeOids(Relation indexRelation,
                        int numatts, Oid indexoid);
static void AppendAttributeTuples(Relation indexRelation, int numatts);
static void UpdateIndexRelation(Oid indexoid, Oid heapoid,
                    IndexInfo *indexInfo,
                    Oid *classOids,
                    int16 *coloptions,
                    bool primary,
                    bool isvalid);
static void index_update_stats(Relation rel, bool hasindex, bool isprimary,
                   Oid reltoastidxid, double reltuples);
static bool validate_index_callback(ItemPointer itemptr, void *opaque);
static void validate_index_heapscan(Relation heapRelation,
                        Relation indexRelation,
                        IndexInfo *indexInfo,
                        Snapshot snapshot,
                        v_i_state *state);
static Oid  IndexGetRelation(Oid indexId);
/* ----------------------------------------------------------------
 *		ConstructTupleDescriptor
 *
 * Build an index tuple descriptor for a new index
 * ----------------------------------------------------------------
 */
static TupleDesc
ConstructTupleDescriptor(Relation heapRelation,
                         IndexInfo *indexInfo,
                         Oid accessMethodObjectId,
                         Oid *classObjectId)
{
    int         numatts = indexInfo->ii_NumIndexAttrs;
    ListCell   *indexpr_item = list_head(indexInfo->ii_Expressions);
    HeapTuple   amtuple;
    Form_pg_am  amform;
    TupleDesc   heapTupDesc;
    TupleDesc   indexTupDesc;
    int         natts;          /* #atts in heap rel --- for error checks */
    int         i;
    /* We need access to the index AM's pg_am tuple */
    amtuple = SearchSysCache(AMOID,
                             ObjectIdGetDatum(accessMethodObjectId),
                             0, 0, 0);
    if (!HeapTupleIsValid(amtuple))
        elog(ERROR, "cache lookup failed for access method %u",
             accessMethodObjectId);
    amform = (Form_pg_am) GETSTRUCT(amtuple);

    /* ... and to the table's tuple descriptor */
    heapTupDesc = RelationGetDescr(heapRelation);
    natts = RelationGetForm(heapRelation)->relnatts;
    /*
     * allocate the new tuple descriptor
     */
    indexTupDesc = CreateTemplateTupleDesc(numatts, false);

    /*
     * For simple index columns, we copy the pg_attribute row from the parent
     * relation and modify it as necessary.  For expressions we have to cons
     * up a pg_attribute row the hard way.
     */
    for (i = 0; i < numatts; i++)
    {
        AttrNumber  atnum = indexInfo->ii_KeyAttrNumbers[i];
        Form_pg_attribute to = indexTupDesc->attrs[i];
        HeapTuple   tuple;
        Form_pg_type typeTup;
        Form_pg_opclass opclassTup;
        Oid         keyType;
        if (atnum != 0)
        {
            /* Simple index column */
            Form_pg_attribute from;

            if (atnum < 0)
            {
                /*
                 * here we are indexing on a system attribute (-1...-n)
                 */
                from = SystemAttributeDefinition(atnum,
                                        heapRelation->rd_rel->relhasoids);
            }
            else
            {
                /*
                 * here we are indexing on a normal attribute (1...n)
                 */
                if (atnum > natts)      /* safety check */
                    elog(ERROR, "invalid column number %d", atnum);
                from = heapTupDesc->attrs[AttrNumberGetAttrOffset(atnum)];
            }

            /*
             * now that we've determined the "from", let's copy the tuple desc
             * data...
             */
            memcpy(to, from, ATTRIBUTE_TUPLE_SIZE);

            /*
             * Fix the stuff that should not be the same as the underlying
             * attr
             */
            to->attnum = i + 1;

            to->attstattarget = -1;
            to->attcacheoff = -1;
            to->attnotnull = false;
            to->atthasdef = false;
            to->attislocal = true;
            to->attinhcount = 0;
        }
        else
        {
            /* Expressional index */
            Node       *indexkey;

            MemSet(to, 0, ATTRIBUTE_TUPLE_SIZE);

            if (indexpr_item == NULL)   /* shouldn't happen */
                elog(ERROR, "too few entries in indexprs list");
            indexkey = (Node *) lfirst(indexpr_item);
            indexpr_item = lnext(indexpr_item);

            /*
             * Make the attribute's name "pg_expression_nnn" (maybe think of
             * something better later)
             */
            sprintf(NameStr(to->attname), "pg_expression_%d", i + 1);

            /*
             * Lookup the expression type in pg_type for the type length etc.
             */
            keyType = exprType(indexkey);
            tuple = SearchSysCache(TYPEOID,
                                   ObjectIdGetDatum(keyType),
                                   0, 0, 0);
            if (!HeapTupleIsValid(tuple))
                elog(ERROR, "cache lookup failed for type %u", keyType);
            typeTup = (Form_pg_type) GETSTRUCT(tuple);

            /*
             * Assign some of the attributes values. Leave the rest as 0.
             */
            to->attnum = i + 1;
            to->atttypid = keyType;
            to->attlen = typeTup->typlen;
            to->attbyval = typeTup->typbyval;
            to->attstorage = typeTup->typstorage;
            to->attalign = typeTup->typalign;
            to->attstattarget = -1;
            to->attcacheoff = -1;
            to->atttypmod = -1;
            to->attislocal = true;

            ReleaseSysCache(tuple);

            /*
             * Make sure the expression yields a type that's safe to store in
             * an index.  We need this defense because we have index opclasses
             * for pseudo-types such as "record", and the actually stored type
             * had better be safe; eg, a named composite type is okay, an
             * anonymous record type is not.  The test is the same as for
             * whether a table column is of a safe type (which is why we
             * needn't check for the non-expression case).
             */
            CheckAttributeType(NameStr(to->attname), to->atttypid);
        }
        /*
         * We do not yet have the correct relation OID for the index, so just
         * set it invalid for now.  InitializeAttributeOids() will fix it
         * later.
         */
        to->attrelid = InvalidOid;
        /*
         * Check the opclass and index AM to see if either provides a keytype
         * (overriding the attribute type).  Opclass takes precedence.
         */
        tuple = SearchSysCache(CLAOID,
                               ObjectIdGetDatum(classObjectId[i]),
                               0, 0, 0);
        if (!HeapTupleIsValid(tuple))
            elog(ERROR, "cache lookup failed for opclass %u",
                 classObjectId[i]);
        opclassTup = (Form_pg_opclass) GETSTRUCT(tuple);
        if (OidIsValid(opclassTup->opckeytype))
            keyType = opclassTup->opckeytype;
        else
            keyType = amform->amkeytype;
        ReleaseSysCache(tuple);
        if (OidIsValid(keyType) && keyType != to->atttypid)
        {
            /* index value and heap value have different types */
            tuple = SearchSysCache(TYPEOID,
                                   ObjectIdGetDatum(keyType),
                                   0, 0, 0);
            if (!HeapTupleIsValid(tuple))
                elog(ERROR, "cache lookup failed for type %u", keyType);
            typeTup = (Form_pg_type) GETSTRUCT(tuple);

            to->atttypid = keyType;
            to->atttypmod = -1;
            to->attlen = typeTup->typlen;
            to->attbyval = typeTup->typbyval;
            to->attalign = typeTup->typalign;
            to->attstorage = typeTup->typstorage;

            ReleaseSysCache(tuple);
        }
    }

    ReleaseSysCache(amtuple);

    return indexTupDesc;
}
/* ----------------------------------------------------------------
 *		InitializeAttributeOids
 * ----------------------------------------------------------------
 */
static void
InitializeAttributeOids(Relation indexRelation,
                        int numatts,
                        Oid indexoid)
{
    TupleDesc   tupleDescriptor;
    int         i;

    tupleDescriptor = RelationGetDescr(indexRelation);

    for (i = 0; i < numatts; i += 1)
        tupleDescriptor->attrs[i]->attrelid = indexoid;
}
/* ----------------------------------------------------------------
 *		AppendAttributeTuples
 * ----------------------------------------------------------------
 */
static void
AppendAttributeTuples(Relation indexRelation, int numatts)
{
    Relation    pg_attribute;
    CatalogIndexState indstate;
    TupleDesc   indexTupDesc;
    int         i;

    /*
     * open the attribute relation and its indexes
     */
    pg_attribute = heap_open(AttributeRelationId, RowExclusiveLock);

    indstate = CatalogOpenIndexes(pg_attribute);

    /*
     * insert data from new index's tupdesc into pg_attribute
     */
    indexTupDesc = RelationGetDescr(indexRelation);

    for (i = 0; i < numatts; i++)
    {
        /*
         * There used to be very grotty code here to set these fields, but I
         * think it's unnecessary.  They should be set already.
         */
        Assert(indexTupDesc->attrs[i]->attnum == i + 1);
        Assert(indexTupDesc->attrs[i]->attcacheoff == -1);

        InsertPgAttributeTuple(pg_attribute, indexTupDesc->attrs[i], indstate);
    }

    CatalogCloseIndexes(indstate);

    heap_close(pg_attribute, RowExclusiveLock);
}
/* ----------------------------------------------------------------
 *		UpdateIndexRelation
 *
 * Construct and insert a new entry in the pg_index catalog
 * ----------------------------------------------------------------
 */
static void
UpdateIndexRelation(Oid indexoid,
                    Oid heapoid,
                    IndexInfo *indexInfo,
                    Oid *classOids,
                    int16 *coloptions,
                    bool primary,
                    bool isvalid)
{
    int2vector *indkey;
    oidvector  *indclass;
    int2vector *indoption;
    Datum       exprsDatum;
    Datum       predDatum;
    Datum       values[Natts_pg_index];
    bool        nulls[Natts_pg_index];
    Relation    pg_index;
    HeapTuple   tuple;
    int         i;

    /*
     * Copy the index key, opclass, and indoption info into arrays (should we
     * make the caller pass them like this to start with?)
     */
    indkey = buildint2vector(NULL, indexInfo->ii_NumIndexAttrs);
    for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
        indkey->values[i] = indexInfo->ii_KeyAttrNumbers[i];
    indclass = buildoidvector(classOids, indexInfo->ii_NumIndexAttrs);
    indoption = buildint2vector(coloptions, indexInfo->ii_NumIndexAttrs);

    /*
     * Convert the index expressions (if any) to a text datum
     */
    if (indexInfo->ii_Expressions != NIL)
    {
        char       *exprsString;

        exprsString = nodeToString(indexInfo->ii_Expressions);
        exprsDatum = CStringGetTextDatum(exprsString);
        pfree(exprsString);
    }
    else
        exprsDatum = (Datum) 0;

    /*
     * Convert the index predicate (if any) to a text datum.  Note we convert
     * implicit-AND format to normal explicit-AND for storage.
     */
    if (indexInfo->ii_Predicate != NIL)
    {
        char       *predString;

        predString = nodeToString(make_ands_explicit(indexInfo->ii_Predicate));
        predDatum = CStringGetTextDatum(predString);
        pfree(predString);
    }
    else
        predDatum = (Datum) 0;

    /*
     * open the system catalog index relation
     */
    pg_index = heap_open(IndexRelationId, RowExclusiveLock);

    /*
     * Build a pg_index tuple
     */
    MemSet(nulls, false, sizeof(nulls));

    values[Anum_pg_index_indexrelid - 1] = ObjectIdGetDatum(indexoid);
    values[Anum_pg_index_indrelid - 1] = ObjectIdGetDatum(heapoid);
    values[Anum_pg_index_indnatts - 1] = Int16GetDatum(indexInfo->ii_NumIndexAttrs);
    values[Anum_pg_index_indisunique - 1] = BoolGetDatum(indexInfo->ii_Unique);
    values[Anum_pg_index_indisprimary - 1] = BoolGetDatum(primary);
    values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false);
    values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid);
    values[Anum_pg_index_indcheckxmin - 1] = BoolGetDatum(false);
    /* we set isvalid and isready the same way */
    values[Anum_pg_index_indisready - 1] = BoolGetDatum(isvalid);
    values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey);
    values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass);
    values[Anum_pg_index_indoption - 1] = PointerGetDatum(indoption);
    values[Anum_pg_index_indexprs - 1] = exprsDatum;
    if (exprsDatum == (Datum) 0)
        nulls[Anum_pg_index_indexprs - 1] = true;
    values[Anum_pg_index_indpred - 1] = predDatum;
    if (predDatum == (Datum) 0)
        nulls[Anum_pg_index_indpred - 1] = true;

    tuple = heap_form_tuple(RelationGetDescr(pg_index), values, nulls);

    /*
     * insert the tuple into the pg_index catalog
     */
    simple_heap_insert(pg_index, tuple);

    /* update the indexes on pg_index */
    CatalogUpdateIndexes(pg_index, tuple);

    /*
     * close the relation and free the tuple
     */
    heap_close(pg_index, RowExclusiveLock);
    heap_freetuple(tuple);
}
/*
 * index_create
 *
 * heapRelationId: OID of table to build index on
 * indexRelationName: what it says
 * indexRelationId: normally, pass InvalidOid to let this routine
 *		generate an OID for the index.  During bootstrap this may be
 *		nonzero to specify a preselected OID.
 * indexInfo: same info executor uses to insert into the index
 * accessMethodObjectId: OID of index AM to use
 * tableSpaceId: OID of tablespace to use
 * classObjectId: array of index opclass OIDs, one per index column
 * coloptions: array of per-index-column indoption settings
 * reloptions: AM-specific options
 * isprimary: index is a PRIMARY KEY
 * isconstraint: index is owned by a PRIMARY KEY or UNIQUE constraint
 * allow_system_table_mods: allow table to be a system catalog
 * skip_build: true to skip the index_build() step for the moment; caller
 *		must do it later (typically via reindex_index())
 * concurrent: if true, do not lock the table against writers.  The index
 *		will be marked "invalid" and the caller must take additional steps
 *		to fix it up.
 *
 * Returns OID of the created index.
 */
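/*
 * Illustrative only (not part of the original file): given the parameter
 * list documented above, a caller such as DefineIndex would invoke this
 * routine roughly as sketched below; the argument values are hypothetical.
 *
 *		indexRelationId =
 *			index_create(RelationGetRelid(heapRel),		-- parent table
 *						 "my_idx",						-- index name
 *						 InvalidOid,					-- have it pick an OID
 *						 indexInfo,
 *						 btreeAmOid,					-- index AM to use
 *						 tableSpaceId,
 *						 classObjectId, coloptions,
 *						 (Datum) 0,						-- no reloptions
 *						 false, false,					-- not PK, not constraint
 *						 false,							-- no system-table mods
 *						 false,							-- build it now
 *						 false);						-- not concurrent
 */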
Oid
index_create(Oid heapRelationId,
             const char *indexRelationName,
             Oid indexRelationId,
             IndexInfo *indexInfo,
             Oid accessMethodObjectId,
             Oid tableSpaceId,
             Oid *classObjectId,
             int16 *coloptions,
             Datum reloptions,
             bool isprimary,
             bool isconstraint,
             bool allow_system_table_mods,
             bool skip_build,
             bool concurrent)
{
    Relation    pg_class;
    Relation    heapRelation;
    Relation    indexRelation;
    TupleDesc   indexTupDesc;
    bool        shared_relation;
    Oid         namespaceId;
    int         i;

    pg_class = heap_open(RelationRelationId, RowExclusiveLock);
    /*
     * Only SELECT ... FOR UPDATE/SHARE are allowed while doing a standard
     * index build; but for concurrent builds we allow INSERT/UPDATE/DELETE
     * (but not VACUUM).
     */
    heapRelation = heap_open(heapRelationId,
                        (concurrent ? ShareUpdateExclusiveLock : ShareLock));

    /*
     * The index will be in the same namespace as its parent table, and is
     * shared across databases if and only if the parent is.
     */
    namespaceId = RelationGetNamespace(heapRelation);
    shared_relation = heapRelation->rd_rel->relisshared;
    /*
     * check parameters
     */
    if (indexInfo->ii_NumIndexAttrs < 1)
        elog(ERROR, "must index at least one column");

    if (!allow_system_table_mods &&
        IsSystemRelation(heapRelation) &&
        IsNormalProcessingMode())
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("user-defined indexes on system catalog tables are not supported")));

    /*
     * concurrent index build on a system catalog is unsafe because we tend to
     * release locks before committing in catalogs
     */
    if (concurrent &&
        IsSystemRelation(heapRelation))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("concurrent index creation on system catalog tables is not supported")));

    /*
     * We cannot allow indexing a shared relation after initdb (because
     * there's no way to make the entry in other databases' pg_class).
     */
    if (shared_relation && !IsBootstrapProcessingMode())
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("shared indexes cannot be created after initdb")));

    /*
     * Validate shared/non-shared tablespace (must check this before doing
     * GetNewRelFileNode, to prevent Assert therein)
     */
    if (shared_relation)
    {
        if (tableSpaceId != GLOBALTABLESPACE_OID)
            /* elog since this is not a user-facing error */
            elog(ERROR,
                 "shared relations must be placed in pg_global tablespace");
    }
    else
    {
        if (tableSpaceId == GLOBALTABLESPACE_OID)
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("only shared relations can be placed in pg_global tablespace")));
    }

    if (get_relname_relid(indexRelationName, namespaceId))
        ereport(ERROR,
                (errcode(ERRCODE_DUPLICATE_TABLE),
                 errmsg("relation \"%s\" already exists",
                        indexRelationName)));
    /*
     * construct tuple descriptor for index tuples
     */
    indexTupDesc = ConstructTupleDescriptor(heapRelation,
                                            indexInfo,
                                            accessMethodObjectId,
                                            classObjectId);

    /*
     * Allocate an OID for the index, unless we were told what to use.
     *
     * The OID will be the relfilenode as well, so make sure it doesn't
     * collide with either pg_class OIDs or existing physical files.
     */
    if (!OidIsValid(indexRelationId))
        indexRelationId = GetNewRelFileNode(tableSpaceId, shared_relation,
                                            pg_class);

    /*
     * create the index relation's relcache entry and physical disk file. (If
     * we fail further down, it's the smgr's responsibility to remove the disk
     * file again.)
     */
    indexRelation = heap_create(indexRelationName,
                                namespaceId,
                                tableSpaceId,
                                indexRelationId,
                                indexTupDesc,
                                RELKIND_INDEX,
                                shared_relation,
                                allow_system_table_mods);

    Assert(indexRelationId == RelationGetRelid(indexRelation));
    /*
     * Obtain exclusive lock on it.  Although no other backends can see it
     * until we commit, this prevents deadlock-risk complaints from lock
     * manager in cases such as CLUSTER.
     */
    LockRelation(indexRelation, AccessExclusiveLock);

    /*
     * Fill in fields of the index's pg_class entry that are not set correctly
     * by heap_create.
     *
     * XXX should have a cleaner way to create cataloged indexes
     */
    indexRelation->rd_rel->relowner = heapRelation->rd_rel->relowner;
    indexRelation->rd_rel->relam = accessMethodObjectId;
    indexRelation->rd_rel->relkind = RELKIND_INDEX;
    indexRelation->rd_rel->relhasoids = false;
    /*
     * store index's pg_class entry
     */
    InsertPgClassTuple(pg_class, indexRelation,
                       RelationGetRelid(indexRelation),
                       reloptions);

    /* done with pg_class */
    heap_close(pg_class, RowExclusiveLock);

    /*
     * now update the object id's of all the attribute tuple forms in the
     * index relation's tuple descriptor
     */
    InitializeAttributeOids(indexRelation,
                            indexInfo->ii_NumIndexAttrs,
                            indexRelationId);

    /*
     * append ATTRIBUTE tuples for the index
     */
    AppendAttributeTuples(indexRelation, indexInfo->ii_NumIndexAttrs);
    /* ----------------
     *	  update pg_index
     *	  (append INDEX tuple)
     *
     *	  Note that this stows away a representation of "predicate".
     *	  (Or, could define a rule to maintain the predicate) --Nels, Feb '92
     * ----------------
     */
    UpdateIndexRelation(indexRelationId, heapRelationId, indexInfo,
                        classObjectId, coloptions, isprimary, !concurrent);
    /*
     * Register constraint and dependencies for the index.
     *
     * If the index is from a CONSTRAINT clause, construct a pg_constraint
     * entry.  The index is then linked to the constraint, which in turn is
     * linked to the table.  If it's not a CONSTRAINT, make the dependency
     * directly on the table.
     *
     * We don't need a dependency on the namespace, because there'll be an
     * indirect dependency via our parent table.
     *
     * During bootstrap we can't register any dependencies, and we don't try
     * to make a constraint either.
     */
    if (!IsBootstrapProcessingMode())
    {
        ObjectAddress myself,
                    referenced;

        myself.classId = RelationRelationId;
        myself.objectId = indexRelationId;
        myself.objectSubId = 0;
        if (isconstraint)
        {
            char        constraintType;
            Oid         conOid;

            if (isprimary)
                constraintType = CONSTRAINT_PRIMARY;
            else if (indexInfo->ii_Unique)
                constraintType = CONSTRAINT_UNIQUE;
            else
            {
                elog(ERROR, "constraint must be PRIMARY or UNIQUE");
                constraintType = 0;     /* keep compiler quiet */
            }

            /* Shouldn't have any expressions */
            if (indexInfo->ii_Expressions)
                elog(ERROR, "constraints cannot have index expressions");

            conOid = CreateConstraintEntry(indexRelationName,
                                           namespaceId,
                                           constraintType,
                                           false,   /* isDeferrable */
                                           false,   /* isDeferred */
                                           heapRelationId,
                                           indexInfo->ii_KeyAttrNumbers,
                                           indexInfo->ii_NumIndexAttrs,
                                           InvalidOid,  /* no domain */
                                           InvalidOid,  /* no foreign key */
                                           NULL,
                                           NULL,
                                           NULL,
                                           NULL,
                                           0,
                                           ' ',
                                           ' ',
                                           ' ',
                                           InvalidOid,  /* no associated index */
                                           NULL,    /* no check constraint */
                                           NULL,
                                           NULL,
                                           true,    /* islocal */
                                           0);      /* inhcount */

            referenced.classId = ConstraintRelationId;
            referenced.objectId = conOid;
            referenced.objectSubId = 0;

            recordDependencyOn(&myself, &referenced, DEPENDENCY_INTERNAL);
        }
        else
        {
            bool        have_simple_col = false;

            /* Create auto dependencies on simply-referenced columns */
            for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
            {
                if (indexInfo->ii_KeyAttrNumbers[i] != 0)
                {
                    referenced.classId = RelationRelationId;
                    referenced.objectId = heapRelationId;
                    referenced.objectSubId = indexInfo->ii_KeyAttrNumbers[i];

                    recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO);

                    have_simple_col = true;
                }
            }

            /*
             * It's possible for an index to not depend on any columns of the
             * table at all, in which case we need to give it a dependency on
             * the table as a whole; else it won't get dropped when the table
             * is dropped.  This edge case is not totally useless; for
             * example, a unique index on a constant expression can serve to
             * prevent a table from containing more than one row.
             */
            if (!have_simple_col &&
             !contain_vars_of_level((Node *) indexInfo->ii_Expressions, 0) &&
             !contain_vars_of_level((Node *) indexInfo->ii_Predicate, 0))
            {
                referenced.classId = RelationRelationId;
                referenced.objectId = heapRelationId;
                referenced.objectSubId = 0;

                recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO);
            }
        }
        /* Store dependency on operator classes */
        for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
        {
            referenced.classId = OperatorClassRelationId;
            referenced.objectId = classObjectId[i];
            referenced.objectSubId = 0;

            recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
        }

        /* Store dependencies on anything mentioned in index expressions */
        if (indexInfo->ii_Expressions)
        {
            recordDependencyOnSingleRelExpr(&myself,
                                        (Node *) indexInfo->ii_Expressions,
                                            heapRelationId,
                                            DEPENDENCY_NORMAL,
                                            DEPENDENCY_AUTO);
        }

        /* Store dependencies on anything mentioned in predicate */
        if (indexInfo->ii_Predicate)
        {
            recordDependencyOnSingleRelExpr(&myself,
                                            (Node *) indexInfo->ii_Predicate,
                                            heapRelationId,
                                            DEPENDENCY_NORMAL,
                                            DEPENDENCY_AUTO);
        }
    }
    /*
     * Advance the command counter so that we can see the newly-entered
     * catalog tuples for the index.
     */
    CommandCounterIncrement();

    /*
     * In bootstrap mode, we have to fill in the index strategy structure with
     * information from the catalogs.  If we aren't bootstrapping, then the
     * relcache entry has already been rebuilt thanks to sinval update during
     * CommandCounterIncrement.
     */
    if (IsBootstrapProcessingMode())
        RelationInitIndexAccessInfo(indexRelation);
    else
        Assert(indexRelation->rd_indexcxt != NULL);

    /*
     * If this is bootstrap (initdb) time, then we don't actually fill in the
     * index yet.  We'll be creating more indexes and classes later, so we
     * delay filling them in until just before we're done with bootstrapping.
     * Similarly, if the caller specified skip_build then filling the index is
     * delayed till later (ALTER TABLE can save work in some cases with this).
     * Otherwise, we call the AM routine that constructs the index.
     */
    if (IsBootstrapProcessingMode())
    {
        index_register(heapRelationId, indexRelationId, indexInfo);
    }
    else if (skip_build)
    {
        /*
         * Caller is responsible for filling the index later on.  However,
         * we'd better make sure that the heap relation is correctly marked as
         * having an index.
         */
        index_update_stats(heapRelation,
                           true,
                           isprimary,
                           InvalidOid,
                           heapRelation->rd_rel->reltuples);
        /* Make the above update visible */
        CommandCounterIncrement();
    }
    else
    {
        index_build(heapRelation, indexRelation, indexInfo, isprimary);
    }

    /*
     * Close the heap and index; but we keep the locks that we acquired above
     * until end of transaction.
     */
    index_close(indexRelation, NoLock);
    heap_close(heapRelation, NoLock);

    return indexRelationId;
}
/*
 *		index_drop
 *
 * NOTE: this routine should now only be called through performDeletion(),
 * else associated dependencies won't be cleaned up.
 */
void
index_drop(Oid indexId)
{
    Oid         heapId;
    Relation    userHeapRelation;
    Relation    userIndexRelation;
    Relation    indexRelation;
    HeapTuple   tuple;
    bool        hasexprs;
    ForkNumber  forknum;

    /*
     * To drop an index safely, we must grab exclusive lock on its parent
     * table; otherwise there could be other backends using the index!
     * Exclusive lock on the index alone is insufficient because another
     * backend might be in the midst of devising a query plan that will use
     * the index.  The parser and planner take care to hold an appropriate
     * lock on the parent table while working, but having them hold locks on
     * all the indexes too seems overly expensive.  We do grab exclusive lock
     * on the index too, just to be safe.  Both locks must be held till end of
     * transaction, else other backends will still see this index in pg_index.
     */
    heapId = IndexGetRelation(indexId);
    userHeapRelation = heap_open(heapId, AccessExclusiveLock);

    userIndexRelation = index_open(indexId, AccessExclusiveLock);

    /*
     * Schedule physical removal of the files
     */
    RelationOpenSmgr(userIndexRelation);
    for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
        if (smgrexists(userIndexRelation->rd_smgr, forknum))
            smgrscheduleunlink(userIndexRelation->rd_smgr, forknum,
                               userIndexRelation->rd_istemp);
    RelationCloseSmgr(userIndexRelation);

    /*
     * Close and flush the index's relcache entry, to ensure relcache doesn't
     * try to rebuild it while we're deleting catalog entries.  We keep the
     * lock though.
     */
    index_close(userIndexRelation, NoLock);

    RelationForgetRelation(indexId);

    /*
     * fix INDEX relation, and check for expressional index
     */
    indexRelation = heap_open(IndexRelationId, RowExclusiveLock);

    tuple = SearchSysCache(INDEXRELID,
                           ObjectIdGetDatum(indexId),
                           0, 0, 0);
    if (!HeapTupleIsValid(tuple))
        elog(ERROR, "cache lookup failed for index %u", indexId);

    hasexprs = !heap_attisnull(tuple, Anum_pg_index_indexprs);

    simple_heap_delete(indexRelation, &tuple->t_self);

    ReleaseSysCache(tuple);
    heap_close(indexRelation, RowExclusiveLock);

    /*
     * if it has any expression columns, we might have stored statistics about
     * them
     */
    if (hasexprs)
        RemoveStatistics(indexId, 0);

    /*
     * fix ATTRIBUTE relation
     */
    DeleteAttributeTuples(indexId);

    /*
     * fix RELATION relation
     */
    DeleteRelationTuple(indexId);

    /*
     * We are presently too lazy to attempt to compute the new correct value
     * of relhasindex (the next VACUUM will fix it if necessary).  So there is
     * no need to update the pg_class tuple for the owning relation.  But we
     * must send out a shared-cache-inval notice on the owning relation to
     * ensure other backends update their relcache lists of indexes.
     */
    CacheInvalidateRelcache(userHeapRelation);

    /*
     * Close owning rel, but keep lock
     */
    heap_close(userHeapRelation, NoLock);
}
/* ----------------------------------------------------------------
 *						index_build support
 * ----------------------------------------------------------------
 */

/* ----------------
 *		BuildIndexInfo
 *			Construct an IndexInfo record for an open index
 *
 * IndexInfo stores the information about the index that's needed by
 * FormIndexDatum, which is used for both index_build() and later insertion
 * of individual index tuples.  Normally we build an IndexInfo for an index
 * just once per command, and then use it for (potentially) many tuples.
 * ----------------
 */
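/*
 * A minimal usage sketch (illustrative; mirrors what validate_index below
 * and the executor do): build the IndexInfo once per command, then reuse
 * it for each tuple to be indexed.
 *
 *		IndexInfo  *ii = BuildIndexInfo(indexRelation);
 *
 *		-- then, for each heap tuple placed in 'slot':
 *		FormIndexDatum(ii, slot, estate, values, isnull);
 */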
IndexInfo *
BuildIndexInfo(Relation index)
{
    IndexInfo  *ii = makeNode(IndexInfo);
    Form_pg_index indexStruct = index->rd_index;
    int         i;
    int         numKeys;

    /* check the number of keys, and copy attr numbers into the IndexInfo */
    numKeys = indexStruct->indnatts;
    if (numKeys < 1 || numKeys > INDEX_MAX_KEYS)
        elog(ERROR, "invalid indnatts %d for index %u",
             numKeys, RelationGetRelid(index));
    ii->ii_NumIndexAttrs = numKeys;
    for (i = 0; i < numKeys; i++)
        ii->ii_KeyAttrNumbers[i] = indexStruct->indkey.values[i];

    /* fetch any expressions needed for expressional indexes */
    ii->ii_Expressions = RelationGetIndexExpressions(index);
    ii->ii_ExpressionsState = NIL;

    /* fetch index predicate if any */
    ii->ii_Predicate = RelationGetIndexPredicate(index);
    ii->ii_PredicateState = NIL;

    /* other info */
    ii->ii_Unique = indexStruct->indisunique;
    ii->ii_ReadyForInserts = indexStruct->indisready;

    /* initialize index-build state to default */
    ii->ii_Concurrent = false;
    ii->ii_BrokenHotChain = false;

    return ii;
}
/* ----------------
 *		FormIndexDatum
 *			Construct values[] and isnull[] arrays for a new index tuple.
 *
 *	indexInfo		Info about the index
 *	slot			Heap tuple for which we must prepare an index entry
 *	estate			executor state for evaluating any index expressions
 *	values			Array of index Datums (output area)
 *	isnull			Array of is-null indicators (output area)
 *
 * When there are no index expressions, estate may be NULL.  Otherwise it
 * must be supplied, *and* the ecxt_scantuple slot of its per-tuple expr
 * context must point to the heap tuple passed in.
 *
 * Notice we don't actually call index_form_tuple() here; we just prepare
 * its input arrays values[] and isnull[].  This is because the index AM
 * may wish to alter the data before storage.
 * ----------------
 */
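/*
 * Illustrative sketch (not in the original source): when the index has
 * expression columns, the caller is expected to set things up like
 *
 *		estate = CreateExecutorState();
 *		econtext = GetPerTupleExprContext(estate);
 *		econtext->ecxt_scantuple = slot;	-- must point at the heap tuple
 *		FormIndexDatum(indexInfo, slot, estate, values, isnull);
 *
 * which is exactly the pattern IndexBuildHeapScan uses further down.
 */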
void
FormIndexDatum(IndexInfo *indexInfo,
               TupleTableSlot *slot,
               EState *estate,
               Datum *values,
               bool *isnull)
{
    ListCell   *indexpr_item;
    int         i;

    if (indexInfo->ii_Expressions != NIL &&
        indexInfo->ii_ExpressionsState == NIL)
    {
        /* First time through, set up expression evaluation state */
        indexInfo->ii_ExpressionsState = (List *)
            ExecPrepareExpr((Expr *) indexInfo->ii_Expressions,
                            estate);
        /* Check caller has set up context correctly */
        Assert(GetPerTupleExprContext(estate)->ecxt_scantuple == slot);
    }
    indexpr_item = list_head(indexInfo->ii_ExpressionsState);

    for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
    {
        int         keycol = indexInfo->ii_KeyAttrNumbers[i];
        Datum       iDatum;
        bool        isNull;

        if (keycol != 0)
        {
            /*
             * Plain index column; get the value we need directly from the
             * heap tuple.
             */
            iDatum = slot_getattr(slot, keycol, &isNull);
        }
        else
        {
            /*
             * Index expression --- need to evaluate it.
             */
            if (indexpr_item == NULL)
                elog(ERROR, "wrong number of index expressions");
            iDatum = ExecEvalExprSwitchContext((ExprState *) lfirst(indexpr_item),
                                               GetPerTupleExprContext(estate),
                                               &isNull,
                                               NULL);
            indexpr_item = lnext(indexpr_item);
        }
        values[i] = iDatum;
        isnull[i] = isNull;
    }

    if (indexpr_item != NULL)
        elog(ERROR, "wrong number of index expressions");
}
/*
 * index_update_stats --- update pg_class entry after CREATE INDEX or REINDEX
 *
 * This routine updates the pg_class row of either an index or its parent
 * relation after CREATE INDEX or REINDEX.  Its rather bizarre API is designed
 * to ensure we can do all the necessary work in just one update.
 *
 * hasindex: set relhasindex to this value
 * isprimary: if true, set relhaspkey true; else no change
 * reltoastidxid: if not InvalidOid, set reltoastidxid to this value;
 *		else no change
 * reltuples: set reltuples to this value
 *
 * relpages is also updated (using RelationGetNumberOfBlocks()).
 *
 * NOTE: an important side-effect of this operation is that an SI invalidation
 * message is sent out to all backends --- including me --- causing relcache
 * entries to be flushed or updated with the new data.  This must happen even
 * if we find that no change is needed in the pg_class row.  When updating
 * a heap entry, this ensures that other backends find out about the new
 * index.  When updating an index, it's important because some index AMs
 * expect a relcache flush to occur after REINDEX.
 */
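/*
 * For reference (illustrative): index_build below calls this twice, along
 * the lines of
 *
 *		index_update_stats(heapRelation, true, isprimary,
 *						   InvalidOid, stats->heap_tuples);
 *
 * once for the parent table and once more for the index itself.
 */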
static void
index_update_stats(Relation rel, bool hasindex, bool isprimary,
                   Oid reltoastidxid, double reltuples)
{
    BlockNumber relpages = RelationGetNumberOfBlocks(rel);
    Oid         relid = RelationGetRelid(rel);
    Relation    pg_class;
    HeapTuple   tuple;
    Form_pg_class rd_rel;
    bool        dirty;
    /*
     * We always update the pg_class row using a non-transactional,
     * overwrite-in-place update.  There are several reasons for this:
     *
     * 1. In bootstrap mode, we have no choice --- UPDATE wouldn't work.
     *
     * 2. We could be reindexing pg_class itself, in which case we can't move
     * its pg_class row because CatalogUpdateIndexes might not know about all
     * the indexes yet (see reindex_relation).
     *
     * 3. Because we execute CREATE INDEX with just share lock on the parent
     * rel (to allow concurrent index creations), an ordinary update could
     * suffer a tuple-concurrently-updated failure against another CREATE
     * INDEX committing at about the same time.  We can avoid that by having
     * them both do nontransactional updates (we assume they will both be
     * trying to change the pg_class row to the same thing, so it doesn't
     * matter which goes first).
     *
     * 4. Even with just a single CREATE INDEX, there's a risk factor because
     * someone else might be trying to open the rel while we commit, and this
     * creates a race condition as to whether he will see both or neither of
     * the pg_class row versions as valid.  Again, a non-transactional update
     * avoids the risk.  It is indeterminate which state of the row the other
     * process will see, but it doesn't matter (if he's only taking
     * AccessShareLock, then it's not critical that he see relhasindex true).
     *
     * It is safe to use a non-transactional update even though our
     * transaction could still fail before committing.  Setting relhasindex
     * true is safe even if there are no indexes (VACUUM will eventually fix
     * it), and of course the relpages and reltuples counts are correct (or at
     * least more so than the old values) regardless.
     */
    pg_class = heap_open(RelationRelationId, RowExclusiveLock);

    /*
     * Make a copy of the tuple to update.  Normally we use the syscache, but
     * we can't rely on that during bootstrap or while reindexing pg_class
     * itself.
     */
    if (IsBootstrapProcessingMode() ||
        ReindexIsProcessingHeap(RelationRelationId))
    {
        /* don't assume syscache will work */
        HeapScanDesc pg_class_scan;
        ScanKeyData key[1];

        ScanKeyInit(&key[0],
                    ObjectIdAttributeNumber,
                    BTEqualStrategyNumber, F_OIDEQ,
                    ObjectIdGetDatum(relid));

        pg_class_scan = heap_beginscan(pg_class, SnapshotNow, 1, key);
        tuple = heap_getnext(pg_class_scan, ForwardScanDirection);
        tuple = heap_copytuple(tuple);
        heap_endscan(pg_class_scan);
    }
    else
    {
        /* normal case, use syscache */
        tuple = SearchSysCacheCopy(RELOID,
                                   ObjectIdGetDatum(relid),
                                   0, 0, 0);
    }

    if (!HeapTupleIsValid(tuple))
        elog(ERROR, "could not find tuple for relation %u", relid);
    rd_rel = (Form_pg_class) GETSTRUCT(tuple);

    /* Apply required updates, if any, to copied tuple */

    dirty = false;
    if (rd_rel->relhasindex != hasindex)
    {
        rd_rel->relhasindex = hasindex;
        dirty = true;
    }
    if (isprimary)
    {
        if (!rd_rel->relhaspkey)
        {
            rd_rel->relhaspkey = true;
            dirty = true;
        }
    }
    if (OidIsValid(reltoastidxid))
    {
        Assert(rd_rel->relkind == RELKIND_TOASTVALUE);
        if (rd_rel->reltoastidxid != reltoastidxid)
        {
            rd_rel->reltoastidxid = reltoastidxid;
            dirty = true;
        }
    }
    if (rd_rel->reltuples != (float4) reltuples)
    {
        rd_rel->reltuples = (float4) reltuples;
        dirty = true;
    }
    if (rd_rel->relpages != (int32) relpages)
    {
        rd_rel->relpages = (int32) relpages;
        dirty = true;
    }

    /*
     * If anything changed, write out the tuple
     */
    if (dirty)
    {
        heap_inplace_update(pg_class, tuple);
        /* the above sends a cache inval message */
    }
    else
    {
        /* no need to change tuple, but force relcache inval anyway */
        CacheInvalidateRelcacheByTuple(tuple);
    }

    heap_freetuple(tuple);

    heap_close(pg_class, RowExclusiveLock);
}
/*
 * setNewRelfilenode		- assign a new relfilenode value to the relation
 *
 * Caller must already hold exclusive lock on the relation.
 *
 * The relation is marked with relfrozenxid=freezeXid (InvalidTransactionId
 * must be passed for indexes)
 */
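/*
 * Hypothetical usage sketch (not from this file): callers pass a real
 * freeze XID for tables and InvalidTransactionId for indexes, e.g.
 *
 *		setNewRelfilenode(heapRel, RecentXmin);				-- a table
 *		setNewRelfilenode(indexRel, InvalidTransactionId);	-- an index
 */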
void
setNewRelfilenode(Relation relation, TransactionId freezeXid)
{
    Oid         newrelfilenode;
    RelFileNode newrnode;
    SMgrRelation srel;
    Relation    pg_class;
    HeapTuple   tuple;
    Form_pg_class rd_rel;
    ForkNumber  i;

    /* Can't change relfilenode for nailed tables (indexes ok though) */
    Assert(!relation->rd_isnailed ||
           relation->rd_rel->relkind == RELKIND_INDEX);
    /* Can't change for shared tables or indexes */
    Assert(!relation->rd_rel->relisshared);
    /* Indexes must have Invalid frozenxid; other relations must not */
    Assert((relation->rd_rel->relkind == RELKIND_INDEX &&
            freezeXid == InvalidTransactionId) ||
           TransactionIdIsNormal(freezeXid));

    /* Allocate a new relfilenode */
    newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace,
                                       relation->rd_rel->relisshared,
                                       NULL);

    /*
     * Find the pg_class tuple for the given relation.  This is not used
     * during bootstrap, so okay to use heap_update always.
     */
    pg_class = heap_open(RelationRelationId, RowExclusiveLock);

    tuple = SearchSysCacheCopy(RELOID,
                               ObjectIdGetDatum(RelationGetRelid(relation)),
                               0, 0, 0);
    if (!HeapTupleIsValid(tuple))
        elog(ERROR, "could not find tuple for relation %u",
             RelationGetRelid(relation));
    rd_rel = (Form_pg_class) GETSTRUCT(tuple);

    RelationOpenSmgr(relation);

    /*
     * ... and create storage for corresponding forks in the new relfilenode.
     *
     * NOTE: any conflict in relfilenode value will be caught here
     */
    newrnode = relation->rd_node;
    newrnode.relNode = newrelfilenode;
    srel = smgropen(newrnode);

    /* Create the main fork, like heap_create() does */
    smgrcreate(srel, MAIN_FORKNUM, relation->rd_istemp, false);

    /*
     * For a heap, create FSM fork as well. Indexams are responsible for
     * creating any extra forks themselves.
     */
    if (relation->rd_rel->relkind == RELKIND_RELATION ||
        relation->rd_rel->relkind == RELKIND_TOASTVALUE)
        smgrcreate(srel, FSM_FORKNUM, relation->rd_istemp, false);

    /* schedule unlinking old files */
    for (i = 0; i <= MAX_FORKNUM; i++)
    {
        if (smgrexists(relation->rd_smgr, i))
            smgrscheduleunlink(relation->rd_smgr, i, relation->rd_istemp);
    }

    RelationCloseSmgr(relation);

    /* update the pg_class row */
    rd_rel->relfilenode = newrelfilenode;
    rd_rel->relpages = 0;       /* it's empty until further notice */
    rd_rel->reltuples = 0;
    rd_rel->relfrozenxid = freezeXid;
    simple_heap_update(pg_class, &tuple->t_self, tuple);
    CatalogUpdateIndexes(pg_class, tuple);

    heap_freetuple(tuple);

    heap_close(pg_class, RowExclusiveLock);

    /* Make sure the relfilenode change is visible */
    CommandCounterIncrement();

    /* Mark the rel as having a new relfilenode in current transaction */
    RelationCacheMarkNewRelfilenode(relation);
}
/*
 * index_build - invoke access-method-specific index build procedure
 *
 * On entry, the index's catalog entries are valid, and its physical disk
 * file has been created but is empty.  We call the AM-specific build
 * procedure to fill in the index contents.  We then update the pg_class
 * entries of the index and heap relation as needed, using statistics
 * returned by ambuild as well as data passed by the caller.
 *
 * Note: when reindexing an existing index, isprimary can be false;
 * the index is already properly marked and need not be re-marked.
 *
 * Note: before Postgres 8.2, the passed-in heap and index Relations
 * were automatically closed by this routine.  This is no longer the case.
 * The caller opened 'em, and the caller should close 'em.
 */
void
index_build(Relation heapRelation,
            Relation indexRelation,
            IndexInfo *indexInfo,
            bool isprimary)
{
    RegProcedure procedure;
    IndexBuildResult *stats;
    Oid         save_userid;
    bool        save_secdefcxt;

    /*
     * sanity checks
     */
    Assert(RelationIsValid(indexRelation));
    Assert(PointerIsValid(indexRelation->rd_am));

    procedure = indexRelation->rd_am->ambuild;
    Assert(RegProcedureIsValid(procedure));

    /*
     * Switch to the table owner's userid, so that any index functions are
     * run as that user.
     */
    GetUserIdAndContext(&save_userid, &save_secdefcxt);
    SetUserIdAndContext(heapRelation->rd_rel->relowner, true);

    /*
     * Call the access method's build procedure
     */
    stats = (IndexBuildResult *)
        DatumGetPointer(OidFunctionCall3(procedure,
                                         PointerGetDatum(heapRelation),
                                         PointerGetDatum(indexRelation),
                                         PointerGetDatum(indexInfo)));
    Assert(PointerIsValid(stats));

    /* Restore userid */
    SetUserIdAndContext(save_userid, save_secdefcxt);

    /*
     * If we found any potentially broken HOT chains, mark the index as not
     * being usable until the current transaction is below the event horizon.
     * See src/backend/access/heap/README.HOT for discussion.
     */
    if (indexInfo->ii_BrokenHotChain)
    {
        Oid         indexId = RelationGetRelid(indexRelation);
        Relation    pg_index;
        HeapTuple   indexTuple;
        Form_pg_index indexForm;

        pg_index = heap_open(IndexRelationId, RowExclusiveLock);

        indexTuple = SearchSysCacheCopy(INDEXRELID,
                                        ObjectIdGetDatum(indexId),
                                        0, 0, 0);
        if (!HeapTupleIsValid(indexTuple))
            elog(ERROR, "cache lookup failed for index %u", indexId);
        indexForm = (Form_pg_index) GETSTRUCT(indexTuple);

        indexForm->indcheckxmin = true;
        simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
        CatalogUpdateIndexes(pg_index, indexTuple);

        heap_freetuple(indexTuple);
        heap_close(pg_index, RowExclusiveLock);
    }

    /*
     * Update heap and index pg_class rows
     */
    index_update_stats(heapRelation,
                       true,
                       isprimary,
                       (heapRelation->rd_rel->relkind == RELKIND_TOASTVALUE) ?
                       RelationGetRelid(indexRelation) : InvalidOid,
                       stats->heap_tuples);

    index_update_stats(indexRelation,
                       false,
                       false,
                       InvalidOid,
                       stats->index_tuples);

    /* Make the updated versions visible */
    CommandCounterIncrement();
}
/*
 * IndexBuildHeapScan - scan the heap relation to find tuples to be indexed
 *
 * This is called back from an access-method-specific index build procedure
 * after the AM has done whatever setup it needs.  The parent heap relation
 * is scanned to find tuples that should be entered into the index.  Each
 * such tuple is passed to the AM's callback routine, which does the right
 * things to add it to the new index.  After we return, the AM's index
 * build procedure does whatever cleanup is needed; in particular, it should
 * close the heap and index relations.
 *
 * The total count of heap tuples is returned.  This is for updating pg_class
 * statistics.  (It's annoying not to be able to do that here, but we can't
 * do it until after the relation is closed.)  Note that the index AM itself
 * must keep track of the number of index tuples; we don't do so here because
 * the AM might reject some of the tuples for its own reasons, such as being
 * unable to store NULLs.
 *
 * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect
 * any potentially broken HOT chains.  Currently, we set this if there are
 * any RECENTLY_DEAD entries in a HOT chain, without trying very hard to
 * detect whether they're really incompatible with the chain tip.
 */
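/*
 * Illustrative sketch (hypothetical AM code, not part of this file): an
 * access method's ambuild routine typically drives this scan by supplying
 * a callback that adds one entry to the new index per heap tuple:
 *
 *		static void
 *		myam_build_callback(Relation index, HeapTuple htup, Datum *values,
 *							bool *isnull, bool tupleIsAlive, void *state)
 *		{
 *			-- form an index tuple from values/isnull, insert it, count it
 *		}
 *
 *		reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
 *									   myam_build_callback, (void *) state);
 */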
double
IndexBuildHeapScan(Relation heapRelation,
                   Relation indexRelation,
                   IndexInfo *indexInfo,
                   bool allow_sync,
                   IndexBuildCallback callback,
                   void *callback_state)
{
    HeapScanDesc scan;
    HeapTuple   heapTuple;
    Datum       values[INDEX_MAX_KEYS];
    bool        isnull[INDEX_MAX_KEYS];
    double      reltuples;
    List       *predicate;
    TupleTableSlot *slot;
    EState     *estate;
    ExprContext *econtext;
    Snapshot    snapshot;
    TransactionId OldestXmin;
    BlockNumber root_blkno = InvalidBlockNumber;
    OffsetNumber root_offsets[MaxHeapTuplesPerPage];

    /*
     * sanity checks
     */
    Assert(OidIsValid(indexRelation->rd_rel->relam));

    /*
     * Need an EState for evaluation of index expressions and partial-index
     * predicates.  Also a slot to hold the current tuple.
     */
    estate = CreateExecutorState();
    econtext = GetPerTupleExprContext(estate);
    slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation));

    /* Arrange for econtext's scan tuple to be the tuple under test */
    econtext->ecxt_scantuple = slot;

    /* Set up execution state for predicate, if any. */
    predicate = (List *)
        ExecPrepareExpr((Expr *) indexInfo->ii_Predicate,
                        estate);
    /*
     * Prepare for scan of the base relation.  In a normal index build, we use
     * SnapshotAny because we must retrieve all tuples and do our own time
     * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
     * concurrent build, we take a regular MVCC snapshot and index whatever's
     * live according to that.  During bootstrap we just use SnapshotNow.
     */
    if (IsBootstrapProcessingMode())
    {
        snapshot = SnapshotNow;
        OldestXmin = InvalidTransactionId;      /* not used */
    }
    else if (indexInfo->ii_Concurrent)
    {
        snapshot = RegisterSnapshot(GetTransactionSnapshot());
        OldestXmin = InvalidTransactionId;      /* not used */
    }
    else
    {
        snapshot = SnapshotAny;
        /* okay to ignore lazy VACUUMs here */
        OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared, true);
    }

    scan = heap_beginscan_strat(heapRelation,   /* relation */
                                snapshot,       /* snapshot */
                                0,              /* number of keys */
                                NULL,           /* scan key */
                                true,           /* buffer access strategy OK */
                                allow_sync);    /* syncscan OK? */

    reltuples = 0;
    /*
     * Scan all tuples in the base relation.
     */
    while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        bool        tupleIsAlive;

        CHECK_FOR_INTERRUPTS();

        /*
         * When dealing with a HOT-chain of updated tuples, we want to index
         * the values of the live tuple (if any), but index it under the TID
         * of the chain's root tuple.  This approach is necessary to preserve
         * the HOT-chain structure in the heap.  So we need to be able to find
         * the root item offset for every tuple that's in a HOT-chain.  When
         * first reaching a new page of the relation, call
         * heap_get_root_tuples() to build a map of root item offsets on the
         * page.
         *
         * It might look unsafe to use this information across buffer
         * lock/unlock.  However, we hold ShareLock on the table so no
         * ordinary insert/update/delete should occur; and we hold pin on the
         * buffer continuously while visiting the page, so no pruning
         * operation can occur either.
         *
         * Note the implied assumption that there is no more than one live
         * tuple per HOT-chain ...
         */
        if (scan->rs_cblock != root_blkno)
        {
            Page        page = BufferGetPage(scan->rs_cbuf);

            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
            heap_get_root_tuples(page, root_offsets);
            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

            root_blkno = scan->rs_cblock;
        }
        if (snapshot == SnapshotAny)
        {
            /* do our own time qual check */
            bool        indexIt;

    recheck:

            /*
             * We could possibly get away with not locking the buffer here,
             * since caller should hold ShareLock on the relation, but let's
             * be conservative about it.  (This remark is still correct even
             * with HOT-pruning: our pin on the buffer prevents pruning.)
             */
            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

            switch (HeapTupleSatisfiesVacuum(heapTuple->t_data, OldestXmin,
                                             scan->rs_cbuf))
            {
                case HEAPTUPLE_DEAD:
                    /* Definitely dead, we can ignore it */
                    indexIt = false;
                    tupleIsAlive = false;
                    break;
                case HEAPTUPLE_LIVE:
                    /* Normal case, index and unique-check it */
                    indexIt = true;
                    tupleIsAlive = true;
                    break;
                case HEAPTUPLE_RECENTLY_DEAD:

                    /*
                     * If tuple is recently deleted then we must index it
                     * anyway to preserve MVCC semantics.  (Pre-existing
                     * transactions could try to use the index after we finish
                     * building it, and may need to see such tuples.)
                     *
                     * However, if it was HOT-updated then we must only index
                     * the live tuple at the end of the HOT-chain.  Since this
                     * breaks semantics for pre-existing snapshots, mark the
                     * index as unusable for them.
                     *
                     * If we've already decided that the index will be unsafe
                     * for old snapshots, we may as well stop indexing
                     * recently-dead tuples, since there's no longer any
                     * point.
                     */
                    if (HeapTupleIsHotUpdated(heapTuple))
                    {
                        indexIt = false;
                        /* mark the index as unsafe for old snapshots */
                        indexInfo->ii_BrokenHotChain = true;
                    }
                    else if (indexInfo->ii_BrokenHotChain)
                        indexIt = false;
                    else
                        indexIt = true;
                    /* In any case, exclude the tuple from unique-checking */
                    tupleIsAlive = false;
                    break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:

                    /*
                     * Since caller should hold ShareLock or better, we should
                     * not see any tuples inserted by open transactions ---
                     * unless it's our own transaction.  (Consider INSERT
                     * followed by CREATE INDEX within a transaction.)  An
                     * exception occurs when reindexing a system catalog,
                     * because we often release lock on system catalogs before
                     * committing.  In that case we wait for the inserting
                     * transaction to finish and check again.  (We could do
                     * that on user tables too, but since the case is not
                     * expected it seems better to throw an error.)
                     */
                    if (!TransactionIdIsCurrentTransactionId(
                                HeapTupleHeaderGetXmin(heapTuple->t_data)))
                    {
                        if (!IsSystemRelation(heapRelation))
                            elog(ERROR, "concurrent insert in progress");
                        else
                        {
                            /*
                             * Must drop the lock on the buffer before we wait
                             */
                            TransactionId xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);

                            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
                            XactLockTableWait(xwait);
                            goto recheck;
                        }
                    }

                    /*
                     * We must index such tuples, since if the index build
                     * commits then they're good.
                     */
                    indexIt = true;
                    tupleIsAlive = true;
                    break;
                case HEAPTUPLE_DELETE_IN_PROGRESS:

                    /*
                     * Since caller should hold ShareLock or better, we should
                     * not see any tuples deleted by open transactions ---
                     * unless it's our own transaction.  (Consider DELETE
                     * followed by CREATE INDEX within a transaction.)  An
                     * exception occurs when reindexing a system catalog,
                     * because we often release lock on system catalogs before
                     * committing.  In that case we wait for the deleting
                     * transaction to finish and check again.  (We could do
                     * that on user tables too, but since the case is not
                     * expected it seems better to throw an error.)
                     */
                    Assert(!(heapTuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
                    if (!TransactionIdIsCurrentTransactionId(
                                HeapTupleHeaderGetXmax(heapTuple->t_data)))
                    {
                        if (!IsSystemRelation(heapRelation))
                            elog(ERROR, "concurrent delete in progress");
                        else
                        {
                            /*
                             * Must drop the lock on the buffer before we wait
                             */
                            TransactionId xwait = HeapTupleHeaderGetXmax(heapTuple->t_data);

                            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
                            XactLockTableWait(xwait);
                            goto recheck;
                        }
                    }

                    /*
                     * Otherwise, we have to treat these tuples just like
                     * RECENTLY_DELETED ones.
                     */
                    if (HeapTupleIsHotUpdated(heapTuple))
                    {
                        indexIt = false;
                        /* mark the index as unsafe for old snapshots */
                        indexInfo->ii_BrokenHotChain = true;
                    }
                    else if (indexInfo->ii_BrokenHotChain)
                        indexIt = false;
                    else
                        indexIt = true;
                    /* In any case, exclude the tuple from unique-checking */
                    tupleIsAlive = false;
                    break;
                default:
                    elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                    indexIt = tupleIsAlive = false;     /* keep compiler quiet */
                    break;
            }

            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

            if (!indexIt)
                continue;
        }
        else
        {
            /* heap_getnext did the time qual check */
            tupleIsAlive = true;
        }
        reltuples += 1;

        MemoryContextReset(econtext->ecxt_per_tuple_memory);

        /* Set up for predicate or expression evaluation */
        ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);

        /*
         * In a partial index, discard tuples that don't satisfy the
         * predicate.
         */
        if (predicate != NIL)
        {
            if (!ExecQual(predicate, econtext, false))
                continue;
        }

        /*
         * For the current heap tuple, extract all the attributes we use in
         * this index, and note which are null.  This also performs evaluation
         * of any expressions needed.
         */
        FormIndexDatum(indexInfo,
                       slot,
                       estate,
                       values,
                       isnull);

        /*
         * You'd think we should go ahead and build the index tuple here, but
         * some index AMs want to do further processing on the data first. So
         * pass the values[] and isnull[] arrays, instead.
         */

        if (HeapTupleIsHeapOnly(heapTuple))
        {
            /*
             * For a heap-only tuple, pretend its TID is that of the root. See
             * src/backend/access/heap/README.HOT for discussion.
             */
            HeapTupleData rootTuple;
            OffsetNumber offnum;

            rootTuple = *heapTuple;
            offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);

            Assert(OffsetNumberIsValid(root_offsets[offnum - 1]));

            ItemPointerSetOffsetNumber(&rootTuple.t_self,
                                       root_offsets[offnum - 1]);

            /* Call the AM's callback routine to process the tuple */
            callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive,
                     callback_state);
        }
        else
        {
            /* Call the AM's callback routine to process the tuple */
            callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
                     callback_state);
        }
    }

    heap_endscan(scan);

    /* we can now forget our snapshot, if set */
    if (indexInfo->ii_Concurrent)
        UnregisterSnapshot(snapshot);

    ExecDropSingleTupleTableSlot(slot);

    FreeExecutorState(estate);

    /* These may have been pointing to the now-gone estate */
    indexInfo->ii_ExpressionsState = NIL;
    indexInfo->ii_PredicateState = NIL;

    return reltuples;
}
/*
 * validate_index - support code for concurrent index builds
 *
 * We do a concurrent index build by first inserting the catalog entry for the
 * index via index_create(), marking it not indisready and not indisvalid.
 * Then we commit our transaction and start a new one, then we wait for all
 * transactions that could have been modifying the table to terminate.  Now
 * we know that any subsequently-started transactions will see the index and
 * honor its constraints on HOT updates; so while existing HOT-chains might
 * be broken with respect to the index, no currently live tuple will have an
 * incompatible HOT update done to it.  We now build the index normally via
 * index_build(), while holding a weak lock that allows concurrent
 * insert/update/delete.  Also, we index only tuples that are valid
 * as of the start of the scan (see IndexBuildHeapScan), whereas a normal
 * build takes care to include recently-dead tuples.  This is OK because
 * we won't mark the index valid until all transactions that might be able
 * to see those tuples are gone.  The reason for doing that is to avoid
 * bogus unique-index failures due to concurrent UPDATEs (we might see
 * different versions of the same row as being valid when we pass over them,
 * if we used HeapTupleSatisfiesVacuum).  This leaves us with an index that
 * does not contain any tuples added to the table while we built the index.
 *
 * Next, we mark the index "indisready" (but still not "indisvalid") and
 * commit the second transaction and start a third.  Again we wait for all
 * transactions that could have been modifying the table to terminate.  Now
 * we know that any subsequently-started transactions will see the index and
 * insert their new tuples into it.  We then take a new reference snapshot
 * which is passed to validate_index().  Any tuples that are valid according
 * to this snap, but are not in the index, must be added to the index.
 * (Any tuples committed live after the snap will be inserted into the
 * index by their originating transaction.  Any tuples committed dead before
 * the snap need not be indexed, because we will wait out all transactions
 * that might care about them before we mark the index valid.)
 *
 * validate_index() works by first gathering all the TIDs currently in the
 * index, using a bulkdelete callback that just stores the TIDs and doesn't
 * ever say "delete it".  (This should be faster than a plain indexscan;
 * also, not all index AMs support full-index indexscan.)  Then we sort the
 * TIDs, and finally scan the table doing a "merge join" against the TID list
 * to see which tuples are missing from the index.  Thus we will ensure that
 * all tuples valid according to the reference snapshot are in the index.
 *
 * Building a unique index this way is tricky: we might try to insert a
 * tuple that is already dead or is in process of being deleted, and we
 * mustn't have a uniqueness failure against an updated version of the same
 * row.  We could try to check the tuple to see if it's already dead and tell
 * index_insert() not to do the uniqueness check, but that still leaves us
 * with a race condition against an in-progress update.  To handle that,
 * we expect the index AM to recheck liveness of the to-be-inserted tuple
 * before it declares a uniqueness error.
 *
 * After completing validate_index(), we wait until all transactions that
 * were alive at the time of the reference snapshot are gone; this is
 * necessary to be sure there are none left with a serializable snapshot
 * older than the reference (and hence possibly able to see tuples we did
 * not index).  Then we mark the index "indisvalid" and commit.  Subsequent
 * transactions will be able to use it for queries.
 *
 * Doing two full table scans is a brute-force strategy.  We could try to be
 * cleverer, eg storing new tuples in a special area of the table (perhaps
 * making the table append-only by setting use_fsm).  However that would
 * add yet more locking issues.
 */
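/*
 * For orientation (illustrative pseudocode, condensed from the description
 * above): the caller, DefineIndex, roughly performs
 *
 *		index_create(..., concurrent = true);	-- catalog entry, not valid
 *		commit; wait out existing writers;		-- phase 1
 *		index_build(...);						-- build under weak lock
 *		mark indisready; commit; wait again;	-- phase 2
 *		snapshot = RegisterSnapshot(GetTransactionSnapshot());
 *		validate_index(heapId, indexId, snapshot);
 *		wait out snapshot holders; mark indisvalid; commit;
 */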
void
validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
{
    Relation    heapRelation,
                indexRelation;
    IndexInfo  *indexInfo;
    IndexVacuumInfo ivinfo;
    v_i_state   state;
    Oid         save_userid;
    bool        save_secdefcxt;

    /* Open and lock the parent heap relation */
    heapRelation = heap_open(heapId, ShareUpdateExclusiveLock);
    /* And the target index relation */
    indexRelation = index_open(indexId, RowExclusiveLock);

    /*
     * Fetch info needed for index_insert.  (You might think this should be
     * passed in from DefineIndex, but its copy is long gone due to having
     * been built in a previous transaction.)
     */
    indexInfo = BuildIndexInfo(indexRelation);

    /* mark build is concurrent just for consistency */
    indexInfo->ii_Concurrent = true;

    /*
     * Switch to the table owner's userid, so that any index functions are
     * run as that user.
     */
    GetUserIdAndContext(&save_userid, &save_secdefcxt);
    SetUserIdAndContext(heapRelation->rd_rel->relowner, true);

    /*
     * Scan the index and gather up all the TIDs into a tuplesort object.
     */
    ivinfo.index = indexRelation;
    ivinfo.vacuum_full = false;
    ivinfo.message_level = DEBUG2;
    ivinfo.num_heap_tuples = -1;
    ivinfo.strategy = NULL;

    state.tuplesort = tuplesort_begin_datum(TIDOID,
                                            TIDLessOperator, false,
                                            maintenance_work_mem,
                                            false);
    state.htups = state.itups = state.tups_inserted = 0;

    (void) index_bulk_delete(&ivinfo, NULL,
                             validate_index_callback, (void *) &state);

    /* Execute the sort */
    tuplesort_performsort(state.tuplesort);

    /*
     * Now scan the heap and "merge" it with the index
     */
    validate_index_heapscan(heapRelation,
                            indexRelation,
                            indexInfo,
                            snapshot,
                            &state);

    /* Done with tuplesort object */
    tuplesort_end(state.tuplesort);

    elog(DEBUG2,
         "validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples",
         state.htups, state.itups, state.tups_inserted);

    /* Restore userid */
    SetUserIdAndContext(save_userid, save_secdefcxt);

    /* Close rels, but keep locks */
    index_close(indexRelation, NoLock);
    heap_close(heapRelation, NoLock);
}
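/*
 * A minimal sketch (illustration only; the variable names here are assumed,
 * not part of this file) of the gather-sort-merge pattern validate_index
 * relies on: index TIDs are pushed into a datum tuplesort by the bulkdelete
 * callback, sorted, and then read back in TID order while the heap is
 * scanned.
 *
 *      Tuplesortstate *ts = tuplesort_begin_datum(TIDOID, TIDLessOperator,
 *                                                 false,
 *                                                 maintenance_work_mem,
 *                                                 false);
 *      tuplesort_putdatum(ts, PointerGetDatum(itemptr), false);  (per TID)
 *      tuplesort_performsort(ts);
 *      while (tuplesort_getdatum(ts, true, &val, &isnull))
 *          ... compare (ItemPointer) DatumGetPointer(val) to heap TIDs ...
 *      tuplesort_end(ts);
 */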
/*
 * validate_index_callback - bulkdelete callback to collect the index TIDs
 */
static bool
validate_index_callback(ItemPointer itemptr, void *opaque)
{
    v_i_state  *state = (v_i_state *) opaque;

    tuplesort_putdatum(state->tuplesort, PointerGetDatum(itemptr), false);
    state->itups += 1;
    return false;               /* never actually delete anything */
}
/*
 * validate_index_heapscan - second table scan for concurrent index build
 *
 * This has much code in common with IndexBuildHeapScan, but it's enough
 * different that it seems cleaner to have two routines not one.
 */
static void
validate_index_heapscan(Relation heapRelation,
                        Relation indexRelation,
                        IndexInfo *indexInfo,
                        Snapshot snapshot,
                        v_i_state *state)
{
    HeapScanDesc scan;
    HeapTuple   heapTuple;
    Datum       values[INDEX_MAX_KEYS];
    bool        isnull[INDEX_MAX_KEYS];
    List       *predicate;
    TupleTableSlot *slot;
    EState     *estate;
    ExprContext *econtext;
    BlockNumber root_blkno = InvalidBlockNumber;
    OffsetNumber root_offsets[MaxHeapTuplesPerPage];
    bool        in_index[MaxHeapTuplesPerPage];

    /* state variables for the merge */
    ItemPointer indexcursor = NULL;
    bool        tuplesort_empty = false;

    /*
     * sanity checks
     */
    Assert(OidIsValid(indexRelation->rd_rel->relam));

    /*
     * Need an EState for evaluation of index expressions and partial-index
     * predicates.  Also a slot to hold the current tuple.
     */
    estate = CreateExecutorState();
    econtext = GetPerTupleExprContext(estate);
    slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation));

    /* Arrange for econtext's scan tuple to be the tuple under test */
    econtext->ecxt_scantuple = slot;

    /* Set up execution state for predicate, if any. */
    predicate = (List *)
        ExecPrepareExpr((Expr *) indexInfo->ii_Predicate,
                        estate);

    /*
     * Prepare for scan of the base relation.  We need just those tuples
     * satisfying the passed-in reference snapshot.  We must disable syncscan
     * here, because it's critical that we read from block zero forward to
     * match the sorted TIDs.
     */
    scan = heap_beginscan_strat(heapRelation,   /* relation */
                                snapshot,       /* snapshot */
                                0,              /* number of keys */
                                NULL,           /* scan key */
                                true,           /* buffer access strategy OK */
                                false);         /* syncscan not OK */

    /*
     * Scan all tuples matching the snapshot.
     */
    while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        ItemPointer heapcursor = &heapTuple->t_self;
        ItemPointerData rootTuple;
        OffsetNumber root_offnum;

        CHECK_FOR_INTERRUPTS();

        state->htups += 1;

        /*
         * As commented in IndexBuildHeapScan, we should index heap-only
         * tuples under the TIDs of their root tuples; so when we advance onto
         * a new heap page, build a map of root item offsets on the page.
         *
         * This complicates merging against the tuplesort output: we will
         * visit the live tuples in order by their offsets, but the root
         * offsets that we need to compare against the index contents might be
         * ordered differently.  So we might have to "look back" within the
         * tuplesort output, but only within the current page.  We handle that
         * by keeping a bool array in_index[] showing all the
         * already-passed-over tuplesort output TIDs of the current page.  We
         * clear that array here, when advancing onto a new heap page.
         */
        if (scan->rs_cblock != root_blkno)
        {
            Page        page = BufferGetPage(scan->rs_cbuf);

            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
            heap_get_root_tuples(page, root_offsets);
            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

            memset(in_index, 0, sizeof(in_index));

            root_blkno = scan->rs_cblock;
        }

        /* Convert actual tuple TID to root TID */
        rootTuple = *heapcursor;
        root_offnum = ItemPointerGetOffsetNumber(heapcursor);

        if (HeapTupleIsHeapOnly(heapTuple))
        {
            root_offnum = root_offsets[root_offnum - 1];
            Assert(OffsetNumberIsValid(root_offnum));
            ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
        }

        /*
         * "merge" by skipping through the index tuples until we find or pass
         * the current root tuple.
         */
        while (!tuplesort_empty &&
               (!indexcursor ||
                ItemPointerCompare(indexcursor, &rootTuple) < 0))
        {
            Datum       ts_val;
            bool        ts_isnull;

            if (indexcursor)
            {
                /*
                 * Remember index items seen earlier on the current heap page
                 */
                if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
                    in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
                pfree(indexcursor);
            }

            tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
                                                  &ts_val, &ts_isnull);
            Assert(tuplesort_empty || !ts_isnull);
            indexcursor = (ItemPointer) DatumGetPointer(ts_val);
        }

        /*
         * If the tuplesort has overshot *and* we didn't see a match earlier,
         * then this tuple is missing from the index, so insert it.
         */
        if ((tuplesort_empty ||
             ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
            !in_index[root_offnum - 1])
        {
            MemoryContextReset(econtext->ecxt_per_tuple_memory);

            /* Set up for predicate or expression evaluation */
            ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);

            /*
             * In a partial index, discard tuples that don't satisfy the
             * predicate.
             */
            if (predicate != NIL)
            {
                if (!ExecQual(predicate, econtext, false))
                    continue;
            }

            /*
             * For the current heap tuple, extract all the attributes we use
             * in this index, and note which are null.  This also performs
             * evaluation of any expressions needed.
             */
            FormIndexDatum(indexInfo,
                           slot,
                           estate,
                           values,
                           isnull);

            /*
             * You'd think we should go ahead and build the index tuple here,
             * but some index AMs want to do further processing on the data
             * first.  So pass the values[] and isnull[] arrays, instead.
             */

            /*
             * If the tuple is already committed dead, you might think we
             * could suppress uniqueness checking, but this is no longer true
             * in the presence of HOT, because the insert is actually a proxy
             * for a uniqueness check on the whole HOT-chain.  That is, the
             * tuple we have here could be dead because it was already
             * HOT-updated, and if so the updating transaction will not have
             * thought it should insert index entries.  The index AM will
             * check the whole HOT-chain and correctly detect a conflict if
             * needed.
             */
            index_insert(indexRelation,
                         values,
                         isnull,
                         &rootTuple,
                         heapRelation,
                         indexInfo->ii_Unique);

            state->tups_inserted += 1;
        }
    }

    heap_endscan(scan);

    ExecDropSingleTupleTableSlot(slot);

    FreeExecutorState(estate);

    /* These may have been pointing to the now-gone estate */
    indexInfo->ii_ExpressionsState = NIL;
    indexInfo->ii_PredicateState = NIL;
}
/*
 * IndexGetRelation: given an index's relation OID, get the OID of the
 * relation it is an index on.  Uses the system cache.
 */
static Oid
IndexGetRelation(Oid indexId)
{
    HeapTuple   tuple;
    Form_pg_index index;
    Oid         result;

    tuple = SearchSysCache(INDEXRELID,
                           ObjectIdGetDatum(indexId),
                           0, 0, 0);
    if (!HeapTupleIsValid(tuple))
        elog(ERROR, "cache lookup failed for index %u", indexId);
    index = (Form_pg_index) GETSTRUCT(tuple);
    Assert(index->indexrelid == indexId);

    result = index->indrelid;
    ReleaseSysCache(tuple);
    return result;
}
/*
 * reindex_index - This routine is used to recreate a single index
 */
void
reindex_index(Oid indexId)
{
    Relation    iRel,
                heapRelation,
                pg_index;
    Oid         heapId;
    bool        inplace;
    HeapTuple   indexTuple;
    Form_pg_index indexForm;

    /*
     * Open and lock the parent heap relation.  ShareLock is sufficient since
     * we only need to be sure no schema or data changes are going on.
     */
    heapId = IndexGetRelation(indexId);
    heapRelation = heap_open(heapId, ShareLock);

    /*
     * Open the target index relation and get an exclusive lock on it, to
     * ensure that no one else is touching this particular index.
     */
    iRel = index_open(indexId, AccessExclusiveLock);

    /*
     * Don't allow reindex on temp tables of other backends ... their local
     * buffer manager is not going to cope.
     */
    if (isOtherTempNamespace(RelationGetNamespace(iRel)))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot reindex temporary tables of other sessions")));

    /*
     * Also check for active uses of the index in the current transaction;
     * we don't want to reindex underneath an open indexscan.
     */
    CheckTableNotInUse(iRel, "REINDEX INDEX");

    /*
     * If it's a shared index, we must do inplace processing (because we have
     * no way to update relfilenode in other databases).  Otherwise we can do
     * it the normal transaction-safe way.
     *
     * Since inplace processing isn't crash-safe, we only allow it in a
     * standalone backend.  (In the REINDEX TABLE and REINDEX DATABASE cases,
     * the caller should have detected this.)
     */
    inplace = iRel->rd_rel->relisshared;

    if (inplace && IsUnderPostmaster)
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 errmsg("shared index \"%s\" can only be reindexed in stand-alone mode",
                        RelationGetRelationName(iRel))));

    PG_TRY();
    {
        IndexInfo  *indexInfo;

        /* Suppress use of the target index while rebuilding it */
        SetReindexProcessing(heapId, indexId);

        /* Fetch info needed for index_build */
        indexInfo = BuildIndexInfo(iRel);

        if (inplace)
        {
            /*
             * Truncate the actual file (and discard buffers).  The indexam
             * is responsible for truncating the FSM, if applicable
             */
            RelationTruncate(iRel, 0);
        }
        else
        {
            /*
             * We'll build a new physical relation for the index.
             */
            setNewRelfilenode(iRel, InvalidTransactionId);
        }

        /* Initialize the index and rebuild */
        /* Note: we do not need to re-establish pkey setting */
        index_build(heapRelation, iRel, indexInfo, false);
    }
    PG_CATCH();
    {
        /* Make sure flag gets cleared on error exit */
        ResetReindexProcessing();
        PG_RE_THROW();
    }
    PG_END_TRY();
    ResetReindexProcessing();

    /*
     * If the index is marked invalid or not ready (ie, it's from a failed
     * CREATE INDEX CONCURRENTLY), we can now mark it valid.  This allows
     * REINDEX to be used to clean up in such cases.
     */
    pg_index = heap_open(IndexRelationId, RowExclusiveLock);

    indexTuple = SearchSysCacheCopy(INDEXRELID,
                                    ObjectIdGetDatum(indexId),
                                    0, 0, 0);
    if (!HeapTupleIsValid(indexTuple))
        elog(ERROR, "cache lookup failed for index %u", indexId);
    indexForm = (Form_pg_index) GETSTRUCT(indexTuple);

    if (!indexForm->indisvalid || !indexForm->indisready)
    {
        indexForm->indisvalid = true;
        indexForm->indisready = true;
        simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
        CatalogUpdateIndexes(pg_index, indexTuple);
    }

    heap_close(pg_index, RowExclusiveLock);

    /* Close rels, but keep locks */
    index_close(iRel, NoLock);
    heap_close(heapRelation, NoLock);
}
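/*
 * Illustrative sketch only (the variable name and calling context are
 * assumptions, not code from this file): a caller that rebuilds a single
 * index, as the REINDEX INDEX utility command ultimately does, ends up
 * doing roughly
 *
 *      reindex_index(indexOid);
 *      CommandCounterIncrement();
 *
 * where the CommandCounterIncrement() makes the rebuilt index visible to
 * later operations in the same transaction, just as the per-index loop in
 * reindex_relation() below does.
 */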
/*
 * reindex_relation - This routine is used to recreate all indexes
 * of a relation (and optionally its toast relation too, if any).
 *
 * Returns true if any indexes were rebuilt.  Note that a
 * CommandCounterIncrement will occur after each index rebuild.
 */
bool
reindex_relation(Oid relid, bool toast_too)
{
    Relation    rel;
    Oid         toast_relid;
    List       *indexIds,
               *doneIndexes;
    ListCell   *indexId;
    bool        is_pg_class;
    bool        result;

    /*
     * Open and lock the relation.  ShareLock is sufficient since we only need
     * to prevent schema and data changes in it.
     */
    rel = heap_open(relid, ShareLock);

    toast_relid = rel->rd_rel->reltoastrelid;

    /*
     * Get the list of index OIDs for this relation.  (We trust to the
     * relcache to get this with a sequential scan if ignoring system
     * indexes.)
     */
    indexIds = RelationGetIndexList(rel);

    /*
     * reindex_index will attempt to update the pg_class rows for the relation
     * and index.  If we are processing pg_class itself, we want to make sure
     * that the updates do not try to insert index entries into indexes we
     * have not processed yet.  (When we are trying to recover from corrupted
     * indexes, that could easily cause a crash.)  We can accomplish this
     * because CatalogUpdateIndexes will use the relcache's index list to know
     * which indexes to update.  We just force the index list to be only the
     * stuff we've processed.
     *
     * It is okay to not insert entries into the indexes we have not processed
     * yet because all of this is transaction-safe.  If we fail partway
     * through, the updated rows are dead and it doesn't matter whether they
     * have index entries.  Also, a new pg_class index will be created with an
     * entry for its own pg_class row because we do setNewRelfilenode() before
     * we do index_build().
     *
     * Note that we also clear pg_class's rd_oidindex until the loop is done,
     * so that that index can't be accessed either.  This means we cannot
     * safely generate new relation OIDs while in the loop; shouldn't be a
     * problem.
     */
    is_pg_class = (RelationGetRelid(rel) == RelationRelationId);

    /* Ensure rd_indexattr is valid; see comments for RelationSetIndexList */
    if (is_pg_class)
        (void) RelationGetIndexAttrBitmap(rel);

    /* Reindex all the indexes. */
    doneIndexes = NIL;
    foreach(indexId, indexIds)
    {
        Oid         indexOid = lfirst_oid(indexId);

        if (is_pg_class)
            RelationSetIndexList(rel, doneIndexes, InvalidOid);

        reindex_index(indexOid);

        CommandCounterIncrement();

        if (is_pg_class)
            doneIndexes = lappend_oid(doneIndexes, indexOid);
    }

    if (is_pg_class)
        RelationSetIndexList(rel, indexIds, ClassOidIndexId);

    /*
     * Close rel, but continue to hold the lock.
     */
    heap_close(rel, NoLock);

    result = (indexIds != NIL);

    /*
     * If the relation has a secondary toast rel, reindex that too while we
     * still hold the lock on the master table.
     */
    if (toast_too && OidIsValid(toast_relid))
        result |= reindex_relation(toast_relid, false);

    return result;
}
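/*
 * Usage sketch, for illustration only (table resolution and the surrounding
 * transaction handling are assumptions, not code from this file): rebuilding
 * every index of a table, as REINDEX TABLE does, effectively reduces to
 *
 *      reindex_relation(tableOid, true);
 *
 * which rebuilds each index of the table in turn and then recurses once,
 * with toast_too = false, to rebuild the indexes of its TOAST table, if any.
 */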