1 /*-------------------------------------------------------------------------
4 * the Postgres statistics generator
6 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/commands/analyze.c
13 *-------------------------------------------------------------------------
19 #include "access/detoast.h"
20 #include "access/genam.h"
21 #include "access/multixact.h"
22 #include "access/relation.h"
23 #include "access/table.h"
24 #include "access/tableam.h"
25 #include "access/transam.h"
26 #include "access/tupconvert.h"
27 #include "access/visibilitymap.h"
28 #include "access/xact.h"
29 #include "catalog/index.h"
30 #include "catalog/indexing.h"
31 #include "catalog/pg_inherits.h"
32 #include "commands/dbcommands.h"
33 #include "commands/progress.h"
34 #include "commands/tablecmds.h"
35 #include "commands/vacuum.h"
36 #include "common/pg_prng.h"
37 #include "executor/executor.h"
38 #include "foreign/fdwapi.h"
39 #include "miscadmin.h"
40 #include "nodes/nodeFuncs.h"
41 #include "parser/parse_oper.h"
42 #include "parser/parse_relation.h"
44 #include "postmaster/autovacuum.h"
45 #include "statistics/extended_stats_internal.h"
46 #include "statistics/statistics.h"
47 #include "storage/bufmgr.h"
48 #include "storage/procarray.h"
49 #include "utils/attoptcache.h"
50 #include "utils/datum.h"
51 #include "utils/guc.h"
52 #include "utils/lsyscache.h"
53 #include "utils/memutils.h"
54 #include "utils/pg_rusage.h"
55 #include "utils/sampling.h"
56 #include "utils/sortsupport.h"
57 #include "utils/spccache.h"
58 #include "utils/syscache.h"
59 #include "utils/timestamp.h"
62 /* Per-index data for ANALYZE */
63 typedef struct AnlIndexData
65 IndexInfo
*indexInfo
; /* BuildIndexInfo result */
66 double tupleFract
; /* fraction of rows for partial index */
67 VacAttrStats
**vacattrstats
; /* index attrs to analyze */
72 /* Default statistics target (GUC parameter) */
73 int default_statistics_target
= 100;
75 /* A few variables that don't seem worth passing around as parameters */
76 static MemoryContext anl_context
= NULL
;
77 static BufferAccessStrategy vac_strategy
;
80 static void do_analyze_rel(Relation onerel
,
81 VacuumParams
*params
, List
*va_cols
,
82 AcquireSampleRowsFunc acquirefunc
, BlockNumber relpages
,
83 bool inh
, bool in_outer_xact
, int elevel
);
84 static void compute_index_stats(Relation onerel
, double totalrows
,
85 AnlIndexData
*indexdata
, int nindexes
,
86 HeapTuple
*rows
, int numrows
,
87 MemoryContext col_context
);
88 static VacAttrStats
*examine_attribute(Relation onerel
, int attnum
,
90 static int acquire_sample_rows(Relation onerel
, int elevel
,
91 HeapTuple
*rows
, int targrows
,
92 double *totalrows
, double *totaldeadrows
);
93 static int compare_rows(const void *a
, const void *b
, void *arg
);
94 static int acquire_inherited_sample_rows(Relation onerel
, int elevel
,
95 HeapTuple
*rows
, int targrows
,
96 double *totalrows
, double *totaldeadrows
);
97 static void update_attstats(Oid relid
, bool inh
,
98 int natts
, VacAttrStats
**vacattrstats
);
99 static Datum
std_fetch_func(VacAttrStatsP stats
, int rownum
, bool *isNull
);
100 static Datum
ind_fetch_func(VacAttrStatsP stats
, int rownum
, bool *isNull
);
104 * analyze_rel() -- analyze one relation
106 * relid identifies the relation to analyze. If relation is supplied, use
107 * the name therein for reporting any failure to open/lock the rel; do not
108 * use it once we've successfully opened the rel, since it might be stale.
111 analyze_rel(Oid relid
, RangeVar
*relation
,
112 VacuumParams
*params
, List
*va_cols
, bool in_outer_xact
,
113 BufferAccessStrategy bstrategy
)
117 AcquireSampleRowsFunc acquirefunc
= NULL
;
118 BlockNumber relpages
= 0;
120 /* Select logging level */
121 if (params
->options
& VACOPT_VERBOSE
)
126 /* Set up static variables */
127 vac_strategy
= bstrategy
;
130 * Check for user-requested abort.
132 CHECK_FOR_INTERRUPTS();
135 * Open the relation, getting ShareUpdateExclusiveLock to ensure that two
136 * ANALYZEs don't run on it concurrently. (This also locks out a
137 * concurrent VACUUM, which doesn't matter much at the moment but might
138 * matter if we ever try to accumulate stats on dead tuples.) If the rel
139 * has been dropped since we last saw it, we don't need to process it.
141 * Make sure to generate only logs for ANALYZE in this case.
143 onerel
= vacuum_open_relation(relid
, relation
, params
->options
& ~(VACOPT_VACUUM
),
144 params
->log_min_duration
>= 0,
145 ShareUpdateExclusiveLock
);
147 /* leave if relation could not be opened or locked */
152 * Check if relation needs to be skipped based on privileges. This check
153 * happens also when building the relation list to analyze for a manual
154 * operation, and needs to be done additionally here as ANALYZE could
155 * happen across multiple transactions where privileges could have changed
156 * in-between. Make sure to generate only logs for ANALYZE in this case.
158 if (!vacuum_is_permitted_for_relation(RelationGetRelid(onerel
),
160 params
->options
& ~VACOPT_VACUUM
))
162 relation_close(onerel
, ShareUpdateExclusiveLock
);
167 * Silently ignore tables that are temp tables of other backends ---
168 * trying to analyze these is rather pointless, since their contents are
169 * probably not up-to-date on disk. (We don't throw a warning here; it
170 * would just lead to chatter during a database-wide ANALYZE.)
172 if (RELATION_IS_OTHER_TEMP(onerel
))
174 relation_close(onerel
, ShareUpdateExclusiveLock
);
179 * We can ANALYZE any table except pg_statistic. See update_attstats
181 if (RelationGetRelid(onerel
) == StatisticRelationId
)
183 relation_close(onerel
, ShareUpdateExclusiveLock
);
188 * Check that it's of an analyzable relkind, and set up appropriately.
190 if (onerel
->rd_rel
->relkind
== RELKIND_RELATION
||
191 onerel
->rd_rel
->relkind
== RELKIND_MATVIEW
)
193 /* Regular table, so we'll use the regular row acquisition function */
194 acquirefunc
= acquire_sample_rows
;
195 /* Also get regular table's size */
196 relpages
= RelationGetNumberOfBlocks(onerel
);
198 else if (onerel
->rd_rel
->relkind
== RELKIND_FOREIGN_TABLE
)
201 * For a foreign table, call the FDW's hook function to see whether it
204 FdwRoutine
*fdwroutine
;
207 fdwroutine
= GetFdwRoutineForRelation(onerel
, false);
209 if (fdwroutine
->AnalyzeForeignTable
!= NULL
)
210 ok
= fdwroutine
->AnalyzeForeignTable(onerel
,
217 (errmsg("skipping \"%s\" --- cannot analyze this foreign table",
218 RelationGetRelationName(onerel
))));
219 relation_close(onerel
, ShareUpdateExclusiveLock
);
223 else if (onerel
->rd_rel
->relkind
== RELKIND_PARTITIONED_TABLE
)
226 * For partitioned tables, we want to do the recursive ANALYZE below.
231 /* No need for a WARNING if we already complained during VACUUM */
232 if (!(params
->options
& VACOPT_VACUUM
))
234 (errmsg("skipping \"%s\" --- cannot analyze non-tables or special system tables",
235 RelationGetRelationName(onerel
))));
236 relation_close(onerel
, ShareUpdateExclusiveLock
);
241 * OK, let's do it. First, initialize progress reporting.
243 pgstat_progress_start_command(PROGRESS_COMMAND_ANALYZE
,
244 RelationGetRelid(onerel
));
247 * Do the normal non-recursive ANALYZE. We can skip this for partitioned
248 * tables, which don't contain any rows.
250 if (onerel
->rd_rel
->relkind
!= RELKIND_PARTITIONED_TABLE
)
251 do_analyze_rel(onerel
, params
, va_cols
, acquirefunc
,
252 relpages
, false, in_outer_xact
, elevel
);
255 * If there are child tables, do recursive ANALYZE.
257 if (onerel
->rd_rel
->relhassubclass
)
258 do_analyze_rel(onerel
, params
, va_cols
, acquirefunc
, relpages
,
259 true, in_outer_xact
, elevel
);
262 * Close source relation now, but keep lock so that no one deletes it
263 * before we commit. (If someone did, they'd fail to clean up the entries
264 * we made in pg_statistic. Also, releasing the lock before commit would
265 * expose us to concurrent-update failures in update_attstats.)
267 relation_close(onerel
, NoLock
);
269 pgstat_progress_end_command();
273 * do_analyze_rel() -- analyze one relation, recursively or not
275 * Note that "acquirefunc" is only relevant for the non-inherited case.
276 * For the inherited case, acquire_inherited_sample_rows() determines the
277 * appropriate acquirefunc for each child table.
280 do_analyze_rel(Relation onerel
, VacuumParams
*params
,
281 List
*va_cols
, AcquireSampleRowsFunc acquirefunc
,
282 BlockNumber relpages
, bool inh
, bool in_outer_xact
,
292 VacAttrStats
**vacattrstats
;
293 AnlIndexData
*indexdata
;
301 TimestampTz starttime
= 0;
302 MemoryContext caller_context
;
304 int save_sec_context
;
306 int64 AnalyzePageHit
= VacuumPageHit
;
307 int64 AnalyzePageMiss
= VacuumPageMiss
;
308 int64 AnalyzePageDirty
= VacuumPageDirty
;
309 PgStat_Counter startreadtime
= 0;
310 PgStat_Counter startwritetime
= 0;
314 (errmsg("analyzing \"%s.%s\" inheritance tree",
315 get_namespace_name(RelationGetNamespace(onerel
)),
316 RelationGetRelationName(onerel
))));
319 (errmsg("analyzing \"%s.%s\"",
320 get_namespace_name(RelationGetNamespace(onerel
)),
321 RelationGetRelationName(onerel
))));
324 * Set up a working context so that we can easily free whatever junk gets
327 anl_context
= AllocSetContextCreate(CurrentMemoryContext
,
329 ALLOCSET_DEFAULT_SIZES
);
330 caller_context
= MemoryContextSwitchTo(anl_context
);
333 * Switch to the table owner's userid, so that any index functions are run
334 * as that user. Also lock down security-restricted operations and
335 * arrange to make GUC variable changes local to this command.
337 GetUserIdAndSecContext(&save_userid
, &save_sec_context
);
338 SetUserIdAndSecContext(onerel
->rd_rel
->relowner
,
339 save_sec_context
| SECURITY_RESTRICTED_OPERATION
);
340 save_nestlevel
= NewGUCNestLevel();
341 RestrictSearchPath();
343 /* measure elapsed time iff autovacuum logging requires it */
344 if (AmAutoVacuumWorkerProcess() && params
->log_min_duration
>= 0)
348 startreadtime
= pgStatBlockReadTime
;
349 startwritetime
= pgStatBlockWriteTime
;
352 pg_rusage_init(&ru0
);
353 starttime
= GetCurrentTimestamp();
357 * Determine which columns to analyze
359 * Note that system attributes are never analyzed, so we just reject them
360 * at the lookup stage. We also reject duplicate column mentions. (We
361 * could alternatively ignore duplicates, but analyzing a column twice
362 * won't work; we'd end up making a conflicting update in pg_statistic.)
366 Bitmapset
*unique_cols
= NULL
;
369 vacattrstats
= (VacAttrStats
**) palloc(list_length(va_cols
) *
370 sizeof(VacAttrStats
*));
374 char *col
= strVal(lfirst(le
));
376 i
= attnameAttNum(onerel
, col
, false);
377 if (i
== InvalidAttrNumber
)
379 (errcode(ERRCODE_UNDEFINED_COLUMN
),
380 errmsg("column \"%s\" of relation \"%s\" does not exist",
381 col
, RelationGetRelationName(onerel
))));
382 if (bms_is_member(i
, unique_cols
))
384 (errcode(ERRCODE_DUPLICATE_COLUMN
),
385 errmsg("column \"%s\" of relation \"%s\" appears more than once",
386 col
, RelationGetRelationName(onerel
))));
387 unique_cols
= bms_add_member(unique_cols
, i
);
389 vacattrstats
[tcnt
] = examine_attribute(onerel
, i
, NULL
);
390 if (vacattrstats
[tcnt
] != NULL
)
397 attr_cnt
= onerel
->rd_att
->natts
;
398 vacattrstats
= (VacAttrStats
**)
399 palloc(attr_cnt
* sizeof(VacAttrStats
*));
401 for (i
= 1; i
<= attr_cnt
; i
++)
403 vacattrstats
[tcnt
] = examine_attribute(onerel
, i
, NULL
);
404 if (vacattrstats
[tcnt
] != NULL
)
411 * Open all indexes of the relation, and see if there are any analyzable
412 * columns in the indexes. We do not analyze index columns if there was
413 * an explicit column list in the ANALYZE command, however.
415 * If we are doing a recursive scan, we don't want to touch the parent's
416 * indexes at all. If we're processing a partitioned table, we need to
417 * know if there are any indexes, but we don't want to process them.
419 if (onerel
->rd_rel
->relkind
== RELKIND_PARTITIONED_TABLE
)
421 List
*idxs
= RelationGetIndexList(onerel
);
425 hasindex
= idxs
!= NIL
;
430 vac_open_indexes(onerel
, AccessShareLock
, &nindexes
, &Irel
);
431 hasindex
= nindexes
> 0;
442 indexdata
= (AnlIndexData
*) palloc0(nindexes
* sizeof(AnlIndexData
));
443 for (ind
= 0; ind
< nindexes
; ind
++)
445 AnlIndexData
*thisdata
= &indexdata
[ind
];
446 IndexInfo
*indexInfo
;
448 thisdata
->indexInfo
= indexInfo
= BuildIndexInfo(Irel
[ind
]);
449 thisdata
->tupleFract
= 1.0; /* fix later if partial */
450 if (indexInfo
->ii_Expressions
!= NIL
&& va_cols
== NIL
)
452 ListCell
*indexpr_item
= list_head(indexInfo
->ii_Expressions
);
454 thisdata
->vacattrstats
= (VacAttrStats
**)
455 palloc(indexInfo
->ii_NumIndexAttrs
* sizeof(VacAttrStats
*));
457 for (i
= 0; i
< indexInfo
->ii_NumIndexAttrs
; i
++)
459 int keycol
= indexInfo
->ii_IndexAttrNumbers
[i
];
463 /* Found an index expression */
466 if (indexpr_item
== NULL
) /* shouldn't happen */
467 elog(ERROR
, "too few entries in indexprs list");
468 indexkey
= (Node
*) lfirst(indexpr_item
);
469 indexpr_item
= lnext(indexInfo
->ii_Expressions
,
471 thisdata
->vacattrstats
[tcnt
] =
472 examine_attribute(Irel
[ind
], i
+ 1, indexkey
);
473 if (thisdata
->vacattrstats
[tcnt
] != NULL
)
477 thisdata
->attr_cnt
= tcnt
;
483 * Determine how many rows we need to sample, using the worst case from
484 * all analyzable columns. We use a lower bound of 100 rows to avoid
485 * possible overflow in Vitter's algorithm. (Note: that will also be the
486 * target in the corner case where there are no analyzable columns.)
489 for (i
= 0; i
< attr_cnt
; i
++)
491 if (targrows
< vacattrstats
[i
]->minrows
)
492 targrows
= vacattrstats
[i
]->minrows
;
494 for (ind
= 0; ind
< nindexes
; ind
++)
496 AnlIndexData
*thisdata
= &indexdata
[ind
];
498 for (i
= 0; i
< thisdata
->attr_cnt
; i
++)
500 if (targrows
< thisdata
->vacattrstats
[i
]->minrows
)
501 targrows
= thisdata
->vacattrstats
[i
]->minrows
;
506 * Look at extended statistics objects too, as those may define custom
507 * statistics target. So we may need to sample more rows and then build
508 * the statistics with enough detail.
510 minrows
= ComputeExtStatisticsRows(onerel
, attr_cnt
, vacattrstats
);
512 if (targrows
< minrows
)
516 * Acquire the sample rows
518 rows
= (HeapTuple
*) palloc(targrows
* sizeof(HeapTuple
));
519 pgstat_progress_update_param(PROGRESS_ANALYZE_PHASE
,
520 inh
? PROGRESS_ANALYZE_PHASE_ACQUIRE_SAMPLE_ROWS_INH
:
521 PROGRESS_ANALYZE_PHASE_ACQUIRE_SAMPLE_ROWS
);
523 numrows
= acquire_inherited_sample_rows(onerel
, elevel
,
525 &totalrows
, &totaldeadrows
);
527 numrows
= (*acquirefunc
) (onerel
, elevel
,
529 &totalrows
, &totaldeadrows
);
532 * Compute the statistics. Temporary results during the calculations for
533 * each column are stored in a child context. The calc routines are
534 * responsible to make sure that whatever they store into the VacAttrStats
535 * structure is allocated in anl_context.
539 MemoryContext col_context
,
542 pgstat_progress_update_param(PROGRESS_ANALYZE_PHASE
,
543 PROGRESS_ANALYZE_PHASE_COMPUTE_STATS
);
545 col_context
= AllocSetContextCreate(anl_context
,
547 ALLOCSET_DEFAULT_SIZES
);
548 old_context
= MemoryContextSwitchTo(col_context
);
550 for (i
= 0; i
< attr_cnt
; i
++)
552 VacAttrStats
*stats
= vacattrstats
[i
];
556 stats
->tupDesc
= onerel
->rd_att
;
557 stats
->compute_stats(stats
,
563 * If the appropriate flavor of the n_distinct option is
564 * specified, override with the corresponding value.
566 aopt
= get_attribute_options(onerel
->rd_id
, stats
->tupattnum
);
571 n_distinct
= inh
? aopt
->n_distinct_inherited
: aopt
->n_distinct
;
572 if (n_distinct
!= 0.0)
573 stats
->stadistinct
= n_distinct
;
576 MemoryContextReset(col_context
);
580 compute_index_stats(onerel
, totalrows
,
585 MemoryContextSwitchTo(old_context
);
586 MemoryContextDelete(col_context
);
589 * Emit the completed stats rows into pg_statistic, replacing any
590 * previous statistics for the target columns. (If there are stats in
591 * pg_statistic for columns we didn't process, we leave them alone.)
593 update_attstats(RelationGetRelid(onerel
), inh
,
594 attr_cnt
, vacattrstats
);
596 for (ind
= 0; ind
< nindexes
; ind
++)
598 AnlIndexData
*thisdata
= &indexdata
[ind
];
600 update_attstats(RelationGetRelid(Irel
[ind
]), false,
601 thisdata
->attr_cnt
, thisdata
->vacattrstats
);
604 /* Build extended statistics (if there are any). */
605 BuildRelationExtStatistics(onerel
, inh
, totalrows
, numrows
, rows
,
606 attr_cnt
, vacattrstats
);
609 pgstat_progress_update_param(PROGRESS_ANALYZE_PHASE
,
610 PROGRESS_ANALYZE_PHASE_FINALIZE_ANALYZE
);
613 * Update pages/tuples stats in pg_class ... but not if we're doing
616 * We assume that VACUUM hasn't set pg_class.reltuples already, even
617 * during a VACUUM ANALYZE. Although VACUUM often updates pg_class,
618 * exceptions exist. A "VACUUM (ANALYZE, INDEX_CLEANUP OFF)" command will
619 * never update pg_class entries for index relations. It's also possible
620 * that an individual index's pg_class entry won't be updated during
621 * VACUUM if the index AM returns NULL from its amvacuumcleanup() routine.
625 BlockNumber relallvisible
;
627 if (RELKIND_HAS_STORAGE(onerel
->rd_rel
->relkind
))
628 visibilitymap_count(onerel
, &relallvisible
, NULL
);
632 /* Update pg_class for table relation */
633 vac_update_relstats(onerel
,
638 InvalidTransactionId
,
643 /* Same for indexes */
644 for (ind
= 0; ind
< nindexes
; ind
++)
646 AnlIndexData
*thisdata
= &indexdata
[ind
];
647 double totalindexrows
;
649 totalindexrows
= ceil(thisdata
->tupleFract
* totalrows
);
650 vac_update_relstats(Irel
[ind
],
651 RelationGetNumberOfBlocks(Irel
[ind
]),
655 InvalidTransactionId
,
661 else if (onerel
->rd_rel
->relkind
== RELKIND_PARTITIONED_TABLE
)
664 * Partitioned tables don't have storage, so we don't set any fields
665 * in their pg_class entries except for reltuples and relhasindex.
667 vac_update_relstats(onerel
, -1, totalrows
,
668 0, hasindex
, InvalidTransactionId
,
675 * Now report ANALYZE to the cumulative stats system. For regular tables,
676 * we do it only if not doing inherited stats. For partitioned tables, we
677 * only do it for inherited stats. (We're never called for not-inherited
678 * stats on partitioned tables anyway.)
680 * Reset the changes_since_analyze counter only if we analyzed all
681 * columns; otherwise, there is still work for auto-analyze to do.
684 pgstat_report_analyze(onerel
, totalrows
, totaldeadrows
,
686 else if (onerel
->rd_rel
->relkind
== RELKIND_PARTITIONED_TABLE
)
687 pgstat_report_analyze(onerel
, 0, 0, (va_cols
== NIL
));
690 * If this isn't part of VACUUM ANALYZE, let index AMs do cleanup.
692 * Note that most index AMs perform a no-op as a matter of policy for
693 * amvacuumcleanup() when called in ANALYZE-only mode. The only exception
694 * among core index AMs is GIN/ginvacuumcleanup().
696 if (!(params
->options
& VACOPT_VACUUM
))
698 for (ind
= 0; ind
< nindexes
; ind
++)
700 IndexBulkDeleteResult
*stats
;
701 IndexVacuumInfo ivinfo
;
703 ivinfo
.index
= Irel
[ind
];
704 ivinfo
.heaprel
= onerel
;
705 ivinfo
.analyze_only
= true;
706 ivinfo
.estimated_count
= true;
707 ivinfo
.message_level
= elevel
;
708 ivinfo
.num_heap_tuples
= onerel
->rd_rel
->reltuples
;
709 ivinfo
.strategy
= vac_strategy
;
711 stats
= index_vacuum_cleanup(&ivinfo
, NULL
);
718 /* Done with indexes */
719 vac_close_indexes(nindexes
, Irel
, NoLock
);
721 /* Log the action if appropriate */
722 if (AmAutoVacuumWorkerProcess() && params
->log_min_duration
>= 0)
724 TimestampTz endtime
= GetCurrentTimestamp();
726 if (params
->log_min_duration
== 0 ||
727 TimestampDifferenceExceeds(starttime
, endtime
,
728 params
->log_min_duration
))
731 double read_rate
= 0;
732 double write_rate
= 0;
736 * Calculate the difference in the Page Hit/Miss/Dirty that
737 * happened as part of the analyze by subtracting out the
738 * pre-analyze values which we saved above.
740 AnalyzePageHit
= VacuumPageHit
- AnalyzePageHit
;
741 AnalyzePageMiss
= VacuumPageMiss
- AnalyzePageMiss
;
742 AnalyzePageDirty
= VacuumPageDirty
- AnalyzePageDirty
;
745 * We do not expect an analyze to take > 25 days and it simplifies
746 * things a bit to use TimestampDifferenceMilliseconds.
748 delay_in_ms
= TimestampDifferenceMilliseconds(starttime
, endtime
);
751 * Note that we are reporting these read/write rates in the same
752 * manner as VACUUM does, which means that while the 'average read
753 * rate' here actually corresponds to page misses and resulting
754 * reads which are also picked up by track_io_timing, if enabled,
755 * the 'average write rate' is actually talking about the rate of
756 * pages being dirtied, not being written out, so it's typical to
757 * have a non-zero 'avg write rate' while I/O timings only reports
760 * It's not clear that an ANALYZE will ever result in
761 * FlushBuffer() being called, but we track and support reporting
762 * on I/O write time in case that changes as it's practically free
768 read_rate
= (double) BLCKSZ
* AnalyzePageMiss
/ (1024 * 1024) /
769 (delay_in_ms
/ 1000.0);
770 write_rate
= (double) BLCKSZ
* AnalyzePageDirty
/ (1024 * 1024) /
771 (delay_in_ms
/ 1000.0);
775 * We split this up so we don't emit empty I/O timing values when
776 * track_io_timing isn't enabled.
779 initStringInfo(&buf
);
780 appendStringInfo(&buf
, _("automatic analyze of table \"%s.%s.%s\"\n"),
781 get_database_name(MyDatabaseId
),
782 get_namespace_name(RelationGetNamespace(onerel
)),
783 RelationGetRelationName(onerel
));
786 double read_ms
= (double) (pgStatBlockReadTime
- startreadtime
) / 1000;
787 double write_ms
= (double) (pgStatBlockWriteTime
- startwritetime
) / 1000;
789 appendStringInfo(&buf
, _("I/O timings: read: %.3f ms, write: %.3f ms\n"),
792 appendStringInfo(&buf
, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
793 read_rate
, write_rate
);
794 appendStringInfo(&buf
, _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"),
795 (long long) AnalyzePageHit
,
796 (long long) AnalyzePageMiss
,
797 (long long) AnalyzePageDirty
);
798 appendStringInfo(&buf
, _("system usage: %s"), pg_rusage_show(&ru0
));
801 (errmsg_internal("%s", buf
.data
)));
807 /* Roll back any GUC changes executed by index functions */
808 AtEOXact_GUC(false, save_nestlevel
);
810 /* Restore userid and security context */
811 SetUserIdAndSecContext(save_userid
, save_sec_context
);
813 /* Restore current context and release memory */
814 MemoryContextSwitchTo(caller_context
);
815 MemoryContextDelete(anl_context
);
820 * Compute statistics about indexes of a relation
823 compute_index_stats(Relation onerel
, double totalrows
,
824 AnlIndexData
*indexdata
, int nindexes
,
825 HeapTuple
*rows
, int numrows
,
826 MemoryContext col_context
)
828 MemoryContext ind_context
,
830 Datum values
[INDEX_MAX_KEYS
];
831 bool isnull
[INDEX_MAX_KEYS
];
835 ind_context
= AllocSetContextCreate(anl_context
,
837 ALLOCSET_DEFAULT_SIZES
);
838 old_context
= MemoryContextSwitchTo(ind_context
);
840 for (ind
= 0; ind
< nindexes
; ind
++)
842 AnlIndexData
*thisdata
= &indexdata
[ind
];
843 IndexInfo
*indexInfo
= thisdata
->indexInfo
;
844 int attr_cnt
= thisdata
->attr_cnt
;
845 TupleTableSlot
*slot
;
847 ExprContext
*econtext
;
848 ExprState
*predicate
;
854 double totalindexrows
;
856 /* Ignore index if no columns to analyze and not partial */
857 if (attr_cnt
== 0 && indexInfo
->ii_Predicate
== NIL
)
861 * Need an EState for evaluation of index expressions and
862 * partial-index predicates. Create it in the per-index context to be
863 * sure it gets cleaned up at the bottom of the loop.
865 estate
= CreateExecutorState();
866 econtext
= GetPerTupleExprContext(estate
);
867 /* Need a slot to hold the current heap tuple, too */
868 slot
= MakeSingleTupleTableSlot(RelationGetDescr(onerel
),
871 /* Arrange for econtext's scan tuple to be the tuple under test */
872 econtext
->ecxt_scantuple
= slot
;
874 /* Set up execution state for predicate. */
875 predicate
= ExecPrepareQual(indexInfo
->ii_Predicate
, estate
);
877 /* Compute and save index expression values */
878 exprvals
= (Datum
*) palloc(numrows
* attr_cnt
* sizeof(Datum
));
879 exprnulls
= (bool *) palloc(numrows
* attr_cnt
* sizeof(bool));
882 for (rowno
= 0; rowno
< numrows
; rowno
++)
884 HeapTuple heapTuple
= rows
[rowno
];
886 vacuum_delay_point();
889 * Reset the per-tuple context each time, to reclaim any cruft
890 * left behind by evaluating the predicate or index expressions.
892 ResetExprContext(econtext
);
894 /* Set up for predicate or expression evaluation */
895 ExecStoreHeapTuple(heapTuple
, slot
, false);
897 /* If index is partial, check predicate */
898 if (predicate
!= NULL
)
900 if (!ExecQual(predicate
, econtext
))
908 * Evaluate the index row to compute expression values. We
909 * could do this by hand, but FormIndexDatum is convenient.
911 FormIndexDatum(indexInfo
,
918 * Save just the columns we care about. We copy the values
919 * into ind_context from the estate's per-tuple context.
921 for (i
= 0; i
< attr_cnt
; i
++)
923 VacAttrStats
*stats
= thisdata
->vacattrstats
[i
];
924 int attnum
= stats
->tupattnum
;
926 if (isnull
[attnum
- 1])
928 exprvals
[tcnt
] = (Datum
) 0;
929 exprnulls
[tcnt
] = true;
933 exprvals
[tcnt
] = datumCopy(values
[attnum
- 1],
934 stats
->attrtype
->typbyval
,
935 stats
->attrtype
->typlen
);
936 exprnulls
[tcnt
] = false;
944 * Having counted the number of rows that pass the predicate in the
945 * sample, we can estimate the total number of rows in the index.
947 thisdata
->tupleFract
= (double) numindexrows
/ (double) numrows
;
948 totalindexrows
= ceil(thisdata
->tupleFract
* totalrows
);
951 * Now we can compute the statistics for the expression columns.
953 if (numindexrows
> 0)
955 MemoryContextSwitchTo(col_context
);
956 for (i
= 0; i
< attr_cnt
; i
++)
958 VacAttrStats
*stats
= thisdata
->vacattrstats
[i
];
960 stats
->exprvals
= exprvals
+ i
;
961 stats
->exprnulls
= exprnulls
+ i
;
962 stats
->rowstride
= attr_cnt
;
963 stats
->compute_stats(stats
,
968 MemoryContextReset(col_context
);
973 MemoryContextSwitchTo(ind_context
);
975 ExecDropSingleTupleTableSlot(slot
);
976 FreeExecutorState(estate
);
977 MemoryContextReset(ind_context
);
980 MemoryContextSwitchTo(old_context
);
981 MemoryContextDelete(ind_context
);
985 * examine_attribute -- pre-analysis of a single column
987 * Determine whether the column is analyzable; if so, create and initialize
988 * a VacAttrStats struct for it. If not, return NULL.
990 * If index_expr isn't NULL, then we're trying to analyze an expression index,
991 * and index_expr is the expression tree representing the column's data.
993 static VacAttrStats
*
994 examine_attribute(Relation onerel
, int attnum
, Node
*index_expr
)
996 Form_pg_attribute attr
= TupleDescAttr(onerel
->rd_att
, attnum
- 1);
1002 VacAttrStats
*stats
;
1006 /* Never analyze dropped columns */
1007 if (attr
->attisdropped
)
1011 * Get attstattarget value. Set to -1 if null. (Analyze functions expect
1012 * -1 to mean use default_statistics_target; see for example
1015 atttuple
= SearchSysCache2(ATTNUM
, ObjectIdGetDatum(RelationGetRelid(onerel
)), Int16GetDatum(attnum
));
1016 if (!HeapTupleIsValid(atttuple
))
1017 elog(ERROR
, "cache lookup failed for attribute %d of relation %u",
1018 attnum
, RelationGetRelid(onerel
));
1019 dat
= SysCacheGetAttr(ATTNUM
, atttuple
, Anum_pg_attribute_attstattarget
, &isnull
);
1020 attstattarget
= isnull
? -1 : DatumGetInt16(dat
);
1021 ReleaseSysCache(atttuple
);
1023 /* Don't analyze column if user has specified not to */
1024 if (attstattarget
== 0)
1028 * Create the VacAttrStats struct.
1030 stats
= (VacAttrStats
*) palloc0(sizeof(VacAttrStats
));
1031 stats
->attstattarget
= attstattarget
;
1034 * When analyzing an expression index, believe the expression tree's type
1035 * not the column datatype --- the latter might be the opckeytype storage
1036 * type of the opclass, which is not interesting for our purposes. (Note:
1037 * if we did anything with non-expression index columns, we'd need to
1038 * figure out where to get the correct type info from, but for now that's
1039 * not a problem.) It's not clear whether anyone will care about the
1040 * typmod, but we store that too just in case.
1044 stats
->attrtypid
= exprType(index_expr
);
1045 stats
->attrtypmod
= exprTypmod(index_expr
);
1048 * If a collation has been specified for the index column, use that in
1049 * preference to anything else; but if not, fall back to whatever we
1050 * can get from the expression.
1052 if (OidIsValid(onerel
->rd_indcollation
[attnum
- 1]))
1053 stats
->attrcollid
= onerel
->rd_indcollation
[attnum
- 1];
1055 stats
->attrcollid
= exprCollation(index_expr
);
1059 stats
->attrtypid
= attr
->atttypid
;
1060 stats
->attrtypmod
= attr
->atttypmod
;
1061 stats
->attrcollid
= attr
->attcollation
;
1064 typtuple
= SearchSysCacheCopy1(TYPEOID
,
1065 ObjectIdGetDatum(stats
->attrtypid
));
1066 if (!HeapTupleIsValid(typtuple
))
1067 elog(ERROR
, "cache lookup failed for type %u", stats
->attrtypid
);
1068 stats
->attrtype
= (Form_pg_type
) GETSTRUCT(typtuple
);
1069 stats
->anl_context
= anl_context
;
1070 stats
->tupattnum
= attnum
;
1073 * The fields describing the stats->stavalues[n] element types default to
1074 * the type of the data being analyzed, but the type-specific typanalyze
1075 * function can change them if it wants to store something else.
1077 for (i
= 0; i
< STATISTIC_NUM_SLOTS
; i
++)
1079 stats
->statypid
[i
] = stats
->attrtypid
;
1080 stats
->statyplen
[i
] = stats
->attrtype
->typlen
;
1081 stats
->statypbyval
[i
] = stats
->attrtype
->typbyval
;
1082 stats
->statypalign
[i
] = stats
->attrtype
->typalign
;
1086 * Call the type-specific typanalyze function. If none is specified, use
1089 if (OidIsValid(stats
->attrtype
->typanalyze
))
1090 ok
= DatumGetBool(OidFunctionCall1(stats
->attrtype
->typanalyze
,
1091 PointerGetDatum(stats
)));
1093 ok
= std_typanalyze(stats
);
1095 if (!ok
|| stats
->compute_stats
== NULL
|| stats
->minrows
<= 0)
1097 heap_freetuple(typtuple
);
1106 * Read stream callback returning the next BlockNumber as chosen by the
1107 * BlockSampling algorithm.
1110 block_sampling_read_stream_next(ReadStream
*stream
,
1111 void *callback_private_data
,
1112 void *per_buffer_data
)
1114 BlockSamplerData
*bs
= callback_private_data
;
1116 return BlockSampler_HasMore(bs
) ? BlockSampler_Next(bs
) : InvalidBlockNumber
;
1120 * acquire_sample_rows -- acquire a random sample of rows from the table
1122 * Selected rows are returned in the caller-allocated array rows[], which
1123 * must have at least targrows entries.
1124 * The actual number of rows selected is returned as the function result.
1125 * We also estimate the total numbers of live and dead rows in the table,
1126 * and return them into *totalrows and *totaldeadrows, respectively.
1128 * The returned list of tuples is in order by physical position in the table.
1129 * (We will rely on this later to derive correlation estimates.)
1131 * As of May 2004 we use a new two-stage method: Stage one selects up
1132 * to targrows random blocks (or all blocks, if there aren't so many).
1133 * Stage two scans these blocks and uses the Vitter algorithm to create
1134 * a random sample of targrows rows (or less, if there are less in the
1135 * sample of blocks). The two stages are executed simultaneously: each
1136 * block is processed as soon as stage one returns its number and while
1137 * the rows are read stage two controls which ones are to be inserted
1140 * Although every row has an equal chance of ending up in the final
1141 * sample, this sampling method is not perfect: not every possible
1142 * sample has an equal chance of being selected. For large relations
1143 * the number of different blocks represented by the sample tends to be
1144 * too small. We can live with that for now. Improvements are welcome.
1146 * An important property of this sampling method is that because we do
1147 * look at a statistically unbiased set of blocks, we should get
1148 * unbiased estimates of the average numbers of live and dead rows per
1149 * block. The previous sampling method put too much credence in the row
1150 * density near the start of the table.
1153 acquire_sample_rows(Relation onerel
, int elevel
,
1154 HeapTuple
*rows
, int targrows
,
1155 double *totalrows
, double *totaldeadrows
)
1157 int numrows
= 0; /* # rows now in reservoir */
1158 double samplerows
= 0; /* total # rows collected */
1159 double liverows
= 0; /* # live rows seen */
1160 double deadrows
= 0; /* # dead rows seen */
1161 double rowstoskip
= -1; /* -1 means not set yet */
1162 uint32 randseed
; /* Seed for block sampler(s) */
1163 BlockNumber totalblocks
;
1164 TransactionId OldestXmin
;
1165 BlockSamplerData bs
;
1166 ReservoirStateData rstate
;
1167 TupleTableSlot
*slot
;
1169 BlockNumber nblocks
;
1170 BlockNumber blksdone
= 0;
1173 Assert(targrows
> 0);
1175 totalblocks
= RelationGetNumberOfBlocks(onerel
);
1177 /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
1178 OldestXmin
= GetOldestNonRemovableTransactionId(onerel
);
1180 /* Prepare for sampling block numbers */
1181 randseed
= pg_prng_uint32(&pg_global_prng_state
);
1182 nblocks
= BlockSampler_Init(&bs
, totalblocks
, targrows
, randseed
);
1184 /* Report sampling block numbers */
1185 pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_TOTAL
,
1188 /* Prepare for sampling rows */
1189 reservoir_init_selection_state(&rstate
, targrows
);
1191 scan
= table_beginscan_analyze(onerel
);
1192 slot
= table_slot_create(onerel
, NULL
);
1194 stream
= read_stream_begin_relation(READ_STREAM_MAINTENANCE
,
1198 block_sampling_read_stream_next
,
1202 /* Outer loop over blocks to sample */
1203 while (table_scan_analyze_next_block(scan
, stream
))
1205 vacuum_delay_point();
1207 while (table_scan_analyze_next_tuple(scan
, OldestXmin
, &liverows
, &deadrows
, slot
))
1210 * The first targrows sample rows are simply copied into the
1211 * reservoir. Then we start replacing tuples in the sample until
1212 * we reach the end of the relation. This algorithm is from Jeff
1213 * Vitter's paper (see full citation in utils/misc/sampling.c). It
1214 * works by repeatedly computing the number of tuples to skip
1215 * before selecting a tuple, which replaces a randomly chosen
1216 * element of the reservoir (current set of tuples). At all times
1217 * the reservoir is a true random sample of the tuples we've
1218 * passed over so far, so when we fall off the end of the relation
1221 if (numrows
< targrows
)
1222 rows
[numrows
++] = ExecCopySlotHeapTuple(slot
);
1226 * t in Vitter's paper is the number of records already
1227 * processed. If we need to compute a new S value, we must
1228 * use the not-yet-incremented value of samplerows as t.
1231 rowstoskip
= reservoir_get_next_S(&rstate
, samplerows
, targrows
);
1233 if (rowstoskip
<= 0)
1236 * Found a suitable tuple, so save it, replacing one old
1239 int k
= (int) (targrows
* sampler_random_fract(&rstate
.randstate
));
1241 Assert(k
>= 0 && k
< targrows
);
1242 heap_freetuple(rows
[k
]);
1243 rows
[k
] = ExecCopySlotHeapTuple(slot
);
1252 pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_DONE
,
1256 read_stream_end(stream
);
1258 ExecDropSingleTupleTableSlot(slot
);
1259 table_endscan(scan
);
1262 * If we didn't find as many tuples as we wanted then we're done. No sort
1263 * is needed, since they're already in order.
1265 * Otherwise we need to sort the collected tuples by position
1266 * (itempointer). It's not worth worrying about corner cases where the
1267 * tuples are already sorted.
1269 if (numrows
== targrows
)
1270 qsort_interruptible(rows
, numrows
, sizeof(HeapTuple
),
1271 compare_rows
, NULL
);
1274 * Estimate total numbers of live and dead rows in relation, extrapolating
1275 * on the assumption that the average tuple density in pages we didn't
1276 * scan is the same as in the pages we did scan. Since what we scanned is
1277 * a random sample of the pages in the relation, this should be a good
1282 *totalrows
= floor((liverows
/ bs
.m
) * totalblocks
+ 0.5);
1283 *totaldeadrows
= floor((deadrows
/ bs
.m
) * totalblocks
+ 0.5);
1288 *totaldeadrows
= 0.0;
1292 * Emit some interesting relation info
1295 (errmsg("\"%s\": scanned %d of %u pages, "
1296 "containing %.0f live rows and %.0f dead rows; "
1297 "%d rows in sample, %.0f estimated total rows",
1298 RelationGetRelationName(onerel
),
1301 numrows
, *totalrows
)));
1307 * Comparator for sorting rows[] array
1310 compare_rows(const void *a
, const void *b
, void *arg
)
1312 HeapTuple ha
= *(const HeapTuple
*) a
;
1313 HeapTuple hb
= *(const HeapTuple
*) b
;
1314 BlockNumber ba
= ItemPointerGetBlockNumber(&ha
->t_self
);
1315 OffsetNumber oa
= ItemPointerGetOffsetNumber(&ha
->t_self
);
1316 BlockNumber bb
= ItemPointerGetBlockNumber(&hb
->t_self
);
1317 OffsetNumber ob
= ItemPointerGetOffsetNumber(&hb
->t_self
);
1332 * acquire_inherited_sample_rows -- acquire sample rows from inheritance tree
1334 * This has the same API as acquire_sample_rows, except that rows are
1335 * collected from all inheritance children as well as the specified table.
1336 * We fail and return zero if there are no inheritance children, or if all
1337 * children are foreign tables that don't support ANALYZE.
1340 acquire_inherited_sample_rows(Relation onerel
, int elevel
,
1341 HeapTuple
*rows
, int targrows
,
1342 double *totalrows
, double *totaldeadrows
)
1346 AcquireSampleRowsFunc
*acquirefuncs
;
1355 /* Initialize output parameters to zero now, in case we exit early */
1360 * Find all members of inheritance set. We only need AccessShareLock on
1364 find_all_inheritors(RelationGetRelid(onerel
), AccessShareLock
, NULL
);
1367 * Check that there's at least one descendant, else fail. This could
1368 * happen despite analyze_rel's relhassubclass check, if table once had a
1369 * child but no longer does. In that case, we can clear the
1370 * relhassubclass field so as not to make the same mistake again later.
1371 * (This is safe because we hold ShareUpdateExclusiveLock.)
1373 if (list_length(tableOIDs
) < 2)
1375 /* CCI because we already updated the pg_class row in this command */
1376 CommandCounterIncrement();
1377 SetRelationHasSubclass(RelationGetRelid(onerel
), false);
1379 (errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no child tables",
1380 get_namespace_name(RelationGetNamespace(onerel
)),
1381 RelationGetRelationName(onerel
))));
1386 * Identify acquirefuncs to use, and count blocks in all the relations.
1387 * The result could overflow BlockNumber, so we use double arithmetic.
1389 rels
= (Relation
*) palloc(list_length(tableOIDs
) * sizeof(Relation
));
1390 acquirefuncs
= (AcquireSampleRowsFunc
*)
1391 palloc(list_length(tableOIDs
) * sizeof(AcquireSampleRowsFunc
));
1392 relblocks
= (double *) palloc(list_length(tableOIDs
) * sizeof(double));
1396 foreach(lc
, tableOIDs
)
1398 Oid childOID
= lfirst_oid(lc
);
1400 AcquireSampleRowsFunc acquirefunc
= NULL
;
1401 BlockNumber relpages
= 0;
1403 /* We already got the needed lock */
1404 childrel
= table_open(childOID
, NoLock
);
1406 /* Ignore if temp table of another backend */
1407 if (RELATION_IS_OTHER_TEMP(childrel
))
1409 /* ... but release the lock on it */
1410 Assert(childrel
!= onerel
);
1411 table_close(childrel
, AccessShareLock
);
1415 /* Check table type (MATVIEW can't happen, but might as well allow) */
1416 if (childrel
->rd_rel
->relkind
== RELKIND_RELATION
||
1417 childrel
->rd_rel
->relkind
== RELKIND_MATVIEW
)
1419 /* Regular table, so use the regular row acquisition function */
1420 acquirefunc
= acquire_sample_rows
;
1421 relpages
= RelationGetNumberOfBlocks(childrel
);
1423 else if (childrel
->rd_rel
->relkind
== RELKIND_FOREIGN_TABLE
)
1426 * For a foreign table, call the FDW's hook function to see
1427 * whether it supports analysis.
1429 FdwRoutine
*fdwroutine
;
1432 fdwroutine
= GetFdwRoutineForRelation(childrel
, false);
1434 if (fdwroutine
->AnalyzeForeignTable
!= NULL
)
1435 ok
= fdwroutine
->AnalyzeForeignTable(childrel
,
1441 /* ignore, but release the lock on it */
1442 Assert(childrel
!= onerel
);
1443 table_close(childrel
, AccessShareLock
);
1450 * ignore, but release the lock on it. don't try to unlock the
1451 * passed-in relation
1453 Assert(childrel
->rd_rel
->relkind
== RELKIND_PARTITIONED_TABLE
);
1454 if (childrel
!= onerel
)
1455 table_close(childrel
, AccessShareLock
);
1457 table_close(childrel
, NoLock
);
1461 /* OK, we'll process this child */
1463 rels
[nrels
] = childrel
;
1464 acquirefuncs
[nrels
] = acquirefunc
;
1465 relblocks
[nrels
] = (double) relpages
;
1466 totalblocks
+= (double) relpages
;
1471 * If we don't have at least one child table to consider, fail. If the
1472 * relation is a partitioned table, it's not counted as a child table.
1477 (errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no analyzable child tables",
1478 get_namespace_name(RelationGetNamespace(onerel
)),
1479 RelationGetRelationName(onerel
))));
1484 * Now sample rows from each relation, proportionally to its fraction of
1485 * the total block count. (This might be less than desirable if the child
1486 * rels have radically different free-space percentages, but it's not
1487 * clear that it's worth working harder.)
1489 pgstat_progress_update_param(PROGRESS_ANALYZE_CHILD_TABLES_TOTAL
,
1492 for (i
= 0; i
< nrels
; i
++)
1494 Relation childrel
= rels
[i
];
1495 AcquireSampleRowsFunc acquirefunc
= acquirefuncs
[i
];
1496 double childblocks
= relblocks
[i
];
1499 * Report progress. The sampling function will normally report blocks
1500 * done/total, but we need to reset them to 0 here, so that they don't
1501 * show an old value until that.
1504 const int progress_index
[] = {
1505 PROGRESS_ANALYZE_CURRENT_CHILD_TABLE_RELID
,
1506 PROGRESS_ANALYZE_BLOCKS_DONE
,
1507 PROGRESS_ANALYZE_BLOCKS_TOTAL
1509 const int64 progress_vals
[] = {
1510 RelationGetRelid(childrel
),
1515 pgstat_progress_update_multi_param(3, progress_index
, progress_vals
);
1518 if (childblocks
> 0)
1522 childtargrows
= (int) rint(targrows
* childblocks
/ totalblocks
);
1523 /* Make sure we don't overrun due to roundoff error */
1524 childtargrows
= Min(childtargrows
, targrows
- numrows
);
1525 if (childtargrows
> 0)
1531 /* Fetch a random sample of the child's rows */
1532 childrows
= (*acquirefunc
) (childrel
, elevel
,
1533 rows
+ numrows
, childtargrows
,
1536 /* We may need to convert from child's rowtype to parent's */
1537 if (childrows
> 0 &&
1538 !equalRowTypes(RelationGetDescr(childrel
),
1539 RelationGetDescr(onerel
)))
1541 TupleConversionMap
*map
;
1543 map
= convert_tuples_by_name(RelationGetDescr(childrel
),
1544 RelationGetDescr(onerel
));
1549 for (j
= 0; j
< childrows
; j
++)
1553 newtup
= execute_attr_map_tuple(rows
[numrows
+ j
], map
);
1554 heap_freetuple(rows
[numrows
+ j
]);
1555 rows
[numrows
+ j
] = newtup
;
1557 free_conversion_map(map
);
1561 /* And add to counts */
1562 numrows
+= childrows
;
1563 *totalrows
+= trows
;
1564 *totaldeadrows
+= tdrows
;
1569 * Note: we cannot release the child-table locks, since we may have
1570 * pointers to their TOAST tables in the sampled rows.
1572 table_close(childrel
, NoLock
);
1573 pgstat_progress_update_param(PROGRESS_ANALYZE_CHILD_TABLES_DONE
,
1582 * update_attstats() -- update attribute statistics for one relation
1584 * Statistics are stored in several places: the pg_class row for the
1585 * relation has stats about the whole relation, and there is a
1586 * pg_statistic row for each (non-system) attribute that has ever
1587 * been analyzed. The pg_class values are updated by VACUUM, not here.
1589 * pg_statistic rows are just added or updated normally. This means
1590 * that pg_statistic will probably contain some deleted rows at the
1591 * completion of a vacuum cycle, unless it happens to get vacuumed last.
1593 * To keep things simple, we punt for pg_statistic, and don't try
1594 * to compute or store rows for pg_statistic itself in pg_statistic.
1595 * This could possibly be made to work, but it's not worth the trouble.
1596 * Note analyze_rel() has seen to it that we won't come here when
1597 * vacuuming pg_statistic itself.
1599 * Note: there would be a race condition here if two backends could
1600 * ANALYZE the same table concurrently. Presently, we lock that out
1601 * by taking a self-exclusive lock on the relation in analyze_rel().
1604 update_attstats(Oid relid
, bool inh
, int natts
, VacAttrStats
**vacattrstats
)
1608 CatalogIndexState indstate
= NULL
;
1611 return; /* nothing to do */
1613 sd
= table_open(StatisticRelationId
, RowExclusiveLock
);
1615 for (attno
= 0; attno
< natts
; attno
++)
1617 VacAttrStats
*stats
= vacattrstats
[attno
];
1623 Datum values
[Natts_pg_statistic
];
1624 bool nulls
[Natts_pg_statistic
];
1625 bool replaces
[Natts_pg_statistic
];
1627 /* Ignore attr if we weren't able to collect stats */
1628 if (!stats
->stats_valid
)
1632 * Construct a new pg_statistic tuple
1634 for (i
= 0; i
< Natts_pg_statistic
; ++i
)
1640 values
[Anum_pg_statistic_starelid
- 1] = ObjectIdGetDatum(relid
);
1641 values
[Anum_pg_statistic_staattnum
- 1] = Int16GetDatum(stats
->tupattnum
);
1642 values
[Anum_pg_statistic_stainherit
- 1] = BoolGetDatum(inh
);
1643 values
[Anum_pg_statistic_stanullfrac
- 1] = Float4GetDatum(stats
->stanullfrac
);
1644 values
[Anum_pg_statistic_stawidth
- 1] = Int32GetDatum(stats
->stawidth
);
1645 values
[Anum_pg_statistic_stadistinct
- 1] = Float4GetDatum(stats
->stadistinct
);
1646 i
= Anum_pg_statistic_stakind1
- 1;
1647 for (k
= 0; k
< STATISTIC_NUM_SLOTS
; k
++)
1649 values
[i
++] = Int16GetDatum(stats
->stakind
[k
]); /* stakindN */
1651 i
= Anum_pg_statistic_staop1
- 1;
1652 for (k
= 0; k
< STATISTIC_NUM_SLOTS
; k
++)
1654 values
[i
++] = ObjectIdGetDatum(stats
->staop
[k
]); /* staopN */
1656 i
= Anum_pg_statistic_stacoll1
- 1;
1657 for (k
= 0; k
< STATISTIC_NUM_SLOTS
; k
++)
1659 values
[i
++] = ObjectIdGetDatum(stats
->stacoll
[k
]); /* stacollN */
1661 i
= Anum_pg_statistic_stanumbers1
- 1;
1662 for (k
= 0; k
< STATISTIC_NUM_SLOTS
; k
++)
1664 int nnum
= stats
->numnumbers
[k
];
1668 Datum
*numdatums
= (Datum
*) palloc(nnum
* sizeof(Datum
));
1671 for (n
= 0; n
< nnum
; n
++)
1672 numdatums
[n
] = Float4GetDatum(stats
->stanumbers
[k
][n
]);
1673 arry
= construct_array_builtin(numdatums
, nnum
, FLOAT4OID
);
1674 values
[i
++] = PointerGetDatum(arry
); /* stanumbersN */
1679 values
[i
++] = (Datum
) 0;
1682 i
= Anum_pg_statistic_stavalues1
- 1;
1683 for (k
= 0; k
< STATISTIC_NUM_SLOTS
; k
++)
1685 if (stats
->numvalues
[k
] > 0)
1689 arry
= construct_array(stats
->stavalues
[k
],
1690 stats
->numvalues
[k
],
1692 stats
->statyplen
[k
],
1693 stats
->statypbyval
[k
],
1694 stats
->statypalign
[k
]);
1695 values
[i
++] = PointerGetDatum(arry
); /* stavaluesN */
1700 values
[i
++] = (Datum
) 0;
1704 /* Is there already a pg_statistic tuple for this attribute? */
1705 oldtup
= SearchSysCache3(STATRELATTINH
,
1706 ObjectIdGetDatum(relid
),
1707 Int16GetDatum(stats
->tupattnum
),
1710 /* Open index information when we know we need it */
1711 if (indstate
== NULL
)
1712 indstate
= CatalogOpenIndexes(sd
);
1714 if (HeapTupleIsValid(oldtup
))
1716 /* Yes, replace it */
1717 stup
= heap_modify_tuple(oldtup
,
1718 RelationGetDescr(sd
),
1722 ReleaseSysCache(oldtup
);
1723 CatalogTupleUpdateWithInfo(sd
, &stup
->t_self
, stup
, indstate
);
1727 /* No, insert new tuple */
1728 stup
= heap_form_tuple(RelationGetDescr(sd
), values
, nulls
);
1729 CatalogTupleInsertWithInfo(sd
, stup
, indstate
);
1732 heap_freetuple(stup
);
1735 if (indstate
!= NULL
)
1736 CatalogCloseIndexes(indstate
);
1737 table_close(sd
, RowExclusiveLock
);
1741 * Standard fetch function for use by compute_stats subroutines.
1743 * This exists to provide some insulation between compute_stats routines
1744 * and the actual storage of the sample data.
1747 std_fetch_func(VacAttrStatsP stats
, int rownum
, bool *isNull
)
1749 int attnum
= stats
->tupattnum
;
1750 HeapTuple tuple
= stats
->rows
[rownum
];
1751 TupleDesc tupDesc
= stats
->tupDesc
;
1753 return heap_getattr(tuple
, attnum
, tupDesc
, isNull
);
1757 * Fetch function for analyzing index expressions.
1759 * We have not bothered to construct index tuples, instead the data is
1760 * just in Datum arrays.
1763 ind_fetch_func(VacAttrStatsP stats
, int rownum
, bool *isNull
)
1767 /* exprvals and exprnulls are already offset for proper column */
1768 i
= rownum
* stats
->rowstride
;
1769 *isNull
= stats
->exprnulls
[i
];
1770 return stats
->exprvals
[i
];
1774 /*==========================================================================
1776 * Code below this point represents the "standard" type-specific statistics
1777 * analysis algorithms. This code can be replaced on a per-data-type basis
1778 * by setting a nonzero value in pg_type.typanalyze.
1780 *==========================================================================
1785 * To avoid consuming too much memory during analysis and/or too much space
1786 * in the resulting pg_statistic rows, we ignore varlena datums that are wider
1787 * than WIDTH_THRESHOLD (after detoasting!). This is legitimate for MCV
1788 * and distinct-value calculations since a wide value is unlikely to be
1789 * duplicated at all, much less be a most-common value. For the same reason,
1790 * ignoring wide values will not affect our estimates of histogram bin
1791 * boundaries very much.
1793 #define WIDTH_THRESHOLD 1024
1795 #define swapInt(a,b) do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0)
1796 #define swapDatum(a,b) do {Datum _tmp; _tmp=a; a=b; b=_tmp;} while(0)
1799 * Extra information used by the default analysis routines
1803 int count
; /* # of duplicates */
1804 int first
; /* values[] index of first occurrence */
1811 } CompareScalarsContext
;
1814 static void compute_trivial_stats(VacAttrStatsP stats
,
1815 AnalyzeAttrFetchFunc fetchfunc
,
1818 static void compute_distinct_stats(VacAttrStatsP stats
,
1819 AnalyzeAttrFetchFunc fetchfunc
,
1822 static void compute_scalar_stats(VacAttrStatsP stats
,
1823 AnalyzeAttrFetchFunc fetchfunc
,
1826 static int compare_scalars(const void *a
, const void *b
, void *arg
);
1827 static int compare_mcvs(const void *a
, const void *b
, void *arg
);
1828 static int analyze_mcv_list(int *mcv_counts
,
1837 * std_typanalyze -- the default type-specific typanalyze function
1840 std_typanalyze(VacAttrStats
*stats
)
1844 StdAnalyzeData
*mystats
;
1846 /* If the attstattarget column is negative, use the default value */
1847 if (stats
->attstattarget
< 0)
1848 stats
->attstattarget
= default_statistics_target
;
1850 /* Look for default "<" and "=" operators for column's type */
1851 get_sort_group_operators(stats
->attrtypid
,
1852 false, false, false,
1853 <opr
, &eqopr
, NULL
,
1856 /* Save the operator info for compute_stats routines */
1857 mystats
= (StdAnalyzeData
*) palloc(sizeof(StdAnalyzeData
));
1858 mystats
->eqopr
= eqopr
;
1859 mystats
->eqfunc
= OidIsValid(eqopr
) ? get_opcode(eqopr
) : InvalidOid
;
1860 mystats
->ltopr
= ltopr
;
1861 stats
->extra_data
= mystats
;
1864 * Determine which standard statistics algorithm to use
1866 if (OidIsValid(eqopr
) && OidIsValid(ltopr
))
1868 /* Seems to be a scalar datatype */
1869 stats
->compute_stats
= compute_scalar_stats
;
1870 /*--------------------
1871 * The following choice of minrows is based on the paper
1872 * "Random sampling for histogram construction: how much is enough?"
1873 * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in
1874 * Proceedings of ACM SIGMOD International Conference on Management
1875 * of Data, 1998, Pages 436-447. Their Corollary 1 to Theorem 5
1876 * says that for table size n, histogram size k, maximum relative
1877 * error in bin size f, and error probability gamma, the minimum
1878 * random sample size is
1879 * r = 4 * k * ln(2*n/gamma) / f^2
1880 * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain
1882 * Note that because of the log function, the dependence on n is
1883 * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66
1884 * bin size error with probability 0.99. So there's no real need to
1885 * scale for n, which is a good thing because we don't necessarily
1886 * know it at this point.
1887 *--------------------
1889 stats
->minrows
= 300 * stats
->attstattarget
;
1891 else if (OidIsValid(eqopr
))
1893 /* We can still recognize distinct values */
1894 stats
->compute_stats
= compute_distinct_stats
;
1895 /* Might as well use the same minrows as above */
1896 stats
->minrows
= 300 * stats
->attstattarget
;
1900 /* Can't do much but the trivial stuff */
1901 stats
->compute_stats
= compute_trivial_stats
;
1902 /* Might as well use the same minrows as above */
1903 stats
->minrows
= 300 * stats
->attstattarget
;
1911 * compute_trivial_stats() -- compute very basic column statistics
1913 * We use this when we cannot find a hash "=" operator for the datatype.
1915 * We determine the fraction of non-null rows and the average datum width.
1918 compute_trivial_stats(VacAttrStatsP stats
,
1919 AnalyzeAttrFetchFunc fetchfunc
,
1925 int nonnull_cnt
= 0;
1926 double total_width
= 0;
1927 bool is_varlena
= (!stats
->attrtype
->typbyval
&&
1928 stats
->attrtype
->typlen
== -1);
1929 bool is_varwidth
= (!stats
->attrtype
->typbyval
&&
1930 stats
->attrtype
->typlen
< 0);
1932 for (i
= 0; i
< samplerows
; i
++)
1937 vacuum_delay_point();
1939 value
= fetchfunc(stats
, i
, &isnull
);
1941 /* Check for null/nonnull */
1950 * If it's a variable-width field, add up widths for average width
1951 * calculation. Note that if the value is toasted, we use the toasted
1952 * width. We don't bother with this calculation if it's a fixed-width
1957 total_width
+= VARSIZE_ANY(DatumGetPointer(value
));
1959 else if (is_varwidth
)
1961 /* must be cstring */
1962 total_width
+= strlen(DatumGetCString(value
)) + 1;
1966 /* We can only compute average width if we found some non-null values. */
1967 if (nonnull_cnt
> 0)
1969 stats
->stats_valid
= true;
1970 /* Do the simple null-frac and width stats */
1971 stats
->stanullfrac
= (double) null_cnt
/ (double) samplerows
;
1973 stats
->stawidth
= total_width
/ (double) nonnull_cnt
;
1975 stats
->stawidth
= stats
->attrtype
->typlen
;
1976 stats
->stadistinct
= 0.0; /* "unknown" */
1978 else if (null_cnt
> 0)
1980 /* We found only nulls; assume the column is entirely null */
1981 stats
->stats_valid
= true;
1982 stats
->stanullfrac
= 1.0;
1984 stats
->stawidth
= 0; /* "unknown" */
1986 stats
->stawidth
= stats
->attrtype
->typlen
;
1987 stats
->stadistinct
= 0.0; /* "unknown" */
1993 * compute_distinct_stats() -- compute column statistics including ndistinct
1995 * We use this when we can find only an "=" operator for the datatype.
1997 * We determine the fraction of non-null rows, the average width, the
1998 * most common values, and the (estimated) number of distinct values.
2000 * The most common values are determined by brute force: we keep a list
2001 * of previously seen values, ordered by number of times seen, as we scan
2002 * the samples. A newly seen value is inserted just after the last
2003 * multiply-seen value, causing the bottommost (oldest) singly-seen value
2004 * to drop off the list. The accuracy of this method, and also its cost,
2005 * depend mainly on the length of the list we are willing to keep.
static void
compute_distinct_stats(VacAttrStatsP stats,
					   AnalyzeAttrFetchFunc fetchfunc,
					   int samplerows,
					   double totalrows)
{
	int			i;
	int			null_cnt = 0;
	int			nonnull_cnt = 0;
	int			toowide_cnt = 0;
	double		total_width = 0;
	bool		is_varlena = (!stats->attrtype->typbyval &&
							  stats->attrtype->typlen == -1);
	bool		is_varwidth = (!stats->attrtype->typbyval &&
							   stats->attrtype->typlen < 0);
	FmgrInfo	f_cmpeq;
	typedef struct
	{
		Datum		value;
		int			count;
	} TrackItem;
	TrackItem  *track;
	int			track_cnt,
				track_max;
	int			num_mcv = stats->attstattarget;
	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;

	/*
	 * We track up to 2*n values for an n-element MCV list; but at least 10
	 */
	track_max = 2 * num_mcv;
	if (track_max < 10)
		track_max = 10;
	track = (TrackItem *) palloc(track_max * sizeof(TrackItem));
	track_cnt = 0;

	fmgr_info(mystats->eqfunc, &f_cmpeq);

	for (i = 0; i < samplerows; i++)
	{
		Datum		value;
		bool		isnull;
		bool		match;
		int			firstcount1,
					j;

		vacuum_delay_point();

		value = fetchfunc(stats, i, &isnull);

		/* Check for null/nonnull */
		if (isnull)
		{
			null_cnt++;
			continue;
		}
		nonnull_cnt++;

		/*
		 * If it's a variable-width field, add up widths for average width
		 * calculation.  Note that if the value is toasted, we use the toasted
		 * width.  We don't bother with this calculation if it's a fixed-width
		 * type.
		 */
		if (is_varlena)
		{
			total_width += VARSIZE_ANY(DatumGetPointer(value));

			/*
			 * If the value is toasted, we want to detoast it just once to
			 * avoid repeated detoastings and resultant excess memory usage
			 * during the comparisons.  Also, check to see if the value is
			 * excessively wide, and if so don't detoast at all --- just
			 * ignore the value.
			 */
			if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
			{
				toowide_cnt++;
				continue;
			}
			value = PointerGetDatum(PG_DETOAST_DATUM(value));
		}
		else if (is_varwidth)
		{
			/* must be cstring */
			total_width += strlen(DatumGetCString(value)) + 1;
		}

		/*
		 * See if the value matches anything we're already tracking.
		 */
		match = false;
		firstcount1 = track_cnt;
		for (j = 0; j < track_cnt; j++)
		{
			if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
											   stats->attrcollid,
											   value, track[j].value)))
			{
				match = true;
				break;
			}
			if (j < firstcount1 && track[j].count == 1)
				firstcount1 = j;
		}

		if (match)
		{
			/* Found a match */
			track[j].count++;
			/* This value may now need to "bubble up" in the track list */
			while (j > 0 && track[j].count > track[j - 1].count)
			{
				swapDatum(track[j].value, track[j - 1].value);
				swapInt(track[j].count, track[j - 1].count);
				j--;
			}
		}
		else
		{
			/* No match.  Insert at head of count-1 list */
			if (track_cnt < track_max)
				track_cnt++;
			for (j = track_cnt - 1; j > firstcount1; j--)
			{
				track[j].value = track[j - 1].value;
				track[j].count = track[j - 1].count;
			}
			if (firstcount1 < track_cnt)
			{
				track[firstcount1].value = value;
				track[firstcount1].count = 1;
			}
		}
	}
	/* We can only compute real stats if we found some non-null values. */
	if (nonnull_cnt > 0)
	{
		int			nmultiple,
					summultiple;

		stats->stats_valid = true;
		/* Do the simple null-frac and width stats */
		stats->stanullfrac = (double) null_cnt / (double) samplerows;
		if (is_varwidth)
			stats->stawidth = total_width / (double) nonnull_cnt;
		else
			stats->stawidth = stats->attrtype->typlen;

		/* Count the number of values we found multiple times */
		summultiple = 0;
		for (nmultiple = 0; nmultiple < track_cnt; nmultiple++)
		{
			if (track[nmultiple].count == 1)
				break;
			summultiple += track[nmultiple].count;
		}

		if (nmultiple == 0)
		{
			/*
			 * If we found no repeated non-null values, assume it's a unique
			 * column; but be sure to discount for any nulls we found.
			 */
			stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
		}
		else if (track_cnt < track_max && toowide_cnt == 0 &&
				 nmultiple == track_cnt)
		{
			/*
			 * Our track list includes every value in the sample, and every
			 * value appeared more than once.  Assume the column has just
			 * these values.  (This case is meant to address columns with
			 * small, fixed sets of possible values, such as boolean or enum
			 * columns.  If there are any values that appear just once in the
			 * sample, including too-wide values, we should assume that that's
			 * not what we're dealing with.)
			 */
			stats->stadistinct = track_cnt;
		}
		else
		{
			/*----------
			 * Estimate the number of distinct values using the estimator
			 * proposed by Haas and Stokes in IBM Research Report RJ 10025:
			 *		n*d / (n - f1 + f1*n/N)
			 * where f1 is the number of distinct values that occurred
			 * exactly once in our sample of n rows (from a total of N),
			 * and d is the total number of distinct values in the sample.
			 * This is their Duj1 estimator; the other estimators they
			 * recommend are considerably more complex, and are numerically
			 * very unstable when n is much smaller than N.
			 *
			 * In this calculation, we consider only non-nulls.  We used to
			 * include rows with null values in the n and N counts, but that
			 * leads to inaccurate answers in columns with many nulls, and
			 * it's intuitively bogus anyway considering the desired result is
			 * the number of distinct non-null values.
			 *
			 * We assume (not very reliably!) that all the multiply-occurring
			 * values are reflected in the final track[] list, and the other
			 * nonnull values all appeared but once.  (XXX this usually
			 * results in a drastic overestimate of ndistinct.  Can we do
			 * any better?)
			 *----------
			 */
			int			f1 = nonnull_cnt - summultiple;
			int			d = f1 + nmultiple;
			double		n = samplerows - null_cnt;
			double		N = totalrows * (1.0 - stats->stanullfrac);
			double		stadistinct;

			/* N == 0 shouldn't happen, but just in case ... */
			if (N > 0)
				stadistinct = (n * d) / ((n - f1) + f1 * n / N);
			else
				stadistinct = 0;

			/* Clamp to sane range in case of roundoff error */
			if (stadistinct < d)
				stadistinct = d;
			if (stadistinct > N)
				stadistinct = N;
			/* And round to integer */
			stats->stadistinct = floor(stadistinct + 0.5);
		}
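		/*
		 * Worked example of the Duj1 arithmetic (added for clarity; the
		 * numbers are hypothetical): with n = 30000 non-null sample rows
		 * from a table of N = 1000000, d = 2000 distinct sample values of
		 * which f1 = 1000 were seen exactly once,
		 *		stadistinct = 30000*2000 / ((30000 - 1000) + 1000*30000/1000000)
		 *					= 60000000 / 29030 ~= 2067
		 * so the estimate sits only modestly above d when singletons are
		 * not too dominant, and it climbs toward N's scale as f1
		 * approaches n.
		 */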
		/*
		 * If we estimated the number of distinct values at more than 10% of
		 * the total row count (a very arbitrary limit), then assume that
		 * stadistinct should scale with the row count rather than be a fixed
		 * value.
		 */
		if (stats->stadistinct > 0.1 * totalrows)
			stats->stadistinct = -(stats->stadistinct / totalrows);
		/*
		 * Decide how many values are worth storing as most-common values. If
		 * we are able to generate a complete MCV list (all the values in the
		 * sample will fit, and we think these are all the ones in the table),
		 * then do so.  Otherwise, store only those values that are
		 * significantly more common than the values not in the list.
		 *
		 * Note: the first of these cases is meant to address columns with
		 * small, fixed sets of possible values, such as boolean or enum
		 * columns.  If we can *completely* represent the column population by
		 * an MCV list that will fit into the stats target, then we should do
		 * so and thus provide the planner with complete information.  But if
		 * the MCV list is not complete, it's generally worth being more
		 * selective, and not just filling it all the way up to the stats
		 * target.
		 */
		if (track_cnt < track_max && toowide_cnt == 0 &&
			stats->stadistinct > 0 &&
			track_cnt <= num_mcv)
		{
			/* Track list includes all values seen, and all will fit */
			num_mcv = track_cnt;
		}
		else
		{
			int		   *mcv_counts;

			/* Incomplete list; decide how many values are worth keeping */
			if (num_mcv > track_cnt)
				num_mcv = track_cnt;

			if (num_mcv > 0)
			{
				mcv_counts = (int *) palloc(num_mcv * sizeof(int));
				for (i = 0; i < num_mcv; i++)
					mcv_counts[i] = track[i].count;

				num_mcv = analyze_mcv_list(mcv_counts, num_mcv,
										   stats->stadistinct,
										   stats->stanullfrac,
										   samplerows, totalrows);
			}
		}
		/* Generate MCV slot entry */
		if (num_mcv > 0)
		{
			MemoryContext old_context;
			Datum	   *mcv_values;
			float4	   *mcv_freqs;

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);
			mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));
			mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));
			for (i = 0; i < num_mcv; i++)
			{
				mcv_values[i] = datumCopy(track[i].value,
										  stats->attrtype->typbyval,
										  stats->attrtype->typlen);
				mcv_freqs[i] = (double) track[i].count / (double) samplerows;
			}
			MemoryContextSwitchTo(old_context);

			stats->stakind[0] = STATISTIC_KIND_MCV;
			stats->staop[0] = mystats->eqopr;
			stats->stacoll[0] = stats->attrcollid;
			stats->stanumbers[0] = mcv_freqs;
			stats->numnumbers[0] = num_mcv;
			stats->stavalues[0] = mcv_values;
			stats->numvalues[0] = num_mcv;

			/*
			 * Accept the defaults for stats->statypid and others. They have
			 * been set before we were called (see vacuum.h)
			 */
		}
	}
	else if (null_cnt > 0)
	{
		/* We found only nulls; assume the column is entirely null */
		stats->stats_valid = true;
		stats->stanullfrac = 1.0;
		if (is_varwidth)
			stats->stawidth = 0;	/* "unknown" */
		else
			stats->stawidth = stats->attrtype->typlen;
		stats->stadistinct = 0.0;	/* "unknown" */
	}

	/* We don't need to bother cleaning up any of our temporary palloc's */
}
/*
 *	compute_scalar_stats() -- compute column statistics
 *
 *	We use this when we can find "=" and "<" operators for the datatype.
 *
 *	We determine the fraction of non-null rows, the average width, the
 *	most common values, the (estimated) number of distinct values, the
 *	distribution histogram, and the correlation of physical to logical order.
 *
 *	The desired stats can be determined fairly easily after sorting the
 *	data values into order.
 */
static void
compute_scalar_stats(VacAttrStatsP stats,
					 AnalyzeAttrFetchFunc fetchfunc,
					 int samplerows,
					 double totalrows)
{
	int			i;
	int			null_cnt = 0;
	int			nonnull_cnt = 0;
	int			toowide_cnt = 0;
	double		total_width = 0;
	bool		is_varlena = (!stats->attrtype->typbyval &&
							  stats->attrtype->typlen == -1);
	bool		is_varwidth = (!stats->attrtype->typbyval &&
							   stats->attrtype->typlen < 0);
	double		corr_xysum;
	SortSupportData ssup;
	ScalarItem *values;
	int			values_cnt = 0;
	int		   *tupnoLink;
	ScalarMCVItem *track;
	int			track_cnt = 0;
	int			num_mcv = stats->attstattarget;
	int			num_bins = stats->attstattarget;
	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;

	values = (ScalarItem *) palloc(samplerows * sizeof(ScalarItem));
	tupnoLink = (int *) palloc(samplerows * sizeof(int));
	track = (ScalarMCVItem *) palloc(num_mcv * sizeof(ScalarMCVItem));

	memset(&ssup, 0, sizeof(ssup));
	ssup.ssup_cxt = CurrentMemoryContext;
	ssup.ssup_collation = stats->attrcollid;
	ssup.ssup_nulls_first = false;

	/*
	 * For now, don't perform abbreviated key conversion, because full values
	 * are required for MCV slot generation.  Supporting that optimization
	 * would necessitate teaching compare_scalars() to call a tie-breaker.
	 */
	ssup.abbreviate = false;

	PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup);
	/* Initial scan to find sortable values */
	for (i = 0; i < samplerows; i++)
	{
		Datum		value;
		bool		isnull;

		vacuum_delay_point();

		value = fetchfunc(stats, i, &isnull);

		/* Check for null/nonnull */
		if (isnull)
		{
			null_cnt++;
			continue;
		}
		nonnull_cnt++;

		/*
		 * If it's a variable-width field, add up widths for average width
		 * calculation.  Note that if the value is toasted, we use the toasted
		 * width.  We don't bother with this calculation if it's a fixed-width
		 * type.
		 */
		if (is_varlena)
		{
			total_width += VARSIZE_ANY(DatumGetPointer(value));

			/*
			 * If the value is toasted, we want to detoast it just once to
			 * avoid repeated detoastings and resultant excess memory usage
			 * during the comparisons.  Also, check to see if the value is
			 * excessively wide, and if so don't detoast at all --- just
			 * ignore the value.
			 */
			if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
			{
				toowide_cnt++;
				continue;
			}
			value = PointerGetDatum(PG_DETOAST_DATUM(value));
		}
		else if (is_varwidth)
		{
			/* must be cstring */
			total_width += strlen(DatumGetCString(value)) + 1;
		}

		/* Add it to the list to be sorted */
		values[values_cnt].value = value;
		values[values_cnt].tupno = values_cnt;
		tupnoLink[values_cnt] = values_cnt;
		values_cnt++;
	}
	/* We can only compute real stats if we found some sortable values. */
	if (values_cnt > 0)
	{
		int			ndistinct,	/* # distinct values in sample */
					nmultiple,	/* # that appear multiple times */
					num_hist,
					dups_cnt;
		int			slot_idx = 0;
		CompareScalarsContext cxt;

		/* Sort the collected values */
		cxt.ssup = &ssup;
		cxt.tupnoLink = tupnoLink;
		qsort_interruptible(values, values_cnt, sizeof(ScalarItem),
							compare_scalars, &cxt);

		/*
		 * Now scan the values in order, find the most common ones, and also
		 * accumulate ordering-correlation statistics.
		 *
		 * To determine which are most common, we first have to count the
		 * number of duplicates of each value.  The duplicates are adjacent in
		 * the sorted list, so a brute-force approach is to compare successive
		 * datum values until we find two that are not equal.  However, that
		 * requires N-1 invocations of the datum comparison routine, which are
		 * completely redundant with work that was done during the sort.  (The
		 * sort algorithm must at some point have compared each pair of items
		 * that are adjacent in the sorted order; otherwise it could not know
		 * that it's ordered the pair correctly.)  We exploit this by having
		 * compare_scalars remember the highest tupno index that each
		 * ScalarItem has been found equal to.  At the end of the sort, a
		 * ScalarItem's tupnoLink will still point to itself if and only if it
		 * is the last item of its group of duplicates (since the group will
		 * be ordered by tupno).
		 */
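		/*
		 * Illustrative example (added for clarity): suppose the sorted
		 * values[] holds the datums, with original tupnos in parentheses,
		 *		10(4)  10(7)  10(9)  20(2)  20(5)
		 * The sort's comparisons of equal datums bump each of tupnoLink[4]
		 * and tupnoLink[7] to some higher tupno in the group, and
		 * tupnoLink[2] to 5, while tupnoLink[9] == 9 and tupnoLink[5] == 5
		 * remain self-links.  The scan below therefore detects the group
		 * boundaries (dups_cnt 3 and 2) without calling the datum comparator
		 * again.
		 */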
		corr_xysum = 0;
		ndistinct = 0;
		nmultiple = 0;
		dups_cnt = 0;
		for (i = 0; i < values_cnt; i++)
		{
			int			tupno = values[i].tupno;

			corr_xysum += ((double) i) * ((double) tupno);
			dups_cnt++;
			if (tupnoLink[tupno] == tupno)
			{
				/* Reached end of duplicates of this value */
				ndistinct++;
				if (dups_cnt > 1)
				{
					nmultiple++;
					if (track_cnt < num_mcv ||
						dups_cnt > track[track_cnt - 1].count)
					{
						/*
						 * Found a new item for the mcv list; find its
						 * position, bubbling down old items if needed. Loop
						 * invariant is that j points at an empty/replaceable
						 * slot.
						 */
						int			j;

						if (track_cnt < num_mcv)
							track_cnt++;
						for (j = track_cnt - 1; j > 0; j--)
						{
							if (dups_cnt <= track[j - 1].count)
								break;
							track[j].count = track[j - 1].count;
							track[j].first = track[j - 1].first;
						}
						track[j].count = dups_cnt;
						track[j].first = i + 1 - dups_cnt;
					}
				}
				dups_cnt = 0;
			}
		}
		stats->stats_valid = true;
		/* Do the simple null-frac and width stats */
		stats->stanullfrac = (double) null_cnt / (double) samplerows;
		if (is_varwidth)
			stats->stawidth = total_width / (double) nonnull_cnt;
		else
			stats->stawidth = stats->attrtype->typlen;

		if (nmultiple == 0)
		{
			/*
			 * If we found no repeated non-null values, assume it's a unique
			 * column; but be sure to discount for any nulls we found.
			 */
			stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
		}
		else if (toowide_cnt == 0 && nmultiple == ndistinct)
		{
			/*
			 * Every value in the sample appeared more than once.  Assume the
			 * column has just these values.  (This case is meant to address
			 * columns with small, fixed sets of possible values, such as
			 * boolean or enum columns.  If there are any values that appear
			 * just once in the sample, including too-wide values, we should
			 * assume that that's not what we're dealing with.)
			 */
			stats->stadistinct = ndistinct;
		}
		else
		{
			/*----------
			 * Estimate the number of distinct values using the estimator
			 * proposed by Haas and Stokes in IBM Research Report RJ 10025:
			 *		n*d / (n - f1 + f1*n/N)
			 * where f1 is the number of distinct values that occurred
			 * exactly once in our sample of n rows (from a total of N),
			 * and d is the total number of distinct values in the sample.
			 * This is their Duj1 estimator; the other estimators they
			 * recommend are considerably more complex, and are numerically
			 * very unstable when n is much smaller than N.
			 *
			 * In this calculation, we consider only non-nulls.  We used to
			 * include rows with null values in the n and N counts, but that
			 * leads to inaccurate answers in columns with many nulls, and
			 * it's intuitively bogus anyway considering the desired result is
			 * the number of distinct non-null values.
			 *
			 * Overwidth values are assumed to have been distinct.
			 *----------
			 */
			int			f1 = ndistinct - nmultiple + toowide_cnt;
			int			d = f1 + nmultiple;
			double		n = samplerows - null_cnt;
			double		N = totalrows * (1.0 - stats->stanullfrac);
			double		stadistinct;

			/* N == 0 shouldn't happen, but just in case ... */
			if (N > 0)
				stadistinct = (n * d) / ((n - f1) + f1 * n / N);
			else
				stadistinct = 0;

			/* Clamp to sane range in case of roundoff error */
			if (stadistinct < d)
				stadistinct = d;
			if (stadistinct > N)
				stadistinct = N;
			/* And round to integer */
			stats->stadistinct = floor(stadistinct + 0.5);
		}
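		/*
		 * Example of the overwidth adjustment (added for clarity;
		 * hypothetical numbers): if the sorted sample held ndistinct = 500
		 * values of which nmultiple = 300 repeated, and 20 further values
		 * were skipped as too wide, then f1 = 500 - 300 + 20 = 220 presumed
		 * singletons and d = 220 + 300 = 520 distinct values go into the
		 * same Duj1 formula as above.
		 */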
		/*
		 * If we estimated the number of distinct values at more than 10% of
		 * the total row count (a very arbitrary limit), then assume that
		 * stadistinct should scale with the row count rather than be a fixed
		 * value.
		 */
		if (stats->stadistinct > 0.1 * totalrows)
			stats->stadistinct = -(stats->stadistinct / totalrows);
		/*
		 * Decide how many values are worth storing as most-common values. If
		 * we are able to generate a complete MCV list (all the values in the
		 * sample will fit, and we think these are all the ones in the table),
		 * then do so.  Otherwise, store only those values that are
		 * significantly more common than the values not in the list.
		 *
		 * Note: the first of these cases is meant to address columns with
		 * small, fixed sets of possible values, such as boolean or enum
		 * columns.  If we can *completely* represent the column population by
		 * an MCV list that will fit into the stats target, then we should do
		 * so and thus provide the planner with complete information.  But if
		 * the MCV list is not complete, it's generally worth being more
		 * selective, and not just filling it all the way up to the stats
		 * target.
		 */
		if (track_cnt == ndistinct && toowide_cnt == 0 &&
			stats->stadistinct > 0 &&
			track_cnt <= num_mcv)
		{
			/* Track list includes all values seen, and all will fit */
			num_mcv = track_cnt;
		}
		else
		{
			int		   *mcv_counts;

			/* Incomplete list; decide how many values are worth keeping */
			if (num_mcv > track_cnt)
				num_mcv = track_cnt;

			if (num_mcv > 0)
			{
				mcv_counts = (int *) palloc(num_mcv * sizeof(int));
				for (i = 0; i < num_mcv; i++)
					mcv_counts[i] = track[i].count;

				num_mcv = analyze_mcv_list(mcv_counts, num_mcv,
										   stats->stadistinct,
										   stats->stanullfrac,
										   samplerows, totalrows);
			}
		}
		/* Generate MCV slot entry */
		if (num_mcv > 0)
		{
			MemoryContext old_context;
			Datum	   *mcv_values;
			float4	   *mcv_freqs;

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);
			mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum));
			mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4));
			for (i = 0; i < num_mcv; i++)
			{
				mcv_values[i] = datumCopy(values[track[i].first].value,
										  stats->attrtype->typbyval,
										  stats->attrtype->typlen);
				mcv_freqs[i] = (double) track[i].count / (double) samplerows;
			}
			MemoryContextSwitchTo(old_context);

			stats->stakind[slot_idx] = STATISTIC_KIND_MCV;
			stats->staop[slot_idx] = mystats->eqopr;
			stats->stacoll[slot_idx] = stats->attrcollid;
			stats->stanumbers[slot_idx] = mcv_freqs;
			stats->numnumbers[slot_idx] = num_mcv;
			stats->stavalues[slot_idx] = mcv_values;
			stats->numvalues[slot_idx] = num_mcv;

			/*
			 * Accept the defaults for stats->statypid and others. They have
			 * been set before we were called (see vacuum.h)
			 */
			slot_idx++;
		}
		/*
		 * Generate a histogram slot entry if there are at least two distinct
		 * values not accounted for in the MCV list.  (This ensures the
		 * histogram won't collapse to empty or a singleton.)
		 */
		num_hist = ndistinct - num_mcv;
		if (num_hist > num_bins)
			num_hist = num_bins + 1;
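		/*
		 * Example (added for clarity; hypothetical numbers): with
		 * ndistinct = 1500 sample values, num_mcv = 100 of them absorbed by
		 * the MCV list, and num_bins = 100, num_hist starts at 1400 and is
		 * clamped to num_bins + 1 = 101, i.e. the 101 boundary values that
		 * delimit 100 equal-population bins.
		 */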
		if (num_hist >= 2)
		{
			MemoryContext old_context;
			Datum	   *hist_values;
			int			nvals;
			int			pos,
						posfrac,
						delta,
						deltafrac;

			/* Sort the MCV items into position order to speed next loop */
			qsort_interruptible(track, num_mcv, sizeof(ScalarMCVItem),
								compare_mcvs, NULL);

			/*
			 * Collapse out the MCV items from the values[] array.
			 *
			 * Note we destroy the values[] array here... but we don't need it
			 * for anything more.  We do, however, still need values_cnt.
			 * nvals will be the number of remaining entries in values[].
			 */
			if (num_mcv > 0)
			{
				int			src,
							dest;
				int			j;

				src = dest = 0;
				j = 0;			/* index of next interesting MCV item */
				while (src < values_cnt)
				{
					int			ncopy;

					if (j < num_mcv)
					{
						int			first = track[j].first;

						if (src >= first)
						{
							/* advance past this MCV item */
							src = first + track[j].count;
							j++;
							continue;
						}
						ncopy = first - src;
					}
					else
						ncopy = values_cnt - src;
					memmove(&values[dest], &values[src],
							ncopy * sizeof(ScalarItem));
					src += ncopy;
					dest += ncopy;
				}
				nvals = dest;
			}
			else
				nvals = values_cnt;
			Assert(nvals >= num_hist);

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);
			hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
			/*
			 * The object of this loop is to copy the first and last values[]
			 * entries along with evenly-spaced values in between.  So the
			 * i'th value is values[(i * (nvals - 1)) / (num_hist - 1)].  But
			 * computing that subscript directly risks integer overflow when
			 * the stats target is more than a couple thousand.  Instead we
			 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
			 * the integral and fractional parts of the sum separately.
			 */
			delta = (nvals - 1) / (num_hist - 1);
			deltafrac = (nvals - 1) % (num_hist - 1);
			pos = posfrac = 0;
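			/*
			 * Worked example (added for clarity; hypothetical numbers): with
			 * nvals = 1000 and num_hist = 101, delta = 999 / 100 = 9 and
			 * deltafrac = 999 % 100 = 99.  Each step advances pos by 9 and
			 * posfrac by 99, and posfrac overflows the implicit denominator
			 * of 100 on all but one step; over the 100 steps pos gains
			 * 9 * 100 + 99 carries = 999, landing exactly on
			 * values[nvals - 1].
			 */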
			for (i = 0; i < num_hist; i++)
			{
				hist_values[i] = datumCopy(values[pos].value,
										   stats->attrtype->typbyval,
										   stats->attrtype->typlen);
				pos += delta;
				posfrac += deltafrac;
				if (posfrac >= (num_hist - 1))
				{
					/* fractional part exceeds 1, carry to integer part */
					pos++;
					posfrac -= (num_hist - 1);
				}
			}

			MemoryContextSwitchTo(old_context);

			stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM;
			stats->staop[slot_idx] = mystats->ltopr;
			stats->stacoll[slot_idx] = stats->attrcollid;
			stats->stavalues[slot_idx] = hist_values;
			stats->numvalues[slot_idx] = num_hist;

			/*
			 * Accept the defaults for stats->statypid and others. They have
			 * been set before we were called (see vacuum.h)
			 */
			slot_idx++;
		}
		/* Generate a correlation entry if there are multiple values */
		if (values_cnt > 1)
		{
			MemoryContext old_context;
			float4	   *corrs;
			double		corr_xsum,
						corr_x2sum;

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);
			corrs = (float4 *) palloc(sizeof(float4));
			MemoryContextSwitchTo(old_context);

			/*----------
			 * Since we know the x and y value sets are both
			 *		0, 1, ..., values_cnt-1
			 * we have sum(x) = sum(y) =
			 *		(values_cnt-1)*values_cnt / 2
			 * and sum(x^2) = sum(y^2) =
			 *		(values_cnt-1)*values_cnt*(2*values_cnt-1) / 6.
			 *----------
			 */
			corr_xsum = ((double) (values_cnt - 1)) *
				((double) values_cnt) / 2.0;
			corr_x2sum = ((double) (values_cnt - 1)) *
				((double) values_cnt) * (double) (2 * values_cnt - 1) / 6.0;

			/* And the correlation coefficient reduces to */
			corrs[0] = (values_cnt * corr_xysum - corr_xsum * corr_xsum) /
				(values_cnt * corr_x2sum - corr_xsum * corr_xsum);
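			/*
			 * (Added for clarity.)  This is Pearson's r specialized to these
			 * inputs: x is the position in sorted order and y the position
			 * in physical order, both ranging over 0 .. values_cnt-1, so
			 * sum(x) = sum(y) and sum(x^2) = sum(y^2), giving
			 *		r = (n*sum(xy) - sum(x)*sum(y)) / (n*sum(x^2) - sum(x)^2)
			 * A physically ascending column makes corr_xysum equal sum(x^2)
			 * and hence r = 1; a perfectly descending one gives r = -1.
			 */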
			stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION;
			stats->staop[slot_idx] = mystats->ltopr;
			stats->stacoll[slot_idx] = stats->attrcollid;
			stats->stanumbers[slot_idx] = corrs;
			stats->numnumbers[slot_idx] = 1;
			slot_idx++;
		}
	}
	else if (nonnull_cnt > 0)
	{
		/* We found some non-null values, but they were all too wide */
		Assert(nonnull_cnt == toowide_cnt);
		stats->stats_valid = true;
		/* Do the simple null-frac and width stats */
		stats->stanullfrac = (double) null_cnt / (double) samplerows;
		if (is_varwidth)
			stats->stawidth = total_width / (double) nonnull_cnt;
		else
			stats->stawidth = stats->attrtype->typlen;
		/* Assume all too-wide values are distinct, so it's a unique column */
		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
	}
	else if (null_cnt > 0)
	{
		/* We found only nulls; assume the column is entirely null */
		stats->stats_valid = true;
		stats->stanullfrac = 1.0;
		if (is_varwidth)
			stats->stawidth = 0;	/* "unknown" */
		else
			stats->stawidth = stats->attrtype->typlen;
		stats->stadistinct = 0.0;	/* "unknown" */
	}
	/* We don't need to bother cleaning up any of our temporary palloc's */
}

/*
 * Comparator for sorting ScalarItems
 *
 * Aside from sorting the items, we update the tupnoLink[] array
 * whenever two ScalarItems are found to contain equal datums.  The array
 * is indexed by tupno; for each ScalarItem, it contains the highest
 * tupno that that item's datum has been found to be equal to.  This allows
 * us to avoid additional comparisons in compute_scalar_stats().
 */
static int
compare_scalars(const void *a, const void *b, void *arg)
{
	Datum		da = ((const ScalarItem *) a)->value;
	int			ta = ((const ScalarItem *) a)->tupno;
	Datum		db = ((const ScalarItem *) b)->value;
	int			tb = ((const ScalarItem *) b)->tupno;
	CompareScalarsContext *cxt = (CompareScalarsContext *) arg;
	int			compare;

	compare = ApplySortComparator(da, false, db, false, cxt->ssup);
	if (compare != 0)
		return compare;

	/*
	 * The two datums are equal, so update cxt->tupnoLink[].
	 */
	if (cxt->tupnoLink[ta] < tb)
		cxt->tupnoLink[ta] = tb;
	if (cxt->tupnoLink[tb] < ta)
		cxt->tupnoLink[tb] = ta;
	/*
	 * For equal datums, sort by tupno
	 */
	return ta - tb;
}

/*
 * Comparator for sorting ScalarMCVItems by position
 */
static int
compare_mcvs(const void *a, const void *b, void *arg)
{
	int			da = ((const ScalarMCVItem *) a)->first;
	int			db = ((const ScalarMCVItem *) b)->first;

	return da - db;
}
/*
 * Analyze the list of common values in the sample and decide how many are
 * worth storing in the table's MCV list.
 *
 * mcv_counts is assumed to be a list of the counts of the most common values
 * seen in the sample, starting with the most common.  The return value is the
 * number that are significantly more common than the values not in the list,
 * and which are therefore deemed worth storing in the table's MCV list.
 */
static int
analyze_mcv_list(int *mcv_counts,
				 int num_mcv,
				 double stadistinct,
				 double stanullfrac,
				 int samplerows,
				 double totalrows)
{
	double		ndistinct_table;
	double		sumcount;
	int			i;

	/*
	 * If the entire table was sampled, keep the whole list.  This also
	 * protects us against division by zero in the code below.
	 */
	if (samplerows == totalrows || totalrows <= 1.0)
		return num_mcv;

	/* Re-extract the estimated number of distinct nonnull values in table */
	ndistinct_table = stadistinct;
	if (ndistinct_table < 0)
		ndistinct_table = -ndistinct_table * totalrows;

	/*
	 * Exclude the least common values from the MCV list, if they are not
	 * significantly more common than the estimated selectivity they would
	 * have if they weren't in the list.  All non-MCV values are assumed to be
	 * equally common, after taking into account the frequencies of all the
	 * values in the MCV list and the number of nulls (c.f. eqsel()).
	 *
	 * Here sumcount tracks the total count of all but the last (least common)
	 * value in the MCV list, allowing us to determine the effect of excluding
	 * that value from the list.
	 *
	 * Note that we deliberately do this by removing values from the full
	 * list, rather than starting with an empty list and adding values,
	 * because the latter approach can fail to add any values if all the most
	 * common values have around the same frequency and make up the majority
	 * of the table, so that the overall average frequency of all values is
	 * roughly the same as that of the common values.  This would lead to any
	 * uncommon values being significantly overestimated.
	 */
	sumcount = 0.0;
	for (i = 0; i < num_mcv - 1; i++)
		sumcount += mcv_counts[i];
	while (num_mcv > 0)
	{
		double		selec,
					otherdistinct,
					N,
					n,
					K,
					variance,
					stddev;

		/*
		 * Estimated selectivity the least common value would have if it
		 * wasn't in the MCV list (c.f. eqsel()).
		 */
		selec = 1.0 - sumcount / samplerows - stanullfrac;
		if (selec < 0.0)
			selec = 0.0;
		if (selec > 1.0)
			selec = 1.0;
		otherdistinct = ndistinct_table - (num_mcv - 1);
		if (otherdistinct > 1)
			selec /= otherdistinct;
		/*
		 * If the value is kept in the MCV list, its population frequency is
		 * assumed to equal its sample frequency.  We use the lower end of a
		 * textbook continuity-corrected Wald-type confidence interval to
		 * determine if that is significantly more common than the non-MCV
		 * frequency --- specifically we assume the population frequency is
		 * highly likely to be within around 2 standard errors of the sample
		 * frequency, which equates to an interval of 2 standard deviations
		 * either side of the sample count, plus an additional 0.5 for the
		 * continuity correction.  Since we are sampling without replacement,
		 * this is a hypergeometric distribution.
		 *
		 * XXX: Empirically, this approach seems to work quite well, but it
		 * may be worth considering more advanced techniques for estimating
		 * the confidence interval of the hypergeometric distribution.
		 */
		N = totalrows;
		n = samplerows;
		K = N * mcv_counts[num_mcv - 1] / n;
		variance = n * K * (N - K) * (N - n) / (N * N * (N - 1));
		stddev = sqrt(variance);
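		/*
		 * Worked example (added for clarity; hypothetical numbers): with
		 * N = 1000000, n = 30000, and a least common candidate seen 90
		 * times, K = 1000000 * 90 / 30000 = 3000, variance ~= 87.0, and
		 * stddev ~= 9.3.  If the non-MCV estimate were selec * samplerows =
		 * 60, the cutoff below is 60 + 2 * 9.3 + 0.5 ~= 79.2, so a sample
		 * count of 90 is significantly higher and the value stays listed.
		 */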
		if (mcv_counts[num_mcv - 1] > selec * samplerows + 2 * stddev + 0.5)
		{
			/*
			 * The value is significantly more common than the non-MCV
			 * selectivity would suggest.  Keep it, and all the other more
			 * common values in the list.
			 */
			break;
		}
		else
		{
			/* Discard this value and consider the next least common value */
			num_mcv--;
			if (num_mcv == 0)
				break;
			sumcount -= mcv_counts[num_mcv - 1];