src/backend/statistics/dependencies.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * dependencies.c
   4  *        POSTGRES functional dependencies
   5  *
   6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  * IDENTIFICATION
  10  *        src/backend/statistics/dependencies.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #include "postgres.h"
  15
  16 #include "access/htup_details.h"
  17 #include "access/sysattr.h"
  18 #include "catalog/pg_operator.h"
  19 #include "catalog/pg_statistic_ext.h"
  20 #include "catalog/pg_statistic_ext_data.h"
  21 #include "lib/stringinfo.h"
  22 #include "nodes/nodeFuncs.h"
  23 #include "nodes/nodes.h"
  24 #include "nodes/pathnodes.h"
  25 #include "optimizer/clauses.h"
  26 #include "optimizer/optimizer.h"
  27 #include "parser/parsetree.h"
  28 #include "statistics/extended_stats_internal.h"
  29 #include "statistics/statistics.h"
  30 #include "utils/bytea.h"
  31 #include "utils/fmgroids.h"
  32 #include "utils/fmgrprotos.h"
  33 #include "utils/lsyscache.h"
  34 #include "utils/memutils.h"
  35 #include "utils/selfuncs.h"
  36 #include "utils/syscache.h"
  37 #include "utils/typcache.h"
  38
  39 /* size of the struct header fields (magic, type, ndeps) */
  40 #define SizeOfHeader            (3 * sizeof(uint32))
  41
  42 /* size of a serialized dependency (degree, natts, atts) */
  43 #define SizeOfItem(natts) \
  44         (sizeof(double) + sizeof(AttrNumber) * (1 + (natts)))
  45
  46 /* minimal size of a dependency (with two attributes) */
  47 #define MinSizeOfItem   SizeOfItem(2)
  48
  49 /* minimal size of dependencies, when all deps are minimal */
  50 #define MinSizeOfItems(ndeps) \
  51         (SizeOfHeader + (ndeps) * MinSizeOfItem)
  52
  53 /*
  54  * Internal state for DependencyGenerator of dependencies. Dependencies are similar to
  55  * k-permutations of n elements, except that the order does not matter for the
  56  * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
  57  */
  58 typedef struct DependencyGeneratorData
  59 {
  60         int                     k;                              /* size of the dependency */
  61         int                     n;                              /* number of possible attributes */
  62         int                     current;                /* next dependency to return (index) */
  63         AttrNumber      ndependencies;  /* number of dependencies generated */
  64         AttrNumber *dependencies;       /* array of pre-generated dependencies  */
  65 } DependencyGeneratorData;
  66
  67 typedef DependencyGeneratorData *DependencyGenerator;
  68
  69 static void generate_dependencies_recurse(DependencyGenerator state,
  70                                                                                   int index, AttrNumber start, AttrNumber *current);
  71 static void generate_dependencies(DependencyGenerator state);
  72 static DependencyGenerator DependencyGenerator_init(int n, int k);
  73 static void DependencyGenerator_free(DependencyGenerator state);
  74 static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
  75 static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
  76 static bool dependency_is_fully_matched(MVDependency *dependency,
  77                                                                                 Bitmapset *attnums);
  78 static bool dependency_is_compatible_clause(Node *clause, Index relid,
  79                                                                                         AttrNumber *attnum);
  80 static bool dependency_is_compatible_expression(Node *clause, Index relid,
  81                                                                                                 List *statlist, Node **expr);
  82 static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
  83                                                                                            int ndependencies, Bitmapset *attnums);
  84 static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
  85                                                                                                  int varRelid, JoinType jointype,
  86                                                                                                  SpecialJoinInfo *sjinfo,
  87                                                                                                  MVDependency **dependencies,
  88                                                                                                  int ndependencies,
  89                                                                                                  AttrNumber *list_attnums,
  90                                                                                                  Bitmapset **estimatedclauses);
  91
  92 static void
  93 generate_dependencies_recurse(DependencyGenerator state, int index,
  94                                                           AttrNumber start, AttrNumber *current)
  95 {
  96         /*
  97          * The generator handles the first (k-1) elements differently from the
  98          * last element.
  99          */
 100         if (index < (state->k - 1))
 101         {
 102                 AttrNumber      i;
 103
 104                 /*
 105                  * The first (k-1) values have to be in ascending order, which we
 106                  * generate recursively.
 107                  */
 108
 109                 for (i = start; i < state->n; i++)
 110                 {
 111                         current[index] = i;
 112                         generate_dependencies_recurse(state, (index + 1), (i + 1), current);
 113                 }
 114         }
 115         else
 116         {
 117                 int                     i;
 118
 119                 /*
 120                  * the last element is the implied value, which does not respect the
 121                  * ascending order. We just need to check that the value is not in the
 122                  * first (k-1) elements.
 123                  */
 124
 125                 for (i = 0; i < state->n; i++)
 126                 {
 127                         int                     j;
 128                         bool            match = false;
 129
 130                         current[index] = i;
 131
 132                         for (j = 0; j < index; j++)
 133                         {
 134                                 if (current[j] == i)
 135                                 {
 136                                         match = true;
 137                                         break;
 138                                 }
 139                         }
 140
 141                         /*
 142                          * If the value is not found in the first part of the dependency,
 143                          * we're done.
 144                          */
 145                         if (!match)
 146                         {
 147                                 state->dependencies = (AttrNumber *) repalloc(state->dependencies,
 148                                                                                                                           state->k * (state->ndependencies + 1) * sizeof(AttrNumber));
 149                                 memcpy(&state->dependencies[(state->k * state->ndependencies)],
 150                                            current, state->k * sizeof(AttrNumber));
 151                                 state->ndependencies++;
 152                         }
 153                 }
 154         }
 155 }
 156
 157 /* generate all dependencies (k-permutations of n elements) */
 158 static void
 159 generate_dependencies(DependencyGenerator state)
 160 {
 161         AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k);
 162
 163         generate_dependencies_recurse(state, 0, 0, current);
 164
 165         pfree(current);
 166 }
 167
 168 /*
 169  * initialize the DependencyGenerator of variations, and prebuild the variations
 170  *
 171  * This pre-builds all the variations. We could also generate them in
 172  * DependencyGenerator_next(), but this seems simpler.
 173  */
 174 static DependencyGenerator
 175 DependencyGenerator_init(int n, int k)
 176 {
 177         DependencyGenerator state;
 178
 179         Assert((n >= k) && (k > 0));
 180
 181         /* allocate the DependencyGenerator state */
 182         state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData));
 183         state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber));
 184
 185         state->ndependencies = 0;
 186         state->current = 0;
 187         state->k = k;
 188         state->n = n;
 189
 190         /* now actually pre-generate all the variations */
 191         generate_dependencies(state);
 192
 193         return state;
 194 }
 195
 196 /* free the DependencyGenerator state */
 197 static void
 198 DependencyGenerator_free(DependencyGenerator state)
 199 {
 200         pfree(state->dependencies);
 201         pfree(state);
 202 }
 203
 204 /* generate next combination */
 205 static AttrNumber *
 206 DependencyGenerator_next(DependencyGenerator state)
 207 {
 208         if (state->current == state->ndependencies)
 209                 return NULL;
 210
 211         return &state->dependencies[state->k * state->current++];
 212 }
 213
 214
 215 /*
 216  * validates functional dependency on the data
 217  *
 218  * An actual work horse of detecting functional dependencies. Given a variation
 219  * of k attributes, it checks that the first (k-1) are sufficient to determine
 220  * the last one.
 221  */
 222 static double
 223 dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
 224 {
 225         int                     i,
 226                                 nitems;
 227         MultiSortSupport mss;
 228         SortItem   *items;
 229         AttrNumber *attnums_dep;
 230
 231         /* counters valid within a group */
 232         int                     group_size = 0;
 233         int                     n_violations = 0;
 234
 235         /* total number of rows supporting (consistent with) the dependency */
 236         int                     n_supporting_rows = 0;
 237
 238         /* Make sure we have at least two input attributes. */
 239         Assert(k >= 2);
 240
 241         /* sort info for all attributes columns */
 242         mss = multi_sort_init(k);
 243
 244         /*
 245          * Translate the array of indexes to regular attnums for the dependency
 246          * (we will need this to identify the columns in StatsBuildData).
 247          */
 248         attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
 249         for (i = 0; i < k; i++)
 250                 attnums_dep[i] = data->attnums[dependency[i]];
 251
 252         /*
 253          * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
 254          *
 255          * (a) sort the data lexicographically
 256          *
 257          * (b) split the data into groups by first (k-1) columns
 258          *
 259          * (c) for each group count different values in the last column
 260          *
 261          * We use the column data types' default sort operators and collations;
 262          * perhaps at some point it'd be worth using column-specific collations?
 263          */
 264
 265         /* prepare the sort function for the dimensions */
 266         for (i = 0; i < k; i++)
 267         {
 268                 VacAttrStats *colstat = data->stats[dependency[i]];
 269                 TypeCacheEntry *type;
 270
 271                 type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
 272                 if (type->lt_opr == InvalidOid) /* shouldn't happen */
 273                         elog(ERROR, "cache lookup failed for ordering operator for type %u",
 274                                  colstat->attrtypid);
 275
 276                 /* prepare the sort function for this dimension */
 277                 multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
 278         }
 279
 280         /*
 281          * build an array of SortItem(s) sorted using the multi-sort support
 282          *
 283          * XXX This relies on all stats entries pointing to the same tuple
 284          * descriptor.  For now that assumption holds, but it might change in the
 285          * future for example if we support statistics on multiple tables.
 286          */
 287         items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
 288
 289         /*
 290          * Walk through the sorted array, split it into rows according to the
 291          * first (k-1) columns. If there's a single value in the last column, we
 292          * count the group as 'supporting' the functional dependency. Otherwise we
 293          * count it as contradicting.
 294          */
 295
 296         /* start with the first row forming a group */
 297         group_size = 1;
 298
 299         /* loop 1 beyond the end of the array so that we count the final group */
 300         for (i = 1; i <= nitems; i++)
 301         {
 302                 /*
 303                  * Check if the group ended, which may be either because we processed
 304                  * all the items (i==nitems), or because the i-th item is not equal to
 305                  * the preceding one.
 306                  */
 307                 if (i == nitems ||
 308                         multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
 309                 {
 310                         /*
 311                          * If no violations were found in the group then track the rows of
 312                          * the group as supporting the functional dependency.
 313                          */
 314                         if (n_violations == 0)
 315                                 n_supporting_rows += group_size;
 316
 317                         /* Reset counters for the new group */
 318                         n_violations = 0;
 319                         group_size = 1;
 320                         continue;
 321                 }
 322                 /* first columns match, but the last one does not (so contradicting) */
 323                 else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
 324                         n_violations++;
 325
 326                 group_size++;
 327         }
 328
 329         /* Compute the 'degree of validity' as (supporting/total). */
 330         return (n_supporting_rows * 1.0 / data->numrows);
 331 }
 332
 333 /*
 334  * detects functional dependencies between groups of columns
 335  *
 336  * Generates all possible subsets of columns (variations) and computes
 337  * the degree of validity for each one. For example when creating statistics
 338  * on three columns (a,b,c) there are 9 possible dependencies
 339  *
 340  *         two columns                    three columns
 341  *         -----------                    -------------
 342  *         (a) -> b                               (a,b) -> c
 343  *         (a) -> c                               (a,c) -> b
 344  *         (b) -> a                               (b,c) -> a
 345  *         (b) -> c
 346  *         (c) -> a
 347  *         (c) -> b
 348  */
 349 MVDependencies *
 350 statext_dependencies_build(StatsBuildData *data)
 351 {
 352         int                     i,
 353                                 k;
 354
 355         /* result */
 356         MVDependencies *dependencies = NULL;
 357         MemoryContext cxt;
 358
 359         Assert(data->nattnums >= 2);
 360
 361         /* tracks memory allocated by dependency_degree calls */
 362         cxt = AllocSetContextCreate(CurrentMemoryContext,
 363                                                                 "dependency_degree cxt",
 364                                                                 ALLOCSET_DEFAULT_SIZES);
 365
 366         /*
 367          * We'll try build functional dependencies starting from the smallest ones
 368          * covering just 2 columns, to the largest ones, covering all columns
 369          * included in the statistics object.  We start from the smallest ones
 370          * because we want to be able to skip already implied ones.
 371          */
 372         for (k = 2; k <= data->nattnums; k++)
 373         {
 374                 AttrNumber *dependency; /* array with k elements */
 375
 376                 /* prepare a DependencyGenerator of variation */
 377                 DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
 378
 379                 /* generate all possible variations of k values (out of n) */
 380                 while ((dependency = DependencyGenerator_next(DependencyGenerator)))
 381                 {
 382                         double          degree;
 383                         MVDependency *d;
 384                         MemoryContext oldcxt;
 385
 386                         /* release memory used by dependency degree calculation */
 387                         oldcxt = MemoryContextSwitchTo(cxt);
 388
 389                         /* compute how valid the dependency seems */
 390                         degree = dependency_degree(data, k, dependency);
 391
 392                         MemoryContextSwitchTo(oldcxt);
 393                         MemoryContextReset(cxt);
 394
 395                         /*
 396                          * if the dependency seems entirely invalid, don't store it
 397                          */
 398                         if (degree == 0.0)
 399                                 continue;
 400
 401                         d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
 402                                                                                  + k * sizeof(AttrNumber));
 403
 404                         /* copy the dependency (and keep the indexes into stxkeys) */
 405                         d->degree = degree;
 406                         d->nattributes = k;
 407                         for (i = 0; i < k; i++)
 408                                 d->attributes[i] = data->attnums[dependency[i]];
 409
 410                         /* initialize the list of dependencies */
 411                         if (dependencies == NULL)
 412                         {
 413                                 dependencies
 414                                         = (MVDependencies *) palloc0(sizeof(MVDependencies));
 415
 416                                 dependencies->magic = STATS_DEPS_MAGIC;
 417                                 dependencies->type = STATS_DEPS_TYPE_BASIC;
 418                                 dependencies->ndeps = 0;
 419                         }
 420
 421                         dependencies->ndeps++;
 422                         dependencies = (MVDependencies *) repalloc(dependencies,
 423                                                                                                            offsetof(MVDependencies, deps)
 424                                                                                                            + dependencies->ndeps * sizeof(MVDependency *));
 425
 426                         dependencies->deps[dependencies->ndeps - 1] = d;
 427                 }
 428
 429                 /*
 430                  * we're done with variations of k elements, so free the
 431                  * DependencyGenerator
 432                  */
 433                 DependencyGenerator_free(DependencyGenerator);
 434         }
 435
 436         MemoryContextDelete(cxt);
 437
 438         return dependencies;
 439 }
 440
 441
 442 /*
 443  * Serialize list of dependencies into a bytea value.
 444  */
 445 bytea *
 446 statext_dependencies_serialize(MVDependencies *dependencies)
 447 {
 448         int                     i;
 449         bytea      *output;
 450         char       *tmp;
 451         Size            len;
 452
 453         /* we need to store ndeps, with a number of attributes for each one */
 454         len = VARHDRSZ + SizeOfHeader;
 455
 456         /* and also include space for the actual attribute numbers and degrees */
 457         for (i = 0; i < dependencies->ndeps; i++)
 458                 len += SizeOfItem(dependencies->deps[i]->nattributes);
 459
 460         output = (bytea *) palloc0(len);
 461         SET_VARSIZE(output, len);
 462
 463         tmp = VARDATA(output);
 464
 465         /* Store the base struct values (magic, type, ndeps) */
 466         memcpy(tmp, &dependencies->magic, sizeof(uint32));
 467         tmp += sizeof(uint32);
 468         memcpy(tmp, &dependencies->type, sizeof(uint32));
 469         tmp += sizeof(uint32);
 470         memcpy(tmp, &dependencies->ndeps, sizeof(uint32));
 471         tmp += sizeof(uint32);
 472
 473         /* store number of attributes and attribute numbers for each dependency */
 474         for (i = 0; i < dependencies->ndeps; i++)
 475         {
 476                 MVDependency *d = dependencies->deps[i];
 477
 478                 memcpy(tmp, &d->degree, sizeof(double));
 479                 tmp += sizeof(double);
 480
 481                 memcpy(tmp, &d->nattributes, sizeof(AttrNumber));
 482                 tmp += sizeof(AttrNumber);
 483
 484                 memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes);
 485                 tmp += sizeof(AttrNumber) * d->nattributes;
 486
 487                 /* protect against overflow */
 488                 Assert(tmp <= ((char *) output + len));
 489         }
 490
 491         /* make sure we've produced exactly the right amount of data */
 492         Assert(tmp == ((char *) output + len));
 493
 494         return output;
 495 }
 496
 497 /*
 498  * Reads serialized dependencies into MVDependencies structure.
 499  */
 500 MVDependencies *
 501 statext_dependencies_deserialize(bytea *data)
 502 {
 503         int                     i;
 504         Size            min_expected_size;
 505         MVDependencies *dependencies;
 506         char       *tmp;
 507
 508         if (data == NULL)
 509                 return NULL;
 510
 511         if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
 512                 elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)",
 513                          VARSIZE_ANY_EXHDR(data), SizeOfHeader);
 514
 515         /* read the MVDependencies header */
 516         dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies));
 517
 518         /* initialize pointer to the data part (skip the varlena header) */
 519         tmp = VARDATA_ANY(data);
 520
 521         /* read the header fields and perform basic sanity checks */
 522         memcpy(&dependencies->magic, tmp, sizeof(uint32));
 523         tmp += sizeof(uint32);
 524         memcpy(&dependencies->type, tmp, sizeof(uint32));
 525         tmp += sizeof(uint32);
 526         memcpy(&dependencies->ndeps, tmp, sizeof(uint32));
 527         tmp += sizeof(uint32);
 528
 529         if (dependencies->magic != STATS_DEPS_MAGIC)
 530                 elog(ERROR, "invalid dependency magic %d (expected %d)",
 531                          dependencies->magic, STATS_DEPS_MAGIC);
 532
 533         if (dependencies->type != STATS_DEPS_TYPE_BASIC)
 534                 elog(ERROR, "invalid dependency type %d (expected %d)",
 535                          dependencies->type, STATS_DEPS_TYPE_BASIC);
 536
 537         if (dependencies->ndeps == 0)
 538                 elog(ERROR, "invalid zero-length item array in MVDependencies");
 539
 540         /* what minimum bytea size do we expect for those parameters */
 541         min_expected_size = SizeOfItem(dependencies->ndeps);
 542
 543         if (VARSIZE_ANY_EXHDR(data) < min_expected_size)
 544                 elog(ERROR, "invalid dependencies size %zd (expected at least %zd)",
 545                          VARSIZE_ANY_EXHDR(data), min_expected_size);
 546
 547         /* allocate space for the MCV items */
 548         dependencies = repalloc(dependencies, offsetof(MVDependencies, deps)
 549                                                         + (dependencies->ndeps * sizeof(MVDependency *)));
 550
 551         for (i = 0; i < dependencies->ndeps; i++)
 552         {
 553                 double          degree;
 554                 AttrNumber      k;
 555                 MVDependency *d;
 556
 557                 /* degree of validity */
 558                 memcpy(&degree, tmp, sizeof(double));
 559                 tmp += sizeof(double);
 560
 561                 /* number of attributes */
 562                 memcpy(&k, tmp, sizeof(AttrNumber));
 563                 tmp += sizeof(AttrNumber);
 564
 565                 /* is the number of attributes valid? */
 566                 Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS));
 567
 568                 /* now that we know the number of attributes, allocate the dependency */
 569                 d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
 570                                                                          + (k * sizeof(AttrNumber)));
 571
 572                 d->degree = degree;
 573                 d->nattributes = k;
 574
 575                 /* copy attribute numbers */
 576                 memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes);
 577                 tmp += sizeof(AttrNumber) * d->nattributes;
 578
 579                 dependencies->deps[i] = d;
 580
 581                 /* still within the bytea */
 582                 Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
 583         }
 584
 585         /* we should have consumed the whole bytea exactly */
 586         Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
 587
 588         return dependencies;
 589 }
 590
 591 /*
 592  * dependency_is_fully_matched
 593  *              checks that a functional dependency is fully matched given clauses on
 594  *              attributes (assuming the clauses are suitable equality clauses)
 595  */
 596 static bool
 597 dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
 598 {
 599         int                     j;
 600
 601         /*
 602          * Check that the dependency actually is fully covered by clauses. We have
 603          * to translate all attribute numbers, as those are referenced
 604          */
 605         for (j = 0; j < dependency->nattributes; j++)
 606         {
 607                 int                     attnum = dependency->attributes[j];
 608
 609                 if (!bms_is_member(attnum, attnums))
 610                         return false;
 611         }
 612
 613         return true;
 614 }
 615
 616 /*
 617  * statext_dependencies_load
 618  *              Load the functional dependencies for the indicated pg_statistic_ext tuple
 619  */
 620 MVDependencies *
 621 statext_dependencies_load(Oid mvoid, bool inh)
 622 {
 623         MVDependencies *result;
 624         bool            isnull;
 625         Datum           deps;
 626         HeapTuple       htup;
 627
 628         htup = SearchSysCache2(STATEXTDATASTXOID,
 629                                                    ObjectIdGetDatum(mvoid),
 630                                                    BoolGetDatum(inh));
 631         if (!HeapTupleIsValid(htup))
 632                 elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
 633
 634         deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
 635                                                    Anum_pg_statistic_ext_data_stxddependencies, &isnull);
 636         if (isnull)
 637                 elog(ERROR,
 638                          "requested statistics kind \"%c\" is not yet built for statistics object %u",
 639                          STATS_EXT_DEPENDENCIES, mvoid);
 640
 641         result = statext_dependencies_deserialize(DatumGetByteaPP(deps));
 642
 643         ReleaseSysCache(htup);
 644
 645         return result;
 646 }
 647
 648 /*
 649  * pg_dependencies_in           - input routine for type pg_dependencies.
 650  *
 651  * pg_dependencies is real enough to be a table column, but it has no operations
 652  * of its own, and disallows input too
 653  */
 654 Datum
 655 pg_dependencies_in(PG_FUNCTION_ARGS)
 656 {
 657         /*
 658          * pg_node_list stores the data in binary form and parsing text input is
 659          * not needed, so disallow this.
 660          */
 661         ereport(ERROR,
 662                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 663                          errmsg("cannot accept a value of type %s", "pg_dependencies")));
 664
 665         PG_RETURN_VOID();                       /* keep compiler quiet */
 666 }
 667
 668 /*
 669  * pg_dependencies              - output routine for type pg_dependencies.
 670  */
 671 Datum
 672 pg_dependencies_out(PG_FUNCTION_ARGS)
 673 {
 674         bytea      *data = PG_GETARG_BYTEA_PP(0);
 675         MVDependencies *dependencies = statext_dependencies_deserialize(data);
 676         int                     i,
 677                                 j;
 678         StringInfoData str;
 679
 680         initStringInfo(&str);
 681         appendStringInfoChar(&str, '{');
 682
 683         for (i = 0; i < dependencies->ndeps; i++)
 684         {
 685                 MVDependency *dependency = dependencies->deps[i];
 686
 687                 if (i > 0)
 688                         appendStringInfoString(&str, ", ");
 689
 690                 appendStringInfoChar(&str, '"');
 691                 for (j = 0; j < dependency->nattributes; j++)
 692                 {
 693                         if (j == dependency->nattributes - 1)
 694                                 appendStringInfoString(&str, " => ");
 695                         else if (j > 0)
 696                                 appendStringInfoString(&str, ", ");
 697
 698                         appendStringInfo(&str, "%d", dependency->attributes[j]);
 699                 }
 700                 appendStringInfo(&str, "\": %f", dependency->degree);
 701         }
 702
 703         appendStringInfoChar(&str, '}');
 704
 705         PG_RETURN_CSTRING(str.data);
 706 }
 707
 708 /*
 709  * pg_dependencies_recv         - binary input routine for type pg_dependencies.
 710  */
 711 Datum
 712 pg_dependencies_recv(PG_FUNCTION_ARGS)
 713 {
 714         ereport(ERROR,
 715                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 716                          errmsg("cannot accept a value of type %s", "pg_dependencies")));
 717
 718         PG_RETURN_VOID();                       /* keep compiler quiet */
 719 }
 720
 721 /*
 722  * pg_dependencies_send         - binary output routine for type pg_dependencies.
 723  *
 724  * Functional dependencies are serialized in a bytea value (although the type
 725  * is named differently), so let's just send that.
 726  */
 727 Datum
 728 pg_dependencies_send(PG_FUNCTION_ARGS)
 729 {
 730         return byteasend(fcinfo);
 731 }
 732
 733 /*
 734  * dependency_is_compatible_clause
 735  *              Determines if the clause is compatible with functional dependencies
 736  *
 737  * Only clauses that have the form of equality to a pseudoconstant, or can be
 738  * interpreted that way, are currently accepted.  Furthermore the variable
 739  * part of the clause must be a simple Var belonging to the specified
 740  * relation, whose attribute number we return in *attnum on success.
 741  */
 742 static bool
 743 dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
 744 {
 745         Var                *var;
 746         Node       *clause_expr;
 747
 748         if (IsA(clause, RestrictInfo))
 749         {
 750                 RestrictInfo *rinfo = (RestrictInfo *) clause;
 751
 752                 /* Pseudoconstants are not interesting (they couldn't contain a Var) */
 753                 if (rinfo->pseudoconstant)
 754                         return false;
 755
 756                 /* Clauses referencing multiple, or no, varnos are incompatible */
 757                 if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
 758                         return false;
 759
 760                 clause = (Node *) rinfo->clause;
 761         }
 762
 763         if (is_opclause(clause))
 764         {
 765                 /* If it's an opclause, check for Var = Const or Const = Var. */
 766                 OpExpr     *expr = (OpExpr *) clause;
 767
 768                 /* Only expressions with two arguments are candidates. */
 769                 if (list_length(expr->args) != 2)
 770                         return false;
 771
 772                 /* Make sure non-selected argument is a pseudoconstant. */
 773                 if (is_pseudo_constant_clause(lsecond(expr->args)))
 774                         clause_expr = linitial(expr->args);
 775                 else if (is_pseudo_constant_clause(linitial(expr->args)))
 776                         clause_expr = lsecond(expr->args);
 777                 else
 778                         return false;
 779
 780                 /*
 781                  * If it's not an "=" operator, just ignore the clause, as it's not
 782                  * compatible with functional dependencies.
 783                  *
 784                  * This uses the function for estimating selectivity, not the operator
 785                  * directly (a bit awkward, but well ...).
 786                  *
 787                  * XXX this is pretty dubious; probably it'd be better to check btree
 788                  * or hash opclass membership, so as not to be fooled by custom
 789                  * selectivity functions, and to be more consistent with decisions
 790                  * elsewhere in the planner.
 791                  */
 792                 if (get_oprrest(expr->opno) != F_EQSEL)
 793                         return false;
 794
 795                 /* OK to proceed with checking "var" */
 796         }
 797         else if (IsA(clause, ScalarArrayOpExpr))
 798         {
 799                 /* If it's an scalar array operator, check for Var IN Const. */
 800                 ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
 801
 802                 /*
 803                  * Reject ALL() variant, we only care about ANY/IN.
 804                  *
 805                  * XXX Maybe we should check if all the values are the same, and allow
 806                  * ALL in that case? Doesn't seem very practical, though.
 807                  */
 808                 if (!expr->useOr)
 809                         return false;
 810
 811                 /* Only expressions with two arguments are candidates. */
 812                 if (list_length(expr->args) != 2)
 813                         return false;
 814
 815                 /*
 816                  * We know it's always (Var IN Const), so we assume the var is the
 817                  * first argument, and pseudoconstant is the second one.
 818                  */
 819                 if (!is_pseudo_constant_clause(lsecond(expr->args)))
 820                         return false;
 821
 822                 clause_expr = linitial(expr->args);
 823
 824                 /*
 825                  * If it's not an "=" operator, just ignore the clause, as it's not
 826                  * compatible with functional dependencies. The operator is identified
 827                  * simply by looking at which function it uses to estimate
 828                  * selectivity. That's a bit strange, but it's what other similar
 829                  * places do.
 830                  */
 831                 if (get_oprrest(expr->opno) != F_EQSEL)
 832                         return false;
 833
 834                 /* OK to proceed with checking "var" */
 835         }
 836         else if (is_orclause(clause))
 837         {
 838                 BoolExpr   *bool_expr = (BoolExpr *) clause;
 839                 ListCell   *lc;
 840
 841                 /* start with no attribute number */
 842                 *attnum = InvalidAttrNumber;
 843
 844                 foreach(lc, bool_expr->args)
 845                 {
 846                         AttrNumber      clause_attnum;
 847
 848                         /*
 849                          * Had we found incompatible clause in the arguments, treat the
 850                          * whole clause as incompatible.
 851                          */
 852                         if (!dependency_is_compatible_clause((Node *) lfirst(lc),
 853                                                                                                  relid, &clause_attnum))
 854                                 return false;
 855
 856                         if (*attnum == InvalidAttrNumber)
 857                                 *attnum = clause_attnum;
 858
 859                         /* ensure all the variables are the same (same attnum) */
 860                         if (*attnum != clause_attnum)
 861                                 return false;
 862                 }
 863
 864                 /* the Var is already checked by the recursive call */
 865                 return true;
 866         }
 867         else if (is_notclause(clause))
 868         {
 869                 /*
 870                  * "NOT x" can be interpreted as "x = false", so get the argument and
 871                  * proceed with seeing if it's a suitable Var.
 872                  */
 873                 clause_expr = (Node *) get_notclausearg(clause);
 874         }
 875         else
 876         {
 877                 /*
 878                  * A boolean expression "x" can be interpreted as "x = true", so
 879                  * proceed with seeing if it's a suitable Var.
 880                  */
 881                 clause_expr = (Node *) clause;
 882         }
 883
 884         /*
 885          * We may ignore any RelabelType node above the operand.  (There won't be
 886          * more than one, since eval_const_expressions has been applied already.)
 887          */
 888         if (IsA(clause_expr, RelabelType))
 889                 clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
 890
 891         /* We only support plain Vars for now */
 892         if (!IsA(clause_expr, Var))
 893                 return false;
 894
 895         /* OK, we know we have a Var */
 896         var = (Var *) clause_expr;
 897
 898         /* Ensure Var is from the correct relation */
 899         if (var->varno != relid)
 900                 return false;
 901
 902         /* We also better ensure the Var is from the current level */
 903         if (var->varlevelsup != 0)
 904                 return false;
 905
 906         /* Also ignore system attributes (we don't allow stats on those) */
 907         if (!AttrNumberIsForUserDefinedAttr(var->varattno))
 908                 return false;
 909
 910         *attnum = var->varattno;
 911         return true;
 912 }
 913
 914 /*
 915  * find_strongest_dependency
 916  *              find the strongest dependency on the attributes
 917  *
 918  * When applying functional dependencies, we start with the strongest
 919  * dependencies. That is, we select the dependency that:
 920  *
 921  * (a) has all attributes covered by equality clauses
 922  *
 923  * (b) has the most attributes
 924  *
 925  * (c) has the highest degree of validity
 926  *
 927  * This guarantees that we eliminate the most redundant conditions first
 928  * (see the comment in dependencies_clauselist_selectivity).
 929  */
 930 static MVDependency *
 931 find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
 932                                                   Bitmapset *attnums)
 933 {
 934         int                     i,
 935                                 j;
 936         MVDependency *strongest = NULL;
 937
 938         /* number of attnums in clauses */
 939         int                     nattnums = bms_num_members(attnums);
 940
 941         /*
 942          * Iterate over the MVDependency items and find the strongest one from the
 943          * fully-matched dependencies. We do the cheap checks first, before
 944          * matching it against the attnums.
 945          */
 946         for (i = 0; i < ndependencies; i++)
 947         {
 948                 for (j = 0; j < dependencies[i]->ndeps; j++)
 949                 {
 950                         MVDependency *dependency = dependencies[i]->deps[j];
 951
 952                         /*
 953                          * Skip dependencies referencing more attributes than available
 954                          * clauses, as those can't be fully matched.
 955                          */
 956                         if (dependency->nattributes > nattnums)
 957                                 continue;
 958
 959                         if (strongest)
 960                         {
 961                                 /* skip dependencies on fewer attributes than the strongest. */
 962                                 if (dependency->nattributes < strongest->nattributes)
 963                                         continue;
 964
 965                                 /* also skip weaker dependencies when attribute count matches */
 966                                 if (strongest->nattributes == dependency->nattributes &&
 967                                         strongest->degree > dependency->degree)
 968                                         continue;
 969                         }
 970
 971                         /*
 972                          * this dependency is stronger, but we must still check that it's
 973                          * fully matched to these attnums. We perform this check last as
 974                          * it's slightly more expensive than the previous checks.
 975                          */
 976                         if (dependency_is_fully_matched(dependency, attnums))
 977                                 strongest = dependency; /* save new best match */
 978                 }
 979         }
 980
 981         return strongest;
 982 }
 983
 984 /*
 985  * clauselist_apply_dependencies
 986  *              Apply the specified functional dependencies to a list of clauses and
 987  *              return the estimated selectivity of the clauses that are compatible
 988  *              with any of the given dependencies.
 989  *
 990  * This will estimate all not-already-estimated clauses that are compatible
 991  * with functional dependencies, and which have an attribute mentioned by any
 992  * of the given dependencies (either as an implying or implied attribute).
 993  *
 994  * Given (lists of) clauses on attributes (a,b) and a functional dependency
 995  * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined
 996  * using the formula
 997  *
 998  *              P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
 999  *
1000  * where 'f' is the degree of dependency.  This reflects the fact that we
1001  * expect a fraction f of all rows to be consistent with the dependency
1002  * (a=>b), and so have a selectivity of P(a), while the remaining rows are
1003  * treated as independent.
1004  *
1005  * In practice, we use a slightly modified version of this formula, which uses
1006  * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result
1007  * should obviously not exceed either column's individual selectivity.  I.e.,
1008  * we actually combine selectivities using the formula
1009  *
1010  *              P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
1011  *
1012  * This can make quite a difference if the specific values matching the
1013  * clauses are not consistent with the functional dependency.
1014  */
1015 static Selectivity
1016 clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
1017                                                           int varRelid, JoinType jointype,
1018                                                           SpecialJoinInfo *sjinfo,
1019                                                           MVDependency **dependencies, int ndependencies,
1020                                                           AttrNumber *list_attnums,
1021                                                           Bitmapset **estimatedclauses)
1022 {
1023         Bitmapset  *attnums;
1024         int                     i;
1025         int                     j;
1026         int                     nattrs;
1027         Selectivity *attr_sel;
1028         int                     attidx;
1029         int                     listidx;
1030         ListCell   *l;
1031         Selectivity s1;
1032
1033         /*
1034          * Extract the attnums of all implying and implied attributes from all the
1035          * given dependencies.  Each of these attributes is expected to have at
1036          * least 1 not-already-estimated compatible clause that we will estimate
1037          * here.
1038          */
1039         attnums = NULL;
1040         for (i = 0; i < ndependencies; i++)
1041         {
1042                 for (j = 0; j < dependencies[i]->nattributes; j++)
1043                 {
1044                         AttrNumber      attnum = dependencies[i]->attributes[j];
1045
1046                         attnums = bms_add_member(attnums, attnum);
1047                 }
1048         }
1049
1050         /*
1051          * Compute per-column selectivity estimates for each of these attributes,
1052          * and mark all the corresponding clauses as estimated.
1053          */
1054         nattrs = bms_num_members(attnums);
1055         attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs);
1056
1057         attidx = 0;
1058         i = -1;
1059         while ((i = bms_next_member(attnums, i)) >= 0)
1060         {
1061                 List       *attr_clauses = NIL;
1062                 Selectivity simple_sel;
1063
1064                 listidx = -1;
1065                 foreach(l, clauses)
1066                 {
1067                         Node       *clause = (Node *) lfirst(l);
1068
1069                         listidx++;
1070                         if (list_attnums[listidx] == i)
1071                         {
1072                                 attr_clauses = lappend(attr_clauses, clause);
1073                                 *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
1074                         }
1075                 }
1076
1077                 simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid,
1078                                                                                                 jointype, sjinfo, false);
1079                 attr_sel[attidx++] = simple_sel;
1080         }
1081
1082         /*
1083          * Now combine these selectivities using the dependency information.  For
1084          * chains of dependencies such as a -> b -> c, the b -> c dependency will
1085          * come before the a -> b dependency in the array, so we traverse the
1086          * array backwards to ensure such chains are computed in the right order.
1087          *
1088          * As explained above, pairs of selectivities are combined using the
1089          * formula
1090          *
1091          * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
1092          *
1093          * to ensure that the combined selectivity is never greater than either
1094          * individual selectivity.
1095          *
1096          * Where multiple dependencies apply (e.g., a -> b -> c), we use
1097          * conditional probabilities to compute the overall result as follows:
1098          *
1099          * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a)
1100          *
1101          * so we replace the selectivities of all implied attributes with
1102          * conditional probabilities, that are conditional on all their implying
1103          * attributes.  The selectivities of all other non-implied attributes are
1104          * left as they are.
1105          */
1106         for (i = ndependencies - 1; i >= 0; i--)
1107         {
1108                 MVDependency *dependency = dependencies[i];
1109                 AttrNumber      attnum;
1110                 Selectivity s2;
1111                 double          f;
1112
1113                 /* Selectivity of all the implying attributes */
1114                 s1 = 1.0;
1115                 for (j = 0; j < dependency->nattributes - 1; j++)
1116                 {
1117                         attnum = dependency->attributes[j];
1118                         attidx = bms_member_index(attnums, attnum);
1119                         s1 *= attr_sel[attidx];
1120                 }
1121
1122                 /* Original selectivity of the implied attribute */
1123                 attnum = dependency->attributes[j];
1124                 attidx = bms_member_index(attnums, attnum);
1125                 s2 = attr_sel[attidx];
1126
1127                 /*
1128                  * Replace s2 with the conditional probability s2 given s1, computed
1129                  * using the formula P(b|a) = P(a,b) / P(a), which simplifies to
1130                  *
1131                  * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b)
1132                  *
1133                  * where P(a) = s1, the selectivity of the implying attributes, and
1134                  * P(b) = s2, the selectivity of the implied attribute.
1135                  */
1136                 f = dependency->degree;
1137
1138                 if (s1 <= s2)
1139                         attr_sel[attidx] = f + (1 - f) * s2;
1140                 else
1141                         attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2;
1142         }
1143
1144         /*
1145          * The overall selectivity of all the clauses on all these attributes is
1146          * then the product of all the original (non-implied) probabilities and
1147          * the new conditional (implied) probabilities.
1148          */
1149         s1 = 1.0;
1150         for (i = 0; i < nattrs; i++)
1151                 s1 *= attr_sel[i];
1152
1153         CLAMP_PROBABILITY(s1);
1154
1155         pfree(attr_sel);
1156         bms_free(attnums);
1157
1158         return s1;
1159 }
1160
1161 /*
1162  * dependency_is_compatible_expression
1163  *              Determines if the expression is compatible with functional dependencies
1164  *
1165  * Similar to dependency_is_compatible_clause, but doesn't enforce that the
1166  * expression is a simple Var. OTOH we check that there's at least one
1167  * statistics object matching the expression.
1168  */
1169 static bool
1170 dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
1171 {
1172         List       *vars;
1173         ListCell   *lc,
1174                            *lc2;
1175         Node       *clause_expr;
1176
1177         if (IsA(clause, RestrictInfo))
1178         {
1179                 RestrictInfo *rinfo = (RestrictInfo *) clause;
1180
1181                 /* Pseudoconstants are not interesting (they couldn't contain a Var) */
1182                 if (rinfo->pseudoconstant)
1183                         return false;
1184
1185                 /* Clauses referencing multiple, or no, varnos are incompatible */
1186                 if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
1187                         return false;
1188
1189                 clause = (Node *) rinfo->clause;
1190         }
1191
1192         if (is_opclause(clause))
1193         {
1194                 /* If it's an opclause, check for Var = Const or Const = Var. */
1195                 OpExpr     *expr = (OpExpr *) clause;
1196
1197                 /* Only expressions with two arguments are candidates. */
1198                 if (list_length(expr->args) != 2)
1199                         return false;
1200
1201                 /* Make sure non-selected argument is a pseudoconstant. */
1202                 if (is_pseudo_constant_clause(lsecond(expr->args)))
1203                         clause_expr = linitial(expr->args);
1204                 else if (is_pseudo_constant_clause(linitial(expr->args)))
1205                         clause_expr = lsecond(expr->args);
1206                 else
1207                         return false;
1208
1209                 /*
1210                  * If it's not an "=" operator, just ignore the clause, as it's not
1211                  * compatible with functional dependencies.
1212                  *
1213                  * This uses the function for estimating selectivity, not the operator
1214                  * directly (a bit awkward, but well ...).
1215                  *
1216                  * XXX this is pretty dubious; probably it'd be better to check btree
1217                  * or hash opclass membership, so as not to be fooled by custom
1218                  * selectivity functions, and to be more consistent with decisions
1219                  * elsewhere in the planner.
1220                  */
1221                 if (get_oprrest(expr->opno) != F_EQSEL)
1222                         return false;
1223
1224                 /* OK to proceed with checking "var" */
1225         }
1226         else if (IsA(clause, ScalarArrayOpExpr))
1227         {
1228                 /* If it's an scalar array operator, check for Var IN Const. */
1229                 ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
1230
1231                 /*
1232                  * Reject ALL() variant, we only care about ANY/IN.
1233                  *
1234                  * FIXME Maybe we should check if all the values are the same, and
1235                  * allow ALL in that case? Doesn't seem very practical, though.
1236                  */
1237                 if (!expr->useOr)
1238                         return false;
1239
1240                 /* Only expressions with two arguments are candidates. */
1241                 if (list_length(expr->args) != 2)
1242                         return false;
1243
1244                 /*
1245                  * We know it's always (Var IN Const), so we assume the var is the
1246                  * first argument, and pseudoconstant is the second one.
1247                  */
1248                 if (!is_pseudo_constant_clause(lsecond(expr->args)))
1249                         return false;
1250
1251                 clause_expr = linitial(expr->args);
1252
1253                 /*
1254                  * If it's not an "=" operator, just ignore the clause, as it's not
1255                  * compatible with functional dependencies. The operator is identified
1256                  * simply by looking at which function it uses to estimate
1257                  * selectivity. That's a bit strange, but it's what other similar
1258                  * places do.
1259                  */
1260                 if (get_oprrest(expr->opno) != F_EQSEL)
1261                         return false;
1262
1263                 /* OK to proceed with checking "var" */
1264         }
1265         else if (is_orclause(clause))
1266         {
1267                 BoolExpr   *bool_expr = (BoolExpr *) clause;
1268
1269                 /* start with no expression (we'll use the first match) */
1270                 *expr = NULL;
1271
1272                 foreach(lc, bool_expr->args)
1273                 {
1274                         Node       *or_expr = NULL;
1275
1276                         /*
1277                          * Had we found incompatible expression in the arguments, treat
1278                          * the whole expression as incompatible.
1279                          */
1280                         if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
1281                                                                                                          statlist, &or_expr))
1282                                 return false;
1283
1284                         if (*expr == NULL)
1285                                 *expr = or_expr;
1286
1287                         /* ensure all the expressions are the same */
1288                         if (!equal(or_expr, *expr))
1289                                 return false;
1290                 }
1291
1292                 /* the expression is already checked by the recursive call */
1293                 return true;
1294         }
1295         else if (is_notclause(clause))
1296         {
1297                 /*
1298                  * "NOT x" can be interpreted as "x = false", so get the argument and
1299                  * proceed with seeing if it's a suitable Var.
1300                  */
1301                 clause_expr = (Node *) get_notclausearg(clause);
1302         }
1303         else
1304         {
1305                 /*
1306                  * A boolean expression "x" can be interpreted as "x = true", so
1307                  * proceed with seeing if it's a suitable Var.
1308                  */
1309                 clause_expr = (Node *) clause;
1310         }
1311
1312         /*
1313          * We may ignore any RelabelType node above the operand.  (There won't be
1314          * more than one, since eval_const_expressions has been applied already.)
1315          */
1316         if (IsA(clause_expr, RelabelType))
1317                 clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
1318
1319         vars = pull_var_clause(clause_expr, 0);
1320
1321         foreach(lc, vars)
1322         {
1323                 Var                *var = (Var *) lfirst(lc);
1324
1325                 /* Ensure Var is from the correct relation */
1326                 if (var->varno != relid)
1327                         return false;
1328
1329                 /* We also better ensure the Var is from the current level */
1330                 if (var->varlevelsup != 0)
1331                         return false;
1332
1333                 /* Also ignore system attributes (we don't allow stats on those) */
1334                 if (!AttrNumberIsForUserDefinedAttr(var->varattno))
1335                         return false;
1336         }
1337
1338         /*
1339          * Check if we actually have a matching statistics for the expression.
1340          *
1341          * XXX Maybe this is an overkill. We'll eliminate the expressions later.
1342          */
1343         foreach(lc, statlist)
1344         {
1345                 StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
1346
1347                 /* ignore stats without dependencies */
1348                 if (info->kind != STATS_EXT_DEPENDENCIES)
1349                         continue;
1350
1351                 foreach(lc2, info->exprs)
1352                 {
1353                         Node       *stat_expr = (Node *) lfirst(lc2);
1354
1355                         if (equal(clause_expr, stat_expr))
1356                         {
1357                                 *expr = stat_expr;
1358                                 return true;
1359                         }
1360                 }
1361         }
1362
1363         return false;
1364 }
1365
1366 /*
1367  * dependencies_clauselist_selectivity
1368  *              Return the estimated selectivity of (a subset of) the given clauses
1369  *              using functional dependency statistics, or 1.0 if no useful functional
1370  *              dependency statistic exists.
1371  *
1372  * 'estimatedclauses' is an input/output argument that gets a bit set
1373  * corresponding to the (zero-based) list index of each clause that is included
1374  * in the estimated selectivity.
1375  *
1376  * Given equality clauses on attributes (a,b) we find the strongest dependency
1377  * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
1378  * dependency, we then combine the per-clause selectivities using the formula
1379  *
1380  *         P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
1381  *
1382  * where 'f' is the degree of the dependency.  (Actually we use a slightly
1383  * modified version of this formula -- see clauselist_apply_dependencies()).
1384  *
1385  * With clauses on more than two attributes, the dependencies are applied
1386  * recursively, starting with the widest/strongest dependencies. For example
1387  * P(a,b,c) is first split like this:
1388  *
1389  *         P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c)
1390  *
1391  * assuming (a,b=>c) is the strongest dependency.
1392  */
1393 Selectivity
1394 dependencies_clauselist_selectivity(PlannerInfo *root,
1395                                                                         List *clauses,
1396                                                                         int varRelid,
1397                                                                         JoinType jointype,
1398                                                                         SpecialJoinInfo *sjinfo,
1399                                                                         RelOptInfo *rel,
1400                                                                         Bitmapset **estimatedclauses)
1401 {
1402         Selectivity s1 = 1.0;
1403         ListCell   *l;
1404         Bitmapset  *clauses_attnums = NULL;
1405         AttrNumber *list_attnums;
1406         int                     listidx;
1407         MVDependencies **func_dependencies;
1408         int                     nfunc_dependencies;
1409         int                     total_ndeps;
1410         MVDependency **dependencies;
1411         int                     ndependencies;
1412         int                     i;
1413         AttrNumber      attnum_offset;
1414         RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
1415
1416         /* unique expressions */
1417         Node      **unique_exprs;
1418         int                     unique_exprs_cnt;
1419
1420         /* check if there's any stats that might be useful for us. */
1421         if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
1422                 return 1.0;
1423
1424         list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
1425                                                                                  list_length(clauses));
1426
1427         /*
1428          * We allocate space as if every clause was a unique expression, although
1429          * that's probably overkill. Some will be simple column references that
1430          * we'll translate to attnums, and there might be duplicates. But it's
1431          * easier and cheaper to just do one allocation than repalloc later.
1432          */
1433         unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
1434         unique_exprs_cnt = 0;
1435
1436         /*
1437          * Pre-process the clauses list to extract the attnums seen in each item.
1438          * We need to determine if there's any clauses which will be useful for
1439          * dependency selectivity estimations. Along the way we'll record all of
1440          * the attnums for each clause in a list which we'll reference later so we
1441          * don't need to repeat the same work again. We'll also keep track of all
1442          * attnums seen.
1443          *
1444          * We also skip clauses that we already estimated using different types of
1445          * statistics (we treat them as incompatible).
1446          *
1447          * To handle expressions, we assign them negative attnums, as if it was a
1448          * system attribute (this is fine, as we only allow extended stats on user
1449          * attributes). And then we offset everything by the number of
1450          * expressions, so that we can store the values in a bitmapset.
1451          */
1452         listidx = 0;
1453         foreach(l, clauses)
1454         {
1455                 Node       *clause = (Node *) lfirst(l);
1456                 AttrNumber      attnum;
1457                 Node       *expr = NULL;
1458
1459                 /* ignore clause by default */
1460                 list_attnums[listidx] = InvalidAttrNumber;
1461
1462                 if (!bms_is_member(listidx, *estimatedclauses))
1463                 {
1464                         /*
1465                          * If it's a simple column reference, just extract the attnum. If
1466                          * it's an expression, assign a negative attnum as if it was a
1467                          * system attribute.
1468                          */
1469                         if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
1470                         {
1471                                 list_attnums[listidx] = attnum;
1472                         }
1473                         else if (dependency_is_compatible_expression(clause, rel->relid,
1474                                                                                                                  rel->statlist,
1475                                                                                                                  &expr))
1476                         {
1477                                 /* special attnum assigned to this expression */
1478                                 attnum = InvalidAttrNumber;
1479
1480                                 Assert(expr != NULL);
1481
1482                                 /* If the expression is duplicate, use the same attnum. */
1483                                 for (i = 0; i < unique_exprs_cnt; i++)
1484                                 {
1485                                         if (equal(unique_exprs[i], expr))
1486                                         {
1487                                                 /* negative attribute number to expression */
1488                                                 attnum = -(i + 1);
1489                                                 break;
1490                                         }
1491                                 }
1492
1493                                 /* not found in the list, so add it */
1494                                 if (attnum == InvalidAttrNumber)
1495                                 {
1496                                         unique_exprs[unique_exprs_cnt++] = expr;
1497
1498                                         /* after incrementing the value, to get -1, -2, ... */
1499                                         attnum = (-unique_exprs_cnt);
1500                                 }
1501
1502                                 /* remember which attnum was assigned to this clause */
1503                                 list_attnums[listidx] = attnum;
1504                         }
1505                 }
1506
1507                 listidx++;
1508         }
1509
1510         Assert(listidx == list_length(clauses));
1511
1512         /*
1513          * How much we need to offset the attnums? If there are no expressions,
1514          * then no offset is needed. Otherwise we need to offset enough for the
1515          * lowest value (-unique_exprs_cnt) to become 1.
1516          */
1517         if (unique_exprs_cnt > 0)
1518                 attnum_offset = (unique_exprs_cnt + 1);
1519         else
1520                 attnum_offset = 0;
1521
1522         /*
1523          * Now that we know how many expressions there are, we can offset the
1524          * values just enough to build the bitmapset.
1525          */
1526         for (i = 0; i < list_length(clauses); i++)
1527         {
1528                 AttrNumber      attnum;
1529
1530                 /* ignore incompatible or already estimated clauses */
1531                 if (list_attnums[i] == InvalidAttrNumber)
1532                         continue;
1533
1534                 /* make sure the attnum is in the expected range */
1535                 Assert(list_attnums[i] >= (-unique_exprs_cnt));
1536                 Assert(list_attnums[i] <= MaxHeapAttributeNumber);
1537
1538                 /* make sure the attnum is positive (valid AttrNumber) */
1539                 attnum = list_attnums[i] + attnum_offset;
1540
1541                 /*
1542                  * Either it's a regular attribute, or it's an expression, in which
1543                  * case we must not have seen it before (expressions are unique).
1544                  *
1545                  * XXX Check whether it's a regular attribute has to be done using the
1546                  * original attnum, while the second check has to use the value with
1547                  * an offset.
1548                  */
1549                 Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
1550                            !bms_is_member(attnum, clauses_attnums));
1551
1552                 /*
1553                  * Remember the offset attnum, both for attributes and expressions.
1554                  * We'll pass list_attnums to clauselist_apply_dependencies, which
1555                  * uses it to identify clauses in a bitmap. We could also pass the
1556                  * offset, but this is more convenient.
1557                  */
1558                 list_attnums[i] = attnum;
1559
1560                 clauses_attnums = bms_add_member(clauses_attnums, attnum);
1561         }
1562
1563         /*
1564          * If there's not at least two distinct attnums and expressions, then
1565          * reject the whole list of clauses. We must return 1.0 so the calling
1566          * function's selectivity is unaffected.
1567          */
1568         if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
1569         {
1570                 bms_free(clauses_attnums);
1571                 pfree(list_attnums);
1572                 return 1.0;
1573         }
1574
1575         /*
1576          * Load all functional dependencies matching at least two parameters. We
1577          * can simply consider all dependencies at once, without having to search
1578          * for the best statistics object.
1579          *
1580          * To not waste cycles and memory, we deserialize dependencies only for
1581          * statistics that match at least two attributes. The array is allocated
1582          * with the assumption that all objects match - we could grow the array to
1583          * make it just the right size, but it's likely wasteful anyway thanks to
1584          * moving the freed chunks to freelists etc.
1585          */
1586         func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
1587                                                                                                    list_length(rel->statlist));
1588         nfunc_dependencies = 0;
1589         total_ndeps = 0;
1590
1591         foreach(l, rel->statlist)
1592         {
1593                 StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
1594                 int                     nmatched;
1595                 int                     nexprs;
1596                 int                     k;
1597                 MVDependencies *deps;
1598
1599                 /* skip statistics that are not of the correct type */
1600                 if (stat->kind != STATS_EXT_DEPENDENCIES)
1601                         continue;
1602
1603                 /* skip statistics with mismatching stxdinherit value */
1604                 if (stat->inherit != rte->inh)
1605                         continue;
1606
1607                 /*
1608                  * Count matching attributes - we have to undo the attnum offsets. The
1609                  * input attribute numbers are not offset (expressions are not
1610                  * included in stat->keys, so it's not necessary). But we need to
1611                  * offset it before checking against clauses_attnums.
1612                  */
1613                 nmatched = 0;
1614                 k = -1;
1615                 while ((k = bms_next_member(stat->keys, k)) >= 0)
1616                 {
1617                         AttrNumber      attnum = (AttrNumber) k;
1618
1619                         /* skip expressions */
1620                         if (!AttrNumberIsForUserDefinedAttr(attnum))
1621                                 continue;
1622
1623                         /* apply the same offset as above */
1624                         attnum += attnum_offset;
1625
1626                         if (bms_is_member(attnum, clauses_attnums))
1627                                 nmatched++;
1628                 }
1629
1630                 /* count matching expressions */
1631                 nexprs = 0;
1632                 for (i = 0; i < unique_exprs_cnt; i++)
1633                 {
1634                         ListCell   *lc;
1635
1636                         foreach(lc, stat->exprs)
1637                         {
1638                                 Node       *stat_expr = (Node *) lfirst(lc);
1639
1640                                 /* try to match it */
1641                                 if (equal(stat_expr, unique_exprs[i]))
1642                                         nexprs++;
1643                         }
1644                 }
1645
1646                 /*
1647                  * Skip objects matching fewer than two attributes/expressions from
1648                  * clauses.
1649                  */
1650                 if (nmatched + nexprs < 2)
1651                         continue;
1652
1653                 deps = statext_dependencies_load(stat->statOid, rte->inh);
1654
1655                 /*
1656                  * The expressions may be represented by different attnums in the
1657                  * stats, we need to remap them to be consistent with the clauses.
1658                  * That will make the later steps (e.g. picking the strongest item and
1659                  * so on) much simpler and cheaper, because it won't need to care
1660                  * about the offset at all.
1661                  *
1662                  * When we're at it, we can ignore dependencies that are not fully
1663                  * matched by clauses (i.e. referencing attributes or expressions that
1664                  * are not in the clauses).
1665                  *
1666                  * We have to do this for all statistics, as long as there are any
1667                  * expressions - we need to shift the attnums in all dependencies.
1668                  *
1669                  * XXX Maybe we should do this always, because it also eliminates some
1670                  * of the dependencies early. It might be cheaper than having to walk
1671                  * the longer list in find_strongest_dependency later, especially as
1672                  * we need to do that repeatedly?
1673                  *
1674                  * XXX We have to do this even when there are no expressions in
1675                  * clauses, otherwise find_strongest_dependency may fail for stats
1676                  * with expressions (due to lookup of negative value in bitmap). So we
1677                  * need to at least filter out those dependencies. Maybe we could do
1678                  * it in a cheaper way (if there are no expr clauses, we can just
1679                  * discard all negative attnums without any lookups).
1680                  */
1681                 if (unique_exprs_cnt > 0 || stat->exprs != NIL)
1682                 {
1683                         int                     ndeps = 0;
1684
1685                         for (i = 0; i < deps->ndeps; i++)
1686                         {
1687                                 bool            skip = false;
1688                                 MVDependency *dep = deps->deps[i];
1689                                 int                     j;
1690
1691                                 for (j = 0; j < dep->nattributes; j++)
1692                                 {
1693                                         int                     idx;
1694                                         Node       *expr;
1695                                         int                     k;
1696                                         AttrNumber      unique_attnum = InvalidAttrNumber;
1697                                         AttrNumber      attnum;
1698
1699                                         /* undo the per-statistics offset */
1700                                         attnum = dep->attributes[j];
1701
1702                                         /*
1703                                          * For regular attributes we can simply check if it
1704                                          * matches any clause. If there's no matching clause, we
1705                                          * can just ignore it. We need to offset the attnum
1706                                          * though.
1707                                          */
1708                                         if (AttrNumberIsForUserDefinedAttr(attnum))
1709                                         {
1710                                                 dep->attributes[j] = attnum + attnum_offset;
1711
1712                                                 if (!bms_is_member(dep->attributes[j], clauses_attnums))
1713                                                 {
1714                                                         skip = true;
1715                                                         break;
1716                                                 }
1717
1718                                                 continue;
1719                                         }
1720
1721                                         /*
1722                                          * the attnum should be a valid system attnum (-1, -2,
1723                                          * ...)
1724                                          */
1725                                         Assert(AttributeNumberIsValid(attnum));
1726
1727                                         /*
1728                                          * For expressions, we need to do two translations. First
1729                                          * we have to translate the negative attnum to index in
1730                                          * the list of expressions (in the statistics object).
1731                                          * Then we need to see if there's a matching clause. The
1732                                          * index of the unique expression determines the attnum
1733                                          * (and we offset it).
1734                                          */
1735                                         idx = -(1 + attnum);
1736
1737                                         /* Is the expression index is valid? */
1738                                         Assert((idx >= 0) && (idx < list_length(stat->exprs)));
1739
1740                                         expr = (Node *) list_nth(stat->exprs, idx);
1741
1742                                         /* try to find the expression in the unique list */
1743                                         for (k = 0; k < unique_exprs_cnt; k++)
1744                                         {
1745                                                 /*
1746                                                  * found a matching unique expression, use the attnum
1747                                                  * (derived from index of the unique expression)
1748                                                  */
1749                                                 if (equal(unique_exprs[k], expr))
1750                                                 {
1751                                                         unique_attnum = -(k + 1) + attnum_offset;
1752                                                         break;
1753                                                 }
1754                                         }
1755
1756                                         /*
1757                                          * Found no matching expression, so we can simply skip
1758                                          * this dependency, because there's no chance it will be
1759                                          * fully covered.
1760                                          */
1761                                         if (unique_attnum == InvalidAttrNumber)
1762                                         {
1763                                                 skip = true;
1764                                                 break;
1765                                         }
1766
1767                                         /* otherwise remap it to the new attnum */
1768                                         dep->attributes[j] = unique_attnum;
1769                                 }
1770
1771                                 /* if found a matching dependency, keep it */
1772                                 if (!skip)
1773                                 {
1774                                         /* maybe we've skipped something earlier, so move it */
1775                                         if (ndeps != i)
1776                                                 deps->deps[ndeps] = deps->deps[i];
1777
1778                                         ndeps++;
1779                                 }
1780                         }
1781
1782                         deps->ndeps = ndeps;
1783                 }
1784
1785                 /*
1786                  * It's possible we've removed all dependencies, in which case we
1787                  * don't bother adding it to the list.
1788                  */
1789                 if (deps->ndeps > 0)
1790                 {
1791                         func_dependencies[nfunc_dependencies] = deps;
1792                         total_ndeps += deps->ndeps;
1793                         nfunc_dependencies++;
1794                 }
1795         }
1796
1797         /* if no matching stats could be found then we've nothing to do */
1798         if (nfunc_dependencies == 0)
1799         {
1800                 pfree(func_dependencies);
1801                 bms_free(clauses_attnums);
1802                 pfree(list_attnums);
1803                 pfree(unique_exprs);
1804                 return 1.0;
1805         }
1806
1807         /*
1808          * Work out which dependencies we can apply, starting with the
1809          * widest/strongest ones, and proceeding to smaller/weaker ones.
1810          */
1811         dependencies = (MVDependency **) palloc(sizeof(MVDependency *) *
1812                                                                                         total_ndeps);
1813         ndependencies = 0;
1814
1815         while (true)
1816         {
1817                 MVDependency *dependency;
1818                 AttrNumber      attnum;
1819
1820                 /* the widest/strongest dependency, fully matched by clauses */
1821                 dependency = find_strongest_dependency(func_dependencies,
1822                                                                                            nfunc_dependencies,
1823                                                                                            clauses_attnums);
1824                 if (!dependency)
1825                         break;
1826
1827                 dependencies[ndependencies++] = dependency;
1828
1829                 /* Ignore dependencies using this implied attribute in later loops */
1830                 attnum = dependency->attributes[dependency->nattributes - 1];
1831                 clauses_attnums = bms_del_member(clauses_attnums, attnum);
1832         }
1833
1834         /*
1835          * If we found applicable dependencies, use them to estimate all
1836          * compatible clauses on attributes that they refer to.
1837          */
1838         if (ndependencies != 0)
1839                 s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype,
1840                                                                                    sjinfo, dependencies, ndependencies,
1841                                                                                    list_attnums, estimatedclauses);
1842
1843         /* free deserialized functional dependencies (and then the array) */
1844         for (i = 0; i < nfunc_dependencies; i++)
1845                 pfree(func_dependencies[i]);
1846
1847         pfree(dependencies);
1848         pfree(func_dependencies);
1849         bms_free(clauses_attnums);
1850         pfree(list_attnums);
1851         pfree(unique_exprs);
1852
1853         return s1;
1854 }