src/backend/utils/adt/array_selfuncs.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * array_selfuncs.c
   4  *        Functions for selectivity estimation of array operators
   5  *
   6  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/utils/adt/array_selfuncs.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <math.h>
  18
  19 #include "access/htup_details.h"
  20 #include "catalog/pg_operator.h"
  21 #include "catalog/pg_statistic.h"
  22 #include "utils/array.h"
  23 #include "utils/fmgrprotos.h"
  24 #include "utils/lsyscache.h"
  25 #include "utils/selfuncs.h"
  26 #include "utils/typcache.h"
  27
  28
  29 /* Default selectivity constant for "@>" and "<@" operators */
  30 #define DEFAULT_CONTAIN_SEL 0.005
  31
  32 /* Default selectivity constant for "&&" operator */
  33 #define DEFAULT_OVERLAP_SEL 0.01
  34
  35 /* Default selectivity for given operator */
  36 #define DEFAULT_SEL(operator) \
  37         ((operator) == OID_ARRAY_OVERLAP_OP ? \
  38                 DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
  39
  40 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
  41                                                                          Oid elemtype, Oid operator);
  42 static Selectivity mcelem_array_selec(ArrayType *array,
  43                                                                           TypeCacheEntry *typentry,
  44                                                                           Datum *mcelem, int nmcelem,
  45                                                                           float4 *numbers, int nnumbers,
  46                                                                           float4 *hist, int nhist,
  47                                                                           Oid operator);
  48 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
  49                                                                                                           float4 *numbers, int nnumbers,
  50                                                                                                           Datum *array_data, int nitems,
  51                                                                                                           Oid operator, TypeCacheEntry *typentry);
  52 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
  53                                                                                                 float4 *numbers, int nnumbers,
  54                                                                                                 Datum *array_data, int nitems,
  55                                                                                                 float4 *hist, int nhist,
  56                                                                                                 Oid operator, TypeCacheEntry *typentry);
  57 static float *calc_hist(const float4 *hist, int nhist, int n);
  58 static float *calc_distr(const float *p, int n, int m, float rest);
  59 static int      floor_log2(uint32 n);
  60 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
  61                                                          int *index, TypeCacheEntry *typentry);
  62 static int      element_compare(const void *key1, const void *key2, void *arg);
  63 static int      float_compare_desc(const void *key1, const void *key2);
  64
  65
  66 /*
  67  * scalararraysel_containment
  68  *              Estimate selectivity of ScalarArrayOpExpr via array containment.
  69  *
  70  * If we have const =/<> ANY/ALL (array_var) then we can estimate the
  71  * selectivity as though this were an array containment operator,
  72  * array_var op ARRAY[const].
  73  *
  74  * scalararraysel() has already verified that the ScalarArrayOpExpr's operator
  75  * is the array element type's default equality or inequality operator, and
  76  * has aggressively simplified both inputs to constants.
  77  *
  78  * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
  79  */
  80 Selectivity
  81 scalararraysel_containment(PlannerInfo *root,
  82                                                    Node *leftop, Node *rightop,
  83                                                    Oid elemtype, bool isEquality, bool useOr,
  84                                                    int varRelid)
  85 {
  86         Selectivity selec;
  87         VariableStatData vardata;
  88         Datum           constval;
  89         TypeCacheEntry *typentry;
  90         FmgrInfo   *cmpfunc;
  91
  92         /*
  93          * rightop must be a variable, else punt.
  94          */
  95         examine_variable(root, rightop, varRelid, &vardata);
  96         if (!vardata.rel)
  97         {
  98                 ReleaseVariableStats(vardata);
  99                 return -1.0;
 100         }
 101
 102         /*
 103          * leftop must be a constant, else punt.
 104          */
 105         if (!IsA(leftop, Const))
 106         {
 107                 ReleaseVariableStats(vardata);
 108                 return -1.0;
 109         }
 110         if (((Const *) leftop)->constisnull)
 111         {
 112                 /* qual can't succeed if null on left */
 113                 ReleaseVariableStats(vardata);
 114                 return (Selectivity) 0.0;
 115         }
 116         constval = ((Const *) leftop)->constvalue;
 117
 118         /* Get element type's default comparison function */
 119         typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
 120         if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
 121         {
 122                 ReleaseVariableStats(vardata);
 123                 return -1.0;
 124         }
 125         cmpfunc = &typentry->cmp_proc_finfo;
 126
 127         /*
 128          * If the operator is <>, swap ANY/ALL, then invert the result later.
 129          */
 130         if (!isEquality)
 131                 useOr = !useOr;
 132
 133         /* Get array element stats for var, if available */
 134         if (HeapTupleIsValid(vardata.statsTuple) &&
 135                 statistic_proc_security_check(&vardata, cmpfunc->fn_oid))
 136         {
 137                 Form_pg_statistic stats;
 138                 AttStatsSlot sslot;
 139                 AttStatsSlot hslot;
 140
 141                 stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 142
 143                 /* MCELEM will be an array of same type as element */
 144                 if (get_attstatsslot(&sslot, vardata.statsTuple,
 145                                                          STATISTIC_KIND_MCELEM, InvalidOid,
 146                                                          ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
 147                 {
 148                         /* For ALL case, also get histogram of distinct-element counts */
 149                         if (useOr ||
 150                                 !get_attstatsslot(&hslot, vardata.statsTuple,
 151                                                                   STATISTIC_KIND_DECHIST, InvalidOid,
 152                                                                   ATTSTATSSLOT_NUMBERS))
 153                                 memset(&hslot, 0, sizeof(hslot));
 154
 155                         /*
 156                          * For = ANY, estimate as var @> ARRAY[const].
 157                          *
 158                          * For = ALL, estimate as var <@ ARRAY[const].
 159                          */
 160                         if (useOr)
 161                                 selec = mcelem_array_contain_overlap_selec(sslot.values,
 162                                                                                                                    sslot.nvalues,
 163                                                                                                                    sslot.numbers,
 164                                                                                                                    sslot.nnumbers,
 165                                                                                                                    &constval, 1,
 166                                                                                                                    OID_ARRAY_CONTAINS_OP,
 167                                                                                                                    typentry);
 168                         else
 169                                 selec = mcelem_array_contained_selec(sslot.values,
 170                                                                                                          sslot.nvalues,
 171                                                                                                          sslot.numbers,
 172                                                                                                          sslot.nnumbers,
 173                                                                                                          &constval, 1,
 174                                                                                                          hslot.numbers,
 175                                                                                                          hslot.nnumbers,
 176                                                                                                          OID_ARRAY_CONTAINED_OP,
 177                                                                                                          typentry);
 178
 179                         free_attstatsslot(&hslot);
 180                         free_attstatsslot(&sslot);
 181                 }
 182                 else
 183                 {
 184                         /* No most-common-elements info, so do without */
 185                         if (useOr)
 186                                 selec = mcelem_array_contain_overlap_selec(NULL, 0,
 187                                                                                                                    NULL, 0,
 188                                                                                                                    &constval, 1,
 189                                                                                                                    OID_ARRAY_CONTAINS_OP,
 190                                                                                                                    typentry);
 191                         else
 192                                 selec = mcelem_array_contained_selec(NULL, 0,
 193                                                                                                          NULL, 0,
 194                                                                                                          &constval, 1,
 195                                                                                                          NULL, 0,
 196                                                                                                          OID_ARRAY_CONTAINED_OP,
 197                                                                                                          typentry);
 198                 }
 199
 200                 /*
 201                  * MCE stats count only non-null rows, so adjust for null rows.
 202                  */
 203                 selec *= (1.0 - stats->stanullfrac);
 204         }
 205         else
 206         {
 207                 /* No stats at all, so do without */
 208                 if (useOr)
 209                         selec = mcelem_array_contain_overlap_selec(NULL, 0,
 210                                                                                                            NULL, 0,
 211                                                                                                            &constval, 1,
 212                                                                                                            OID_ARRAY_CONTAINS_OP,
 213                                                                                                            typentry);
 214                 else
 215                         selec = mcelem_array_contained_selec(NULL, 0,
 216                                                                                                  NULL, 0,
 217                                                                                                  &constval, 1,
 218                                                                                                  NULL, 0,
 219                                                                                                  OID_ARRAY_CONTAINED_OP,
 220                                                                                                  typentry);
 221                 /* we assume no nulls here, so no stanullfrac correction */
 222         }
 223
 224         ReleaseVariableStats(vardata);
 225
 226         /*
 227          * If the operator is <>, invert the results.
 228          */
 229         if (!isEquality)
 230                 selec = 1.0 - selec;
 231
 232         CLAMP_PROBABILITY(selec);
 233
 234         return selec;
 235 }
 236
 237 /*
 238  * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
 239  */
 240 Datum
 241 arraycontsel(PG_FUNCTION_ARGS)
 242 {
 243         PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
 244         Oid                     operator = PG_GETARG_OID(1);
 245         List       *args = (List *) PG_GETARG_POINTER(2);
 246         int                     varRelid = PG_GETARG_INT32(3);
 247         VariableStatData vardata;
 248         Node       *other;
 249         bool            varonleft;
 250         Selectivity selec;
 251         Oid                     element_typeid;
 252
 253         /*
 254          * If expression is not (variable op something) or (something op
 255          * variable), then punt and return a default estimate.
 256          */
 257         if (!get_restriction_variable(root, args, varRelid,
 258                                                                   &vardata, &other, &varonleft))
 259                 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
 260
 261         /*
 262          * Can't do anything useful if the something is not a constant, either.
 263          */
 264         if (!IsA(other, Const))
 265         {
 266                 ReleaseVariableStats(vardata);
 267                 PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
 268         }
 269
 270         /*
 271          * The "&&", "@>" and "<@" operators are strict, so we can cope with a
 272          * NULL constant right away.
 273          */
 274         if (((Const *) other)->constisnull)
 275         {
 276                 ReleaseVariableStats(vardata);
 277                 PG_RETURN_FLOAT8(0.0);
 278         }
 279
 280         /*
 281          * If var is on the right, commute the operator, so that we can assume the
 282          * var is on the left in what follows.
 283          */
 284         if (!varonleft)
 285         {
 286                 if (operator == OID_ARRAY_CONTAINS_OP)
 287                         operator = OID_ARRAY_CONTAINED_OP;
 288                 else if (operator == OID_ARRAY_CONTAINED_OP)
 289                         operator = OID_ARRAY_CONTAINS_OP;
 290         }
 291
 292         /*
 293          * OK, there's a Var and a Const we're dealing with here.  We need the
 294          * Const to be an array with same element type as column, else we can't do
 295          * anything useful.  (Such cases will likely fail at runtime, but here
 296          * we'd rather just return a default estimate.)
 297          */
 298         element_typeid = get_base_element_type(((Const *) other)->consttype);
 299         if (element_typeid != InvalidOid &&
 300                 element_typeid == get_base_element_type(vardata.vartype))
 301         {
 302                 selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
 303                                                                   element_typeid, operator);
 304         }
 305         else
 306         {
 307                 selec = DEFAULT_SEL(operator);
 308         }
 309
 310         ReleaseVariableStats(vardata);
 311
 312         CLAMP_PROBABILITY(selec);
 313
 314         PG_RETURN_FLOAT8((float8) selec);
 315 }
 316
 317 /*
 318  * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
 319  */
 320 Datum
 321 arraycontjoinsel(PG_FUNCTION_ARGS)
 322 {
 323         /* For the moment this is just a stub */
 324         Oid                     operator = PG_GETARG_OID(1);
 325
 326         PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
 327 }
 328
 329 /*
 330  * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
 331  * or "arraycolumn <@ const" based on the statistics
 332  *
 333  * This function is mainly responsible for extracting the pg_statistic data
 334  * to be used; we then pass the problem on to mcelem_array_selec().
 335  */
 336 static Selectivity
 337 calc_arraycontsel(VariableStatData *vardata, Datum constval,
 338                                   Oid elemtype, Oid operator)
 339 {
 340         Selectivity selec;
 341         TypeCacheEntry *typentry;
 342         FmgrInfo   *cmpfunc;
 343         ArrayType  *array;
 344
 345         /* Get element type's default comparison function */
 346         typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
 347         if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
 348                 return DEFAULT_SEL(operator);
 349         cmpfunc = &typentry->cmp_proc_finfo;
 350
 351         /*
 352          * The caller made sure the const is an array with same element type, so
 353          * get it now
 354          */
 355         array = DatumGetArrayTypeP(constval);
 356
 357         if (HeapTupleIsValid(vardata->statsTuple) &&
 358                 statistic_proc_security_check(vardata, cmpfunc->fn_oid))
 359         {
 360                 Form_pg_statistic stats;
 361                 AttStatsSlot sslot;
 362                 AttStatsSlot hslot;
 363
 364                 stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 365
 366                 /* MCELEM will be an array of same type as column */
 367                 if (get_attstatsslot(&sslot, vardata->statsTuple,
 368                                                          STATISTIC_KIND_MCELEM, InvalidOid,
 369                                                          ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
 370                 {
 371                         /*
 372                          * For "array <@ const" case we also need histogram of distinct
 373                          * element counts.
 374                          */
 375                         if (operator != OID_ARRAY_CONTAINED_OP ||
 376                                 !get_attstatsslot(&hslot, vardata->statsTuple,
 377                                                                   STATISTIC_KIND_DECHIST, InvalidOid,
 378                                                                   ATTSTATSSLOT_NUMBERS))
 379                                 memset(&hslot, 0, sizeof(hslot));
 380
 381                         /* Use the most-common-elements slot for the array Var. */
 382                         selec = mcelem_array_selec(array, typentry,
 383                                                                            sslot.values, sslot.nvalues,
 384                                                                            sslot.numbers, sslot.nnumbers,
 385                                                                            hslot.numbers, hslot.nnumbers,
 386                                                                            operator);
 387
 388                         free_attstatsslot(&hslot);
 389                         free_attstatsslot(&sslot);
 390                 }
 391                 else
 392                 {
 393                         /* No most-common-elements info, so do without */
 394                         selec = mcelem_array_selec(array, typentry,
 395                                                                            NULL, 0, NULL, 0, NULL, 0,
 396                                                                            operator);
 397                 }
 398
 399                 /*
 400                  * MCE stats count only non-null rows, so adjust for null rows.
 401                  */
 402                 selec *= (1.0 - stats->stanullfrac);
 403         }
 404         else
 405         {
 406                 /* No stats at all, so do without */
 407                 selec = mcelem_array_selec(array, typentry,
 408                                                                    NULL, 0, NULL, 0, NULL, 0,
 409                                                                    operator);
 410                 /* we assume no nulls here, so no stanullfrac correction */
 411         }
 412
 413         /* If constant was toasted, release the copy we made */
 414         if (PointerGetDatum(array) != constval)
 415                 pfree(array);
 416
 417         return selec;
 418 }
 419
 420 /*
 421  * Array selectivity estimation based on most common elements statistics
 422  *
 423  * This function just deconstructs and sorts the array constant's contents,
 424  * and then passes the problem on to mcelem_array_contain_overlap_selec or
 425  * mcelem_array_contained_selec depending on the operator.
 426  */
 427 static Selectivity
 428 mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry,
 429                                    Datum *mcelem, int nmcelem,
 430                                    float4 *numbers, int nnumbers,
 431                                    float4 *hist, int nhist,
 432                                    Oid operator)
 433 {
 434         Selectivity selec;
 435         int                     num_elems;
 436         Datum      *elem_values;
 437         bool       *elem_nulls;
 438         bool            null_present;
 439         int                     nonnull_nitems;
 440         int                     i;
 441
 442         /*
 443          * Prepare constant array data for sorting.  Sorting lets us find unique
 444          * elements and efficiently merge with the MCELEM array.
 445          */
 446         deconstruct_array(array,
 447                                           typentry->type_id,
 448                                           typentry->typlen,
 449                                           typentry->typbyval,
 450                                           typentry->typalign,
 451                                           &elem_values, &elem_nulls, &num_elems);
 452
 453         /* Collapse out any null elements */
 454         nonnull_nitems = 0;
 455         null_present = false;
 456         for (i = 0; i < num_elems; i++)
 457         {
 458                 if (elem_nulls[i])
 459                         null_present = true;
 460                 else
 461                         elem_values[nonnull_nitems++] = elem_values[i];
 462         }
 463
 464         /*
 465          * Query "column @> '{anything, null}'" matches nothing.  For the other
 466          * two operators, presence of a null in the constant can be ignored.
 467          */
 468         if (null_present && operator == OID_ARRAY_CONTAINS_OP)
 469         {
 470                 pfree(elem_values);
 471                 pfree(elem_nulls);
 472                 return (Selectivity) 0.0;
 473         }
 474
 475         /* Sort extracted elements using their default comparison function. */
 476         qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
 477                           element_compare, typentry);
 478
 479         /* Separate cases according to operator */
 480         if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
 481                 selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
 482                                                                                                    numbers, nnumbers,
 483                                                                                                    elem_values, nonnull_nitems,
 484                                                                                                    operator, typentry);
 485         else if (operator == OID_ARRAY_CONTAINED_OP)
 486                 selec = mcelem_array_contained_selec(mcelem, nmcelem,
 487                                                                                          numbers, nnumbers,
 488                                                                                          elem_values, nonnull_nitems,
 489                                                                                          hist, nhist,
 490                                                                                          operator, typentry);
 491         else
 492         {
 493                 elog(ERROR, "arraycontsel called for unrecognized operator %u",
 494                          operator);
 495                 selec = 0.0;                    /* keep compiler quiet */
 496         }
 497
 498         pfree(elem_values);
 499         pfree(elem_nulls);
 500         return selec;
 501 }
 502
 503 /*
 504  * Estimate selectivity of "column @> const" and "column && const" based on
 505  * most common element statistics.  This estimation assumes element
 506  * occurrences are independent.
 507  *
 508  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
 509  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
 510  * not available.  array_data (of length nitems) is the constant's elements.
 511  *
 512  * Both the mcelem and array_data arrays are assumed presorted according
 513  * to the element type's cmpfunc.  Null elements are not present.
 514  *
 515  * TODO: this estimate probably could be improved by using the distinct
 516  * elements count histogram.  For example, excepting the special case of
 517  * "column @> '{}'", we can multiply the calculated selectivity by the
 518  * fraction of nonempty arrays in the column.
 519  */
 520 static Selectivity
 521 mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
 522                                                                    float4 *numbers, int nnumbers,
 523                                                                    Datum *array_data, int nitems,
 524                                                                    Oid operator, TypeCacheEntry *typentry)
 525 {
 526         Selectivity selec,
 527                                 elem_selec;
 528         int                     mcelem_index,
 529                                 i;
 530         bool            use_bsearch;
 531         float4          minfreq;
 532
 533         /*
 534          * There should be three more Numbers than Values, because the last three
 535          * cells should hold minimal and maximal frequency among the non-null
 536          * elements, and then the frequency of null elements.  Ignore the Numbers
 537          * if not right.
 538          */
 539         if (nnumbers != nmcelem + 3)
 540         {
 541                 numbers = NULL;
 542                 nnumbers = 0;
 543         }
 544
 545         if (numbers)
 546         {
 547                 /* Grab the lowest observed frequency */
 548                 minfreq = numbers[nmcelem];
 549         }
 550         else
 551         {
 552                 /* Without statistics make some default assumptions */
 553                 minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
 554         }
 555
 556         /* Decide whether it is faster to use binary search or not. */
 557         if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
 558                 use_bsearch = true;
 559         else
 560                 use_bsearch = false;
 561
 562         if (operator == OID_ARRAY_CONTAINS_OP)
 563         {
 564                 /*
 565                  * Initial selectivity for "column @> const" query is 1.0, and it will
 566                  * be decreased with each element of constant array.
 567                  */
 568                 selec = 1.0;
 569         }
 570         else
 571         {
 572                 /*
 573                  * Initial selectivity for "column && const" query is 0.0, and it will
 574                  * be increased with each element of constant array.
 575                  */
 576                 selec = 0.0;
 577         }
 578
 579         /* Scan mcelem and array in parallel. */
 580         mcelem_index = 0;
 581         for (i = 0; i < nitems; i++)
 582         {
 583                 bool            match = false;
 584
 585                 /* Ignore any duplicates in the array data. */
 586                 if (i > 0 &&
 587                         element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
 588                         continue;
 589
 590                 /* Find the smallest MCELEM >= this array item. */
 591                 if (use_bsearch)
 592                 {
 593                         match = find_next_mcelem(mcelem, nmcelem, array_data[i],
 594                                                                          &mcelem_index, typentry);
 595                 }
 596                 else
 597                 {
 598                         while (mcelem_index < nmcelem)
 599                         {
 600                                 int                     cmp = element_compare(&mcelem[mcelem_index],
 601                                                                                                   &array_data[i],
 602                                                                                                   typentry);
 603
 604                                 if (cmp < 0)
 605                                         mcelem_index++;
 606                                 else
 607                                 {
 608                                         if (cmp == 0)
 609                                                 match = true;   /* mcelem is found */
 610                                         break;
 611                                 }
 612                         }
 613                 }
 614
 615                 if (match && numbers)
 616                 {
 617                         /* MCELEM matches the array item; use its frequency. */
 618                         elem_selec = numbers[mcelem_index];
 619                         mcelem_index++;
 620                 }
 621                 else
 622                 {
 623                         /*
 624                          * The element is not in MCELEM.  Punt, but assume that the
 625                          * selectivity cannot be more than minfreq / 2.
 626                          */
 627                         elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
 628                 }
 629
 630                 /*
 631                  * Update overall selectivity using the current element's selectivity
 632                  * and an assumption of element occurrence independence.
 633                  */
 634                 if (operator == OID_ARRAY_CONTAINS_OP)
 635                         selec *= elem_selec;
 636                 else
 637                         selec = selec + elem_selec - selec * elem_selec;
 638
 639                 /* Clamp intermediate results to stay sane despite roundoff error */
 640                 CLAMP_PROBABILITY(selec);
 641         }
 642
 643         return selec;
 644 }
 645
 646 /*
 647  * Estimate selectivity of "column <@ const" based on most common element
 648  * statistics.
 649  *
 650  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
 651  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
 652  * not available.  array_data (of length nitems) is the constant's elements.
 653  * hist (of length nhist) is from the array column's DECHIST statistics slot,
 654  * or is NULL/0 if those stats are not available.
 655  *
 656  * Both the mcelem and array_data arrays are assumed presorted according
 657  * to the element type's cmpfunc.  Null elements are not present.
 658  *
 659  * Independent element occurrence would imply a particular distribution of
 660  * distinct element counts among matching rows.  Real data usually falsifies
 661  * that assumption.  For example, in a set of 11-element integer arrays having
 662  * elements in the range [0..10], element occurrences are typically not
 663  * independent.  If they were, a sufficiently-large set would include all
 664  * distinct element counts 0 through 11.  We correct for this using the
 665  * histogram of distinct element counts.
 666  *
 667  * In the "column @> const" and "column && const" cases, we usually have a
 668  * "const" with low number of elements (otherwise we have selectivity close
 669  * to 0 or 1 respectively).  That's why the effect of dependence related
 670  * to distinct element count distribution is negligible there.  In the
 671  * "column <@ const" case, number of elements is usually high (otherwise we
 672  * have selectivity close to 0).  That's why we should do a correction with
 673  * the array distinct element count distribution here.
 674  *
 675  * Using the histogram of distinct element counts produces a different
 676  * distribution law than independent occurrences of elements.  This
 677  * distribution law can be described as follows:
 678  *
 679  * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
 680  *        (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
 681  *
 682  * where:
 683  * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
 684  *              (1 - occurrence, 0 - no occurrence) in row
 685  * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
 686  *              (scalar values in [0..1]) according to collected statistics
 687  * m = o1 + o2 + ... + on = total number of distinct elements in row
 688  * hist[m] - histogram data for occurrence of m elements.
 689  * ind[m] - probability of m occurrences from n events assuming their
 690  *        probabilities to be equal to frequencies of array elements.
 691  *
 692  * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
 693  * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
 694  */
 695 static Selectivity
 696 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
 697                                                          float4 *numbers, int nnumbers,
 698                                                          Datum *array_data, int nitems,
 699                                                          float4 *hist, int nhist,
 700                                                          Oid operator, TypeCacheEntry *typentry)
 701 {
 702         int                     mcelem_index,
 703                                 i,
 704                                 unique_nitems = 0;
 705         float           selec,
 706                                 minfreq,
 707                                 nullelem_freq;
 708         float      *dist,
 709                            *mcelem_dist,
 710                            *hist_part;
 711         float           avg_count,
 712                                 mult,
 713                                 rest;
 714         float      *elem_selec;
 715
 716         /*
 717          * There should be three more Numbers than Values in the MCELEM slot,
 718          * because the last three cells should hold minimal and maximal frequency
 719          * among the non-null elements, and then the frequency of null elements.
 720          * Punt if not right, because we can't do much without the element freqs.
 721          */
 722         if (numbers == NULL || nnumbers != nmcelem + 3)
 723                 return DEFAULT_CONTAIN_SEL;
 724
 725         /* Can't do much without a count histogram, either */
 726         if (hist == NULL || nhist < 3)
 727                 return DEFAULT_CONTAIN_SEL;
 728
 729         /*
 730          * Grab some of the summary statistics that compute_array_stats() stores:
 731          * lowest frequency, frequency of null elements, and average distinct
 732          * element count.
 733          */
 734         minfreq = numbers[nmcelem];
 735         nullelem_freq = numbers[nmcelem + 2];
 736         avg_count = hist[nhist - 1];
 737
 738         /*
 739          * "rest" will be the sum of the frequencies of all elements not
 740          * represented in MCELEM.  The average distinct element count is the sum
 741          * of the frequencies of *all* elements.  Begin with that; we will proceed
 742          * to subtract the MCELEM frequencies.
 743          */
 744         rest = avg_count;
 745
 746         /*
 747          * mult is a multiplier representing estimate of probability that each
 748          * mcelem that is not present in constant doesn't occur.
 749          */
 750         mult = 1.0f;
 751
 752         /*
 753          * elem_selec is array of estimated frequencies for elements in the
 754          * constant.
 755          */
 756         elem_selec = (float *) palloc(sizeof(float) * nitems);
 757
 758         /* Scan mcelem and array in parallel. */
 759         mcelem_index = 0;
 760         for (i = 0; i < nitems; i++)
 761         {
 762                 bool            match = false;
 763
 764                 /* Ignore any duplicates in the array data. */
 765                 if (i > 0 &&
 766                         element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
 767                         continue;
 768
 769                 /*
 770                  * Iterate over MCELEM until we find an entry greater than or equal to
 771                  * this element of the constant.  Update "rest" and "mult" for mcelem
 772                  * entries skipped over.
 773                  */
 774                 while (mcelem_index < nmcelem)
 775                 {
 776                         int                     cmp = element_compare(&mcelem[mcelem_index],
 777                                                                                           &array_data[i],
 778                                                                                           typentry);
 779
 780                         if (cmp < 0)
 781                         {
 782                                 mult *= (1.0f - numbers[mcelem_index]);
 783                                 rest -= numbers[mcelem_index];
 784                                 mcelem_index++;
 785                         }
 786                         else
 787                         {
 788                                 if (cmp == 0)
 789                                         match = true;   /* mcelem is found */
 790                                 break;
 791                         }
 792                 }
 793
 794                 if (match)
 795                 {
 796                         /* MCELEM matches the array item. */
 797                         elem_selec[unique_nitems] = numbers[mcelem_index];
 798                         /* "rest" is decremented for all mcelems, matched or not */
 799                         rest -= numbers[mcelem_index];
 800                         mcelem_index++;
 801                 }
 802                 else
 803                 {
 804                         /*
 805                          * The element is not in MCELEM.  Punt, but assume that the
 806                          * selectivity cannot be more than minfreq / 2.
 807                          */
 808                         elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
 809                                                                                         minfreq / 2);
 810                 }
 811
 812                 unique_nitems++;
 813         }
 814
 815         /*
 816          * If we handled all constant elements without exhausting the MCELEM
 817          * array, finish walking it to complete calculation of "rest" and "mult".
 818          */
 819         while (mcelem_index < nmcelem)
 820         {
 821                 mult *= (1.0f - numbers[mcelem_index]);
 822                 rest -= numbers[mcelem_index];
 823                 mcelem_index++;
 824         }
 825
 826         /*
 827          * The presence of many distinct rare elements materially decreases
 828          * selectivity.  Use the Poisson distribution to estimate the probability
 829          * of a column value having zero occurrences of such elements.  See above
 830          * for the definition of "rest".
 831          */
 832         mult *= exp(-rest);
 833
 834         /*----------
 835          * Using the distinct element count histogram requires
 836          *              O(unique_nitems * (nmcelem + unique_nitems))
 837          * operations.  Beyond a certain computational cost threshold, it's
 838          * reasonable to sacrifice accuracy for decreased planning time.  We limit
 839          * the number of operations to EFFORT * nmcelem; since nmcelem is limited
 840          * by the column's statistics target, the work done is user-controllable.
 841          *
 842          * If the number of operations would be too large, we can reduce it
 843          * without losing all accuracy by reducing unique_nitems and considering
 844          * only the most-common elements of the constant array.  To make the
 845          * results exactly match what we would have gotten with only those
 846          * elements to start with, we'd have to remove any discarded elements'
 847          * frequencies from "mult", but since this is only an approximation
 848          * anyway, we don't bother with that.  Therefore it's sufficient to qsort
 849          * elem_selec[] and take the largest elements.  (They will no longer match
 850          * up with the elements of array_data[], but we don't care.)
 851          *----------
 852          */
 853 #define EFFORT 100
 854
 855         if ((nmcelem + unique_nitems) > 0 &&
 856                 unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
 857         {
 858                 /*
 859                  * Use the quadratic formula to solve for largest allowable N.  We
 860                  * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
 861                  */
 862                 double          b = (double) nmcelem;
 863                 int                     n;
 864
 865                 n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
 866
 867                 /* Sort, then take just the first n elements */
 868                 qsort(elem_selec, unique_nitems, sizeof(float),
 869                           float_compare_desc);
 870                 unique_nitems = n;
 871         }
 872
 873         /*
 874          * Calculate probabilities of each distinct element count for both mcelems
 875          * and constant elements.  At this point, assume independent element
 876          * occurrence.
 877          */
 878         dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
 879         mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
 880
 881         /* ignore hist[nhist-1], which is the average not a histogram member */
 882         hist_part = calc_hist(hist, nhist - 1, unique_nitems);
 883
 884         selec = 0.0f;
 885         for (i = 0; i <= unique_nitems; i++)
 886         {
 887                 /*
 888                  * mult * dist[i] / mcelem_dist[i] gives us probability of qual
 889                  * matching from assumption of independent element occurrence with the
 890                  * condition that distinct element count = i.
 891                  */
 892                 if (mcelem_dist[i] > 0)
 893                         selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
 894         }
 895
 896         pfree(dist);
 897         pfree(mcelem_dist);
 898         pfree(hist_part);
 899         pfree(elem_selec);
 900
 901         /* Take into account occurrence of NULL element. */
 902         selec *= (1.0f - nullelem_freq);
 903
 904         CLAMP_PROBABILITY(selec);
 905
 906         return selec;
 907 }
 908
 909 /*
 910  * Calculate the first n distinct element count probabilities from a
 911  * histogram of distinct element counts.
 912  *
 913  * Returns a palloc'd array of n+1 entries, with array[k] being the
 914  * probability of element count k, k in [0..n].
 915  *
 916  * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
 917  * (nhist - 1)) probability to each value in (a,b) and an additional half of
 918  * that to a and b themselves.
 919  */
 920 static float *
 921 calc_hist(const float4 *hist, int nhist, int n)
 922 {
 923         float      *hist_part;
 924         int                     k,
 925                                 i = 0;
 926         float           prev_interval = 0,
 927                                 next_interval;
 928         float           frac;
 929
 930         hist_part = (float *) palloc((n + 1) * sizeof(float));
 931
 932         /*
 933          * frac is a probability contribution for each interval between histogram
 934          * values.  We have nhist - 1 intervals, so contribution of each one will
 935          * be 1 / (nhist - 1).
 936          */
 937         frac = 1.0f / ((float) (nhist - 1));
 938
 939         for (k = 0; k <= n; k++)
 940         {
 941                 int                     count = 0;
 942
 943                 /*
 944                  * Count the histogram boundaries equal to k.  (Although the histogram
 945                  * should theoretically contain only exact integers, entries are
 946                  * floats so there could be roundoff error in large values.  Treat any
 947                  * fractional value as equal to the next larger k.)
 948                  */
 949                 while (i < nhist && hist[i] <= k)
 950                 {
 951                         count++;
 952                         i++;
 953                 }
 954
 955                 if (count > 0)
 956                 {
 957                         /* k is an exact bound for at least one histogram box. */
 958                         float           val;
 959
 960                         /* Find length between current histogram value and the next one */
 961                         if (i < nhist)
 962                                 next_interval = hist[i] - hist[i - 1];
 963                         else
 964                                 next_interval = 0;
 965
 966                         /*
 967                          * count - 1 histogram boxes contain k exclusively.  They
 968                          * contribute a total of (count - 1) * frac probability.  Also
 969                          * factor in the partial histogram boxes on either side.
 970                          */
 971                         val = (float) (count - 1);
 972                         if (next_interval > 0)
 973                                 val += 0.5f / next_interval;
 974                         if (prev_interval > 0)
 975                                 val += 0.5f / prev_interval;
 976                         hist_part[k] = frac * val;
 977
 978                         prev_interval = next_interval;
 979                 }
 980                 else
 981                 {
 982                         /* k does not appear as an exact histogram bound. */
 983                         if (prev_interval > 0)
 984                                 hist_part[k] = frac / prev_interval;
 985                         else
 986                                 hist_part[k] = 0.0f;
 987                 }
 988         }
 989
 990         return hist_part;
 991 }
 992
 993 /*
 994  * Consider n independent events with probabilities p[].  This function
 995  * calculates probabilities of exact k of events occurrence for k in [0..m].
 996  * Returns a palloc'd array of size m+1.
 997  *
 998  * "rest" is the sum of the probabilities of all low-probability events not
 999  * included in p.
1000  *
1001  * Imagine matrix M of size (n + 1) x (m + 1).  Element M[i,j] denotes the
1002  * probability that exactly j of first i events occur.  Obviously M[0,0] = 1.
1003  * For any constant j, each increment of i increases the probability iff the
1004  * event occurs.  So, by the law of total probability:
1005  *      M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1006  *              for i > 0, j > 0.
1007  *      M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1008  */
1009 static float *
1010 calc_distr(const float *p, int n, int m, float rest)
1011 {
1012         float      *row,
1013                            *prev_row,
1014                            *tmp;
1015         int                     i,
1016                                 j;
1017
1018         /*
1019          * Since we return only the last row of the matrix and need only the
1020          * current and previous row for calculations, allocate two rows.
1021          */
1022         row = (float *) palloc((m + 1) * sizeof(float));
1023         prev_row = (float *) palloc((m + 1) * sizeof(float));
1024
1025         /* M[0,0] = 1 */
1026         row[0] = 1.0f;
1027         for (i = 1; i <= n; i++)
1028         {
1029                 float           t = p[i - 1];
1030
1031                 /* Swap rows */
1032                 tmp = row;
1033                 row = prev_row;
1034                 prev_row = tmp;
1035
1036                 /* Calculate next row */
1037                 for (j = 0; j <= i && j <= m; j++)
1038                 {
1039                         float           val = 0.0f;
1040
1041                         if (j < i)
1042                                 val += prev_row[j] * (1.0f - t);
1043                         if (j > 0)
1044                                 val += prev_row[j - 1] * t;
1045                         row[j] = val;
1046                 }
1047         }
1048
1049         /*
1050          * The presence of many distinct rare (not in "p") elements materially
1051          * decreases selectivity.  Model their collective occurrence with the
1052          * Poisson distribution.
1053          */
1054         if (rest > DEFAULT_CONTAIN_SEL)
1055         {
1056                 float           t;
1057
1058                 /* Swap rows */
1059                 tmp = row;
1060                 row = prev_row;
1061                 prev_row = tmp;
1062
1063                 for (i = 0; i <= m; i++)
1064                         row[i] = 0.0f;
1065
1066                 /* Value of Poisson distribution for 0 occurrences */
1067                 t = exp(-rest);
1068
1069                 /*
1070                  * Calculate convolution of previously computed distribution and the
1071                  * Poisson distribution.
1072                  */
1073                 for (i = 0; i <= m; i++)
1074                 {
1075                         for (j = 0; j <= m - i; j++)
1076                                 row[j + i] += prev_row[j] * t;
1077
1078                         /* Get Poisson distribution value for (i + 1) occurrences */
1079                         t *= rest / (float) (i + 1);
1080                 }
1081         }
1082
1083         pfree(prev_row);
1084         return row;
1085 }
1086
1087 /* Fast function for floor value of 2 based logarithm calculation. */
1088 static int
1089 floor_log2(uint32 n)
1090 {
1091         int                     logval = 0;
1092
1093         if (n == 0)
1094                 return -1;
1095         if (n >= (1 << 16))
1096         {
1097                 n >>= 16;
1098                 logval += 16;
1099         }
1100         if (n >= (1 << 8))
1101         {
1102                 n >>= 8;
1103                 logval += 8;
1104         }
1105         if (n >= (1 << 4))
1106         {
1107                 n >>= 4;
1108                 logval += 4;
1109         }
1110         if (n >= (1 << 2))
1111         {
1112                 n >>= 2;
1113                 logval += 2;
1114         }
1115         if (n >= (1 << 1))
1116         {
1117                 logval += 1;
1118         }
1119         return logval;
1120 }
1121
1122 /*
1123  * find_next_mcelem binary-searches a most common elements array, starting
1124  * from *index, for the first member >= value.  It saves the position of the
1125  * match into *index and returns true if it's an exact match.  (Note: we
1126  * assume the mcelem elements are distinct so there can't be more than one
1127  * exact match.)
1128  */
1129 static bool
1130 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
1131                                  TypeCacheEntry *typentry)
1132 {
1133         int                     l = *index,
1134                                 r = nmcelem - 1,
1135                                 i,
1136                                 res;
1137
1138         while (l <= r)
1139         {
1140                 i = (l + r) / 2;
1141                 res = element_compare(&mcelem[i], &value, typentry);
1142                 if (res == 0)
1143                 {
1144                         *index = i;
1145                         return true;
1146                 }
1147                 else if (res < 0)
1148                         l = i + 1;
1149                 else
1150                         r = i - 1;
1151         }
1152         *index = l;
1153         return false;
1154 }
1155
1156 /*
1157  * Comparison function for elements.
1158  *
1159  * We use the element type's default btree opclass, and its default collation
1160  * if the type is collation-sensitive.
1161  *
1162  * XXX consider using SortSupport infrastructure
1163  */
1164 static int
1165 element_compare(const void *key1, const void *key2, void *arg)
1166 {
1167         Datum           d1 = *((const Datum *) key1);
1168         Datum           d2 = *((const Datum *) key2);
1169         TypeCacheEntry *typentry = (TypeCacheEntry *) arg;
1170         FmgrInfo   *cmpfunc = &typentry->cmp_proc_finfo;
1171         Datum           c;
1172
1173         c = FunctionCall2Coll(cmpfunc, typentry->typcollation, d1, d2);
1174         return DatumGetInt32(c);
1175 }
1176
1177 /*
1178  * Comparison function for sorting floats into descending order.
1179  */
1180 static int
1181 float_compare_desc(const void *key1, const void *key2)
1182 {
1183         float           d1 = *((const float *) key1);
1184         float           d2 = *((const float *) key2);
1185
1186         if (d1 > d2)
1187                 return -1;
1188         else if (d1 < d2)
1189                 return 1;
1190         else
1191                 return 0;
1192 }