Improve nbtree unsatisfiable RowCompare detection.
[pgsql.git] / src / backend / statistics / mvdistinct.c
blob7e7a63405c8ba9c16ce31f22a22950195b99518a
1 /*-------------------------------------------------------------------------
3 * mvdistinct.c
4 * POSTGRES multivariate ndistinct coefficients
6 * Estimating number of groups in a combination of columns (e.g. for GROUP BY)
7 * is tricky, and the estimation error is often significant.
9 * The multivariate ndistinct coefficients address this by storing ndistinct
10 * estimates for combinations of the user-specified columns. So for example
11 * given a statistics object on three columns (a,b,c), this module estimates
12 * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column
13 * estimates are already available in pg_statistic.
16 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
19 * IDENTIFICATION
20 * src/backend/statistics/mvdistinct.c
22 *-------------------------------------------------------------------------
24 #include "postgres.h"
26 #include <math.h>
28 #include "catalog/pg_statistic_ext.h"
29 #include "catalog/pg_statistic_ext_data.h"
30 #include "lib/stringinfo.h"
31 #include "statistics/extended_stats_internal.h"
32 #include "statistics/statistics.h"
33 #include "utils/fmgrprotos.h"
34 #include "utils/syscache.h"
35 #include "utils/typcache.h"
36 #include "varatt.h"
38 static double ndistinct_for_combination(double totalrows, StatsBuildData *data,
39 int k, int *combination);
40 static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
41 static int n_choose_k(int n, int k);
42 static int num_combinations(int n);
44 /* size of the struct header fields (magic, type, nitems) */
45 #define SizeOfHeader (3 * sizeof(uint32))
47 /* size of a serialized ndistinct item (coefficient, natts, atts) */
48 #define SizeOfItem(natts) \
49 (sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber))
51 /* minimal size of a ndistinct item (with two attributes) */
52 #define MinSizeOfItem SizeOfItem(2)
54 /* minimal size of mvndistinct, when all items are minimal */
55 #define MinSizeOfItems(nitems) \
56 (SizeOfHeader + (nitems) * MinSizeOfItem)
58 /* Combination generator API */
60 /* internal state for generator of k-combinations of n elements */
61 typedef struct CombinationGenerator
63 int k; /* size of the combination */
64 int n; /* total number of elements */
65 int current; /* index of the next combination to return */
66 int ncombinations; /* number of combinations (size of array) */
67 int *combinations; /* array of pre-built combinations */
68 } CombinationGenerator;
70 static CombinationGenerator *generator_init(int n, int k);
71 static void generator_free(CombinationGenerator *state);
72 static int *generator_next(CombinationGenerator *state);
73 static void generate_combinations(CombinationGenerator *state);
77 * statext_ndistinct_build
78 * Compute ndistinct coefficient for the combination of attributes.
80 * This computes the ndistinct estimate using the same estimator used
81 * in analyze.c and then computes the coefficient.
83 * To handle expressions easily, we treat them as system attributes with
84 * negative attnums, and offset everything by number of expressions to
85 * allow using Bitmapsets.
87 MVNDistinct *
88 statext_ndistinct_build(double totalrows, StatsBuildData *data)
90 MVNDistinct *result;
91 int k;
92 int itemcnt;
93 int numattrs = data->nattnums;
94 int numcombs = num_combinations(numattrs);
96 result = palloc(offsetof(MVNDistinct, items) +
97 numcombs * sizeof(MVNDistinctItem));
98 result->magic = STATS_NDISTINCT_MAGIC;
99 result->type = STATS_NDISTINCT_TYPE_BASIC;
100 result->nitems = numcombs;
102 itemcnt = 0;
103 for (k = 2; k <= numattrs; k++)
105 int *combination;
106 CombinationGenerator *generator;
108 /* generate combinations of K out of N elements */
109 generator = generator_init(numattrs, k);
111 while ((combination = generator_next(generator)))
113 MVNDistinctItem *item = &result->items[itemcnt];
114 int j;
116 item->attributes = palloc(sizeof(AttrNumber) * k);
117 item->nattributes = k;
119 /* translate the indexes to attnums */
120 for (j = 0; j < k; j++)
122 item->attributes[j] = data->attnums[combination[j]];
124 Assert(AttributeNumberIsValid(item->attributes[j]));
127 item->ndistinct =
128 ndistinct_for_combination(totalrows, data, k, combination);
130 itemcnt++;
131 Assert(itemcnt <= result->nitems);
134 generator_free(generator);
137 /* must consume exactly the whole output array */
138 Assert(itemcnt == result->nitems);
140 return result;
144 * statext_ndistinct_load
145 * Load the ndistinct value for the indicated pg_statistic_ext tuple
147 MVNDistinct *
148 statext_ndistinct_load(Oid mvoid, bool inh)
150 MVNDistinct *result;
151 bool isnull;
152 Datum ndist;
153 HeapTuple htup;
155 htup = SearchSysCache2(STATEXTDATASTXOID,
156 ObjectIdGetDatum(mvoid), BoolGetDatum(inh));
157 if (!HeapTupleIsValid(htup))
158 elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
160 ndist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
161 Anum_pg_statistic_ext_data_stxdndistinct, &isnull);
162 if (isnull)
163 elog(ERROR,
164 "requested statistics kind \"%c\" is not yet built for statistics object %u",
165 STATS_EXT_NDISTINCT, mvoid);
167 result = statext_ndistinct_deserialize(DatumGetByteaPP(ndist));
169 ReleaseSysCache(htup);
171 return result;
175 * statext_ndistinct_serialize
176 * serialize ndistinct to the on-disk bytea format
178 bytea *
179 statext_ndistinct_serialize(MVNDistinct *ndistinct)
181 int i;
182 bytea *output;
183 char *tmp;
184 Size len;
186 Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC);
187 Assert(ndistinct->type == STATS_NDISTINCT_TYPE_BASIC);
190 * Base size is size of scalar fields in the struct, plus one base struct
191 * for each item, including number of items for each.
193 len = VARHDRSZ + SizeOfHeader;
195 /* and also include space for the actual attribute numbers */
196 for (i = 0; i < ndistinct->nitems; i++)
198 int nmembers;
200 nmembers = ndistinct->items[i].nattributes;
201 Assert(nmembers >= 2);
203 len += SizeOfItem(nmembers);
206 output = (bytea *) palloc(len);
207 SET_VARSIZE(output, len);
209 tmp = VARDATA(output);
211 /* Store the base struct values (magic, type, nitems) */
212 memcpy(tmp, &ndistinct->magic, sizeof(uint32));
213 tmp += sizeof(uint32);
214 memcpy(tmp, &ndistinct->type, sizeof(uint32));
215 tmp += sizeof(uint32);
216 memcpy(tmp, &ndistinct->nitems, sizeof(uint32));
217 tmp += sizeof(uint32);
220 * store number of attributes and attribute numbers for each entry
222 for (i = 0; i < ndistinct->nitems; i++)
224 MVNDistinctItem item = ndistinct->items[i];
225 int nmembers = item.nattributes;
227 memcpy(tmp, &item.ndistinct, sizeof(double));
228 tmp += sizeof(double);
229 memcpy(tmp, &nmembers, sizeof(int));
230 tmp += sizeof(int);
232 memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
233 tmp += nmembers * sizeof(AttrNumber);
235 /* protect against overflows */
236 Assert(tmp <= ((char *) output + len));
239 /* check we used exactly the expected space */
240 Assert(tmp == ((char *) output + len));
242 return output;
246 * statext_ndistinct_deserialize
247 * Read an on-disk bytea format MVNDistinct to in-memory format
249 MVNDistinct *
250 statext_ndistinct_deserialize(bytea *data)
252 int i;
253 Size minimum_size;
254 MVNDistinct ndist;
255 MVNDistinct *ndistinct;
256 char *tmp;
258 if (data == NULL)
259 return NULL;
261 /* we expect at least the basic fields of MVNDistinct struct */
262 if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
263 elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)",
264 VARSIZE_ANY_EXHDR(data), SizeOfHeader);
266 /* initialize pointer to the data part (skip the varlena header) */
267 tmp = VARDATA_ANY(data);
269 /* read the header fields and perform basic sanity checks */
270 memcpy(&ndist.magic, tmp, sizeof(uint32));
271 tmp += sizeof(uint32);
272 memcpy(&ndist.type, tmp, sizeof(uint32));
273 tmp += sizeof(uint32);
274 memcpy(&ndist.nitems, tmp, sizeof(uint32));
275 tmp += sizeof(uint32);
277 if (ndist.magic != STATS_NDISTINCT_MAGIC)
278 elog(ERROR, "invalid ndistinct magic %08x (expected %08x)",
279 ndist.magic, STATS_NDISTINCT_MAGIC);
280 if (ndist.type != STATS_NDISTINCT_TYPE_BASIC)
281 elog(ERROR, "invalid ndistinct type %d (expected %d)",
282 ndist.type, STATS_NDISTINCT_TYPE_BASIC);
283 if (ndist.nitems == 0)
284 elog(ERROR, "invalid zero-length item array in MVNDistinct");
286 /* what minimum bytea size do we expect for those parameters */
287 minimum_size = MinSizeOfItems(ndist.nitems);
288 if (VARSIZE_ANY_EXHDR(data) < minimum_size)
289 elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)",
290 VARSIZE_ANY_EXHDR(data), minimum_size);
293 * Allocate space for the ndistinct items (no space for each item's
294 * attnos: those live in bitmapsets allocated separately)
296 ndistinct = palloc0(MAXALIGN(offsetof(MVNDistinct, items)) +
297 (ndist.nitems * sizeof(MVNDistinctItem)));
298 ndistinct->magic = ndist.magic;
299 ndistinct->type = ndist.type;
300 ndistinct->nitems = ndist.nitems;
302 for (i = 0; i < ndistinct->nitems; i++)
304 MVNDistinctItem *item = &ndistinct->items[i];
306 /* ndistinct value */
307 memcpy(&item->ndistinct, tmp, sizeof(double));
308 tmp += sizeof(double);
310 /* number of attributes */
311 memcpy(&item->nattributes, tmp, sizeof(int));
312 tmp += sizeof(int);
313 Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
315 item->attributes
316 = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
318 memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
319 tmp += sizeof(AttrNumber) * item->nattributes;
321 /* still within the bytea */
322 Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
325 /* we should have consumed the whole bytea exactly */
326 Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
328 return ndistinct;
332 * pg_ndistinct_in
333 * input routine for type pg_ndistinct
335 * pg_ndistinct is real enough to be a table column, but it has no
336 * operations of its own, and disallows input (just like pg_node_tree).
338 Datum
339 pg_ndistinct_in(PG_FUNCTION_ARGS)
341 ereport(ERROR,
342 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
343 errmsg("cannot accept a value of type %s", "pg_ndistinct")));
345 PG_RETURN_VOID(); /* keep compiler quiet */
349 * pg_ndistinct
350 * output routine for type pg_ndistinct
352 * Produces a human-readable representation of the value.
354 Datum
355 pg_ndistinct_out(PG_FUNCTION_ARGS)
357 bytea *data = PG_GETARG_BYTEA_PP(0);
358 MVNDistinct *ndist = statext_ndistinct_deserialize(data);
359 int i;
360 StringInfoData str;
362 initStringInfo(&str);
363 appendStringInfoChar(&str, '{');
365 for (i = 0; i < ndist->nitems; i++)
367 int j;
368 MVNDistinctItem item = ndist->items[i];
370 if (i > 0)
371 appendStringInfoString(&str, ", ");
373 for (j = 0; j < item.nattributes; j++)
375 AttrNumber attnum = item.attributes[j];
377 appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum);
379 appendStringInfo(&str, "\": %d", (int) item.ndistinct);
382 appendStringInfoChar(&str, '}');
384 PG_RETURN_CSTRING(str.data);
388 * pg_ndistinct_recv
389 * binary input routine for type pg_ndistinct
391 Datum
392 pg_ndistinct_recv(PG_FUNCTION_ARGS)
394 ereport(ERROR,
395 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
396 errmsg("cannot accept a value of type %s", "pg_ndistinct")));
398 PG_RETURN_VOID(); /* keep compiler quiet */
402 * pg_ndistinct_send
403 * binary output routine for type pg_ndistinct
405 * n-distinct is serialized into a bytea value, so let's send that.
407 Datum
408 pg_ndistinct_send(PG_FUNCTION_ARGS)
410 return byteasend(fcinfo);
414 * ndistinct_for_combination
415 * Estimates number of distinct values in a combination of columns.
417 * This uses the same ndistinct estimator as compute_scalar_stats() in
418 * ANALYZE, i.e.,
419 * n*d / (n - f1 + f1*n/N)
421 * except that instead of values in a single column we are dealing with
422 * combination of multiple columns.
424 static double
425 ndistinct_for_combination(double totalrows, StatsBuildData *data,
426 int k, int *combination)
428 int i,
430 int f1,
431 cnt,
433 bool *isnull;
434 Datum *values;
435 SortItem *items;
436 MultiSortSupport mss;
437 int numrows = data->numrows;
439 mss = multi_sort_init(k);
442 * In order to determine the number of distinct elements, create separate
443 * values[]/isnull[] arrays with all the data we have, then sort them
444 * using the specified column combination as dimensions. We could try to
445 * sort in place, but it'd probably be more complex and bug-prone.
447 items = (SortItem *) palloc(numrows * sizeof(SortItem));
448 values = (Datum *) palloc0(sizeof(Datum) * numrows * k);
449 isnull = (bool *) palloc0(sizeof(bool) * numrows * k);
451 for (i = 0; i < numrows; i++)
453 items[i].values = &values[i * k];
454 items[i].isnull = &isnull[i * k];
458 * For each dimension, set up sort-support and fill in the values from the
459 * sample data.
461 * We use the column data types' default sort operators and collations;
462 * perhaps at some point it'd be worth using column-specific collations?
464 for (i = 0; i < k; i++)
466 Oid typid;
467 TypeCacheEntry *type;
468 Oid collid = InvalidOid;
469 VacAttrStats *colstat = data->stats[combination[i]];
471 typid = colstat->attrtypid;
472 collid = colstat->attrcollid;
474 type = lookup_type_cache(typid, TYPECACHE_LT_OPR);
475 if (type->lt_opr == InvalidOid) /* shouldn't happen */
476 elog(ERROR, "cache lookup failed for ordering operator for type %u",
477 typid);
479 /* prepare the sort function for this dimension */
480 multi_sort_add_dimension(mss, i, type->lt_opr, collid);
482 /* accumulate all the data for this dimension into the arrays */
483 for (j = 0; j < numrows; j++)
485 items[j].values[i] = data->values[combination[i]][j];
486 items[j].isnull[i] = data->nulls[combination[i]][j];
490 /* We can sort the array now ... */
491 qsort_interruptible(items, numrows, sizeof(SortItem),
492 multi_sort_compare, mss);
494 /* ... and count the number of distinct combinations */
496 f1 = 0;
497 cnt = 1;
498 d = 1;
499 for (i = 1; i < numrows; i++)
501 if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
503 if (cnt == 1)
504 f1 += 1;
506 d++;
507 cnt = 0;
510 cnt += 1;
513 if (cnt == 1)
514 f1 += 1;
516 return estimate_ndistinct(totalrows, numrows, d, f1);
519 /* The Duj1 estimator (already used in analyze.c). */
520 static double
521 estimate_ndistinct(double totalrows, int numrows, int d, int f1)
523 double numer,
524 denom,
525 ndistinct;
527 numer = (double) numrows * (double) d;
529 denom = (double) (numrows - f1) +
530 (double) f1 * (double) numrows / totalrows;
532 ndistinct = numer / denom;
534 /* Clamp to sane range in case of roundoff error */
535 if (ndistinct < (double) d)
536 ndistinct = (double) d;
538 if (ndistinct > totalrows)
539 ndistinct = totalrows;
541 return floor(ndistinct + 0.5);
545 * n_choose_k
546 * computes binomial coefficients using an algorithm that is both
547 * efficient and prevents overflows
549 static int
550 n_choose_k(int n, int k)
552 int d,
555 Assert((k > 0) && (n >= k));
557 /* use symmetry of the binomial coefficients */
558 k = Min(k, n - k);
560 r = 1;
561 for (d = 1; d <= k; ++d)
563 r *= n--;
564 r /= d;
567 return r;
571 * num_combinations
572 * number of combinations, excluding single-value combinations
574 static int
575 num_combinations(int n)
577 return (1 << n) - (n + 1);
581 * generator_init
582 * initialize the generator of combinations
584 * The generator produces combinations of K elements in the interval (0..N).
585 * We prebuild all the combinations in this method, which is simpler than
586 * generating them on the fly.
588 static CombinationGenerator *
589 generator_init(int n, int k)
591 CombinationGenerator *state;
593 Assert((n >= k) && (k > 0));
595 /* allocate the generator state as a single chunk of memory */
596 state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator));
598 state->ncombinations = n_choose_k(n, k);
600 /* pre-allocate space for all combinations */
601 state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations);
603 state->current = 0;
604 state->k = k;
605 state->n = n;
607 /* now actually pre-generate all the combinations of K elements */
608 generate_combinations(state);
610 /* make sure we got the expected number of combinations */
611 Assert(state->current == state->ncombinations);
613 /* reset the number, so we start with the first one */
614 state->current = 0;
616 return state;
620 * generator_next
621 * returns the next combination from the prebuilt list
623 * Returns a combination of K array indexes (0 .. N), as specified to
624 * generator_init), or NULL when there are no more combination.
626 static int *
627 generator_next(CombinationGenerator *state)
629 if (state->current == state->ncombinations)
630 return NULL;
632 return &state->combinations[state->k * state->current++];
636 * generator_free
637 * free the internal state of the generator
639 * Releases the generator internal state (pre-built combinations).
641 static void
642 generator_free(CombinationGenerator *state)
644 pfree(state->combinations);
645 pfree(state);
649 * generate_combinations_recurse
650 * given a prefix, generate all possible combinations
652 * Given a prefix (first few elements of the combination), generate following
653 * elements recursively. We generate the combinations in lexicographic order,
654 * which eliminates permutations of the same combination.
656 static void
657 generate_combinations_recurse(CombinationGenerator *state,
658 int index, int start, int *current)
660 /* If we haven't filled all the elements, simply recurse. */
661 if (index < state->k)
663 int i;
666 * The values have to be in ascending order, so make sure we start
667 * with the value passed by parameter.
670 for (i = start; i < state->n; i++)
672 current[index] = i;
673 generate_combinations_recurse(state, (index + 1), (i + 1), current);
676 return;
678 else
680 /* we got a valid combination, add it to the array */
681 memcpy(&state->combinations[(state->k * state->current)],
682 current, state->k * sizeof(int));
683 state->current++;
688 * generate_combinations
689 * generate all k-combinations of N elements
691 static void
692 generate_combinations(CombinationGenerator *state)
694 int *current = (int *) palloc0(sizeof(int) * state->k);
696 generate_combinations_recurse(state, 0, 0, current);
698 pfree(current);