src/backend/tsearch/ts_typanalyze.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * ts_typanalyze.c
   4  *        functions for gathering statistics from tsvector columns
   5  *
   6  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   7  *
   8  *
   9  * IDENTIFICATION
  10  *        $PostgreSQL$
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14 #include "postgres.h"
  15
  16 #include "access/hash.h"
  17 #include "catalog/pg_operator.h"
  18 #include "commands/vacuum.h"
  19 #include "tsearch/ts_type.h"
  20 #include "utils/builtins.h"
  21 #include "utils/hsearch.h"
  22
  23
  24 /* A hash key for lexemes */
  25 typedef struct
  26 {
  27         char       *lexeme;                     /* lexeme (not NULL terminated!) */
  28         int                     length;                 /* its length in bytes */
  29 } LexemeHashKey;
  30
  31 /* A hash table entry for the Lossy Counting algorithm */
  32 typedef struct
  33 {
  34         LexemeHashKey key;                      /* This is 'e' from the LC algorithm. */
  35         int                     frequency;              /* This is 'f'. */
  36         int                     delta;                  /* And this is 'delta'. */
  37 } TrackItem;
  38
  39 static void compute_tsvector_stats(VacAttrStats *stats,
  40                                            AnalyzeAttrFetchFunc fetchfunc,
  41                                            int samplerows,
  42                                            double totalrows);
  43 static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
  44 static uint32 lexeme_hash(const void *key, Size keysize);
  45 static int      lexeme_match(const void *key1, const void *key2, Size keysize);
  46 static int      lexeme_compare(const void *key1, const void *key2);
  47 static int      trackitem_compare_frequencies_desc(const void *e1, const void *e2);
  48 static int      trackitem_compare_lexemes(const void *e1, const void *e2);
  49
  50
  51 /*
  52  *      ts_typanalyze -- a custom typanalyze function for tsvector columns
  53  */
  54 Datum
  55 ts_typanalyze(PG_FUNCTION_ARGS)
  56 {
  57         VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
  58         Form_pg_attribute attr = stats->attr;
  59
  60         /* If the attstattarget column is negative, use the default value */
  61         /* NB: it is okay to scribble on stats->attr since it's a copy */
  62         if (attr->attstattarget < 0)
  63                 attr->attstattarget = default_statistics_target;
  64
  65         stats->compute_stats = compute_tsvector_stats;
  66         /* see comment about the choice of minrows in commands/analyze.c */
  67         stats->minrows = 300 * attr->attstattarget;
  68
  69         PG_RETURN_BOOL(true);
  70 }
  71
  72 /*
  73  *      compute_tsvector_stats() -- compute statistics for a tsvector column
  74  *
  75  *      This functions computes statistics that are useful for determining @@
  76  *      operations' selectivity, along with the fraction of non-null rows and
  77  *      average width.
  78  *
  79  *      Instead of finding the most common values, as we do for most datatypes,
  80  *      we're looking for the most common lexemes. This is more useful, because
  81  *      there most probably won't be any two rows with the same tsvector and thus
  82  *      the notion of a MCV is a bit bogus with this datatype. With a list of the
  83  *      most common lexemes we can do a better job at figuring out @@ selectivity.
  84  *
  85  *      For the same reasons we assume that tsvector columns are unique when
  86  *      determining the number of distinct values.
  87  *
  88  *      The algorithm used is Lossy Counting, as proposed in the paper "Approximate
  89  *      frequency counts over data streams" by G. S. Manku and R. Motwani, in
  90  *      Proceedings of the 28th International Conference on Very Large Data Bases,
  91  *      Hong Kong, China, August 2002, section 4.2. The paper is available at
  92  *      http://www.vldb.org/conf/2002/S10P03.pdf
  93  *
  94  *      The Lossy Counting (aka LC) algorithm goes like this:
  95  *      Let D be a set of triples (e, f, d), where e is an element value, f is
  96  *      that element's frequency (occurrence count) and d is the maximum error in
  97  *      f.      We start with D empty and process the elements in batches of size
  98  *      w. (The batch size is also known as "bucket size".) Let the current batch
  99  *      number be b_current, starting with 1. For each element e we either
 100  *      increment its f count, if it's already in D, or insert a new triple into D
 101  *      with values (e, 1, b_current - 1). After processing each batch we prune D,
 102  *      by removing from it all elements with f + d <= b_current. Finally, we
 103  *      gather elements with largest f.  The LC paper proves error bounds on f
 104  *      dependent on the batch size w, and shows that the required table size
 105  *      is no more than a few times w.
 106  *
 107  *      We use a hashtable for the D structure and a bucket width of
 108  *      statistics_target * 10, where 10 is an arbitrarily chosen constant,
 109  *      meant to approximate the number of lexemes in a single tsvector.
 110  */
 111 static void
 112 compute_tsvector_stats(VacAttrStats *stats,
 113                                            AnalyzeAttrFetchFunc fetchfunc,
 114                                            int samplerows,
 115                                            double totalrows)
 116 {
 117         int                     num_mcelem;
 118         int                     null_cnt = 0;
 119         double          total_width = 0;
 120
 121         /* This is D from the LC algorithm. */
 122         HTAB       *lexemes_tab;
 123         HASHCTL         hash_ctl;
 124         HASH_SEQ_STATUS scan_status;
 125
 126         /* This is the current bucket number from the LC algorithm */
 127         int                     b_current;
 128
 129         /* This is 'w' from the LC algorithm */
 130         int                     bucket_width;
 131         int                     vector_no,
 132                                 lexeme_no;
 133         LexemeHashKey hash_key;
 134         TrackItem  *item;
 135
 136         /* We want statistics_target * 10 lexemes in the MCELEM array */
 137         num_mcelem = stats->attr->attstattarget * 10;
 138
 139         /*
 140          * We set bucket width equal to the target number of result lexemes. This
 141          * is probably about right but perhaps might need to be scaled up or down
 142          * a bit?
 143          */
 144         bucket_width = num_mcelem;
 145
 146         /*
 147          * Create the hashtable. It will be in local memory, so we don't need to
 148          * worry about initial size too much. Also we don't need to pay any
 149          * attention to locking and memory management.
 150          */
 151         MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 152         hash_ctl.keysize = sizeof(LexemeHashKey);
 153         hash_ctl.entrysize = sizeof(TrackItem);
 154         hash_ctl.hash = lexeme_hash;
 155         hash_ctl.match = lexeme_match;
 156         hash_ctl.hcxt = CurrentMemoryContext;
 157         lexemes_tab = hash_create("Analyzed lexemes table",
 158                                                           bucket_width * 4,
 159                                                           &hash_ctl,
 160                                         HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
 161
 162         /* Initialize counters. */
 163         b_current = 1;
 164         lexeme_no = 1;
 165
 166         /* Loop over the tsvectors. */
 167         for (vector_no = 0; vector_no < samplerows; vector_no++)
 168         {
 169                 Datum           value;
 170                 bool            isnull;
 171                 TSVector        vector;
 172                 WordEntry  *curentryptr;
 173                 char       *lexemesptr;
 174                 int                     j;
 175
 176                 vacuum_delay_point();
 177
 178                 value = fetchfunc(stats, vector_no, &isnull);
 179
 180                 /*
 181                  * Check for null/nonnull.
 182                  */
 183                 if (isnull)
 184                 {
 185                         null_cnt++;
 186                         continue;
 187                 }
 188
 189                 /*
 190                  * Add up widths for average-width calculation.  Since it's a
 191                  * tsvector, we know it's varlena.  As in the regular
 192                  * compute_minimal_stats function, we use the toasted width for this
 193                  * calculation.
 194                  */
 195                 total_width += VARSIZE_ANY(DatumGetPointer(value));
 196
 197                 /*
 198                  * Now detoast the tsvector if needed.
 199                  */
 200                 vector = DatumGetTSVector(value);
 201
 202                 /*
 203                  * We loop through the lexemes in the tsvector and add them to our
 204                  * tracking hashtable.  Note: the hashtable entries will point into
 205                  * the (detoasted) tsvector value, therefore we cannot free that
 206                  * storage until we're done.
 207                  */
 208                 lexemesptr = STRPTR(vector);
 209                 curentryptr = ARRPTR(vector);
 210                 for (j = 0; j < vector->size; j++)
 211                 {
 212                         bool            found;
 213
 214                         /* Construct a hash key */
 215                         hash_key.lexeme = lexemesptr + curentryptr->pos;
 216                         hash_key.length = curentryptr->len;
 217
 218                         /* Lookup current lexeme in hashtable, adding it if new */
 219                         item = (TrackItem *) hash_search(lexemes_tab,
 220                                                                                          (const void *) &hash_key,
 221                                                                                          HASH_ENTER, &found);
 222
 223                         if (found)
 224                         {
 225                                 /* The lexeme is already on the tracking list */
 226                                 item->frequency++;
 227                         }
 228                         else
 229                         {
 230                                 /* Initialize new tracking list element */
 231                                 item->frequency = 1;
 232                                 item->delta = b_current - 1;
 233                         }
 234
 235                         /* We prune the D structure after processing each bucket */
 236                         if (lexeme_no % bucket_width == 0)
 237                         {
 238                                 prune_lexemes_hashtable(lexemes_tab, b_current);
 239                                 b_current++;
 240                         }
 241
 242                         /* Advance to the next WordEntry in the tsvector */
 243                         lexeme_no++;
 244                         curentryptr++;
 245                 }
 246         }
 247
 248         /* We can only compute real stats if we found some non-null values. */
 249         if (null_cnt < samplerows)
 250         {
 251                 int                     nonnull_cnt = samplerows - null_cnt;
 252                 int                     i;
 253                 TrackItem **sort_table;
 254                 int                     track_len;
 255                 int                     minfreq,
 256                                         maxfreq;
 257
 258                 stats->stats_valid = true;
 259                 /* Do the simple null-frac and average width stats */
 260                 stats->stanullfrac = (double) null_cnt / (double) samplerows;
 261                 stats->stawidth = total_width / (double) nonnull_cnt;
 262
 263                 /* Assume it's a unique column (see notes above) */
 264                 stats->stadistinct = -1.0;
 265
 266                 /*
 267                  * Determine the top-N lexemes by simply copying pointers from the
 268                  * hashtable into an array and applying qsort()
 269                  */
 270                 track_len = hash_get_num_entries(lexemes_tab);
 271
 272                 sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * track_len);
 273
 274                 hash_seq_init(&scan_status, lexemes_tab);
 275                 i = 0;
 276                 while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
 277                 {
 278                         sort_table[i++] = item;
 279                 }
 280                 Assert(i == track_len);
 281
 282                 qsort(sort_table, track_len, sizeof(TrackItem *),
 283                           trackitem_compare_frequencies_desc);
 284
 285                 /* Suppress any single-occurrence items */
 286                 while (track_len > 0)
 287                 {
 288                         if (sort_table[track_len - 1]->frequency > 1)
 289                                 break;
 290                         track_len--;
 291                 }
 292
 293                 /* Determine the number of most common lexemes to be stored */
 294                 if (num_mcelem > track_len)
 295                         num_mcelem = track_len;
 296
 297                 /* Generate MCELEM slot entry */
 298                 if (num_mcelem > 0)
 299                 {
 300                         MemoryContext old_context;
 301                         Datum      *mcelem_values;
 302                         float4     *mcelem_freqs;
 303
 304                         /* Grab the minimal and maximal frequencies that will get stored */
 305                         minfreq = sort_table[num_mcelem - 1]->frequency;
 306                         maxfreq = sort_table[0]->frequency;
 307
 308                         /*
 309                          * We want to store statistics sorted on the lexeme value using
 310                          * first length, then byte-for-byte comparison. The reason for
 311                          * doing length comparison first is that we don't care about the
 312                          * ordering so long as it's consistent, and comparing lengths
 313                          * first gives us a chance to avoid a strncmp() call.
 314                          *
 315                          * This is different from what we do with scalar statistics --
 316                          * they get sorted on frequencies. The rationale is that we
 317                          * usually search through most common elements looking for a
 318                          * specific value, so we can grab its frequency.  When values are
 319                          * presorted we can employ binary search for that.      See
 320                          * ts_selfuncs.c for a real usage scenario.
 321                          */
 322                         qsort(sort_table, num_mcelem, sizeof(TrackItem *),
 323                                   trackitem_compare_lexemes);
 324
 325                         /* Must copy the target values into anl_context */
 326                         old_context = MemoryContextSwitchTo(stats->anl_context);
 327
 328                         /*
 329                          * We sorted statistics on the lexeme value, but we want to be
 330                          * able to find out the minimal and maximal frequency without
 331                          * going through all the values.  We keep those two extra
 332                          * frequencies in two extra cells in mcelem_freqs.
 333                          */
 334                         mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
 335                         mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));
 336
 337                         for (i = 0; i < num_mcelem; i++)
 338                         {
 339                                 TrackItem  *item = sort_table[i];
 340
 341                                 mcelem_values[i] =
 342                                         PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
 343                                                                                                                   item->key.length));
 344                                 mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt;
 345                         }
 346                         mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
 347                         mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt;
 348                         MemoryContextSwitchTo(old_context);
 349
 350                         stats->stakind[0] = STATISTIC_KIND_MCELEM;
 351                         stats->staop[0] = TextEqualOperator;
 352                         stats->stanumbers[0] = mcelem_freqs;
 353                         /* See above comment about two extra frequency fields */
 354                         stats->numnumbers[0] = num_mcelem + 2;
 355                         stats->stavalues[0] = mcelem_values;
 356                         stats->numvalues[0] = num_mcelem;
 357                         /* We are storing text values */
 358                         stats->statypid[0] = TEXTOID;
 359                         stats->statyplen[0] = -1;       /* typlen, -1 for varlena */
 360                         stats->statypbyval[0] = false;
 361                         stats->statypalign[0] = 'i';
 362                 }
 363         }
 364         else
 365         {
 366                 /* We found only nulls; assume the column is entirely null */
 367                 stats->stats_valid = true;
 368                 stats->stanullfrac = 1.0;
 369                 stats->stawidth = 0;    /* "unknown" */
 370                 stats->stadistinct = 0.0;               /* "unknown" */
 371         }
 372
 373         /*
 374          * We don't need to bother cleaning up any of our temporary palloc's. The
 375          * hashtable should also go away, as it used a child memory context.
 376          */
 377 }
 378
 379 /*
 380  *      A function to prune the D structure from the Lossy Counting algorithm.
 381  *      Consult compute_tsvector_stats() for wider explanation.
 382  */
 383 static void
 384 prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current)
 385 {
 386         HASH_SEQ_STATUS scan_status;
 387         TrackItem  *item;
 388
 389         hash_seq_init(&scan_status, lexemes_tab);
 390         while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
 391         {
 392                 if (item->frequency + item->delta <= b_current)
 393                 {
 394                         if (hash_search(lexemes_tab, (const void *) &item->key,
 395                                                         HASH_REMOVE, NULL) == NULL)
 396                                 elog(ERROR, "hash table corrupted");
 397                 }
 398         }
 399 }
 400
 401 /*
 402  * Hash functions for lexemes. They are strings, but not NULL terminated,
 403  * so we need a special hash function.
 404  */
 405 static uint32
 406 lexeme_hash(const void *key, Size keysize)
 407 {
 408         const LexemeHashKey *l = (const LexemeHashKey *) key;
 409
 410         return DatumGetUInt32(hash_any((const unsigned char *) l->lexeme,
 411                                                                    l->length));
 412 }
 413
 414 /*
 415  *      Matching function for lexemes, to be used in hashtable lookups.
 416  */
 417 static int
 418 lexeme_match(const void *key1, const void *key2, Size keysize)
 419 {
 420         /* The keysize parameter is superfluous, the keys store their lengths */
 421         return lexeme_compare(key1, key2);
 422 }
 423
 424 /*
 425  *      Comparison function for lexemes.
 426  */
 427 static int
 428 lexeme_compare(const void *key1, const void *key2)
 429 {
 430         const LexemeHashKey *d1 = (const LexemeHashKey *) key1;
 431         const LexemeHashKey *d2 = (const LexemeHashKey *) key2;
 432
 433         /* First, compare by length */
 434         if (d1->length > d2->length)
 435                 return 1;
 436         else if (d1->length < d2->length)
 437                 return -1;
 438         /* Lengths are equal, do a byte-by-byte comparison */
 439         return strncmp(d1->lexeme, d2->lexeme, d1->length);
 440 }
 441
 442 /*
 443  *      qsort() comparator for sorting TrackItems on frequencies (descending sort)
 444  */
 445 static int
 446 trackitem_compare_frequencies_desc(const void *e1, const void *e2)
 447 {
 448         const TrackItem *const * t1 = (const TrackItem *const *) e1;
 449         const TrackItem *const * t2 = (const TrackItem *const *) e2;
 450
 451         return (*t2)->frequency - (*t1)->frequency;
 452 }
 453
 454 /*
 455  *      qsort() comparator for sorting TrackItems on lexemes
 456  */
 457 static int
 458 trackitem_compare_lexemes(const void *e1, const void *e2)
 459 {
 460         const TrackItem *const * t1 = (const TrackItem *const *) e1;
 461         const TrackItem *const * t2 = (const TrackItem *const *) e2;
 462
 463         return lexeme_compare(&(*t1)->key, &(*t2)->key);
 464 }