1 /*-------------------------------------------------------------------------
4 * Selectivity estimation functions for text search operators.
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 *-------------------------------------------------------------------------
16 #include "catalog/pg_statistic.h"
17 #include "catalog/pg_type.h"
18 #include "miscadmin.h"
19 #include "nodes/nodes.h"
20 #include "tsearch/ts_type.h"
21 #include "utils/lsyscache.h"
22 #include "utils/selfuncs.h"
23 #include "utils/syscache.h"
27 * The default text search selectivity is chosen to be small enough to
28 * encourage indexscans for typical table densities. See selfuncs.h and
29 * DEFAULT_EQ_SEL for details.
31 #define DEFAULT_TS_MATCH_SEL 0.005
33 /* lookup table type for binary searching through MCELEMs */
40 /* type of keys for bsearch'ing through an array of TextFreqs */
47 static Selectivity
tsquerysel(VariableStatData
*vardata
, Datum constval
);
48 static Selectivity
mcelem_tsquery_selec(TSQuery query
,
49 Datum
*mcelem
, int nmcelem
,
50 float4
*numbers
, int nnumbers
);
51 static Selectivity
tsquery_opr_selec(QueryItem
*item
, char *operand
,
52 TextFreq
*lookup
, int length
, float4 minfreq
);
53 static int compare_lexeme_textfreq(const void *e1
, const void *e2
);
57 * tsmatchsel -- Selectivity of "@@"
59 * restriction selectivity function for tsvector @@ tsquery and
63 tsmatchsel(PG_FUNCTION_ARGS
)
65 PlannerInfo
*root
= (PlannerInfo
*) PG_GETARG_POINTER(0);
67 Oid
operator = PG_GETARG_OID(1);
69 List
*args
= (List
*) PG_GETARG_POINTER(2);
70 int varRelid
= PG_GETARG_INT32(3);
71 VariableStatData vardata
;
77 * If expression is not variable = something or something = variable, then
78 * punt and return a default estimate.
80 if (!get_restriction_variable(root
, args
, varRelid
,
81 &vardata
, &other
, &varonleft
))
82 PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL
);
85 * Can't do anything useful if the something is not a constant, either.
87 if (!IsA(other
, Const
))
89 ReleaseVariableStats(vardata
);
90 PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL
);
94 * The "@@" operator is strict, so we can cope with NULL right away
96 if (((Const
*) other
)->constisnull
)
98 ReleaseVariableStats(vardata
);
99 PG_RETURN_FLOAT8(0.0);
103 * OK, there's a Var and a Const we're dealing with here. We need the Var
104 * to be a TSVector (or else we don't have any useful statistic for it).
105 * We have to check this because the Var might be the TSQuery not the
108 if (vardata
.vartype
== TSVECTOROID
)
110 /* tsvector @@ tsquery or the other way around */
111 Assert(((Const
*) other
)->consttype
== TSQUERYOID
);
113 selec
= tsquerysel(&vardata
, ((Const
*) other
)->constvalue
);
117 /* The Var is something we don't have useful statistics for */
118 selec
= DEFAULT_TS_MATCH_SEL
;
121 ReleaseVariableStats(vardata
);
123 CLAMP_PROBABILITY(selec
);
125 PG_RETURN_FLOAT8((float8
) selec
);
130 * tsmatchjoinsel -- join selectivity of "@@"
132 * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
135 tsmatchjoinsel(PG_FUNCTION_ARGS
)
137 /* for the moment we just punt */
138 PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL
);
143 * @@ selectivity for tsvector var vs tsquery constant
146 tsquerysel(VariableStatData
*vardata
, Datum constval
)
150 if (HeapTupleIsValid(vardata
->statsTuple
))
153 Form_pg_statistic stats
;
159 /* The caller made sure the const is a TSQuery, so get it now */
160 query
= DatumGetTSQuery(constval
);
162 stats
= (Form_pg_statistic
) GETSTRUCT(vardata
->statsTuple
);
164 /* MCELEM will be an array of TEXT elements for a tsvector column */
165 if (get_attstatsslot(vardata
->statsTuple
,
167 STATISTIC_KIND_MCELEM
, InvalidOid
,
169 &numbers
, &nnumbers
))
172 * There is a most-common-elements slot for the tsvector Var, so
175 selec
= mcelem_tsquery_selec(query
, values
, nvalues
,
177 free_attstatsslot(TEXTOID
, values
, nvalues
, numbers
, nnumbers
);
181 /* No most-common-elements info, so we must punt */
182 selec
= (Selectivity
) DEFAULT_TS_MATCH_SEL
;
187 /* No stats at all, so we must punt */
188 selec
= (Selectivity
) DEFAULT_TS_MATCH_SEL
;
195 * Extract data from the pg_statistic arrays into useful format.
198 mcelem_tsquery_selec(TSQuery query
, Datum
*mcelem
, int nmcelem
,
199 float4
*numbers
, int nnumbers
)
207 * There should be two more Numbers than Values, because the last two
208 * cells are taken for minimal and maximal frequency. Punt if not.
210 if (nnumbers
!= nmcelem
+ 2)
211 return DEFAULT_TS_MATCH_SEL
;
214 * Transpose the data into a single array so we can use bsearch().
216 lookup
= (TextFreq
*) palloc(sizeof(TextFreq
) * nmcelem
);
217 for (i
= 0; i
< nmcelem
; i
++)
220 * The text Datums came from an array, so it cannot be compressed
221 * or stored out-of-line -- it's safe to use VARSIZE_ANY*.
223 Assert(!VARATT_IS_COMPRESSED(mcelem
[i
]) && !VARATT_IS_EXTERNAL(mcelem
[i
]));
224 lookup
[i
].element
= (text
*) DatumGetPointer(mcelem
[i
]);
225 lookup
[i
].frequency
= numbers
[i
];
229 * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
230 * the one before the last cell of the Numbers array. See ts_typanalyze.c
232 minfreq
= numbers
[nnumbers
- 2];
234 selec
= tsquery_opr_selec(GETQUERY(query
), GETOPERAND(query
), lookup
,
243 * Traverse the tsquery in preorder, calculating selectivity as:
245 * selec(left_oper) * selec(right_oper) in AND nodes,
247 * selec(left_oper) + selec(right_oper) -
248 * selec(left_oper) * selec(right_oper) in OR nodes,
250 * 1 - select(oper) in NOT nodes
252 * freq[val] in VAL nodes, if the value is in MCELEM
253 * min(freq[MCELEM]) / 2 in VAL nodes, if it is not
256 * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
257 * binary search for determining freq[MCELEM].
260 tsquery_opr_selec(QueryItem
*item
, char *operand
,
261 TextFreq
*lookup
, int length
, float4 minfreq
)
265 Selectivity selec
, s1
, s2
;
267 /* since this function recurses, it could be driven to stack overflow */
270 if (item
->type
== QI_VAL
)
272 QueryOperand
*oper
= (QueryOperand
*) item
;
275 * Prepare the key for bsearch().
277 key
.lexeme
= operand
+ oper
->distance
;
278 key
.length
= oper
->length
;
280 searchres
= (TextFreq
*) bsearch(&key
, lookup
, length
,
282 compare_lexeme_textfreq
);
287 * The element is in MCELEM. Return precise selectivity (or at
288 * least as precise as ANALYZE could find out).
290 return (Selectivity
) searchres
->frequency
;
295 * The element is not in MCELEM. Punt, but assert that the
296 * selectivity cannot be more than minfreq / 2.
298 return (Selectivity
) Min(DEFAULT_TS_MATCH_SEL
, minfreq
/ 2);
302 /* Current TSQuery node is an operator */
303 switch (item
->operator.oper
)
306 selec
= 1.0 - tsquery_opr_selec(item
+ 1, operand
,
307 lookup
, length
, minfreq
);
311 s1
= tsquery_opr_selec(item
+ 1, operand
,
312 lookup
, length
, minfreq
);
313 s2
= tsquery_opr_selec(item
+ item
->operator.left
, operand
,
314 lookup
, length
, minfreq
);
319 s1
= tsquery_opr_selec(item
+ 1, operand
,
320 lookup
, length
, minfreq
);
321 s2
= tsquery_opr_selec(item
+ item
->operator.left
, operand
,
322 lookup
, length
, minfreq
);
323 selec
= s1
+ s2
- s1
* s2
;
327 elog(ERROR
, "unrecognized operator: %d", item
->operator.oper
);
328 selec
= 0; /* keep compiler quiet */
332 /* Clamp intermediate results to stay sane despite roundoff error */
333 CLAMP_PROBABILITY(selec
);
339 * bsearch() comparator for a lexeme (non-NULL terminated string with length)
340 * and a TextFreq. Use length, then byte-for-byte comparison, because that's
341 * how ANALYZE code sorted data before storing it in a statistic tuple.
342 * See ts_typanalyze.c for details.
345 compare_lexeme_textfreq(const void *e1
, const void *e2
)
347 const LexemeKey
*key
= (const LexemeKey
*) e1
;
348 const TextFreq
*t
= (const TextFreq
*) e2
;
353 len2
= VARSIZE_ANY_EXHDR(t
->element
);
355 /* Compare lengths first, possibly avoiding a strncmp call */
358 else if (len1
< len2
)
361 /* Fall back on byte-for-byte comparison */
362 return strncmp(key
->lexeme
, VARDATA_ANY(t
->element
), len1
);