// Compiles lists of PhutilSearchQueryToken objects into query strings and
// tokenizes raw query text. The characters used when rendering are taken from
// the operator table below and can be replaced with setOperators().
3 final class PhutilSearchQueryCompiler
// Operator character table. The visible code reads index 0 (prefix for AND),
// index 2 (prefix for NOT) and indexes 10 and 11 (opening and closing quote).
6 private $operators = '+ -><()~*:""&|';
// When true, the tokenizer looks for function prefixes (letters followed by a
// colon) and accepts tilde and equals as additional operator characters.
9 private $enableFunctions = false;
// Operator identifiers assigned to tokens while parsing.
11 const OPERATOR_NOT
= 'not';
12 const OPERATOR_AND
= 'and';
13 const OPERATOR_SUBSTRING
= 'sub';
14 const OPERATOR_EXACT
= 'exact';
// ABSENT and PRESENT are only assigned to tokens that have a function and an
// empty value (see tokenizeQuery()).
15 const OPERATOR_ABSENT
= 'absent';
16 const OPERATOR_PRESENT
= 'present';
// Replaces the operator character table that getOperatorPrefix() and
// quoteToken() index into when rendering tokens.
// NOTE(review): the remainder of this method is not visible in this excerpt.
18 public function setOperators($operators) {
19 $this->operators
= $operators;
// Returns the current operator character table.
23 public function getOperators() {
24 return $this->operators
;
// Stores the stemmer that compileStemmedQuery() later hands to renderToken().
// NOTE(review): the remainder of this method is not visible in this excerpt.
27 public function setStemmer(PhutilSearchStemmer
$stemmer) {
28 $this->stemmer
= $stemmer;
// Returns the stemmer property assigned by setStemmer().
32 public function getStemmer() {
33 return $this->stemmer
;
// Sets the flag that tokenizeQuery() reads to decide whether function
// prefixes and the extended operator characters are recognized.
// NOTE(review): the remainder of this method is not visible in this excerpt.
36 public function setEnableFunctions($enable_functions) {
37 $this->enableFunctions
= $enable_functions;
// Returns the flag assigned by setEnableFunctions() (false by default).
41 public function getEnableFunctions() {
42 return $this->enableFunctions
;
// Renders every token without a stemmer and combines the rendered pieces
// through compileRenderedTokens().
//
// @param array $tokens  List of PhutilSearchQueryToken objects (asserted).
// @return Whatever compileRenderedTokens() returns for the rendered list.
45 public function compileQuery(array $tokens) {
46 assert_instances_of($tokens, 'PhutilSearchQueryToken');
// NOTE(review): the initialization of $result is not visible in this excerpt.
49 foreach ($tokens as $token) {
50 $result[] = $this->renderToken($token);
53 return $this->compileRenderedTokens($result);
// Renders tokens without a stemmer and combines them through
// compileRenderedTokens(), treating unquoted tokens specially.
//
// @param array $tokens  List of PhutilSearchQueryToken objects (asserted).
// @return Whatever compileRenderedTokens() returns for the rendered list.
56 public function compileLiteralQuery(array $tokens) {
57 assert_instances_of($tokens, 'PhutilSearchQueryToken');
60 foreach ($tokens as $token) {
// NOTE(review): the body of this branch is not visible in this excerpt;
// presumably unquoted tokens are skipped so only quoted ones are rendered.
// Confirm against the full file.
61 if (!$token->isQuoted()) {
64 $result[] = $this->renderToken($token);
67 return $this->compileRenderedTokens($result);
// Renders tokens with the configured stemmer and combines them through
// compileRenderedTokens(), treating quoted tokens specially.
//
// @param array $tokens  List of PhutilSearchQueryToken objects (asserted).
// @return Whatever compileRenderedTokens() returns for the rendered list.
70 public function compileStemmedQuery(array $tokens) {
71 assert_instances_of($tokens, 'PhutilSearchQueryToken');
74 foreach ($tokens as $token) {
// NOTE(review): the body of this branch is not visible in this excerpt;
// presumably quoted tokens are skipped so only unquoted ones are stemmed.
// Confirm against the full file.
75 if ($token->isQuoted()) {
// The stemmer from getStemmer() is forwarded as the second argument.
78 $result[] = $this->renderToken($token, $this->getStemmer());
81 return $this->compileRenderedTokens($result);
// Collapses duplicate rendered tokens and joins the remainder with single
// spaces.
// NOTE(review): several original lines between the signature and the
// array_unique() call are not visible in this excerpt, so any handling of an
// empty list cannot be confirmed from here.
84 private function compileRenderedTokens(array $list) {
89 $list = array_unique($list);
90 return implode(' ', $list);
// Tokenizes a raw query string with tokenizeQuery() and converts each
// resulting dictionary into a PhutilSearchQueryToken.
// NOTE(review): the initialization and return of $tokens are not visible in
// this excerpt; presumably the collected list is returned.
93 public function newTokens($query) {
94 $results = $this->tokenizeQuery($query);
97 foreach ($results as $result) {
98 $tokens[] = PhutilSearchQueryToken
::newFromDictionary($result);
// Splits a raw query string into result dictionaries. The visible code fills
// the keys operator, quoted and raw, plus function when functions are
// enabled.
//
// Throws PhutilSearchQueryCompilerSyntaxException when the query is longer
// than 1024 bytes, contains unmatched double quotes, uses an invalid operator
// sequence, contains a token with no search term, or requires one function
// to be both absent and present.
//
// NOTE(review): many lines of this method (mode transitions, else branches,
// the final return) are not visible in this excerpt. The comments below only
// describe what the visible lines show.
104 private function tokenizeQuery($query) {
// Upper bound on the raw query size, measured in bytes by strlen().
105 $maximum_bytes = 1024;
107 if ($query === null) {
110 $query_bytes = strlen($query);
111 if ($query_bytes > $maximum_bytes) {
112 throw new PhutilSearchQueryCompilerSyntaxException(
114 'Query is too long (%s bytes, maximum is %s bytes).',
115 new PhutilNumber($query_bytes),
116 new PhutilNumber($maximum_bytes)));
// From here on $query is indexed element by element; phutil_utf8v()
// presumably returns a list of UTF-8 characters (confirm against its docs).
119 $query = phutil_utf8v($query);
120 $length = count($query);
122 $enable_functions = $this->getEnableFunctions();
125 $current_operator = array();
126 $current_token = array();
127 $current_function = null;
// With functions enabled, tilde and equals are accepted as operator
// characters in addition to plus and minus.
131 if ($enable_functions) {
132 $operator_characters = '[~=+-]';
134 $operator_characters = '[+-]';
// Character-by-character state machine. The modes compared below are scan,
// function, operator, quote and token.
137 for ($ii = 0; $ii < $length; $ii++
) {
138 $character = $query[$ii];
140 if ($mode == 'scan') {
141 if (preg_match('/^\s\z/u', $character)) {
148 if ($mode == 'function') {
151 if ($enable_functions) {
// Look ahead from the current position for a run of ASCII letters or
// hyphens that ends in a colon; that run becomes the function name.
153 for ($jj = $ii; $jj < $length; $jj++
) {
154 if (preg_match('/^[a-zA-Z-]\z/u', $query[$jj])) {
157 if ($query[$jj] == ':') {
163 if ($found !== false) {
164 $function = array_slice($query, $ii, ($jj - $ii));
165 $current_function = implode('', $function);
167 if (!strlen($current_function)) {
168 $current_function = null;
177 if ($mode == 'operator') {
178 if (!$current_operator) {
179 if (preg_match('/^\s\z/u', $character)) {
184 if (preg_match('/^'.$operator_characters.'\z/', $character)) {
185 $current_operator[] = $character;
192 if ($mode == 'quote') {
193 if (preg_match('/^"\z/', $character)) {
202 if ($mode == 'token') {
204 $was_quoted = $is_quoted;
206 if (preg_match('/^"\z/', $character)) {
212 if (preg_match('/^\s\z/u', $character)) {
217 if (preg_match('/^"\z/', $character)) {
226 'operator' => $current_operator,
227 'quoted' => $was_quoted,
228 'value' => $current_token,
231 if ($enable_functions) {
232 $token['function'] = $current_function;
237 $current_operator = array();
238 $current_token = array();
239 $current_function = null;
242 $current_token[] = $character;
248 throw new PhutilSearchQueryCompilerSyntaxException(
250 'Query contains unmatched double quotes.'));
253 // If the input query has trailing space, like "a b ", we may exit the
254 // parser without a final token.
255 if ($current_function !== null ||
$current_operator ||
$current_token) {
257 'operator' => $current_operator,
259 'value' => $current_token,
262 if ($enable_functions) {
263 $token['function'] = $current_function;
// Second pass: turn each raw token into a result dictionary, mapping the
// raw operator characters to an OPERATOR_* constant and validating it.
270 $last_function = null;
271 foreach ($tokens as $token) {
272 $value = implode('', $token['value']);
273 $operator_string = implode('', $token['operator']);
274 $is_quoted = $token['quoted'];
276 switch ($operator_string) {
278 $operator = self
::OPERATOR_NOT
;
281 $operator = self
::OPERATOR_SUBSTRING
;
284 $operator = self
::OPERATOR_EXACT
;
287 $operator = self
::OPERATOR_AND
;
290 $use_substring = false;
292 if ($enable_functions && !$is_quoted) {
293 // See T12995. If this query term contains Chinese, Japanese or
294 // Korean characters, treat the term as a substring term by default.
295 // These languages do not separate words with spaces, so the term
296 // search mode is normally useless.
297 if (phutil_utf8_is_cjk($value)) {
298 $use_substring = true;
299 } else if (phutil_preg_match('/^_/', $value)) {
300 // See T13632. Assume users searching for any term that begins
301 // with an underscore intend to perform substring search if they
302 // don't provide an explicit search function.
303 $use_substring = true;
307 if ($use_substring) {
308 $operator = self
::OPERATOR_SUBSTRING
;
310 $operator = self
::OPERATOR_AND
;
314 throw new PhutilSearchQueryCompilerSyntaxException(
316 'Query has an invalid sequence of operators ("%s").',
// Empty values: in the visible branches, NOT with a function becomes ABSENT
// and SUBSTRING with a function becomes PRESENT; the other visible
// assignments set $require_value, which leads to the exception below.
320 if (!strlen($value)) {
321 $require_value = $is_quoted;
324 case self
::OPERATOR_NOT
:
325 if ($enable_functions && ($token['function'] !== null)) {
326 $operator = self
::OPERATOR_ABSENT
;
329 $require_value = true;
332 case self
::OPERATOR_SUBSTRING
:
333 if ($enable_functions && ($token['function'] !== null)) {
334 $operator = self
::OPERATOR_PRESENT
;
337 $require_value = true;
341 $require_value = true;
345 if ($require_value) {
346 throw new PhutilSearchQueryCompilerSyntaxException(
348 'Query contains a token ("%s") with no search term. Query '.
349 'tokens specify text to search for.',
350 $this->getDisplayToken($token)));
355 'operator' => $operator,
356 'quoted' => $is_quoted,
358 'raw' => $this->getDisplayToken($token),
361 if ($enable_functions) {
362 // If a user provides a query like "title:a b c", we interpret all
363 // of the terms to be title terms: the "title:" function sticks
364 // until we encounter another function.
366 // If a user provides a query like "title:"a"" (with a quoted term),
367 // the function is not sticky.
369 if ($token['function'] !== null) {
370 $function = $token['function'];
372 $function = $last_function;
375 $result['function'] = $function;
377 // Note that the function remains sticky across quoted terms appearing
378 // after the function term. For example, all of these terms are title
383 $is_sticky = (!$result['quoted'] ||
($token['function'] === null));
386 case self
::OPERATOR_ABSENT
:
387 case self
::OPERATOR_PRESENT
:
393 $last_function = $function;
395 $last_function = null;
399 $results[] = $result;
402 if ($enable_functions) {
403 // If any function is required to be "absent", there must be no other
404 // terms which make assertions about it.
406 $present_tokens = array();
407 $absent_tokens = array();
408 foreach ($results as $result) {
409 $function = $result['function'];
411 if ($result['operator'] === self
::OPERATOR_ABSENT
) {
412 $absent_tokens[$function][] = $result;
414 $present_tokens[$function][] = $result;
418 foreach ($absent_tokens as $function => $tokens) {
419 $absent_token = head($tokens);
421 if (empty($present_tokens[$function])) {
425 $present_token = head($present_tokens[$function]);
427 throw new PhutilSearchQueryCompilerSyntaxException(
429 'Query field must be absent ("%s") and present ("%s"). This '.
430 'is impossible, so the query is not valid.',
431 $absent_token['raw'],
432 $present_token['raw']));
// Renders a single token as a string: the token value is passed through the
// stemmer, quoted with quoteToken(), and prefixed with the character that
// getOperatorPrefix() returns for the token operator.
//
// The stemmer parameter defaults to null.
// NOTE(review): the guard around the stemToken() call and the return
// statement are not visible in this excerpt.
439 private function renderToken(
440 PhutilSearchQueryToken
$token,
441 PhutilSearchStemmer
$stemmer = null) {
442 $value = $token->getValue();
445 $value = $stemmer->stemToken($value);
448 $value = $this->quoteToken($value);
449 $operator = $token->getOperator();
450 $prefix = $this->getOperatorPrefix($operator);
452 $value = $prefix.$value;
// Maps an operator constant to its prefix character in the operator table:
// OPERATOR_AND uses index 0 and OPERATOR_NOT uses index 2. The visible
// throw reports an unsupported operator via
// PhutilSearchQueryCompilerSyntaxException.
// NOTE(review): the switch header, the default label and the return
// statement are not visible in this excerpt.
457 private function getOperatorPrefix($operator) {
458 $operators = $this->operators
;
461 case self
::OPERATOR_AND
:
462 $prefix = $operators[0];
464 case self
::OPERATOR_NOT
:
465 $prefix = $operators[2];
468 throw new PhutilSearchQueryCompilerSyntaxException(
470 'Unsupported operator prefix "%s".',
// A prefix consisting of a single space is special-cased; what it is
// replaced with is not visible in this excerpt.
474 if ($prefix == ' ') {
// Wraps a value in the opening and closing quote characters found at
// indexes 10 and 11 of the operator table.
481 private function quoteToken($value) {
// NOTE(review): this local is not read by any visible statement below; the
// quote characters are taken from the property directly.
482 $operators = $this->operators
;
484 $open_quote = $this->operators
[10];
485 $close_quote = $this->operators
[11];
487 return $open_quote.$value.$close_quote;
// Rebuilds a display string for a raw token dictionary by concatenating the
// function prefix (function name plus a colon), the operator characters and
// the value. tokenizeQuery() uses the result in exception messages and as
// the raw key of each result.
// NOTE(review): the fallback for tokens without a function and the guard
// around the quoteToken() call are not visible in this excerpt.
490 private function getDisplayToken(array $token) {
491 if (isset($token['function'])) {
492 $function = $token['function'].':';
497 $operator_string = implode('', $token['operator']);
499 $value = implode('', $token['value']);
501 $is_quoted = $token['quoted'];
503 $value = $this->quoteToken($value);
506 return sprintf('%s%s%s', $function, $operator_string, $value);