// Tokenizes raw search queries (newTokens()) and compiles the resulting
// tokens into query strings (compileQuery(), compileLiteralQuery(),
// compileStemmedQuery()).
3 final class PhutilSearchQueryCompiler
// Operator characters, addressed by offset. getOperatorPrefix() reads
// offset 0 for OPERATOR_AND and offset 2 for OPERATOR_NOT; quoteToken()
// reads offsets 10 and 11 as the opening and closing quote characters.
6 private $operators = '+ -><()~*:""&|';
// When true, tokenizeQuery() also accepts "~" and "=" as operator
// characters, scans for "name:" function prefixes, and records a 'function'
// key on each token.
9 private $enableFunctions = false;
// Operator names stored under the 'operator' key of parsed tokens.
11 const OPERATOR_NOT
= 'not';
12 const OPERATOR_AND
= 'and';
13 const OPERATOR_SUBSTRING
= 'sub';
14 const OPERATOR_EXACT
= 'exact';
// OPERATOR_ABSENT and OPERATOR_PRESENT are assigned by tokenizeQuery() in
// place of OPERATOR_NOT and OPERATOR_SUBSTRING respectively, for a token
// that has an empty value and a non-null function while functions are
// enabled.
15 const OPERATOR_ABSENT
= 'absent';
16 const OPERATOR_PRESENT
= 'present';
/**
 * Replace the operator character string that getOperatorPrefix() and
 * quoteToken() index into.
 *
 * @param $operators Operator characters; stored as given.
 *
 * NOTE(review): the return value is not established by the assignment
 * below -- confirm whether callers chain on this setter.
 */
18 public function setOperators($operators) {
19 $this->operators
= $operators;
/**
 * Get the operator character string currently in effect.
 *
 * @return mixed The value last stored by setOperators(), or the default
 *   operator string if it was never replaced.
 */
public function getOperators() {
  return $this->operators;
}
/**
 * Store the stemmer that compileStemmedQuery() passes to renderToken().
 *
 * @param PhutilSearchStemmer $stemmer Stemmer to use for stemmed queries.
 *
 * NOTE(review): the return value is not established by the assignment
 * below -- confirm whether callers chain on this setter.
 */
27 public function setStemmer(PhutilSearchStemmer
$stemmer) {
28 $this->stemmer
= $stemmer;
/**
 * Get the stemmer stored by setStemmer().
 *
 * NOTE(review): the value returned before any setStemmer() call depends on
 * how the property is declared -- confirm.
 *
 * @return mixed The stored stemmer.
 */
public function getStemmer() {
  return $this->stemmer;
}
/**
 * Turn function-aware tokenizing on or off. tokenizeQuery() reads this flag
 * through getEnableFunctions().
 *
 * @param $enable_functions Flag value; stored as given.
 *
 * NOTE(review): the return value is not established by the assignment
 * below -- confirm whether callers chain on this setter.
 */
36 public function setEnableFunctions($enable_functions) {
37 $this->enableFunctions
= $enable_functions;
/**
 * Report whether function-aware tokenizing is enabled.
 *
 * @return mixed The value last stored by setEnableFunctions(); false by
 *   default.
 */
public function getEnableFunctions() {
  return $this->enableFunctions;
}
/**
 * Compile every token into a single query string.
 *
 * Each token is rendered with renderToken() (no stemmer) and the rendered
 * pieces are combined by compileRenderedTokens().
 *
 * @param array $tokens List of PhutilSearchQueryToken objects.
 * @return mixed Whatever compileRenderedTokens() returns for the rendered
 *   tokens.
 */
45 public function compileQuery(array $tokens) {
// Enforce that every element is a PhutilSearchQueryToken.
46 assert_instances_of($tokens, 'PhutilSearchQueryToken');
49 foreach ($tokens as $token) {
50 $result[] = $this->renderToken($token);
53 return $this->compileRenderedTokens($result);
/**
 * Compile a query string from a subset of the tokens, selected by their
 * isQuoted() state, rendered without a stemmer.
 *
 * NOTE(review): the isQuoted() guard below presumably skips unquoted
 * tokens, so only quoted tokens are rendered -- confirm.
 *
 * @param array $tokens List of PhutilSearchQueryToken objects.
 * @return mixed Whatever compileRenderedTokens() returns for the rendered
 *   tokens.
 */
56 public function compileLiteralQuery(array $tokens) {
// Enforce that every element is a PhutilSearchQueryToken.
57 assert_instances_of($tokens, 'PhutilSearchQueryToken');
60 foreach ($tokens as $token) {
61 if (!$token->isQuoted()) {
64 $result[] = $this->renderToken($token);
67 return $this->compileRenderedTokens($result);
/**
 * Compile a query string from a subset of the tokens, selected by their
 * isQuoted() state, rendering each one with the configured stemmer.
 *
 * NOTE(review): the isQuoted() guard below presumably skips quoted tokens,
 * so only unquoted tokens are stemmed and rendered -- confirm.
 *
 * @param array $tokens List of PhutilSearchQueryToken objects.
 * @return mixed Whatever compileRenderedTokens() returns for the rendered
 *   tokens.
 */
70 public function compileStemmedQuery(array $tokens) {
// Enforce that every element is a PhutilSearchQueryToken.
71 assert_instances_of($tokens, 'PhutilSearchQueryToken');
74 foreach ($tokens as $token) {
75 if ($token->isQuoted()) {
// The stemmer from getStemmer() is handed to renderToken() for these tokens.
78 $result[] = $this->renderToken($token, $this->getStemmer());
81 return $this->compileRenderedTokens($result);
/**
 * Combine rendered token strings into one query string.
 *
 * NOTE(review): behavior for an empty $list is not established by the two
 * statements below -- confirm what callers receive in that case.
 *
 * @param array $list Rendered token strings.
 * @return string The distinct rendered tokens joined by single spaces.
 */
84 private function compileRenderedTokens(array $list) {
// Drop duplicate rendered tokens before joining.
89 $list = array_unique($list);
90 return implode(' ', $list);
/**
 * Parse a raw query string and wrap each parsed token dictionary in a
 * PhutilSearchQueryToken.
 *
 * Syntax problems surface as exceptions raised inside tokenizeQuery().
 *
 * @param string $query Raw query text.
 *
 * NOTE(review): presumably returns the $tokens list built in the loop --
 * confirm.
 */
93 public function newTokens($query) {
94 $results = $this->tokenizeQuery($query);
// One token object per parsed dictionary, in parse order.
97 foreach ($results as $result) {
98 $tokens[] = PhutilSearchQueryToken
::newFromDictionary($result);
/**
 * Parse a raw query string into a list of token dictionaries.
 *
 * The work is done in three stages: a character-by-character state machine
 * splits the query into raw tokens; each raw token's operator string is
 * mapped to an OPERATOR_* constant and checked for a search term; and, when
 * functions are enabled, the results are checked so that no function is
 * required to be both absent and present.
 *
 * @param string $query Raw query text, at most 1024 bytes.
 * @throws PhutilSearchQueryCompilerSyntaxException If the query is too
 *   long, contains unmatched double quotes, has an invalid operator
 *   sequence, has a token with no search term, or requires a function to be
 *   both absent and present.
 *
 * NOTE(review): presumably returns $results, a list of dictionaries with at
 * least 'operator', 'quoted' and 'raw' keys, plus 'function' when functions
 * are enabled -- confirm.
 */
104 private function tokenizeQuery($query) {
// Reject oversized input up front. The limit is measured in bytes, before
// the query is split into characters.
105 $maximum_bytes = 1024;
107 $query_bytes = strlen($query);
108 if ($query_bytes > $maximum_bytes) {
109 throw new PhutilSearchQueryCompilerSyntaxException(
111 'Query is too long (%s bytes, maximum is %s bytes).',
112 new PhutilNumber($query_bytes),
113 new PhutilNumber($maximum_bytes)));
// From here on $query is an array (it is counted and indexed below);
// presumably one UTF-8 character per element -- confirm against
// phutil_utf8v().
116 $query = phutil_utf8v($query);
117 $length = count($query);
119 $enable_functions = $this->getEnableFunctions();
// Accumulators for the raw token currently being parsed.
122 $current_operator = array();
123 $current_token = array();
124 $current_function = null;
// "~" and "=" are only treated as operator characters when functions are
// enabled.
128 if ($enable_functions) {
129 $operator_characters = '[~=+-]';
131 $operator_characters = '[+-]';
// State machine over the characters. The modes compared below are 'scan',
// 'function', 'operator', 'quote' and 'token'.
134 for ($ii = 0; $ii < $length; $ii++
) {
135 $character = $query[$ii];
137 if ($mode == 'scan') {
138 if (preg_match('/^\s\z/u', $character)) {
145 if ($mode == 'function') {
148 if ($enable_functions) {
// Look ahead from the current position, testing each character against
// letters/hyphens and against ":" (presumably to find a "name:" prefix).
150 for ($jj = $ii; $jj < $length; $jj++
) {
151 if (preg_match('/^[a-zA-Z-]\z/u', $query[$jj])) {
154 if ($query[$jj] == ':') {
160 if ($found !== false) {
// The function name is the slice from the start position up to position
// $jj; an empty name is treated as no function.
161 $function = array_slice($query, $ii, ($jj - $ii));
162 $current_function = implode('', $function);
164 if (!strlen($current_function)) {
165 $current_function = null;
174 if ($mode == 'operator') {
175 if (!$current_operator) {
176 if (preg_match('/^\s\z/u', $character)) {
// Collect characters that match the operator character class.
181 if (preg_match('/^'.$operator_characters.'\z/', $character)) {
182 $current_operator[] = $character;
189 if ($mode == 'quote') {
190 if (preg_match('/^"\z/', $character)) {
199 if ($mode == 'token') {
// Remember the quoted state before this character is examined.
201 $was_quoted = $is_quoted;
203 if (preg_match('/^"\z/', $character)) {
209 if (preg_match('/^\s\z/u', $character)) {
214 if (preg_match('/^"\z/', $character)) {
// Raw token dictionary built from the accumulators, which are reset
// afterwards for the next token.
223 'operator' => $current_operator,
224 'quoted' => $was_quoted,
225 'value' => $current_token,
228 if ($enable_functions) {
229 $token['function'] = $current_function;
234 $current_operator = array();
235 $current_token = array();
236 $current_function = null;
239 $current_token[] = $character;
245 throw new PhutilSearchQueryCompilerSyntaxException(
247 'Query contains unmatched double quotes.'));
250 // If the input query has trailing space, like "a b ", we may exit the
251 // parser without a final token.
252 if ($current_function !== null ||
$current_operator ||
$current_token) {
254 'operator' => $current_operator,
256 'value' => $current_token,
259 if ($enable_functions) {
260 $token['function'] = $current_function;
// Second stage: normalize each raw token into a result dictionary.
267 $last_function = null;
268 foreach ($tokens as $token) {
269 $value = implode('', $token['value']);
270 $operator_string = implode('', $token['operator']);
271 $is_quoted = $token['quoted'];
// Map the raw operator string to an OPERATOR_* constant.
273 switch ($operator_string) {
275 $operator = self
::OPERATOR_NOT
;
278 $operator = self
::OPERATOR_SUBSTRING
;
281 $operator = self
::OPERATOR_EXACT
;
284 $operator = self
::OPERATOR_AND
;
287 $use_substring = false;
289 if ($enable_functions && !$is_quoted) {
290 // See T12995. If this query term contains Chinese, Japanese or
291 // Korean characters, treat the term as a substring term by default.
292 // These languages do not separate words with spaces, so the term
293 // search mode is normally useless.
294 if (phutil_utf8_is_cjk($value)) {
295 $use_substring = true;
296 } else if (phutil_preg_match('/^_/', $value)) {
297 // See T13632. Assume users searching for any term that begins
298 // with an underscore intend to perform substring search if they
299 // don't provide an explicit search function.
300 $use_substring = true;
304 if ($use_substring) {
305 $operator = self
::OPERATOR_SUBSTRING
;
307 $operator = self
::OPERATOR_AND
;
311 throw new PhutilSearchQueryCompilerSyntaxException(
313 'Query has an invalid sequence of operators ("%s").',
// A token with an empty value is only acceptable in specific cases;
// $require_value records whether this token must be rejected below.
317 if (!strlen($value)) {
318 $require_value = $is_quoted;
321 case self
::OPERATOR_NOT
:
322 if ($enable_functions && ($token['function'] !== null)) {
323 $operator = self
::OPERATOR_ABSENT
;
326 $require_value = true;
329 case self
::OPERATOR_SUBSTRING
:
330 if ($enable_functions && ($token['function'] !== null)) {
331 $operator = self
::OPERATOR_PRESENT
;
334 $require_value = true;
338 $require_value = true;
342 if ($require_value) {
343 throw new PhutilSearchQueryCompilerSyntaxException(
345 'Query contains a token ("%s") with no search term. Query '.
346 'tokens specify text to search for.',
347 $this->getDisplayToken($token)));
// Result dictionary for this token; 'raw' is the display form that is also
// used in error messages.
352 'operator' => $operator,
353 'quoted' => $is_quoted,
355 'raw' => $this->getDisplayToken($token),
358 if ($enable_functions) {
359 // If a user provides a query like "title:a b c", we interpret all
360 // of the terms to be title terms: the "title:" function sticks
361 // until we encounter another function.
363 // If a user provides a query like "title:"a"" (with a quoted term),
364 // the function is not sticky.
366 if ($token['function'] !== null) {
367 $function = $token['function'];
369 $function = $last_function;
372 $result['function'] = $function;
374 // Note that the function remains sticky across quoted terms appearing
375 // after the function term. For example, all of these terms are title
// Equivalently: a term is not sticky only when it is quoted AND names a
// function itself.
380 $is_sticky = (!$result['quoted'] ||
($token['function'] === null));
383 case self
::OPERATOR_ABSENT
:
384 case self
::OPERATOR_PRESENT
:
390 $last_function = $function;
392 $last_function = null;
396 $results[] = $result;
399 if ($enable_functions) {
400 // If any function is required to be "absent", there must be no other
401 // terms which make assertions about it.
// Group results by function: OPERATOR_ABSENT results in one map, all other
// results in the other.
403 $present_tokens = array();
404 $absent_tokens = array();
405 foreach ($results as $result) {
406 $function = $result['function'];
408 if ($result['operator'] === self
::OPERATOR_ABSENT
) {
409 $absent_tokens[$function][] = $result;
411 $present_tokens[$function][] = $result;
415 foreach ($absent_tokens as $function => $tokens) {
416 $absent_token = head($tokens);
418 if (empty($present_tokens[$function])) {
// The error reports the first conflicting token from each group.
422 $present_token = head($present_tokens[$function]);
424 throw new PhutilSearchQueryCompilerSyntaxException(
426 'Query field must be absent ("%s") and present ("%s"). This '.
427 'is impossible, so the query is not valid.',
428 $absent_token['raw'],
429 $present_token['raw']));
/**
 * Render one token as its operator prefix followed by its quoted value.
 *
 * @param PhutilSearchQueryToken $token Token to render.
 * @param PhutilSearchStemmer $stemmer Optional stemmer; when it is used,
 *   the token value is passed through stemToken() before quoting.
 *
 * NOTE(review): the stemToken() call is presumably guarded by a check that
 * $stemmer was provided, and the method presumably returns $value --
 * confirm both.
 * NOTE(review): an implicitly nullable typed parameter (a class type with a
 * null default) is deprecated as of PHP 8.4.
 */
436 private function renderToken(
437 PhutilSearchQueryToken
$token,
438 PhutilSearchStemmer
$stemmer = null) {
439 $value = $token->getValue();
442 $value = $stemmer->stemToken($value);
// Quote first, then prepend the prefix so the prefix sits outside the quotes.
445 $value = $this->quoteToken($value);
446 $operator = $token->getOperator();
447 $prefix = $this->getOperatorPrefix($operator);
449 $value = $prefix.$value;
/**
 * Look up the prefix character for an operator in the operator string.
 *
 * OPERATOR_AND maps to offset 0 and OPERATOR_NOT maps to offset 2 of the
 * operator string.
 *
 * @param string $operator One of the OPERATOR_* constants.
 * @throws PhutilSearchQueryCompilerSyntaxException For an operator with no
 *   prefix mapping (presumably every operator other than the two cases
 *   below -- confirm).
 *
 * NOTE(review): presumably returns $prefix -- confirm.
 */
454 private function getOperatorPrefix($operator) {
455 $operators = $this->operators
;
458 case self
::OPERATOR_AND
:
459 $prefix = $operators[0];
461 case self
::OPERATOR_NOT
:
462 $prefix = $operators[2];
465 throw new PhutilSearchQueryCompilerSyntaxException(
467 'Unsupported operator prefix "%s".',
// A prefix of a single space is special-cased.
// NOTE(review): confirm what the space is replaced with.
471 if ($prefix == ' ') {
/**
 * Wrap a token value in the configured quote characters.
 *
 * The opening and closing quotes are read from offsets 10 and 11 of the
 * operator string; both are `"` in the default operator string.
 *
 * @param string $value Token text to quote. It is not escaped here.
 * @return string The value with the opening quote prepended and the closing
 *   quote appended.
 */
private function quoteToken($value) {
  // Read the operator string once and index that single local copy.
  $operators = $this->operators;

  $open_quote = $operators[10];
  $close_quote = $operators[11];

  return $open_quote.$value.$close_quote;
}
/**
 * Build the display form of a raw token dictionary: an optional "function:"
 * prefix, then the operator characters, then the value.
 *
 * tokenizeQuery() uses the result in error messages and as the 'raw' entry
 * of each parsed token.
 *
 * @param array $token Raw token dictionary with 'operator' and 'value'
 *   lists, a 'quoted' flag, and optionally a 'function' name.
 * @return string The concatenated display string.
 *
 * NOTE(review): $function presumably falls back to an empty prefix when no
 * function is set, and quoteToken() is presumably applied only when
 * $is_quoted is true -- confirm both.
 */
487 private function getDisplayToken(array $token) {
488 if (isset($token['function'])) {
489 $function = $token['function'].':';
// 'operator' and 'value' are lists of characters; join each into a string.
494 $operator_string = implode('', $token['operator']);
496 $value = implode('', $token['value']);
498 $is_quoted = $token['quoted'];
500 $value = $this->quoteToken($value);
503 return sprintf('%s%s%s', $function, $operator_string, $value);