Correct a parameter order swap in "diffusion.historyquery" for Mercurial
[phabricator.git] / src / applications / search / compiler / PhutilSearchQueryCompiler.php
blobc3f93e16c9de0ce40161751e6cce8b244d6f85d6
1 <?php
/**
 * Tokenizes user-entered search queries and compiles the resulting tokens
 * into a single query string built from a configurable operator alphabet.
 */
final class PhutilSearchQueryCompiler
  extends Phobject {

  // Operator alphabet used when rendering tokens. Positions are significant:
  // index 0 is the prefix for "and" terms, index 2 is the prefix for "not"
  // terms, and indexes 10 and 11 are the opening and closing quote
  // characters. A space in a prefix position means "emit no prefix".
  // NOTE(review): this layout looks like MySQL's "ft_boolean_syntax" value;
  // confirm before relying on the other positions.
  private $operators = '+ -><()~*:""&|';

  // NOTE(review): no method visible in this listing reads or writes this
  // property.
  private $query;

  // Optional stemmer applied to unquoted terms by compileStemmedQuery().
  private $stemmer;

  // When true, the tokenizer recognizes "function:" prefixes and the
  // additional "~" and "=" operators.
  private $enableFunctions = false;

  // Operators which may appear on tokens produced by the tokenizer.
  const OPERATOR_NOT = 'not';
  const OPERATOR_AND = 'and';
  const OPERATOR_SUBSTRING = 'sub';
  const OPERATOR_EXACT = 'exact';
  const OPERATOR_ABSENT = 'absent';
  const OPERATOR_PRESENT = 'present';

/**
 * Replace the operator alphabet used to render compiled tokens.
 *
 * The renderer reads positions 0 and 2 as the "and" and "not" prefixes and
 * positions 10 and 11 as the quote characters, so the replacement must
 * provide at least those offsets.
 *
 * @param string Operator alphabet.
 * @return this
 */
public function setOperators($operators) {
  $this->operators = $operators;
  return $this;
}

/**
 * Get the operator alphabet currently used to render compiled tokens.
 *
 * @return string Operator alphabet.
 */
public function getOperators() {
  return $this->operators;
}

/**
 * Set the stemmer which compileStemmedQuery() applies to unquoted terms.
 *
 * @param PhutilSearchStemmer Stemmer to use.
 * @return this
 */
public function setStemmer(PhutilSearchStemmer $stemmer) {
  $this->stemmer = $stemmer;
  return $this;
}

/**
 * Get the configured stemmer.
 *
 * @return PhutilSearchStemmer|null Stemmer, or null if none has been set.
 */
public function getStemmer() {
  return $this->stemmer;
}

/**
 * Enable or disable function syntax in the tokenizer.
 *
 * With functions enabled, the tokenizer recognizes "name:" prefixes, accepts
 * the "~" and "=" operators in addition to "+" and "-", and attaches a
 * "function" key to every token it produces.
 *
 * @param bool True to enable function syntax.
 * @return this
 */
public function setEnableFunctions($enable_functions) {
  $this->enableFunctions = $enable_functions;
  return $this;
}

/**
 * Test whether function syntax is enabled in the tokenizer.
 *
 * @return bool True if function syntax is enabled.
 */
public function getEnableFunctions() {
  return $this->enableFunctions;
}

/**
 * Compile every token into a single query string, without stemming.
 *
 * @param list<PhutilSearchQueryToken> Tokens to compile.
 * @return string|null Compiled query, or null if there are no tokens.
 */
public function compileQuery(array $tokens) {
  assert_instances_of($tokens, 'PhutilSearchQueryToken');

  $rendered = array();
  foreach ($tokens as $token) {
    $rendered[] = $this->renderToken($token);
  }

  return $this->compileRenderedTokens($rendered);
}

/**
 * Compile only the quoted tokens into a query string, without stemming.
 *
 * Unquoted tokens are skipped entirely.
 *
 * @param list<PhutilSearchQueryToken> Tokens to compile.
 * @return string|null Compiled query, or null if no token is quoted.
 */
public function compileLiteralQuery(array $tokens) {
  assert_instances_of($tokens, 'PhutilSearchQueryToken');

  $rendered = array();
  foreach ($tokens as $token) {
    if ($token->isQuoted()) {
      $rendered[] = $this->renderToken($token);
    }
  }

  return $this->compileRenderedTokens($rendered);
}

/**
 * Compile only the unquoted tokens into a query string, passing the
 * configured stemmer to the renderer.
 *
 * Quoted tokens are skipped entirely. If no stemmer has been set, the
 * unquoted tokens are rendered unstemmed.
 *
 * @param list<PhutilSearchQueryToken> Tokens to compile.
 * @return string|null Compiled query, or null if every token is quoted.
 */
public function compileStemmedQuery(array $tokens) {
  assert_instances_of($tokens, 'PhutilSearchQueryToken');

  $stemmer = $this->getStemmer();

  $rendered = array();
  foreach ($tokens as $token) {
    if (!$token->isQuoted()) {
      $rendered[] = $this->renderToken($token, $stemmer);
    }
  }

  return $this->compileRenderedTokens($rendered);
}

/**
 * Join rendered tokens into one query string, dropping duplicates.
 *
 * @param list<string> Rendered tokens.
 * @return string|null Space-separated query, or null for an empty list.
 */
private function compileRenderedTokens(array $list) {
  if (!$list) {
    return null;
  }

  // Rendering the same term twice adds nothing to the query.
  return implode(' ', array_unique($list));
}

/**
 * Parse a raw query string into token objects.
 *
 * @param string Raw user query.
 * @return list<PhutilSearchQueryToken> One token per parsed term.
 * @throws PhutilSearchQueryCompilerSyntaxException If the query is invalid.
 */
public function newTokens($query) {
  $tokens = array();
  foreach ($this->tokenizeQuery($query) as $spec) {
    $tokens[] = PhutilSearchQueryToken::newFromDictionary($spec);
  }

  return $tokens;
}

/**
 * Split a raw query string into a list of token dictionaries.
 *
 * Parsing happens in three passes:
 *
 *   1. A character-level state machine ("scan" > "function" > "operator" >
 *      "quote" > "token") cuts the query into raw tokens.
 *   2. Each raw token's operator string is resolved to an OPERATOR_*
 *      constant, empty values are validated, and (with functions enabled)
 *      the effective function of each term is determined.
 *   3. With functions enabled, queries which require a function to be both
 *      absent and present are rejected.
 *
 * Each returned dictionary has the keys "operator", "quoted", "value" and
 * "raw", plus "function" when functions are enabled.
 *
 * @param string Raw user query.
 * @return list<map<string, wild>> Token dictionaries, in query order.
 * @throws PhutilSearchQueryCompilerSyntaxException If the query is longer
 *   than 1024 bytes, has unmatched quotes, uses an invalid operator
 *   sequence, has a term with no value, or makes contradictory assertions
 *   about a function.
 */
private function tokenizeQuery($query) {
  $maximum_bytes = 1024;

  // The limit is measured in bytes, not characters.
  $query_bytes = strlen($query);
  if ($query_bytes > $maximum_bytes) {
    throw new PhutilSearchQueryCompilerSyntaxException(
      pht(
        'Query is too long (%s bytes, maximum is %s bytes).',
        new PhutilNumber($query_bytes),
        new PhutilNumber($maximum_bytes)));
  }

  // From here on, "$query" is a list of characters rather than a string.
  $query = phutil_utf8v($query);
  $length = count($query);

  $enable_functions = $this->getEnableFunctions();

  $mode = 'scan';
  $current_operator = array();
  $current_token = array();
  $current_function = null;
  $is_quoted = false;
  $tokens = array();

  if ($enable_functions) {
    $operator_characters = '[~=+-]';
  } else {
    $operator_characters = '[+-]';
  }

  // This pattern does not change while scanning, so build it once.
  $operator_pattern = '/^'.$operator_characters.'\z/';

  for ($ii = 0; $ii < $length; $ii++) {
    $character = $query[$ii];

    // The mode checks below deliberately fall through: one character may
    // advance the state machine through several modes before it is consumed.

    if ($mode === 'scan') {
      // Skip whitespace between tokens.
      if (preg_match('/^\s\z/u', $character)) {
        continue;
      }

      $mode = 'function';
    }

    if ($mode === 'function') {
      $mode = 'operator';

      if ($enable_functions) {
        // Look ahead for a run of function-name characters which is
        // terminated by a colon.
        $found = false;
        for ($jj = $ii; $jj < $length; $jj++) {
          if (preg_match('/^[a-zA-Z-]\z/u', $query[$jj])) {
            continue;
          }
          if ($query[$jj] === ':') {
            $found = $jj;
          }
          break;
        }

        if ($found !== false) {
          $function = array_slice($query, $ii, ($jj - $ii));
          $current_function = implode('', $function);

          // A bare ":" names no function.
          if (!strlen($current_function)) {
            $current_function = null;
          }

          // Resume scanning after the colon.
          $ii = $jj;
          continue;
        }
      }
    }

    if ($mode === 'operator') {
      // Whitespace is skipped only before the first operator character.
      if (!$current_operator) {
        if (preg_match('/^\s\z/u', $character)) {
          continue;
        }
      }

      if (preg_match($operator_pattern, $character)) {
        $current_operator[] = $character;
        continue;
      }

      $mode = 'quote';
    }

    if ($mode === 'quote') {
      if (preg_match('/^"\z/', $character)) {
        $is_quoted = true;
        $mode = 'token';
        continue;
      }

      $mode = 'token';
    }

    if ($mode === 'token') {
      $capture = false;
      $was_quoted = $is_quoted;
      if ($is_quoted) {
        // Inside quotes, only the closing quote ends the token.
        if (preg_match('/^"\z/', $character)) {
          $capture = true;
          $mode = 'scan';
          $is_quoted = false;
        }
      } else {
        // Outside quotes, whitespace ends the token.
        if (preg_match('/^\s\z/u', $character)) {
          $capture = true;
          $mode = 'scan';
        }

        // A quote in the middle of an unquoted token ends that token and
        // immediately begins a new, quoted one.
        if (preg_match('/^"\z/', $character)) {
          $capture = true;
          $mode = 'token';
          $is_quoted = true;
        }
      }

      if ($capture) {
        $token = array(
          'operator' => $current_operator,
          'quoted' => $was_quoted,
          'value' => $current_token,
        );

        if ($enable_functions) {
          $token['function'] = $current_function;
        }

        $tokens[] = $token;

        $current_operator = array();
        $current_token = array();
        $current_function = null;
        continue;
      } else {
        $current_token[] = $character;
      }
    }
  }

  if ($is_quoted) {
    throw new PhutilSearchQueryCompilerSyntaxException(
      pht(
        'Query contains unmatched double quotes.'));
  }

  // If the input query has trailing space, like "a b ", we may exit the
  // parser without a final token.
  if ($current_function !== null || $current_operator || $current_token) {
    $token = array(
      'operator' => $current_operator,
      'quoted' => false,
      'value' => $current_token,
    );

    if ($enable_functions) {
      $token['function'] = $current_function;
    }

    $tokens[] = $token;
  }

  $results = array();
  $last_function = null;
  foreach ($tokens as $token) {
    $value = implode('', $token['value']);
    $operator_string = implode('', $token['operator']);
    $is_quoted = $token['quoted'];

    switch ($operator_string) {
      case '-':
        $operator = self::OPERATOR_NOT;
        break;
      case '~':
        $operator = self::OPERATOR_SUBSTRING;
        break;
      case '=':
        $operator = self::OPERATOR_EXACT;
        break;
      case '+':
        $operator = self::OPERATOR_AND;
        break;
      case '':
        $use_substring = false;

        if ($enable_functions && !$is_quoted) {
          // See T12995. If this query term contains Chinese, Japanese or
          // Korean characters, treat the term as a substring term by default.
          // These languages do not separate words with spaces, so the term
          // search mode is normally useless.
          if (phutil_utf8_is_cjk($value)) {
            $use_substring = true;
          } else if (phutil_preg_match('/^_/', $value)) {
            // See T13632. Assume users searching for any term that begins
            // with an underscore intend to perform substring search if they
            // don't provide an explicit search function.
            $use_substring = true;
          }
        }

        if ($use_substring) {
          $operator = self::OPERATOR_SUBSTRING;
        } else {
          $operator = self::OPERATOR_AND;
        }
        break;
      default:
        // More than one operator character was given, like "+-".
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Query has an invalid sequence of operators ("%s").',
            $operator_string));
    }

    if (!strlen($value)) {
      // An empty value is only meaningful for an unquoted "field:-" (field
      // is absent) or "field:~" (field is present) term.
      $require_value = $is_quoted;

      switch ($operator) {
        case self::OPERATOR_NOT:
          if ($enable_functions && ($token['function'] !== null)) {
            $operator = self::OPERATOR_ABSENT;
            $value = null;
          } else {
            $require_value = true;
          }
          break;
        case self::OPERATOR_SUBSTRING:
          if ($enable_functions && ($token['function'] !== null)) {
            $operator = self::OPERATOR_PRESENT;
            $value = null;
          } else {
            $require_value = true;
          }
          break;
        default:
          $require_value = true;
          break;
      }

      if ($require_value) {
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Query contains a token ("%s") with no search term. Query '.
            'tokens specify text to search for.',
            $this->getDisplayToken($token)));
      }
    }

    $result = array(
      'operator' => $operator,
      'quoted' => $is_quoted,
      'value' => $value,
      'raw' => $this->getDisplayToken($token),
    );

    if ($enable_functions) {
      // If a user provides a query like "title:a b c", we interpret all
      // of the terms to be title terms: the "title:" function sticks
      // until we encounter another function.

      // If a user provides a query like "title:"a"" (with a quoted term),
      // the function is not sticky.

      if ($token['function'] !== null) {
        $function = $token['function'];
      } else {
        $function = $last_function;
      }

      $result['function'] = $function;

      // Note that the function remains sticky across quoted terms appearing
      // after the function term. For example, all of these terms are title
      // terms:
      //
      //   title:a "b c" d

      $is_sticky = (!$result['quoted'] || ($token['function'] === null));

      // Absent and present assertions never carry their function forward.
      switch ($operator) {
        case self::OPERATOR_ABSENT:
        case self::OPERATOR_PRESENT:
          $is_sticky = false;
          break;
      }

      if ($is_sticky) {
        $last_function = $function;
      } else {
        $last_function = null;
      }
    }

    $results[] = $result;
  }

  if ($enable_functions) {
    // If any function is required to be "absent", there must be no other
    // terms which make assertions about it.

    $present_tokens = array();
    $absent_tokens = array();
    foreach ($results as $result) {
      $function = $result['function'];

      if ($result['operator'] === self::OPERATOR_ABSENT) {
        $absent_tokens[$function][] = $result;
      } else {
        $present_tokens[$function][] = $result;
      }
    }

    foreach ($absent_tokens as $function => $tokens) {
      $absent_token = head($tokens);

      if (empty($present_tokens[$function])) {
        continue;
      }

      $present_token = head($present_tokens[$function]);

      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Query field must be absent ("%s") and present ("%s"). This '.
          'is impossible, so the query is not valid.',
          $absent_token['raw'],
          $present_token['raw']));
    }
  }

  return $results;
}

/**
 * Render one token as an operator prefix followed by its quoted value.
 *
 * @param PhutilSearchQueryToken Token to render.
 * @param PhutilSearchStemmer|null Optional stemmer; when given, the token
 *   value is stemmed before it is quoted.
 * @return string Rendered token.
 * @throws PhutilSearchQueryCompilerSyntaxException If the token's operator
 *   has no prefix in the operator alphabet.
 */
private function renderToken(
  PhutilSearchQueryToken $token,
  PhutilSearchStemmer $stemmer = null) {

  $value = $token->getValue();
  if ($stemmer) {
    $value = $stemmer->stemToken($value);
  }

  // Quote before resolving the prefix, matching the original call order.
  $quoted = $this->quoteToken($value);
  $prefix = $this->getOperatorPrefix($token->getOperator());

  return $prefix.$quoted;
}

/**
 * Look up the prefix character for an operator in the operator alphabet.
 *
 * Only OPERATOR_AND (position 0) and OPERATOR_NOT (position 2) can be
 * rendered.
 *
 * @param string One of the OPERATOR_* constants.
 * @return string|null Prefix character, or null if the alphabet holds a
 *   space in that position.
 * @throws PhutilSearchQueryCompilerSyntaxException For any other operator.
 */
private function getOperatorPrefix($operator) {
  $operators = $this->operators;

  switch ($operator) {
    case self::OPERATOR_AND:
      $prefix = $operators[0];
      break;
    case self::OPERATOR_NOT:
      $prefix = $operators[2];
      break;
    default:
      throw new PhutilSearchQueryCompilerSyntaxException(
        pht(
          'Unsupported operator prefix "%s".',
          $operator));
  }

  // A space in the alphabet means "this operator has no prefix".
  if ($prefix == ' ') {
    $prefix = null;
  }

  return $prefix;
}

/**
 * Wrap a value in the quote characters from the operator alphabet.
 *
 * The value is not escaped: quote characters inside it are emitted as-is.
 *
 * @param string Value to quote.
 * @return string Quoted value.
 */
private function quoteToken($value) {
  $operators = $this->operators;

  // Positions 10 and 11 hold the opening and closing quote characters.
  $open_quote = $operators[10];
  $close_quote = $operators[11];

  return $open_quote.$value.$close_quote;
}

/**
 * Rebuild a human-readable form of a raw token for use in error messages
 * and in the "raw" key of parsed tokens.
 *
 * @param map<string, wild> Raw token with "operator" and "value" character
 *   lists, a "quoted" flag, and an optional "function" name.
 * @return string Function prefix, operator characters and value, in order.
 */
private function getDisplayToken(array $token) {
  // The "function" key is missing when functions are disabled, and null when
  // the term named no function; isset() treats both the same way.
  if (isset($token['function'])) {
    $function = $token['function'].':';
  } else {
    $function = '';
  }

  $operator_string = implode('', $token['operator']);

  $value = implode('', $token['value']);
  if ($token['quoted']) {
    $value = $this->quoteToken($value);
  }

  return sprintf('%s%s%s', $function, $operator_string, $value);
}