Correct Aphlict websocket URI construction after PHP8 compatibility changes
[phabricator.git] / src / applications / search / compiler / PhutilSearchQueryCompiler.php
blob79e246c1177b20fd21246a16a8f2ff5b68411da0
1 <?php
3 final class PhutilSearchQueryCompiler
4 extends Phobject {
6 private $operators = '+ -><()~*:""&|';
7 private $query;
8 private $stemmer;
9 private $enableFunctions = false;
11 const OPERATOR_NOT = 'not';
12 const OPERATOR_AND = 'and';
13 const OPERATOR_SUBSTRING = 'sub';
14 const OPERATOR_EXACT = 'exact';
15 const OPERATOR_ABSENT = 'absent';
16 const OPERATOR_PRESENT = 'present';
/**
 * Set the operator characters used when rendering compiled queries.
 *
 * Characters are read by position: offsets 0 and 2 supply the "and" and
 * "not" prefixes, and offsets 10 and 11 supply the opening and closing
 * quote characters.
 *
 * @param string $operators Operator character string.
 * @return this
 */
public function setOperators($operators) {
  $this->operators = $operators;

  return $this;
}
/**
 * Get the operator character string used when rendering compiled queries.
 *
 * @return string Operator character string.
 */
public function getOperators() {
  return $this->operators;
}
/**
 * Set the stemmer which is applied to token values by
 * @{method:compileStemmedQuery}.
 *
 * @param PhutilSearchStemmer $stemmer Stemmer to apply.
 * @return this
 */
public function setStemmer(PhutilSearchStemmer $stemmer) {
  $this->stemmer = $stemmer;

  return $this;
}
/**
 * Get the configured stemmer, if one has been set.
 *
 * @return PhutilSearchStemmer|null Stemmer, or null if none was set.
 */
public function getStemmer() {
  return $this->stemmer;
}
/**
 * Enable or disable function syntax (like "title:term") and the extra
 * "~" and "=" operators when tokenizing queries.
 *
 * @param bool $enable_functions True to enable function syntax.
 * @return this
 */
public function setEnableFunctions($enable_functions) {
  $this->enableFunctions = $enable_functions;

  return $this;
}
/**
 * Test whether function syntax is enabled for tokenizing queries.
 *
 * @return bool True if function syntax is enabled (disabled by default).
 */
public function getEnableFunctions() {
  return $this->enableFunctions;
}
/**
 * Compile every token in a list into a single query string.
 *
 * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
 * @return string|null Rendered tokens joined by spaces, or null if there
 *   are no tokens.
 */
public function compileQuery(array $tokens) {
  assert_instances_of($tokens, 'PhutilSearchQueryToken');

  $rendered = array();
  foreach ($tokens as $query_token) {
    $rendered[] = $this->renderToken($query_token);
  }

  return $this->compileRenderedTokens($rendered);
}
/**
 * Compile only the quoted tokens in a list into a query string. Quoted
 * tokens are rendered without stemming.
 *
 * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
 * @return string|null Rendered quoted tokens joined by spaces, or null if
 *   no token is quoted.
 */
public function compileLiteralQuery(array $tokens) {
  assert_instances_of($tokens, 'PhutilSearchQueryToken');

  $rendered = array();
  foreach ($tokens as $query_token) {
    if ($query_token->isQuoted()) {
      $rendered[] = $this->renderToken($query_token);
    }
  }

  return $this->compileRenderedTokens($rendered);
}
/**
 * Compile only the unquoted tokens in a list into a query string, passing
 * each token value through the configured stemmer (if any).
 *
 * @param list<PhutilSearchQueryToken> $tokens Tokens to compile.
 * @return string|null Rendered unquoted tokens joined by spaces, or null if
 *   every token is quoted.
 */
public function compileStemmedQuery(array $tokens) {
  assert_instances_of($tokens, 'PhutilSearchQueryToken');

  // The stemmer may be null, in which case values are rendered unstemmed.
  $stemmer = $this->getStemmer();

  $rendered = array();
  foreach ($tokens as $query_token) {
    if (!$query_token->isQuoted()) {
      $rendered[] = $this->renderToken($query_token, $stemmer);
    }
  }

  return $this->compileRenderedTokens($rendered);
}
/**
 * Join rendered tokens into a query string, dropping duplicates.
 *
 * @param list<string> $list Rendered tokens.
 * @return string|null Unique tokens joined by single spaces, or null if the
 *   list is empty.
 */
private function compileRenderedTokens(array $list) {
  if (!$list) {
    return null;
  }

  return implode(' ', array_unique($list));
}
/**
 * Parse a raw query string into a list of token objects.
 *
 * @param string|null $query Raw query text.
 * @return list<PhutilSearchQueryToken> One token object per parsed term.
 */
public function newTokens($query) {
  $tokens = array();
  foreach ($this->tokenizeQuery($query) as $dictionary) {
    $tokens[] = PhutilSearchQueryToken::newFromDictionary($dictionary);
  }

  return $tokens;
}
/**
 * Parse a raw query string into a list of token dictionaries.
 *
 * Parsing happens in three steps: the query is split into raw tokens by a
 * character-level state machine, each raw token is resolved to an operator
 * (and, when functions are enabled, a function), and finally contradictory
 * "absent"/"present" assertions are rejected.
 *
 * @param string|null $query Raw query text. Null is treated as an empty
 *   query.
 * @return list<map<string, wild>> Dictionaries with keys "operator",
 *   "quoted", "value" and "raw", plus "function" when functions are enabled.
 * @throws PhutilSearchQueryCompilerSyntaxException If the query is longer
 *   than 1024 bytes or is not syntactically valid.
 */
private function tokenizeQuery($query) {
  $maximum_bytes = 1024;

  if ($query === null) {
    $query = '';
  }

  $query_bytes = strlen($query);
  if ($query_bytes > $maximum_bytes) {
    throw new PhutilSearchQueryCompilerSyntaxException(
      pht(
        'Query is too long (%s bytes, maximum is %s bytes).',
        new PhutilNumber($query_bytes),
        new PhutilNumber($maximum_bytes)));
  }

  $characters = phutil_utf8v($query);
  $enable_functions = $this->getEnableFunctions();

  $tokens = $this->newRawTokens($characters, $enable_functions);
  $results = $this->newResultsFromRawTokens($tokens, $enable_functions);

  if ($enable_functions) {
    $this->assertNoAbsentFunctionConflicts($results);
  }

  return $results;
}

/**
 * Split a vector of query characters into raw tokens.
 *
 * Each character is pushed through a sequence of modes ("scan", "function",
 * "operator", "quote", "token"). A single character may fall through
 * several modes in one iteration, so the order of the mode checks matters.
 *
 * @param list<string> $characters Query characters.
 * @param bool $enable_functions True to recognize "function:" prefixes and
 *   the "~" and "=" operators.
 * @return list<map<string, wild>> Raw tokens with keys "operator" (list of
 *   characters), "quoted" (bool) and "value" (list of characters), plus
 *   "function" (string or null) when functions are enabled.
 * @throws PhutilSearchQueryCompilerSyntaxException If a double quote is
 *   never closed.
 */
private function newRawTokens(array $characters, $enable_functions) {
  $length = count($characters);

  $mode = 'scan';
  $current_operator = array();
  $current_token = array();
  $current_function = null;
  $is_quoted = false;
  $tokens = array();

  if ($enable_functions) {
    $operator_characters = '[~=+-]';
  } else {
    $operator_characters = '[+-]';
  }

  // Build the operator pattern once instead of once per character.
  $operator_pattern = '/^'.$operator_characters.'\z/';

  for ($ii = 0; $ii < $length; $ii++) {
    $character = $characters[$ii];

    // Skip whitespace between terms.
    if ($mode == 'scan') {
      if (preg_match('/^\s\z/u', $character)) {
        continue;
      }

      $mode = 'function';
    }

    // Look ahead for a "name:" prefix at the start of a term.
    if ($mode == 'function') {
      $mode = 'operator';

      if ($enable_functions) {
        $found = false;
        for ($jj = $ii; $jj < $length; $jj++) {
          if (preg_match('/^[a-zA-Z-]\z/u', $characters[$jj])) {
            continue;
          }

          if ($characters[$jj] == ':') {
            $found = $jj;
          }

          break;
        }

        if ($found !== false) {
          $function = array_slice($characters, $ii, ($jj - $ii));
          $current_function = implode('', $function);

          // A bare ":" names no function.
          if (!strlen($current_function)) {
            $current_function = null;
          }

          // Resume parsing after the colon.
          $ii = $jj;
          continue;
        }
      }
    }

    // Collect operator characters which precede the term.
    if ($mode == 'operator') {
      if (!$current_operator) {
        if (preg_match('/^\s\z/u', $character)) {
          continue;
        }
      }

      if (preg_match($operator_pattern, $character)) {
        $current_operator[] = $character;
        continue;
      }

      $mode = 'quote';
    }

    // Consume an opening quote, if the term has one.
    if ($mode == 'quote') {
      if (preg_match('/^"\z/', $character)) {
        $is_quoted = true;
        $mode = 'token';
        continue;
      }

      $mode = 'token';
    }

    if ($mode == 'token') {
      $capture = false;
      $was_quoted = $is_quoted;
      if ($is_quoted) {
        // Only a closing quote ends a quoted term.
        if (preg_match('/^"\z/', $character)) {
          $capture = true;
          $mode = 'scan';
          $is_quoted = false;
        }
      } else {
        if (preg_match('/^\s\z/u', $character)) {
          $capture = true;
          $mode = 'scan';
        }

        // A quote in the middle of an unquoted term ends that term and
        // immediately begins a new quoted term.
        if (preg_match('/^"\z/', $character)) {
          $capture = true;
          $mode = 'token';
          $is_quoted = true;
        }
      }

      if ($capture) {
        $token = array(
          'operator' => $current_operator,
          'quoted' => $was_quoted,
          'value' => $current_token,
        );

        if ($enable_functions) {
          $token['function'] = $current_function;
        }

        $tokens[] = $token;

        $current_operator = array();
        $current_token = array();
        $current_function = null;
        continue;
      } else {
        $current_token[] = $character;
      }
    }
  }

  if ($is_quoted) {
    throw new PhutilSearchQueryCompilerSyntaxException(
      pht(
        'Query contains unmatched double quotes.'));
  }

  // The loop only emits a token when it reaches a terminator (whitespace or
  // a quote), so a query which ends mid-term, like "a b", still has pending
  // state here. If the query ends with whitespace, like "a b ", nothing is
  // pending and we must not emit an empty final token.
  if ($current_function !== null || $current_operator || $current_token) {
    $token = array(
      'operator' => $current_operator,
      'quoted' => false,
      'value' => $current_token,
    );

    if ($enable_functions) {
      $token['function'] = $current_function;
    }

    $tokens[] = $token;
  }

  return $tokens;
}

/**
 * Resolve raw tokens into result dictionaries with a concrete operator.
 *
 * @param list<map<string, wild>> $tokens Raw tokens.
 * @param bool $enable_functions True if raw tokens carry a "function" key.
 * @return list<map<string, wild>> Result dictionaries.
 * @throws PhutilSearchQueryCompilerSyntaxException If a token has an invalid
 *   operator sequence or is missing a required search term.
 */
private function newResultsFromRawTokens(array $tokens, $enable_functions) {
  $results = array();
  $last_function = null;
  foreach ($tokens as $token) {
    $value = implode('', $token['value']);
    $operator_string = implode('', $token['operator']);
    $is_quoted = $token['quoted'];

    switch ($operator_string) {
      case '-':
        $operator = self::OPERATOR_NOT;
        break;
      case '~':
        $operator = self::OPERATOR_SUBSTRING;
        break;
      case '=':
        $operator = self::OPERATOR_EXACT;
        break;
      case '+':
        $operator = self::OPERATOR_AND;
        break;
      case '':
        $use_substring = false;

        if ($enable_functions && !$is_quoted) {
          // See T12995. If this query term contains Chinese, Japanese or
          // Korean characters, treat the term as a substring term by default.
          // These languages do not separate words with spaces, so the term
          // search mode is normally useless.
          if (phutil_utf8_is_cjk($value)) {
            $use_substring = true;
          } else if (phutil_preg_match('/^_/', $value)) {
            // See T13632. Assume users searching for any term that begins
            // with an underscore intend to perform substring search if they
            // don't provide an explicit search function.
            $use_substring = true;
          }
        }

        if ($use_substring) {
          $operator = self::OPERATOR_SUBSTRING;
        } else {
          $operator = self::OPERATOR_AND;
        }
        break;
      default:
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Query has an invalid sequence of operators ("%s").',
            $operator_string));
    }

    if (!strlen($value)) {
      // An empty quoted term ("") always requires a value.
      $require_value = $is_quoted;

      switch ($operator) {
        case self::OPERATOR_NOT:
          // "function:-" asserts that the field is absent.
          if ($enable_functions && ($token['function'] !== null)) {
            $operator = self::OPERATOR_ABSENT;
            $value = null;
          } else {
            $require_value = true;
          }
          break;
        case self::OPERATOR_SUBSTRING:
          // "function:~" asserts that the field is present.
          if ($enable_functions && ($token['function'] !== null)) {
            $operator = self::OPERATOR_PRESENT;
            $value = null;
          } else {
            $require_value = true;
          }
          break;
        default:
          $require_value = true;
          break;
      }

      if ($require_value) {
        throw new PhutilSearchQueryCompilerSyntaxException(
          pht(
            'Query contains a token ("%s") with no search term. Query '.
            'tokens specify text to search for.',
            $this->getDisplayToken($token)));
      }
    }

    $result = array(
      'operator' => $operator,
      'quoted' => $is_quoted,
      'value' => $value,
      'raw' => $this->getDisplayToken($token),
    );

    if ($enable_functions) {
      // If a user provides a query like "title:a b c", we interpret all
      // of the terms to be title terms: the "title:" function sticks
      // until we encounter another function.

      // If a user provides a query like "title:"a"" (with a quoted term),
      // the function is not sticky.

      if ($token['function'] !== null) {
        $function = $token['function'];
      } else {
        $function = $last_function;
      }

      $result['function'] = $function;

      // Note that the function remains sticky across quoted terms appearing
      // after the function term. For example, all of these terms are title
      // terms:
      //
      //   title:a "b c" d

      $is_sticky = (!$result['quoted'] || ($token['function'] === null));

      switch ($operator) {
        case self::OPERATOR_ABSENT:
        case self::OPERATOR_PRESENT:
          $is_sticky = false;
          break;
      }

      if ($is_sticky) {
        $last_function = $function;
      } else {
        $last_function = null;
      }
    }

    $results[] = $result;
  }

  return $results;
}

/**
 * Reject queries which require a function to be absent while also making
 * some other assertion about the same function.
 *
 * @param list<map<string, wild>> $results Result dictionaries, each with a
 *   "function" key.
 * @return void
 * @throws PhutilSearchQueryCompilerSyntaxException If any function is both
 *   required to be absent and constrained by another term.
 */
private function assertNoAbsentFunctionConflicts(array $results) {
  // If any function is required to be "absent", there must be no other
  // terms which make assertions about it.

  $present_tokens = array();
  $absent_tokens = array();
  foreach ($results as $result) {
    $function = $result['function'];

    // Terms with no function can never conflict: "absent" terms always name
    // a function, and function names are never empty. Skipping them also
    // avoids using null as an array offset, which recent PHP releases
    // deprecate.
    if ($function === null) {
      continue;
    }

    if ($result['operator'] === self::OPERATOR_ABSENT) {
      $absent_tokens[$function][] = $result;
    } else {
      $present_tokens[$function][] = $result;
    }
  }

  foreach ($absent_tokens as $function => $function_tokens) {
    $absent_token = head($function_tokens);

    if (empty($present_tokens[$function])) {
      continue;
    }

    $present_token = head($present_tokens[$function]);

    throw new PhutilSearchQueryCompilerSyntaxException(
      pht(
        'Query field must be absent ("%s") and present ("%s"). This '.
        'is impossible, so the query is not valid.',
        $absent_token['raw'],
        $present_token['raw']));
  }
}
/**
 * Render one token as an operator prefix followed by its quoted value.
 *
 * @param PhutilSearchQueryToken $token Token to render.
 * @param PhutilSearchStemmer|null $stemmer Optional stemmer applied to the
 *   token value before it is quoted.
 * @return string Rendered token.
 */
private function renderToken(
  PhutilSearchQueryToken $token,
  PhutilSearchStemmer $stemmer = null) {

  $term = $token->getValue();
  if ($stemmer) {
    $term = $stemmer->stemToken($term);
  }

  $quoted_term = $this->quoteToken($term);
  $prefix = $this->getOperatorPrefix($token->getOperator());

  return $prefix.$quoted_term;
}
/**
 * Look up the prefix character which is rendered in front of a token for a
 * given operator.
 *
 * @param string $operator One of the OPERATOR_* constants. Only "and" and
 *   "not" have a prefix.
 * @return string|null Prefix character, or null if the configured character
 *   is a space.
 * @throws PhutilSearchQueryCompilerSyntaxException If the operator has no
 *   prefix.
 */
private function getOperatorPrefix($operator) {
  $characters = $this->operators;

  if ($operator == self::OPERATOR_AND) {
    $prefix = $characters[0];
  } else if ($operator == self::OPERATOR_NOT) {
    $prefix = $characters[2];
  } else {
    throw new PhutilSearchQueryCompilerSyntaxException(
      pht(
        'Unsupported operator prefix "%s".',
        $operator));
  }

  // A space means "no prefix".
  return ($prefix == ' ') ? null : $prefix;
}
/**
 * Wrap a value in the configured quote characters.
 *
 * The opening and closing quotes are the characters at offsets 10 and 11
 * of the operator string. The value itself is not escaped.
 *
 * @param string $value Value to quote.
 * @return string Quoted value.
 */
private function quoteToken($value) {
  $operators = $this->operators;

  return $operators[10].$value.$operators[11];
}
/**
 * Rebuild a human-readable form of a raw token for use in messages.
 *
 * @param map<string, wild> $token Raw token with "operator" and "value"
 *   character lists, a "quoted" flag, and optionally a "function" name.
 * @return string Function prefix (if any), operator characters and value,
 *   with the value quoted if the token was quoted.
 */
private function getDisplayToken(array $token) {
  $prefix = isset($token['function']) ? $token['function'].':' : '';

  $operators = implode('', $token['operator']);

  $text = implode('', $token['value']);
  if ($token['quoted']) {
    $text = $this->quoteToken($text);
  }

  return $prefix.$operators.$text;
}