Correct a parameter order swap in "diffusion.historyquery" for Mercurial
[phabricator.git] / src / applications / search / engine / PhabricatorSearchNgramEngine.php
blob9cd2c499148a9b8d44f81cdcf572c3e470a2ddcd
1 <?php
/**
 * Tokenizes strings and extracts the unique three-character ngrams they
 * contain.
 *
 * Two extraction modes are exposed: "term" mode, which pads every token with
 * a leading and trailing space before extracting ngrams, and "substring"
 * mode, which extracts ngrams from the bare tokens.
 */
final class PhabricatorSearchNgramEngine
  extends Phobject {

  /**
   * Split a string into whitespace-delimited tokens.
   *
   * Only literal space characters are trimmed from the ends of the string
   * before it is split on runs of whitespace. This means an empty string
   * produces a single empty token, and a string which begins or ends with
   * other whitespace (for example, a tab) produces an empty token at that
   * end.
   *
   * @param string String to tokenize.
   * @return list<string> Tokens, in order of appearance. This is the raw
   *   result of `preg_split()`.
   */
  public function tokenizeNgramString($value) {
    $value = trim($value, ' ');
    $value = preg_split('/\s+/u', $value);
    return $value;
  }

  /**
   * Extract ngrams in "term" mode, where each token is wrapped in single
   * spaces before ngrams are extracted from it.
   *
   * @param string String to extract ngrams from.
   * @return list<string> Sorted list of unique ngrams.
   */
  public function getTermNgramsFromString($string) {
    return $this->getNgramsFromString($string, true);
  }

  /**
   * Extract ngrams in "substring" mode, where ngrams are extracted from each
   * token as-is, without padding.
   *
   * @param string String to extract ngrams from.
   * @return list<string> Sorted list of unique ngrams.
   */
  public function getSubstringNgramsFromString($string) {
    return $this->getNgramsFromString($string, false);
  }

  /**
   * Lowercase and tokenize a string, then collect every unique run of three
   * consecutive characters found in its tokens.
   *
   * Tokens which are shorter than three characters (after padding, in term
   * mode) contribute no ngrams.
   *
   * @param string String to extract ngrams from.
   * @param bool True to pad each token with one leading and one trailing
   *   space before extracting ngrams.
   * @return list<string> Unique ngrams, always as strings, sorted in
   *   byte-wise string order.
   */
  private function getNgramsFromString($value, $as_term) {
    $value = phutil_utf8_strtolower($value);
    $tokens = $this->tokenizeNgramString($value);

    // First, extract unique tokens from the string. This reduces the number
    // of `phutil_utf8v()` calls we need to make if we are indexing a large
    // corpus with redundant terms.
    $unique_tokens = array();
    foreach ($tokens as $token) {
      if ($as_term) {
        $token = ' '.$token.' ';
      }

      $unique_tokens[$token] = true;
    }

    $ngrams = array();
    foreach ($unique_tokens as $token => $ignored) {
      // PHP converts integer-like string keys (such as "123") to integers
      // when they are used as array keys. Restore the string type before
      // doing any string operations on the token.
      $token = (string)$token;

      $token_v = phutil_utf8v($token);
      $length = count($token_v);

      // NOTE: We're being somewhat clever here to micro-optimize performance,
      // especially for very long strings. See PHI87.

      // Record the byte length of each element of the character vector, so
      // ngrams can be sliced out of the original string by byte offset.
      $token_l = array();
      for ($ii = 0; $ii < $length; $ii++) {
        $token_l[$ii] = strlen($token_v[$ii]);
      }

      $ngram_count = $length - 2;
      $cursor = 0;
      for ($ii = 0; $ii < $ngram_count; $ii++) {
        $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];

        $ngram = substr($token, $cursor, $ngram_l);
        $ngrams[$ngram] = $ngram;

        // Advance the byte cursor past the first character of this ngram.
        $cursor += $token_l[$ii];
      }
    }

    // Sort keys as strings. The default comparison treats numeric-looking
    // keys as numbers, which can make distinct ngrams compare as equal and
    // produces an order which differs between PHP versions.
    ksort($ngrams, SORT_STRING);

    // Return the values rather than the keys: integer-like keys have been
    // converted to integers by PHP, but the values are still the original
    // strings.
    return array_values($ngrams);
  }

}