3 final class PhabricatorSearchNgramEngine
/**
 * Split a string into tokens on runs of whitespace.
 *
 * Leading and trailing space characters are stripped first, then the
 * remainder is split wherever one or more whitespace characters occur.
 *
 * NOTE(review): the end of this method is not visible in this excerpt;
 * presumably it returns the split result. Confirm against the full file.
 *
 * @param string $value Text to tokenize.
 */
6 public function tokenizeNgramString($value) {
// Only the literal space character is trimmed (explicit character list).
// Other leading/trailing whitespace such as tabs or newlines is left in
// place, so the split below can yield an empty first or last element.
7 $value = trim($value, ' ');
// The `u` modifier makes the pattern treat the subject as UTF-8. No
// PREG_SPLIT_NO_EMPTY flag is passed, so empty elements are kept, and
// `preg_split()` returns `false` if matching fails (for example, on
// malformed UTF-8).
8 $value = preg_split('/\s+/u', $value);
/**
 * Extract ngrams from a string in "term" mode.
 *
 * Thin wrapper: forwards to `getNgramsFromString()` with `$as_term` set to
 * `true` and returns its result unchanged.
 *
 * @param string $string Text to extract ngrams from.
 * @return array Whatever `getNgramsFromString()` returns (the keys of its
 *   ngram map).
 */
12 public function getTermNgramsFromString($string) {
13 return $this->getNgramsFromString($string, true);
/**
 * Extract ngrams from a string in "substring" mode.
 *
 * Thin wrapper: forwards to `getNgramsFromString()` with `$as_term` set to
 * `false` and returns its result unchanged.
 *
 * @param string $string Text to extract ngrams from.
 * @return array Whatever `getNgramsFromString()` returns (the keys of its
 *   ngram map).
 */
16 public function getSubstringNgramsFromString($string) {
17 return $this->getNgramsFromString($string, false);
/**
 * Build the set of three-character ngrams found in the tokens of a string.
 *
 * The input is lowercased and tokenized, duplicate tokens are collapsed,
 * and a three-character window is then slid across each unique token. Each
 * window is stored in a map keyed by the ngram itself, so repeats collapse;
 * the keys of that map are returned.
 *
 * NOTE(review): several lines of this method are not visible in this
 * excerpt, including the initialisation of `$ngrams`, `$token_l` and
 * `$cursor`, any use of `$as_term`, and the closing braces. Comments below
 * describe only what the visible lines do; confirm the rest against the
 * full file.
 *
 * @param string $value Text to extract ngrams from.
 * @param bool $as_term Mode flag passed by the public wrappers. Presumably
 *   controls the space padding applied to each token; the guarding
 *   condition is not visible here.
 * @return array Keys of the ngram map. PHP casts array keys that look like
 *   decimal integers to `int`, so an all-digit ngram may come back as an
 *   integer rather than a string.
 */
20 private function getNgramsFromString($value, $as_term) {
// `phutil_utf8_strtolower()` is defined outside this file; by its name it
// is a UTF-8-aware lowercase. TODO confirm.
21 $value = phutil_utf8_strtolower($value);
22 $tokens = $this->tokenizeNgramString($value);
24 // First, extract unique tokens from the string. This reduces the number
25 // of `phutil_utf8v()` calls we need to make if we are indexing a large
26 // corpus with redundant terms.
27 $unique_tokens = array();
28 foreach ($tokens as $token) {
// Surround the token with a space on each side, so the first and last
// ngrams of the token include a boundary space.
// NOTE(review): presumably this is done only when `$as_term` is true; the
// line that would guard it is not visible in this excerpt.
30 $token = ' '.$token.' ';
// Using the token as the array key is what removes duplicates.
33 $unique_tokens[$token] = true;
37 foreach ($unique_tokens as $token => $ignored) {
// `phutil_utf8v()` is defined outside this file; presumably it splits the
// token into an array with one element per UTF-8 character, which makes
// `$length` a character count rather than a byte count. TODO confirm.
38 $token_v = phutil_utf8v($token);
39 $length = count($token_v);
41 // NOTE: We're being somewhat clever here to micro-optimize performance,
42 // especially for very long strings. See PHI87.
// Record the byte length (`strlen()`) of each element of `$token_v`, so
// windows can be cut out of the original string by byte offset.
45 for ($ii = 0; $ii < $length; $ii++
) {
46 $token_l[$ii] = strlen($token_v[$ii]);
// A sequence of `$length` elements contains `$length - 2` windows of three
// consecutive elements. For tokens shorter than three elements this is
// zero or negative, and the loop below does not run.
49 $ngram_count = $length - 2;
// For each window: add up the byte lengths of its three elements, then
// slice exactly that many bytes out of `$token`, starting at `$cursor`.
51 for ($ii = 0; $ii < $ngram_count; $ii++
) {
52 $ngram_l = $token_l[$ii] +
$token_l[$ii +
1] +
$token_l[$ii +
2];
// `substr()` works on bytes, which is why lengths are tracked in bytes.
// `$cursor` is expected to hold the byte offset of the window's first
// element; its initialisation is not visible here (presumably 0 for each
// token).
54 $ngram = substr($token, $cursor, $ngram_l);
// Keying by the ngram deduplicates across all tokens.
55 $ngrams[$ngram] = $ngram;
// Advance the byte cursor past the first element of the current window,
// so it points at the start of the next window.
// NOTE(review): the operator is split across two lines as `+` / `=` in
// this excerpt, which would not parse as written; it is presumably `+=`
// in the real file.
57 $cursor +
= $token_l[$ii];
// Return the distinct ngrams (the map's keys).
63 return array_keys($ngrams);