src/applications/search/engine/PhabricatorSearchNgramEngine.php

   1 <?php
   2
   /**
    * Splits strings into whitespace-delimited tokens and extracts the unique
    * three-character ngrams of those tokens.
    */
   final class PhabricatorSearchNgramEngine
     extends Phobject {

     /**
      * Split a string into tokens on runs of whitespace.
      *
      * Only literal space characters are trimmed from the ends of the value
      * before it is split. A value which begins or ends with other whitespace
      * (for example, a tab or a newline) therefore produces an empty first or
      * last token, and an empty value produces a single empty token.
      *
      * @param string $value String to tokenize.
      * @return array|false List of string tokens, or `false` if `preg_split()`
      *   fails.
      */
     public function tokenizeNgramString($value) {
       $value = trim($value, ' ');
       $value = preg_split('/\s+/u', $value);
       return $value;
     }

     /**
      * Get the unique ngrams of a string, treating each token as a whole term.
      *
      * Each token is padded with one leading and one trailing space before
      * ngrams are extracted, so the ngrams at the edges of a token include the
      * padding.
      *
      * @param string $string String to extract ngrams from.
      * @return array Sorted list of unique ngram strings.
      */
     public function getTermNgramsFromString($string) {
       return $this->getNgramsFromString($string, true);
     }

     /**
      * Get the unique ngrams of a string without padding the tokens.
      *
      * @param string $string String to extract ngrams from.
      * @return array Sorted list of unique ngram strings.
      */
     public function getSubstringNgramsFromString($string) {
       return $this->getNgramsFromString($string, false);
     }

     /**
      * Lowercase and tokenize a string, then collect every run of three
      * consecutive characters found inside each unique token.
      *
      * Tokens shorter than three characters (after optional padding)
      * contribute no ngrams.
      *
      * @param string $value String to extract ngrams from.
      * @param bool $as_term If true, surround each token with single spaces
      *   before extracting ngrams.
      * @return array List of unique ngram strings in byte-wise string order.
      */
     private function getNgramsFromString($value, $as_term) {
       $value = phutil_utf8_strtolower($value);
       $tokens = $this->tokenizeNgramString($value);

       // First, extract unique tokens from the string. This reduces the number
       // of `phutil_utf8v()` calls we need to make if we are indexing a large
       // corpus with redundant terms.
       $unique_tokens = array();
       foreach ($tokens as $token) {
         if ($as_term) {
           $token = ' '.$token.' ';
         }

         $unique_tokens[$token] = true;
       }

       $ngrams = array();
       foreach ($unique_tokens as $token => $ignored) {
         // PHP converts integer-like string keys (such as "1234") to integers.
         // Restore the string form before doing byte-level string work on it.
         $token = (string)$token;

         // The code below relies on `phutil_utf8v()` returning one list element
         // per character of the token.
         $token_v = phutil_utf8v($token);
         $length = count($token_v);

         // NOTE: We're being somewhat clever here to micro-optimize performance,
         // especially for very long strings. See PHI87.

         // Record the byte length of each character so that ngrams can be cut
         // out of the original string with plain byte offsets.
         $token_l = array();
         for ($ii = 0; $ii < $length; $ii++) {
           $token_l[$ii] = strlen($token_v[$ii]);
         }

         // A token with N characters has N - 2 ngrams. `$cursor` is the byte
         // offset of the character where the current ngram begins.
         $ngram_count = $length - 2;
         $cursor = 0;
         for ($ii = 0; $ii < $ngram_count; $ii++) {
           $ngram_l = $token_l[$ii] + $token_l[$ii + 1] + $token_l[$ii + 2];

           $ngram = substr($token, $cursor, $ngram_l);
           $ngrams[$ngram] = $ngram;

           $cursor += $token_l[$ii];
         }
       }

       // Compare keys as strings. Integer-like ngrams (such as "123") are
       // stored under integer keys, and the default loose comparison of mixed
       // integer and string keys does not behave the same way across PHP
       // versions.
       ksort($ngrams, SORT_STRING);

       // Return the values rather than the keys: the values are always the
       // original ngram strings, while integer-like keys have been converted to
       // integers by PHP.
       return array_values($ngrams);
     }

   }