3 final class PhutilSearchStemmer
/**
 * Stem a single token.
 *
 * The token is first passed through normalizeToken() and the normalized
 * form is then handed to applyStemmer().
 *
 * @param string $token Token to stem.
 * @return string Whatever applyStemmer() returns for the normalized token.
 */
public function stemToken($token) {
  return $this->applyStemmer($this->normalizeToken($token));
}
/**
 * Stem every distinct word of a block of text.
 *
 * The corpus is normalized, split on runs of characters other than ASCII
 * letters, digits, bytes 0x7F-0xFF, "." and "_", and each piece has leading
 * and trailing "." and "_" trimmed. Pieces shorter than three bytes are
 * dropped, duplicates are collapsed, and the survivors are stemmed in
 * first-seen order.
 *
 * @param string $corpus Text to stem.
 * @return string Stems joined by single spaces.
 */
public function stemCorpus($corpus) {
  $normalized = $this->normalizeCorpus($corpus);
  $pieces = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $normalized);

  // Keyed by the word itself so repeated words are only stemmed once.
  $unique = array();
  foreach ($pieces as $piece) {
    $candidate = trim($piece, '._');

    // strlen() counts bytes; anything under three bytes is not indexed.
    if (strlen($candidate) >= 3) {
      $unique[$candidate] = $candidate;
    }
  }

  $stems = array();
  foreach ($unique as $candidate) {
    $stems[] = $this->applyStemmer($candidate);
  }

  return implode(' ', $stems);
}
/**
 * Lowercase a single token before it is handed to the stemmer.
 *
 * @param string $token Raw token.
 * @return string Result of phutil_utf8_strtolower() on the token.
 */
private function normalizeToken($token) {
  $lowercased = phutil_utf8_strtolower($token);
  return $lowercased;
}
/**
 * Lowercase a whole corpus before it is split into tokens.
 *
 * @param string $corpus Raw text.
 * @return string Result of phutil_utf8_strtolower() on the corpus.
 */
private function normalizeCorpus($corpus) {
  $lowercased = phutil_utf8_strtolower($corpus);
  return $lowercased;
}
43 * @phutil-external-symbol class Porter
45 private function applyStemmer($normalized_token) {
// Stems a token that callers have already run through normalizeToken() or
// normalizeCorpus(). Tokens containing "." or "_" are returned untouched;
// everything else is passed to the external Porter::stem().
46 // If the token has internal punctuation, handle it literally. This
47 // deals with things like domain names, Conduit API methods, and other
48 // sorts of informal tokens.
49 if (preg_match('/[._]/', $normalized_token)) {
50 return $normalized_token;
// Pull in the bundled Porter stemmer source on demand.
// NOTE(review): $loaded is presumably a function-level static that is set
// once the require has run, so this happens only once - confirm.
55 if ($loaded === null) {
56 $root = dirname(phutil_get_library_root('phabricator'));
57 require_once $root.'/externals/porter-stemmer/src/Porter.php';
62 $stem = Porter
::stem($normalized_token);
64 // If the stem is too short, it won't be a candidate for indexing. These
65 // tokens are also likely to be acronyms (like "DNS") rather than real
// Fall back to the unstemmed token when the stem is under three bytes
// (strlen() counts bytes, not characters).
// NOTE(review): presumably $stem is returned otherwise - confirm.
67 if (strlen($stem) < 3) {
68 return $normalized_token;