Correct a parameter order swap in "diffusion.historyquery" for Mercurial
[phabricator.git] / src / applications / search / compiler / PhutilSearchStemmer.php
blob3255336ff0b8c0c469871f34a61d65d7bf068c24
1 <?php
final class PhutilSearchStemmer
  extends Phobject {

  /**
   * Stem a single token.
   *
   * The token is lowercased and then handed to the stemmer.
   *
   * @param string $token Raw token.
   * @return string Stemmed form of the lowercased token, or the lowercased
   *   token itself when the stemmer declines to reduce it.
   */
  public function stemToken($token) {
    return $this->applyStemmer($this->normalizeToken($token));
  }

  /**
   * Stem every distinct word in a block of text.
   *
   * The corpus is lowercased and split on runs of bytes other than ASCII
   * letters, ASCII digits, bytes 0x7F-0xFF, "." and "_". Leading and
   * trailing "." and "_" are stripped from each piece, pieces shorter than
   * three bytes are dropped, and each remaining distinct word is stemmed
   * once, in order of first appearance.
   *
   * @param string $corpus Text to stem.
   * @return string Stems joined by single spaces.
   */
  public function stemCorpus($corpus) {
    $pieces = preg_split(
      '/[^a-zA-Z0-9\x7F-\xFF._]+/',
      $this->normalizeCorpus($corpus));

    $seen = array();
    $stems = array();
    foreach ($pieces as $piece) {
      $word = trim($piece, '._');

      // Skip words too short to keep, and words already stemmed earlier
      // in this corpus.
      if (strlen($word) < 3 || isset($seen[$word])) {
        continue;
      }

      $seen[$word] = true;
      $stems[] = $this->applyStemmer($word);
    }

    return implode(' ', $stems);
  }

  /**
   * Lowercase a single token before stemming.
   *
   * @param string $token Raw token.
   * @return string Lowercased token.
   */
  private function normalizeToken($token) {
    return phutil_utf8_strtolower($token);
  }

  /**
   * Lowercase a whole corpus before it is split into tokens.
   *
   * @param string $corpus Raw text.
   * @return string Lowercased text.
   */
  private function normalizeCorpus($corpus) {
    return phutil_utf8_strtolower($corpus);
  }

  /**
   * Reduce an already-normalized token to its stem using the bundled Porter
   * stemmer.
   *
   * @param string $normalized_token Lowercased token.
   * @return string The stem, or the token unchanged when it contains "." or
   *   "_" or when its stem would be shorter than three bytes.
   *
   * @phutil-external-symbol class Porter
   */
  private function applyStemmer($normalized_token) {
    // Tokens with internal punctuation (domain names, Conduit API method
    // names, and similar informal tokens) are kept literally.
    if (preg_match('/[._]/', $normalized_token)) {
      return $normalized_token;
    }

    // Pull in the external stemmer library the first time it is needed.
    static $loaded;
    if ($loaded === null) {
      $root = dirname(phutil_get_library_root('phabricator'));
      require_once $root.'/externals/porter-stemmer/src/Porter.php';
      $loaded = true;
    }

    $stem = Porter::stem($normalized_token);

    // A very short stem is not a candidate for indexing, and the token is
    // more likely an acronym (like "DNS") than a real English word, so the
    // original token is kept instead.
    return (strlen($stem) < 3) ? $normalized_token : $stem;
  }

}