Correct a parameter order swap in "diffusion.historyquery" for Mercurial
[phabricator.git] / src / applications / search / fulltextstorage / PhabricatorElasticFulltextStorageEngine.php
blobf6aead37598cbba5efc16e0e143a247db57a5a26
1 <?php
/**
 * Fulltext storage engine which stores and queries search documents in
 * Elasticsearch.
 *
 * The engine builds the index settings and mappings, writes documents, and
 * converts saved queries into Elasticsearch request bodies. Several behaviors
 * depend on the configured Elasticsearch major version (`version` in the
 * service configuration).
 */
class PhabricatorElasticFulltextStorageEngine
  extends PhabricatorFulltextStorageEngine {

  // Index name, derived from the configured "path" with slashes removed.
  private $index;
  // Request timeout handed to the HTTP future; a falsy value sets none.
  private $timeout;
  // Elasticsearch major version as an integer.
  private $version;

  /**
   * Attach the search service and read engine settings from its config.
   *
   * Reads `path` (default "/phabricator"; slashes are stripped to form the
   * index name), `timeout` (default 15) and `version` (default 5, cast to
   * int).
   *
   * @param PhabricatorSearchService $service Service providing the config.
   * @return this
   */
  public function setService(PhabricatorSearchService $service) {
    $this->service = $service;
    $config = $service->getConfig();
    $index = idx($config, 'path', '/phabricator');
    $this->index = str_replace('/', '', $index);
    $this->timeout = idx($config, 'timeout', 15);
    $this->version = (int)idx($config, 'version', 5);
    return $this;
  }

  /**
   * @return string The constant identifier "elasticsearch".
   */
  public function getEngineIdentifier() {
    return 'elasticsearch';
  }

  /**
   * Name of the document field holding the modification timestamp.
   *
   * @return string "_timestamp" for versions below 2, else "lastModified".
   */
  public function getTimestampField() {
    return $this->version < 2 ?
      '_timestamp' : 'lastModified';
  }

  /**
   * Mapping type used for analyzed text fields.
   *
   * @return string "text" for version 5 and later, else "string".
   */
  public function getTextFieldType() {
    return $this->version >= 5
      ? 'text' : 'string';
  }

  /**
   * @return PhabricatorElasticsearchHost New host object bound to this engine.
   */
  public function getHostType() {
    return new PhabricatorElasticsearchHost($this);
  }

  /**
   * @return wild Whatever the service returns for the "read" role.
   */
  public function getHostForRead() {
    return $this->getService()->getAnyHostForRole('read');
  }

  /**
   * @return wild Whatever the service returns for the "write" role.
   */
  public function getHostForWrite() {
    return $this->getService()->getAnyHostForRole('write');
  }

  /**
   * @param wild $timeout Timeout passed to the HTTP future's setTimeout().
   * @return this
   */
  public function setTimeout($timeout) {
    $this->timeout = $timeout;
    return $this;
  }

  /**
   * @return wild The configured timeout.
   */
  public function getTimeout() {
    return $this->timeout;
  }

  /**
   * List the distinct values of all constants declared on a class.
   *
   * @param string $class Name of the class to reflect on.
   * @return list<wild> Unique constant values (keys are not reindexed after
   *   deduplication).
   */
  public function getTypeConstants($class) {
    $relationship_class = new ReflectionClass($class);
    $typeconstants = $relationship_class->getConstants();
    return array_unique(array_values($typeconstants));
  }

  /**
   * Write a search document to the index with a PUT request to the write
   * host at "/<documentType>/<phid>/".
   *
   * @param PhabricatorSearchAbstractDocument $doc Document to index.
   * @return void
   */
  public function reindexAbstractDocument(
    PhabricatorSearchAbstractDocument $doc) {

    $host = $this->getHostForWrite();

    $type = $doc->getDocumentType();
    $phid = $doc->getPHID();

    $timestamp_key = $this->getTimestampField();

    $spec = array(
      'title'         => $doc->getDocumentTitle(),
      'dateCreated'   => $doc->getDocumentCreated(),
      $timestamp_key  => $doc->getDocumentModified(),
    );

    // Each field may appear several times; collect every corpus (and any
    // non-empty auxiliary value) into a list under the field name.
    foreach ($doc->getFieldData() as $field) {
      list($field_name, $corpus, $aux) = $field;
      if (!isset($spec[$field_name])) {
        $spec[$field_name] = array($corpus);
      } else {
        $spec[$field_name][] = $corpus;
      }
      if ($aux != null) {
        $spec[$field_name][] = $aux;
      }
    }

    // Relationships are stored as lists of related PHIDs. The "<name>_ts"
    // key holds a single time, so the last truthy time seen for a
    // relationship wins.
    foreach ($doc->getRelationshipData() as $field) {
      list($field_name, $related_phid, $rtype, $time) = $field;
      if (!isset($spec[$field_name])) {
        $spec[$field_name] = array($related_phid);
      } else {
        $spec[$field_name][] = $related_phid;
      }
      if ($time) {
        $spec[$field_name.'_ts'] = $time;
      }
    }

    $this->executeRequest($host, "/{$type}/{$phid}/", $spec, 'PUT');
  }

  /**
   * Translate a saved query into an Elasticsearch search request body.
   *
   * @param PhabricatorSavedQuery $query Query to translate.
   * @return map<string, wild> Request body with "_source", "query", "from",
   *   "size" and, when there is no query text, "sort".
   * @throws Exception When offset + limit exceeds 10000.
   */
  private function buildSpec(PhabricatorSavedQuery $query) {
    $q = new PhabricatorElasticsearchQueryBuilder('bool');
    $query_string = $query->getParameter('query');
    // Guard against null explicitly: passing null to strlen() is deprecated
    // in newer PHP versions. The result is the same as before.
    if ($query_string !== null && strlen($query_string)) {
      // Build a simple_query_string query over all fields that must match all
      // of the words in the search string.
      $q->addMustClause(array(
        'simple_query_string' => array(
          'query'  => $query_string,
          'fields' => array(
            PhabricatorSearchDocumentFieldType::FIELD_TITLE.'.*',
            PhabricatorSearchDocumentFieldType::FIELD_BODY.'.*',
            PhabricatorSearchDocumentFieldType::FIELD_COMMENT.'.*',
          ),
          'default_operator' => 'AND',
        ),
      ));

      // This second query clause is "SHOULD" so it only affects ranking of
      // documents which already matched the "MUST" clause. This amplifies the
      // score of documents which have an exact match on title, body
      // or comments.
      $q->addShouldClause(array(
        'simple_query_string' => array(
          'query'  => $query_string,
          'fields' => array(
            '*.raw',
            PhabricatorSearchDocumentFieldType::FIELD_TITLE.'^4',
            PhabricatorSearchDocumentFieldType::FIELD_BODY.'^3',
            PhabricatorSearchDocumentFieldType::FIELD_COMMENT.'^1.2',
          ),
          'analyzer' => 'english_exact',
          'default_operator' => 'and',
        ),
      ));

    }

    $exclude = $query->getParameter('exclude');
    if ($exclude) {
      $q->addFilterClause(array(
        'not' => array(
          'ids' => array(
            'values' => array($exclude),
          ),
        ),
      ));
    }

    $relationship_map = array(
      PhabricatorSearchRelationship::RELATIONSHIP_AUTHOR =>
        $query->getParameter('authorPHIDs', array()),
      PhabricatorSearchRelationship::RELATIONSHIP_SUBSCRIBER =>
        $query->getParameter('subscriberPHIDs', array()),
      PhabricatorSearchRelationship::RELATIONSHIP_PROJECT =>
        $query->getParameter('projectPHIDs', array()),
      PhabricatorSearchRelationship::RELATIONSHIP_REPOSITORY =>
        $query->getParameter('repositoryPHIDs', array()),
    );

    $statuses = $query->getParameter('statuses', array());
    $statuses = array_fuse($statuses);

    $rel_open = PhabricatorSearchRelationship::RELATIONSHIP_OPEN;
    $rel_closed = PhabricatorSearchRelationship::RELATIONSHIP_CLOSED;
    $rel_unowned = PhabricatorSearchRelationship::RELATIONSHIP_UNOWNED;

    $include_open = !empty($statuses[$rel_open]);
    $include_closed = !empty($statuses[$rel_closed]);

    // Only constrain by status when exactly one of open/closed is requested;
    // requesting both or neither adds no status clause.
    if ($include_open && !$include_closed) {
      $q->addExistsClause($rel_open);
    } else if (!$include_open && $include_closed) {
      $q->addExistsClause($rel_closed);
    }

    if ($query->getParameter('withUnowned')) {
      $q->addExistsClause($rel_unowned);
    }

    $rel_owner = PhabricatorSearchRelationship::RELATIONSHIP_OWNER;
    if ($query->getParameter('withAnyOwner')) {
      $q->addExistsClause($rel_owner);
    } else {
      $owner_phids = $query->getParameter('ownerPHIDs', array());
      if (count($owner_phids)) {
        $q->addTermsClause($rel_owner, $owner_phids);
      }
    }

    foreach ($relationship_map as $field => $phids) {
      if (is_array($phids) && !empty($phids)) {
        $q->addTermsClause($field, $phids);
      }
    }

    // With no "must" clause at all, match every document.
    if (!$q->getClauseCount('must')) {
      $q->addMustClause(array('match_all' => array('boost' => 1)));
    }

    $spec = array(
      '_source' => false,
      'query' => array(
        'bool' => $q->toArray(),
      ),
    );

    // Without query text, order results by creation date, newest first.
    if (!$query->getParameter('query')) {
      $spec['sort'] = array(
        array('dateCreated' => 'desc'),
      );
    }

    $offset = (int)$query->getParameter('offset', 0);
    $limit = (int)$query->getParameter('limit', 101);
    if ($offset + $limit > 10000) {
      throw new Exception(pht(
        'Query offset is too large. offset+limit=%s (max=%s)',
        $offset + $limit,
        10000));
    }
    $spec['from'] = $offset;
    $spec['size'] = $limit;

    return $spec;
  }

  /**
   * Run a saved query, trying each "read" host in turn until one answers.
   *
   * @param PhabricatorSavedQuery $query Query to execute.
   * @return list<string> The "_id" of every hit returned by the first host
   *   that responds successfully.
   * @throws PhutilAggregateException When every host fails (or there are no
   *   hosts); wraps the per-host exceptions.
   */
  public function executeSearch(PhabricatorSavedQuery $query) {
    $types = $query->getParameter('types');
    if (!$types) {
      $types = array_keys(
        PhabricatorSearchApplicationSearchEngine::getIndexableDocumentTypes());
    }

    // Don't use '/_search' for the case that there is something
    // else in the index (for example if 'phabricator' is only an alias to
    // some bigger index). Use '/$types/_search' instead.
    $uri = '/'.implode(',', $types).'/_search';

    $spec = $this->buildSpec($query);
    $exceptions = array();

    foreach ($this->service->getAllHostsForRole('read') as $host) {
      try {
        $response = $this->executeRequest($host, $uri, $spec);
        $phids = ipull($response['hits']['hits'], '_id');
        return $phids;
      } catch (Exception $e) {
        $exceptions[] = $e;
      }
    }
    throw new PhutilAggregateException(pht('All Fulltext Search hosts failed:'),
      $exceptions);
  }

  /**
   * Test whether the configured index exists on a host.
   *
   * For version 5 and later this looks the index up by name in the response
   * of "/_stats/". Older versions treat any truthy response from the probe
   * URI as existence. An HTTP 404 is reported as "does not exist".
   *
   * @param PhabricatorElasticsearchHost|null $host Host to ask; defaults to
   *   a host for the "read" role.
   * @return bool
   * @throws HTTPFutureHTTPResponseStatus For HTTP errors other than 404.
   */
  public function indexExists(PhabricatorElasticsearchHost $host = null) {
    if (!$host) {
      $host = $this->getHostForRead();
    }
    try {
      if ($this->version >= 5) {
        $uri = '/_stats/';
        $res = $this->executeRequest($host, $uri, array());
        // Look up the configured index name rather than assuming the
        // default, so installs with a custom "path" are detected too.
        return isset($res['indices'][$this->index]);
      } else if ($this->version >= 2) {
        // NOTE(review): an empty path presumably resolves to the index root
        // via the host's getURI() -- confirm against that class.
        $uri = '';
      } else {
        $uri = '/_status/';
      }
      return (bool)$this->executeRequest($host, $uri, array());
    } catch (HTTPFutureHTTPResponseStatus $e) {
      if ($e->getStatusCode() == 404) {
        return false;
      }
      throw $e;
    }
  }

  /**
   * Build the index settings and mappings this engine requires.
   *
   * @return map<string, wild> Map with "settings" (analysis filters and
   *   analyzers) and "mappings" (one entry per indexable document type).
   */
  private function getIndexConfiguration() {
    $data = array();
    $data['settings'] = array(
      'index' => array(
        'auto_expand_replicas' => '0-2',
        'analysis' => array(
          'filter' => array(
            'english_stop' => array(
              'type' => 'stop',
              'stopwords' => '_english_',
            ),
            'english_stemmer' => array(
              'type' =>       'stemmer',
              'language' =>   'english',
            ),
            'english_possessive_stemmer' => array(
              'type' =>       'stemmer',
              'language' =>   'possessive_english',
            ),
          ),
          'analyzer' => array(
            'english_exact' => array(
              'tokenizer' => 'standard',
              'filter'    => array('lowercase'),
            ),
            'letter_stop' => array(
              'tokenizer' => 'letter',
              'filter'    => array('lowercase', 'english_stop'),
            ),
            'english_stem' => array(
              'tokenizer' => 'standard',
              'filter'    => array(
                'english_possessive_stemmer',
                'lowercase',
                'english_stop',
                'english_stemmer',
              ),
            ),
          ),
        ),
      ),
    );

    $fields = $this->getTypeConstants('PhabricatorSearchDocumentFieldType');
    $relationships = $this->getTypeConstants('PhabricatorSearchRelationship');

    $doc_types = array_keys(
      PhabricatorSearchApplicationSearchEngine::getIndexableDocumentTypes());

    $text_type = $this->getTextFieldType();

    foreach ($doc_types as $type) {
      $properties = array();
      foreach ($fields as $field) {
        // Use the custom analyzer for the corpus of text
        $properties[$field] = array(
          'type'                  => $text_type,
          'fields' => array(
            'raw' => array(
              'type'                  => $text_type,
              'analyzer'              => 'english_exact',
              'search_analyzer'       => 'english',
              'search_quote_analyzer' => 'english_exact',
            ),
            'keywords' => array(
              'type'                  => $text_type,
              'analyzer'              => 'letter_stop',
            ),
            'stems' => array(
              'type'                  => $text_type,
              'analyzer'              => 'english_stem',
            ),
          ),
        );
      }

      // Relationship fields hold exact PHIDs. Before version 5 they are
      // mapped as non-analyzed strings; from 5 on, as keywords.
      if ($this->version < 5) {
        foreach ($relationships as $rel) {
          $properties[$rel] = array(
            'type'  => 'string',
            'index' => 'not_analyzed',
            'include_in_all' => false,
          );
          $properties[$rel.'_ts'] = array(
            'type'  => 'date',
            'include_in_all' => false,
          );
        }
      } else {
        foreach ($relationships as $rel) {
          $properties[$rel] = array(
            'type'  => 'keyword',
            'include_in_all' => false,
            'doc_values' => false,
          );
          $properties[$rel.'_ts'] = array(
            'type'  => 'date',
            'include_in_all' => false,
          );
        }
      }

      // Ensure we have dateCreated since the default query requires it
      $properties['dateCreated']['type'] = 'date';
      $properties['lastModified']['type'] = 'date';

      $data['mappings'][$type]['properties'] = $properties;
    }
    return $data;
  }

  /**
   * Test whether the index exists and its live settings and mappings
   * contain everything in the required configuration.
   *
   * @param PhabricatorElasticsearchHost|null $host Host to ask; defaults to
   *   a host for the "read" role.
   * @return bool
   */
  public function indexIsSane(PhabricatorElasticsearchHost $host = null) {
    if (!$host) {
      $host = $this->getHostForRead();
    }
    if (!$this->indexExists($host)) {
      return false;
    }
    $cur_mapping = $this->executeRequest($host, '/_mapping/', array());
    $cur_settings = $this->executeRequest($host, '/_settings/', array());

    // If either response does not describe our index, there is nothing to
    // compare against, so the index cannot be considered sane.
    $index_settings = idx($cur_settings, $this->index);
    $index_mapping = idx($cur_mapping, $this->index);
    if (!is_array($index_settings) || !is_array($index_mapping)) {
      return false;
    }

    $actual = array_merge($index_settings, $index_mapping);

    return $this->check($actual, $this->getIndexConfiguration());
  }

  /**
   * Recursively check that every key in the required Elasticsearch
   * configuration is present in the actual configuration with an equal
   * (loosely compared, boolean-normalized) value. Extra keys in the actual
   * configuration are ignored.
   *
   * @param array $actual Configuration reported by the server.
   * @param array $required Configuration this engine requires.
   * @param string $path Dotted key path, extended on each recursive call;
   *   not otherwise read by this method.
   * @return bool
   */
  private function check($actual, $required, $path = '') {
    foreach ($required as $key => $value) {
      if (!array_key_exists($key, $actual)) {
        if ($key === '_all') {
          // The _all field never comes back so we just have to assume it
          // is set correctly.
          continue;
        }
        return false;
      }
      if (is_array($value)) {
        if (!is_array($actual[$key])) {
          return false;
        }
        if (!$this->check($actual[$key], $value, $path.'.'.$key)) {
          return false;
        }
        continue;
      }

      $actual[$key] = self::normalizeConfigValue($actual[$key]);
      $value = self::normalizeConfigValue($value);
      if ($actual[$key] != $value) {
        return false;
      }
    }
    return true;
  }

  /**
   * Normalize a config value for comparison. Elasticsearch accepts all kinds
   * of config values but it tends to throw back 'true' for true and 'false' for
   * false so we normalize everything. Sometimes, oddly, it'll throw back false
   * for false....
   *
   * @param mixed $value config value
   * @return mixed value normalized
   */
  private static function normalizeConfigValue($value) {
    if ($value === true) {
      return 'true';
    } else if ($value === false) {
      return 'false';
    }
    return $value;
  }

  /**
   * (Re)create the index: delete it on the write host if it already exists,
   * then create it with the required configuration.
   *
   * @return void
   */
  public function initIndex() {
    $host = $this->getHostForWrite();
    if ($this->indexExists()) {
      $this->executeRequest($host, '/', array(), 'DELETE');
    }
    $data = $this->getIndexConfiguration();
    $this->executeRequest($host, '/', $data, 'PUT');
  }

  /**
   * Fetch display statistics for the configured index from "/_stats/".
   *
   * @param PhabricatorElasticsearchHost|null $host Host to ask; defaults to
   *   a host for the "read" role.
   * @return map<string, wild>|false Map of translated label to value, or
   *   false for versions below 2 or when the response has no entry for the
   *   index.
   */
  public function getIndexStats(PhabricatorElasticsearchHost $host = null) {
    if ($this->version < 2) {
      return false;
    }
    if (!$host) {
      $host = $this->getHostForRead();
    }
    $uri = '/_stats/';

    $res = $this->executeRequest($host, $uri, array());
    // The index may be absent from the response (for example, before it has
    // been created); report "no stats" rather than failing.
    $stats = idxv($res, array('indices', $this->index));
    if (!is_array($stats)) {
      return false;
    }

    return array(
      pht('Queries') =>
        idxv($stats, array('primaries', 'search', 'query_total')),
      pht('Documents') =>
        idxv($stats, array('total', 'docs', 'count')),
      pht('Deleted') =>
        idxv($stats, array('total', 'docs', 'deleted')),
      pht('Storage Used') =>
        phutil_format_bytes(idxv($stats,
          array('total', 'store', 'size_in_bytes'))),
    );
  }

  /**
   * Send a JSON request to a host and decode the JSON response.
   *
   * The host is marked unhealthy on a timeout, on an HTTP status above 499,
   * or when the response body is not valid JSON, and marked healthy after a
   * successfully decoded GET response.
   *
   * @param PhabricatorElasticsearchHost $host Host to send the request to.
   * @param string $path Path passed to the host's getURI().
   * @param array $data Request body; JSON-encoded before sending.
   * @param string $method HTTP method; defaults to "GET".
   * @return wild|null Decoded response for GET requests; null for any other
   *   method (the response body is not parsed).
   * @throws HTTPFutureResponseStatus When the request fails.
   * @throws PhutilProxyException When the response is not valid JSON.
   */
  private function executeRequest(PhabricatorElasticsearchHost $host, $path,
    array $data, $method = 'GET') {

    $uri = $host->getURI($path);
    $data = phutil_json_encode($data);
    $future = new HTTPSFuture($uri, $data);
    $future->addHeader('Content-Type', 'application/json');

    if ($method != 'GET') {
      $future->setMethod($method);
    }
    if ($this->getTimeout()) {
      $future->setTimeout($this->getTimeout());
    }
    try {
      list($body) = $future->resolvex();
    } catch (HTTPFutureResponseStatus $ex) {
      if ($ex->isTimeout() || (int)$ex->getStatusCode() > 499) {
        $host->didHealthCheck(false);
      }
      throw $ex;
    }

    if ($method != 'GET') {
      return null;
    }

    try {
      $data = phutil_json_decode($body);
      $host->didHealthCheck(true);
      return $data;
    } catch (PhutilJSONParserException $ex) {
      $host->didHealthCheck(false);
      throw new PhutilProxyException(
        pht('Elasticsearch server returned invalid JSON!'),
        $ex);
    }
  }

}