fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / search / indexer / PyLuceneIndexer.py
blob3ad0bce44447d65bad86e5c6e04ac04ce2335e86
1 # -*- coding: utf-8 -*-
3 # Copyright 2008 Zuza Software Foundation
4 #
5 # This file is part of translate.
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 """
24 interface for the PyLucene (v2.x) indexing engine
26 take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface
27 """
29 __revision__ = "$Id$"
31 import CommonIndexer
32 # TODO: replace this dependency on the jToolkit
33 import jToolkit.glock
34 import tempfile
35 import re
36 import os
37 import time
# try to import the PyLucene package (with the two possible names)
# remember the type of the detected package (compiled with jcc (>=v2.3) or
# with gcj (<=v2.2)
try:
    import PyLucene
    _COMPILER = 'gcj'
except ImportError:
    # if this fails, then there is no pylucene installed
    # newer (jcc-compiled) releases ship as the "lucene" module instead;
    # alias it to "PyLucene" so the rest of this file works unchanged
    import lucene
    PyLucene = lucene
    # the jcc build requires an explicit JVM start-up before any use
    PyLucene.initVM(PyLucene.CLASSPATH)
    _COMPILER = 'jcc'

# pseudo field name used for terms that are not bound to a specific field
UNNAMED_FIELD_NAME = "FieldWithoutAName"
# maximum number of tokens indexed per field (passed to setMaxFieldLength)
MAX_FIELD_SIZE = 1048576
def is_available():
    """check if this indexing engine interface is usable

    @return: True if a PyLucene v2.x installation was detected
    @rtype: bool
    """
    detected_major = _get_pylucene_version()
    return detected_major == 2
class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """manage and use a pylucene indexing database"""

    # base query class of this backend (used by CommonDatabase type checks)
    QUERY_TYPE = PyLucene.Query
    # sub-directory name of the index below "basedir"
    INDEX_DIRECTORY_NAME = "lucene"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        The following exceptions can be raised:
            ValueError: the given location exists, but the database type
                is incompatible (e.g. created by a different indexing engine)
            OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it empty
            to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        @throws: OSError, ValueError
        """
        super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
                create_allowed=create_allowed)
        self.pyl_analyzer = PyLucene.StandardAnalyzer()
        self.writer = None
        self.reader = None
        self.index_version = None
        try:
            # try to open an existing database
            tempreader = PyLucene.IndexReader.open(self.location)
            tempreader.close()
        except PyLucene.JavaError, err_msg:
            # Write an error out, in case this is a real problem instead of an absence of an index
            # TODO: turn the following two lines into debug output
            #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
            #DEBUG_FOO("could not open index, so going to create: " + errorstr)
            # Create the index, so we can open cached readers on it
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            try:
                # create the parent directory if it does not exist
                parent_path = os.path.dirname(self.location)
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except IOError, err_msg:
                raise OSError("Indexer: failed to create the parent " \
                        + "directory (%s) of the indexing database: %s" \
                        % (parent_path, err_msg))
            try:
                # the "True" flag asks IndexWriter to create a fresh index
                tempwriter = PyLucene.IndexWriter(self.location,
                        self.pyl_analyzer, True)
                tempwriter.close()
            except PyLucene.JavaError, err_msg:
                raise OSError("Indexer: failed to open or create a Lucene" \
                        + " database (%s): %s" % (self.location, err_msg))
        # the indexer is initialized - now we prepare the searcher
        # create a lock for the database directory - to be used later
        lockname = os.path.join(tempfile.gettempdir(),
                re.sub("\W", "_", self.location))
        self.dir_lock = jToolkit.glock.GlobalLock(lockname)
        # windows file locking seems inconsistent, so we try 10 times
        numtries = 0
        self.dir_lock.acquire(blocking=True)
        # read "self.reader", "self.indexVersion" and "self.searcher"
        try:
            while numtries < 10:
                try:
                    self.reader = PyLucene.IndexReader.open(self.location)
                    # NOTE(review): this sets "self.indexVersion", but
                    # _index_refresh() compares against "self.index_version"
                    # (initialized to None above) - looks like a naming bug;
                    # confirm which attribute is intended
                    self.indexVersion = self.reader.getCurrentVersion(
                            self.location)
                    self.searcher = PyLucene.IndexSearcher(self.reader)
                    break
                except PyLucene.JavaError, e:
                    # store error message for possible later re-raise (below)
                    lock_error_msg = e
                    time.sleep(0.01)
                    numtries += 1
            else:
                # locking failed for 10 times
                raise OSError("Indexer: failed to lock index database" \
                        + " (%s)" % lock_error_msg)
        finally:
            self.dir_lock.release()
        # initialize the searcher and the reader
        self._index_refresh()

    def __del__(self):
        """remove lock and close writer after loosing the last reference"""
        self._writer_close()

    def flush(self, optimize=False):
        """flush the content of the database - to force changes to be written
        to disk

        some databases also support index optimization

        @param optimize: should the index be optimized if possible?
        @type optimize: bool
        """
        if self._writer_is_open():
            try:
                if optimize:
                    self.writer.optimize()
            finally:
                # close the database even if optimizing failed
                self._writer_close()
        # the reader/searcher needs an update, too
        self._index_refresh()

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: PyLucene.Query
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        # TODO: a deep copy or a clone would be safer
        # somehow not working (returns "null"): copy.deepcopy(query)
        return query

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: the analyzer to be used
            possible analyzers are:
                CommonDatabase.ANALYZER_TOKENIZE
                    the field value is splitted to be matched word-wise
                CommonDatabase.ANALYZER_PARTIAL
                    the field value must start with the query string
                CommonDatabase.ANALYZER_EXACT
                    keep special characters and the like
        @type analyzer: bool
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            # keep the term as-is (no tokenization, no escaping)
            analyzer_obj = PyLucene.KeywordAnalyzer()
        else:
            text = _escape_term_value(text)
            analyzer_obj = PyLucene.StandardAnalyzer()
        qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
        # NOTE(review): ">" binds tighter than "&" in Python, so this
        # evaluates as "analyzer & (self.ANALYZER_PARTIAL > 0)", i.e.
        # "analyzer & 1" - probably "(analyzer & ANALYZER_PARTIAL) > 0"
        # was intended; confirm against the flag values in CommonIndexer
        if (analyzer & self.ANALYZER_PARTIAL > 0):
            # PyLucene uses explicit wildcards for partial matching
            text += "*"
        if require_all:
            qp.setDefaultOperator(qp.Operator.AND)
        else:
            qp.setDefaultOperator(qp.Operator.OR)
        return qp.parse(text)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: the analyzer to be used
            possible analyzers are:
                CommonDatabase.ANALYZER_TOKENIZE
                    the field value is splitted to be matched word-wise
                CommonDatabase.ANALYZER_PARTIAL
                    the field value must start with the query string
                CommonDatabase.ANALYZER_EXACT
                    keep special characters and the like
        @type analyzer: bool
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            # keep the value as-is (no tokenization, no escaping)
            analyzer_obj = PyLucene.KeywordAnalyzer()
        else:
            value = _escape_term_value(value)
            analyzer_obj = PyLucene.StandardAnalyzer()
        qp = PyLucene.QueryParser(field, analyzer_obj)
        # NOTE(review): same precedence suspect as in
        # _create_query_for_string - this is "analyzer & 1"; confirm
        if (analyzer & self.ANALYZER_PARTIAL > 0):
            # PyLucene uses explicit wildcards for partial matching
            value += "*"
        return qp.parse(value)

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of PyLucene.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: PyLucene.Query
        """
        combined_query = PyLucene.BooleanQuery()
        for query in queries:
            # _occur maps (required, prohibited) to MUST / SHOULD / MUST_NOT
            combined_query.add(
                    PyLucene.BooleanClause(query, _occur(require_all, False)))
        return combined_query

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: PyLucene.Document
        """
        return PyLucene.Document()

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            token_flag = PyLucene.Field.Index.TOKENIZED
        else:
            token_flag = PyLucene.Field.Index.UN_TOKENIZED
        # unnamed terms are stored in the shared pseudo field
        document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term,
                PyLucene.Field.Store.YES, token_flag))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            token_flag = PyLucene.Field.Index.TOKENIZED
        else:
            token_flag = PyLucene.Field.Index.UN_TOKENIZED
        document.add(PyLucene.Field(str(field), term,
                PyLucene.Field.Store.YES, token_flag))

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: PyLucene.Document
        """
        # opens the writer (and takes the directory lock) if necessary
        self._writer_open()
        self.writer.addDocument(document)

    def begin_transaction(self):
        """PyLucene does not support transactions

        Thus this function just opens the database for write access.
        Call "cancel_transaction" or "commit_transaction" to close write
        access in order to remove the exclusive lock from the database
        directory.
        """
        self._writer_open()

    def cancel_transaction(self):
        """PyLucene does not support transactions

        Thus this function just closes the database write access and removes
        the exclusive lock.

        See 'start_transaction' for details.
        """
        # NOTE: pending writes are NOT rolled back - the writer is just closed
        self._writer_close()

    def commit_transaction(self):
        """PyLucene does not support transactions

        Thus this function just closes the database write access and removes
        the exclusive lock.

        See 'start_transaction' for details.
        """
        self._writer_close()
        self._index_refresh()

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        return PyLuceneHits(self.searcher.search(query))

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        """
        self.reader.deleteDocument(docid)
        # TODO: check the performance impact of calling "refresh" for each id
        self._index_refresh()

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        if isinstance(fieldnames, str):
            fieldnames = [fieldnames]
        hits = self.searcher.search(query)
        if _COMPILER == 'jcc':
            # add the ranking number and the retrieved document to the array
            hits = [(hit, hits.doc(hit)) for hit in range(hits.length())]
        # NOTE(review): for the gcj build the Hits object itself is iterated
        # and is assumed to yield (ranking, document) pairs - confirm
        result = []
        for hit, doc in hits:
            fields = {}
            for fieldname in fieldnames:
                # take care for the special field "None"
                if fieldname is None:
                    pyl_fieldname = UNNAMED_FIELD_NAME
                else:
                    pyl_fieldname = fieldname
                fields[fieldname] = doc.getValues(pyl_fieldname)
            result.append(fields)
        return result

    def _writer_open(self):
        """open write access for the indexing database and acquire an
        exclusive lock
        """
        if not self._writer_is_open():
            self.dir_lock.acquire()
            # the "False" flag opens the existing index instead of recreating
            self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer,
                    False)
            # "setMaxFieldLength" is available since PyLucene v2
            # we must stay compatible to v1 for the derived class
            # (PyLuceneIndexer1) - thus we make this step optional
            if hasattr(self.writer, "setMaxFieldLength"):
                self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
        # do nothing, if it is already open

    def _writer_close(self):
        """close indexing write access and remove the database lock"""
        if self._writer_is_open():
            self.writer.close()
            self.writer = None
            # make sure that the lock is removed
            self.dir_lock.forcerelease()

    def _writer_is_open(self):
        """check if the indexing write access is currently open"""
        return not self.writer is None

    def _index_refresh(self):
        """re-read the indexer database"""
        try:
            self.dir_lock.acquire(blocking=False)
        except jToolkit.glock.GlobalLockError, e:
            # if this fails the index is being rewritten, so we continue with
            # our old version
            return
        # NOTE(review): the release below is not in a "finally" block - an
        # unexpected (non-JavaError) exception would leak the directory lock
        try:
            if self.reader is None or self.searcher is None:
                # first use: open a fresh reader/searcher pair
                self.reader = PyLucene.IndexReader.open(self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
            elif self.index_version != self.reader.getCurrentVersion( \
                    self.location):
                # the on-disk index changed: reopen reader and searcher
                self.searcher.close()
                self.reader.close()
                self.reader = PyLucene.IndexReader.open(self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
                self.index_version = self.reader.getCurrentVersion(self.location)
        except PyLucene.JavaError, e:
            # TODO: add some debugging output?
            #self.errorhandler.logerror("Error attempting to read index - try reindexing: "+str(e))
            pass
        self.dir_lock.release()
class PyLuceneHits(CommonIndexer.CommonEnquire):
    """an enquire object contains the information about the result of a request
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a dictionary of
                ["rank", "percent", "document", "docid"]
        """
        available = self.enquire.length()
        # "stop" is the lowest index that is NOT returned; clip it to the
        # number of hits that actually exist
        stop = min(start + number, available)
        if stop <= start:
            # the requested range is empty or entirely out of bounds
            return (0, available, [])
        matches = [{
                "rank": position,
                "docid": self.enquire.id(position),
                "percent": self.enquire.score(position),
                "document": self.enquire.doc(position),
                } for position in range(start, stop)]
        return (stop - start, available, matches)
def _occur(required, prohibited):
    """map a (required, prohibited) flag pair to a Lucene "Occur" constant

    @param required: the clause must match (boolean AND semantics)
    @type required: bool
    @param prohibited: the clause may not match (boolean NOT semantics)
    @type prohibited: bool
    @return: the matching PyLucene.BooleanClause.Occur constant, or None
        for the invalid combination (both required and prohibited)
    @rtype: PyLucene.BooleanClause.Occur | None
    """
    # idiom fix: plain truthiness tests instead of "== True" / "== False";
    # callers pass real booleans, so the mapping is unchanged
    if required and not prohibited:
        return PyLucene.BooleanClause.Occur.MUST
    elif not required and not prohibited:
        return PyLucene.BooleanClause.Occur.SHOULD
    elif not required and prohibited:
        return PyLucene.BooleanClause.Occur.MUST_NOT
    else:
        # it is an error to specify a clause as both required and prohibited
        return None
def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    # map the leading version-string prefix to the major release number
    for prefix, major in (("1.", 1), ("2.", 2)):
        if PyLucene.VERSION.startswith(prefix):
            return major
    # neither a v1.x nor a v2.x release string
    return 0
532 def _escape_term_value(text):
533 return re.sub("\*", "", text)