1 # -*- coding: utf-8 -*-
3 # Copyright 2008 Zuza Software Foundation
5 # This file is part of translate.
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 interface for the PyLucene (v2.x) indexing engine
26 take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface
32 # TODO: replace this dependency on the jToolkit
39 # try to import the PyLucene package (with the two possible names)
40 # remember the type of the detected package (compiled with jcc (>=v2.3) or
46 # if this fails, then there is no pylucene installed
# Start the Java VM for PyLucene with the package's own classpath.
# NOTE(review): the surrounding try/except that detects the PyLucene
# package (and the jcc/compiler detection) is missing from this view —
# confirm against the full file.
PyLucene.initVM(PyLucene.CLASSPATH)

# field name used for terms that were added without an explicit field
UNNAMED_FIELD_NAME = "FieldWithoutAName"
# maximum number of characters indexed per field (used via
# IndexWriter.setMaxFieldLength in _writer_open)
MAX_FIELD_SIZE = 1048576
58 return _get_pylucene_version() == 2
class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """manage and use a pylucene indexing database"""

    # type of the query objects accepted/returned by this backend
    QUERY_TYPE = PyLucene.Query
    # name of the subdirectory that stores the lucene index
    INDEX_DIRECTORY_NAME = "lucene"
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """initialize or open an indexing database

    Any derived class must override __init__.

    The following exceptions can be raised:
        ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        OSError: the database failed to initialize

    @param basedir: the parent directory of the database
    @param analyzer: bitwise combination of possible analyzer flags
        to be used as the default analyzer for this database. Leave it empty
        to use the system default analyzer (self.ANALYZER_DEFAULT).
        see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
    @param create_allowed: create the database, if necessary; default: True
    @type create_allowed: bool
    @throws: OSError, ValueError
    """
    super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
            create_allowed=create_allowed)
    self.pyl_analyzer = PyLucene.StandardAnalyzer()
    self.index_version = None
    # try to open an existing database
    # NOTE(review): the enclosing "try:" line is missing from this view
    tempreader = PyLucene.IndexReader.open(self.location)
    except PyLucene.JavaError, err_msg:
        # Write an error out, in case this is a real problem instead of an absence of an index
        # TODO: turn the following two lines into debug output
        #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
        #DEBUG_FOO("could not open index, so going to create: " + errorstr)
        # Create the index, so we can open cached readers on it
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        # create the parent directory if it does not exist
        # NOTE(review): a "try:" line is missing from this view
        parent_path = os.path.dirname(self.location)
        if not os.path.isdir(parent_path):
            # recursively create all directories up to parent_path
            os.makedirs(parent_path)
        except IOError, err_msg:
            raise OSError("Indexer: failed to create the parent " \
                    + "directory (%s) of the indexing database: %s" \
                    % (parent_path, err_msg))
        # NOTE(review): a "try:" line is missing from this view
        tempwriter = PyLucene.IndexWriter(self.location,
                self.pyl_analyzer, True)
        except PyLucene.JavaError, err_msg:
            raise OSError("Indexer: failed to open or create a Lucene" \
                    + " database (%s): %s" % (self.location, err_msg))
    # the indexer is initialized - now we prepare the searcher
    # create a lock for the database directory - to be used later
    lockname = os.path.join(tempfile.gettempdir(),
            re.sub("\W", "_", self.location))
    self.dir_lock = jToolkit.glock.GlobalLock(lockname)
    # windows file locking seems inconsistent, so we try 10 times
    # NOTE(review): the retry-loop / "try:" lines are missing from this
    # view — confirm against the full file
    self.dir_lock.acquire(blocking=True)
    # read "self.reader", "self.indexVersion" and "self.searcher"
    self.reader = PyLucene.IndexReader.open(self.location)
    # NOTE(review): the continuation line closing this call (presumably
    # "self.location)") is missing from this view
    self.indexVersion = self.reader.getCurrentVersion(
    self.searcher = PyLucene.IndexSearcher(self.reader)
    except PyLucene.JavaError, e:
        # store error message for possible later re-raise (below)
    # locking failed for 10 times
    raise OSError("Indexer: failed to lock index database" \
            + " (%s)" % lock_error_msg)
    self.dir_lock.release()
    # initialize the searcher and the reader
    self._index_refresh()
155 """remove lock and close writer after loosing the last reference"""
def flush(self, optimize=False):
    """flush the content of the database - to force changes to be written

    some databases also support index optimization

    @param optimize: should the index be optimized if possible?
    @type optimize: bool
    """
    if self._writer_is_open():
        # NOTE(review): lines are missing here (presumably "try:" and
        # "if optimize:") — confirm against the full file
        self.writer.optimize()
    # close the database even if optimizing failed
    # NOTE(review): the close call itself (presumably
    # "self._writer_close()") is missing from this view
    # the reader/searcher needs an update, too
    self._index_refresh()
def _create_query_for_query(self, query):
    """generate a query based on an existing query object

    basically this function should just create a copy of the original

    @param query: the original query object
    @type query: PyLucene.Query
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    # TODO: a deep copy or a clone would be safer
    # somehow not working (returns "null"): copy.deepcopy(query)
    # NOTE(review): the return statement (presumably "return query")
    # is missing from this view — confirm against the full file
# NOTE(review): the signature continuation line (presumably
# "analyzer=None):") is missing from this view
def _create_query_for_string(self, text, require_all=True,
    """generate a query for a plain term of a string query

    basically this function parses the string and returns the resulting
    query object

    @param text: the query string
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @param analyzer: the analyzer to be used
        possible analyzers are:
            CommonDatabase.ANALYZER_TOKENIZE
                the field value is splitted to be matched word-wise
            CommonDatabase.ANALYZER_PARTIAL
                the field value must start with the query string
            CommonDatabase.ANALYZER_EXACT
                keep special characters and the like
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    # NOTE(review): an "if analyzer is None:"-style guard appears to be
    # missing before this default assignment
    analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        analyzer_obj = PyLucene.KeywordAnalyzer()
    # NOTE(review): an "else:" branch line is missing here — the next
    # two statements belong to the non-exact case
    text = _escape_term_value(text)
    analyzer_obj = PyLucene.StandardAnalyzer()
    qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
    if (analyzer & self.ANALYZER_PARTIAL > 0):
        # PyLucene uses explicit wildcards for partial matching
        # NOTE(review): the wildcard-appending statement is missing here
    # NOTE(review): "if require_all:" / "else:" lines are missing — the
    # two setDefaultOperator calls below are the two branches
    qp.setDefaultOperator(qp.Operator.AND)
    qp.setDefaultOperator(qp.Operator.OR)
    return qp.parse(text)
def _create_query_for_field(self, field, value, analyzer=None):
    """generate a field query

    this functions creates a field->value query

    @param field: the fieldname to be used
    @param value: the wanted value of the field
    @param analyzer: the analyzer to be used
        possible analyzers are:
            CommonDatabase.ANALYZER_TOKENIZE
                the field value is splitted to be matched word-wise
            CommonDatabase.ANALYZER_PARTIAL
                the field value must start with the query string
            CommonDatabase.ANALYZER_EXACT
                keep special characters and the like
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    # NOTE(review): an "if analyzer is None:"-style guard appears to be
    # missing before this default assignment
    analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        analyzer_obj = PyLucene.KeywordAnalyzer()
    # NOTE(review): an "else:" branch line is missing here — the next
    # two statements belong to the non-exact case
    value = _escape_term_value(value)
    analyzer_obj = PyLucene.StandardAnalyzer()
    qp = PyLucene.QueryParser(field, analyzer_obj)
    if (analyzer & self.ANALYZER_PARTIAL > 0):
        # PyLucene uses explicit wildcards for partial matching
        # NOTE(review): the wildcard-appending statement is missing here
    return qp.parse(value)
def _create_query_combined(self, queries, require_all=True):
    """generate a combined query

    @param queries: list of the original queries
    @type queries: list of PyLucene.Query
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @return: the resulting combined query object
    @rtype: PyLucene.Query
    """
    combined_query = PyLucene.BooleanQuery()
    for query in queries:
        # NOTE(review): the opening of this call (presumably
        # "combined_query.add(") is missing from this view; only the
        # argument/continuation remains below
        PyLucene.BooleanClause(query, _occur(require_all, False)))
    return combined_query
def _create_empty_document(self):
    """Create a blank document, ready to be filled with terms.

    The caller populates the document (via _add_plain_term /
    _add_field_term) and finally hands it to _add_document_to_index.

    @return: the new document object
    @rtype: PyLucene.Document
    """
    empty_doc = PyLucene.Document()
    return empty_doc
def _add_plain_term(self, document, term, tokenize=True):
    """add a term to a document

    @param document: the document to be changed
    @type document: PyLucene.Document
    @param term: a single term to be added
    @param tokenize: should the term be tokenized automatically
    @type tokenize: bool
    """
    # NOTE(review): the "if tokenize:" / "else:" lines are missing from
    # this view; the two assignments below are the two branches
    token_flag = PyLucene.Field.Index.TOKENIZED
    token_flag = PyLucene.Field.Index.UN_TOKENIZED
    # store the term under the catch-all unnamed field
    document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term,
            PyLucene.Field.Store.YES, token_flag))
def _add_field_term(self, document, field, term, tokenize=True):
    """add a field term to a document

    @param document: the document to be changed
    @type document: PyLucene.Document
    @param field: name of the field
    @param term: term to be associated to the field
    @param tokenize: should the term be tokenized automatically
    @type tokenize: bool
    """
    # NOTE(review): the "if tokenize:" / "else:" lines are missing from
    # this view; the two assignments below are the two branches
    token_flag = PyLucene.Field.Index.TOKENIZED
    token_flag = PyLucene.Field.Index.UN_TOKENIZED
    document.add(PyLucene.Field(str(field), term,
            PyLucene.Field.Store.YES, token_flag))
def _add_document_to_index(self, document):
    """add a prepared document to the index database

    @param document: the document to be added
    @type document: PyLucene.Document
    """
    # NOTE(review): a preceding statement (presumably
    # "self._writer_open()") is missing from this view
    self.writer.addDocument(document)
def begin_transaction(self):
    """PyLucene does not support transactions

    Thus this function just opens the database for write access.
    Call "cancel_transaction" or "commit_transaction" to close write
    access in order to remove the exclusive lock from the database
    """
    # NOTE(review): the method body is missing from this view
    # (presumably "self._writer_open()") — confirm against the full file
def cancel_transaction(self):
    """PyLucene does not support transactions

    Thus this function just closes the database write access and removes
    the exclusive lock (see _writer_close).

    See 'start_transaction' for details.
    """
    # NOTE(review): the method body is missing from this view
    # (presumably "self._writer_close()") — confirm against the full file
def commit_transaction(self):
    """PyLucene does not support transactions

    Thus this function just closes the database write access and removes
    the exclusive lock (see _writer_close).

    See 'start_transaction' for details.
    """
    # NOTE(review): a preceding statement (presumably
    # "self._writer_close()") is missing from this view
    self._index_refresh()
def get_query_result(self, query):
    """Execute a pre-compiled query and wrap the hits for later access.

    @param query: a pre-compiled query
    @type query: a query object of the real implementation
    @return: an object that allows access to the results
    @rtype: subclass of CommonEnquire
    """
    raw_hits = self.searcher.search(query)
    return PyLuceneHits(raw_hits)
def delete_document_by_id(self, docid):
    """Remove one document from the index, addressed by its ID.

    @param docid: the document ID to be deleted
    """
    self.reader.deleteDocument(docid)
    # refresh immediately so the removal is visible to later searches
    # TODO: check the performance impact of calling "refresh" for each id
    self._index_refresh()
def search(self, query, fieldnames):
    """return a list of the contents of specified fields for all matches of
    a query

    @param query: the query to be issued
    @type query: a query object of the real implementation
    @param fieldnames: the name(s) of a field of the document content
    @type fieldnames: string | list of strings
    @return: a list of dicts containing the specified field(s)
    @rtype: list of dicts
    """
    if isinstance(fieldnames, str):
        fieldnames = [fieldnames]
    hits = self.searcher.search(query)
    if _COMPILER == 'jcc':
        # add the ranking number and the retrieved document to the array
        hits = [(hit, hits.doc(hit)) for hit in range(hits.length())]
    # NOTE(review): the initialization of "result" (presumably
    # "result = []") is missing from this view
    for hit, doc in hits:
        # NOTE(review): the initialization of "fields" (presumably
        # "fields = {}") is missing from this view
        for fieldname in fieldnames:
            # take care for the special field "None"
            if fieldname is None:
                pyl_fieldname = UNNAMED_FIELD_NAME
            # NOTE(review): an "else:" line is missing here
            pyl_fieldname = fieldname
            fields[fieldname] = doc.getValues(pyl_fieldname)
        result.append(fields)
    # NOTE(review): the "return result" statement is missing from this
    # view — confirm against the full file
def _writer_open(self):
    """open write access for the indexing database and acquire an
    exclusive lock on the database directory
    """
    if not self._writer_is_open():
        self.dir_lock.acquire()
        # NOTE(review): the continuation line closing this constructor
        # call (presumably the final boolean argument plus ")") is
        # missing from this view
        self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer,
        # "setMaxFieldLength" is available since PyLucene v2
        # we must stay compatible to v1 for the derived class
        # (PyLuceneIndexer1) - thus we make this step optional
        if hasattr(self.writer, "setMaxFieldLength"):
            self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
    # do nothing, if it is already open
def _writer_close(self):
    """close indexing write access and remove the database lock"""
    if self._writer_is_open():
        # NOTE(review): statements are missing here (presumably closing
        # self.writer and resetting it to None) — confirm against the
        # full file
    # make sure that the lock is removed
    self.dir_lock.forcerelease()
def _writer_is_open(self):
    """check if the indexing write access is currently open

    @return: True when self.writer currently holds a writer object
        (set by _writer_open), False otherwise
    @rtype: bool
    """
    # "x is not None" is the idiomatic (PEP 8) form of the original
    # "not x is None"; behavior is identical
    return self.writer is not None
def _index_refresh(self):
    """re-read the indexer database"""
    # NOTE(review): an enclosing "try:" line is missing from this view
    self.dir_lock.acquire(blocking=False)
    except jToolkit.glock.GlobalLockError, e:
        # if this fails the index is being rewritten, so we continue with
        # our old version
        # NOTE(review): statements are missing here (presumably a
        # "return") — confirm against the full file
    # NOTE(review): another "try:" line is missing from this view
    if self.reader is None or self.searcher is None:
        self.reader = PyLucene.IndexReader.open(self.location)
        self.searcher = PyLucene.IndexSearcher(self.reader)
    # NOTE(review): the continuation closing this condition and the
    # statements closing the old reader are missing from this view
    elif self.index_version != self.reader.getCurrentVersion(
        self.searcher.close()
        self.reader = PyLucene.IndexReader.open(self.location)
        self.searcher = PyLucene.IndexSearcher(self.reader)
        self.index_version = self.reader.getCurrentVersion(self.location)
    except PyLucene.JavaError, e:
        # TODO: add some debugging output?
        #self.errorhandler.logerror("Error attempting to read index - try reindexing: "+str(e))
    self.dir_lock.release()
class PyLuceneHits(CommonIndexer.CommonEnquire):
    """an enquire object contains the information about the result of a request
    """
def get_matches(self, start, number):
    """return a specified number of qualified matches of a previous query

    @param start: index of the first match to return (starting from zero)
    @type start: int
    @param number: the number of matching entries to return
    @type number: int
    @return: a set of matching entries and some statistics
    @rtype: tuple of (returned number, available number, matches)
        "matches" is a dictionary of
        ["rank", "percent", "document", "docid"]
    """
    # check if requested results do not exist
    # stop is the lowest index number to be ommitted
    stop = start + number
    if stop > self.enquire.length():
        stop = self.enquire.length()
    # invalid request range
    # NOTE(review): the guard condition line (presumably
    # "if stop <= start:") is missing from this view — the return below
    # is its body
    return (0, self.enquire.length(), [])
    # NOTE(review): the initialization of "result" (presumably
    # "result = []") is missing from this view
    for index in range(start, stop):
        # NOTE(review): statements creating "item" and setting
        # item["rank"] appear to be missing here
        item["docid"] = self.enquire.id(index)
        item["percent"] = self.enquire.score(index)
        item["document"] = self.enquire.doc(index)
        # NOTE(review): a "result.append(item)" statement appears to be
        # missing here
    return (stop-start, self.enquire.length(), result)
def _occur(required, prohibited):
    # Map the (required, prohibited) flag pair to the matching Lucene
    # BooleanClause.Occur constant.
    if required == True and prohibited == False:
        return PyLucene.BooleanClause.Occur.MUST
    elif required == False and prohibited == False:
        return PyLucene.BooleanClause.Occur.SHOULD
    elif required == False and prohibited == True:
        return PyLucene.BooleanClause.Occur.MUST_NOT
    # It is an error to specify a clause as both required
    # NOTE(review): the final branch handling that invalid combination
    # is missing from this view — confirm against the full file
def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    version = PyLucene.VERSION
    if version.startswith("1."):
        # NOTE(review): the return statement for this branch is missing
        # from this view
    elif version.startswith("2."):
        # NOTE(review): the return statements for this branch and the
        # unknown-version fallback are missing from this view
532 def _escape_term_value(text
):
533 return re
.sub("\*", "", text
)