search/indexer/PyLuceneIndexer.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # Copyright 2008 Zuza Software Foundation
   4 #
   5 # This file is part of translate.
   6 #
   7 # translate is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # translate is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with translate; if not, write to the Free Software
  19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20 #
  21
  22
  23 """
  24 interface for the PyLucene (v2.x) indexing engine
  25
  26 take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface
  27 """
  28
  29 __revision__ = "$Id$"
  30
  31 import CommonIndexer
  32 # TODO: replace this dependency on the jToolkit
  33 import jToolkit.glock
  34 import tempfile
  35 import re
  36 import os
  37 import time
  38
  39 # try to import the PyLucene package (with the two possible names)
  40 # remember the type of the detected package (compiled with jcc (>=v2.3) or
  41 # with gcj (<=v2.2)
  42 try:
  43         import PyLucene
  44         _COMPILER = 'gcj'
  45 except ImportError:
  46         # if this fails, then there is no pylucene installed
  47         import lucene
  48         PyLucene = lucene
  49         PyLucene.initVM(PyLucene.CLASSPATH)
  50         _COMPILER = 'jcc'
  51
  52
  53 UNNAMED_FIELD_NAME = "FieldWithoutAName"
  54 MAX_FIELD_SIZE = 1048576
  55
  56
  57 def is_available():
  58     return _get_pylucene_version() == 2
  59
  60
  61 class PyLuceneDatabase(CommonIndexer.CommonDatabase):
  62     """manage and use a pylucene indexing database"""
  63
  64     QUERY_TYPE = PyLucene.Query
  65     INDEX_DIRECTORY_NAME = "lucene"
  66
  67     def __init__(self, basedir, analyzer=None, create_allowed=True):
  68         """initialize or open an indexing database
  69
  70         Any derived class must override __init__.
  71
  72         The following exceptions can be raised:
  73             ValueError: the given location exists, but the database type
  74                 is incompatible (e.g. created by a different indexing engine)
  75             OSError: the database failed to initialize
  76
  77         @param basedir: the parent directory of the database
  78         @type basedir: str
  79         @param analyzer: bitwise combination of possible analyzer flags
  80             to be used as the default analyzer for this database. Leave it empty
  81             to use the system default analyzer (self.ANALYZER_DEFAULT).
  82             see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
  83         @type analyzer: int
  84         @param create_allowed: create the database, if necessary; default: True
  85         @type create_allowed: bool
  86         @throws: OSError, ValueError
  87         """
  88         super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
  89                 create_allowed=create_allowed)
  90         self.pyl_analyzer = PyLucene.StandardAnalyzer()
  91         self.writer = None
  92         self.reader = None
  93         self.index_version = None
  94         try:
  95             # try to open an existing database
  96             tempreader = PyLucene.IndexReader.open(self.location)
  97             tempreader.close()
  98         except PyLucene.JavaError, err_msg:
  99             # Write an error out, in case this is a real problem instead of an absence of an index
 100             # TODO: turn the following two lines into debug output
 101             #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
 102             #DEBUG_FOO("could not open index, so going to create: " + errorstr)
 103             # Create the index, so we can open cached readers on it
 104             if not create_allowed:
 105                 raise OSError("Indexer: skipping database creation")
 106             try:
 107                 # create the parent directory if it does not exist
 108                 parent_path = os.path.dirname(self.location)
 109                 if not os.path.isdir(parent_path):
 110                     # recursively create all directories up to parent_path
 111                     os.makedirs(parent_path)
 112             except IOError, err_msg:
 113                 raise OSError("Indexer: failed to create the parent " \
 114                         + "directory (%s) of the indexing database: %s" \
 115                         % (parent_path, err_msg))
 116             try:
 117                 tempwriter = PyLucene.IndexWriter(self.location,
 118                         self.pyl_analyzer, True)
 119                 tempwriter.close()
 120             except PyLucene.JavaError, err_msg:
 121                 raise OSError("Indexer: failed to open or create a Lucene" \
 122                         + " database (%s): %s" % (self.location, err_msg))
 123         # the indexer is initialized - now we prepare the searcher
 124         # create a lock for the database directory - to be used later
 125         lockname = os.path.join(tempfile.gettempdir(),
 126                 re.sub("\W", "_", self.location))
 127         self.dir_lock = jToolkit.glock.GlobalLock(lockname)
 128         # windows file locking seems inconsistent, so we try 10 times
 129         numtries = 0
 130         self.dir_lock.acquire(blocking=True)
 131         # read "self.reader", "self.indexVersion" and "self.searcher"
 132         try:
 133             while numtries < 10:
 134                 try:
 135                     self.reader = PyLucene.IndexReader.open(self.location)
 136                     self.indexVersion = self.reader.getCurrentVersion(
 137                             self.location)
 138                     self.searcher = PyLucene.IndexSearcher(self.reader)
 139                     break
 140                 except PyLucene.JavaError, e:
 141                     # store error message for possible later re-raise (below)
 142                     lock_error_msg = e
 143                     time.sleep(0.01)
 144                 numtries += 1
 145             else:
 146                 # locking failed for 10 times
 147                 raise OSError("Indexer: failed to lock index database" \
 148                         + " (%s)" % lock_error_msg)
 149         finally:
 150             self.dir_lock.release()
 151         # initialize the searcher and the reader
 152         self._index_refresh()
 153
 154     def __del__(self):
 155         """remove lock and close writer after loosing the last reference"""
 156         self._writer_close()
 157
 158     def flush(self, optimize=False):
 159         """flush the content of the database - to force changes to be written
 160         to disk
 161
 162         some databases also support index optimization
 163
 164         @param optimize: should the index be optimized if possible?
 165         @type optimize: bool
 166         """
 167         if self._writer_is_open():
 168             try:
 169                 if optimize:
 170                     self.writer.optimize()
 171             finally:
 172                 # close the database even if optimizing failed
 173                 self._writer_close()
 174         # the reader/searcher needs an update, too
 175         self._index_refresh()
 176
 177     def _create_query_for_query(self, query):
 178         """generate a query based on an existing query object
 179
 180         basically this function should just create a copy of the original
 181
 182         @param query: the original query object
 183         @type query: PyLucene.Query
 184         @return: resulting query object
 185         @rtype: PyLucene.Query
 186         """
 187         # TODO: a deep copy or a clone would be safer
 188         # somehow not working (returns "null"): copy.deepcopy(query)
 189         return query
 190
 191     def _create_query_for_string(self, text, require_all=True,
 192             analyzer=None):
 193         """generate a query for a plain term of a string query
 194
 195         basically this function parses the string and returns the resulting
 196         query
 197
 198         @param text: the query string
 199         @type text: str
 200         @param require_all: boolean operator
 201             (True -> AND (default) / False -> OR)
 202         @type require_all: bool
 203         @param analyzer: the analyzer to be used
 204             possible analyzers are:
 205                 CommonDatabase.ANALYZER_TOKENIZE
 206                     the field value is splitted to be matched word-wise
 207                 CommonDatabase.ANALYZER_PARTIAL
 208                     the field value must start with the query string
 209                 CommonDatabase.ANALYZER_EXACT
 210                     keep special characters and the like
 211         @type analyzer: bool
 212         @return: resulting query object
 213         @rtype: PyLucene.Query
 214         """
 215         if analyzer is None:
 216             analyzer = self.analyzer
 217         if analyzer == self.ANALYZER_EXACT:
 218             analyzer_obj = PyLucene.KeywordAnalyzer()
 219         else:
 220             text = _escape_term_value(text)
 221             analyzer_obj = PyLucene.StandardAnalyzer()
 222         qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
 223         if (analyzer & self.ANALYZER_PARTIAL > 0):
 224             # PyLucene uses explicit wildcards for partial matching
 225             text += "*"
 226         if require_all:
 227             qp.setDefaultOperator(qp.Operator.AND)
 228         else:
 229             qp.setDefaultOperator(qp.Operator.OR)
 230         return qp.parse(text)
 231
 232     def _create_query_for_field(self, field, value, analyzer=None):
 233         """generate a field query
 234
 235         this functions creates a field->value query
 236
 237         @param field: the fieldname to be used
 238         @type field: str
 239         @param value: the wanted value of the field
 240         @type value: str
 241         @param analyzer: the analyzer to be used
 242             possible analyzers are:
 243                 CommonDatabase.ANALYZER_TOKENIZE
 244                     the field value is splitted to be matched word-wise
 245                 CommonDatabase.ANALYZER_PARTIAL
 246                     the field value must start with the query string
 247                 CommonDatabase.ANALYZER_EXACT
 248                     keep special characters and the like
 249         @type analyzer: bool
 250         @return: resulting query object
 251         @rtype: PyLucene.Query
 252         """
 253         if analyzer is None:
 254             analyzer = self.analyzer
 255         if analyzer == self.ANALYZER_EXACT:
 256             analyzer_obj = PyLucene.KeywordAnalyzer()
 257         else:
 258             value = _escape_term_value(value)
 259             analyzer_obj = PyLucene.StandardAnalyzer()
 260         qp = PyLucene.QueryParser(field, analyzer_obj)
 261         if (analyzer & self.ANALYZER_PARTIAL > 0):
 262             # PyLucene uses explicit wildcards for partial matching
 263             value += "*"
 264         return qp.parse(value)
 265
 266     def _create_query_combined(self, queries, require_all=True):
 267         """generate a combined query
 268
 269         @param queries: list of the original queries
 270         @type queries: list of PyLucene.Query
 271         @param require_all: boolean operator
 272             (True -> AND (default) / False -> OR)
 273         @type require_all: bool
 274         @return: the resulting combined query object
 275         @rtype: PyLucene.Query
 276         """
 277         combined_query = PyLucene.BooleanQuery()
 278         for query in queries:
 279             combined_query.add(
 280                     PyLucene.BooleanClause(query, _occur(require_all, False)))
 281         return combined_query
 282
 283     def _create_empty_document(self):
 284         """create an empty document to be filled and added to the index later
 285
 286         @return: the new document object
 287         @rtype: PyLucene.Document
 288         """
 289         return PyLucene.Document()
 290
 291     def _add_plain_term(self, document, term, tokenize=True):
 292         """add a term to a document
 293
 294         @param document: the document to be changed
 295         @type document: PyLucene.Document
 296         @param term: a single term to be added
 297         @type term: str
 298         @param tokenize: should the term be tokenized automatically
 299         @type tokenize: bool
 300         """
 301         if tokenize:
 302             token_flag = PyLucene.Field.Index.TOKENIZED
 303         else:
 304             token_flag = PyLucene.Field.Index.UN_TOKENIZED
 305         document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term,
 306                 PyLucene.Field.Store.YES, token_flag))
 307
 308     def _add_field_term(self, document, field, term, tokenize=True):
 309         """add a field term to a document
 310
 311         @param document: the document to be changed
 312         @type document: PyLucene.Document
 313         @param field: name of the field
 314         @type field: str
 315         @param term: term to be associated to the field
 316         @type term: str
 317         @param tokenize: should the term be tokenized automatically
 318         @type tokenize: bool
 319         """
 320         if tokenize:
 321             token_flag = PyLucene.Field.Index.TOKENIZED
 322         else:
 323             token_flag = PyLucene.Field.Index.UN_TOKENIZED
 324         document.add(PyLucene.Field(str(field), term,
 325                 PyLucene.Field.Store.YES, token_flag))
 326
 327     def _add_document_to_index(self, document):
 328         """add a prepared document to the index database
 329
 330         @param document: the document to be added
 331         @type document: PyLucene.Document
 332         """
 333         self._writer_open()
 334         self.writer.addDocument(document)
 335
 336     def begin_transaction(self):
 337         """PyLucene does not support transactions
 338
 339         Thus this function just opens the database for write access.
 340         Call "cancel_transaction" or "commit_transaction" to close write
 341         access in order to remove the exclusive lock from the database
 342         directory.
 343         """
 344         self._writer_open()
 345
 346     def cancel_transaction(self):
 347         """PyLucene does not support transactions
 348
 349         Thus this function just closes the database write access and removes
 350         the exclusive lock.
 351
 352         See 'start_transaction' for details.
 353         """
 354         self._writer_close()
 355
 356     def commit_transaction(self):
 357         """PyLucene does not support transactions
 358
 359         Thus this function just closes the database write access and removes
 360         the exclusive lock.
 361
 362         See 'start_transaction' for details.
 363         """
 364         self._writer_close()
 365         self._index_refresh()
 366
 367     def get_query_result(self, query):
 368         """return an object containing the results of a query
 369
 370         @param query: a pre-compiled query
 371         @type query: a query object of the real implementation
 372         @return: an object that allows access to the results
 373         @rtype: subclass of CommonEnquire
 374         """
 375         return PyLuceneHits(self.searcher.search(query))
 376
 377     def delete_document_by_id(self, docid):
 378         """delete a specified document
 379
 380         @param docid: the document ID to be deleted
 381         @type docid: int
 382         """
 383         self.reader.deleteDocument(docid)
 384         # TODO: check the performance impact of calling "refresh" for each id
 385         self._index_refresh()
 386
 387     def search(self, query, fieldnames):
 388         """return a list of the contents of specified fields for all matches of
 389         a query
 390
 391         @param query: the query to be issued
 392         @type query: a query object of the real implementation
 393         @param fieldnames: the name(s) of a field of the document content
 394         @type fieldnames: string | list of strings
 395         @return: a list of dicts containing the specified field(s)
 396         @rtype: list of dicts
 397         """
 398         if isinstance(fieldnames, str):
 399             fieldnames = [fieldnames]
 400         hits = self.searcher.search(query)
 401         if _COMPILER == 'jcc':
 402             # add the ranking number and the retrieved document to the array
 403             hits = [(hit, hits.doc(hit)) for hit in range(hits.length())]
 404         result = []
 405         for hit, doc in hits:
 406             fields = {}
 407             for fieldname in fieldnames:
 408                 # take care for the special field "None"
 409                 if fieldname is None:
 410                     pyl_fieldname = UNNAMED_FIELD_NAME
 411                 else:
 412                     pyl_fieldname = fieldname
 413                 fields[fieldname] = doc.getValues(pyl_fieldname)
 414             result.append(fields)
 415         return result
 416
 417     def _writer_open(self):
 418         """open write access for the indexing database and acquire an
 419         exclusive lock
 420         """
 421         if not self._writer_is_open():
 422             self.dir_lock.acquire()
 423             self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer,
 424                     False)
 425             # "setMaxFieldLength" is available since PyLucene v2
 426             # we must stay compatible to v1 for the derived class
 427             # (PyLuceneIndexer1) - thus we make this step optional
 428             if hasattr(self.writer, "setMaxFieldLength"):
 429                 self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
 430         # do nothing, if it is already open
 431
 432     def _writer_close(self):
 433         """close indexing write access and remove the database lock"""
 434         if self._writer_is_open():
 435             self.writer.close()
 436             self.writer = None
 437         # make sure that the lock is removed
 438         self.dir_lock.forcerelease()
 439
 440     def _writer_is_open(self):
 441         """check if the indexing write access is currently open"""
 442         return not self.writer is None
 443
 444     def _index_refresh(self):
 445         """re-read the indexer database"""
 446         try:
 447             self.dir_lock.acquire(blocking=False)
 448         except jToolkit.glock.GlobalLockError, e:
 449             # if this fails the index is being rewritten, so we continue with
 450             # our old version
 451             return
 452         try:
 453             if self.reader is None or self.searcher is None:
 454                 self.reader = PyLucene.IndexReader.open(self.location)
 455                 self.searcher = PyLucene.IndexSearcher(self.reader)
 456             elif self.index_version != self.reader.getCurrentVersion( \
 457                     self.location):
 458                 self.searcher.close()
 459                 self.reader.close()
 460                 self.reader = PyLucene.IndexReader.open(self.location)
 461                 self.searcher = PyLucene.IndexSearcher(self.reader)
 462                 self.index_version = self.reader.getCurrentVersion(self.location)
 463         except PyLucene.JavaError,e:
 464             # TODO: add some debugging output?
 465             #self.errorhandler.logerror("Error attempting to read index - try reindexing: "+str(e))
 466             pass
 467         self.dir_lock.release()
 468
 469
 470
 471 class PyLuceneHits(CommonIndexer.CommonEnquire):
 472     """an enquire object contains the information about the result of a request
 473     """
 474
 475     def get_matches(self, start, number):
 476         """return a specified number of qualified matches of a previous query
 477
 478         @param start: index of the first match to return (starting from zero)
 479         @type start: int
 480         @param number: the number of matching entries to return
 481         @type number: int
 482         @return: a set of matching entries and some statistics
 483         @rtype: tuple of (returned number, available number, matches)
 484                 "matches" is a dictionary of
 485                     ["rank", "percent", "document", "docid"]
 486         """
 487         # check if requested results do not exist
 488         # stop is the lowest index number to be ommitted
 489         stop = start + number
 490         if stop > self.enquire.length():
 491             stop = self.enquire.length()
 492         # invalid request range
 493         if stop <= start:
 494             return (0, self.enquire.length(), [])
 495         result = []
 496         for index in range(start, stop):
 497             item = {}
 498             item["rank"] = index
 499             item["docid"] = self.enquire.id(index)
 500             item["percent"] = self.enquire.score(index)
 501             item["document"] = self.enquire.doc(index)
 502             result.append(item)
 503         return (stop-start, self.enquire.length(), result)
 504
 505 def _occur(required, prohibited):
 506        if required == True and prohibited == False:
 507            return PyLucene.BooleanClause.Occur.MUST
 508        elif required == False and prohibited == False:
 509            return PyLucene.BooleanClause.Occur.SHOULD
 510        elif required == False and prohibited == True:
 511            return PyLucene.BooleanClause.Occur.MUST_NOT
 512        else:
 513            # It is an error to specify a clause as both required
 514            # and prohibited
 515            return None
 516
 517 def _get_pylucene_version():
 518     """get the installed pylucene version
 519
 520     @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
 521     @rtype: int
 522     """
 523     version = PyLucene.VERSION
 524     if version.startswith("1."):
 525         return 1
 526     elif version.startswith("2."):
 527         return 2
 528     else:
 529         return 0
 530
 531
 532 def _escape_term_value(text):
 533     return re.sub("\*", "", text)
 534