search/indexer/XapianIndexer.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2008 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21 #
  22
  23 """
  24 interface to the xapian indexing engine for the translate toolkit
  25
  26 Xapian v1.0 or higher is supported.
  27
  28 If you are interested in writing an interface for Xapian 0.x, then
  29 you should checkout the following:
  30     svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/
  31 It is not completely working, but it should give you a good start.
  32 """
  33
  34 __revision__ = "$Id$"
  35
  36
  37 import CommonIndexer
  38 import xapian
  39 import os
  40 import re
  41
  42
  43 def is_available():
  44     return xapian.major_version() > 0
  45
  46
# Xapian restricts the length of term strings, see
# http://osdir.com/ml/search.xapian.general/2006-11/msg00210.html
# A maximum length of around 240 is described there - but we need less anyway.
# This limit is applied by _truncate_term_length.
_MAX_TERM_LENGTH = 128
  51
  52
  53 class XapianDatabase(CommonIndexer.CommonDatabase):
  54     """interface to the xapian (http://xapian.org) indexer
  55     """
  56
  57     QUERY_TYPE = xapian.Query
  58     INDEX_DIRECTORY_NAME = "xapian"
  59
  60     def __init__(self, basedir, analyzer=None, create_allowed=True):
  61         """initialize or open a xapian database
  62
  63         The following exceptions can be raised:
  64             ValueError: the given location exists, but the database type
  65                 is incompatible (e.g. created by a different indexing engine)
  66             OSError: the database failed to initialize
  67
  68         @param basedir: the parent directory of the database
  69         @type basedir: str
  70         @param analyzer: bitwise combination of possible analyzer flags
  71             to be used as the default analyzer for this database. Leave it empty
  72             to use the system default analyzer (self.ANALYZER_DEFAULT).
  73             see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
  74         @type analyzer: int
  75         @param create_allowed: create the database, if necessary; default: True
  76         @type create_allowed: bool
  77         @throws: OSError, ValueError
  78         """
  79         # call the __init__ function of our parent
  80         super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
  81                 create_allowed=create_allowed)
  82         if os.path.exists(self.location):
  83             # try to open an existing database
  84             try:
  85                 self.database = xapian.WritableDatabase(self.location,
  86                     xapian.DB_OPEN)
  87             except xapian.DatabaseOpeningError, err_msg:
  88                 raise ValueError("Indexer: failed to open xapian database " \
  89                         + "(%s) - maybe it is not a xapian database: %s" \
  90                         % (self.location, err_msg))
  91         else:
  92             # create a new database
  93             if not create_allowed:
  94                 raise OSError("Indexer: skipping database creation")
  95             try:
  96                 # create the parent directory if it does not exist
  97                 parent_path = os.path.dirname(self.location)
  98                 if not os.path.isdir(parent_path):
  99                     # recursively create all directories up to parent_path
 100                     os.makedirs(parent_path)
 101             except IOError, err_msg:
 102                 raise OSError("Indexer: failed to create the parent " \
 103                         + "directory (%s) of the indexing database: %s" \
 104                         % (parent_path, err_msg))
 105             try:
 106                 self.database = xapian.WritableDatabase(self.location,
 107                         xapian.DB_CREATE_OR_OPEN)
 108             except xapian.DatabaseOpeningError, err_msg:
 109                 raise OSError("Indexer: failed to open or create a xapian " \
 110                         + "database (%s): %s" % (self.location, err_msg))
 111
 112     def flush(self, optimize=False):
 113         """force to write the current changes to disk immediately
 114
 115         @param optimize: ignored for xapian
 116         @type optimize: bool
 117         """
 118         # write changes to disk (only if database is read-write)
 119         if (isinstance(self.database, xapian.WritableDatabase)):
 120             self.database.flush()
 121         # free the database to remove locks - this is a xapian-specific issue
 122         self.database = None
 123         # reopen it as read-only
 124         self._prepare_database()
 125
 126     def _create_query_for_query(self, query):
 127         """generate a query based on an existing query object
 128
 129         basically this function should just create a copy of the original
 130
 131         @param query: the original query object
 132         @type query: xapian.Query
 133         @return: the resulting query object
 134         @rtype: xapian.Query
 135         """
 136         # create a copy of the original query
 137         return xapian.Query(query)
 138
 139     def _create_query_for_string(self, text, require_all=True,
 140             analyzer=None):
 141         """generate a query for a plain term of a string query
 142
 143         basically this function parses the string and returns the resulting
 144         query
 145
 146         @param text: the query string
 147         @type text: str
 148         @param require_all: boolean operator
 149             (True -> AND (default) / False -> OR)
 150         @type require_all: bool
 151         @param analyzer: Define query options (partial matching, exact matching,
 152             tokenizing, ...) as bitwise combinations of
 153             CommonIndexer.ANALYZER_???.
 154             This can override previously defined field analyzer settings.
 155             If analyzer is None (default), then the configured analyzer for the
 156             field is used.
 157         @type analyzer: int
 158         @return: resulting query object
 159         @rtype: xapian.Query
 160         """
 161         qp = xapian.QueryParser()
 162         qp.set_database(self.database)
 163         if require_all:
 164             qp.set_default_op(xapian.Query.OP_AND)
 165         else:
 166             qp.set_default_op(xapian.Query.OP_OR)
 167         if analyzer is None:
 168             analyzer = self.analyzer
 169         if analyzer & self.ANALYZER_PARTIAL > 0:
 170             match_flags = xapian.QueryParser.FLAG_PARTIAL
 171             return qp.parse_query(text, match_flags)
 172         elif analyzer == self.ANALYZER_EXACT:
 173             # exact matching -
 174             return xapian.Query(text)
 175         else:
 176             # everything else (not partial and not exact)
 177             match_flags = 0
 178             return qp.parse_query(text, match_flags)
 179
 180     def _create_query_for_field(self, field, value, analyzer=None):
 181         """generate a field query
 182
 183         this functions creates a field->value query
 184
 185         @param field: the fieldname to be used
 186         @type field: str
 187         @param value: the wanted value of the field
 188         @type value: str
 189         @param analyzer: Define query options (partial matching, exact matching,
 190             tokenizing, ...) as bitwise combinations of
 191             CommonIndexer.ANALYZER_???.
 192             This can override previously defined field analyzer settings.
 193             If analyzer is None (default), then the configured analyzer for the
 194             field is used.
 195         @type analyzer: int
 196         @return: the resulting query object
 197         @rtype: xapian.Query
 198         """
 199         if analyzer is None:
 200             analyzer = self.analyzer
 201         if analyzer == self.ANALYZER_EXACT:
 202             # exact matching -> keep special characters
 203             return xapian.Query("%s%s" % (field.upper(), value))
 204         # other queries need a parser object
 205         qp = xapian.QueryParser()
 206         qp.set_database(self.database)
 207         if (analyzer & self.ANALYZER_PARTIAL > 0):
 208             # partial matching
 209             match_flags = xapian.QueryParser.FLAG_PARTIAL
 210             return qp.parse_query(value, match_flags, field.upper())
 211         else:
 212             # everything else (not partial and not exact)
 213             match_flags = 0
 214             return qp.parse_query(value, match_flags, field.upper())
 215
 216     def _create_query_combined(self, queries, require_all=True):
 217         """generate a combined query
 218
 219         @param queries: list of the original queries
 220         @type queries: list of xapian.Query
 221         @param require_all: boolean operator
 222             (True -> AND (default) / False -> OR)
 223         @type require_all: bool
 224         @return: the resulting combined query object
 225         @rtype: xapian.Query
 226         """
 227         if require_all:
 228             query_op = xapian.Query.OP_AND
 229         else:
 230             query_op = xapian.Query.OP_OR
 231         return xapian.Query(query_op, queries)
 232
 233     def _create_empty_document(self):
 234         """create an empty document to be filled and added to the index later
 235
 236         @return: the new document object
 237         @rtype: xapian.Document
 238         """
 239         return xapian.Document()
 240
 241     def _add_plain_term(self, document, term, tokenize=True):
 242         """add a term to a document
 243
 244         @param document: the document to be changed
 245         @type document: xapian.Document
 246         @param term: a single term to be added
 247         @type term: str
 248         @param tokenize: should the term be tokenized automatically
 249         @type tokenize: bool
 250         """
 251         if tokenize:
 252             term_gen = xapian.TermGenerator()
 253             term_gen.set_document(document)
 254             term_gen.index_text(term)
 255         else:
 256             document.add_term(_truncate_term_length(term))
 257
 258     def _add_field_term(self, document, field, term, tokenize=True):
 259         """add a field term to a document
 260
 261         @param document: the document to be changed
 262         @type document: xapian.Document
 263         @param field: name of the field
 264         @type field: str
 265         @param term: term to be associated to the field
 266         @type term: str
 267         @param tokenize: should the term be tokenized automatically
 268         @type tokenize: bool
 269         """
 270         if tokenize:
 271             term_gen = xapian.TermGenerator()
 272             term_gen.set_document(document)
 273             term_gen.index_text(term, 1, field.upper())
 274         else:
 275             document.add_term(_truncate_term_length("%s%s" % \
 276                         (field.upper(), term)))
 277
 278     def _add_document_to_index(self, document):
 279         """add a prepared document to the index database
 280
 281         @param document: the document to be added
 282         @type document: xapian.Document
 283         """
 284         # open the database for writing
 285         self._prepare_database(writable=True)
 286         self.database.add_document(document)
 287
 288     def begin_transaction(self):
 289         """begin a transaction
 290
 291         Xapian supports transactions to group multiple database modifications.
 292         This avoids intermediate flushing and therefore increases performance.
 293         """
 294         self._prepare_database(writable=True)
 295         self.database.begin_transaction()
 296
 297     def cancel_transaction(self):
 298         """cancel an ongoing transaction
 299
 300         no changes since the last execution of 'begin_transcation' are written
 301         """
 302         self._prepare_database(writable=True)
 303         self.database.cancel_transaction()
 304
 305     def commit_transaction(self):
 306         """submit the changes of an ongoing transaction
 307
 308         all changes since the last execution of 'begin_transaction' are written
 309         """
 310         self._prepare_database(writable=True)
 311         self.database.commit_transaction()
 312
 313     def get_query_result(self, query):
 314         """return an object containing the results of a query
 315
 316         @param query: a pre-compiled xapian query
 317         @type query: xapian.Query
 318         @return: an object that allows access to the results
 319         @rtype: XapianIndexer.CommonEnquire
 320         """
 321         enquire = xapian.Enquire(self.database)
 322         enquire.set_query(query)
 323         return XapianEnquire(enquire)
 324
 325     def delete_document_by_id(self, docid):
 326         """delete a specified document
 327
 328         @param docid: the document ID to be deleted
 329         @type docid: int
 330         """
 331         # open the database for writing
 332         self._prepare_database(writable=True)
 333         try:
 334             self.database.delete_document(docid)
 335             return True
 336         except xapian.DocNotFoundError:
 337             return False
 338
 339     def search(self, query, fieldnames):
 340         """return a list of the contents of specified fields for all matches of
 341         a query
 342
 343         @param query: the query to be issued
 344         @type query: xapian.Query
 345         @param fieldnames: the name(s) of a field of the document content
 346         @type fieldnames: string | list of strings
 347         @return: a list of dicts containing the specified field(s)
 348         @rtype: list of dicts
 349         """
 350         result = []
 351         if isinstance(fieldnames, str):
 352             fieldnames = [fieldnames]
 353         self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
 354         return result
 355
 356     def _prepare_database(self, writable=False):
 357         """reopen the database as read-only or as writable if necessary
 358
 359         this fixes a xapian specific issue regarding open locks for
 360         writable databases
 361
 362         @param writable: True for opening a writable database
 363         @type writable: bool
 364         """
 365         if writable and (not isinstance(self.database,
 366                 xapian.WritableDatabase)):
 367             self.database = xapian.WritableDatabase(self.location,
 368                     xapian.DB_OPEN)
 369         elif not writable and (not isinstance(self.database, xapian.Database)):
 370             self.database = xapian.Database(self.location)
 371
 372
 373 class XapianEnquire(CommonIndexer.CommonEnquire):
 374     """interface to the xapian object for storing sets of matches
 375     """
 376
 377     def get_matches(self, start, number):
 378         """return a specified number of qualified matches of a previous query
 379
 380         @param start: index of the first match to return (starting from zero)
 381         @type start: int
 382         @param number: the number of matching entries to return
 383         @type number: int
 384         @return: a set of matching entries and some statistics
 385         @rtype: tuple of (returned number, available number, matches)
 386                 "matches" is a dictionary of
 387                     ["rank", "percent", "document", "docid"]
 388         """
 389         matches = self.enquire.get_mset(start, number)
 390         result = []
 391         for match in matches:
 392             elem = {}
 393             elem["rank"] = match[xapian.MSET_RANK]
 394             elem["docid"] = match[xapian.MSET_DID]
 395             elem["percent"] = match[xapian.MSET_PERCENT]
 396             elem["document"] = match[xapian.MSET_DOCUMENT]
 397             result.append(elem)
 398         return (matches.size(), matches.get_matches_estimated(), result)
 399
 400
 401 def _truncate_term_length(term, taken=0):
 402     """truncate the length of a term string length to the maximum allowed
 403     for xapian terms
 404
 405     @param term: the value of the term, that should be truncated
 406     @type term: str
 407     @param taken: since a term consists of the name of the term and its
 408         actual value, this additional parameter can be used to reduce the
 409         maximum count of possible characters
 410     @type taken: int
 411     @return: the truncated string
 412     @rtype: str
 413     """
 414     if len(term) > _MAX_TERM_LENGTH - taken:
 415         return term[0:_MAX_TERM_LENGTH - taken - 1]
 416     else:
 417         return term
 418
 419 def _extract_fieldvalues(match, (result, fieldnames)):
 420     """add a dict of field values to a list
 421
 422     usually this function should be used together with '_walk_matches'
 423     for traversing a list of matches
 424     @param match: a single match object
 425     @type match: xapian.MSet
 426     @param result: the resulting dict will be added to this list
 427     @type result: list of dict
 428     @param fieldnames: the names of the fields to be added to the dict
 429     @type result: list of str
 430     """
 431     # prepare empty dict
 432     item_fields = {}
 433     # fill the dict
 434     for term in match["document"].termlist():
 435         for fname in fieldnames:
 436             if ((fname is None) and re.match("[^A-Z]", term.term)):
 437                 value = term.term
 438             elif re.match("%s[^A-Z]" % str(fname).upper(), term.term):
 439                 value = term.term[len(fname):]
 440             else:
 441                 continue
 442             # we found a matching field/term
 443             if item_fields.has_key(fname):
 444                 item_fields[fname].append(value)
 445             else:
 446                 item_fields[fname] = [value]
 447     result.append(item_fields)
 448