# -*- coding: utf-8 -*-
#
# Copyright 2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#

"""base class for interfaces to indexing engines for pootle
"""
import os

import translate.lang.data
def is_available():
    """check if this indexing engine interface is usable

    this function must exist in every module that contains indexing engine
    interfaces

    @return: is this interface usable?
    @rtype: bool
    """
    # the base module does not provide a working engine itself; real engine
    # modules (e.g. xapian/pylucene wrappers) override this with a real check
    return False
class CommonDatabase(object):
    """base class for indexing support

    any real implementation must override most methods of this class
    """

    field_analyzers = {}
    """mapping of field names and analyzers - see 'set_field_analyzers'"""

    ANALYZER_EXACT = 0
    """exact matching: the query string must equal the whole term string"""

    ANALYZER_PARTIAL = 1 << 1
    """partial matching: a document matches, even if the query string only
    matches the beginning of the term value."""

    ANALYZER_TOKENIZE = 1 << 2
    """tokenize terms and queries automatically"""

    ANALYZER_DEFAULT = ANALYZER_TOKENIZE | ANALYZER_PARTIAL
    """the default analyzer to be used if nothing is configured"""

    QUERY_TYPE = None
    """override this with the query class of the implementation"""

    INDEX_DIRECTORY_NAME = None
    """override this with a string to be used as the name of the indexing
    directory/file in the filesystem
    """

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        The following exceptions can be raised:
            ValueError: the given location exists, but the database type
                is incompatible (e.g. created by a different indexing engine)
            OSError: the database failed to initialize

        Any implementation can rely on the "self.location" attribute to be set
        by the __init__ function of the super class.

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it empty
            to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        @throws: OSError, ValueError
        """
        # fail early if a derived class forgot to define its essentials
        if self.QUERY_TYPE is None:
            raise NotImplementedError("Incomplete indexer implementation: " \
                    + "'QUERY_TYPE' is undefined")
        if self.INDEX_DIRECTORY_NAME is None:
            raise NotImplementedError("Incomplete indexer implementation: " \
                    + "'INDEX_DIRECTORY_NAME' is undefined")
        self.location = os.path.join(basedir, self.INDEX_DIRECTORY_NAME)
        if (not create_allowed) and (not os.path.exists(self.location)):
            raise OSError("Indexer: the database does not exist - and I am" \
                    + " not configured to create it.")
        if analyzer is None:
            self.analyzer = self.ANALYZER_DEFAULT
        else:
            self.analyzer = analyzer
        # per-instance mapping shadows the (shared) class attribute
        self.field_analyzers = {}

    def flush(self, optimize=False):
        """flush the content of the database - to force changes to be written
        to disk

        some databases also support index optimization

        @param optimize: should the index be optimized if possible?
        @type optimize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'flush' is missing")

    def make_query(self, args, require_all=True, analyzer=None):
        """create simple queries (strings or field searches) or
        combine multiple queries (AND/OR)

        To specify rules for field searches, you may want to take a look at
        'set_field_analyzers'. The parameter 'match_text_partial' can override
        the previously defined default setting.

        @param args: queries or search string or description of field query
            examples::
                [xapian.Query("foo"), xapian.Query("bar")]
                {"foo": "bar", "foobar": "foo"}
        @type args: list of queries | single query | str | dict
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: boolean
        @param analyzer: (only applicable for 'dict' or 'str')
            Define query options (partial matching, exact matching, tokenizing,
            ...) as bitwise combinations of CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: the combined query
        @rtype: query type of the specific implementation
        """
        # turn a dict into a list if necessary
        if isinstance(args, dict):
            args = args.items()
        # turn 'args' into a list if necessary
        if not isinstance(args, list):
            args = [args]
        # combine all given queries
        result = []
        for query in args:
            # just add precompiled queries
            if isinstance(query, self.QUERY_TYPE):
                result.append(self._create_query_for_query(query))
            # create field/value queries out of a tuple
            elif isinstance(query, tuple):
                (field, value) = query
                # perform unicode normalization
                field = translate.lang.data.normalize(unicode(field))
                value = translate.lang.data.normalize(unicode(value))
                # check for the chosen match type; use a local name so the
                # fallback does not clobber 'analyzer' for later iterations
                if analyzer is None:
                    field_analyzer = self.get_field_analyzers(field)
                else:
                    field_analyzer = analyzer
                result.append(self._create_query_for_field(field, value,
                        analyzer=field_analyzer))
            # parse plaintext queries
            elif isinstance(query, str):
                if analyzer is None:
                    string_analyzer = self.analyzer
                else:
                    string_analyzer = analyzer
                # perform unicode normalization
                query = translate.lang.data.normalize(unicode(query))
                result.append(self._create_query_for_string(query,
                        require_all=require_all, analyzer=string_analyzer))
            else:
                # other types of queries are not supported
                raise ValueError("Unable to handle query type: %s" \
                        % str(type(query)))
        # return the combined query
        return self._create_query_combined(result, require_all)

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_query' is missing")

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_string' is missing")

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_field' is missing")

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_combined' is missing")

    def index_document(self, data):
        """add the given data to the database

        @param data: the data to be indexed.
            A dictionary will be treated as fieldname:value combinations.
            If the fieldname is None then the value will be interpreted as a
            plain term or as a list of plain terms.
            Lists of strings are treated as plain terms.
        @type data: dict | list of str
        """
        doc = self._create_empty_document()
        if isinstance(data, dict):
            data = data.items()
        # add all data one by one
        for dataset in data:
            if isinstance(dataset, tuple):
                # the dataset tuple consists of '(key, value)'
                (key, value) = dataset
                if key is None:
                    # no fieldname -> index as plain term(s)
                    if isinstance(value, list):
                        terms = value[:]
                    elif isinstance(value, str):
                        terms = [value]
                    else:
                        raise ValueError("Invalid data type to be indexed: %s" \
                                % str(type(data)))
                    for one_term in terms:
                        self._add_plain_term(doc, self._decode(one_term),
                                (self.ANALYZER_DEFAULT
                                        & self.ANALYZER_TOKENIZE > 0))
                else:
                    # tokenization depends on the analyzer mapped to the field
                    analyze_settings = self.get_field_analyzers(key)
                    self._add_field_term(doc, key, self._decode(value),
                            (analyze_settings & self.ANALYZER_TOKENIZE > 0))
            elif isinstance(dataset, str):
                self._add_plain_term(doc, self._decode(dataset),
                        (self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE > 0))
            else:
                raise ValueError("Invalid data type to be indexed: %s" \
                        % str(type(data)))
        self._add_document_to_index(doc)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_empty_document' is missing")

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_plain_term' is missing")

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_field_term' is missing")

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_document_to_index' is missing")

    def begin_transaction(self):
        """begin a transaction

        You can group multiple modifications of a database as a transaction.
        This prevents time-consuming database flushing and helps, if you want
        that a changeset is committed either completely or not at all.
        No changes will be written to disk until 'commit_transaction'.
        'cancel_transaction' can be used to revert an ongoing transaction.

        Database types that do not support transactions may silently ignore it.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'begin_transaction' is missing")

    def cancel_transaction(self):
        """cancel an ongoing transaction

        See 'start_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'cancel_transaction' is missing")

    def commit_transaction(self):
        """submit the currently ongoing transaction and write changes to disk

        See 'start_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'commit_transaction' is missing")

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'get_query_result' is missing")

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'delete_document_by_id' is missing")

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'search' is missing")

    def delete_doc(self, ident):
        """delete the documents returned by a query

        @param ident: [list of] document IDs | dict describing a query | query
        @type ident: int | list of tuples | dict | list of dicts |
            query (e.g. xapian.Query) | list of queries
        @return: the number of deleted documents
        @rtype: int
        """
        # turn a doc-ID into a list of doc-IDs
        if isinstance(ident, list):
            # it is already a list
            ident_list = ident
        else:
            ident_list = [ident]
        if len(ident_list) == 0:
            # no matching items
            return 0
        if isinstance(ident_list[0], int):
            # create a list of IDs of all successfully removed documents
            success_delete = [match for match in ident_list
                    if self.delete_document_by_id(match)]
            return len(success_delete)
        if isinstance(ident_list[0], dict):
            # something like: { "msgid": "foobar" }
            # assemble all queries
            query = self.make_query([self.make_query(query_dict,
                    require_all=True) for query_dict in ident_list],
                    require_all=False)
        elif isinstance(ident_list[0], object):
            # assume a query object (with 'AND')
            query = self.make_query(ident_list, require_all=True)
        else:
            # invalid element type in list (not necessarily caught in the
            # lines above)
            raise TypeError("description of documents to-be-deleted is not " \
                    + "supported: list of %s" % type(ident_list[0]))
        # we successfully created a query - now iterate through the result
        # no documents deleted so far ...
        remove_list = []
        # delete all resulting documents step by step
        def add_docid_to_list(match):
            """collect every document ID"""
            remove_list.append(match["docid"])
        self._walk_matches(query, add_docid_to_list)
        # recurse with the collected plain document IDs
        return self.delete_doc(remove_list)

    def _walk_matches(self, query, function, arg_for_function=None):
        """use this function if you want to do something with every single match
        of a query

        example: self._walk_matches(query, function_for_match, arg_for_func)
            'function_for_match' expects only one argument: the matched object

        @param query: a query object of the real implementation
        @type query: xapian.Query | PyLucene.Query
        @param function: the function to execute with every match
        @type function: function
        @param arg_for_function: an optional argument for the function
        @type arg_for_function: anything
        """
        # retrieve the result of the query
        enquire = self.get_query_result(query)
        # start with the first element
        start = 0
        # do the loop at least once
        size, avail = (0, 1)
        # how many results per 'get_matches'?
        steps = 2
        while start < avail:
            (size, avail, matches) = enquire.get_matches(start, steps)
            for match in matches:
                if arg_for_function is None:
                    function(match)
                else:
                    function(match, arg_for_function)
            start += size

    def set_field_analyzers(self, field_analyzers):
        """set the analyzers for different fields of the database documents

        All bitwise combinations of CommonIndexer.ANALYZER_??? are possible.

        @param field_analyzers: mapping of field names and analyzers
        @type field_analyzers: dict containing field names and analyzers
        @throws: TypeError for invalid values in 'field_analyzers'
        """
        for field, analyzer in field_analyzers.items():
            # check for invalid input types
            if not isinstance(field, (str, unicode)):
                raise TypeError("field name must be a string")
            if not isinstance(analyzer, int):
                raise TypeError("the analyzer must be a whole number (int)")
            # map the analyzer to the field name
            self.field_analyzers[field] = analyzer

    def get_field_analyzers(self, fieldnames=None):
        """return the analyzer that was mapped to a specific field

        see 'set_field_analyzers' for details

        @param fieldnames: the analyzer of this field (or all/multiple fields)
            is requested; leave empty (or "None") to request all fields
        @type fieldnames: str | list of str | None
        @return: the analyzer setting of the field - see
            CommonDatabase.ANALYZER_??? or a dict of field names and analyzers
        @rtype: int | dict
        """
        # all field analyzers are requested
        if fieldnames is None:
            # return a copy to protect the internal mapping
            return dict(self.field_analyzers)
        # one field is requested
        if isinstance(fieldnames, (str, unicode)):
            if fieldnames in self.field_analyzers:
                return self.field_analyzers[fieldnames]
            else:
                return self.analyzer
        # a list of fields is requested
        if isinstance(fieldnames, list):
            result = {}
            for field in fieldnames:
                result[field] = self.get_field_analyzers(field)
            return result
        # invalid request -> return the default analyzer
        return self.analyzer

    def _decode(self, text):
        """decode the string from utf-8 or charmap
        perform unicode normalization

        @param text: the text to be decoded
        @type text: str | unicode
        @return: the decoded and normalized text
        @rtype: unicode
        """
        if isinstance(text, str):
            try:
                result = unicode(text.decode("UTF-8"))
            # '.decode' raises UnicodeDecodeError on failure (the original
            # caught UnicodeEncodeError, so the fallback below was unreachable)
            except UnicodeDecodeError:
                result = unicode(text.decode("charmap"))
        elif not isinstance(text, unicode):
            result = unicode(text)
        else:
            result = text
        # perform unicode normalization
        return translate.lang.data.normalize(result)
class CommonEnquire(object):
    """an enquire object contains the information about the result of a request
    """

    def __init__(self, enquire):
        """initialization of a wrapper around enquires of different backends

        @param enquire: a previous enquire
        @type enquire: xapian.Enquire | pylucene-enquire
        """
        # store the backend-specific enquire for the accessor methods
        self.enquire = enquire

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a dictionary of::
                ["rank", "percent", "document", "docid"]
        """
        raise NotImplementedError("Incomplete indexing implementation: " \
                + "'get_matches' for the 'Enquire' class is missing")

    def get_matches_count(self):
        """return the estimated number of matches

        use "CommonIndexer.search" to retrieve the exact number of matches
        @return: the estimated number of matches
        @rtype: int
        """
        # the backend reports the estimate as the second tuple element
        (returned, estimate_count, matches) = self.get_matches(0, 1)
        return estimate_count