search/indexer/PyLuceneIndexer1.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # Copyright 2008 Zuza Software Foundation
   4 #
   5 # This file is part of translate.
   6 #
   7 # translate is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # translate is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with translate; if not, write to the Free Software
  19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20 #
  21
  22
  23 """
  24 interface for the pylucene (v1.x) indexing engine
  25
  26 take a look at PyLuceneIndexer.py for PyLucene v2.x support
  27 """
  28
  29 __revision__ = "$Id$"
  30
  31 # this module is based on PyLuceneIndexer (for PyLucene v2.x)
  32 import PyLuceneIndexer
  33 import PyLucene
  34
  35
  36 def is_available():
  37     return PyLuceneIndexer._get_pylucene_version() == 1
  38
  39
  40 class PyLuceneDatabase(PyLuceneIndexer.PyLuceneDatabase):
  41     """manage and use a pylucene indexing database"""
  42
  43     def _create_query_for_string(self, text, require_all=True,
  44                 analyzer=None):
  45         """generate a query for a plain term of a string query
  46
  47         basically this function parses the string and returns the resulting
  48         query
  49
  50         @param text: the query string
  51         @type text: str
  52         @param require_all: boolean operator
  53             (True -> AND (default) / False -> OR)
  54         @type require_all: bool
  55         @param analyzer: the analyzer to be used
  56             possible analyzers are:
  57                 CommonDatabase.ANALYZER_TOKENIZE
  58                     the field value is splitted to be matched word-wise
  59                 CommonDatabase.ANALYZER_PARTIAL
  60                     the field value must start with the query string
  61                 CommonDatabase.ANALYZER_EXACT
  62                     keep special characters and the like
  63         @type analyzer: bool
  64         @return: resulting query object
  65         @rtype: PyLucene.Query
  66         """
  67         if analyzer is None:
  68             analyzer = self.analyzer
  69         if analyzer == self.ANALYZER_EXACT:
  70             # exact matching - no substitution ...
  71             # for PyLucene: nothing special is necessary
  72             pass
  73         # don't care about special characters ...
  74         if analyzer == self.ANALYZER_EXACT:
  75             analyzer_obj = self.ExactAnalyzer()
  76         else:
  77             text = _escape_term_value(text)
  78             analyzer_obj = PyLucene.StandardAnalyzer()
  79         qp = PyLucene.QueryParser(analyzer=analyzer_obj)
  80         if require_all:
  81             qp.setDefaultOperator(qp.Operator.AND)
  82         else:
  83             qp.setDefaultOperator(qp.Operator.OR)
  84         if (analyzer & self.ANALYZER_PARTIAL) > 0:
  85             # PyLucene uses explicit wildcards for partial matching
  86             text += "*"
  87         return qp.parse(text)
  88
  89     def _create_query_for_field(self, field, value, analyzer=None):
  90         """generate a field query
  91
  92         this functions creates a field->value query
  93
  94         @param field: the fieldname to be used
  95         @type field: str
  96         @param value: the wanted value of the field
  97         @type value: str
  98         @param analyzer: the analyzer to be used
  99             possible analyzers are:
 100                 CommonDatabase.ANALYZER_TOKENIZE
 101                     the field value is splitted to be matched word-wise
 102                 CommonDatabase.ANALYZER_PARTIAL
 103                     the field value must start with the query string
 104                 CommonDatabase.ANALYZER_EXACT
 105                     keep special characters and the like
 106         @type analyzer: bool
 107         @return: resulting query object
 108         @rtype: PyLucene.Query
 109         """
 110         if analyzer is None:
 111             analyzer = self.analyzer
 112         if analyzer == self.ANALYZER_EXACT:
 113             analyzer_obj = self.ExactAnalyzer()
 114         else:
 115             value = _escape_term_value(value)
 116             analyzer_obj = PyLucene.StandardAnalyzer()
 117         if (analyzer & self.ANALYZER_PARTIAL) > 0:
 118             # PyLucene uses explicit wildcards for partial matching
 119             value += "*"
 120         return PyLucene.QueryParser.parse(value, field, analyzer_obj)
 121
 122     def _create_query_combined(self, queries, require_all=True):
 123         """generate a combined query
 124
 125         @param queries: list of the original queries
 126         @type queries: list of xapian.Query
 127         @param require_all: boolean operator
 128             (True -> AND (default) / False -> OR)
 129         @type require_all: bool
 130         @return: the resulting combined query object
 131         @rtype: PyLucene.Query
 132         """
 133         combined_query = PyLucene.BooleanQuery()
 134         for query in queries:
 135             combined_query.add(
 136                     PyLucene.BooleanClause(query, require_all, False))
 137         return combined_query
 138
 139     def _add_plain_term(self, document, term, tokenize=True):
 140         """add a term to a document
 141
 142         @param document: the document to be changed
 143         @type document: xapian.Document | PyLucene.Document
 144         @param term: a single term to be added
 145         @type term: str
 146         @param tokenize: should the term be tokenized automatically
 147         @type tokenize: bool
 148         """
 149         # Field parameters: name, string, store, index, token
 150         document.add(PyLucene.Field(str(PyLuceneIndex.UNNAMED_FIELD_NAME), term,
 151                 True, True, tokenize))
 152
 153     def _add_field_term(self, document, field, term, tokenize=True):
 154         """add a field term to a document
 155
 156         @param document: the document to be changed
 157         @type document: xapian.Document | PyLucene.Document
 158         @param field: name of the field
 159         @type field: str
 160         @param term: term to be associated to the field
 161         @type term: str
 162         @param tokenize: should the term be tokenized automatically
 163         @type tokenize: bool
 164         """
 165         # TODO: decoding (utf-8) is missing
 166         # Field parameters: name, string, store, index, token
 167         document.add(PyLucene.Field(str(field), term,
 168                 True, True, tokenize))
 169
 170     def get_query_result(self, query):
 171         """return an object containing the results of a query
 172
 173         @param query: a pre-compiled query
 174         @type query: a query object of the real implementation
 175         @return: an object that allows access to the results
 176         @rtype: subclass of CommonEnquire
 177         """
 178         return PyLucene.indexSearcher.search(query)
 179
 180     def search(self, query, fieldnames):
 181         """return a list of the contents of specified fields for all matches of
 182         a query
 183
 184         @param query: the query to be issued
 185         @type query: a query object of the real implementation
 186         @param fieldnames: the name(s) of a field of the document content
 187         @type fieldnames: string | list of strings
 188         @return: a list of dicts containing the specified field(s)
 189         @rtype: list of dicts
 190         """
 191         if isinstance(fieldnames, str):
 192             fieldnames = [fieldnames]
 193         hits = PyLucene.indexSearcher.search(query)
 194         result = []
 195         for hit, doc in hits:
 196             fields = {}
 197             for fieldname in fieldnames:
 198                 content = doc.get(fieldname)
 199                 if not content is None:
 200                     fields[fieldname] = content
 201             result.append(fields)
 202         return result
 203
 204     def _writer_open(self):
 205         """open write access for the indexing database and acquire an
 206         exclusive lock
 207         """
 208         super(PyLuceneIndexer1, self)._writer_open_()
 209         self.writer.maxFieldLength = PyLuceneIndexer.MAX_FIELD_SIZE
 210