for git v1.5.2 (and below): chdir to the directory of the target file before executin...
[translate_toolkit.git] / search / indexer / PyLuceneIndexer1.py
bloba250b4c9744ebea411c945d9f5bbd0795f42ccc4
1 # -*- coding: utf-8 -*-
3 # Copyright 2008 Zuza Software Foundation
4 #
5 # This file is part of translate.
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 """
24 interface for the pylucene (v1.x) indexing engine
26 take a look at PyLuceneIndexer.py for PyLucene v2.x support
27 """
29 __revision__ = "$Id$"
31 # this module is based on PyLuceneIndexer (for PyLucene v2.x)
32 import PyLuceneIndexer
33 import PyLucene
def is_available():
    """check whether this PyLucene v1.x interface can be used

    @return: True if the version reported by
        PyLuceneIndexer._get_pylucene_version() equals 1
    @rtype: bool
    """
    detected_version = PyLuceneIndexer._get_pylucene_version()
    return detected_version == 1
class PyLuceneDatabase(PyLuceneIndexer.PyLuceneDatabase):
    """manage and use a pylucene indexing database"""

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: the analyzer to be used (defaults to self.analyzer)
            possible analyzers are:
              - CommonDatabase.ANALYZER_TOKENIZE
                    the field value is split to be matched word-wise
              - CommonDatabase.ANALYZER_PARTIAL
                    the field value must start with the query string
              - CommonDatabase.ANALYZER_EXACT
                    keep special characters and the like
        @type analyzer: int (ANALYZER_* flag value)
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            # exact matching: the text is passed to the parser untouched
            analyzer_obj = self.ExactAnalyzer()
        else:
            # The escape helper is not defined in this module; only the
            # PyLuceneIndexer module itself is imported here.
            # NOTE(review): assumes PyLuceneIndexer provides
            # _escape_term_value - confirm
            text = PyLuceneIndexer._escape_term_value(text)
            analyzer_obj = PyLucene.StandardAnalyzer()
        qp = PyLucene.QueryParser(analyzer=analyzer_obj)
        if require_all:
            qp.setDefaultOperator(qp.Operator.AND)
        else:
            qp.setDefaultOperator(qp.Operator.OR)
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # PyLucene uses explicit wildcards for partial matching
            text += "*"
        return qp.parse(text)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this function creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: the analyzer to be used (defaults to self.analyzer)
            possible analyzers are:
              - CommonDatabase.ANALYZER_TOKENIZE
                    the field value is split to be matched word-wise
              - CommonDatabase.ANALYZER_PARTIAL
                    the field value must start with the query string
              - CommonDatabase.ANALYZER_EXACT
                    keep special characters and the like
        @type analyzer: int (ANALYZER_* flag value)
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            analyzer_obj = self.ExactAnalyzer()
        else:
            # The escape helper is not defined in this module; only the
            # PyLuceneIndexer module itself is imported here.
            # NOTE(review): assumes PyLuceneIndexer provides
            # _escape_term_value - confirm
            value = PyLuceneIndexer._escape_term_value(value)
            analyzer_obj = PyLucene.StandardAnalyzer()
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # PyLucene uses explicit wildcards for partial matching
            value += "*"
        return PyLucene.QueryParser.parse(value, field, analyzer_obj)

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of PyLucene.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: PyLucene.Query
        """
        combined_query = PyLucene.BooleanQuery()
        for query in queries:
            # every sub-query is added with require_all as its second
            # clause argument and False as its third
            combined_query.add(
                    PyLucene.BooleanClause(query, require_all, False))
        return combined_query

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        the term is stored in the field named by
        PyLuceneIndexer.UNNAMED_FIELD_NAME

        @param document: the document to be changed
        @type document: PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        # Field parameters: name, string, store, index, token
        document.add(PyLucene.Field(
                str(PyLuceneIndexer.UNNAMED_FIELD_NAME), term,
                True, True, tokenize))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        # TODO: decoding (utf-8) is missing
        # Field parameters: name, string, store, index, token
        document.add(PyLucene.Field(str(field), term,
                True, True, tokenize))

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        # NOTE(review): "PyLucene.indexSearcher" looks suspicious (the
        # Lucene class is spelled "IndexSearcher" and needs an instance);
        # the searcher that should be used is not visible here - confirm
        return PyLucene.indexSearcher.search(query)

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches
        of a query

        fields that are missing in a matching document are left out of the
        dict of that match

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        if isinstance(fieldnames, str):
            fieldnames = [fieldnames]
        # NOTE(review): "PyLucene.indexSearcher" looks suspicious (the
        # Lucene class is spelled "IndexSearcher" and needs an instance);
        # the searcher that should be used is not visible here - confirm
        hits = PyLucene.indexSearcher.search(query)
        result = []
        for hit, doc in hits:
            fields = {}
            for fieldname in fieldnames:
                content = doc.get(fieldname)
                if content is not None:
                    fields[fieldname] = content
            result.append(fields)
        return result

    def _writer_open(self):
        """open write access for the indexing database and acquire an
        exclusive lock

        afterwards the maximum field length of the writer is set to
        PyLuceneIndexer.MAX_FIELD_SIZE
        """
        # "PyLuceneIndexer1" is the name of this module and not a class in
        # scope; super() needs the class defined here.
        # NOTE(review): assumes the parent method is named "_writer_open"
        # (without a trailing underscore) like this override - confirm
        super(PyLuceneDatabase, self)._writer_open()
        self.writer.maxFieldLength = PyLuceneIndexer.MAX_FIELD_SIZE