for git v1.5.2 (and below): chdir to the directory of the target file before executin...
[translate_toolkit.git] / search / indexer / XapianIndexer.py
blobf197a1e0b500b096f671ca430ef9de7ab31f636c
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright 2008 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 """
24 interface to the xapian indexing engine for the translate toolkit
26 Xapian v1.0 or higher is supported.
28 If you are interested in writing an interface for Xapian 0.x, then
29 you should checkout the following:
30 svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/
31 It is not completely working, but it should give you a good start.
32 """
34 __revision__ = "$Id$"
37 import CommonIndexer
38 import xapian
39 import os
40 import re
def is_available():
    """report whether a usable version of xapian is installed

    @return: True if the major version reported by xapian is above zero
    @rtype: bool
    """
    major_version = xapian.major_version()
    return major_version > 0
# Xapian restricts the length of term strings, see
# http://osdir.com/ml/search.xapian.general/2006-11/msg00210.html
# That thread describes a maximum of roughly 240 - we stay far below that
# limit, since we do not need longer terms anyway.
_MAX_TERM_LENGTH = 128
class XapianDatabase(CommonIndexer.CommonDatabase):
    """interface to the xapian (http://xapian.org) indexer

    The constructor opens the database as writable. 'flush' drops that
    handle and reopens the database read-only; every modifying method asks
    '_prepare_database' for a writable handle again before it changes
    anything.
    """

    QUERY_TYPE = xapian.Query
    INDEX_DIRECTORY_NAME = "xapian"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open a xapian database

        The following exceptions can be raised:
            ValueError: the given location exists, but the database type
                is incompatible (e.g. created by a different indexing engine)
            OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        @throws: OSError, ValueError
        """
        # call the __init__ function of our parent
        # (self.location is not set in this class - presumably the parent
        # constructor derives it from basedir)
        super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
                create_allowed=create_allowed)
        if os.path.exists(self.location):
            # try to open an existing database
            try:
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise ValueError("Indexer: failed to open xapian database "
                        "(%s) - maybe it is not a xapian database: %s"
                        % (self.location, err_msg))
        else:
            # create a new database
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            try:
                # an empty parent path refers to the current directory, which
                # always exists - os.makedirs("") would fail
                if parent_path and not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except (IOError, OSError) as err_msg:
                # os.makedirs signals failures via OSError (IOError is only
                # kept for compatibility with the previous handler)
                raise OSError("Indexer: failed to create the parent "
                        "directory (%s) of the indexing database: %s"
                        % (parent_path, err_msg))
            try:
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_CREATE_OR_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise OSError("Indexer: failed to open or create a xapian "
                        "database (%s): %s" % (self.location, err_msg))

    def flush(self, optimize=False):
        """force to write the current changes to disk immediately

        Afterwards the database is reopened as read-only.

        @param optimize: ignored for xapian
        @type optimize: bool
        """
        # write changes to disk (only if database is read-write)
        if isinstance(self.database, xapian.WritableDatabase):
            self.database.flush()
        # free the database to remove locks - this is a xapian-specific issue
        self.database = None
        # reopen it as read-only
        self._prepare_database()

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query
        """
        # create a copy of the original query
        return xapian.Query(query)

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the analyzer of the database
            (self.analyzer) is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if require_all:
            qp.set_default_op(xapian.Query.OP_AND)
        else:
            qp.set_default_op(xapian.Query.OP_OR)
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # partial matching
            return qp.parse_query(text, xapian.QueryParser.FLAG_PARTIAL)
        if analyzer == self.ANALYZER_EXACT:
            # exact matching - the text is used as a single term without
            # parsing. Non-tokenized terms are truncated when they are added
            # to a document (see '_add_plain_term'), thus the query term must
            # be truncated in the same way to be able to match.
            return xapian.Query(_truncate_term_length(text))
        # everything else (not partial and not exact)
        return qp.parse_query(text, 0)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the analyzer of the database
            (self.analyzer) is used.
        @type analyzer: int
        @return: the resulting query object
        @rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        # the upper case field name is used as the term prefix
        prefix = field.upper()
        if analyzer == self.ANALYZER_EXACT:
            # exact matching -> keep special characters
            # The term is truncated exactly like '_add_field_term' does it
            # for non-tokenized terms - otherwise long values never match.
            return xapian.Query(_truncate_term_length("%s%s" % \
                    (prefix, value)))
        # other queries need a parser object
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # partial matching
            match_flags = xapian.QueryParser.FLAG_PARTIAL
        else:
            # everything else (not partial and not exact)
            match_flags = 0
        return qp.parse_query(value, match_flags, prefix)

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query
        """
        if require_all:
            query_op = xapian.Query.OP_AND
        else:
            query_op = xapian.Query.OP_OR
        return xapian.Query(query_op, queries)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document
        """
        return xapian.Document()

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term)
        else:
            # the term is stored as it is - just cut down to the allowed size
            document.add_term(_truncate_term_length(term))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            # the upper case field name is passed as the term prefix
            term_gen.index_text(term, 1, field.upper())
        else:
            # prefix and term together have to fit into the allowed size
            document.add_term(_truncate_term_length("%s%s" % \
                    (field.upper(), term)))

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document
        """
        # open the database for writing
        self._prepare_database(writable=True)
        self.database.add_document(document)

    def begin_transaction(self):
        """begin a transaction

        Xapian supports transactions to group multiple database modifications.
        This avoids intermediate flushing and therefore increases performance.
        """
        self._prepare_database(writable=True)
        self.database.begin_transaction()

    def cancel_transaction(self):
        """cancel an ongoing transaction

        no changes since the last execution of 'begin_transaction' are written
        """
        self._prepare_database(writable=True)
        self.database.cancel_transaction()

    def commit_transaction(self):
        """submit the changes of an ongoing transaction

        all changes since the last execution of 'begin_transaction' are written
        """
        self._prepare_database(writable=True)
        self.database.commit_transaction()

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled xapian query
        @type query: xapian.Query
        @return: an object that allows access to the results
        @rtype: XapianEnquire
        """
        enquire = xapian.Enquire(self.database)
        enquire.set_query(query)
        return XapianEnquire(enquire)

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        @return: True if the document was deleted; False if xapian reported
            that there is no document with this ID
        @rtype: bool
        """
        # open the database for writing
        self._prepare_database(writable=True)
        try:
            self.database.delete_document(docid)
            return True
        except xapian.DocNotFoundError:
            return False

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: xapian.Query
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        result = []
        # a single field name is wrapped into a list. Both string types are
        # checked: a unicode name (python 2) would otherwise be iterated
        # character by character.
        if isinstance(fieldnames, (str, type(u""))):
            fieldnames = [fieldnames]
        # '_extract_fieldvalues' appends one dict per match to 'result'
        self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
        return result

    def _prepare_database(self, writable=False):
        """reopen the database as read-only or as writable if necessary

        this fixes a xapian specific issue regarding open locks for
        writable databases

        The current database object is kept, if it is already an instance of
        the requested class.
        NOTE(review): if xapian.WritableDatabase is a subclass of
        xapian.Database (to be confirmed for the bindings in use), then a
        writable database is never replaced by a read-only one here - 'flush'
        discards the database object before it calls this function.

        @param writable: True for opening a writable database
        @type writable: bool
        """
        if writable:
            if not isinstance(self.database, xapian.WritableDatabase):
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_OPEN)
        elif not isinstance(self.database, xapian.Database):
            self.database = xapian.Database(self.location)
class XapianEnquire(CommonIndexer.CommonEnquire):
    """interface to the xapian object for storing sets of matches
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a list of dictionaries with the keys
            ["rank", "percent", "document", "docid"]
        """
        matches = self.enquire.get_mset(start, number)
        # The properties of the match items are used instead of the legacy
        # sequence interface (match[xapian.MSET_RANK], ...).
        result = [{
                "rank": match.rank,
                "docid": match.docid,
                "percent": match.percent,
                "document": match.document,
            } for match in matches]
        return (matches.size(), matches.get_matches_estimated(), result)
def _truncate_term_length(term, taken=0):
    """truncate the length of a term string to the maximum allowed
    for xapian terms

    Terms that fit into the limit are returned unchanged.

    @param term: the value of the term, that should be truncated
    @type term: str
    @param taken: since a term consists of the name of the term and its
        actual value, this additional parameter can be used to reduce the
        maximum count of possible characters
    @type taken: int
    @return: the truncated string; its length never exceeds
        (_MAX_TERM_LENGTH - taken). The result is empty, if 'taken' already
        uses up the whole limit.
    @rtype: str
    """
    # Never use a negative limit: a negative slice end counts from the end of
    # the string, thus it would keep most of the term instead of nothing.
    limit = max(_MAX_TERM_LENGTH - taken, 0)
    # slicing keeps exactly 'limit' characters (shorter terms stay unchanged)
    return term[:limit]
def _extract_fieldvalues(match, result_and_fieldnames):
    """add a dict of field values to a list

    usually this function should be used together with '_walk_matches'
    for traversing a list of matches

    @param match: a single match object; its "document" item must offer a
        'termlist' method
    @type match: dict
    @param result_and_fieldnames: a tuple of (result, fieldnames):
        "result" is the list that the resulting dict will be added to;
        "fieldnames" are the names of the fields to be added to the dict.
        The field name None collects all terms that do not start with an
        upper case letter (terms without a field prefix).
    @type result_and_fieldnames: tuple of (list of dict, list of str)
    """
    # unpack inside of the function: tuple parameters in the signature are
    # not supported by python 3
    result, fieldnames = result_and_fieldnames
    # Prepare one compiled pattern per field name before walking the terms.
    # Each entry consists of (field name, pattern, length of the prefix).
    matchers = []
    for fname in fieldnames:
        if fname is None:
            # terms without a prefix: the whole term is the value
            matchers.append((fname, re.compile("[^A-Z]"), 0))
        else:
            # the upper case field name is followed by the value, which must
            # not start with an upper case letter
            prefix = re.escape(str(fname).upper())
            matchers.append((fname, re.compile(prefix + "[^A-Z]"),
                    len(fname)))
    # prepare empty dict
    item_fields = {}
    # fill the dict
    for term in match["document"].termlist():
        for fname, pattern, prefix_length in matchers:
            if pattern.match(term.term):
                # we found a matching field/term
                item_fields.setdefault(fname, []).append(
                        term.term[prefix_length:])
    result.append(item_fields)