# -*- coding: utf-8 -*-
#
# Copyright 2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""
interface to the xapian indexing engine for the translate toolkit

Xapian v1.0 or higher is supported.

If you are interested in writing an interface for Xapian 0.x, then
you should check out the following:
    svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/
It is not completely working, but it should give you a good start.
"""
44 return xapian
.major_version() > 0
# Xapian restricts the length of term strings, see
# http://osdir.com/ml/search.xapian.general/2006-11/msg00210.html
# A maximum of around 240 is described there; this module deliberately
# stays well below that limit.
_MAX_TERM_LENGTH = 128
class XapianDatabase(CommonIndexer.CommonDatabase):
    """interface to the xapian (http://xapian.org) indexer

    The xapian handle is kept in C{self.database}. It is switched between a
    read-only C{xapian.Database} and a C{xapian.WritableDatabase} on demand
    by L{_prepare_database}, because an open writable handle holds a lock.
    """

    QUERY_TYPE = xapian.Query
    INDEX_DIRECTORY_NAME = "xapian"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open a xapian database

        The following exceptions can be raised:
            ValueError: the given location exists, but the database type
                is incompatible (e.g. created by a different indexing engine)
            OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary;
            default: True
        @type create_allowed: bool
        @throws: OSError, ValueError
        """
        # call the __init__ function of our parent
        # (self.location is read below, so the parent is expected to set it)
        super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
                create_allowed=create_allowed)
        if os.path.exists(self.location):
            # try to open an existing database
            try:
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise ValueError("Indexer: failed to open xapian database "
                        "(%s) - maybe it is not a xapian database: %s"
                        % (self.location, err_msg))
        else:
            # create a new database
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            try:
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except (IOError, OSError) as err_msg:
                # os.makedirs reports failures as OSError; IOError is kept
                # for backward compatibility with the previous handler
                raise OSError("Indexer: failed to create the parent "
                        "directory (%s) of the indexing database: %s"
                        % (parent_path, err_msg))
            try:
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_CREATE_OR_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise OSError("Indexer: failed to open or create a xapian "
                        "database (%s): %s" % (self.location, err_msg))

    def flush(self, optimize=False):
        """force to write the current changes to disk immediately

        Afterwards the database is reopened read-only.

        @param optimize: ignored for xapian
        @type optimize: bool
        """
        # write changes to disk (only if database is read-write)
        if isinstance(self.database, xapian.WritableDatabase):
            self.database.flush()
        # free the database to remove locks - this is a xapian-specific issue
        # (dropping the handle is also what makes _prepare_database reopen it)
        self.database = None
        # reopen it as read-only
        self._prepare_database()

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query
        """
        # create a copy of the original query
        return xapian.Query(query)

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the default analyzer of this
            database (self.analyzer) is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query
        """
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if require_all:
            qp.set_default_op(xapian.Query.OP_AND)
        else:
            qp.set_default_op(xapian.Query.OP_OR)
        if analyzer is None:
            analyzer = self.analyzer
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # partial matching
            match_flags = xapian.QueryParser.FLAG_PARTIAL
            return qp.parse_query(text, match_flags)
        elif analyzer == self.ANALYZER_EXACT:
            # exact matching - the text is used as a single term, unparsed
            return xapian.Query(text)
        else:
            # everything else (not partial and not exact)
            match_flags = 0
            return qp.parse_query(text, match_flags)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query; the upper-cased field
        name is used as the term prefix

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the default analyzer of this
            database (self.analyzer) is used.
        @type analyzer: int
        @return: the resulting query object
        @rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            # exact matching -> keep special characters
            return xapian.Query("%s%s" % (field.upper(), value))
        # other queries need a parser object
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if (analyzer & self.ANALYZER_PARTIAL) > 0:
            # partial matching
            match_flags = xapian.QueryParser.FLAG_PARTIAL
        else:
            # everything else (not partial and not exact)
            match_flags = 0
        return qp.parse_query(value, match_flags, field.upper())

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query
        """
        if require_all:
            query_op = xapian.Query.OP_AND
        else:
            query_op = xapian.Query.OP_OR
        return xapian.Query(query_op, queries)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document
        """
        return xapian.Document()

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term)
        else:
            # untokenized terms are stored verbatim (shortened if necessary)
            document.add_term(_truncate_term_length(term))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        The upper-cased field name is used as the prefix of the stored term.

        @param document: the document to be changed
        @type document: xapian.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term, 1, field.upper())
        else:
            document.add_term(_truncate_term_length("%s%s" % \
                    (field.upper(), term)))

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document
        """
        # open the database for writing
        self._prepare_database(writable=True)
        self.database.add_document(document)

    def begin_transaction(self):
        """begin a transaction

        Xapian supports transactions to group multiple database modifications.
        This avoids intermediate flushing and therefore increases performance.
        """
        self._prepare_database(writable=True)
        self.database.begin_transaction()

    def cancel_transaction(self):
        """cancel an ongoing transaction

        no changes since the last execution of 'begin_transaction' are
        written
        """
        self._prepare_database(writable=True)
        self.database.cancel_transaction()

    def commit_transaction(self):
        """submit the changes of an ongoing transaction

        all changes since the last execution of 'begin_transaction' are
        written
        """
        self._prepare_database(writable=True)
        self.database.commit_transaction()

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled xapian query
        @type query: xapian.Query
        @return: an object that allows access to the results
        @rtype: XapianEnquire
        """
        enquire = xapian.Enquire(self.database)
        enquire.set_query(query)
        return XapianEnquire(enquire)

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        @return: True if the document was deleted, False if no document
            with this ID exists
        @rtype: bool
        """
        # open the database for writing
        self._prepare_database(writable=True)
        try:
            self.database.delete_document(docid)
        except xapian.DocNotFoundError:
            return False
        return True

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches
        of a query

        @param query: the query to be issued
        @type query: xapian.Query
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        result = []
        if isinstance(fieldnames, str):
            fieldnames = [fieldnames]
        # _walk_matches is not defined in this class (presumably inherited
        # from CommonIndexer.CommonDatabase); the callback fills 'result'
        self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
        return result

    def _prepare_database(self, writable=False):
        """reopen the database as read-only or as writable if necessary

        this fixes a xapian specific issue regarding open locks for
        writable databases

        @param writable: True for opening a writable database
        @type writable: bool
        """
        if writable:
            if not isinstance(self.database, xapian.WritableDatabase):
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_OPEN)
        elif not isinstance(self.database, xapian.Database):
            # an existing handle of either kind is kept here; only a missing
            # handle (e.g. after flush) is reopened read-only
            self.database = xapian.Database(self.location)
class XapianEnquire(CommonIndexer.CommonEnquire):
    """interface to the xapian object for storing sets of matches
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
                "matches" is a list of dictionaries with the keys
                ["rank", "percent", "document", "docid"]
        """
        # self.enquire is not assigned in this class (presumably stored by
        # CommonIndexer.CommonEnquire from the constructor argument)
        matches = self.enquire.get_mset(start, number)
        result = [
            {
                "rank": match[xapian.MSET_RANK],
                "docid": match[xapian.MSET_DID],
                "percent": match[xapian.MSET_PERCENT],
                "document": match[xapian.MSET_DOCUMENT],
            }
            for match in matches
        ]
        return (matches.size(), matches.get_matches_estimated(), result)
def _truncate_term_length(term, taken=0):
    """truncate the length of a term string length to the maximum allowed
    for xapian terms

    @param term: the value of the term, that should be truncated
    @type term: str
    @param taken: since a term consists of the name of the term and its
        actual value, this additional parameter can be used to reduce the
        maximum count of possible characters
    @type taken: int
    @return: the truncated string (the unchanged term if it is short enough)
    @rtype: str
    """
    limit = _MAX_TERM_LENGTH - taken
    if len(term) <= limit:
        return term
    # Too long terms are cut to one character less than the limit. This
    # matches the existing behaviour, so terms already stored in an index
    # stay comparable. The bound is clamped so that a 'taken' value at or
    # above the maximum yields an empty string instead of a negative slice.
    return term[0:max(limit - 1, 0)]
def _extract_fieldvalues(match, result_and_fieldnames):
    """add a dict of field values to a list

    usually this function should be used together with '_walk_matches'
    for traversing a list of matches

    A term belongs to a field if it starts with the upper-cased field name
    followed by a character that is not an upper-case ASCII letter; the
    prefix is stripped from the stored value. The field name None selects
    terms that do not start with an upper-case ASCII letter; these are
    stored unchanged.

    @param match: a single match object; match["document"] must offer
        termlist(), whose items carry the term string in '.term'
    @type match: dict
    @param result_and_fieldnames: a pair (result, fieldnames):
        result - the resulting dict will be added to this list
        fieldnames - the names of the fields to be added to the dict
    @type result_and_fieldnames: tuple of (list of dict, list of str)
    """
    # unpacked here instead of in the signature: tuple parameters are not
    # valid syntax in Python 3 (callers still pass one tuple positionally)
    result, fieldnames = result_and_fieldnames
    # build the matchers once instead of once per term
    matchers = []
    for fname in fieldnames:
        if fname is None:
            matchers.append((fname, re.compile(r"[^A-Z]"), 0))
        else:
            prefix = str(fname).upper()
            # the field name is literal text, not a regular expression
            pattern = re.compile(re.escape(prefix) + r"[^A-Z]")
            matchers.append((fname, pattern, len(prefix)))
    # prepare empty dict
    item_fields = {}
    # fill the dict
    for term in match["document"].termlist():
        text = term.term
        for fname, pattern, skip in matchers:
            if pattern.match(text):
                # we found a matching field/term
                item_fields.setdefault(fname, []).append(text[skip:])
    result.append(item_fields)