# -*- coding: utf-8 -*-
#
# Copyright 2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""base class for interfaces to indexing engines for pootle
"""

import translate.lang.data
import os

__revision__ = "$Id$"


def is_available():
    """check if this indexing engine interface is usable

    this function must exist in every module that contains indexing engine
    interfaces

    @return: is this interface usable?
    @rtype: bool
    """
    return False


class CommonDatabase(object):
    """base class for indexing support

    any real implementation must override most methods of this class
    """

    field_analyzers = {}
    """mapping of field names and analyzers - see 'set_field_analyzers'"""

    ANALYZER_EXACT = 0
    """exact matching: the query string must equal the whole term string"""

    ANALYZER_PARTIAL = 1 << 1
    """partial matching: a document matches, even if the query string only
    matches the beginning of the term value."""

    ANALYZER_TOKENIZE = 1 << 2
    """tokenize terms and queries automatically"""

    ANALYZER_DEFAULT = ANALYZER_TOKENIZE | ANALYZER_PARTIAL
    """the default analyzer to be used if nothing is configured"""
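
    # The analyzer settings are plain bit flags, so callers can combine and
    # test them bitwise. An illustrative sketch (not part of the original
    # API documentation):
    #
    #     # tokenize, but require exact (non-partial) term matches
    #     analyzer = CommonDatabase.ANALYZER_TOKENIZE
    #     # the shipped default is equivalent to
    #     analyzer = (CommonDatabase.ANALYZER_TOKENIZE
    #             | CommonDatabase.ANALYZER_PARTIAL)
    #     # test a single flag with a bitwise "and"
    #     wants_partial = (analyzer & CommonDatabase.ANALYZER_PARTIAL) > 0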

    QUERY_TYPE = None
    """override this with the query class of the implementation"""

    INDEX_DIRECTORY_NAME = None
    """override this with a string to be used as the name of the indexing
    directory/file in the filesystem
    """

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        The following exceptions can be raised:
            ValueError: the given location exists, but the database type
                is incompatible (e.g. created by a different indexing engine)
            OSError: the database failed to initialize

        Any implementation can rely on the "self.location" attribute to be set
        by the __init__ function of the super class.

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        @throws: OSError, ValueError
        """
        # just do some checks
        if self.QUERY_TYPE is None:
            raise NotImplementedError("Incomplete indexer implementation: " \
                    + "'QUERY_TYPE' is undefined")
        if self.INDEX_DIRECTORY_NAME is None:
            raise NotImplementedError("Incomplete indexer implementation: " \
                    + "'INDEX_DIRECTORY_NAME' is undefined")
        self.location = os.path.join(basedir, self.INDEX_DIRECTORY_NAME)
        if (not create_allowed) and (not os.path.exists(self.location)):
            raise OSError("Indexer: the database does not exist - and I am" \
                    + " not configured to create it.")
        if analyzer is None:
            self.analyzer = self.ANALYZER_DEFAULT
        else:
            self.analyzer = analyzer
        self.field_analyzers = {}
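
    # Sketch of the subclassing contract described above (the class names
    # and the path are hypothetical, not part of this module):
    #
    #     class DemoIndexer(CommonDatabase):
    #         QUERY_TYPE = DemoQuery          # query class of the backend
    #         INDEX_DIRECTORY_NAME = "demo_index"
    #         # ... plus overrides for the NotImplementedError methods
    #
    #     # creates or opens <basedir>/demo_index
    #     db = DemoIndexer("/var/lib/pootle/project")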

    def flush(self, optimize=False):
        """flush the content of the database - to force changes to be written
        to disk

        some databases also support index optimization

        @param optimize: should the index be optimized if possible?
        @type optimize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'flush' is missing")

    def make_query(self, args, require_all=True, analyzer=None):
        """create simple queries (strings or field searches) or
        combine multiple queries (AND/OR)

        To specify rules for field searches, you may want to take a look at
        'set_field_analyzers'. The parameter 'analyzer' can override the
        previously defined default setting.

        @param args: queries or search string or description of field query
            examples:
                [xapian.Query("foo"), xapian.Query("bar")]
                xapian.Query("foo")
                "bar"
                {"foo": "bar", "foobar": "foo"}
        @type args: list of queries | single query | str | dict
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: boolean
        @param analyzer: (only applicable for 'dict' or 'str')
            Define query options (partial matching, exact matching, tokenizing,
            ...) as bitwise combinations of CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: the combined query
        @rtype: query type of the specific implementation
        """
        # turn a dict into a list if necessary
        if isinstance(args, dict):
            args = args.items()
        # turn 'args' into a list if necessary
        if not isinstance(args, list):
            args = [args]
        # combine all given queries
        result = []
        for query in args:
            # just add precompiled queries
            if isinstance(query, self.QUERY_TYPE):
                result.append(self._create_query_for_query(query))
            # create field/value queries out of a tuple
            elif isinstance(query, tuple):
                field, value = query
                # perform unicode normalization
                field = translate.lang.data.normalize(unicode(field))
                value = translate.lang.data.normalize(unicode(value))
                # check for the chosen match type; use a local name so that
                # one query's analyzer does not leak into the next one
                query_analyzer = analyzer
                if query_analyzer is None:
                    query_analyzer = self.get_field_analyzers(field)
                result.append(self._create_query_for_field(field, value,
                        analyzer=query_analyzer))
            # parse plaintext queries
            elif isinstance(query, (str, unicode)):
                query_analyzer = analyzer
                if query_analyzer is None:
                    query_analyzer = self.analyzer
                # perform unicode normalization
                query = translate.lang.data.normalize(unicode(query))
                result.append(self._create_query_for_string(query,
                        require_all=require_all, analyzer=query_analyzer))
            else:
                # other types of queries are not supported
                raise ValueError("Unable to handle query type: %s" \
                        % str(type(query)))
        # return the combined query
        return self._create_query_combined(result, require_all)
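
    # Usage sketch, following the docstring examples above ("db" stands for
    # an instance of a concrete subclass, e.g. a Xapian- or PyLucene-based
    # indexer):
    #
    #     # all terms must match (AND)
    #     q1 = db.make_query("hello world")
    #     # any of the field/value pairs may match (OR)
    #     q2 = db.make_query({"msgid": "file", "msgstr": "Datei"},
    #             require_all=False)
    #     # nest previously built queries
    #     combined = db.make_query([q1, q2], require_all=True)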

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_query' is missing")

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the default analyzer of the
            database is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_string' is missing")

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this function creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_field' is missing")

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_combined' is missing")

    def index_document(self, data):
        """add the given data to the database

        @param data: the data to be indexed.
            A dictionary will be treated as fieldname:value combinations.
            If the fieldname is None then the value will be interpreted as a
            plain term or as a list of plain terms.
            Lists of strings are treated as plain terms.
        @type data: dict | list of str
        """
        doc = self._create_empty_document()
        if isinstance(data, dict):
            data = data.items()
        # add all data
        for dataset in data:
            if isinstance(dataset, tuple):
                # the dataset tuple consists of '(key, value)'
                key, value = dataset
                if key is None:
                    if isinstance(value, list):
                        terms = value[:]
                    elif isinstance(value, (str, unicode)):
                        terms = [value]
                    else:
                        raise ValueError("Invalid data type to be indexed: %s" \
                                % str(type(value)))
                    for one_term in terms:
                        self._add_plain_term(doc, self._decode(one_term),
                                ((self.ANALYZER_DEFAULT
                                        & self.ANALYZER_TOKENIZE) > 0))
                else:
                    analyze_settings = self.get_field_analyzers(key)
                    self._add_field_term(doc, key, self._decode(value),
                            ((analyze_settings & self.ANALYZER_TOKENIZE) > 0))
            elif isinstance(dataset, (str, unicode)):
                self._add_plain_term(doc, self._decode(dataset),
                        ((self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE) > 0))
            else:
                raise ValueError("Invalid data type to be indexed: %s" \
                        % str(type(dataset)))
        self._add_document_to_index(doc)
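
    # Indexing sketch (illustrative; "db" is an instance of a concrete
    # subclass and the field names are made up):
    #
    #     # fieldname:value pairs
    #     db.index_document({"msgid": "Open file", "msgstr": "Datei öffnen"})
    #     # plain terms without a field (key None, or a plain list of strings)
    #     db.index_document({None: ["open", "file"]})
    #     db.flush(optimize=True)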

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_empty_document' is missing")

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_plain_term' is missing")

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_field_term' is missing")

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_document_to_index' is missing")

    def begin_transaction(self):
        """begin a transaction

        You can group multiple modifications of a database as a transaction.
        This prevents time-consuming database flushing and ensures that a
        changeset is committed either completely or not at all.
        No changes will be written to disk until 'commit_transaction'.
        'cancel_transaction' can be used to revert an ongoing transaction.

        Database types that do not support transactions may silently ignore it.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'begin_transaction' is missing")

    def cancel_transaction(self):
        """cancel an ongoing transaction

        See 'begin_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'cancel_transaction' is missing")

    def commit_transaction(self):
        """submit the currently ongoing transaction and write changes to disk

        See 'begin_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'commit_transaction' is missing")
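
    # Typical transaction pattern, following the 'begin_transaction'
    # docstring (sketch; "db" is a concrete subclass instance and "units"
    # a caller-provided iterable):
    #
    #     db.begin_transaction()
    #     try:
    #         for unit in units:
    #             db.index_document({"msgid": unit.source})
    #         db.commit_transaction()
    #     except Exception:
    #         # revert the whole changeset on any failure
    #         db.cancel_transaction()
    #         raise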

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'get_query_result' is missing")

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        @return: whether the document was removed
        @rtype: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'delete_document_by_id' is missing")

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'search' is missing")

    def delete_doc(self, ident):
        """delete the documents returned by a query

        @param ident: [list of] document IDs | dict describing a query | query
        @type ident: int | list of ints | dict | list of dicts |
            query (e.g. xapian.Query) | list of queries
        @return: the number of deleted documents
        @rtype: int
        """
        # turn a doc-ID into a list of doc-IDs
        if isinstance(ident, list):
            # it is already a list
            ident_list = ident
        else:
            ident_list = [ident]
        if len(ident_list) == 0:
            # no matching items
            return 0
        if isinstance(ident_list[0], int):
            # create a list of IDs of all successfully removed documents
            success_delete = [match for match in ident_list
                    if self.delete_document_by_id(match)]
            return len(success_delete)
        if isinstance(ident_list[0], dict):
            # something like: { "msgid": "foobar" }
            # assemble all queries
            query = self.make_query([self.make_query(query_dict,
                    require_all=True) for query_dict in ident_list],
                    require_all=True)
        elif isinstance(ident_list[0], object):
            # assume a query object (with 'AND')
            query = self.make_query(ident_list, require_all=True)
        else:
            # invalid element type in list (not necessarily caught in the
            # lines above)
            raise TypeError("description of documents to-be-deleted is not " \
                    + "supported: list of %s" % type(ident_list[0]))
        # we successfully created a query - now iterate through the result
        # and collect the IDs of all matching documents
        remove_list = []

        def add_docid_to_list(match):
            """collect every document ID"""
            remove_list.append(match["docid"])
        self._walk_matches(query, add_docid_to_list)
        return self.delete_doc(remove_list)
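
    # Deletion sketch covering the accepted argument types (illustrative;
    # "db" is a concrete subclass instance):
    #
    #     # by document ID(s)
    #     db.delete_doc(42)
    #     db.delete_doc([42, 43])
    #     # by field description (everything matching msgid == "foobar")
    #     db.delete_doc({"msgid": "foobar"})
    #     # by a previously compiled query
    #     db.delete_doc(db.make_query("obsolete term"))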

    def _walk_matches(self, query, function, arg_for_function=None):
        """use this function if you want to do something with every single match
        of a query

        example: self._walk_matches(query, function_for_match, arg_for_func)
        'function_for_match' expects only one argument: the matched object

        @param query: a query object of the real implementation
        @type query: xapian.Query | PyLucene.Query
        @param function: the function to execute with every match
        @type function: function
        @param arg_for_function: an optional argument for the function
        @type arg_for_function: anything
        """
        # execute the query
        enquire = self.get_query_result(query)
        # start with the first element
        start = 0
        # do the loop at least once
        size, avail = (0, 1)
        # how many results per 'get_matches'?
        steps = 2
        while start < avail:
            (size, avail, matches) = enquire.get_matches(start, steps)
            for match in matches:
                if arg_for_function is None:
                    function(match)
                else:
                    function(match, arg_for_function)
            start += size
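
    # The paging loop above fetches matches in small chunks and hands each
    # one to the callback. A callback sketch with the optional extra
    # argument (hypothetical helper, mirrors 'delete_doc'):
    #
    #     def collect_docids(match, bucket):
    #         bucket.append(match["docid"])
    #
    #     docids = []
    #     db._walk_matches(query, collect_docids, docids)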

    def set_field_analyzers(self, field_analyzers):
        """set the analyzers for different fields of the database documents

        All bitwise combinations of CommonIndexer.ANALYZER_??? are possible.

        @param field_analyzers: mapping of field names and analyzers
        @type field_analyzers: dict containing field names and analyzers
        @throws: TypeError for invalid values in 'field_analyzers'
        """
        for field, analyzer in field_analyzers.items():
            # check for invalid input types
            if not isinstance(field, (str, unicode)):
                raise TypeError("field name must be a string")
            if not isinstance(analyzer, int):
                raise TypeError("the analyzer must be a whole number (int)")
            # map the analyzer to the field name
            self.field_analyzers[field] = analyzer
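
    # Configuration sketch: exact matching for one field, the default
    # (tokenized + partial) matching for another (field names illustrative):
    #
    #     db.set_field_analyzers({
    #             "pofilename": CommonDatabase.ANALYZER_EXACT,
    #             "msgid": CommonDatabase.ANALYZER_DEFAULT,
    #     })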

    def get_field_analyzers(self, fieldnames=None):
        """return the analyzer that was mapped to a specific field

        see 'set_field_analyzers' for details

        @param fieldnames: the field (or fields) whose analyzer setting is
            requested; leave empty (or None) to request all fields
        @type fieldnames: str | list of str | None
        @return: the analyzer setting of the field - see
            CommonDatabase.ANALYZER_??? - or a dict of field names and analyzers
        @rtype: int | dict
        """
        # all field analyzers are requested
        if fieldnames is None:
            # return a copy
            return dict(self.field_analyzers)
        # one field is requested
        if isinstance(fieldnames, (str, unicode)):
            if fieldnames in self.field_analyzers:
                return self.field_analyzers[fieldnames]
            else:
                return self.analyzer
        # a list of fields is requested
        if isinstance(fieldnames, list):
            result = {}
            for field in fieldnames:
                result[field] = self.get_field_analyzers(field)
            return result
        return self.analyzer
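
    # Lookup sketch, continuing the configuration example above:
    #
    #     db.get_field_analyzers("pofilename")    # -> ANALYZER_EXACT
    #     db.get_field_analyzers("unknown")       # -> the database default
    #     db.get_field_analyzers()                # -> copy of the whole dict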

    def _decode(self, text):
        """decode the string from utf-8 or charmap
        perform unicode normalization
        """
        if isinstance(text, str):
            try:
                result = unicode(text.decode("UTF-8"))
            except UnicodeDecodeError:
                result = unicode(text.decode("charmap"))
        elif not isinstance(text, unicode):
            result = unicode(text)
        else:
            result = text
        # perform unicode normalization
        return translate.lang.data.normalize(result)
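
    # Behaviour sketch: byte strings are decoded as UTF-8 with a charmap
    # fallback, everything else is coerced to unicode, and the result is
    # normalized to a canonical form:
    #
    #     db._decode("caf\xc3\xa9")    # UTF-8 bytes -> u"café" (normalized)
    #     db._decode(42)               # non-strings go through unicode()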


class CommonEnquire(object):
    """an enquire object contains the information about the result of a request
    """

    def __init__(self, enquire):
        """initialization of a wrapper around enquires of different backends

        @param enquire: a previous enquire
        @type enquire: xapian.Enquire | pylucene-enquire
        """
        self.enquire = enquire

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a list of dictionaries with the keys
            "rank", "percent", "document" and "docid"
        """
        raise NotImplementedError("Incomplete indexing implementation: " \
                + "'get_matches' for the 'Enquire' class is missing")

    def get_matches_count(self):
        """return the estimated number of matches

        use "CommonIndexer.search" to retrieve the exact number of matches

        @return: the estimated number of matches
        @rtype: int
        """
        (returned, estimate_count, matches) = self.get_matches(0, 1)
        return estimate_count
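
    # Result-access sketch tying the two classes together (illustrative;
    # "db" is a concrete CommonDatabase subclass instance):
    #
    #     enquire = db.get_query_result(db.make_query("hello"))
    #     print enquire.get_matches_count()       # estimated hit count
    #     (returned, avail, matches) = enquire.get_matches(0, 10)
    #     for match in matches:
    #         print match["docid"], match["percent"]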