# -*- coding: utf-8 -*-
#
# Copyright 2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#

"""base class for interfaces to indexing engines for pootle
"""
import os

import translate.lang.data
def is_available():
    """check if this indexing engine interface is usable

    this function must exist in every module that contains indexing engine
    interfaces

    @return: is this interface usable?
    @rtype: bool
    """
    # the base module does not provide a working engine itself; real engine
    # modules (e.g. xapian/pylucene wrappers) override this with a real check
    return False
class CommonDatabase(object):
    """base class for indexing support

    any real implementation must override most methods of this class
    """

    field_analyzers = {}
    """mapping of field names and analyzers - see 'set_field_analyzers'"""

    ANALYZER_EXACT = 0
    """exact matching: the query string must equal the whole term string"""

    ANALYZER_PARTIAL = 1 << 1
    """partial matching: a document matches, even if the query string only
    matches the beginning of the term value."""

    ANALYZER_TOKENIZE = 1 << 2
    """tokenize terms and queries automatically"""

    ANALYZER_DEFAULT = ANALYZER_TOKENIZE | ANALYZER_PARTIAL
    """the default analyzer to be used if nothing is configured"""

    QUERY_TYPE = None
    """override this with the query class of the implementation"""

    INDEX_DIRECTORY_NAME = None
    """override this with a string to be used as the name of the indexing
    directory/file in the filesystem
    """

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        The following exceptions can be raised:
            ValueError: the given location exists, but the database type
                is incompatible (e.g. created by a different indexing engine)
            OSError: the database failed to initialize

        Any implementation can rely on the "self.location" attribute to be set
        by the __init__ function of the super class.

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it empty
            to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        @throws: OSError, ValueError
        """
        # fail early if a derived class forgot to define its essentials
        if self.QUERY_TYPE is None:
            raise NotImplementedError("Incomplete indexer implementation: " \
                    + "'QUERY_TYPE' is undefined")
        if self.INDEX_DIRECTORY_NAME is None:
            raise NotImplementedError("Incomplete indexer implementation: " \
                    + "'INDEX_DIRECTORY_NAME' is undefined")
        self.location = os.path.join(basedir, self.INDEX_DIRECTORY_NAME)
        if (not create_allowed) and (not os.path.exists(self.location)):
            raise OSError("Indexer: the database does not exist - and I am" \
                    + " not configured to create it.")
        if analyzer is None:
            self.analyzer = self.ANALYZER_DEFAULT
        else:
            self.analyzer = analyzer
        # per-instance mapping shadows the (shared) class attribute
        self.field_analyzers = {}

    def flush(self, optimize=False):
        """flush the content of the database - to force changes to be written
        to disk

        some databases also support index optimization

        @param optimize: should the index be optimized if possible?
        @type optimize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'flush' is missing")

    def make_query(self, args, require_all=True, analyzer=None):
        """create simple queries (strings or field searches) or
        combine multiple queries (AND/OR)

        To specify rules for field searches, you may want to take a look at
        'set_field_analyzers'. The parameter 'match_text_partial' can override
        the previously defined default setting.

        @param args: queries or search string or description of field query
            examples::
                [xapian.Query("foo"), xapian.Query("bar")]
                {"foo": "bar", "foobar": "foo"}
        @type args: list of queries | single query | str | dict
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: boolean
        @param analyzer: (only applicable for 'dict' or 'str')
            Define query options (partial matching, exact matching, tokenizing,
            ...) as bitwise combinations of CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: the combined query
        @rtype: query type of the specific implementation
        """
        # turn a dict into a list if necessary
        if isinstance(args, dict):
            args = args.items()
        # turn 'args' into a list if necessary
        if not isinstance(args, list):
            args = [args]
        # combine all given queries
        result = []
        for query in args:
            # just add precompiled queries
            if isinstance(query, self.QUERY_TYPE):
                result.append(self._create_query_for_query(query))
            # create field/value queries out of a tuple
            elif isinstance(query, tuple):
                (field, value) = query
                # perform unicode normalization
                field = translate.lang.data.normalize(unicode(field))
                value = translate.lang.data.normalize(unicode(value))
                # check for the chosen match type; use a local name so the
                # fallback does not clobber 'analyzer' for later iterations
                if analyzer is None:
                    field_analyzer = self.get_field_analyzers(field)
                else:
                    field_analyzer = analyzer
                result.append(self._create_query_for_field(field, value,
                        analyzer=field_analyzer))
            # parse plaintext queries
            elif isinstance(query, str):
                if analyzer is None:
                    string_analyzer = self.analyzer
                else:
                    string_analyzer = analyzer
                # perform unicode normalization
                query = translate.lang.data.normalize(unicode(query))
                result.append(self._create_query_for_string(query,
                        require_all=require_all, analyzer=string_analyzer))
            else:
                # other types of queries are not supported
                raise ValueError("Unable to handle query type: %s" \
                        % str(type(query)))
        # return the combined query
        return self._create_query_combined(result, require_all)

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_query' is missing")

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_string' is missing")

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_field' is missing")

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_combined' is missing")

    def index_document(self, data):
        """add the given data to the database

        @param data: the data to be indexed.
            A dictionary will be treated as fieldname:value combinations.
            If the fieldname is None then the value will be interpreted as a
            plain term or as a list of plain terms.
            Lists of strings are treated as plain terms.
        @type data: dict | list of str
        """
        doc = self._create_empty_document()
        if isinstance(data, dict):
            data = data.items()
        # add all data one by one
        for dataset in data:
            if isinstance(dataset, tuple):
                # the dataset tuple consists of '(key, value)'
                (key, value) = dataset
                if key is None:
                    # no fieldname -> index as plain term(s)
                    if isinstance(value, list):
                        terms = value[:]
                    elif isinstance(value, str):
                        terms = [value]
                    else:
                        raise ValueError("Invalid data type to be indexed: %s" \
                                % str(type(data)))
                    for one_term in terms:
                        self._add_plain_term(doc, self._decode(one_term),
                                (self.ANALYZER_DEFAULT
                                        & self.ANALYZER_TOKENIZE > 0))
                else:
                    # tokenization depends on the analyzer mapped to the field
                    analyze_settings = self.get_field_analyzers(key)
                    self._add_field_term(doc, key, self._decode(value),
                            (analyze_settings & self.ANALYZER_TOKENIZE > 0))
            elif isinstance(dataset, str):
                self._add_plain_term(doc, self._decode(dataset),
                        (self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE > 0))
            else:
                raise ValueError("Invalid data type to be indexed: %s" \
                        % str(type(data)))
        self._add_document_to_index(doc)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_empty_document' is missing")

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_plain_term' is missing")

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_field_term' is missing")

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_document_to_index' is missing")

    def begin_transaction(self):
        """begin a transaction

        You can group multiple modifications of a database as a transaction.
        This prevents time-consuming database flushing and helps, if you want
        that a changeset is committed either completely or not at all.
        No changes will be written to disk until 'commit_transaction'.
        'cancel_transaction' can be used to revert an ongoing transaction.

        Database types that do not support transactions may silently ignore it.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'begin_transaction' is missing")

    def cancel_transaction(self):
        """cancel an ongoing transaction

        See 'start_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'cancel_transaction' is missing")

    def commit_transaction(self):
        """submit the currently ongoing transaction and write changes to disk

        See 'start_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'commit_transaction' is missing")

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'get_query_result' is missing")

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'delete_document_by_id' is missing")

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'search' is missing")

    def delete_doc(self, ident):
        """delete the documents returned by a query

        @param ident: [list of] document IDs | dict describing a query | query
        @type ident: int | list of tuples | dict | list of dicts |
            query (e.g. xapian.Query) | list of queries
        @return: the number of deleted documents
        @rtype: int
        """
        # turn a doc-ID into a list of doc-IDs
        if isinstance(ident, list):
            # it is already a list
            ident_list = ident
        else:
            ident_list = [ident]
        if len(ident_list) == 0:
            # no matching items
            return 0
        if isinstance(ident_list[0], int):
            # create a list of IDs of all successfully removed documents
            success_delete = [match for match in ident_list
                    if self.delete_document_by_id(match)]
            return len(success_delete)
        if isinstance(ident_list[0], dict):
            # something like: { "msgid": "foobar" }
            # assemble all queries
            query = self.make_query([self.make_query(query_dict,
                    require_all=True) for query_dict in ident_list],
                    require_all=False)
        elif isinstance(ident_list[0], object):
            # assume a query object (with 'AND')
            query = self.make_query(ident_list, require_all=True)
        else:
            # invalid element type in list (not necessarily caught in the
            # lines above)
            raise TypeError("description of documents to-be-deleted is not " \
                    + "supported: list of %s" % type(ident_list[0]))
        # we successfully created a query - now iterate through the result
        # no documents deleted so far ...
        remove_list = []
        # delete all resulting documents step by step
        def add_docid_to_list(match):
            """collect every document ID"""
            remove_list.append(match["docid"])
        self._walk_matches(query, add_docid_to_list)
        # recurse with the collected plain document IDs
        return self.delete_doc(remove_list)

    def _walk_matches(self, query, function, arg_for_function=None):
        """use this function if you want to do something with every single match
        of a query

        example: self._walk_matches(query, function_for_match, arg_for_func)
            'function_for_match' expects only one argument: the matched object

        @param query: a query object of the real implementation
        @type query: xapian.Query | PyLucene.Query
        @param function: the function to execute with every match
        @type function: function
        @param arg_for_function: an optional argument for the function
        @type arg_for_function: anything
        """
        # retrieve the result of the query
        enquire = self.get_query_result(query)
        # start with the first element
        start = 0
        # do the loop at least once
        size, avail = (0, 1)
        # how many results per 'get_matches'?
        steps = 2
        while start < avail:
            (size, avail, matches) = enquire.get_matches(start, steps)
            for match in matches:
                if arg_for_function is None:
                    function(match)
                else:
                    function(match, arg_for_function)
            start += size

    def set_field_analyzers(self, field_analyzers):
        """set the analyzers for different fields of the database documents

        All bitwise combinations of CommonIndexer.ANALYZER_??? are possible.

        @param field_analyzers: mapping of field names and analyzers
        @type field_analyzers: dict containing field names and analyzers
        @throws: TypeError for invalid values in 'field_analyzers'
        """
        for field, analyzer in field_analyzers.items():
            # check for invalid input types
            if not isinstance(field, (str, unicode)):
                raise TypeError("field name must be a string")
            if not isinstance(analyzer, int):
                raise TypeError("the analyzer must be a whole number (int)")
            # map the analyzer to the field name
            self.field_analyzers[field] = analyzer

    def get_field_analyzers(self, fieldnames=None):
        """return the analyzer that was mapped to a specific field

        see 'set_field_analyzers' for details

        @param fieldnames: the analyzer of this field (or all/multiple fields)
            is requested; leave empty (or "None") to request all fields
        @type fieldnames: str | list of str | None
        @return: the analyzer setting of the field - see
            CommonDatabase.ANALYZER_??? or a dict of field names and analyzers
        @rtype: int | dict
        """
        # all field analyzers are requested
        if fieldnames is None:
            # return a copy to protect the internal mapping
            return dict(self.field_analyzers)
        # one field is requested
        if isinstance(fieldnames, (str, unicode)):
            if fieldnames in self.field_analyzers:
                return self.field_analyzers[fieldnames]
            else:
                return self.analyzer
        # a list of fields is requested
        if isinstance(fieldnames, list):
            result = {}
            for field in fieldnames:
                result[field] = self.get_field_analyzers(field)
            return result
        # invalid request -> return the default analyzer
        return self.analyzer

    def _decode(self, text):
        """decode the string from utf-8 or charmap
        perform unicode normalization

        @param text: the text to be decoded
        @type text: str | unicode
        @return: the decoded and normalized text
        @rtype: unicode
        """
        if isinstance(text, str):
            try:
                result = unicode(text.decode("UTF-8"))
            # '.decode' raises UnicodeDecodeError on failure (the original
            # caught UnicodeEncodeError, so the fallback below was unreachable)
            except UnicodeDecodeError:
                result = unicode(text.decode("charmap"))
        elif not isinstance(text, unicode):
            result = unicode(text)
        else:
            result = text
        # perform unicode normalization
        return translate.lang.data.normalize(result)
class CommonEnquire(object):
    """an enquire object contains the information about the result of a request
    """

    def __init__(self, enquire):
        """initialization of a wrapper around enquires of different backends

        @param enquire: a previous enquire
        @type enquire: xapian.Enquire | pylucene-enquire
        """
        # store the backend-specific enquire for the accessor methods
        self.enquire = enquire

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a dictionary of::
                ["rank", "percent", "document", "docid"]
        """
        raise NotImplementedError("Incomplete indexing implementation: " \
                + "'get_matches' for the 'Enquire' class is missing")

    def get_matches_count(self):
        """return the estimated number of matches

        use "CommonIndexer.search" to retrieve the exact number of matches
        @return: the estimated number of matches
        @rtype: int
        """
        # the backend reports the estimate as the second tuple element
        (returned, estimate_count, matches) = self.get_matches(0, 1)
        return estimate_count