for git v1.5.2 (and below): chdir to the directory of the target file before executin...
[translate_toolkit.git] / filters / checks.py
blob8abca7d3d478afdd711d4b96baa3eab41da51792
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2004-2008 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """This is a set of validation checks that can be performed on translation
23 units.
25 Derivatives of UnitChecker (like StandardUnitChecker) check translation units,
26 and derivatives of TranslationChecker (like StandardChecker) check
27 (source, target) translation pairs.
29 When adding a new test here, please document and explain the behaviour on the
30 U{wiki <http://translate.sourceforge.net/wiki/toolkit/pofilter_tests>}.
31 """
33 from translate.filters import helpers
34 from translate.filters import decoration
35 from translate.filters import prefilters
36 from translate.filters import spelling
37 from translate.lang import factory
38 from translate.lang import data
39 # The import of xliff could fail if the user doesn't have lxml installed. For
40 # now we try to continue gracefully to help users who aren't interested in
41 # support for XLIFF or other XML formats.
42 try:
43 from translate.storage import xliff
44 except ImportError, e:
45 xliff = None
46 import re
# These are some regular expressions that are compiled for use in some tests

# printf syntax based on http://en.wikipedia.org/wiki/Printf, which doesn't
# cover everything; we leave \w instead of specifying the exact letters as
# this should capture printf types defined in other platforms.
# Named groups: "ord" is the positional index (the N in %N$s), "fullvar" is
# flags + width + precision + length modifier + type, "type" is the final
# conversion character.
# The length modifiers are a real alternation: escaping the pipes would make
# the group match the literal text "hh|h|l|ll", so "%ld" and "%lu" would both
# be reduced to "l".
printf_pat = re.compile(r'%((?:(?P<ord>\d+)\$)*(?P<fullvar>[+#-]*(?:\d+)*(?:\.\d+)*(hh|h|l|ll)*(?P<type>[\w%])))')

# The name of the XML tag
tagname_re = re.compile(r"<[\s]*([\w\/]*)")

# We allow escaped quotes, probably for old escaping style of OOo helpcontent
#TODO: remove escaped strings once usage is audited
property_re = re.compile(r""" (\w*)=((\\?".*?\\?")|(\\?'.*?\\?'))""")

# The whole tag
tag_re = re.compile("<[^>]+>")
def tagname(string):
    """Return the tag name found at the start of the XML/HTML tag in string.

    The name is whatever tagname_re captures after the opening "<" (a
    leading "/" of a closing tag is part of the captured name).  If string
    does not start with "<" the pattern does not match and an
    AttributeError is raised.
    """
    match = tagname_re.match(string)
    return match.group(1)
def intuplelist(pair, list):
    """Look up the triple pair == (a, b, c) in list, treating None as a wildcard.

    Entries of list are triples too.  A None in the first or third position
    of an entry matches anything; the middle element must always be equal.
    The third element is only examined once the first two have matched.

    Returns the first matching entry from list, or pair itself when nothing
    matches or when pair is a bare tag name (its last two items are None).
    """
    tag, prop, value = pair
    if prop is None and value is None:
        # This is a tagname
        return pair
    for pattern in list:
        pat_tag, pat_prop, pat_value = pattern
        if pat_prop != prop:
            continue
        if not (pat_tag == tag or pat_tag is None):
            continue
        if pat_value is None or pat_value == value:
            return pattern
    return pair
def tagproperties(strings, ignore):
    """Collect (tagname, propertyname, propertyvalue) triples from tag strings.

    For every tag in strings a (tagname, None, None) entry is emitted,
    followed by one entry per attribute found by property_re.  Attributes
    that appear in ignore (directly, or through the wildcard matching of
    intuplelist) are left out.
    """
    properties = []
    for string in strings:
        tag = tagname(string)
        properties.append((tag, None, None))
        # Now we isolate the attribute pairs.
        for name, quoted, _dquoted, _squoted in property_re.findall(string):
            # Strip the quotes:
            candidate = (tag, name, quoted[1:-1])
            if candidate in ignore or intuplelist(candidate, ignore) != candidate:
                # NOTE(review): the first ignorable attribute ends the scan
                # of this tag, so later attributes of the same tag are not
                # reported either -- confirm this is intended.
                break
            properties.append(candidate)
    return properties
class FilterFailure(Exception):
    """Raised by a filter that did not pass; carries an explanation or comment.

    messages may be a single unicode string or a list of them; a list is
    joined with ", " to form the exception message.
    """

    def __init__(self, messages):
        message_list = messages if isinstance(messages, list) else [messages]
        # Assumption: all of same type
        assert isinstance(message_list[0], unicode)
        combined = u", ".join(message_list)
        Exception.__init__(self, combined)
class SeriousFilterFailure(FilterFailure):
    """A FilterFailure for a bad translation that might break an application
    (so the string will be marked fuzzy)."""
# Each entry is a (tag, attribute, value) triple naming an attribute that may
# be changed or ignored when it occurs inside tag.  The third element, when
# given, is a property value that can be ignored if present (a default, for
# example).
# None in any position means "applies to every value" of that tag/property.
# A non-None "value" means the attribute's value must be taken into account.
common_ignoretags = [
    (None, "xml-lang", None),
]
common_canchangetags = [
    ("img", "alt", None),
]
class CheckerConfig(object):
    """Holds the settings that a checker consults while running its tests."""

    def __init__(self, targetlanguage=None, accelmarkers=None, varmatches=None,
                 notranslatewords=None, musttranslatewords=None, validchars=None,
                 punctuation=None, endpunctuation=None, ignoretags=None,
                 canchangetags=None, criticaltests=None, credit_sources=None):
        # List-valued settings (None becomes an empty list)
        self.accelmarkers = self._init_list(accelmarkers)
        self.varmatches = self._init_list(varmatches)
        self.criticaltests = self._init_list(criticaltests)
        self.credit_sources = self._init_list(credit_sources)
        # Language data; the source language is always English
        self.targetlanguage = targetlanguage
        self.updatetargetlanguage(targetlanguage)
        self.sourcelang = factory.getlanguage('en')
        # Settings that fall back to language/module defaults when not given
        self.punctuation = self._init_default(
            data.forceunicode(punctuation), self.lang.punctuation)
        self.endpunctuation = self._init_default(
            data.forceunicode(endpunctuation), self.lang.sentenceend)
        self.ignoretags = self._init_default(ignoretags, common_ignoretags)
        self.canchangetags = self._init_default(canchangetags, common_canchangetags)
        # Other data
        # TODO: allow user configuration of untranslatable words
        # Both word collections are dicts keyed by the unicode word (values None).
        self.notranslatewords = {}
        for word in self._init_list(notranslatewords):
            self.notranslatewords[data.forceunicode(word)] = None
        self.musttranslatewords = {}
        for word in self._init_list(musttranslatewords):
            self.musttranslatewords[data.forceunicode(word)] = None
        validchars = data.forceunicode(validchars)
        self.validcharsmap = {}
        self.updatevalidchars(validchars)

    def _init_list(self, list):
        """Normalise a list-valued configuration parameter.

        @type list: List
        @param list: None (a new empty list is used) or a list parameter
        @rtype: List
        @return: the list that was passed in, or a fresh empty list for None
        """
        return [] if list is None else list

    def _init_default(self, param, default):
        """Pick between a user supplied parameter and its default.

        @param param: the user supplied parameter value
        @param default: the value to use when param is None
        @return: param if it is not None, otherwise default
        """
        return default if param is None else param

    def update(self, otherconfig):
        """Merge the settings of otherconfig into this config object."""
        self.targetlanguage = otherconfig.targetlanguage or self.targetlanguage
        self.updatetargetlanguage(self.targetlanguage)
        # Only markers we do not know yet are added (the check is made
        # against the list as it was before anything is appended).
        new_markers = [marker for marker in otherconfig.accelmarkers
                       if marker not in self.accelmarkers]
        self.accelmarkers.extend(new_markers)
        self.varmatches.extend(otherconfig.varmatches)
        self.notranslatewords.update(otherconfig.notranslatewords)
        self.musttranslatewords.update(otherconfig.musttranslatewords)
        self.validcharsmap.update(otherconfig.validcharsmap)
        self.punctuation += otherconfig.punctuation
        self.endpunctuation += otherconfig.endpunctuation
        #TODO: consider also updating in the following cases:
        # (these are replaced by otherconfig's values rather than merged)
        self.ignoretags = otherconfig.ignoretags
        self.canchangetags = otherconfig.canchangetags
        self.criticaltests.extend(otherconfig.criticaltests)
        self.credit_sources = otherconfig.credit_sources

    def updatevalidchars(self, validchars):
        """Add the characters of validchars to the map of valid characters.

        The map is keyed by code point with None as value.  Returns True
        when validchars is None (nothing to do) and None otherwise.
        """
        if validchars is None:
            return True
        codepoints = [ord(validchar) for validchar in data.forceunicode(validchars)]
        self.validcharsmap.update(dict.fromkeys(codepoints))

    def updatetargetlanguage(self, langcode):
        """Point self.lang at the language object for langcode."""
        self.lang = factory.getlanguage(langcode)
def cache_results(f):
    """Decorator that memoises a one-argument method in self.results_cache.

    The wrapped method must take exactly one (hashable) argument besides
    self, and the instance must provide a dict-like results_cache
    attribute.  Results are stored under the key (method name, argument),
    so several decorated methods can share one cache.

    The wrapper keeps the name and docstring of the wrapped function.
    """
    # Imported locally so this block does not depend on the module's
    # import section.
    import functools

    @functools.wraps(f)
    def cached_f(self, param1):
        key = (f.__name__, param1)
        res_cache = self.results_cache
        if key not in res_cache:
            res_cache[key] = f(self, param1)
        return res_cache[key]
    return cached_f
class UnitChecker(object):
    """Base checker: runs the test methods that derived classes define
    against translation units."""
    # Maps a test name to the names of the tests that are skipped when that
    # test fails.
    preconditions = {}

    def __init__(self, checkerconfig=None, excludefilters=None, limitfilters=None, errorhandler=None):
        self.errorhandler = errorhandler
        if checkerconfig is None:
            checkerconfig = CheckerConfig()
        self.setconfig(checkerconfig)
        # Every callable that UnitChecker itself provides is a helper and
        # must never be treated as a test.
        self.helperfunctions = {}
        for name in dir(UnitChecker):
            member = getattr(self, name)
            if callable(member):
                self.helperfunctions[name] = member
        self.defaultfilters = self.getfilters(excludefilters, limitfilters)

        self.results_cache = {}

    def getfilters(self, excludefilters=None, limitfilters=None):
        """Return a dict mapping filter names to the bound test methods.

        limitfilters restricts the candidates (default: everything in
        dir(self)); names in excludefilters, helper functions and the
        errorhandler attribute are never included.
        """
        if limitfilters is None:
            # use everything available unless instructed
            limitfilters = dir(self)
        if excludefilters is None:
            excludefilters = {}
        filters = {}
        for name in limitfilters:
            if (name in excludefilters
                    or name in self.helperfunctions
                    or name == "errorhandler"):
                continue
            candidate = getattr(self, name, None)
            if callable(candidate):
                filters[name] = candidate
        return filters

    def setconfig(self, config):
        """Store config and rebuild the accelerator/variable prefilters from it."""
        self.config = config
        markers = self.config.accelmarkers
        matches = self.config.varmatches
        self.accfilters = [prefilters.filteraccelerators(marker) for marker in markers]
        self.varfilters = [
            prefilters.filtervariables(start, end, prefilters.varname)
            for start, end in matches]
        self.removevarfilter = [
            prefilters.filtervariables(start, end, prefilters.varnone)
            for start, end in matches]

    def setsuggestionstore(self, store):
        """Sets the filename that a checker should use for evaluating suggestions."""
        self.suggestion_store = store

    @cache_results
    def filtervariables(self, str1):
        """Run str1 through the variable filters (prefilters.varname)."""
        return helpers.multifilter(str1, self.varfilters)

    @cache_results
    def removevariables(self, str1):
        """Run str1 through the variable-removing filters (prefilters.varnone)."""
        return helpers.multifilter(str1, self.removevarfilter)

    @cache_results
    def filteraccelerators(self, str1):
        """Run str1 through the accelerator filters, with no accept list."""
        return helpers.multifilter(str1, self.accfilters, None)

    def filteraccelerators_by_list(self, str1, acceptlist=None):
        """Run str1 through the accelerator filters, passing acceptlist along.

        Unlike filteraccelerators this result is not cached.
        """
        return helpers.multifilter(str1, self.accfilters, acceptlist)

    @cache_results
    def filterwordswithpunctuation(self, str1):
        """replaces words with punctuation with their unpunctuated equivalents"""
        return prefilters.filterwordswithpunctuation(str1)

    @cache_results
    def filterxml(self, str1):
        """Strip every XML tag from str1 so only the text remains."""
        return tag_re.sub("", str1)

    def run_test(self, test, unit):
        """Call test with the unit and return its result.

        Note that this can raise a FilterFailure as part of normal operation"""
        return test(unit)

    def run_filters(self, unit):
        """Run all tests on unit; return a dict of test name -> failure message.

        The message is the FilterFailure text when the test raised one and
        the test's docstring otherwise.  Precondition tests run first; when
        one fails, the tests listed for it are skipped.  Unexpected
        exceptions go to self.errorhandler, or are re-raised as ValueError
        when no handler is set.
        """
        self.results_cache = {}
        failures = {}
        ignores = self.config.lang.ignoretests[:]
        remaining = [name for name in self.defaultfilters
                     if name not in self.preconditions]
        for functionname in list(self.preconditions) + remaining:
            if functionname in ignores:
                continue
            filterfunction = getattr(self, functionname, None)
            # this filterfunction may only be defined on another checker if using TeeChecker
            if filterfunction is None:
                continue
            filtermessage = filterfunction.__doc__
            try:
                filterresult = self.run_test(filterfunction, unit)
            except FilterFailure as e:
                filterresult = False
                filtermessage = e.args[0]
            except Exception as e:
                if self.errorhandler is None:
                    raise ValueError("error in filter %s: %r, %r, %s" %
                                     (functionname, unit.source, unit.target, e))
                filterresult = self.errorhandler(functionname, unit.source, unit.target, e)
            if filterresult:
                continue
            # we test some preconditions that aren't actually a cause for failure
            if functionname in self.defaultfilters:
                failures[functionname] = filtermessage
            if functionname in self.preconditions:
                ignores.extend(self.preconditions[functionname])
        self.results_cache = {}
        return failures
class TranslationChecker(UnitChecker):
    """A checker that passes source and target strings to the checks, not the
    whole unit.

    This provides some speedup and simplifies testing."""

    def __init__(self, checkerconfig=None, excludefilters=None, limitfilters=None, errorhandler=None):
        super(TranslationChecker, self).__init__(checkerconfig, excludefilters, limitfilters, errorhandler)

    def run_test(self, test, unit):
        """Run test on the strings cached by run_filters().

        For a plural unit the test is run once per entry of
        unit.target.strings against the cached source string.  If any of
        those runs raised FilterFailure, one FilterFailure carrying all the
        collected messages is raised; otherwise the result is False when
        any plural form failed and True when all passed.

        Note that this can raise a FilterFailure as part of normal operation.
        """
        if not self.hasplural:
            return test(self.str1, self.str2)
        filtermessages = []
        filterresult = True
        for pluralform in unit.target.strings:
            try:
                if not test(self.str1, pluralform):
                    filterresult = False
            except FilterFailure as e:
                filterresult = False
                # args[0] is the (unicode) message built by FilterFailure;
                # going through str() would fail on non-ASCII text.
                filtermessages.append(e.args[0])
        if filtermessages:
            raise FilterFailure(filtermessages)
        # A test that merely returned False (no exception) must still be
        # reported as a failure.
        return filterresult

    def run_filters(self, unit):
        """Cache the unicode source/target and the plural flag of unit for
        the benefit of run_test(), then run the filters as usual."""
        self.str1 = data.forceunicode(unit.source)
        self.str2 = data.forceunicode(unit.target)
        self.hasplural = unit.hasplural()
        return super(TranslationChecker, self).run_filters(unit)
class TeeChecker:
    """A Checker that fans out to several underlying checkers."""

    def __init__(self, checkerconfig=None, excludefilters=None, limitfilters=None,
                 checkerclasses=None, errorhandler=None, languagecode=None):
        """Build one checker per class in checkerclasses (default:
        StandardChecker), all sharing the same construction arguments."""
        self.limitfilters = limitfilters
        if checkerclasses is None:
            checkerclasses = [StandardChecker]
        checkers = []
        for checkerclass in checkerclasses:
            checkers.append(checkerclass(checkerconfig=checkerconfig,
                                         excludefilters=excludefilters,
                                         limitfilters=limitfilters,
                                         errorhandler=errorhandler))
        self.checkers = checkers
        if languagecode:
            for checker in self.checkers:
                checker.config.updatetargetlanguage(languagecode)
            # Let's hook up the language specific checker
            lang_checker = self.checkers[0].config.lang.checker
            if lang_checker:
                self.checkers.append(lang_checker)

        self.combinedfilters = self.getfilters(excludefilters, limitfilters)
        self.config = checkerconfig or self.checkers[0].config

    def getfilters(self, excludefilters=None, limitfilters=None):
        """Return (and store in self.combinedfilters) the union of the
        filters of all checkers, honouring the exclude/limit lists.

        A warning is written to stderr for every name in limitfilters that
        no checker provides.
        """
        if excludefilters is None:
            excludefilters = {}
        combined = {}
        for checker in self.checkers:
            combined.update(checker.getfilters(excludefilters, limitfilters))
        self.combinedfilters = combined
        # TODO: move this somewhere more sensible (a checkfilters method?)
        if limitfilters is not None:
            for filtername in limitfilters:
                if filtername not in self.combinedfilters:
                    import sys
                    sys.stderr.write("warning: could not find filter %s\n" % filtername)
        return self.combinedfilters

    def run_filters(self, unit):
        """Run the tests of every checker on unit and merge their failures."""
        failures = {}
        for checker in self.checkers:
            failures.update(checker.run_filters(unit))
        return failures

    def setsuggestionstore(self, store):
        """Pass the suggestion store on to every underlying checker."""
        for checker in self.checkers:
            checker.setsuggestionstore(store)
432 class StandardChecker(TranslationChecker):
433 """The basic test suite for source -> target translations."""
434 def untranslated(self, str1, str2):
435 """checks whether a string has been translated at all"""
436 str2 = prefilters.removekdecomments(str2)
437 return not (len(str1.strip()) > 0 and len(str2) == 0)
439 def unchanged(self, str1, str2):
440 """checks whether a translation is basically identical to the original string"""
441 str1 = self.filteraccelerators(str1)
442 str2 = self.filteraccelerators(str2)
443 if len(str1.strip()) == 0:
444 return True
445 if str1.isupper() and str1 == str2:
446 return True
447 if self.config.notranslatewords:
448 words1 = str1.split()
449 if len(words1) == 1 and [word for word in words1 if word in self.config.notranslatewords]:
450 return True
451 str1 = self.removevariables(str1)
452 str2 = self.removevariables(str2)
453 if not (str1.strip().isdigit() or len(str1) < 2 or decoration.ispurepunctuation(str1.strip())) and (str1.strip().lower() == str2.strip().lower()):
454 raise FilterFailure(u"please translate")
455 return True
457 def blank(self, str1, str2):
458 """checks whether a translation only contains spaces"""
459 len1 = len(str1.strip())
460 len2 = len(str2.strip())
461 return not (len1 > 0 and len(str2) != 0 and len2 == 0)
463 def short(self, str1, str2):
464 """checks whether a translation is much shorter than the original string"""
465 len1 = len(str1.strip())
466 len2 = len(str2.strip())
467 return not ((len1 > 0) and (0 < len2 < (len1 * 0.1)) or ((len1 > 1) and (len2 == 1)))
469 def long(self, str1, str2):
470 """checks whether a translation is much longer than the original string"""
471 len1 = len(str1.strip())
472 len2 = len(str2.strip())
473 return not ((len1 > 0) and (0 < len1 < (len2 * 0.1)) or ((len1 == 1) and (len2 > 1)))
475 def escapes(self, str1, str2):
476 """checks whether escaping is consistent between the two strings"""
477 if not helpers.countsmatch(str1, str2, ("\\", "\\\\")):
478 escapes1 = u", ".join([u"'%s'" % word for word in str1.split() if "\\" in word])
479 escapes2 = u", ".join([u"'%s'" % word for word in str2.split() if "\\" in word])
480 raise SeriousFilterFailure(u"escapes in original (%s) don't match escapes in translation (%s)" % (escapes1, escapes2))
481 else:
482 return True
484 def newlines(self, str1, str2):
485 """checks whether newlines are consistent between the two strings"""
486 if not helpers.countsmatch(str1, str2, ("\n", "\r")):
487 raise FilterFailure(u"line endings in original don't match line endings in translation")
488 else:
489 return True
491 def tabs(self, str1, str2):
492 """checks whether tabs are consistent between the two strings"""
493 if not helpers.countmatch(str1, str2, "\t"):
494 raise SeriousFilterFailure(u"tabs in original don't match tabs in translation")
495 else:
496 return True
499 def singlequoting(self, str1, str2):
500 """checks whether singlequoting is consistent between the two strings"""
501 str1 = self.filterwordswithpunctuation(self.filteraccelerators(self.filtervariables(str1)))
502 str2 = self.filterwordswithpunctuation(self.filteraccelerators(self.filtervariables(str2)))
503 return helpers.countsmatch(str1, str2, ("'", "''", "\\'"))
505 def doublequoting(self, str1, str2):
506 """checks whether doublequoting is consistent between the two strings"""
507 str1 = self.filteraccelerators(self.filtervariables(str1))
508 str1 = self.filterxml(str1)
509 str1 = self.config.lang.punctranslate(str1)
510 str2 = self.filteraccelerators(self.filtervariables(str2))
511 str2 = self.filterxml(str2)
512 return helpers.countsmatch(str1, str2, ('"', '""', '\\"', u"«", u"»"))
514 def doublespacing(self, str1, str2):
515 """checks for bad double-spaces by comparing to original"""
516 str1 = self.filteraccelerators(str1)
517 str2 = self.filteraccelerators(str2)
518 return helpers.countmatch(str1, str2, " ")
520 def puncspacing(self, str1, str2):
521 """checks for bad spacing after punctuation"""
522 if str1.find(u" ") == -1:
523 return True
524 str1 = self.filteraccelerators(self.filtervariables(str1))
525 str1 = self.config.lang.punctranslate(str1)
526 str2 = self.filteraccelerators(self.filtervariables(str2))
527 for puncchar in self.config.punctuation:
528 plaincount1 = str1.count(puncchar)
529 plaincount2 = str2.count(puncchar)
530 if not plaincount1 or plaincount1 != plaincount2:
531 continue
532 spacecount1 = str1.count(puncchar+" ")
533 spacecount2 = str2.count(puncchar+" ")
534 if spacecount1 != spacecount2:
535 # handle extra spaces that are because of transposed punctuation
536 if str1.endswith(puncchar) != str2.endswith(puncchar) and abs(spacecount1-spacecount2) == 1:
537 continue
538 return False
539 return True
541 def printf(self, str1, str2):
542 """checks whether printf format strings match"""
543 count1 = count2 = None
544 for var_num2, match2 in enumerate(printf_pat.finditer(str2)):
545 count2 = var_num2 + 1
546 if match2.group('ord'):
547 for var_num1, match1 in enumerate(printf_pat.finditer(str1)):
548 count1 = var_num1 + 1
549 if int(match2.group('ord')) == var_num1 + 1:
550 if match2.group('fullvar') != match1.group('fullvar'):
551 return 0
552 else:
553 for var_num1, match1 in enumerate(printf_pat.finditer(str1)):
554 count1 = var_num1 + 1
555 if (var_num1 == var_num2) and (match1.group('fullvar') != match2.group('fullvar')):
556 return 0
558 if count2 is None:
559 if list(printf_pat.finditer(str1)):
560 return 0
562 if (count1 or count2) and (count1 != count2):
563 return 0
564 return 1
566 def accelerators(self, str1, str2):
567 """checks whether accelerators are consistent between the two strings"""
568 str1 = self.filtervariables(str1)
569 str2 = self.filtervariables(str2)
570 messages = []
571 for accelmarker in self.config.accelmarkers:
572 counter1 = decoration.countaccelerators(accelmarker, self.config.sourcelang.validaccel)
573 counter2 = decoration.countaccelerators(accelmarker, self.config.lang.validaccel)
574 count1, countbad1 = counter1(str1)
575 count2, countbad2 = counter2(str2)
576 getaccel = decoration.getaccelerators(accelmarker, self.config.lang.validaccel)
577 accel2, bad2 = getaccel(str2)
578 if count1 == count2:
579 continue
580 if count1 == 1 and count2 == 0:
581 if countbad2 == 1:
582 messages.append("accelerator %s appears before an invalid accelerator character '%s' (eg. space)" % (accelmarker, bad2[0]))
583 else:
584 messages.append(u"accelerator %s is missing from translation" % accelmarker)
585 elif count1 == 0:
586 messages.append(u"accelerator %s does not occur in original and should not be in translation" % accelmarker)
587 elif count1 == 1 and count2 > count1:
588 messages.append("accelerator %s is repeated in translation" % accelmarker)
589 else:
590 messages.append("accelerator %s occurs %d time(s) in original and %d time(s) in translation" % (accelmarker, count1, count2))
591 if messages:
592 if "accelerators" in self.config.criticaltests:
593 raise SeriousFilterFailure(messages)
594 else:
595 raise FilterFailure(messages)
596 return True
598 # def acceleratedvariables(self, str1, str2):
599 # """checks that no variables are accelerated"""
600 # messages = []
601 # for accelerator in self.config.accelmarkers:
602 # for variablestart, variableend in self.config.varmatches:
603 # error = accelerator + variablestart
604 # if str1.find(error) >= 0:
605 # messages.append("original has an accelerated variable")
606 # if str2.find(error) >= 0:
607 # messages.append("translation has an accelerated variable")
608 # if messages:
609 # raise FilterFailure(messages)
610 # return True
612 def variables(self, str1, str2):
613 """checks whether variables of various forms are consistent between the two strings"""
614 messages = []
615 mismatch1, mismatch2 = [], []
616 varnames1, varnames2 = [], []
617 for startmarker, endmarker in self.config.varmatches:
618 varchecker = decoration.getvariables(startmarker, endmarker)
619 if startmarker and endmarker:
620 if isinstance(endmarker, int):
621 redecorate = lambda var: startmarker + var
622 else:
623 redecorate = lambda var: startmarker + var + endmarker
624 elif startmarker:
625 redecorate = lambda var: startmarker + var
626 else:
627 redecorate = lambda var: var
628 vars1 = varchecker(str1)
629 vars2 = varchecker(str2)
630 if vars1 != vars2:
631 # we use counts to compare so we can handle multiple variables
632 vars1, vars2 = [var for var in vars1 if vars1.count(var) > vars2.count(var)], [var for var in vars2 if vars1.count(var) < vars2.count(var)]
633 # filter variable names we've already seen, so they aren't matched by more than one filter...
634 vars1, vars2 = [var for var in vars1 if var not in varnames1], [var for var in vars2 if var not in varnames2]
635 varnames1.extend(vars1)
636 varnames2.extend(vars2)
637 vars1 = map(redecorate, vars1)
638 vars2 = map(redecorate, vars2)
639 mismatch1.extend(vars1)
640 mismatch2.extend(vars2)
641 if mismatch1:
642 messages.append("do not translate: %s" % ", ".join(mismatch1))
643 elif mismatch2:
644 messages.append("translation contains variables not in original: %s" % ", ".join(mismatch2))
645 if messages and mismatch1:
646 raise SeriousFilterFailure(messages)
647 elif messages:
648 raise FilterFailure(messages)
649 return True
651 def functions(self, str1, str2):
652 """checks that function names are not translated"""
653 return helpers.funcmatch(str1, str2, decoration.getfunctions, self.config.punctuation)
655 def emails(self, str1, str2):
656 """checks that emails are not translated"""
657 return helpers.funcmatch(str1, str2, decoration.getemails)
659 def urls(self, str1, str2):
660 """checks that URLs are not translated"""
661 return helpers.funcmatch(str1, str2, decoration.geturls)
663 def numbers(self, str1, str2):
664 """checks whether numbers of various forms are consistent between the two strings"""
665 return helpers.countsmatch(str1, str2, decoration.getnumbers(str1))
667 def startwhitespace(self, str1, str2):
668 """checks whether whitespace at the beginning of the strings matches"""
669 str1 = self.filteraccelerators(self.filtervariables(str1))
670 str2 = self.filteraccelerators(self.filtervariables(str2))
671 return helpers.funcmatch(str1, str2, decoration.spacestart)
673 def endwhitespace(self, str1, str2):
674 """checks whether whitespace at the end of the strings matches"""
675 str1 = self.filteraccelerators(self.filtervariables(str1))
676 str2 = self.filteraccelerators(self.filtervariables(str2))
677 return helpers.funcmatch(str1, str2, decoration.spaceend)
679 def startpunc(self, str1, str2):
680 """checks whether punctuation at the beginning of the strings match"""
681 str1 = self.filteraccelerators(self.filtervariables(self.filterwordswithpunctuation(str1)))
682 str1 = self.config.lang.punctranslate(str1)
683 str2 = self.filteraccelerators(self.filtervariables(self.filterwordswithpunctuation(str2)))
684 return helpers.funcmatch(str1, str2, decoration.puncstart, self.config.punctuation)
686 def endpunc(self, str1, str2):
687 """checks whether punctuation at the end of the strings match"""
688 str1 = self.filteraccelerators(self.filtervariables(self.filterwordswithpunctuation(str1)))
689 str1 = self.config.lang.punctranslate(str1)
690 str2 = self.filteraccelerators(self.filtervariables(self.filterwordswithpunctuation(str2)))
691 return helpers.funcmatch(str1, str2, decoration.puncend, self.config.endpunctuation)
693 def purepunc(self, str1, str2):
694 """checks that strings that are purely punctuation are not changed"""
695 # this test is a subset of startandend
696 if (decoration.ispurepunctuation(str1)):
697 return str1 == str2
698 else:
699 return not decoration.ispurepunctuation(str2)
701 def brackets(self, str1, str2):
702 """checks that the number of brackets in both strings match"""
703 str1 = self.filtervariables(str1)
704 str2 = self.filtervariables(str2)
705 messages = []
706 missing = []
707 extra = []
708 for bracket in ("[", "]", "{", "}", "(", ")"):
709 count1 = str1.count(bracket)
710 count2 = str2.count(bracket)
711 if count2 < count1:
712 missing.append("'%s'" % bracket)
713 elif count2 > count1:
714 extra.append("'%s'" % bracket)
715 if missing:
716 messages.append(u"translation is missing %s" % ", ".join(missing))
717 if extra:
718 messages.append(u"translation has extra %s" % ", ".join(extra))
719 if messages:
720 raise FilterFailure(messages)
721 return True
723 def sentencecount(self, str1, str2):
724 """checks that the number of sentences in both strings match"""
725 sentences1 = len(self.config.sourcelang.sentences(str1))
726 sentences2 = len(self.config.lang.sentences(str2))
727 if not sentences1 == sentences2:
728 raise FilterFailure(u"The number of sentences differ: %d versus %d" % (sentences1, sentences2))
729 return True
731 def options(self, str1, str2):
732 """checks that options are not translated"""
733 str1 = self.filtervariables(str1)
734 for word1 in str1.split():
735 if word1 != "--" and word1.startswith("--") and word1[-1].isalnum():
736 parts = word1.split("=")
737 if not parts[0] in str2:
738 raise FilterFailure("The option %s does not occur or is translated in the translation." % parts[0])
739 if len(parts) > 1 and parts[1] in str2:
740 raise FilterFailure("The parameter %(param)s in option %(option)s is not translated." % {"param": parts[0], "option": parts[1]})
741 return True
743 def startcaps(self, str1, str2):
744 """checks that the message starts with the correct capitalisation"""
745 str1 = self.filteraccelerators(str1)
746 str2 = self.filteraccelerators(str2)
747 if len(str1) > 1 and len(str2) > 1:
748 return self.config.sourcelang.capsstart(str1) == self.config.lang.capsstart(str2)
749 if len(str1) == 0 and len(str2) == 0:
750 return True
751 if len(str1) == 0 or len(str2) == 0:
752 return False
753 return True
def simplecaps(self, str1, str2):
    """checks the capitalisation of two strings isn't wildly different"""
    source = self.removevariables(str1)
    target = self.removevariables(str2)
    # TODO: review this. The 'I' is specific to English, so it probably serves
    # no purpose to get sourcelang.sentenceend
    source = re.sub(u"[^%s]( I )" % self.config.sourcelang.sentenceend, " i ", source)
    upper1 = helpers.filtercount(source, type(source).isupper)
    upper2 = helpers.filtercount(target, type(target).isupper)
    letters1 = helpers.filtercount(source, type(source).isalpha)
    letters2 = helpers.filtercount(target, type(target).isalpha)
    # Capture the all caps case
    if upper1 == letters1:
        return upper2 == letters2
    # some heuristic tests to try and see that the style of capitals is
    # vaguely the same; the first rule that applies decides the result
    if upper1 in (0, 1):
        return upper2 == upper1
    if upper1 < len(source) / 10:
        return upper2 < len(target) / 8
    if len(source) < 10:
        return abs(upper1 - upper2) < 3
    if upper1 > len(source) * 6 / 10:
        return upper2 > len(target) * 6 / 10
    return abs(upper1 - upper2) < (len(source) + len(target)) / 6
def acronyms(self, str1, str2):
    """checks that acronyms that appear are unchanged"""
    # Words that may legitimately differ: variables found in the raw source
    # string plus the configured must-translate words.
    allowed = []
    for startmatch, endmatch in self.config.varmatches:
        allowed.extend(decoration.getvariables(startmatch, endmatch)(str1))
    allowed.extend(self.config.musttranslatewords.keys())
    source = self.filteraccelerators(self.filtervariables(str1))
    sourcewords = self.config.lang.word_iter(source)
    target = self.filteraccelerators(self.filtervariables(str2))
    #TODO: strip XML? - should provide better error messsages
    # see mail/chrome/messanger/smime.properties.po
    #TODO: consider limiting the word length for recognising acronyms to
    #something like 5/6 characters
    # An acronym is an all-uppercase word longer than one character; it is
    # reported when it cannot be found anywhere in the translation.
    missing = [word for word in sourcewords
               if word.isupper() and len(word) > 1
               and word not in allowed and word not in target]
    if missing:
        raise FilterFailure("acronyms should not be translated: " + ", ".join(missing))
    return True
def doublewords(self, str1, str2):
    """checks for repeated words in the translation"""
    # NOTE(review): splitting and re-joining on "\n" leaves a plain string
    # unchanged; kept as in the original in case str2 is a richer type.
    rejoined = "\n".join(str2.split("\n"))
    cleaned = self.filteraccelerators(self.removevariables(rejoined))
    # Full stops are dropped and case is ignored before comparing neighbours.
    words = cleaned.replace(".", "").lower().split()
    for previous, current in zip(words, words[1:]):
        if previous == current:
            raise FilterFailure("The word '%s' is repeated" % current)
    return True
def notranslatewords(self, str1, str2):
    """checks that words configured as untranslatable appear in the translation too"""
    protected = self.config.notranslatewords
    if not protected:
        return True
    source = self.filtervariables(str1)
    target = self.filtervariables(str2)
    # Every configured punctuation mark becomes a space so that it does not
    # stay glued to a word.
    # NOTE: a single apostrophe is perhaps problematic in words like "doesn't"
    for mark in self.config.punctuation:
        source = source.replace(mark, u" ")
        target = target.replace(mark, u" ")
    sourcewords = self.filteraccelerators(source).split()
    targetwords = self.filteraccelerators(target).split()
    # Protected source words that are absent from the translation
    stopwords = [word for word in sourcewords
                 if word in protected and word not in targetwords]
    if stopwords:
        raise FilterFailure("do not translate: %s" % (", ".join(stopwords)))
    return True
def musttranslatewords(self, str1, str2):
    """checks that words configured as definitely translatable don't appear in
    the translation"""
    required = self.config.musttranslatewords
    if not required:
        return True
    source = self.removevariables(str1)
    target = self.removevariables(str2)
    # Every configured punctuation mark becomes a space so that it does not
    # stay glued to a word.
    # NOTE: a single apostrophe is perhaps problematic in words like "doesn't"
    for mark in self.config.punctuation:
        source = source.replace(mark, " ")
        target = target.replace(mark, " ")
    sourcewords = self.filteraccelerators(source).split()
    targetwords = self.filteraccelerators(target).split()
    # Must-translate source words that still show up in the translation
    stopwords = [word for word in sourcewords
                 if word in required and word in targetwords]
    if stopwords:
        raise FilterFailure("please translate: %s" % (", ".join(stopwords)))
    return True
def validchars(self, str1, str2):
    """checks that only characters specified as valid appear in the translation"""
    charmap = self.config.validcharsmap
    if not charmap:
        return True
    # What remains after translating through the map is treated as the set
    # of invalid characters of each string.
    sourceleft = str1.translate(charmap)
    targetleft = str2.translate(charmap)
    # Invalid characters that already occur in the source are tolerated.
    offending = [u"'%s' (\\u%04x)" % (char, ord(char))
                 for char in targetleft if char not in sourceleft]
    if offending:
        raise FilterFailure(u"invalid chars: %s" % (u", ".join(offending)))
    return True
def filepaths(self, str1, str2):
    """checks that file paths have not been translated"""
    # A word starting with "/" is treated as a path; helpers.countsmatch is
    # then asked to compare the original strings for that word.
    for candidate in self.filteraccelerators(str1).split():
        if candidate.startswith("/") and not helpers.countsmatch(str1, str2, (candidate,)):
            return False
    return True
def xmltags(self, str1, str2):
    """checks that XML/HTML tags have not been translated"""
    sourcetags = tag_re.findall(str1)
    targettags = tag_re.findall(str2)
    if len(sourcetags) == 0:
        # No tags in str1, let's just check that none were added in str2. This
        # might be useful for fuzzy strings wrongly unfuzzied, for example.
        return len(targettags) == 0
    # A source whose first tag is as long as the whole string and carries no
    # "=" is accepted without looking at the translation.
    firsttag = sourcetags[0]
    if len(firsttag) == len(str1) and "=" not in firsttag:
        return True
    sourceproperties = tagproperties(sourcetags, self.config.ignoretags)
    targetproperties = tagproperties(targettags, self.config.ignoretags)
    sourcefiltered = [intuplelist(prop, self.config.canchangetags) for prop in sourceproperties]
    targetfiltered = [intuplelist(prop, self.config.canchangetags) for prop in targetproperties]
    #TODO: consider the consequences of different ordering of attributes/tags
    return sourcefiltered == targetfiltered
def kdecomments(self, str1, str2):
    """checks to ensure that no KDE style comments appear in the translation"""
    # Fails when "_:" opens the translation or opens any later line of it.
    return not (str2.startswith("_:") or "\n_:" in str2)
def compendiumconflicts(self, str1, str2):
    """checks for Gettext compendium conflicts (#-#-#-#-#)"""
    # Passes only when the conflict marker is absent from the translation.
    return "#-#-#-#-#" not in str2
def simpleplurals(self, str1, str2):
    """checks for English style plural(s) for you to review"""
    def occurrences(text, patterns):
        # total number of matches of all patterns in text
        return sum(len(re.findall(pattern, text)) for pattern in patterns)

    sourcepatterns = [r"\(s\)"]
    targetpatterns = [r"\(s\)"]
    sourcecount = occurrences(str1, sourcepatterns)
    targetcount = occurrences(str2, targetpatterns)
    # A target language with a single plural form should not contain the
    # marker at all; otherwise the counts on both sides must agree.
    if self.config.lang.nplurals == 1:
        return not targetcount
    return sourcecount == targetcount
def spellcheck(self, str1, str2):
    """checks words that don't pass a spell check"""
    # Nothing to do without a target language or a usable spell checker.
    if not self.config.targetlanguage or not spelling.available:
        return True
    source = self.filteraccelerators_by_list(self.filtervariables(str1), self.config.sourcelang.validaccel)
    target = self.filteraccelerators_by_list(self.filtervariables(str2), self.config.lang.validaccel)
    # Words already flagged in the source (checked with lang="en") are not
    # reported again for the translation.
    sourceflagged = [word for word, index, suggestions in spelling.check(source, lang="en")]
    messages = []
    for word, index, suggestions in spelling.check(target, lang=self.config.targetlanguage):
        # "word in suggestions" is a hack to ignore hyphenisation rules
        if word in self.config.notranslatewords or word in sourceflagged or word in suggestions:
            continue
        messages.append(u"check spelling of %s (could be %s)" % (word, u" / ".join(suggestions)))
    if messages:
        raise FilterFailure(messages)
    return True
def credits(self, str1, str2):
    """checks for messages containing translation credits instead of normal translations."""
    # Fails when the source string is one of the configured credit strings.
    return str1 not in self.config.credit_sources
# If the precondition filter is run and fails then the other tests listed are ignored
# Each key names a precondition test; its tuple names the tests it suppresses.
preconditions = {
    "untranslated": (
        "simplecaps", "variables", "startcaps", "accelerators", "brackets",
        "endpunc", "acronyms", "xmltags", "startpunc", "endwhitespace",
        "startwhitespace", "escapes", "doublequoting", "singlequoting",
        "filepaths", "purepunc", "doublespacing", "sentencecount", "numbers",
        "isfuzzy", "isreview", "notranslatewords", "musttranslatewords",
        "emails", "simpleplurals", "urls", "printf", "tabs", "newlines",
        "functions", "options", "blank", "nplurals",
    ),
    "blank": (
        "simplecaps", "variables", "startcaps", "accelerators", "brackets",
        "endpunc", "acronyms", "xmltags", "startpunc", "endwhitespace",
        "startwhitespace", "escapes", "doublequoting", "singlequoting",
        "filepaths", "purepunc", "doublespacing", "sentencecount", "numbers",
        "isfuzzy", "isreview", "notranslatewords", "musttranslatewords",
        "emails", "simpleplurals", "urls", "printf", "tabs", "newlines",
        "functions", "options",
    ),
    "credits": (
        "simplecaps", "variables", "startcaps", "accelerators", "brackets",
        "endpunc", "acronyms", "xmltags", "startpunc", "escapes",
        "doublequoting", "singlequoting", "filepaths", "doublespacing",
        "sentencecount", "numbers", "emails", "simpleplurals", "urls",
        "printf", "tabs", "newlines", "functions", "options",
    ),
    "purepunc": ("startcaps", "options"),
    "startcaps": ("simplecaps",),
    "endwhitespace": ("endpunc",),
    "startwhitespace": ("startpunc",),
    "unchanged": ("doublewords",),
    "compendiumconflicts": (
        "accelerators", "brackets", "escapes", "numbers", "startpunc",
        "long", "variables", "startcaps", "sentencecount", "simplecaps",
        "doublespacing", "endpunc", "xmltags", "startwhitespace",
        "endwhitespace", "singlequoting", "doublequoting", "filepaths",
        "purepunc", "doublewords", "printf",
    ),
}
993 # code to actually run the tests (use unittest?)
# Checker settings merged into the configuration of every OpenOfficeChecker.
openofficeconfig = CheckerConfig(
    accelmarkers=["~"],
    varmatches=[
        ("&", ";"),
        ("%", "%"),
        ("%", None),
        ("%", 0),
        ("$(", ")"),
        ("$", "$"),
        ("${", "}"),
        ("#", "#"),
        ("#", 1),
        ("#", 0),
        ("($", ")"),
        ("$[", "]"),
        ("[", "]"),
        ("$", None),
    ],
    ignoretags=[
        ("alt", "xml-lang", None),
        ("ahelp", "visibility", "visible"),
        ("img", "width", None),
        ("img", "height", None),
    ],
    canchangetags=[("link", "name", None)],
)
class OpenOfficeChecker(StandardChecker):
    """StandardChecker whose configuration is updated with openofficeconfig."""

    def __init__(self, **kwargs):
        # Reuse the caller's checkerconfig when one is given; otherwise
        # create one and hand it on through kwargs.
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(openofficeconfig)
        StandardChecker.__init__(self, **kwargs)
# Checker settings merged into the configuration of every MozillaChecker.
mozillaconfig = CheckerConfig(
    accelmarkers=["&"],
    varmatches=[
        ("&", ";"),
        ("%", "%"),
        ("%", 1),
        ("$", "$"),
        ("$", None),
        ("#", 1),
        ("${", "}"),
        ("$(^", ")"),
    ],
    criticaltests=["accelerators"],
)
class MozillaChecker(StandardChecker):
    """StandardChecker whose configuration is updated with mozillaconfig."""

    def __init__(self, **kwargs):
        # Reuse the caller's checkerconfig when one is given; otherwise
        # create one and hand it on through kwargs.
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(mozillaconfig)
        StandardChecker.__init__(self, **kwargs)
# Checker settings merged into the configuration of every GnomeChecker.
gnomeconfig = CheckerConfig(
    accelmarkers=["_"],
    varmatches=[
        ("%", 1),
        ("$(", ")"),
    ],
    credit_sources=[u"translator-credits"],
)
class GnomeChecker(StandardChecker):
    """StandardChecker whose configuration is updated with gnomeconfig."""

    def __init__(self, **kwargs):
        # Reuse the caller's checkerconfig when one is given; otherwise
        # create one and hand it on through kwargs.
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(gnomeconfig)
        StandardChecker.__init__(self, **kwargs)
# Checker settings merged into the configuration of every KdeChecker.
kdeconfig = CheckerConfig(
    accelmarkers=["&"],
    varmatches=[("%", 1)],
    credit_sources=[
        u"Your names",
        u"Your emails",
        u"ROLES_OF_TRANSLATORS",
    ],
)
class KdeChecker(StandardChecker):
    """StandardChecker whose configuration is updated with kdeconfig."""

    def __init__(self, **kwargs):
        # TODO allow setup of KDE plural and translator comments so that they do
        # not create false positives
        # Reuse the caller's checkerconfig when one is given; otherwise
        # create one and hand it on through kwargs.
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(kdeconfig)
        StandardChecker.__init__(self, **kwargs)
# Checker settings merged into the configuration of every CCLicenseChecker.
cclicenseconfig = CheckerConfig(
    varmatches=[("@", "@")],
)
class CCLicenseChecker(StandardChecker):
    """StandardChecker whose configuration is updated with cclicenseconfig."""

    def __init__(self, **kwargs):
        # Reuse the caller's checkerconfig when one is given; otherwise
        # create one and hand it on through kwargs.
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(cclicenseconfig)
        StandardChecker.__init__(self, **kwargs)
# Maps a project name to the checker class used for it.
projectcheckers = {
    "openoffice": OpenOfficeChecker,
    "mozilla": MozillaChecker,
    "kde": KdeChecker,
    # "wx" is served by the same checker class as "kde"
    "wx": KdeChecker,
    "gnome": GnomeChecker,
    "creativecommons": CCLicenseChecker,
}
class StandardUnitChecker(UnitChecker):
    """The standard checks for common checks on translation units."""

    def isfuzzy(self, unit):
        """Check if the unit has been marked fuzzy."""
        fuzzy = unit.isfuzzy()
        return not fuzzy

    def isreview(self, unit):
        """Check if the unit has been marked review."""
        needsreview = unit.isreview()
        return not needsreview

    def nplurals(self, unit):
        """Checks for the correct number of noun forms for plural translations."""
        if not unit.hasplural():
            return True
        # if we don't have a valid nplurals value, don't run the test
        expected = self.config.lang.nplurals
        if expected > 0:
            return len(unit.target.strings) == expected
        return True

    def hassuggestion(self, unit):
        """Checks if there is at least one suggested translation for this unit."""
        # Make sure the attribute exists even if nobody set a store.
        self.suggestion_store = getattr(self, 'suggestion_store', None)
        suggestions = []
        if self.suggestion_store:
            # Suggestions are the store's units sharing this unit's source.
            source = unit.source
            suggestions = [candidate for candidate in self.suggestion_store.units
                           if candidate.source == source]
        elif xliff and isinstance(unit, xliff.xliffunit):
            # TODO: we probably want to filter them somehow
            suggestions = unit.getalttrans()
        # The check fails (returns False) when any suggestion was found.
        return not bool(suggestions)
def runtests(str1, str2, ignorelist=()):
    """verifies that the tests pass for a pair of strings

    Both strings are forced to unicode, wrapped in a translation unit and
    run through a StandardChecker that excludes the filters in ignorelist.
    Each failure is printed, and the failures are returned as produced by
    the checker's run_filters.
    """
    from translate.storage import base
    source = data.forceunicode(str1)
    target = data.forceunicode(str2)
    unit = base.TranslationUnit(source)
    unit.target = target
    failures = StandardChecker(excludefilters=ignorelist).run_filters(unit)
    for testname, message in failures:
        print("failure: %s: %s\n %r\n %r" % (testname, message, source, target))
    return failures
def batchruntests(pairs):
    """runs test on a batch of string pairs

    Every (source, target) pair in pairs is passed to runtests, which prints
    any failures itself. A blank line and a "passed/total" summary are
    printed afterwards. Nothing is returned.
    """
    passed, numpairs = 0, len(pairs)
    for str1, str2 in pairs:
        # runtests returns the failures it found, so a pair has passed only
        # when that result is empty (the condition used to be inverted).
        if not runtests(str1, str2):
            passed += 1
    print("")
    print("total: %d/%d pairs passed" % (passed, numpairs))
if __name__ == '__main__':
    # Sample (source, translation) pairs, run through the standard checks
    # when this module is executed as a script.
    testset = [
        (r"simple", r"somple"),
        (r"\this equals \that", r"does \this equal \that?"),
        (r"this \'equals\' that", r"this 'equals' that"),
        (r" start and end! they must match.", r"start and end! they must match."),
        (r"check for matching %variables marked like %this", r"%this %variable is marked"),
        (r"check for mismatching %variables marked like %this", r"%that %variable is marked"),
        (r"check for mismatching %variables% too", r"how many %variable% are marked"),
        (r"%% %%", r"%%"),
        (r"Row: %1, Column: %2", r"Mothalo: %1, Kholomo: %2"),
        (r"simple lowercase", r"it is all lowercase"),
        (r"simple lowercase", r"It Is All Lowercase"),
        (r"Simple First Letter Capitals", r"First Letters"),
        (r"SIMPLE CAPITALS", r"First Letters"),
        (r"SIMPLE CAPITALS", r"ALL CAPITALS"),
        (r"forgot to translate", r" "),
    ]
    batchruntests(testset)