2 # -*- coding: utf-8 -*-
4 # Copyright 2004-2008 Zuza Software Foundation
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """This is a set of validation checks that can be performed on translation
25 Derivatives of UnitChecker (like StandardUnitChecker) check translation units,
26 and derivatives of TranslationChecker (like StandardChecker) check
27 (source, target) translation pairs.
29 When adding a new test here, please document and explain the behaviour on the
30 U{wiki <http://translate.sourceforge.net/wiki/toolkit/pofilter_tests>}.
33 from translate
.filters
import helpers
34 from translate
.filters
import decoration
35 from translate
.filters
import prefilters
36 from translate
.filters
import spelling
37 from translate
.lang
import factory
38 from translate
.lang
import data
39 # The import of xliff could fail if the user doesn't have lxml installed. For
40 # now we try to continue gracefully to help users who aren't interested in
41 # support for XLIFF or other XML formats.
43 from translate
.storage
import xliff
44 except ImportError, e
:
# These are some regular expressions that are compiled for use in some tests

# printf syntax based on http://en.wikipedia.org/wiki/Printf which doesn't
# cover everything; we leave \w instead of specifying the exact letters as
# this should capture printf types defined in other platforms.
# The length modifiers (hh, h, l, ll) must be a real alternation: with the
# pipes escaped they only matched the literal text "hh|h|l|ll", so "%ld" was
# parsed as type "l" and "%ld" / "%lu" looked identical.
# Named groups: "ord" (positional index before "$"), "fullvar" (flags, width,
# precision, length modifier and type) and "type" (the conversion character).
printf_pat = re.compile(
    r'%((?:(?P<ord>\d+)\$)*'
    r'(?P<fullvar>[+#-]*(?:\d+)*(?:\.\d+)*(hh|h|ll|l)*(?P<type>[\w%])))')

# The name of the XML tag
tagname_re = re.compile(r"<[\s]*([\w\/]*)")

# We allow escaped quotes, probably for old escaping style of OOo helpcontent
#TODO: remove escaped strings once usage is audited
property_re = re.compile(r''' (\w*)=((\\?".*?\\?")|(\\?'.*?\\?'))''')

# A whole tag: everything from "<" up to and including the next ">"
tag_re = re.compile(r"<[^>]+>")
65 """Returns the name of the XML/HTML tag in string"""
66 return tagname_re
.match(string
).groups(1)[0]
68 def intuplelist(pair
, list):
69 """Tests to see if pair == (a,b,c) is in list, but handles None entries in
70 list as wildcards (only allowed in positions "a" and "c"). We take a shortcut
71 by only considering "c" if "b" has already matched."""
73 if (b
, c
) == (None, None):
78 if (x
, y
) in [(a
, b
), (None, b
)]:
83 def tagproperties(strings
, ignore
):
84 """Returns all the properties in the XML/HTML tag string as
85 (tagname, propertyname, propertyvalue), but ignore those combinations
86 specified in ignore."""
88 for string
in strings
:
90 properties
+= [(tag
, None, None)]
91 #Now we isolate the attribute pairs.
92 pairs
= property_re
.findall(string
)
93 for property, value
, a
, b
in pairs
:
98 if (tag
, property, value
) in ignore
or \
99 intuplelist((tag
,property,value
), ignore
) != (tag
,property,value
):
103 properties
+= [(tag
, property, value
)]
class FilterFailure(Exception):
    """Raised by a filter to signal that it did not pass.

    The exception message is built from one or more explanations of the
    failure, joined with ", ".
    """

    def __init__(self, messages):
        """
        @param messages: a single unicode message, or a list of them
        """
        # A lone message is treated as a one-item list
        message_list = messages if isinstance(messages, list) else [messages]
        first_message = message_list[0]
        assert isinstance(first_message, unicode) # Assumption: all of same type
        combined = u", ".join(message_list)
        Exception.__init__(self, combined)
class SeriousFilterFailure(FilterFailure):
    """A L{FilterFailure} for a bad translation that might break an
    application (so the string will be marked fuzzy)."""
    pass
# Each entry is a (tag, attribute, value) tuple. It specifies a certain
# attribute which can be changed/ignored if it exists inside tag. The third
# element, when given, indicates a property value that can be ignored if
# present (like defaults, for example).
# An item that is None is relevant for all values of the property/tag that is
# specified as None. A non-None "value" indicates that the value of the
# attribute must be taken into account.
common_ignoretags = [
    (None, "xml-lang", None),
]
common_canchangetags = [
    ("img", "alt", None),
]
131 class CheckerConfig(object):
132 """object representing the configuration of a checker"""
def __init__(self, targetlanguage=None, accelmarkers=None, varmatches=None,
             notranslatewords=None, musttranslatewords=None, validchars=None,
             punctuation=None, endpunctuation=None, ignoretags=None,
             canchangetags=None, criticaltests=None, credit_sources=None):
    """builds the configuration, filling in defaults for what is not given

    Every argument is optional. List settings are passed through
    L{_init_list}; punctuation and tag settings are passed through
    L{_init_default} together with their fallback value.
    """
    # Settings that are plain lists
    list_settings = (
        ("accelmarkers", accelmarkers),
        ("varmatches", varmatches),
        ("criticaltests", criticaltests),
        ("credit_sources", credit_sources),
    )
    for attribute, supplied in list_settings:
        setattr(self, attribute, self._init_list(supplied))

    # Language data. updatetargetlanguage() sets self.lang, which supplies
    # the punctuation defaults below; the source language is always 'en'.
    self.targetlanguage = targetlanguage
    self.updatetargetlanguage(targetlanguage)
    self.sourcelang = factory.getlanguage('en')

    # Settings with a fallback value
    self.punctuation = self._init_default(
        data.forceunicode(punctuation), self.lang.punctuation)
    self.endpunctuation = self._init_default(
        data.forceunicode(endpunctuation), self.lang.sentenceend)
    self.ignoretags = self._init_default(ignoretags, common_ignoretags)
    self.canchangetags = self._init_default(canchangetags, common_canchangetags)

    # TODO: allow user configuration of untranslatable words
    def as_word_dict(words):
        # The words become the keys of a dictionary (all values are None)
        unicode_words = [data.forceunicode(word) for word in self._init_list(words)]
        return dict.fromkeys(unicode_words)

    self.notranslatewords = as_word_dict(notranslatewords)
    self.musttranslatewords = as_word_dict(musttranslatewords)

    # The map starts out empty and is filled by updatevalidchars()
    unicode_validchars = data.forceunicode(validchars)
    self.validcharsmap = {}
    self.updatevalidchars(unicode_validchars)
159 def _init_list(self
, list):
160 """initialise configuration paramaters that are lists
@param list: None (we'll initialise a blank list) or a list parameter
170 def _init_default(self
, param
, default
):
171 """initialise parameters that can have default options
@param param: the user supplied parameter value
174 @param default: default values when param is not specified
@return: the parameter as specified by the user, or the default settings
def update(self, otherconfig):
    """merges the settings held by otherconfig into this config object

    @param otherconfig: the configuration object to merge in
    """
    # The other config's target language wins if it has one
    self.targetlanguage = otherconfig.targetlanguage or self.targetlanguage
    self.updatetargetlanguage(self.targetlanguage)

    # Only add accelerator markers we do not already know about
    unseen_markers = [marker for marker in otherconfig.accelmarkers
                      if marker not in self.accelmarkers]
    self.accelmarkers.extend(unseen_markers)
    self.varmatches.extend(otherconfig.varmatches)

    # Dictionary settings are merged key by key
    for attribute in ("notranslatewords", "musttranslatewords", "validcharsmap"):
        getattr(self, attribute).update(getattr(otherconfig, attribute))

    self.punctuation += otherconfig.punctuation
    self.endpunctuation += otherconfig.endpunctuation

    #TODO: consider also updating in the following cases:
    # (these settings are replaced by the other config's values)
    self.ignoretags = otherconfig.ignoretags
    self.canchangetags = otherconfig.canchangetags
    self.criticaltests.extend(otherconfig.criticaltests)
    self.credit_sources = otherconfig.credit_sources
198 def updatevalidchars(self
, validchars
):
199 """updates the map that eliminates valid characters"""
200 if validchars
is None:
202 validcharsmap
= dict([(ord(validchar
), None) for validchar
in data
.forceunicode(validchars
)])
203 self
.validcharsmap
.update(validcharsmap
)
def updatetargetlanguage(self, langcode):
    """Points self.lang at the language object for the given language code

    @param langcode: the code handed to factory.getlanguage()
    """
    language = factory.getlanguage(langcode)
    self.lang = language
209 def cache_results(f
):
210 def cached_f(self
, param1
):
211 key
= (f
.__name
__, param1
)
212 res_cache
= self
.results_cache
214 return res_cache
[key
]
216 value
= f(self
, param1
)
217 res_cache
[key
] = value
221 class UnitChecker(object):
222 """Parent Checker class which does the checking based on functions available
223 in derived classes."""
226 def __init__(self
, checkerconfig
=None, excludefilters
=None, limitfilters
=None, errorhandler
=None):
227 self
.errorhandler
= errorhandler
228 if checkerconfig
is None:
229 self
.setconfig(CheckerConfig())
231 self
.setconfig(checkerconfig
)
232 # exclude functions defined in UnitChecker from being treated as tests...
233 self
.helperfunctions
= {}
234 for functionname
in dir(UnitChecker
):
235 function
= getattr(self
, functionname
)
236 if callable(function
):
237 self
.helperfunctions
[functionname
] = function
238 self
.defaultfilters
= self
.getfilters(excludefilters
, limitfilters
)
240 self
.results_cache
= {}
242 def getfilters(self
, excludefilters
=None, limitfilters
=None):
243 """returns dictionary of available filters, including/excluding those in
246 if limitfilters
is None:
247 # use everything available unless instructed
248 limitfilters
= dir(self
)
249 if excludefilters
is None:
251 for functionname
in limitfilters
:
252 if functionname
in excludefilters
: continue
253 if functionname
in self
.helperfunctions
: continue
254 if functionname
== "errorhandler": continue
255 filterfunction
= getattr(self
, functionname
, None)
256 if not callable(filterfunction
): continue
257 filters
[functionname
] = filterfunction
260 def setconfig(self
, config
):
261 """sets the accelerator list"""
263 self
.accfilters
= [prefilters
.filteraccelerators(accelmarker
) for accelmarker
in self
.config
.accelmarkers
]
264 self
.varfilters
= [prefilters
.filtervariables(startmatch
, endmatch
, prefilters
.varname
)
265 for startmatch
, endmatch
in self
.config
.varmatches
]
266 self
.removevarfilter
= [prefilters
.filtervariables(startmatch
, endmatch
, prefilters
.varnone
)
267 for startmatch
, endmatch
in self
.config
.varmatches
]
def setsuggestionstore(self, store):
    """Remembers the store that the checker should use for evaluating
    suggestions.

    @param store: kept as self.suggestion_store
    """
    setattr(self, "suggestion_store", store)
def filtervariables(self, str1):
    """runs str1 through the variable filters in self.varfilters

    @param str1: the string to filter
    @return: whatever helpers.multifilter() returns for str1
    """
    variable_filters = self.varfilters
    return helpers.multifilter(str1, variable_filters)
# Results are remembered in self.results_cache by cache_results
filtervariables = cache_results(filtervariables)
def removevariables(self, str1):
    """runs str1 through the filters in self.removevarfilter to remove
    variables

    @param str1: the string to filter
    @return: whatever helpers.multifilter() returns for str1
    """
    removal_filters = self.removevarfilter
    return helpers.multifilter(str1, removal_filters)
# Results are remembered in self.results_cache by cache_results
removevariables = cache_results(removevariables)
def filteraccelerators(self, str1):
    """runs str1 through the accelerator filters in self.accfilters

    No accept list is used: None is passed as the third argument of
    helpers.multifilter().

    @param str1: the string to filter
    """
    accelerator_filters = self.accfilters
    return helpers.multifilter(str1, accelerator_filters, None)
# Results are remembered in self.results_cache by cache_results
filteraccelerators = cache_results(filteraccelerators)
def filteraccelerators_by_list(self, str1, acceptlist=None):
    """runs str1 through the accelerator filters, handing acceptlist on to
    helpers.multifilter()

    Unlike L{filteraccelerators}, the result is not cached.

    @param str1: the string to filter
    @param acceptlist: passed unchanged as the third argument of
    helpers.multifilter() (presumably the accelerator characters to accept;
    confirm against the helpers module)
    """
    accelerator_filters = self.accfilters
    return helpers.multifilter(str1, accelerator_filters, acceptlist)
def filterwordswithpunctuation(self, str1):
    """swaps words containing punctuation for their unpunctuated equivalents

    @param str1: the string to filter
    @return: the result of prefilters.filterwordswithpunctuation()
    """
    unpunctuated = prefilters.filterwordswithpunctuation(str1)
    return unpunctuated
# Results are remembered in self.results_cache by cache_results
filterwordswithpunctuation = cache_results(filterwordswithpunctuation)
def filterxml(self, str1):
    """strips everything that tag_re matches, so that only text remains

    @param str1: the string to filter
    """
    text_only = tag_re.sub("", str1)
    return text_only
# Results are remembered in self.results_cache by cache_results
filterxml = cache_results(filterxml)
302 def run_test(self
, test
, unit
):
303 """Runs the given test on the given unit.
305 Note that this can raise a FilterFailure as part of normal operation"""
308 def run_filters(self
, unit
):
309 """run all the tests in this suite, return failures as testname, message_or_exception"""
310 self
.results_cache
= {}
312 ignores
= self
.config
.lang
.ignoretests
[:]
313 functionnames
= self
.defaultfilters
.keys()
314 priorityfunctionnames
= self
.preconditions
.keys()
315 otherfunctionnames
= filter(lambda functionname
: functionname
not in self
.preconditions
, functionnames
)
316 for functionname
in priorityfunctionnames
+ otherfunctionnames
:
317 if functionname
in ignores
:
319 filterfunction
= getattr(self
, functionname
, None)
320 # this filterfunction may only be defined on another checker if using TeeChecker
321 if filterfunction
is None:
323 filtermessage
= filterfunction
.__doc
__
325 filterresult
= self
.run_test(filterfunction
, unit
)
326 except FilterFailure
, e
:
328 filtermessage
= e
.args
[0]
330 if self
.errorhandler
is None:
331 raise ValueError("error in filter %s: %r, %r, %s" % \
332 (functionname
, unit
.source
, unit
.target
, e
))
334 filterresult
= self
.errorhandler(functionname
, unit
.source
, unit
.target
, e
)
336 # we test some preconditions that aren't actually a cause for failure
337 if functionname
in self
.defaultfilters
:
338 failures
[functionname
] = filtermessage
339 if functionname
in self
.preconditions
:
340 for ignoredfunctionname
in self
.preconditions
[functionname
]:
341 ignores
.append(ignoredfunctionname
)
342 self
.results_cache
= {}
345 class TranslationChecker(UnitChecker
):
346 """A checker that passes source and target strings to the checks, not the
349 This provides some speedup and simplifies testing."""
def __init__(self, checkerconfig=None, excludefilters=None, limitfilters=None, errorhandler=None):
    """Hands all arguments, unchanged, to the parent class initialiser."""
    parent = super(TranslationChecker, self)
    parent.__init__(checkerconfig, excludefilters, limitfilters, errorhandler)
353 def run_test(self
, test
, unit
):
354 """Runs the given test on the given unit.
356 Note that this can raise a FilterFailure as part of normal operation."""
360 for pluralform
in unit
.target
.strings
:
362 if not test(self
.str1
, pluralform
):
364 except FilterFailure
, e
:
366 filtermessages
.append( str(e
).decode('utf-8') )
367 if not filterresult
and filtermessages
:
368 raise FilterFailure(filtermessages
)
372 return test(self
.str1
, self
.str2
)
def run_filters(self, unit):
    """Caches some data of the unit on the checker, then runs the filters
    of the parent class.

    The unicode source and target are kept as self.str1 and self.str2, and
    the result of unit.hasplural() as self.hasplural.

    @param unit: the translation unit to check
    @return: whatever the parent class's run_filters() returns
    """
    source = data.forceunicode(unit.source)
    self.str1 = source
    target = data.forceunicode(unit.target)
    self.str2 = target
    self.hasplural = unit.hasplural()
    parent = super(TranslationChecker, self)
    return parent.run_filters(unit)
383 """A Checker that controls multiple checkers."""
384 def __init__(self
, checkerconfig
=None, excludefilters
=None, limitfilters
=None,
385 checkerclasses
=None, errorhandler
=None, languagecode
=None):
386 """construct a TeeChecker from the given checkers"""
387 self
.limitfilters
= limitfilters
388 if checkerclasses
is None:
389 checkerclasses
= [StandardChecker
]
390 self
.checkers
= [checkerclass(checkerconfig
=checkerconfig
, excludefilters
=excludefilters
, limitfilters
=limitfilters
, errorhandler
=errorhandler
) for checkerclass
in checkerclasses
]
392 for checker
in self
.checkers
:
393 checker
.config
.updatetargetlanguage(languagecode
)
394 # Let's hook up the language specific checker
395 lang_checker
= self
.checkers
[0].config
.lang
.checker
397 self
.checkers
.append(lang_checker
)
399 self
.combinedfilters
= self
.getfilters(excludefilters
, limitfilters
)
400 self
.config
= checkerconfig
or self
.checkers
[0].config
402 def getfilters(self
, excludefilters
=None, limitfilters
=None):
403 """returns dictionary of available filters, including/excluding those in
405 if excludefilters
is None:
407 filterslist
= [checker
.getfilters(excludefilters
, limitfilters
) for checker
in self
.checkers
]
408 self
.combinedfilters
= {}
409 for filters
in filterslist
:
410 self
.combinedfilters
.update(filters
)
411 # TODO: move this somewhere more sensible (a checkfilters method?)
412 if limitfilters
is not None:
413 for filtername
in limitfilters
:
414 if not filtername
in self
.combinedfilters
:
416 print >> sys
.stderr
, "warning: could not find filter %s" % filtername
417 return self
.combinedfilters
419 def run_filters(self
, unit
):
420 """run all the tests in the checker's suites"""
422 for checker
in self
.checkers
:
423 failures
.update(checker
.run_filters(unit
))
def setsuggestionstore(self, store):
    """Passes the suggestion store on to every checker in self.checkers.

    @param store: handed to each checker's own setsuggestionstore()
    """
    for subchecker in self.checkers:
        subchecker.setsuggestionstore(store)
432 class StandardChecker(TranslationChecker
):
433 """The basic test suite for source -> target translations."""
def untranslated(self, str1, str2):
    """checks whether a string has been translated at all

    KDE comments are removed from the translation first. The check fails
    (returns False) only when the stripped source has text and the
    translation is empty.
    """
    translation = prefilters.removekdecomments(str2)
    if len(str1.strip()) > 0:
        return len(translation) != 0
    # Nothing to translate, so nothing can be missing
    return True
439 def unchanged(self
, str1
, str2
):
440 """checks whether a translation is basically identical to the original string"""
441 str1
= self
.filteraccelerators(str1
)
442 str2
= self
.filteraccelerators(str2
)
443 if len(str1
.strip()) == 0:
445 if str1
.isupper() and str1
== str2
:
447 if self
.config
.notranslatewords
:
448 words1
= str1
.split()
449 if len(words1
) == 1 and [word
for word
in words1
if word
in self
.config
.notranslatewords
]:
451 str1
= self
.removevariables(str1
)
452 str2
= self
.removevariables(str2
)
453 if not (str1
.strip().isdigit() or len(str1
) < 2 or decoration
.ispurepunctuation(str1
.strip())) and (str1
.strip().lower() == str2
.strip().lower()):
454 raise FilterFailure(u
"please translate")
def blank(self, str1, str2):
    """checks whether a translation only contains spaces

    Returns False when the source has text and the translation is non-empty
    but consists of whitespace only; True otherwise.
    """
    source_text = str1.strip()
    target_text = str2.strip()
    whitespace_only = bool(source_text) and len(str2) != 0 and not target_text
    return not whitespace_only
def short(self, str1, str2):
    """checks whether a translation is much shorter than the original string

    Lengths are measured after stripping surrounding whitespace. Returns
    False when the translation is non-empty but under a tenth of the source
    length, or when it is a single character for a longer source.
    """
    source_length = len(str1.strip())
    target_length = len(str2.strip())
    if source_length > 0 and 0 < target_length < source_length * 0.1:
        return False
    if source_length > 1 and target_length == 1:
        return False
    return True
def long(self, str1, str2):
    """checks whether a translation is much longer than the original string

    Lengths are measured after stripping surrounding whitespace. Returns
    False when the source is non-empty but under a tenth of the translation
    length, or when a single-character source has a longer translation.
    """
    source_length = len(str1.strip())
    target_length = len(str2.strip())
    if source_length > 0 and 0 < source_length < target_length * 0.1:
        return False
    if source_length == 1 and target_length > 1:
        return False
    return True
475 def escapes(self
, str1
, str2
):
476 """checks whether escaping is consistent between the two strings"""
477 if not helpers
.countsmatch(str1
, str2
, ("\\", "\\\\")):
478 escapes1
= u
", ".join([u
"'%s'" % word
for word
in str1
.split() if "\\" in word
])
479 escapes2
= u
", ".join([u
"'%s'" % word
for word
in str2
.split() if "\\" in word
])
480 raise SeriousFilterFailure(u
"escapes in original (%s) don't match escapes in translation (%s)" % (escapes1
, escapes2
))
484 def newlines(self
, str1
, str2
):
485 """checks whether newlines are consistent between the two strings"""
486 if not helpers
.countsmatch(str1
, str2
, ("\n", "\r")):
487 raise FilterFailure(u
"line endings in original don't match line endings in translation")
491 def tabs(self
, str1
, str2
):
492 """checks whether tabs are consistent between the two strings"""
493 if not helpers
.countmatch(str1
, str2
, "\t"):
494 raise SeriousFilterFailure(u
"tabs in original don't match tabs in translation")
def singlequoting(self, str1, str2):
    """checks whether singlequoting is consistent between the two strings

    Variables, accelerators and words with punctuation are filtered out of
    both strings before the quote sequences are counted.
    """
    def prepare(text):
        text = self.filtervariables(text)
        text = self.filteraccelerators(text)
        return self.filterwordswithpunctuation(text)

    source = prepare(str1)
    target = prepare(str2)
    return helpers.countsmatch(source, target, ("'", "''", "\\'"))
def doublequoting(self, str1, str2):
    """checks whether doublequoting is consistent between the two strings

    Variables, accelerators and XML tags are filtered out of both strings.
    Only the source is additionally passed through the target language's
    punctranslate() before the quote sequences are counted.
    """
    def strip_markup(text):
        text = self.filteraccelerators(self.filtervariables(text))
        return self.filterxml(text)

    source = self.config.lang.punctranslate(strip_markup(str1))
    target = strip_markup(str2)
    return helpers.countsmatch(source, target, ('"', '""', '\\"', u"«", u"»"))
def doublespacing(self, str1, str2):
    """checks for bad double-spaces by comparing to original

    Accelerators are filtered out of both strings, then the occurrences of
    two consecutive spaces are compared. Comparing single spaces instead
    would flag any translation whose word count differs from the source.

    @return: the result of helpers.countmatch() for the double space
    """
    str1 = self.filteraccelerators(str1)
    str2 = self.filteraccelerators(str2)
    # Two spaces: the check is about double spacing, not word separation
    return helpers.countmatch(str1, str2, "  ")
520 def puncspacing(self
, str1
, str2
):
521 """checks for bad spacing after punctuation"""
522 if str1
.find(u
" ") == -1:
524 str1
= self
.filteraccelerators(self
.filtervariables(str1
))
525 str1
= self
.config
.lang
.punctranslate(str1
)
526 str2
= self
.filteraccelerators(self
.filtervariables(str2
))
527 for puncchar
in self
.config
.punctuation
:
528 plaincount1
= str1
.count(puncchar
)
529 plaincount2
= str2
.count(puncchar
)
530 if not plaincount1
or plaincount1
!= plaincount2
:
532 spacecount1
= str1
.count(puncchar
+" ")
533 spacecount2
= str2
.count(puncchar
+" ")
534 if spacecount1
!= spacecount2
:
535 # handle extra spaces that are because of transposed punctuation
536 if str1
.endswith(puncchar
) != str2
.endswith(puncchar
) and abs(spacecount1
-spacecount2
) == 1:
541 def printf(self
, str1
, str2
):
542 """checks whether printf format strings match"""
543 count1
= count2
= None
544 for var_num2
, match2
in enumerate(printf_pat
.finditer(str2
)):
545 count2
= var_num2
+ 1
546 if match2
.group('ord'):
547 for var_num1
, match1
in enumerate(printf_pat
.finditer(str1
)):
548 count1
= var_num1
+ 1
549 if int(match2
.group('ord')) == var_num1
+ 1:
550 if match2
.group('fullvar') != match1
.group('fullvar'):
553 for var_num1
, match1
in enumerate(printf_pat
.finditer(str1
)):
554 count1
= var_num1
+ 1
555 if (var_num1
== var_num2
) and (match1
.group('fullvar') != match2
.group('fullvar')):
559 if list(printf_pat
.finditer(str1
)):
562 if (count1
or count2
) and (count1
!= count2
):
566 def accelerators(self
, str1
, str2
):
567 """checks whether accelerators are consistent between the two strings"""
568 str1
= self
.filtervariables(str1
)
569 str2
= self
.filtervariables(str2
)
571 for accelmarker
in self
.config
.accelmarkers
:
572 counter1
= decoration
.countaccelerators(accelmarker
, self
.config
.sourcelang
.validaccel
)
573 counter2
= decoration
.countaccelerators(accelmarker
, self
.config
.lang
.validaccel
)
574 count1
, countbad1
= counter1(str1
)
575 count2
, countbad2
= counter2(str2
)
576 getaccel
= decoration
.getaccelerators(accelmarker
, self
.config
.lang
.validaccel
)
577 accel2
, bad2
= getaccel(str2
)
580 if count1
== 1 and count2
== 0:
582 messages
.append("accelerator %s appears before an invalid accelerator character '%s' (eg. space)" % (accelmarker
, bad2
[0]))
584 messages
.append(u
"accelerator %s is missing from translation" % accelmarker
)
586 messages
.append(u
"accelerator %s does not occur in original and should not be in translation" % accelmarker
)
587 elif count1
== 1 and count2
> count1
:
588 messages
.append("accelerator %s is repeated in translation" % accelmarker
)
590 messages
.append("accelerator %s occurs %d time(s) in original and %d time(s) in translation" % (accelmarker
, count1
, count2
))
592 if "accelerators" in self
.config
.criticaltests
:
593 raise SeriousFilterFailure(messages
)
595 raise FilterFailure(messages
)
598 # def acceleratedvariables(self, str1, str2):
599 # """checks that no variables are accelerated"""
601 # for accelerator in self.config.accelmarkers:
602 # for variablestart, variableend in self.config.varmatches:
603 # error = accelerator + variablestart
604 # if str1.find(error) >= 0:
605 # messages.append("original has an accelerated variable")
606 # if str2.find(error) >= 0:
607 # messages.append("translation has an accelerated variable")
609 # raise FilterFailure(messages)
612 def variables(self
, str1
, str2
):
613 """checks whether variables of various forms are consistent between the two strings"""
615 mismatch1
, mismatch2
= [], []
616 varnames1
, varnames2
= [], []
617 for startmarker
, endmarker
in self
.config
.varmatches
:
618 varchecker
= decoration
.getvariables(startmarker
, endmarker
)
619 if startmarker
and endmarker
:
620 if isinstance(endmarker
, int):
621 redecorate
= lambda var
: startmarker
+ var
623 redecorate
= lambda var
: startmarker
+ var
+ endmarker
625 redecorate
= lambda var
: startmarker
+ var
627 redecorate
= lambda var
: var
628 vars1
= varchecker(str1
)
629 vars2
= varchecker(str2
)
631 # we use counts to compare so we can handle multiple variables
632 vars1
, vars2
= [var
for var
in vars1
if vars1
.count(var
) > vars2
.count(var
)], [var
for var
in vars2
if vars1
.count(var
) < vars2
.count(var
)]
633 # filter variable names we've already seen, so they aren't matched by more than one filter...
634 vars1
, vars2
= [var
for var
in vars1
if var
not in varnames1
], [var
for var
in vars2
if var
not in varnames2
]
635 varnames1
.extend(vars1
)
636 varnames2
.extend(vars2
)
637 vars1
= map(redecorate
, vars1
)
638 vars2
= map(redecorate
, vars2
)
639 mismatch1
.extend(vars1
)
640 mismatch2
.extend(vars2
)
642 messages
.append("do not translate: %s" % ", ".join(mismatch1
))
644 messages
.append("translation contains variables not in original: %s" % ", ".join(mismatch2
))
645 if messages
and mismatch1
:
646 raise SeriousFilterFailure(messages
)
648 raise FilterFailure(messages
)
def functions(self, str1, str2):
    """checks that function names are not translated

    Compares what decoration.getfunctions finds in each string; the
    configured punctuation is handed to it as an extra argument.
    """
    punctuation = self.config.punctuation
    return helpers.funcmatch(str1, str2, decoration.getfunctions, punctuation)
def emails(self, str1, str2):
    """checks that emails are not translated

    Compares what decoration.getemails finds in each string.
    """
    extractor = decoration.getemails
    return helpers.funcmatch(str1, str2, extractor)
def urls(self, str1, str2):
    """checks that URLs are not translated

    Compares what decoration.geturls finds in each string.
    """
    extractor = decoration.geturls
    return helpers.funcmatch(str1, str2, extractor)
def numbers(self, str1, str2):
    """checks whether numbers of various forms are consistent between the
    two strings

    The numbers are taken from the source string only; their counts are
    then compared in both strings.
    """
    source_numbers = decoration.getnumbers(str1)
    return helpers.countsmatch(str1, str2, source_numbers)
def startwhitespace(self, str1, str2):
    """checks whether whitespace at the beginning of the strings matches

    Variables and accelerators are filtered out of both strings first.
    """
    def clean(text):
        return self.filteraccelerators(self.filtervariables(text))

    source = clean(str1)
    target = clean(str2)
    return helpers.funcmatch(source, target, decoration.spacestart)
def endwhitespace(self, str1, str2):
    """checks whether whitespace at the end of the strings matches

    Variables and accelerators are filtered out of both strings first.
    """
    def clean(text):
        return self.filteraccelerators(self.filtervariables(text))

    source = clean(str1)
    target = clean(str2)
    return helpers.funcmatch(source, target, decoration.spaceend)
def startpunc(self, str1, str2):
    """checks whether punctuation at the beginning of the strings match

    Words with punctuation, variables and accelerators are filtered out of
    both strings. Only the source is additionally passed through the target
    language's punctranslate(). The comparison uses the configured
    punctuation.
    """
    def clean(text):
        text = self.filterwordswithpunctuation(text)
        return self.filteraccelerators(self.filtervariables(text))

    source = self.config.lang.punctranslate(clean(str1))
    target = clean(str2)
    return helpers.funcmatch(source, target, decoration.puncstart, self.config.punctuation)
def endpunc(self, str1, str2):
    """checks whether punctuation at the end of the strings match

    Words with punctuation, variables and accelerators are filtered out of
    both strings. Only the source is additionally passed through the target
    language's punctranslate(). The comparison uses the configured end
    punctuation.
    """
    def clean(text):
        text = self.filterwordswithpunctuation(text)
        return self.filteraccelerators(self.filtervariables(text))

    source = self.config.lang.punctranslate(clean(str1))
    target = clean(str2)
    return helpers.funcmatch(source, target, decoration.puncend, self.config.endpunctuation)
693 def purepunc(self
, str1
, str2
):
694 """checks that strings that are purely punctuation are not changed"""
695 # this test is a subset of startandend
696 if (decoration
.ispurepunctuation(str1
)):
699 return not decoration
.ispurepunctuation(str2
)
701 def brackets(self
, str1
, str2
):
702 """checks that the number of brackets in both strings match"""
703 str1
= self
.filtervariables(str1
)
704 str2
= self
.filtervariables(str2
)
708 for bracket
in ("[", "]", "{", "}", "(", ")"):
709 count1
= str1
.count(bracket
)
710 count2
= str2
.count(bracket
)
712 missing
.append("'%s'" % bracket
)
713 elif count2
> count1
:
714 extra
.append("'%s'" % bracket
)
716 messages
.append(u
"translation is missing %s" % ", ".join(missing
))
718 messages
.append(u
"translation has extra %s" % ", ".join(extra
))
720 raise FilterFailure(messages
)
723 def sentencecount(self
, str1
, str2
):
724 """checks that the number of sentences in both strings match"""
725 sentences1
= len(self
.config
.sourcelang
.sentences(str1
))
726 sentences2
= len(self
.config
.lang
.sentences(str2
))
727 if not sentences1
== sentences2
:
728 raise FilterFailure(u
"The number of sentences differ: %d versus %d" % (sentences1
, sentences2
))
def options(self, str1, str2):
    """checks that options are not translated

    An option is any whitespace-separated word of the source (after
    variables are filtered out) that starts with "--", is not just "--",
    and ends in an alphanumeric character. For "--option=parameter" the
    option name must occur in the translation, while the parameter must
    not occur there unchanged.

    @return: True when every option passes
    @raise FilterFailure: when an option is missing from the translation
    or its parameter was left untranslated
    """
    str1 = self.filtervariables(str1)
    for word1 in str1.split():
        if word1 != "--" and word1.startswith("--") and word1[-1].isalnum():
            parts = word1.split("=")
            option = parts[0]
            if option not in str2:
                raise FilterFailure("The option %s does not occur or is translated in the translation." % option)
            if len(parts) > 1 and parts[1] in str2:
                # parts[1] is the parameter and parts[0] the option; they
                # used to be swapped in this message
                raise FilterFailure("The parameter %(param)s in option %(option)s is not translated." % {"param": parts[1], "option": option})
    return True
743 def startcaps(self
, str1
, str2
):
744 """checks that the message starts with the correct capitalisation"""
745 str1
= self
.filteraccelerators(str1
)
746 str2
= self
.filteraccelerators(str2
)
747 if len(str1
) > 1 and len(str2
) > 1:
748 return self
.config
.sourcelang
.capsstart(str1
) == self
.config
.lang
.capsstart(str2
)
749 if len(str1
) == 0 and len(str2
) == 0:
751 if len(str1
) == 0 or len(str2
) == 0:
755 def simplecaps(self
, str1
, str2
):
756 """checks the capitalisation of two strings isn't wildly different"""
757 str1
= self
.removevariables(str1
)
758 str2
= self
.removevariables(str2
)
759 # TODO: review this. The 'I' is specific to English, so it probably serves
760 # no purpose to get sourcelang.sentenceend
761 str1
= re
.sub(u
"[^%s]( I )" % self
.config
.sourcelang
.sentenceend
, " i ", str1
)
762 capitals1
= helpers
.filtercount(str1
, type(str1
).isupper
)
763 capitals2
= helpers
.filtercount(str2
, type(str2
).isupper
)
764 alpha1
= helpers
.filtercount(str1
, type(str1
).isalpha
)
765 alpha2
= helpers
.filtercount(str2
, type(str2
).isalpha
)
766 # Capture the all caps case
767 if capitals1
== alpha1
:
768 return capitals2
== alpha2
769 # some heuristic tests to try and see that the style of capitals is vaguely the same
770 if capitals1
== 0 or capitals1
== 1:
771 return capitals2
== capitals1
772 elif capitals1
< len(str1
) / 10:
773 return capitals2
< len(str2
) / 8
775 return abs(capitals1
- capitals2
) < 3
776 elif capitals1
> len(str1
) * 6 / 10:
777 return capitals2
> len(str2
) * 6 / 10
779 return abs(capitals1
- capitals2
) < (len(str1
) + len(str2
)) / 6
781 def acronyms(self
, str1
, str2
):
782 """checks that acronyms that appear are unchanged"""
785 for startmatch
, endmatch
in self
.config
.varmatches
:
786 allowed
+= decoration
.getvariables(startmatch
, endmatch
)(str1
)
787 allowed
+= self
.config
.musttranslatewords
.keys()
788 str1
= self
.filteraccelerators(self
.filtervariables(str1
))
789 iter = self
.config
.lang
.word_iter(str1
)
790 str2
= self
.filteraccelerators(self
.filtervariables(str2
))
#TODO: strip XML? - should provide better error messages
792 # see mail/chrome/messanger/smime.properties.po
793 #TODO: consider limiting the word length for recognising acronyms to
794 #something like 5/6 characters
796 if word
.isupper() and len(word
) > 1 and word
not in allowed
:
797 if str2
.find(word
) == -1:
798 acronyms
.append(word
)
800 raise FilterFailure("acronyms should not be translated: " + ", ".join(acronyms
))
803 def doublewords(self
, str1
, str2
):
804 """checks for repeated words in the translation"""
806 without_newlines
= "\n".join(str2
.split("\n"))
807 words
= self
.filteraccelerators(self
.removevariables(without_newlines
)).replace(".", "").lower().split()
810 raise FilterFailure("The word '%s' is repeated" % word
)
def notranslatewords(self, str1, str2):
    """checks that words configured as untranslatable appear in the translation too

    @param str1: the source string
    @param str2: the target (translated) string
    @return: True when nothing is configured, or when every configured word
    found in the source is also a word of the target
    @raise FilterFailure: listing the configured words missing from the target
    """
    untranslatable = self.config.notranslatewords
    if not untranslatable:
        return True
    str1 = self.filtervariables(str1)
    str2 = self.filtervariables(str2)
    #The above is full of strange quotes and things in utf-8 encoding.
    #single apostrophe perhaps problematic in words like "doesn't"
    for separator in self.config.punctuation:
        str1 = str1.replace(separator, u" ")
        str2 = str2.replace(separator, u" ")
    words1 = self.filteraccelerators(str1).split()
    # Only membership is tested on the target words, so a set is enough.
    words2 = set(self.filteraccelerators(str2).split())
    stopwords = [word for word in words1
                 if word in untranslatable and word not in words2]
    if stopwords:
        raise FilterFailure("do not translate: %s" % (", ".join(stopwords)))
    return True
def musttranslatewords(self, str1, str2):
    """checks that words configured as definitely translatable don't appear in
    the translation

    @param str1: the source string
    @param str2: the target (translated) string
    @return: True when nothing is configured, or when no configured word of
    the source is also a word of the target
    @raise FilterFailure: listing the configured words left untranslated
    """
    translatable = self.config.musttranslatewords
    if not translatable:
        return True
    str1 = self.removevariables(str1)
    str2 = self.removevariables(str2)
    #The above is full of strange quotes and things in utf-8 encoding.
    #single apostrophe perhaps problematic in words like "doesn't"
    for separator in self.config.punctuation:
        str1 = str1.replace(separator, " ")
        str2 = str2.replace(separator, " ")
    words1 = self.filteraccelerators(str1).split()
    # Only membership is tested on the target words, so a set is enough.
    words2 = set(self.filteraccelerators(str2).split())
    stopwords = [word for word in words1
                 if word in translatable and word in words2]
    if stopwords:
        raise FilterFailure("please translate: %s" % (", ".join(stopwords)))
    return True
def validchars(self, str1, str2):
    """checks that only characters specified as valid appear in the translation

    Both strings are passed through C{translate()} with the configured
    map; whatever is left of the target and does not also remain in the
    source is reported.

    @param str1: the source string
    @param str2: the target (translated) string
    @return: True when no map is configured or nothing is left to report
    @raise FilterFailure: listing each offending character and its code point
    """
    charmap = self.config.validcharsmap
    if not charmap:
        return True
    invalid1 = str1.translate(charmap)
    invalid2 = str2.translate(charmap)
    # Characters already present in the source are tolerated in the target.
    invalidchars = [u"'%s' (\\u%04x)" % (invalidchar, ord(invalidchar))
                    for invalidchar in invalid2 if invalidchar not in invalid1]
    if invalidchars:
        raise FilterFailure(u"invalid chars: %s" % (u", ".join(invalidchars)))
    return True
def filepaths(self, str1, str2):
    """checks that file paths have not been translated

    Every whitespace-separated word of the source that starts with "/" is
    treated as a path and handed to C{helpers.countsmatch} together with
    both strings.

    @param str1: the source string
    @param str2: the target (translated) string
    @return: False as soon as one path fails the count comparison, else True
    """
    for word1 in self.filteraccelerators(str1).split():
        if word1.startswith("/") and not helpers.countsmatch(str1, str2, (word1,)):
            return False
    return True
def xmltags(self, str1, str2):
    """checks that XML/HTML tags have not been translated

    @param str1: the source string
    @param str2: the target (translated) string
    @return: False when the tag properties of source and target differ, or
    when the source has no tags but the target does; True otherwise
    """
    tags1 = tag_re.findall(str1)
    if tags1:
        # A source consisting of nothing but one attribute-less tag is let
        # through without comparison.
        if len(tags1[0]) == len(str1) and "=" not in tags1[0]:
            return True
        tags2 = tag_re.findall(str2)
        properties1 = tagproperties(tags1, self.config.ignoretags)
        properties2 = tagproperties(tags2, self.config.ignoretags)
        filtered1 = [intuplelist(property1, self.config.canchangetags)
                     for property1 in properties1]
        filtered2 = [intuplelist(property2, self.config.canchangetags)
                     for property2 in properties2]

        #TODO: consider the consequences of different ordering of attributes/tags
        if filtered1 != filtered2:
            return False
    else:
        # No tags in str1, let's just check that none were added in str2. This
        # might be useful for fuzzy strings wrongly unfuzzied, for example.
        tags2 = tag_re.findall(str2)
        if tags2:
            return False
    return True
def kdecomments(self, str1, str2):
    """checks to ensure that no KDE style comments appear in the translation

    @param str1: the source string (not inspected)
    @param str2: the target (translated) string
    @return: False when the target starts with "_:" or has "_:" directly
    after a newline, True otherwise
    """
    if str2.startswith("_:"):
        return False
    return "\n_:" not in str2
def compendiumconflicts(self, str1, str2):
    """checks for Gettext compendium conflicts (#-#-#-#-#)

    @param str1: the source string (not inspected)
    @param str2: the target (translated) string
    @return: True when the conflict marker is absent from the target
    """
    conflict_marker = "#-#-#-#-#"
    return conflict_marker not in str2
def simpleplurals(self, str1, str2):
    """checks for English style plural(s) for you to review

    @param str1: the source string
    @param str2: the target (translated) string
    @return: when the target language has exactly one plural form, True
    only if the target contains no "(s)"; otherwise True if source and
    target contain "(s)" the same number of times
    """
    def numberofpatterns(string, patterns):
        """Total number of regex matches in string over all patterns."""
        return sum(len(re.findall(pattern, string)) for pattern in patterns)

    # Raw strings: "\(" is not a valid string escape, so the plain literal
    # only worked because unknown escapes are passed through unchanged.
    sourcepatterns = [r"\(s\)"]
    targetpatterns = [r"\(s\)"]
    sourcecount = numberofpatterns(str1, sourcepatterns)
    targetcount = numberofpatterns(str2, targetpatterns)
    if self.config.lang.nplurals == 1:
        return not targetcount
    return sourcecount == targetcount
921 def spellcheck(self
, str1
, str2
):
922 """checks words that don't pass a spell check"""
923 if not self
.config
.targetlanguage
:
925 if not spelling
.available
:
927 str1
= self
.filteraccelerators_by_list(self
.filtervariables(str1
), self
.config
.sourcelang
.validaccel
)
928 str2
= self
.filteraccelerators_by_list(self
.filtervariables(str2
), self
.config
.lang
.validaccel
)
931 for word
, index
, suggestions
in spelling
.check(str1
, lang
="en"):
933 for word
, index
, suggestions
in spelling
.check(str2
, lang
=self
.config
.targetlanguage
):
934 if word
in self
.config
.notranslatewords
:
938 # hack to ignore hyphenisation rules
939 if word
in suggestions
:
941 messages
.append(u
"check spelling of %s (could be %s)" % (word
, u
" / ".join(suggestions
)))
943 raise FilterFailure(messages
)
def credits(self, str1, str2):
    """checks for messages containing translation credits instead of normal translations.

    @param str1: the source string
    @param str2: the target string (not inspected)
    @return: False when the source is one of the configured credit
    sources, True otherwise
    """
    is_credit_message = str1 in self.config.credit_sources
    return not is_credit_message
# If the precondition filter is run and fails then the other tests listed are ignored
# Each key names a precondition filter; its value is the tuple of filter
# names that are ignored when that precondition fails.
preconditions = {
    "untranslated": (
        "simplecaps", "variables", "startcaps",
        "accelerators", "brackets", "endpunc",
        "acronyms", "xmltags", "startpunc",
        "endwhitespace", "startwhitespace",
        "escapes", "doublequoting", "singlequoting",
        "filepaths", "purepunc", "doublespacing",
        "sentencecount", "numbers", "isfuzzy",
        "isreview", "notranslatewords", "musttranslatewords",
        "emails", "simpleplurals", "urls", "printf",
        "tabs", "newlines", "functions", "options",
        "blank", "nplurals",
    ),
    "blank": (
        "simplecaps", "variables", "startcaps",
        "accelerators", "brackets", "endpunc",
        "acronyms", "xmltags", "startpunc",
        "endwhitespace", "startwhitespace",
        "escapes", "doublequoting", "singlequoting",
        "filepaths", "purepunc", "doublespacing",
        "sentencecount", "numbers", "isfuzzy",
        "isreview", "notranslatewords", "musttranslatewords",
        "emails", "simpleplurals", "urls", "printf",
        "tabs", "newlines", "functions", "options",
    ),
    "credits": (
        "simplecaps", "variables", "startcaps",
        "accelerators", "brackets", "endpunc",
        "acronyms", "xmltags", "startpunc",
        "escapes", "doublequoting", "singlequoting",
        "filepaths", "doublespacing",
        "sentencecount", "numbers",
        "emails", "simpleplurals", "urls", "printf",
        "tabs", "newlines", "functions", "options",
    ),
    "purepunc": ("startcaps", "options"),
    "startcaps": ("simplecaps",),
    "endwhitespace": ("endpunc",),
    "startwhitespace": ("startpunc",),
    "unchanged": ("doublewords",),
    "compendiumconflicts": (
        "accelerators", "brackets", "escapes",
        "numbers", "startpunc", "long", "variables",
        "startcaps", "sentencecount", "simplecaps",
        "doublespacing", "endpunc", "xmltags",
        "startwhitespace", "endwhitespace",
        "singlequoting", "doublequoting",
        "filepaths", "purepunc", "doublewords", "printf",
    ),
}
993 # code to actually run the tests (use unittest?)
# Configuration that OpenOfficeChecker merges into its checker config.
openofficeconfig = CheckerConfig(
    accelmarkers=["~"],
    varmatches=[
        ("&", ";"), ("%", "%"), ("%", None), ("%", 0), ("$(", ")"),
        ("$", "$"), ("${", "}"), ("#", "#"), ("#", 1), ("#", 0),
        ("($", ")"), ("$[", "]"), ("[", "]"), ("$", None),
    ],
    ignoretags=[
        ("alt", "xml-lang", None),
        ("ahelp", "visibility", "visible"),
        ("img", "width", None),
        ("img", "height", None),
    ],
    canchangetags=[("link", "name", None)]
)
class OpenOfficeChecker(StandardChecker):
    """StandardChecker whose configuration is updated with openofficeconfig."""

    def __init__(self, **kwargs):
        """Ensure a checkerconfig exists, merge openofficeconfig into it,
        then initialise the StandardChecker with the same keyword arguments.

        @param kwargs: passed on to StandardChecker; a missing or None
        "checkerconfig" entry is replaced by a fresh CheckerConfig
        """
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(openofficeconfig)
        StandardChecker.__init__(self, **kwargs)
# Configuration that MozillaChecker merges into its checker config.
mozillaconfig = CheckerConfig(
    accelmarkers=["&"],
    varmatches=[
        ("&", ";"), ("%", "%"), ("%", 1), ("$", "$"),
        ("$", None), ("#", 1), ("${", "}"), ("$(^", ")"),
    ],
    criticaltests=["accelerators"]
)
class MozillaChecker(StandardChecker):
    """StandardChecker whose configuration is updated with mozillaconfig."""

    def __init__(self, **kwargs):
        """Ensure a checkerconfig exists, merge mozillaconfig into it, then
        initialise the StandardChecker with the same keyword arguments.

        @param kwargs: passed on to StandardChecker; a missing or None
        "checkerconfig" entry is replaced by a fresh CheckerConfig
        """
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(mozillaconfig)
        StandardChecker.__init__(self, **kwargs)
# Configuration that GnomeChecker merges into its checker config.
gnomeconfig = CheckerConfig(
    accelmarkers=["_"],
    varmatches=[
        ("%", 1),
        ("$(", ")"),
    ],
    credit_sources=[u"translator-credits"]
)
class GnomeChecker(StandardChecker):
    """StandardChecker whose configuration is updated with gnomeconfig."""

    def __init__(self, **kwargs):
        """Ensure a checkerconfig exists, merge gnomeconfig into it, then
        initialise the StandardChecker with the same keyword arguments.

        @param kwargs: passed on to StandardChecker; a missing or None
        "checkerconfig" entry is replaced by a fresh CheckerConfig
        """
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(gnomeconfig)
        StandardChecker.__init__(self, **kwargs)
# Configuration that KdeChecker merges into its checker config.
kdeconfig = CheckerConfig(
    accelmarkers=["&"],
    varmatches=[("%", 1)],
    credit_sources=[
        u"Your names",
        u"Your emails",
        u"ROLES_OF_TRANSLATORS",
    ]
)
class KdeChecker(StandardChecker):
    """StandardChecker whose configuration is updated with kdeconfig."""

    def __init__(self, **kwargs):
        """Ensure a checkerconfig exists, merge kdeconfig into it, then
        initialise the StandardChecker with the same keyword arguments.

        @param kwargs: passed on to StandardChecker; a missing or None
        "checkerconfig" entry is replaced by a fresh CheckerConfig
        """
        # TODO allow setup of KDE plural and translator comments so that they do
        # not create false positives
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(kdeconfig)
        StandardChecker.__init__(self, **kwargs)
# Configuration that CCLicenseChecker merges into its checker config.
cclicenseconfig = CheckerConfig(
    varmatches=[("@", "@")]
)
class CCLicenseChecker(StandardChecker):
    """StandardChecker whose configuration is updated with cclicenseconfig."""

    def __init__(self, **kwargs):
        """Ensure a checkerconfig exists, merge cclicenseconfig into it,
        then initialise the StandardChecker with the same keyword arguments.

        @param kwargs: passed on to StandardChecker; a missing or None
        "checkerconfig" entry is replaced by a fresh CheckerConfig
        """
        config = kwargs.get("checkerconfig")
        if config is None:
            config = kwargs["checkerconfig"] = CheckerConfig()
        config.update(cclicenseconfig)
        StandardChecker.__init__(self, **kwargs)
1069 "openoffice": OpenOfficeChecker
,
1070 "mozilla": MozillaChecker
,
1073 "gnome": GnomeChecker
,
1074 "creativecommons": CCLicenseChecker
class StandardUnitChecker(UnitChecker):
    """The standard checks for common checks on translation units."""

    def isfuzzy(self, unit):
        """Check if the unit has been marked fuzzy.

        @return: False when C{unit.isfuzzy()} is true, True otherwise
        """
        return not unit.isfuzzy()

    def isreview(self, unit):
        """Check if the unit has been marked review.

        @return: False when C{unit.isreview()} is true, True otherwise
        """
        return not unit.isreview()

    def nplurals(self, unit):
        """Checks for the correct number of noun forms for plural translations.

        @return: True for units without plurals or when the language has no
        usable nplurals value; otherwise whether the number of target
        strings equals the language's nplurals
        """
        if unit.hasplural():
            # if we don't have a valid nplurals value, don't run the test
            nplurals = self.config.lang.nplurals
            if nplurals and nplurals > 0:
                return len(unit.target.strings) == nplurals
        return True

    def hassuggestion(self, unit):
        """Checks if there is at least one suggested translation for this unit.

        Suggestions come from C{self.suggestion_store} when one is set,
        otherwise from the alternative translations of an XLIFF unit.

        @return: False when at least one suggestion was found, True otherwise
        """
        self.suggestion_store = getattr(self, 'suggestion_store', None)
        # Start empty so the result is defined when neither source applies.
        suggestions = []
        if self.suggestion_store:
            source = unit.source
            # A distinct loop name keeps the comprehension from rebinding
            # the unit argument (comprehension variables leak in Python 2).
            suggestions = [candidate for candidate in self.suggestion_store.units
                           if candidate.source == source]
        elif xliff and isinstance(unit, xliff.xliffunit):
            # TODO: we probably want to filter them somehow
            suggestions = unit.getalttrans()
        return not bool(suggestions)
1110 def runtests(str1
, str2
, ignorelist
=()):
1111 """verifies that the tests pass for a pair of strings"""
1112 from translate
.storage
import base
1113 str1
= data
.forceunicode(str1
)
1114 str2
= data
.forceunicode(str2
)
1115 unit
= base
.TranslationUnit(str1
)
1117 checker
= StandardChecker(excludefilters
=ignorelist
)
1118 failures
= checker
.run_filters(unit
)
1119 for testname
, message
in failures
:
1120 print "failure: %s: %s\n %r\n %r" % (testname
, message
, str1
, str2
)
1123 def batchruntests(pairs
):
1124 """runs test on a batch of string pairs"""
1125 passed
, numpairs
= 0, len(pairs
)
1126 for str1
, str2
in pairs
:
1127 if runtests(str1
, str2
):
1130 print "total: %d/%d pairs passed" % (passed
, numpairs
)
1132 if __name__
== '__main__':
1133 testset
= [(r
"simple", r
"somple"),
1134 (r
"\this equals \that", r
"does \this equal \that?"),
1135 (r
"this \'equals\' that", r
"this 'equals' that"),
1136 (r
" start and end! they must match.", r
"start and end! they must match."),
1137 (r
"check for matching %variables marked like %this", r
"%this %variable is marked"),
1138 (r
"check for mismatching %variables marked like %this", r
"%that %variable is marked"),
1139 (r
"check for mismatching %variables% too", r
"how many %variable% are marked"),
1141 (r
"Row: %1, Column: %2", r
"Mothalo: %1, Kholomo: %2"),
1142 (r
"simple lowercase", r
"it is all lowercase"),
1143 (r
"simple lowercase", r
"It Is All Lowercase"),
1144 (r
"Simple First Letter Capitals", r
"First Letters"),
1145 (r
"SIMPLE CAPITALS", r
"First Letters"),
1146 (r
"SIMPLE CAPITALS", r
"ALL CAPITALS"),
1147 (r
"forgot to translate", r
" ")
1149 batchruntests(testset
)