2 # Copyright (c) 2014 Wladimir J. van der Laan
3 # Distributed under the MIT software license, see the accompanying
4 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
6 Run this script from the root of the repository to update all translations from
8 It will do the following automatically:
10 - fetch all translations using the tx tool
11 - post-process them into valid and committable format
12 - remove invalid control characters
13 - remove location tags (makes diffs less noisy)
16 - auto-add new translations to the build system according to the translation process
18 from __future__
import division
, print_function
24 import xml
.etree
.ElementTree
as ET
# Name of transifex tool
TX = 'tx'
# Name of source language file
SOURCE_LANG = 'bitcoin_en.ts'
# Directory with locale files
LOCALE_DIR = 'src/qt/locale'
# Minimum number of messages for translation to be considered at all
MIN_NUM_MESSAGES = 10
def check_at_repository_root():
    '''
    Abort unless the current working directory looks like the repository root.

    The only check performed is whether a '.git' entry exists in the current
    directory. If it does not, two diagnostic lines are written to stderr and
    the process exits with status 1. Returns None otherwise.
    '''
    if os.path.exists('.git'):
        return
    # Both lines are diagnostics, so both go to stderr.
    print('No .git directory found', file=sys.stderr)
    print('Execute this script at the root of the repository', file=sys.stderr)
    sys.exit(1)
def fetch_all_translations():
    '''
    Fetch all translations by running the transifex tool.

    Invokes the command [TX, 'pull', '-f', '-a'] and waits for it to finish.
    A non-zero exit status from the tool is treated as failure: a diagnostic
    is written to stderr and the process exits with status 1.
    '''
    if subprocess.call([TX, 'pull', '-f', '-a']) != 0:
        print('Error while fetching translations', file=sys.stderr)
        sys.exit(1)
def find_format_specifiers(s):
    '''
    Find all format specifiers in a string.

    Returns a list holding, in order of appearance, the single character that
    directly follows each '%' in s. That character is consumed together with
    the '%', so '%%' produces one '%' entry rather than starting a new
    specifier.

    Raises IndexError if s ends with a lone '%' (there is no character after
    it to report).
    '''
    specifiers = []
    pos = 0
    while True:
        percent = s.find('%', pos)
        if percent < 0:
            break
        # Plain indexing on purpose: a trailing '%' raises IndexError instead
        # of being silently dropped.
        specifiers.append(s[percent+1])
        # Skip both the '%' and the character it introduced.
        pos = percent + 2
    return specifiers
def split_format_specifiers(specifiers):
    '''
    Split format specifiers between numeric (Qt) and others (strprintf).

    specifiers is an iterable of single-character specifiers. Returns a tuple
    (numeric, other): numeric is a set of the specifiers '1'..'9', other is a
    list of the remaining specifiers in their original order. When at least
    one numeric specifier is present, other is always empty.
    '''
    qt_digits = {'1','2','3','4','5','6','7','8','9'}
    numeric = []
    other = []
    for s in specifiers:
        if s in qt_digits:
            numeric.append(s)
        else:
            other.append(s)

    # If both numeric format specifiers and "others" are used, assume we're dealing
    # with a Qt-formatted message. In the case of Qt formatting (see https://doc.qt.io/qt-5/qstring.html#arg)
    # only numeric formats are replaced at all. This means "(percentage: %1%)" is valid, without needing
    # any kind of escaping that would be necessary for strprintf. Without this, this function
    # would wrongly detect '%)' as a printf format specifier.
    if numeric:
        other = []

    # numeric (Qt) can be present in any order, others (strprintf) must be in specified order
    return set(numeric),other
def sanitize_string(s):
    '''
    Sanitize string for printing.

    Returns a copy of s with every newline replaced by a single space, so the
    text fits on one line of output.
    '''
    return s.replace('\n',' ')
def check_format_specifiers(source, translation, errors, numerus):
    '''
    Check that a translation uses the same format specifiers as its source.

    source      -- source message text
    translation -- translated message text
    errors      -- list that receives a human-readable message for each
                   problem found (mutated in place)
    numerus     -- True if the message is a numerus (plural) form

    Returns True if the translation is acceptable, False otherwise.
    '''
    source_f = split_format_specifiers(find_format_specifiers(source))
    # assert that no source messages contain both Qt and strprintf format specifiers
    # if this fails, go change the source as this is hacky and confusing!
    assert(not(source_f[0] and source_f[1]))
    try:
        translation_f = split_format_specifiers(find_format_specifiers(translation))
    except IndexError:
        # find_format_specifiers indexes the character after each '%', so a
        # translation ending in a bare '%' cannot be parsed.
        errors.append("Parse error in translation for '%s': '%s'" % (sanitize_string(source), sanitize_string(translation)))
        return False

    if source_f == translation_f:
        return True
    if numerus and source_f == (set(), ['n']) and translation_f == (set(), []) and '%' not in translation:
        # Allow numerus translations to omit %n specifier (usually when it only has one possible value)
        return True
    errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
    return False
def all_ts_files(suffix=''):
    '''
    Yield (filename, filepath) for each translation file in LOCALE_DIR.

    Only entries whose name ends in '.ts' + suffix are considered, and the
    source language file (SOURCE_LANG + suffix) is skipped. The suffix, if
    any, is stripped from the yielded filename, and filepath is that stripped
    name joined onto LOCALE_DIR. Entries are visited in sorted name order so
    that processing and log output are deterministic.
    '''
    for filename in sorted(os.listdir(LOCALE_DIR)):
        # process only language files, and do not process source language
        if not filename.endswith('.ts'+suffix) or filename == SOURCE_LANG+suffix:
            continue
        if suffix: # remove provided suffix
            filename = filename[0:-len(suffix)]
        filepath = os.path.join(LOCALE_DIR, filename)
        yield(filename, filepath)
# Matches the bytes 0x00-0x09, 0x0b, 0x0c and 0x0e-0x1f, i.e. every ASCII
# control character except LF (0x0a) and CR (0x0d).
# NOTE(review): this range also removes TAB (0x09), which XML 1.0 permits;
# behaviour is kept as-is here - confirm whether stripping tabs is intended.
FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')
def remove_invalid_characters(s):
    '''
    Remove invalid characters from translation string.

    s must be a bytes object; returns a new bytes object with every byte
    matched by FIX_RE deleted.
    '''
    return FIX_RE.sub(b'', s)
# Override cdata escape function to make our output match Qt's (optional, just for cleaner diffs for
# comparison, disable by default)
_orig_escape_cdata = None
def escape_cdata(text):
    '''
    Escape character data, additionally encoding quote characters.

    Delegates to the function stored in the module-level _orig_escape_cdata,
    then replaces every "'" with '&apos;' and every '"' with '&quot;'.
    _orig_escape_cdata must have been set to a callable before this is used;
    it is None by default.
    '''
    text = _orig_escape_cdata(text)
    text = text.replace("'", '&apos;')
    text = text.replace('"', '&quot;')
    return text
def postprocess_translations(reduce_diff_hacks=False):
    '''
    Check and rewrite every translation file in the locale directory.

    Each file reported by all_ts_files() is first renamed to '<name>.orig'.
    Every '.orig' file is then read, stripped of invalid control characters,
    parsed as XML and processed message by message:

    - translations whose format specifiers do not match the source are
      reported on stdout, cleared and marked as type 'unfinished'
    - 'location' tags are removed from every message
    - messages whose translation is of type 'unfinished' are removed

    A file left with fewer than MIN_NUM_MESSAGES messages is not written back
    (only its '.orig' copy remains); otherwise the cleaned tree is written to
    the original path.

    reduce_diff_hacks -- if True, patch ElementTree's cdata escaping with
                         escape_cdata and rewrite ' />' as '/>' in the output.

    Returns True if at least one translation was rejected, False otherwise.
    '''
    global _orig_escape_cdata

    print('Checking and postprocessing...')

    # Guard against patching twice: otherwise escape_cdata would end up
    # stored as its own "original" and recurse forever.
    if reduce_diff_hacks and ET._escape_cdata is not escape_cdata:
        _orig_escape_cdata = ET._escape_cdata
        ET._escape_cdata = escape_cdata

    for (filename,filepath) in all_ts_files():
        os.rename(filepath, filepath+'.orig')

    have_errors = False
    for (filename,filepath) in all_ts_files('.orig'):
        # pre-fixups to cope with transifex output
        parser = ET.XMLParser(encoding='utf-8') # need to override encoding because 'utf8' is not understood only 'utf-8'
        with open(filepath + '.orig', 'rb') as f:
            data = f.read()
        # remove control characters; this must be done over the entire file otherwise the XML parser will fail
        data = remove_invalid_characters(data)
        tree = ET.parse(io.BytesIO(data), parser=parser)

        # iterate over all messages in file
        root = tree.getroot()
        for context in root.findall('context'):
            for message in context.findall('message'):
                numerus = message.get('numerus') == 'yes'
                source = message.find('source').text
                translation_node = message.find('translation')
                # pick all numerusforms
                if numerus:
                    translations = [i.text for i in translation_node.findall('numerusform')]
                else:
                    translations = [translation_node.text]

                for translation in translations:
                    if translation is None:
                        continue
                    errors = []
                    valid = check_format_specifiers(source, translation, errors, numerus)

                    for error in errors:
                        print('%s: %s' % (filename, error))

                    if not valid: # set type to unfinished and clear string if invalid
                        translation_node.clear()
                        translation_node.set('type', 'unfinished')
                        have_errors = True

                # Remove location tags
                for location in message.findall('location'):
                    message.remove(location)

                # Remove entire message if it is an unfinished translation
                if translation_node.get('type') == 'unfinished':
                    context.remove(message)

        # check if document is (virtually) empty, and remove it if so
        num_messages = sum(len(context.findall('message'))
                           for context in root.findall('context'))
        if num_messages < MIN_NUM_MESSAGES:
            # Nothing is written back, so only the '.orig' copy remains.
            print('Removing %s, as it contains only %i messages' % (filepath, num_messages))
            continue

        # write fixed-up tree
        # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
        if reduce_diff_hacks:
            out = io.BytesIO()
            tree.write(out, encoding='utf-8')
            out = out.getvalue()
            out = out.replace(b' />', b'/>')
            with open(filepath, 'wb') as f:
                f.write(out)
        else:
            tree.write(filepath, encoding='utf-8')
    return have_errors
# Script entry point: verify the working directory, pull the translations,
# then validate and rewrite them.
if __name__ == '__main__':
    check_at_repository_root()
    fetch_all_translations()
    postprocess_translations()