Added 'list_only' option (and modified 'run()' to respect it).
[python/dscho.git] / Tools / i18n / pygettext.py
blob3542f3f23ec2cd0f8f861935bb8a589c02146448
1 #! /usr/bin/env python
3 """pygettext -- Python equivalent of xgettext(1)
5 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
6 internationalization of C programs. Most of these tools are independent of
7 the programming language and can be used from within Python programs. Martin
8 von Loewis' work[1] helps considerably in this regard.
10 There's one hole though; xgettext is the program that scans source code
11 looking for message strings, but it groks only C (or C++). Python introduces
12 a few wrinkles, such as dual quoting characters, triple quoted strings, and
13 raw strings. xgettext understands none of this.
15 Enter pygettext, which uses Python's standard tokenize module to scan Python
16 source code, generating .pot files identical to what GNU xgettext[2] generates
17 for C and C++ code. From there, the standard GNU tools can be used.
19 A word about marking Python strings as candidates for translation. GNU
20 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
21 gettext_noop. But those can be a lot of text to include all over your code.
22 C and C++ have a trick: they use the C preprocessor. Most internationalized C
23 source includes a #define for gettext() to _() so that what has to be written
24 in the source is much less. Thus these are both translatable strings:
26 gettext("Translatable String")
27 _("Translatable String")
29 Python of course has no preprocessor so this doesn't work so well. Thus,
30 pygettext searches only for _() by default, but see the -k/--keyword flag
31 below for how to augment this.
33 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
34 [2] http://www.gnu.org/software/gettext/gettext.html
37 NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
38 wherever possible.
40 Usage: pygettext [options] filename ...
42 Options:
45 --extract-all
46 Extract all strings
48 -d default-domain
49 --default-domain=default-domain
50 Rename the default output file from messages.pot to default-domain.pot
52 -k [word]
53 --keyword[=word]
54 Additional keywords to look for. Without `word' means not to use the
55 default keywords. The default keywords, which are always looked for
56 if not explicitly disabled: _
58 The default keyword list differs from that of GNU xgettext. You can have
59 multiple -k flags on the command line.
61 --no-location
62 Do not write filename/lineno location comments
64 -n [style]
65 --add-location[=style]
66 Write filename/lineno location comments indicating where each
67 extracted string is found in the source. These lines appear before
68 each msgid. Two styles are supported:
70 Solaris # File: filename, line: line-number
71 Gnu #: filename:line
73 If style is omitted, Gnu is used. The style name is case
74 insensitive. By default, locations are included.
76 --help
78 print this help message and exit
80 """
82 import os
83 import sys
84 import string
85 import time
86 import getopt
87 import tokenize
89 __version__ = '0.1'
def usage(code, msg=''):
    """Print the module help text, then an optional message, and exit.

    code -- exit status handed to sys.exit()
    msg  -- extra text printed on its own line after the help; skipped
            when it is false (the default is the empty string)
    """
    out = sys.stdout
    # The module docstring doubles as the help text; it is %-interpolated
    # against the module globals before being shown.
    out.write('%s\n' % (__doc__ % globals(),))
    if msg:
        out.write('%s\n' % (msg,))
    sys.exit(code)
def normalize(s):
    """Convert the source text of a Python string literal to a .po string.

    s is the literal exactly as it appears in the source (quotes, prefix
    and escapes included).  The result is the string's value wrapped in
    double quotes, with backslashes, double quotes, newlines, tabs and
    carriage returns written as C-style escape sequences so the value
    always stays on one well-formed line.
    """
    # Unwrap the quotes by evaluating the literal with no builtins visible.
    # NOTE(review): eval() is tolerable here only because the caller in this
    # file passes tokenize STRING tokens; never feed it arbitrary text.
    s = eval(s, {'__builtins__': {}}, {})
    # After evaluation every backslash in s is a real character, so it must
    # be escaped itself -- counting "preceding backslashes" is not needed.
    escapes = {
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\t': '\\t',
        '\r': '\\r',
    }
    parts = []
    for ch in s:
        parts.append(escapes.get(ch, ch))
    return '"' + ''.join(parts) + '"'
class TokenEater:
    """Token consumer that collects the strings passed to keyword calls.

    An instance is called once per token (see __call__) and runs a small
    state machine: wait for a keyword name, expect '(' right after it, then
    gather string tokens until the closing ')'.  Collected messages are
    written out in .pot form by write().
    """

    def __init__(self, options):
        # options must provide: keywords, location, GNU and SOLARIS
        self.__options = options
        # normalized msgid text -> list of (filename, lineno) occurrences
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        # name of the file being scanned; set through set_filename()
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the handler for the current state.  Only the row of
        # the token's start position is passed on.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        # Idle state: look for one of the configured keyword names.
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        # A keyword only counts when it is immediately followed by '('.
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = ''.join(self.__data)
                entry = (self.__curfile, self.__lineno)
                linenos = self.__messages.get(msg)
                if linenos is None:
                    self.__messages[msg] = [entry]
                else:
                    linenos.append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(normalize(tstring))
        # TBD: should we warn if we see anything else?

    def set_filename(self, filename):
        """Record the name used in location comments for following tokens."""
        self.__curfile = filename

    def write(self, fp):
        """Write the header and every collected message to fp.

        fp is any object with a write() method.  Output goes straight to
        fp; sys.stdout is never rebound, so a redirection set up by the
        caller survives this call.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        emit = fp.write
        # common header
        emit('# POT file generated by pygettext.py %s\n' % (__version__,))
        emit('# %s\n' % (timestamp,))
        emit('#\n')
        for msgid, occurrences in self.__messages.items():
            for filename, lineno in occurrences:
                # location comments are different b/w Solaris and GNU;
                # any other location value suppresses them
                if options.location == options.SOLARIS:
                    emit('# File: %s, line: %d\n' % (filename, lineno))
                elif options.location == options.GNU:
                    emit('#: %s:%d\n' % (filename, lineno))
            # TBD: sorting, normalizing
            emit('msgid %s\n' % (msgid,))
            # An untranslated entry still needs an (empty) quoted string.
            emit('msgstr ""\n')
            emit('\n')
def main():
    """Parse the command line, scan each named file, write the .pot file.

    Options are read from sys.argv[1:].  Each remaining argument is opened
    as a source file and tokenized into a single TokenEater; the collected
    messages are then written to the output file (messages.pot unless
    -d/--default-domain is given).  Bad options end the program via usage().
    """
    default_keywords = ['_']
    try:
        # Long options that carry a value need a trailing '='; without it
        # getopt rejects --keyword=word and --default-domain=name.
        opts, args = getopt.getopt(
            sys.argv[1:],
            'k:d:n:h',
            ['keyword=', 'default-domain=', 'help',
             'add-location=', 'no-location'])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        outfile = 'messages.pot'
        location = GNU

    options = Options()
    # per-instance list, so appending never touches a class attribute
    options.keywords = []
    locations = {
        'gnu': options.GNU,
        'solaris': options.SOLARIS,
    }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-k', '--keyword'):
            # getopt hands over '' (never None) for an empty value; an
            # empty word means "do not use the default keywords".
            if not arg:
                default_keywords = []
            else:
                options.keywords.append(arg)
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-n', '--add-location'):
            if not arg:
                arg = 'gnu'
            try:
                options.location = locations[arg.lower()]
            except KeyError:
                usage(1, 'Invalid value for --add-location: ' + arg)
        elif opt in ('--no-location',):
            options.location = 0

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        # 'with' closes the file even if tokenizing raises
        with open(filename) as fp:
            eater.set_filename(filename)
            # each token is a 5-item sequence matching eater's parameters
            for token in tokenize.generate_tokens(fp.readline):
                eater(*token)

    with open(options.outfile, 'w') as fp:
        eater.write(fp)
# Run the extractor only when this file is executed as a script; importing
# the module must not trigger any scanning or file output.
if __name__ == '__main__':
    main()