Updated for 2.1a3
[python/dscho.git] / Tools / i18n / pygettext.py
bloba839799065034c6ebe040bdb673d5849978b1447
1 #! /usr/bin/env python
2 # Originally written by Barry Warsaw <bwarsaw@python.org>
4 # minimally patched to make it even more xgettext compatible
5 # by Peter Funk <pf@artcom-gmbh.de>
7 """pygettext -- Python equivalent of xgettext(1)
9 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
10 internationalization of C programs. Most of these tools are independent of
11 the programming language and can be used from within Python programs. Martin
12 von Loewis' work[1] helps considerably in this regard.
14 There's one problem though; xgettext is the program that scans source code
15 looking for message strings, but it groks only C (or C++). Python introduces
16 a few wrinkles, such as dual quoting characters, triple quoted strings, and
17 raw strings. xgettext understands none of this.
19 Enter pygettext, which uses Python's standard tokenize module to scan Python
20 source code, generating .pot files identical to what GNU xgettext[2] generates
21 for C and C++ code. From there, the standard GNU tools can be used.
23 A word about marking Python strings as candidates for translation. GNU
24 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
25 gettext_noop. But those can be a lot of text to include all over your code.
26 C and C++ have a trick: they use the C preprocessor. Most internationalized C
27 source includes a #define for gettext() to _() so that what has to be written
28 in the source is much less. Thus these are both translatable strings:
30 gettext("Translatable String")
31 _("Translatable String")
33 Python of course has no preprocessor so this doesn't work so well. Thus,
34 pygettext searches only for _() by default, but see the -k/--keyword flag
35 below for how to augment this.
37 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
38 [2] http://www.gnu.org/software/gettext/gettext.html
40 NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
41 wherever possible. However, some options are still missing or are not fully
42 implemented. Also, xgettext's use of command line switches with option
43 arguments is broken, and in these cases, pygettext just defines additional
44 switches.
46 Usage: pygettext [options] inputfile ...
48 Options:
51 --extract-all
52 Extract all strings
54 -d name
55 --default-domain=name
56 Rename the default output file from messages.pot to name.pot
59 --escape
60 Replace non-ASCII characters with octal escape sequences.
63 --docstrings
64 Extract module, class, method, and function docstrings. These do not
65 need to be wrapped in _() markers, and in fact cannot be for Python to
66 consider them docstrings.
69 --help
70 print this help message and exit
72 -k word
73 --keyword=word
74 Keywords to look for in addition to the default set, which are:
75 %(DEFAULTKEYWORDS)s
77 You can have multiple -k flags on the command line.
80 --no-default-keywords
81 Disable the default set of keywords (see above). Any keywords
82 explicitly added with the -k/--keyword option are still recognized.
84 --no-location
85 Do not write filename/lineno location comments.
88 --add-location
89 Write filename/lineno location comments indicating where each
90 extracted string is found in the source. These lines appear before
91 each msgid. The style of comments is controlled by the -S/--style
92 option. This is the default.
94 -o filename
95 --output=filename
96 Rename the default output file from messages.pot to filename. If
97 filename is `-' then the output is sent to standard out.
99 -p dir
100 --output-dir=dir
101 Output files will be placed in directory dir.
103 -S stylename
104 --style stylename
105 Specify which style to use for location comments. Two styles are
106 supported:
108 Solaris # File: filename, line: line-number
109 GNU #: filename:line
111 The style name is case insensitive. GNU style is the default.
114 --verbose
115 Print the names of the files being processed.
118 --version
119 Print the version of pygettext and exit.
121 -w columns
122 --width=columns
123 Set width of output to columns.
125 -x filename
126 --exclude-file=filename
127 Specify a file that contains a list of strings that are not to be
128 extracted from the input files. Each string to be excluded must
129 appear on a line by itself in the file.
131 If `inputfile' is -, standard input is read.
135 import os
136 import sys
137 import time
138 import getopt
139 import tokenize
# for selftesting: use fintl's gettext as the _() marker when that module is
# importable, otherwise fall back to an identity function so that the
# _()-wrapped strings in this file still evaluate to themselves.
try:
    import fintl
except ImportError:
    def _(s):
        return s
else:
    _ = fintl.gettext

__version__ = '1.2'

# Keywords that mark a translatable string; DEFAULTKEYWORDS is the
# comma-separated form interpolated into the usage text.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''
# The normal pot-file header. msgmerge and EMACS' po-mode work better if
# it's there.
#
# This is a %-format template: TokenEater.write() fills in %(time)s and
# %(version)s.  The doubled backslashes put a literal backslash-n sequence
# (not a newline character) inside each quoted header line of the output.
# NOTE(review): the generation time is emitted under "PO-Revision-Date";
# confirm against the gettext header conventions whether a creation-date
# field was intended.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"
''')
def usage(code, msg=''):
    """Print the help text to stderr and terminate the process.

    The module docstring is passed through _() and then %-interpolated
    with the module globals (this is what fills in %(DEFAULTKEYWORDS)s).
    If msg is true it is printed on its own line after the help text.
    The process then exits with status `code'.
    """
    err = sys.stderr
    help_text = _(__doc__) % globals()
    print >> err, help_text
    if msg:
        print >> err, msg
    sys.exit(code)
# Escape table filled in by make_escapes(): escapes[n] is the text written
# to the .po file for the character whose ordinal is n (0..255).
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the global `escapes' table.

    pass_iso8859 -- if true, characters whose ordinal modulo 128 falls in
                    32..126 are copied through unchanged, which lets the
                    upper (160..254) range pass; if false, only ordinals
                    32..126 pass.  Every other character becomes a
                    three-digit octal escape.

    Backslash, tab, carriage return, newline and double quote always get
    their C-style spelling.

    The table is replaced in place on every call, so calling this function
    more than once yields a fresh 256-entry table rather than appending a
    second copy behind the first, and existing references to the list
    object stay valid.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. 'msgid
        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Slice assignment mutates the existing list object instead of
    # rebinding the global name.
    escapes[:] = table
def escape(s):
    """Return s with every character replaced by its `escapes' entry.

    Each character is looked up by ordinal in the global `escapes' table
    and the pieces are joined back into a single string.
    """
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    s is the literal exactly as it appears in the source (quotes and any
    prefix included).  It is evaluated with an empty __builtins__ mapping
    and empty locals.
    """
    # unwrap quotes, safely
    # NOTE(review): an empty __builtins__ narrows what the expression can
    # reach but is not a complete sandbox; this relies on s being a
    # STRING token taken from the tokenizer.
    no_builtins = {'__builtins__': {}}
    return eval(s, no_builtins, {})
def normalize(s):
    """Return s formatted as a quoted .po-file string.

    This converts the various Python string types into a format that is
    appropriate for .po files, namely much closer to C style.  A string
    without newlines becomes one double-quoted, escaped string.  A string
    containing newlines becomes an empty "" followed by one quoted string
    per source line, each but the last ending in an escaped newline; a
    trailing newline in s is kept on the final line rather than producing
    an extra empty one.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # s ended with a newline: drop the empty tail and put the newline
        # back on what is now the last line so it gets escaped with it.
        del pieces[-1]
        pieces[-1] = pieces[-1] + '\n'
    quoted = [escape(piece) for piece in pieces]
    separator = '\\n"\n"'
    return '""\n"' + separator.join(quoted) + '"'
class TokenEater:
    """Token callback that collects translatable strings.

    An instance is called once per token (see __call__) and dispatches to
    the method stored in self.__state, a small state machine:

        __waiting        -- looking for a keyword, `class'/`def', or (with
                            docstring extraction on) the module docstring
        __suiteseen      -- saw `class'/`def', skipping to the colon
        __suitedocstring -- just past the colon, looking for a docstring
        __keywordseen    -- saw a keyword, expecting `('
        __openseen       -- inside keyword(...), gathering string tokens

    Collected messages map each string to a list of (filename, lineno)
    pairs and are emitted by write().

    options must provide: docstrings, keywords, toexclude, writelocations,
    locationstyle, width, and the GNU / SOLARIS constants.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        # Set for real by set_filename(); initialised here so the
        # attribute always exists.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state with the token's starting row
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        options = self.__options
        # Do docstring extractions, if enabled
        if options.docstrings:
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL):
                    # still in the leading comments / blank lines
                    return
                # The first real token is not a string, so the module has
                # no docstring.  Do NOT return: this token may itself be
                # `class', `def' or a keyword and must be examined below.
                self.__freshmodule = 0
            # class docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in options.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # ignore anything until we see the colon
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # NL is skipped as well: a comment-only or blank line between
            # the header and the docstring ends in NL, not NEWLINE.
            # Anything else means there was no class docstring.
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we seen anything else?

    def __addentry(self, msg, lineno=None):
        # lineno defaults to the row of the `(' recorded by __keywordseen
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, []).append(entry)

    def set_filename(self, filename):
        """Record the name of the file whose tokens come next."""
        self.__curfile = filename
        # Every input file is a new module: look for its docstring again.
        self.__freshmodule = 1

    def write(self, fp):
        """Write the header and all collected messages to file object fp."""
        options = self.__options
        timestamp = time.ctime(time.time())
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print >> fp, pot_header % {'time': timestamp, 'version': __version__}
        for k, v in self.__messages.items():
            if not options.writelocations:
                pass
            # location comments are different b/w Solaris and GNU:
            elif options.locationstyle == options.SOLARIS:
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    print >>fp, _('# File: %(filename)s, line: %(lineno)d') % d
            elif options.locationstyle == options.GNU:
                # fit as many locations on one line, as long as the
                # resulting line length doesn't exceeds 'options.width'
                locline = '#:'
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    s = _(' %(filename)s:%(lineno)d') % d
                    if len(locline) + len(s) <= options.width:
                        locline = locline + s
                    else:
                        print >> fp, locline
                        locline = "#:" + s
                if len(locline) > 2:
                    print >> fp, locline
            # TBD: sorting, normalizing
            print >> fp, 'msgid', normalize(k)
            print >> fp, 'msgstr ""\n'
def main():
    """Command-line entry point.

    Parses sys.argv, builds the escape table and the keyword list, loads
    the optional exclude file, feeds every input file (or standard input
    for `-') through a TokenEater, and finally writes the collected
    messages to the chosen output file (or standard output for `-').

    Exits through usage() on bad options, and with status 1 when the
    exclude file cannot be read.  A tokenizer error in an input file is
    reported on stderr and processing continues with the next file.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:',
            # 'default-domain=' needs the trailing '=' because the option
            # takes an argument (it is 'd:' in the short option string).
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings',
             ])
    except getopt.error:
        # sys.exc_info() fetches the exception without version-specific
        # "except ..., name" syntax.
        usage(1, sys.exc_info()[1])

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            sys.stdout.write(
                _('pygettext.py (xgettext for Python) %s') % __version__
                + '\n')
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg

    # calculate escapes.  make_escapes() takes a "let iso-8859 characters
    # pass through" flag, which is the opposite of --escape ("replace
    # non-ASCII characters with octal escapes"), hence the negation.
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                excluded = fp.readlines()
            finally:
                fp.close()
        except IOError:
            sys.stderr.write(_("Can't read --exclude-file: %s") %
                             options.excludefilename + '\n')
            sys.exit(1)
        # Drop the line terminator: extracted messages are compared
        # against these entries verbatim, and an entry that still ends in
        # '\n' would never match a string taken from the source.
        options.toexclude = []
        for entry in excluded:
            if entry[-1:] == '\n':
                entry = entry[:-1]
            options.toexclude.append(entry)
    else:
        options.toexclude = []

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                sys.stdout.write(_('Reading standard input') + '\n')
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                sys.stdout.write(_('Working on %s') % filename + '\n')
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokenize.tokenize(fp.readline, eater)
            except tokenize.TokenError:
                # args is (message, (row, column))
                e = sys.exc_info()[1]
                sys.stderr.write('%s: %s, line %d, column %d\n' %
                                 (e.args[0], filename,
                                  e.args[1][0], e.args[1][1]))
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
# Run the extractor only when this file is executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # (presumably here so that running pygettext over its own source has
    # a u''-prefixed literal inside _() to extract -- confirm)
    _(u'a unicode string')