2 # Originally written by Barry Warsaw <bwarsaw@python.org>
4 # minimally patched to make it even more xgettext compatible
5 # by Peter Funk <pf@artcom-gmbh.de>
7 """pygettext -- Python equivalent of xgettext(1)
9 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
10 internationalization of C programs. Most of these tools are independent of
11 the programming language and can be used from within Python programs. Martin
12 von Loewis' work[1] helps considerably in this regard.
14 There's one problem though; xgettext is the program that scans source code
15 looking for message strings, but it groks only C (or C++). Python introduces
16 a few wrinkles, such as dual quoting characters, triple quoted strings, and
17 raw strings. xgettext understands none of this.
19 Enter pygettext, which uses Python's standard tokenize module to scan Python
20 source code, generating .pot files identical to what GNU xgettext[2] generates
21 for C and C++ code. From there, the standard GNU tools can be used.
23 A word about marking Python strings as candidates for translation. GNU
24 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
25 gettext_noop. But those can be a lot of text to include all over your code.
26 C and C++ have a trick: they use the C preprocessor. Most internationalized C
27 source includes a #define for gettext() to _() so that what has to be written
28 in the source is much less. Thus these are both translatable strings:
30 gettext("Translatable String")
31 _("Translatable String")
33 Python of course has no preprocessor so this doesn't work so well. Thus,
34 pygettext searches only for _() by default, but see the -k/--keyword flag
35 below for how to augment this.
37 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
38 [2] http://www.gnu.org/software/gettext/gettext.html
40 NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
41 wherever possible. However, some options are still missing or are not fully
42 implemented. Also, xgettext's use of command line switches with option
43 arguments is broken, and in these cases, pygettext just defines additional
46 Usage: pygettext [options] inputfile ...
56 Rename the default output file from messages.pot to name.pot
60 Replace non-ASCII characters with octal escape sequences.
64 Extract module, class, method, and function docstrings. These do not
65 need to be wrapped in _() markers, and in fact cannot be for Python to
66 consider them docstrings.
70 print this help message and exit
74 Keywords to look for in addition to the default set, which are:
77 You can have multiple -k flags on the command line.
81 Disable the default set of keywords (see above). Any keywords
82 explicitly added with the -k/--keyword option are still recognized.
85 Do not write filename/lineno location comments.
89 Write filename/lineno location comments indicating where each
90 extracted string is found in the source. These lines appear before
91 each msgid. The style of comments is controlled by the -S/--style
92 option. This is the default.
96 Rename the default output file from messages.pot to filename. If
97 filename is `-' then the output is sent to standard out.
101 Output files will be placed in directory dir.
105 Specify which style to use for location comments. Two styles are
108 Solaris # File: filename, line: line-number
111 The style name is case insensitive. GNU style is the default.
115 Print the names of the files being processed.
119 Print the version of pygettext and exit.
123 Set width of output to columns.
126 --exclude-file=filename
127 Specify a file that contains a list of strings that are not to be
128 extracted from the input files. Each string to be excluded must
129 appear on a line by itself in the file.
131 If `inputfile' is -, standard input is read.
# Function names treated as translation markers when no -k/--keyword option
# overrides them.  main() rebinds this list for -K/--no-default-keywords.
default_keywords = ['_']

# Comma-separated rendering of the default keywords.  It lives at module
# level so it is available to usage(), which formats the module docstring
# with globals().
DEFAULTKEYWORDS = ', '.join(default_keywords)
157 # The normal pot-file header. msgmerge and EMACS' po-mode work better if
160 # SOME DESCRIPTIVE TITLE.
161 # Copyright (C) YEAR ORGANIZATION
162 # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
166 "Project-Id-Version: PACKAGE VERSION\\n"
167 "PO-Revision-Date: %(time)s\\n"
168 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
169 "Language-Team: LANGUAGE <LL@li.org>\\n"
170 "MIME-Version: 1.0\\n"
171 "Content-Type: text/plain; charset=CHARSET\\n"
172 "Content-Transfer-Encoding: ENCODING\\n"
173 "Generated-By: pygettext.py %(version)s\\n"
178 def usage(code
, msg
=''):
179 print >> sys
.stderr
, _(__doc__
) % globals()
181 print >> sys
.stderr
, msg
188 def make_escapes(pass_iso8859
):
191 # Allow iso-8859 characters to pass through so that e.g. 'msgid
192 # "Höhe"' would not result in 'msgid "H\366he"'. Otherwise we
193 # escape any character outside the 32..126 range.
198 if 32 <= (i
% mod
) <= 126:
199 escapes
.append(chr(i
))
201 escapes
.append("\\%03o" % i
)
202 escapes
[ord('\\')] = '\\\\'
203 escapes
[ord('\t')] = '\\t'
204 escapes
[ord('\r')] = '\\r'
205 escapes
[ord('\n')] = '\\n'
206 escapes
[ord('\"')] = '\\"'
212 for i
in range(len(s
)):
213 s
[i
] = escapes
[ord(s
[i
])]
214 return EMPTYSTRING
.join(s
)
218 # unwrap quotes, safely
219 return eval(s
, {'__builtins__':{}}, {})
223 # This converts the various Python string types into a format that is
224 # appropriate for .po files, namely much closer to C style.
225 lines
= s
.split('\n')
227 s
= '"' + escape(s
) + '"'
231 lines
[-1] = lines
[-1] + '\n'
232 for i
in range(len(lines
)):
233 lines
[i
] = escape(lines
[i
])
235 s
= '""\n"' + lineterm
.join(lines
) + '"'
241 def __init__(self
, options
):
242 self
.__options
= options
244 self
.__state
= self
.__waiting
247 self
.__freshmodule
= 1
def __call__(self, ttype, tstring, stup, etup, line):
    """Feed one token to whichever state handler is currently active.

    ttype   -- token type code
    tstring -- token text
    stup    -- start position of the token; only its first element (the
               line number) is forwarded
    etup    -- end position of the token (unused)
    line    -- source line the token came from (unused)

    The handler is looked up on every call because handlers rebind
    self.__state to move the state machine along.
    """
    ## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
    ##       'tstring:', tstring
    handler = self.__state
    handler(ttype, tstring, stup[0])
256 def __waiting(self
, ttype
, tstring
, lineno
):
257 # Do docstring extractions, if enabled
258 if self
.__options
.docstrings
:
260 if self
.__freshmodule
:
261 if ttype
== tokenize
.STRING
:
262 self
.__addentry
(safe_eval(tstring
), lineno
)
263 self
.__freshmodule
= 0
264 elif ttype
not in (tokenize
.COMMENT
, tokenize
.NL
):
265 self
.__freshmodule
= 0
268 if ttype
== tokenize
.NAME
and tstring
in ('class', 'def'):
269 self
.__state
= self
.__suiteseen
271 if ttype
== tokenize
.NAME
and tstring
in self
.__options
.keywords
:
272 self
.__state
= self
.__keywordseen
def __suiteseen(self, ttype, tstring, lineno):
    """State handler entered after a 'class' or 'def' keyword.

    Every token is skipped until the ':' operator appears; at that point
    the machine switches to the handler that looks for the suite's
    docstring.  lineno is accepted for signature parity with the other
    state handlers and is not used here.
    """
    # Anything that is not the colon operator leaves the state untouched.
    if ttype != tokenize.OP or tstring != ':':
        return
    self.__state = self.__suitedocstring
279 def __suitedocstring(self
, ttype
, tstring
, lineno
):
280 # ignore any intervening noise
281 if ttype
== tokenize
.STRING
:
282 self
.__addentry
(safe_eval(tstring
), lineno
)
283 self
.__state
= self
.__waiting
284 elif ttype
not in (tokenize
.NEWLINE
, tokenize
.INDENT
,
286 # there was no class docstring
287 self
.__state
= self
.__waiting
289 def __keywordseen(self
, ttype
, tstring
, lineno
):
290 if ttype
== tokenize
.OP
and tstring
== '(':
292 self
.__lineno
= lineno
293 self
.__state
= self
.__openseen
295 self
.__state
= self
.__waiting
297 def __openseen(self
, ttype
, tstring
, lineno
):
298 if ttype
== tokenize
.OP
and tstring
== ')':
299 # We've seen the last of the translatable strings. Record the
300 # line number of the first line of the strings and update the list
301 # of messages seen. Reset state for the next batch. If there
302 # were no strings inside _(), then just ignore this entry.
304 self
.__addentry
(EMPTYSTRING
.join(self
.__data
))
305 self
.__state
= self
.__waiting
306 elif ttype
== tokenize
.STRING
:
307 self
.__data
.append(safe_eval(tstring
))
308 # TBD: should we warn if we see anything else?
310 def __addentry(self
, msg
, lineno
=None):
312 lineno
= self
.__lineno
313 if not msg
in self
.__options
.toexclude
:
314 entry
= (self
.__curfile
, lineno
)
315 self
.__messages
.setdefault(msg
, []).append(entry
)
def set_filename(self, filename):
    """Record the file about to be tokenized and re-arm docstring detection.

    filename -- name stored with each extracted message as the first half
                of its (filename, lineno) location entry.

    The same eater instance is fed every input file, and __waiting()
    clears __freshmodule once it has examined the first significant token
    of a module.  Unless the flag is set again here, only the first file
    processed can ever contribute a module docstring.
    """
    self.__curfile = filename
    # A new file starts a new module: allow its leading string to be
    # picked up as the module docstring (when docstring extraction is on).
    self.__freshmodule = 1
321 options
= self
.__options
322 timestamp
= time
.ctime(time
.time())
323 # The time stamp in the header doesn't have the same format as that
324 # generated by xgettext...
325 print >> fp
, pot_header
% {'time': timestamp
, 'version': __version__
}
326 for k
, v
in self
.__messages
.items():
327 if not options
.writelocations
:
329 # location comments are different b/w Solaris and GNU:
330 elif options
.locationstyle
== options
.SOLARIS
:
331 for filename
, lineno
in v
:
332 d
= {'filename': filename
, 'lineno': lineno
}
333 print >>fp
, _('# File: %(filename)s, line: %(lineno)d') % d
334 elif options
.locationstyle
== options
.GNU
:
335 # fit as many locations on one line, as long as the
336 # resulting line length doesn't exceed 'options.width'
338 for filename
, lineno
in v
:
339 d
= {'filename': filename
, 'lineno': lineno
}
340 s
= _(' %(filename)s:%(lineno)d') % d
341 if len(locline
) + len(s
) <= options
.width
:
342 locline
= locline
+ s
348 # TBD: sorting, normalizing
349 print >> fp
, 'msgid', normalize(k
)
350 print >> fp
, 'msgstr ""\n'
355 global default_keywords
357 opts
, args
= getopt
.getopt(
359 'ad:DEhk:Kno:p:S:Vvw:x:',
360 ['extract-all', 'default-domain', 'escape', 'help',
361 'keyword=', 'no-default-keywords',
362 'add-location', 'no-location', 'output=', 'output-dir=',
363 'style=', 'verbose', 'version', 'width=', 'exclude-file=',
366 except getopt
.error
, msg
:
369 # for holding option values
375 extractall
= 0 # FIXME: currently this option has no effect at all.
379 outfile
= 'messages.pot'
388 locations
= {'gnu' : options
.GNU
,
389 'solaris' : options
.SOLARIS
,
393 for opt
, arg
in opts
:
394 if opt
in ('-h', '--help'):
396 elif opt
in ('-a', '--extract-all'):
397 options
.extractall
= 1
398 elif opt
in ('-d', '--default-domain'):
399 options
.outfile
= arg
+ '.pot'
400 elif opt
in ('-E', '--escape'):
402 elif opt
in ('-D', '--docstrings'):
403 options
.docstrings
= 1
404 elif opt
in ('-k', '--keyword'):
405 options
.keywords
.append(arg
)
406 elif opt
in ('-K', '--no-default-keywords'):
407 default_keywords
= []
408 elif opt
in ('-n', '--add-location'):
409 options
.writelocations
= 1
410 elif opt
in ('--no-location',):
411 options
.writelocations
= 0
412 elif opt
in ('-S', '--style'):
413 options
.locationstyle
= locations
.get(arg
.lower())
414 if options
.locationstyle
is None:
415 usage(1, _('Invalid value for --style: %s') % arg
)
416 elif opt
in ('-o', '--output'):
417 options
.outfile
= arg
418 elif opt
in ('-p', '--output-dir'):
419 options
.outpath
= arg
420 elif opt
in ('-v', '--verbose'):
422 elif opt
in ('-V', '--version'):
423 print _('pygettext.py (xgettext for Python) %s') % __version__
425 elif opt
in ('-w', '--width'):
427 options
.width
= int(arg
)
429 usage(1, _('--width argument must be an integer: %s') % arg
)
430 elif opt
in ('-x', '--exclude-file'):
431 options
.excludefilename
= arg
434 make_escapes(options
.escape
)
436 # calculate all keywords
437 options
.keywords
.extend(default_keywords
)
439 # initialize list of strings to exclude
440 if options
.excludefilename
:
442 fp
= open(options
.excludefilename
)
443 options
.toexclude
= fp
.readlines()
446 sys
.stderr
.write(_("Can't read --exclude-file: %s") %
447 options
.excludefilename
)
450 options
.toexclude
= []
452 # slurp through all the files
453 eater
= TokenEater(options
)
454 for filename
in args
:
457 print _('Reading standard input')
462 print _('Working on %s') % filename
466 eater
.set_filename(filename
)
468 tokenize
.tokenize(fp
.readline
, eater
)
469 except tokenize
.TokenError
, e
:
470 sys
.stderr
.write('%s: %s, line %d, column %d\n' %
471 (e
[0], filename
, e
[1][0], e
[1][1]))
477 if options
.outfile
== '-':
482 options
.outfile
= os
.path
.join(options
.outpath
, options
.outfile
)
483 fp
= open(options
.outfile
, 'w')
492 if __name__
== '__main__':
494 # some more test strings
495 _(u
'a unicode string')