2 # -*- coding: iso-8859-1 -*-
3 # Originally written by Barry Warsaw <barry@zope.com>
5 # Minimally patched to make it even more xgettext compatible
6 # by Peter Funk <pf@artcom-gmbh.de>
8 # 2002-11-22 Jürgen Hermann <jh@web.de>
9 # Added checks that _() only contains string literals, and
10 # command line args are resolved to module lists, i.e. you
11 # can now pass a filename, a module or package name, or a
12 # directory (including globbing chars, important for Win32).
13 # Made docstring fit in 80 chars wide displays using pydoc.
23 __doc__
= _("""pygettext -- Python equivalent of xgettext(1)
25 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
26 internationalization of C programs. Most of these tools are independent of
27 the programming language and can be used from within Python programs.
28 Martin von Loewis' work[1] helps considerably in this regard.
30 There's one problem though; xgettext is the program that scans source code
31 looking for message strings, but it groks only C (or C++). Python
32 introduces a few wrinkles, such as dual quoting characters, triple quoted
33 strings, and raw strings. xgettext understands none of this.
35 Enter pygettext, which uses Python's standard tokenize module to scan
36 Python source code, generating .pot files identical to what GNU xgettext[2]
37 generates for C and C++ code. From there, the standard GNU tools can be
40 A word about marking Python strings as candidates for translation. GNU
41 xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
42 and gettext_noop. But those can be a lot of text to include all over your
43 code. C and C++ have a trick: they use the C preprocessor. Most
44 internationalized C source includes a #define for gettext() to _() so that
45 what has to be written in the source is much less. Thus these are both
48 gettext("Translatable String")
49 _("Translatable String")
51 Python of course has no preprocessor so this doesn't work so well. Thus,
52 pygettext searches only for _() by default, but see the -k/--keyword flag
53 below for how to augment this.
55 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
56 [2] http://www.gnu.org/software/gettext/gettext.html
58 NOTE: pygettext attempts to be option and feature compatible with GNU
59 xgettext where ever possible. However some options are still missing or are
60 not fully implemented. Also, xgettext's use of command line switches with
61 option arguments is broken, and in these cases, pygettext just defines
64 Usage: pygettext [options] inputfile ...
74 Rename the default output file from messages.pot to name.pot.
78 Replace non-ASCII characters with octal escape sequences.
82 Extract module, class, method, and function docstrings. These do
83 not need to be wrapped in _() markers, and in fact cannot be for
84 Python to consider them docstrings. (See also the -X option).
88 Print this help message and exit.
92 Keywords to look for in addition to the default set, which are:
95 You can have multiple -k flags on the command line.
99 Disable the default set of keywords (see above). Any keywords
100 explicitly added with the -k/--keyword option are still recognized.
103 Do not write filename/lineno location comments.
107 Write filename/lineno location comments indicating where each
108 extracted string is found in the source. These lines appear before
109 each msgid. The style of comments is controlled by the -S/--style
110 option. This is the default.
114 Rename the default output file from messages.pot to filename. If
115 filename is `-' then the output is sent to standard out.
119 Output files will be placed in directory dir.
123 Specify which style to use for location comments. Two styles are
126 Solaris # File: filename, line: line-number
129 The style name is case insensitive. GNU style is the default.
133 Print the names of the files being processed.
137 Print the version of pygettext and exit.
141 Set width of output to columns.
144 --exclude-file=filename
145 Specify a file that contains a list of strings that are not be
146 extracted from the input files. Each string to be excluded must
147 appear on a line by itself in the file.
150 --no-docstrings=filename
151 Specify a file that contains a list of files (one per line) that
152 should not have their docstrings extracted. This is only useful in
153 conjunction with the -D option above.
155 If `inputfile' is -, standard input is read.
# Function names whose string arguments are extracted by default.  main()
# appends this list to options.keywords, and the -K/--no-default-keywords
# option rebinds it to an empty list before that happens.
default_keywords = ['_']

# Comma-separated rendering of the default keywords, computed once at import
# time (presumably for the usage text, which is formatted with globals()).
DEFAULTKEYWORDS = ', '.join(default_keywords)
175 # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
178 # SOME DESCRIPTIVE TITLE.
179 # Copyright (C) YEAR ORGANIZATION
180 # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
184 "Project-Id-Version: PACKAGE VERSION\\n"
185 "POT-Creation-Date: %(time)s\\n"
186 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
187 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
188 "Language-Team: LANGUAGE <LL@li.org>\\n"
189 "MIME-Version: 1.0\\n"
190 "Content-Type: text/plain; charset=CHARSET\\n"
191 "Content-Transfer-Encoding: ENCODING\\n"
192 "Generated-By: pygettext.py %(version)s\\n"
197 def usage(code
, msg
=''):
198 print >> sys
.stderr
, __doc__
% globals()
200 print >> sys
.stderr
, msg
207 def make_escapes(pass_iso8859
):
# Allow iso-8859 characters to pass through so that e.g. 'msgid
# "Höhe"' would not result in 'msgid "H\366he"'. Otherwise we
# escape any character outside the 32..126 range.
217 if 32 <= (i
% mod
) <= 126:
218 escapes
.append(chr(i
))
220 escapes
.append("\\%03o" % i
)
221 escapes
[ord('\\')] = '\\\\'
222 escapes
[ord('\t')] = '\\t'
223 escapes
[ord('\r')] = '\\r'
224 escapes
[ord('\n')] = '\\n'
225 escapes
[ord('\"')] = '\\"'
231 for i
in range(len(s
)):
232 s
[i
] = escapes
[ord(s
[i
])]
233 return EMPTYSTRING
.join(s
)
237 # unwrap quotes, safely
238 return eval(s
, {'__builtins__':{}}, {})
242 # This converts the various Python string types into a format that is
243 # appropriate for .po files, namely much closer to C style.
244 lines
= s
.split('\n')
246 s
= '"' + escape(s
) + '"'
250 lines
[-1] = lines
[-1] + '\n'
251 for i
in range(len(lines
)):
252 lines
[i
] = escape(lines
[i
])
254 s
= '""\n"' + lineterm
.join(lines
) + '"'
def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'.

    str -- the string (or any container supporting ``in``) to search
    set -- an iterable of characters to look for

    Returns True as soon as one character of 'set' is found in 'str',
    and False when none is found or when 'set' is empty.
    """
    # The parameter names shadow builtins; they are kept unchanged so that
    # existing callers passing them by keyword continue to work.
    for c in set:
        if c in str:
            return True
    return False
264 def _visit_pyfiles(list, dirname
, names
):
265 """ Helper for getFilesForName().
267 # get extension for python source files
268 if not globals().has_key('_py_ext'):
271 _py_ext
= [triple
[0] for triple
in imp
.get_suffixes() if triple
[2] == imp
.PY_SOURCE
][0]
273 # don't recurse into CVS directories
277 # add all *.py files to list
279 [os
.path
.join(dirname
, file)
281 if os
.path
.splitext(file)[1] == _py_ext
])
284 def _get_modpkg_path(dotted_name
, pathlist
=None):
285 """ Get the filesystem path for a module or a package.
287 Return the file system path to a file for a module,
288 and to a directory for a package. Return None if
289 the name is not found, or is a builtin or extension module.
293 # split off top-most name
294 parts
= dotted_name
.split('.', 1)
297 # we have a dotted path, import top-level package
299 file, pathname
, description
= imp
.find_module(parts
[0], pathlist
)
300 if file: file.close()
304 # check if it's indeed a package
305 if description
[2] == imp
.PKG_DIRECTORY
:
306 # recursively handle the remaining name parts
307 pathname
= _get_modpkg_path(parts
[1], [pathname
])
313 file, pathname
, description
= imp
.find_module(dotted_name
, pathlist
)
314 if file: file.close()
315 if description
[2] not in [imp
.PY_SOURCE
, imp
.PKG_DIRECTORY
]:
323 def getFilesForName(name
):
324 """ Get a list of module files for a filename, a module or package name,
329 if not os
.path
.exists(name
):
330 # check for glob chars
331 if containsAny(name
, "*?[]"):
333 files
= glob
.glob(name
)
336 list.extend(getFilesForName(file))
339 # try to find module or package
340 name
= _get_modpkg_path(name
)
344 if os
.path
.isdir(name
):
345 # find all python files in directory
347 os
.path
.walk(name
, _visit_pyfiles
, list)
349 elif os
.path
.exists(name
):
357 def __init__(self
, options
):
358 self
.__options
= options
360 self
.__state
= self
.__waiting
363 self
.__freshmodule
= 1
364 self
.__curfile
= None
def __call__(self, ttype, tstring, stup, etup, line):
    """Token callback: forward one token to the current state handler.

    The instance is handed to tokenize.tokenize() as its token consumer,
    so this is invoked once per token.

    ttype   -- token type
    tstring -- token text
    stup    -- start position of the token; only its first item is used
               and is passed on to the handler as the line number
    etup    -- end position of the token (unused)
    line    -- the source line containing the token (unused)
    """
    # Debugging aid, left disabled:
    ## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
    ##       'tstring:', tstring
    handler = self.__state
    handler(ttype, tstring, stup[0])
373 def __waiting(self
, ttype
, tstring
, lineno
):
374 opts
= self
.__options
375 # Do docstring extractions, if enabled
376 if opts
.docstrings
and not opts
.nodocstrings
.get(self
.__curfile
):
378 if self
.__freshmodule
:
379 if ttype
== tokenize
.STRING
:
380 self
.__addentry
(safe_eval(tstring
), lineno
, isdocstring
=1)
381 self
.__freshmodule
= 0
382 elif ttype
not in (tokenize
.COMMENT
, tokenize
.NL
):
383 self
.__freshmodule
= 0
386 if ttype
== tokenize
.NAME
and tstring
in ('class', 'def'):
387 self
.__state
= self
.__suiteseen
389 if ttype
== tokenize
.NAME
and tstring
in opts
.keywords
:
390 self
.__state
= self
.__keywordseen
def __suiteseen(self, ttype, tstring, lineno):
    """State handler that is active after a 'class' or 'def' keyword.

    Skips tokens until the colon that opens the suite and then switches
    the state machine to __suitedocstring.  A colon nested inside
    brackets (a lambda or dict literal used as a default value, a slice
    in a base-class expression, ...) does not end the header and is
    ignored; previously such a colon was mistaken for the suite colon
    and the docstring of that definition was lost.

    ttype   -- token type constant from the tokenize module
    tstring -- token text
    lineno  -- line number of the token (unused here)
    """
    if ttype != tokenize.OP:
        return
    # The bracket depth is created lazily so that no other method has to
    # initialise it; it is back at zero whenever a header has been closed.
    try:
        depth = self.__enclosurecount
    except AttributeError:
        depth = 0
    if tstring in ('(', '[', '{'):
        depth += 1
    elif tstring in (')', ']', '}'):
        # never drop below zero, even on unbalanced input
        depth = max(depth - 1, 0)
    elif tstring == ':' and depth == 0:
        # top-level colon: the header is over, a docstring may follow
        self.__state = self.__suitedocstring
    self.__enclosurecount = depth
397 def __suitedocstring(self
, ttype
, tstring
, lineno
):
398 # ignore any intervening noise
399 if ttype
== tokenize
.STRING
:
400 self
.__addentry
(safe_eval(tstring
), lineno
, isdocstring
=1)
401 self
.__state
= self
.__waiting
402 elif ttype
not in (tokenize
.NEWLINE
, tokenize
.INDENT
,
404 # there was no class docstring
405 self
.__state
= self
.__waiting
407 def __keywordseen(self
, ttype
, tstring
, lineno
):
408 if ttype
== tokenize
.OP
and tstring
== '(':
410 self
.__lineno
= lineno
411 self
.__state
= self
.__openseen
413 self
.__state
= self
.__waiting
415 def __openseen(self
, ttype
, tstring
, lineno
):
416 if ttype
== tokenize
.OP
and tstring
== ')':
417 # We've seen the last of the translatable strings. Record the
418 # line number of the first line of the strings and update the list
419 # of messages seen. Reset state for the next batch. If there
420 # were no strings inside _(), then just ignore this entry.
422 self
.__addentry
(EMPTYSTRING
.join(self
.__data
))
423 self
.__state
= self
.__waiting
424 elif ttype
== tokenize
.STRING
:
425 self
.__data
.append(safe_eval(tstring
))
426 elif ttype
not in [tokenize
.COMMENT
, token
.INDENT
, token
.DEDENT
,
427 token
.NEWLINE
, tokenize
.NL
]:
428 # warn if we see anything else than STRING or whitespace
429 print >>sys
.stderr
, _('*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"') % {
430 'token': tstring
, 'file': self
.__curfile
, 'lineno': self
.__lineno
}
431 self
.__state
= self
.__waiting
433 def __addentry(self
, msg
, lineno
=None, isdocstring
=0):
435 lineno
= self
.__lineno
436 if not msg
in self
.__options
.toexclude
:
437 entry
= (self
.__curfile
, lineno
)
438 self
.__messages
.setdefault(msg
, {})[entry
] = isdocstring
def set_filename(self, filename):
    """Record the file about to be tokenized.

    filename -- name stored as the current file; it is used for the
                location of every entry added afterwards and for the
                per-file docstring exclusion lookup

    Also raises the fresh-module flag again so that a leading string in
    the new file can be treated as its module docstring.
    """
    self.__freshmodule = 1
    self.__curfile = filename
445 options
= self
.__options
446 timestamp
= time
.ctime(time
.time())
447 # The time stamp in the header doesn't have the same format as that
448 # generated by xgettext...
449 print >> fp
, pot_header
% {'time': timestamp
, 'version': __version__
}
450 # Sort the entries. First sort each particular entry's keys, then
451 # sort all the entries by their first item.
453 for k
, v
in self
.__messages
.items():
456 reverse
.setdefault(tuple(keys
), []).append((k
, v
))
457 rkeys
= reverse
.keys()
460 rentries
= reverse
[rkey
]
462 for k
, v
in rentries
:
464 # If the entry was gleaned out of a docstring, then add a
465 # comment stating so. This is to aid translators who may wish
466 # to skip translating some unimportant docstrings.
467 if reduce(operator
.__add
__, v
.values()):
469 # k is the message string, v is a dictionary-set of (filename,
470 # lineno) tuples. We want to sort the entries in v first by
471 # file name and then by line number.
474 if not options
.writelocations
:
476 # location comments are different b/w Solaris and GNU:
477 elif options
.locationstyle
== options
.SOLARIS
:
478 for filename
, lineno
in v
:
479 d
= {'filename': filename
, 'lineno': lineno
}
481 '# File: %(filename)s, line: %(lineno)d') % d
482 elif options
.locationstyle
== options
.GNU
:
# fit as many locations on one line, as long as the
# resulting line length doesn't exceed 'options.width'
486 for filename
, lineno
in v
:
487 d
= {'filename': filename
, 'lineno': lineno
}
488 s
= _(' %(filename)s:%(lineno)d') % d
489 if len(locline
) + len(s
) <= options
.width
:
490 locline
= locline
+ s
497 print >> fp
, '#, docstring'
498 print >> fp
, 'msgid', normalize(k
)
499 print >> fp
, 'msgstr ""\n'
504 global default_keywords
506 opts
, args
= getopt
.getopt(
508 'ad:DEhk:Kno:p:S:Vvw:x:X:',
509 ['extract-all', 'default-domain=', 'escape', 'help',
510 'keyword=', 'no-default-keywords',
511 'add-location', 'no-location', 'output=', 'output-dir=',
512 'style=', 'verbose', 'version', 'width=', 'exclude-file=',
513 'docstrings', 'no-docstrings',
515 except getopt
.error
, msg
:
518 # for holding option values
524 extractall
= 0 # FIXME: currently this option has no effect at all.
528 outfile
= 'messages.pot'
538 locations
= {'gnu' : options
.GNU
,
539 'solaris' : options
.SOLARIS
,
543 for opt
, arg
in opts
:
544 if opt
in ('-h', '--help'):
546 elif opt
in ('-a', '--extract-all'):
547 options
.extractall
= 1
548 elif opt
in ('-d', '--default-domain'):
549 options
.outfile
= arg
+ '.pot'
550 elif opt
in ('-E', '--escape'):
552 elif opt
in ('-D', '--docstrings'):
553 options
.docstrings
= 1
554 elif opt
in ('-k', '--keyword'):
555 options
.keywords
.append(arg
)
556 elif opt
in ('-K', '--no-default-keywords'):
557 default_keywords
= []
558 elif opt
in ('-n', '--add-location'):
559 options
.writelocations
= 1
560 elif opt
in ('--no-location',):
561 options
.writelocations
= 0
562 elif opt
in ('-S', '--style'):
563 options
.locationstyle
= locations
.get(arg
.lower())
564 if options
.locationstyle
is None:
565 usage(1, _('Invalid value for --style: %s') % arg
)
566 elif opt
in ('-o', '--output'):
567 options
.outfile
= arg
568 elif opt
in ('-p', '--output-dir'):
569 options
.outpath
= arg
570 elif opt
in ('-v', '--verbose'):
572 elif opt
in ('-V', '--version'):
573 print _('pygettext.py (xgettext for Python) %s') % __version__
575 elif opt
in ('-w', '--width'):
577 options
.width
= int(arg
)
579 usage(1, _('--width argument must be an integer: %s') % arg
)
580 elif opt
in ('-x', '--exclude-file'):
581 options
.excludefilename
= arg
582 elif opt
in ('-X', '--no-docstrings'):
589 options
.nodocstrings
[line
[:-1]] = 1
594 make_escapes(options
.escape
)
596 # calculate all keywords
597 options
.keywords
.extend(default_keywords
)
599 # initialize list of strings to exclude
600 if options
.excludefilename
:
602 fp
= open(options
.excludefilename
)
603 options
.toexclude
= fp
.readlines()
606 print >> sys
.stderr
, _(
607 "Can't read --exclude-file: %s") % options
.excludefilename
610 options
.toexclude
= []
612 # resolve args to module lists
618 expanded
.extend(getFilesForName(arg
))
621 # slurp through all the files
622 eater
= TokenEater(options
)
623 for filename
in args
:
626 print _('Reading standard input')
631 print _('Working on %s') % filename
635 eater
.set_filename(filename
)
637 tokenize
.tokenize(fp
.readline
, eater
)
638 except tokenize
.TokenError
, e
:
639 print >> sys
.stderr
, '%s: %s, line %d, column %d' % (
640 e
[0], filename
, e
[1][0], e
[1][1])
646 if options
.outfile
== '-':
651 options
.outfile
= os
.path
.join(options
.outpath
, options
.outfile
)
652 fp
= open(options
.outfile
, 'w')
661 if __name__
== '__main__':
663 # some more test strings
664 _(u
'a unicode string')
665 _('*** Seen unexpected token "%(token)s"' % {'token': 'test'}) # this one creates a warning
666 _('more' 'than' 'one' 'string')