Tools/i18n/pygettext.py

   1 #! /usr/bin/env python
   2 # Originally written by Barry Warsaw <bwarsaw@python.org>
   3 #
   4 # minimally patched to make it even more xgettext compatible
   5 # by Peter Funk <pf@artcom-gmbh.de>
   6
   7 """pygettext -- Python equivalent of xgettext(1)
   8
   9 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
  10 internationalization of C programs.  Most of these tools are independent of
  11 the programming language and can be used from within Python programs.  Martin
  12 von Loewis' work[1] helps considerably in this regard.
  13
  14 There's one problem though; xgettext is the program that scans source code
  15 looking for message strings, but it groks only C (or C++).  Python introduces
  16 a few wrinkles, such as dual quoting characters, triple quoted strings, and
  17 raw strings.  xgettext understands none of this.
  18
  19 Enter pygettext, which uses Python's standard tokenize module to scan Python
  20 source code, generating .pot files identical to what GNU xgettext[2] generates
  21 for C and C++ code.  From there, the standard GNU tools can be used.
  22
  23 A word about marking Python strings as candidates for translation.  GNU
  24 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
  25 gettext_noop.  But those can be a lot of text to include all over your code.
  26 C and C++ have a trick: they use the C preprocessor.  Most internationalized C
  27 source includes a #define for gettext() to _() so that what has to be written
  28 in the source is much less.  Thus these are both translatable strings:
  29
  30     gettext("Translatable String")
  31     _("Translatable String")
  32
  33 Python of course has no preprocessor so this doesn't work so well.  Thus,
  34 pygettext searches only for _() by default, but see the -k/--keyword flag
  35 below for how to augment this.
  36
  37  [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
  38  [2] http://www.gnu.org/software/gettext/gettext.html
  39
  40 NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
   41 wherever possible.  However, some options are still missing or are not fully
  42 implemented.  Also, xgettext's use of command line switches with option
  43 arguments is broken, and in these cases, pygettext just defines additional
  44 switches.
  45
  46 Usage: pygettext [options] inputfile ...
  47
  48 Options:
  49
  50     -a
  51     --extract-all
  52         Extract all strings
  53
  54     -d name
  55     --default-domain=name
  56         Rename the default output file from messages.pot to name.pot
  57
  58     -E
  59     --escape
  60         Replace non-ASCII characters with octal escape sequences.
  61
  62     -D
  63     --docstrings
  64         Extract module, class, method, and function docstrings.  These do not
  65         need to be wrapped in _() markers, and in fact cannot be for Python to
  66         consider them docstrings.
  67
  68     -h
  69     --help
  70         print this help message and exit
  71
  72     -k word
  73     --keyword=word
  74         Keywords to look for in addition to the default set, which are:
  75         %(DEFAULTKEYWORDS)s
  76
  77         You can have multiple -k flags on the command line.
  78
  79     -K
  80     --no-default-keywords
  81         Disable the default set of keywords (see above).  Any keywords
  82         explicitly added with the -k/--keyword option are still recognized.
  83
  84     --no-location
  85         Do not write filename/lineno location comments.
  86
  87     -n
  88     --add-location
  89         Write filename/lineno location comments indicating where each
  90         extracted string is found in the source.  These lines appear before
  91         each msgid.  The style of comments is controlled by the -S/--style
  92         option.  This is the default.
  93
  94     -o filename
  95     --output=filename
  96         Rename the default output file from messages.pot to filename.  If
  97         filename is `-' then the output is sent to standard out.
  98
  99     -p dir
 100     --output-dir=dir
 101         Output files will be placed in directory dir.
 102
 103     -S stylename
 104     --style stylename
 105         Specify which style to use for location comments.  Two styles are
 106         supported:
 107
 108         Solaris  # File: filename, line: line-number
 109         GNU      #: filename:line
 110
 111         The style name is case insensitive.  GNU style is the default.
 112
 113     -v
 114     --verbose
 115         Print the names of the files being processed.
 116
 117     -V
 118     --version
 119         Print the version of pygettext and exit.
 120
 121     -w columns
 122     --width=columns
 123         Set width of output to columns.
 124
 125     -x filename
 126     --exclude-file=filename
  127         Specify a file that contains a list of strings that are not to be
 128         extracted from the input files.  Each string to be excluded must
 129         appear on a line by itself in the file.
 130
 131 If `inputfile' is -, standard input is read.
 132
 133 """
 134
 135 import os
 136 import sys
 137 import time
 138 import getopt
 139 import tokenize
 140
 141 # for selftesting
 142 try:
 143     import fintl
 144     _ = fintl.gettext
 145 except ImportError:
 146     def _(s): return s
 147
 148 __version__ = '1.2'
 149
 150 default_keywords = ['_']
 151 DEFAULTKEYWORDS = ', '.join(default_keywords)
 152
 153 EMPTYSTRING = ''
 154
 155
 156 \f
 157 # The normal pot-file header. msgmerge and EMACS' po-mode work better if
 158 # it's there.
 159 pot_header = _('''\
 160 # SOME DESCRIPTIVE TITLE.
 161 # Copyright (C) YEAR ORGANIZATION
 162 # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
 163 #
 164 msgid ""
 165 msgstr ""
 166 "Project-Id-Version: PACKAGE VERSION\\n"
 167 "PO-Revision-Date: %(time)s\\n"
 168 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
 169 "Language-Team: LANGUAGE <LL@li.org>\\n"
 170 "MIME-Version: 1.0\\n"
 171 "Content-Type: text/plain; charset=CHARSET\\n"
 172 "Content-Transfer-Encoding: ENCODING\\n"
 173 "Generated-By: pygettext.py %(version)s\\n"
 174
 175 ''')
 176
 177 \f
 178 def usage(code, msg=''):
 179     print >> sys.stderr, _(__doc__) % globals()
 180     if msg:
 181         print >> sys.stderr, msg
 182     sys.exit(code)
 183
 184
 185 \f
 186 escapes = []
 187
 188 def make_escapes(pass_iso8859):
 189     global escapes
 190     if pass_iso8859:
 191         # Allow iso-8859 characters to pass through so that e.g. 'msgid
 192         # "Höhe"' would result not result in 'msgid "H\366he"'.  Otherwise we
 193         # escape any character outside the 32..126 range.
 194         mod = 128
 195     else:
 196         mod = 256
 197     for i in range(256):
 198         if 32 <= (i % mod) <= 126:
 199             escapes.append(chr(i))
 200         else:
 201             escapes.append("\\%03o" % i)
 202     escapes[ord('\\')] = '\\\\'
 203     escapes[ord('\t')] = '\\t'
 204     escapes[ord('\r')] = '\\r'
 205     escapes[ord('\n')] = '\\n'
 206     escapes[ord('\"')] = '\\"'
 207
 208
 209 def escape(s):
 210     global escapes
 211     s = list(s)
 212     for i in range(len(s)):
 213         s[i] = escapes[ord(s[i])]
 214     return EMPTYSTRING.join(s)
 215
 216
 217 def safe_eval(s):
 218     # unwrap quotes, safely
 219     return eval(s, {'__builtins__':{}}, {})
 220
 221
 222 def normalize(s):
 223     # This converts the various Python string types into a format that is
 224     # appropriate for .po files, namely much closer to C style.
 225     lines = s.split('\n')
 226     if len(lines) == 1:
 227         s = '"' + escape(s) + '"'
 228     else:
 229         if not lines[-1]:
 230             del lines[-1]
 231             lines[-1] = lines[-1] + '\n'
 232         for i in range(len(lines)):
 233             lines[i] = escape(lines[i])
 234         lineterm = '\\n"\n"'
 235         s = '""\n"' + lineterm.join(lines) + '"'
 236     return s
 237
 238
 239 \f
 240 class TokenEater:
 241     def __init__(self, options):
 242         self.__options = options
 243         self.__messages = {}
 244         self.__state = self.__waiting
 245         self.__data = []
 246         self.__lineno = -1
 247         self.__freshmodule = 1
 248
 249     def __call__(self, ttype, tstring, stup, etup, line):
 250         # dispatch
 251 ##        import token
 252 ##        print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
 253 ##              'tstring:', tstring
 254         self.__state(ttype, tstring, stup[0])
 255
 256     def __waiting(self, ttype, tstring, lineno):
 257         # Do docstring extractions, if enabled
 258         if self.__options.docstrings:
 259             # module docstring?
 260             if self.__freshmodule:
 261                 if ttype == tokenize.STRING:
 262                     self.__addentry(safe_eval(tstring), lineno)
 263                     self.__freshmodule = 0
 264                 elif ttype not in (tokenize.COMMENT, tokenize.NL):
 265                     self.__freshmodule = 0
 266                 return
 267             # class docstring?
 268             if ttype == tokenize.NAME and tstring in ('class', 'def'):
 269                 self.__state = self.__suiteseen
 270                 return
 271         if ttype == tokenize.NAME and tstring in self.__options.keywords:
 272             self.__state = self.__keywordseen
 273
 274     def __suiteseen(self, ttype, tstring, lineno):
 275         # ignore anything until we see the colon
 276         if ttype == tokenize.OP and tstring == ':':
 277             self.__state = self.__suitedocstring
 278
 279     def __suitedocstring(self, ttype, tstring, lineno):
 280         # ignore any intervening noise
 281         if ttype == tokenize.STRING:
 282             self.__addentry(safe_eval(tstring), lineno)
 283             self.__state = self.__waiting
 284         elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
 285                            tokenize.COMMENT):
 286             # there was no class docstring
 287             self.__state = self.__waiting
 288
 289     def __keywordseen(self, ttype, tstring, lineno):
 290         if ttype == tokenize.OP and tstring == '(':
 291             self.__data = []
 292             self.__lineno = lineno
 293             self.__state = self.__openseen
 294         else:
 295             self.__state = self.__waiting
 296
 297     def __openseen(self, ttype, tstring, lineno):
 298         if ttype == tokenize.OP and tstring == ')':
 299             # We've seen the last of the translatable strings.  Record the
 300             # line number of the first line of the strings and update the list
 301             # of messages seen.  Reset state for the next batch.  If there
 302             # were no strings inside _(), then just ignore this entry.
 303             if self.__data:
 304                 self.__addentry(EMPTYSTRING.join(self.__data))
 305             self.__state = self.__waiting
 306         elif ttype == tokenize.STRING:
 307             self.__data.append(safe_eval(tstring))
 308         # TBD: should we warn if we seen anything else?
 309
 310     def __addentry(self, msg, lineno=None):
 311         if lineno is None:
 312             lineno = self.__lineno
 313         if not msg in self.__options.toexclude:
 314             entry = (self.__curfile, lineno)
 315             self.__messages.setdefault(msg, []).append(entry)
 316
 317     def set_filename(self, filename):
 318         self.__curfile = filename
 319
 320     def write(self, fp):
 321         options = self.__options
 322         timestamp = time.ctime(time.time())
 323         # The time stamp in the header doesn't have the same format as that
 324         # generated by xgettext...
 325         print >> fp, pot_header % {'time': timestamp, 'version': __version__}
 326         for k, v in self.__messages.items():
 327             if not options.writelocations:
 328                 pass
 329             # location comments are different b/w Solaris and GNU:
 330             elif options.locationstyle == options.SOLARIS:
 331                 for filename, lineno in v:
 332                     d = {'filename': filename, 'lineno': lineno}
 333                     print >>fp, _('# File: %(filename)s, line: %(lineno)d') % d
 334             elif options.locationstyle == options.GNU:
 335                 # fit as many locations on one line, as long as the
 336                 # resulting line length doesn't exceeds 'options.width'
 337                 locline = '#:'
 338                 for filename, lineno in v:
 339                     d = {'filename': filename, 'lineno': lineno}
 340                     s = _(' %(filename)s:%(lineno)d') % d
 341                     if len(locline) + len(s) <= options.width:
 342                         locline = locline + s
 343                     else:
 344                         print >> fp, locline
 345                         locline = "#:" + s
 346                 if len(locline) > 2:
 347                     print >> fp, locline
 348             # TBD: sorting, normalizing
 349             print >> fp, 'msgid', normalize(k)
 350             print >> fp, 'msgstr ""\n'
 351
 352
 353 \f
 354 def main():
 355     global default_keywords
 356     try:
 357         opts, args = getopt.getopt(
 358             sys.argv[1:],
 359             'ad:DEhk:Kno:p:S:Vvw:x:',
 360             ['extract-all', 'default-domain', 'escape', 'help',
 361              'keyword=', 'no-default-keywords',
 362              'add-location', 'no-location', 'output=', 'output-dir=',
 363              'style=', 'verbose', 'version', 'width=', 'exclude-file=',
 364              'docstrings',
 365              ])
 366     except getopt.error, msg:
 367         usage(1, msg)
 368
 369     # for holding option values
 370     class Options:
 371         # constants
 372         GNU = 1
 373         SOLARIS = 2
 374         # defaults
 375         extractall = 0 # FIXME: currently this option has no effect at all.
 376         escape = 0
 377         keywords = []
 378         outpath = ''
 379         outfile = 'messages.pot'
 380         writelocations = 1
 381         locationstyle = GNU
 382         verbose = 0
 383         width = 78
 384         excludefilename = ''
 385         docstrings = 0
 386
 387     options = Options()
 388     locations = {'gnu' : options.GNU,
 389                  'solaris' : options.SOLARIS,
 390                  }
 391
 392     # parse options
 393     for opt, arg in opts:
 394         if opt in ('-h', '--help'):
 395             usage(0)
 396         elif opt in ('-a', '--extract-all'):
 397             options.extractall = 1
 398         elif opt in ('-d', '--default-domain'):
 399             options.outfile = arg + '.pot'
 400         elif opt in ('-E', '--escape'):
 401             options.escape = 1
 402         elif opt in ('-D', '--docstrings'):
 403             options.docstrings = 1
 404         elif opt in ('-k', '--keyword'):
 405             options.keywords.append(arg)
 406         elif opt in ('-K', '--no-default-keywords'):
 407             default_keywords = []
 408         elif opt in ('-n', '--add-location'):
 409             options.writelocations = 1
 410         elif opt in ('--no-location',):
 411             options.writelocations = 0
 412         elif opt in ('-S', '--style'):
 413             options.locationstyle = locations.get(arg.lower())
 414             if options.locationstyle is None:
 415                 usage(1, _('Invalid value for --style: %s') % arg)
 416         elif opt in ('-o', '--output'):
 417             options.outfile = arg
 418         elif opt in ('-p', '--output-dir'):
 419             options.outpath = arg
 420         elif opt in ('-v', '--verbose'):
 421             options.verbose = 1
 422         elif opt in ('-V', '--version'):
 423             print _('pygettext.py (xgettext for Python) %s') % __version__
 424             sys.exit(0)
 425         elif opt in ('-w', '--width'):
 426             try:
 427                 options.width = int(arg)
 428             except ValueError:
 429                 usage(1, _('--width argument must be an integer: %s') % arg)
 430         elif opt in ('-x', '--exclude-file'):
 431             options.excludefilename = arg
 432
 433     # calculate escapes
 434     make_escapes(options.escape)
 435
 436     # calculate all keywords
 437     options.keywords.extend(default_keywords)
 438
 439     # initialize list of strings to exclude
 440     if options.excludefilename:
 441         try:
 442             fp = open(options.excludefilename)
 443             options.toexclude = fp.readlines()
 444             fp.close()
 445         except IOError:
 446             sys.stderr.write(_("Can't read --exclude-file: %s") %
 447                              options.excludefilename)
 448             sys.exit(1)
 449     else:
 450         options.toexclude = []
 451
 452     # slurp through all the files
 453     eater = TokenEater(options)
 454     for filename in args:
 455         if filename == '-':
 456             if options.verbose:
 457                 print _('Reading standard input')
 458             fp = sys.stdin
 459             closep = 0
 460         else:
 461             if options.verbose:
 462                 print _('Working on %s') % filename
 463             fp = open(filename)
 464             closep = 1
 465         try:
 466             eater.set_filename(filename)
 467             try:
 468                 tokenize.tokenize(fp.readline, eater)
 469             except tokenize.TokenError, e:
 470                 sys.stderr.write('%s: %s, line %d, column %d\n' %
 471                                  (e[0], filename, e[1][0], e[1][1]))
 472         finally:
 473             if closep:
 474                 fp.close()
 475
 476     # write the output
 477     if options.outfile == '-':
 478         fp = sys.stdout
 479         closep = 0
 480     else:
 481         if options.outpath:
 482             options.outfile = os.path.join(options.outpath, options.outfile)
 483         fp = open(options.outfile, 'w')
 484         closep = 1
 485     try:
 486         eater.write(fp)
 487     finally:
 488         if closep:
 489             fp.close()
 490
 491 \f
# Script entry point: main() runs only when the file is executed directly.
if __name__ == '__main__':
    main()
    # some more test strings: a marked unicode literal whose result is
    # discarded -- presumably present so that running pygettext over its own
    # source finds a u'' string to extract (see "for selftesting" above).
    _(u'a unicode string')