Tools/i18n/pygettext.py

   1 #! /usr/bin/env python
   2
   3 """pygettext -- Python equivalent of xgettext(1)
   4
   5 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
   6 internationalization of C programs.  Most of these tools are independent of
   7 the programming language and can be used from within Python programs.  Martin
   8 von Loewis' work[1] helps considerably in this regard.
   9
  10 There's one hole though; xgettext is the program that scans source code
  11 looking for message strings, but it groks only C (or C++).  Python introduces
  12 a few wrinkles, such as dual quoting characters, triple quoted strings, and
  13 raw strings.  xgettext understands none of this.
  14
  15 Enter pygettext, which uses Python's standard tokenize module to scan Python
  16 source code, generating .pot files identical to what GNU xgettext[2] generates
  17 for C and C++ code.  From there, the standard GNU tools can be used.
  18
  19 A word about marking Python strings as candidates for translation.  GNU
  20 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
  21 gettext_noop.  But those can be a lot of text to include all over your code.
  22 C and C++ have a trick: they use the C preprocessor.  Most internationalized C
  23 source includes a #define for gettext() to _() so that what has to be written
  24 in the source is much less.  Thus these are both translatable strings:
  25
  26     gettext("Translatable String")
  27     _("Translatable String")
  28
  29 Python of course has no preprocessor so this doesn't work so well.  Thus,
  30 pygettext searches only for _() by default, but see the -k/--keyword flag
  31 below for how to augment this.
  32
  33  [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
  34  [2] http://www.gnu.org/software/gettext/gettext.html
  35
  36
NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
wherever possible.
  39
  40 Usage: pygettext [options] filename ...
  41
  42 Options:
  43
  44     -a
  45     --extract-all
  46         Extract all strings
  47
  48     -d default-domain
  49     --default-domain=default-domain
  50         Rename the default output file from messages.pot to default-domain.pot
  51
  52     -k [word]
  53     --keyword[=word]
  54         Additional keywords to look for.  Without `word' means not to use the
  55         default keywords.  The default keywords, which are always looked for
  56         if not explicitly disabled: _
  57
  58         The default keyword list is different than GNU xgettext. You can have
  59         multiple -k flags on the command line.
  60
  61     --no-location
  62         Do not write filename/lineno location comments
  63
  64     -n [style]
  65     --add-location[=style]
  66         Write filename/lineno location comments indicating where each
  67         extracted string is found in the source.  These lines appear before
  68         each msgid.  Two styles are supported:
  69
  70         Solaris  # File: filename, line: line-number
  71         Gnu      #: filename:line
  72
  73         If style is omitted, Gnu is used.  The style name is case
  74         insensitive.  By default, locations are included.
  75
  76     --help
  77     -h
  78         print this help message and exit
  79
  80 """
  81
  82 import os
  83 import sys
  84 import string
  85 import time
  86 import getopt
  87 import tokenize
  88
  89 __version__ = '0.1'
  90
  91
  92 \f
  93 def usage(code, msg=''):
  94     print __doc__ % globals()
  95     if msg:
  96         print msg
  97     sys.exit(code)
  98
  99
 100 \f
 101 def normalize(s):
 102     # This converts the various Python string types into a format that is
 103     # appropriate for .po files, namely much closer to C style.
 104     #
 105     # unwrap quotes, safely
 106     s = eval(s, {'__builtins__':{}}, {})
 107     # now escape any embedded double quotes
 108     parts = []
 109     last = 0
 110     i = string.find(s, '"')
 111     while i >= 0:
 112         # find the number of preceding backslashes
 113         j = i
 114         n = 0
 115         while j >= 0 and s[i] == '\\':
 116             j = j - 1
 117             n = n + 1
 118         if (n % 2) == 0:
 119             parts.append(s[last:j])
 120             parts.append('\\')
 121             parts.append(s[j:i])
 122         else:
 123             parts.append(s[last:i])
 124         last = i
 125         i = string.find(s, '"', i+1)
 126     else:
 127         parts.append(s[last:])
 128     if parts:
 129         return '"' + string.join(parts, '') + '"'
 130     else:
 131         return '"' + s + '"'
 132
 133
 134 \f
 135 class TokenEater:
 136     def __init__(self, options):
 137         self.__options = options
 138         self.__messages = {}
 139         self.__state = self.__waiting
 140         self.__data = []
 141         self.__lineno = -1
 142
 143     def __call__(self, ttype, tstring, stup, etup, line):
 144         # dispatch
 145         self.__state(ttype, tstring, stup[0])
 146
 147     def __waiting(self, ttype, tstring, lineno):
 148         if ttype == tokenize.NAME and tstring in self.__options.keywords:
 149             self.__state = self.__keywordseen
 150
 151     def __keywordseen(self, ttype, tstring, lineno):
 152         if ttype == tokenize.OP and tstring == '(':
 153             self.__data = []
 154             self.__lineno = lineno
 155             self.__state = self.__openseen
 156         else:
 157             self.__state = self.__waiting
 158
 159     def __openseen(self, ttype, tstring, lineno):
 160         if ttype == tokenize.OP and tstring == ')':
 161             # We've seen the last of the translatable strings.  Record the
 162             # line number of the first line of the strings and update the list
 163             # of messages seen.  Reset state for the next batch.  If there
 164             # were no strings inside _(), then just ignore this entry.
 165             if self.__data:
 166                 msg = string.join(self.__data, '')
 167                 entry = (self.__curfile, self.__lineno)
 168                 linenos = self.__messages.get(msg)
 169                 if linenos is None:
 170                     self.__messages[msg] = [entry]
 171                 else:
 172                     linenos.append(entry)
 173             self.__state = self.__waiting
 174         elif ttype == tokenize.STRING:
 175             self.__data.append(normalize(tstring))
 176         # TBD: should we warn if we seen anything else?
 177
 178     def set_filename(self, filename):
 179         self.__curfile = filename
 180
 181     def write(self, fp):
 182         options = self.__options
 183         timestamp = time.ctime(time.time())
 184         # common header
 185         try:
 186             sys.stdout = fp
 187             print '# POT file generated by pygettext.py', __version__
 188             print '#', timestamp
 189             print '#'
 190             for k, v in self.__messages.items():
 191                 for filename, lineno in v:
 192                     # location comments are different b/w Solaris and GNU
 193                     if options.location == options.SOLARIS:
 194                         print '# File: %s,' % filename, 'line: %d' % lineno
 195                     elif options.location == options.GNU:
 196                         print '#: %s:%d' % (filename, lineno)
 197                 # TBD: sorting, normalizing
 198                 print 'msgid', k
 199                 print 'msgstr '
 200                 print
 201         finally:
 202             sys.stdout = sys.__stdout__
 203
 204 \f
 205 def main():
 206     default_keywords = ['_']
 207     try:
 208         opts, args = getopt.getopt(
 209             sys.argv[1:],
 210             'k:d:n:h',
 211             ['keyword', 'default-domain', 'help',
 212              'add-location=', 'no-location'])
 213     except getopt.error, msg:
 214         usage(1, msg)
 215
 216     # for holding option values
 217     class Options:
 218         # constants
 219         GNU = 1
 220         SOLARIS = 2
 221         # defaults
 222         keywords = []
 223         outfile = 'messages.pot'
 224         location = GNU
 225
 226     options = Options()
 227     locations = {'gnu' : options.GNU,
 228                  'solaris' : options.SOLARIS,
 229                  }
 230
 231     # parse options
 232     for opt, arg in opts:
 233         if opt in ('-h', '--help'):
 234             usage(0)
 235         elif opt in ('-k', '--keyword'):
 236             if arg is None:
 237                 default_keywords = []
 238             options.keywords.append(arg)
 239         elif opt in ('-d', '--default-domain'):
 240             options.outfile = arg + '.pot'
 241         elif opt in ('-n', '--add-location'):
 242             if arg is None:
 243                 arg = 'gnu'
 244             try:
 245                 options.location = locations[string.lower(arg)]
 246             except KeyError:
 247                 usage(1, 'Invalid value for --add-location: ' + arg)
 248         elif opt in ('--no-location',):
 249             options.location = 0
 250
 251     # calculate all keywords
 252     options.keywords.extend(default_keywords)
 253
 254     # slurp through all the files
 255     eater = TokenEater(options)
 256     for filename in args:
 257         fp = open(filename)
 258         eater.set_filename(filename)
 259         tokenize.tokenize(fp.readline, eater)
 260         fp.close()
 261
 262     fp = open(options.outfile, 'w')
 263     eater.write(fp)
 264     fp.close()
 265
 266
 267 \f
 268 if __name__ == '__main__':
 269     main()