Lib/gettext.py

   1 """Internationalization and localization support.
   2
   3 This module provides internationalization (I18N) and localization (L10N)
   4 support for your Python programs by providing an interface to the GNU gettext
   5 message catalog library.
   6
   7 I18N refers to the operation by which a program is made aware of multiple
   8 languages.  L10N refers to the adaptation of your program, once
   9 internationalized, to the local language and cultural habits.
  10
  11 """
  12
  13 # This module represents the integration of work, contributions, feedback, and
  14 # suggestions from the following people:
  15 #
  16 # Martin von Loewis, who wrote the initial implementation of the underlying
  17 # C-based libintlmodule (later renamed _gettext), along with a skeletal
  18 # gettext.py implementation.
  19 #
  20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
  21 # which also included a pure-Python implementation to read .mo files if
  22 # intlmodule wasn't available.
  23 #
  24 # James Henstridge, who also wrote a gettext.py module, which has some
  25 # interesting, but currently unsupported experimental features: the notion of
  26 # a Catalog class and instances, and the ability to add to a catalog file via
  27 # a Python API.
  28 #
  29 # Barry Warsaw integrated these modules, wrote the .install() API and code,
  30 # and conformed all C and Python code to Python's coding standards.
  31 #
  32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
  33 # module.
  34 #
  35 # J. David Ibanez implemented plural forms.
  36 #
  37 # TODO:
  38 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
  39 #   memory, but that's probably bad for large translated programs.  Instead,
  40 #   the lexical sort of original strings in GNU .mo files should be exploited
  41 #   to do binary searches and lazy initializations.  Or you might want to use
  42 #   the undocumented double-hash algorithm for .mo files with hash tables, but
  43 #   you'll need to study the GNU gettext code to do this.
  44 #
  45 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
  46 #   find this format documented anywhere.
  47
  48
  49 import copy, os, re, struct, sys
  50 from errno import ENOENT
  51
  52
  53 __all__ = ["bindtextdomain","textdomain","gettext","dgettext",
  54            "find","translation","install","Catalog"]
  55
  56 _default_localedir = os.path.join(sys.prefix, 'share', 'locale')
  57
  58
  59 def test(condition, true, false):
  60     """
  61     Implements the C expression:
  62
  63       condition ? true : false
  64
  65     Required to correctly interpret plural forms.
  66     """
  67     if condition:
  68         return true
  69     else:
  70         return false
  71
  72
  73 def c2py(plural):
  74     """
  75     Gets a C expression as used in PO files for plural forms and
  76     returns a Python lambda function that implements an equivalent
  77     expression.
  78     """
  79     # Security check, allow only the "n" identifier
  80     from StringIO import StringIO
  81     import token, tokenize
  82     tokens = tokenize.generate_tokens(StringIO(plural).readline)
  83     danger = [ x for x in tokens if x[0] == token.NAME and x[1] != 'n' ]
  84     if danger:
  85         raise ValueError, 'dangerous expression'
  86
  87     # Replace some C operators by their Python equivalents
  88     plural = plural.replace('&&', ' and ')
  89     plural = plural.replace('||', ' or ')
  90
  91     expr = re.compile(r'\![^=]')
  92     plural = expr.sub(' not ', plural)
  93
  94     # Regular expression and replacement function used to transform
  95     # "a?b:c" to "test(a,b,c)".
  96     expr = re.compile(r'(.*?)\?(.*?):(.*)')
  97     def repl(x):
  98         return "test(%s, %s, %s)" % (x.group(1), x.group(2),
  99                                      expr.sub(repl, x.group(3)))
 100
 101     # Code to transform the plural expression, taking care of parentheses
 102     stack = ['']
 103     for c in plural:
 104         if c == '(':
 105             stack.append('')
 106         elif c == ')':
 107             if len(stack) == 0:
 108                 raise ValueError, 'unbalanced parenthesis in plural form'
 109             s = expr.sub(repl, stack.pop())
 110             stack[-1] += '(%s)' % s
 111         else:
 112             stack[-1] += c
 113     plural = expr.sub(repl, stack.pop())
 114
 115     return eval('lambda n: int(%s)' % plural)
 116
 117
 118
 119 def _expand_lang(locale):
 120     from locale import normalize
 121     locale = normalize(locale)
 122     COMPONENT_CODESET   = 1 << 0
 123     COMPONENT_TERRITORY = 1 << 1
 124     COMPONENT_MODIFIER  = 1 << 2
 125     # split up the locale into its base components
 126     mask = 0
 127     pos = locale.find('@')
 128     if pos >= 0:
 129         modifier = locale[pos:]
 130         locale = locale[:pos]
 131         mask |= COMPONENT_MODIFIER
 132     else:
 133         modifier = ''
 134     pos = locale.find('.')
 135     if pos >= 0:
 136         codeset = locale[pos:]
 137         locale = locale[:pos]
 138         mask |= COMPONENT_CODESET
 139     else:
 140         codeset = ''
 141     pos = locale.find('_')
 142     if pos >= 0:
 143         territory = locale[pos:]
 144         locale = locale[:pos]
 145         mask |= COMPONENT_TERRITORY
 146     else:
 147         territory = ''
 148     language = locale
 149     ret = []
 150     for i in range(mask+1):
 151         if not (i & ~mask):  # if all components for this combo exist ...
 152             val = language
 153             if i & COMPONENT_TERRITORY: val += territory
 154             if i & COMPONENT_CODESET:   val += codeset
 155             if i & COMPONENT_MODIFIER:  val += modifier
 156             ret.append(val)
 157     ret.reverse()
 158     return ret
 159
 160
 161
 162 class NullTranslations:
 163     def __init__(self, fp=None):
 164         self._info = {}
 165         self._charset = None
 166         self._fallback = None
 167         if fp is not None:
 168             self._parse(fp)
 169
 170     def _parse(self, fp):
 171         pass
 172
 173     def add_fallback(self, fallback):
 174         if self._fallback:
 175             self._fallback.add_fallback(fallback)
 176         else:
 177             self._fallback = fallback
 178
 179     def gettext(self, message):
 180         if self._fallback:
 181             return self._fallback.gettext(message)
 182         return message
 183
 184     def ngettext(self, msgid1, msgid2, n):
 185         if self._fallback:
 186             return self._fallback.ngettext(msgid1, msgid2, n)
 187         if n == 1:
 188             return msgid1
 189         else:
 190             return msgid2
 191
 192     def ugettext(self, message):
 193         if self._fallback:
 194             return self._fallback.ugettext(message)
 195         return unicode(message)
 196
 197     def ungettext(self, msgid1, msgid2, n):
 198         if self._fallback:
 199             return self._fallback.ungettext(msgid1, msgid2, n)
 200         if n == 1:
 201             return unicode(msgid1)
 202         else:
 203             return unicode(msgid2)
 204
 205     def info(self):
 206         return self._info
 207
 208     def charset(self):
 209         return self._charset
 210
 211     def install(self, unicode=0):
 212         import __builtin__
 213         __builtin__.__dict__['_'] = unicode and self.ugettext or self.gettext
 214
 215
 216 class GNUTranslations(NullTranslations):
 217     # Magic number of .mo files
 218     LE_MAGIC = 0x950412deL
 219     BE_MAGIC = 0xde120495L
 220
 221     def _parse(self, fp):
 222         """Override this method to support alternative .mo formats."""
 223         unpack = struct.unpack
 224         filename = getattr(fp, 'name', '')
 225         # Parse the .mo file header, which consists of 5 little endian 32
 226         # bit words.
 227         self._catalog = catalog = {}
 228         buf = fp.read()
 229         buflen = len(buf)
 230         # Are we big endian or little endian?
 231         magic = unpack('<I', buf[:4])[0]
 232         if magic == self.LE_MAGIC:
 233             version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
 234             ii = '<II'
 235         elif magic == self.BE_MAGIC:
 236             version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
 237             ii = '>II'
 238         else:
 239             raise IOError(0, 'Bad magic number', filename)
 240         # Now put all messages from the .mo file buffer into the catalog
 241         # dictionary.
 242         for i in xrange(0, msgcount):
 243             mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
 244             mend = moff + mlen
 245             tlen, toff = unpack(ii, buf[transidx:transidx+8])
 246             tend = toff + tlen
 247             if mend < buflen and tend < buflen:
 248                 msg = buf[moff:mend]
 249                 tmsg = buf[toff:tend]
 250                 if msg.find('\x00') >= 0:
 251                     # Plural forms
 252                     msgid1, msgid2 = msg.split('\x00')
 253                     tmsg = tmsg.split('\x00')
 254                     for i in range(len(tmsg)):
 255                         catalog[(msgid1, i)] = tmsg[i]
 256                 else:
 257                     catalog[msg] = tmsg
 258             else:
 259                 raise IOError(0, 'File is corrupt', filename)
 260             # See if we're looking at GNU .mo conventions for metadata
 261             if mlen == 0 and tmsg.lower().startswith('project-id-version:'):
 262                 # Catalog description
 263                 for item in tmsg.split('\n'):
 264                     item = item.strip()
 265                     if not item:
 266                         continue
 267                     k, v = item.split(':', 1)
 268                     k = k.strip().lower()
 269                     v = v.strip()
 270                     self._info[k] = v
 271                     if k == 'content-type':
 272                         self._charset = v.split('charset=')[1]
 273                     elif k == 'plural-forms':
 274                         v = v.split(';')
 275 ##                        nplurals = v[0].split('nplurals=')[1]
 276 ##                        nplurals = int(nplurals.strip())
 277                         plural = v[1].split('plural=')[1]
 278                         self.plural = c2py(plural)
 279             # advance to next entry in the seek tables
 280             masteridx += 8
 281             transidx += 8
 282
 283     def gettext(self, message):
 284         try:
 285             return self._catalog[message]
 286         except KeyError:
 287             if self._fallback:
 288                 return self._fallback.gettext(message)
 289             return message
 290
 291
 292     def ngettext(self, msgid1, msgid2, n):
 293         try:
 294             return self._catalog[(msgid1, self.plural(n))]
 295         except KeyError:
 296             if self._fallback:
 297                 return self._fallback.ngettext(msgid1, msgid2, n)
 298             if n == 1:
 299                 return msgid1
 300             else:
 301                 return msgid2
 302
 303
 304     def ugettext(self, message):
 305         try:
 306             tmsg = self._catalog[message]
 307         except KeyError:
 308             if self._fallback:
 309                 return self._fallback.ugettext(message)
 310             tmsg = message
 311         return unicode(tmsg, self._charset)
 312
 313
 314     def ungettext(self, msgid1, msgid2, n):
 315         try:
 316             tmsg = self._catalog[(msgid1, self.plural(n))]
 317         except KeyError:
 318             if self._fallback:
 319                 return self._fallback.ungettext(msgid1, msgid2, n)
 320             if n == 1:
 321                 tmsg = msgid1
 322             else:
 323                 tmsg = msgid2
 324         return unicode(tmsg, self._charset)
 325
 326
 327 # Locate a .mo file using the gettext strategy
 328 def find(domain, localedir=None, languages=None, all=0):
 329     # Get some reasonable defaults for arguments that were not supplied
 330     if localedir is None:
 331         localedir = _default_localedir
 332     if languages is None:
 333         languages = []
 334         for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
 335             val = os.environ.get(envar)
 336             if val:
 337                 languages = val.split(':')
 338                 break
 339         if 'C' not in languages:
 340             languages.append('C')
 341     # now normalize and expand the languages
 342     nelangs = []
 343     for lang in languages:
 344         for nelang in _expand_lang(lang):
 345             if nelang not in nelangs:
 346                 nelangs.append(nelang)
 347     # select a language
 348     if all:
 349         result = []
 350     else:
 351         result = None
 352     for lang in nelangs:
 353         if lang == 'C':
 354             break
 355         mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
 356         if os.path.exists(mofile):
 357             if all:
 358                 result.append(mofile)
 359             else:
 360                 return mofile
 361     return result
 362
 363
 364
 365 # a mapping between absolute .mo file path and Translation object
 366 _translations = {}
 367
 368 def translation(domain, localedir=None, languages=None,
 369                 class_=None, fallback=0):
 370     if class_ is None:
 371         class_ = GNUTranslations
 372     mofiles = find(domain, localedir, languages, all=1)
 373     if len(mofiles)==0:
 374         if fallback:
 375             return NullTranslations()
 376         raise IOError(ENOENT, 'No translation file found for domain', domain)
 377     # TBD: do we need to worry about the file pointer getting collected?
 378     # Avoid opening, reading, and parsing the .mo file after it's been done
 379     # once.
 380     result = None
 381     for mofile in mofiles:
 382         key = os.path.abspath(mofile)
 383         t = _translations.get(key)
 384         if t is None:
 385             t = _translations.setdefault(key, class_(open(mofile, 'rb')))
 386         # Copy the translation object to allow setting fallbacks.
 387         # All other instance data is shared with the cached object.
 388         t = copy.copy(t)
 389         if result is None:
 390             result = t
 391         else:
 392             result.add_fallback(t)
 393     return result
 394
 395
 396 def install(domain, localedir=None, unicode=0):
 397     translation(domain, localedir, fallback=1).install(unicode)
 398
 399
 400
# a mapping between domains and the locale directories bound to them with
# bindtextdomain()
_localedirs = {}
# current global domain, `messages' used for compatibility with GNU gettext;
# read and changed through textdomain()
_current_domain = 'messages'
 405
 406
 407 def textdomain(domain=None):
 408     global _current_domain
 409     if domain is not None:
 410         _current_domain = domain
 411     return _current_domain
 412
 413
 414 def bindtextdomain(domain, localedir=None):
 415     global _localedirs
 416     if localedir is not None:
 417         _localedirs[domain] = localedir
 418     return _localedirs.get(domain, _default_localedir)
 419
 420
 421 def dgettext(domain, message):
 422     try:
 423         t = translation(domain, _localedirs.get(domain, None))
 424     except IOError:
 425         return message
 426     return t.gettext(message)
 427
 428
 429 def dngettext(domain, msgid1, msgid2, n):
 430     try:
 431         t = translation(domain, _localedirs.get(domain, None))
 432     except IOError:
 433         if n == 1:
 434             return msgid1
 435         else:
 436             return msgid2
 437     return t.ngettext(msgid1, msgid2, n)
 438
 439
def gettext(message):
    """Translate message via dgettext() using the current global domain."""
    return dgettext(_current_domain, message)
 442
 443
def ngettext(msgid1, msgid2, n):
    """Plural-aware translation via dngettext() using the current global
    domain."""
    return dngettext(_current_domain, msgid1, msgid2, n)
 446
 447
 448 # dcgettext() has been deemed unnecessary and is not implemented.
 449
 450 # James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
 451 # was:
 452 #
 453 #    import gettext
 454 #    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
 455 #    _ = cat.gettext
 456 #    print _('Hello World')
 457
# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.
 461
# Catalog is simply another name for translation(); calling it returns a
# translation object.
Catalog = translation