Lib/gettext.py

   1 """Internationalization and localization support.
   2
   3 This module provides internationalization (I18N) and localization (L10N)
   4 support for your Python programs by providing an interface to the GNU gettext
   5 message catalog library.
   6
   7 I18N refers to the operation by which a program is made aware of multiple
   8 languages.  L10N refers to the adaptation of your program, once
   9 internationalized, to the local language and cultural habits.
  10
  11 """
  12
  13 # This module represents the integration of work, contributions, feedback, and
  14 # suggestions from the following people:
  15 #
  16 # Martin von Loewis, who wrote the initial implementation of the underlying
  17 # C-based libintlmodule (later renamed _gettext), along with a skeletal
  18 # gettext.py implementation.
  19 #
  20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
  21 # which also included a pure-Python implementation to read .mo files if
  22 # intlmodule wasn't available.
  23 #
  24 # James Henstridge, who also wrote a gettext.py module, which has some
  25 # interesting, but currently unsupported experimental features: the notion of
  26 # a Catalog class and instances, and the ability to add to a catalog file via
  27 # a Python API.
  28 #
  29 # Barry Warsaw integrated these modules, wrote the .install() API and code,
  30 # and conformed all C and Python code to Python's coding standards.
  31 #
  32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
  33 # module.
  34 #
  35 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
  36 #
  37 # TODO:
  38 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
  39 #   memory, but that's probably bad for large translated programs.  Instead,
  40 #   the lexical sort of original strings in GNU .mo files should be exploited
  41 #   to do binary searches and lazy initializations.  Or you might want to use
  42 #   the undocumented double-hash algorithm for .mo files with hash tables, but
  43 #   you'll need to study the GNU gettext code to do this.
  44 #
  45 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
  46 #   find this format documented anywhere.
  47
  48
  49 import locale, copy, os, re, struct, sys
  50 from errno import ENOENT
  51
  52
  53 __all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
  54            'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
  55            'dgettext', 'dngettext', 'gettext', 'ngettext',
  56            ]
  57
  58 _default_localedir = os.path.join(sys.prefix, 'share', 'locale')
  59
  60
  61 def test(condition, true, false):
  62     """
  63     Implements the C expression:
  64
  65       condition ? true : false
  66
  67     Required to correctly interpret plural forms.
  68     """
  69     if condition:
  70         return true
  71     else:
  72         return false
  73
  74
  75 def c2py(plural):
  76     """Gets a C expression as used in PO files for plural forms and returns a
  77     Python lambda function that implements an equivalent expression.
  78     """
  79     # Security check, allow only the "n" identifier
  80     from io import StringIO
  81     import token, tokenize
  82     tokens = tokenize.generate_tokens(StringIO(plural).readline)
  83     try:
  84         danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
  85     except tokenize.TokenError:
  86         raise ValueError('plural forms expression error, maybe unbalanced parenthesis')
  87     else:
  88         if danger:
  89             raise ValueError('plural forms expression could be dangerous')
  90
  91     # Replace some C operators by their Python equivalents
  92     plural = plural.replace('&&', ' and ')
  93     plural = plural.replace('||', ' or ')
  94
  95     expr = re.compile(r'\!([^=])')
  96     plural = expr.sub(' not \\1', plural)
  97
  98     # Regular expression and replacement function used to transform
  99     # "a?b:c" to "test(a,b,c)".
 100     expr = re.compile(r'(.*?)\?(.*?):(.*)')
 101     def repl(x):
 102         return "test(%s, %s, %s)" % (x.group(1), x.group(2),
 103                                      expr.sub(repl, x.group(3)))
 104
 105     # Code to transform the plural expression, taking care of parentheses
 106     stack = ['']
 107     for c in plural:
 108         if c == '(':
 109             stack.append('')
 110         elif c == ')':
 111             if len(stack) == 1:
 112                 # Actually, we never reach this code, because unbalanced
 113                 # parentheses get caught in the security check at the
 114                 # beginning.
 115                 raise ValueError('unbalanced parenthesis in plural form')
 116             s = expr.sub(repl, stack.pop())
 117             stack[-1] += '(%s)' % s
 118         else:
 119             stack[-1] += c
 120     plural = expr.sub(repl, stack.pop())
 121
 122     return eval('lambda n: int(%s)' % plural)
 123
 124
 125
 126 def _expand_lang(locale):
 127     from locale import normalize
 128     locale = normalize(locale)
 129     COMPONENT_CODESET   = 1 << 0
 130     COMPONENT_TERRITORY = 1 << 1
 131     COMPONENT_MODIFIER  = 1 << 2
 132     # split up the locale into its base components
 133     mask = 0
 134     pos = locale.find('@')
 135     if pos >= 0:
 136         modifier = locale[pos:]
 137         locale = locale[:pos]
 138         mask |= COMPONENT_MODIFIER
 139     else:
 140         modifier = ''
 141     pos = locale.find('.')
 142     if pos >= 0:
 143         codeset = locale[pos:]
 144         locale = locale[:pos]
 145         mask |= COMPONENT_CODESET
 146     else:
 147         codeset = ''
 148     pos = locale.find('_')
 149     if pos >= 0:
 150         territory = locale[pos:]
 151         locale = locale[:pos]
 152         mask |= COMPONENT_TERRITORY
 153     else:
 154         territory = ''
 155     language = locale
 156     ret = []
 157     for i in range(mask+1):
 158         if not (i & ~mask):  # if all components for this combo exist ...
 159             val = language
 160             if i & COMPONENT_TERRITORY: val += territory
 161             if i & COMPONENT_CODESET:   val += codeset
 162             if i & COMPONENT_MODIFIER:  val += modifier
 163             ret.append(val)
 164     ret.reverse()
 165     return ret
 166
 167
 168
 169 class NullTranslations:
 170     def __init__(self, fp=None):
 171         self._info = {}
 172         self._charset = None
 173         self._output_charset = None
 174         self._fallback = None
 175         if fp is not None:
 176             self._parse(fp)
 177
 178     def _parse(self, fp):
 179         pass
 180
 181     def add_fallback(self, fallback):
 182         if self._fallback:
 183             self._fallback.add_fallback(fallback)
 184         else:
 185             self._fallback = fallback
 186
 187     def gettext(self, message):
 188         if self._fallback:
 189             return self._fallback.gettext(message)
 190         return message
 191
 192     def lgettext(self, message):
 193         if self._fallback:
 194             return self._fallback.lgettext(message)
 195         return message
 196
 197     def ngettext(self, msgid1, msgid2, n):
 198         if self._fallback:
 199             return self._fallback.ngettext(msgid1, msgid2, n)
 200         if n == 1:
 201             return msgid1
 202         else:
 203             return msgid2
 204
 205     def lngettext(self, msgid1, msgid2, n):
 206         if self._fallback:
 207             return self._fallback.lngettext(msgid1, msgid2, n)
 208         if n == 1:
 209             return msgid1
 210         else:
 211             return msgid2
 212
 213     def ugettext(self, message):
 214         if self._fallback:
 215             return self._fallback.ugettext(message)
 216         return str(message)
 217
 218     def ungettext(self, msgid1, msgid2, n):
 219         if self._fallback:
 220             return self._fallback.ungettext(msgid1, msgid2, n)
 221         if n == 1:
 222             return str(msgid1)
 223         else:
 224             return str(msgid2)
 225
 226     def info(self):
 227         return self._info
 228
 229     def charset(self):
 230         return self._charset
 231
 232     def output_charset(self):
 233         return self._output_charset
 234
 235     def set_output_charset(self, charset):
 236         self._output_charset = charset
 237
 238     def install(self, str=False, names=None):
 239         import builtins
 240         builtins.__dict__['_'] = str and self.ugettext or self.gettext
 241         if hasattr(names, "__contains__"):
 242             if "gettext" in names:
 243                 builtins.__dict__['gettext'] = builtins.__dict__['_']
 244             if "ngettext" in names:
 245                 builtins.__dict__['ngettext'] = (str and self.ungettext
 246                                                              or self.ngettext)
 247             if "lgettext" in names:
 248                 builtins.__dict__['lgettext'] = self.lgettext
 249             if "lngettext" in names:
 250                 builtins.__dict__['lngettext'] = self.lngettext
 251
 252
 253 class GNUTranslations(NullTranslations):
 254     # Magic number of .mo files
 255     LE_MAGIC = 0x950412de
 256     BE_MAGIC = 0xde120495
 257
 258     def _parse(self, fp):
 259         """Override this method to support alternative .mo formats."""
 260         unpack = struct.unpack
 261         filename = getattr(fp, 'name', '')
 262         # Parse the .mo file header, which consists of 5 little endian 32
 263         # bit words.
 264         self._catalog = catalog = {}
 265         self.plural = lambda n: int(n != 1) # germanic plural by default
 266         buf = fp.read()
 267         buflen = len(buf)
 268         # Are we big endian or little endian?
 269         magic = unpack('<I', buf[:4])[0]
 270         if magic == self.LE_MAGIC:
 271             version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
 272             ii = '<II'
 273         elif magic == self.BE_MAGIC:
 274             version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
 275             ii = '>II'
 276         else:
 277             raise IOError(0, 'Bad magic number', filename)
 278         # Now put all messages from the .mo file buffer into the catalog
 279         # dictionary.
 280         for i in range(0, msgcount):
 281             mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
 282             mend = moff + mlen
 283             tlen, toff = unpack(ii, buf[transidx:transidx+8])
 284             tend = toff + tlen
 285             if mend < buflen and tend < buflen:
 286                 msg = buf[moff:mend]
 287                 tmsg = buf[toff:tend]
 288             else:
 289                 raise IOError(0, 'File is corrupt', filename)
 290             # See if we're looking at GNU .mo conventions for metadata
 291             if mlen == 0:
 292                 # Catalog description
 293                 lastk = k = None
 294                 for b_item in tmsg.split('\n'.encode("ascii")):
 295                     item = b_item.decode().strip()
 296                     if not item:
 297                         continue
 298                     if ':' in item:
 299                         k, v = item.split(':', 1)
 300                         k = k.strip().lower()
 301                         v = v.strip()
 302                         self._info[k] = v
 303                         lastk = k
 304                     elif lastk:
 305                         self._info[lastk] += '\n' + item
 306                     if k == 'content-type':
 307                         self._charset = v.split('charset=')[1]
 308                     elif k == 'plural-forms':
 309                         v = v.split(';')
 310                         plural = v[1].split('plural=')[1]
 311                         self.plural = c2py(plural)
 312             # Note: we unconditionally convert both msgids and msgstrs to
 313             # Unicode using the character encoding specified in the charset
 314             # parameter of the Content-Type header.  The gettext documentation
 315             # strongly encourages msgids to be us-ascii, but some appliations
 316             # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
 317             # traditional gettext applications, the msgid conversion will
 318             # cause no problems since us-ascii should always be a subset of
 319             # the charset encoding.  We may want to fall back to 8-bit msgids
 320             # if the Unicode conversion fails.
 321             if b'\x00' in msg:
 322                 # Plural forms
 323                 msgid1, msgid2 = msg.split(b'\x00')
 324                 tmsg = tmsg.split(b'\x00')
 325                 if self._charset:
 326                     msgid1 = str(msgid1, self._charset)
 327                     tmsg = [str(x, self._charset) for x in tmsg]
 328                 else:
 329                     msgid1 = str(msgid1)
 330                     tmsg = [str(x) for x in tmsg]
 331                 for i in range(len(tmsg)):
 332                     catalog[(msgid1, i)] = tmsg[i]
 333             else:
 334                 if self._charset:
 335                     msg = str(msg, self._charset)
 336                     tmsg = str(tmsg, self._charset)
 337                 else:
 338                     msg = str(msg)
 339                     tmsg = str(tmsg)
 340                 catalog[msg] = tmsg
 341             # advance to next entry in the seek tables
 342             masteridx += 8
 343             transidx += 8
 344
 345     def lgettext(self, message):
 346         missing = object()
 347         tmsg = self._catalog.get(message, missing)
 348         if tmsg is missing:
 349             if self._fallback:
 350                 return self._fallback.lgettext(message)
 351             return message
 352         if self._output_charset:
 353             return tmsg.encode(self._output_charset)
 354         return tmsg.encode(locale.getpreferredencoding())
 355
 356     def lngettext(self, msgid1, msgid2, n):
 357         try:
 358             tmsg = self._catalog[(msgid1, self.plural(n))]
 359             if self._output_charset:
 360                 return tmsg.encode(self._output_charset)
 361             return tmsg.encode(locale.getpreferredencoding())
 362         except KeyError:
 363             if self._fallback:
 364                 return self._fallback.lngettext(msgid1, msgid2, n)
 365             if n == 1:
 366                 return msgid1
 367             else:
 368                 return msgid2
 369
 370     def ugettext(self, message):
 371         missing = object()
 372         tmsg = self._catalog.get(message, missing)
 373         if tmsg is missing:
 374             if self._fallback:
 375                 return self._fallback.ugettext(message)
 376             return str(message)
 377         return tmsg
 378
 379     gettext = ugettext
 380
 381     def ungettext(self, msgid1, msgid2, n):
 382         try:
 383             tmsg = self._catalog[(msgid1, self.plural(n))]
 384         except KeyError:
 385             if self._fallback:
 386                 return self._fallback.ungettext(msgid1, msgid2, n)
 387             if n == 1:
 388                 tmsg = str(msgid1)
 389             else:
 390                 tmsg = str(msgid2)
 391         return tmsg
 392
 393     ngettext = ungettext
 394
 395
 396 # Locate a .mo file using the gettext strategy
 397 def find(domain, localedir=None, languages=None, all=0):
 398     # Get some reasonable defaults for arguments that were not supplied
 399     if localedir is None:
 400         localedir = _default_localedir
 401     if languages is None:
 402         languages = []
 403         for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
 404             val = os.environ.get(envar)
 405             if val:
 406                 languages = val.split(':')
 407                 break
 408         if 'C' not in languages:
 409             languages.append('C')
 410     # now normalize and expand the languages
 411     nelangs = []
 412     for lang in languages:
 413         for nelang in _expand_lang(lang):
 414             if nelang not in nelangs:
 415                 nelangs.append(nelang)
 416     # select a language
 417     if all:
 418         result = []
 419     else:
 420         result = None
 421     for lang in nelangs:
 422         if lang == 'C':
 423             break
 424         mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
 425         if os.path.exists(mofile):
 426             if all:
 427                 result.append(mofile)
 428             else:
 429                 return mofile
 430     return result
 431
 432
 433
 434 # a mapping between absolute .mo file path and Translation object
 435 _translations = {}
 436
 437 def translation(domain, localedir=None, languages=None,
 438                 class_=None, fallback=False, codeset=None):
 439     if class_ is None:
 440         class_ = GNUTranslations
 441     mofiles = find(domain, localedir, languages, all=1)
 442     if not mofiles:
 443         if fallback:
 444             return NullTranslations()
 445         raise IOError(ENOENT, 'No translation file found for domain', domain)
 446     # TBD: do we need to worry about the file pointer getting collected?
 447     # Avoid opening, reading, and parsing the .mo file after it's been done
 448     # once.
 449     result = None
 450     for mofile in mofiles:
 451         key = os.path.abspath(mofile)
 452         t = _translations.get(key)
 453         if t is None:
 454             t = _translations.setdefault(key, class_(open(mofile, 'rb')))
 455         # Copy the translation object to allow setting fallbacks and
 456         # output charset. All other instance data is shared with the
 457         # cached object.
 458         t = copy.copy(t)
 459         if codeset:
 460             t.set_output_charset(codeset)
 461         if result is None:
 462             result = t
 463         else:
 464             result.add_fallback(t)
 465     return result
 466
 467
 468 def install(domain, localedir=None, str=False, codeset=None, names=None):
 469     t = translation(domain, localedir, fallback=True, codeset=codeset)
 470     t.install(str, names)
 471
 472
 473
 474 # a mapping b/w domains and locale directories
 475 _localedirs = {}
 476 # a mapping b/w domains and codesets
 477 _localecodesets = {}
 478 # current global domain, `messages' used for compatibility w/ GNU gettext
 479 _current_domain = 'messages'
 480
 481
 482 def textdomain(domain=None):
 483     global _current_domain
 484     if domain is not None:
 485         _current_domain = domain
 486     return _current_domain
 487
 488
 489 def bindtextdomain(domain, localedir=None):
 490     global _localedirs
 491     if localedir is not None:
 492         _localedirs[domain] = localedir
 493     return _localedirs.get(domain, _default_localedir)
 494
 495
 496 def bind_textdomain_codeset(domain, codeset=None):
 497     global _localecodesets
 498     if codeset is not None:
 499         _localecodesets[domain] = codeset
 500     return _localecodesets.get(domain)
 501
 502
 503 def dgettext(domain, message):
 504     try:
 505         t = translation(domain, _localedirs.get(domain, None),
 506                         codeset=_localecodesets.get(domain))
 507     except IOError:
 508         return message
 509     return t.gettext(message)
 510
 511 def ldgettext(domain, message):
 512     try:
 513         t = translation(domain, _localedirs.get(domain, None),
 514                         codeset=_localecodesets.get(domain))
 515     except IOError:
 516         return message
 517     return t.lgettext(message)
 518
 519 def dngettext(domain, msgid1, msgid2, n):
 520     try:
 521         t = translation(domain, _localedirs.get(domain, None),
 522                         codeset=_localecodesets.get(domain))
 523     except IOError:
 524         if n == 1:
 525             return msgid1
 526         else:
 527             return msgid2
 528     return t.ngettext(msgid1, msgid2, n)
 529
 530 def ldngettext(domain, msgid1, msgid2, n):
 531     try:
 532         t = translation(domain, _localedirs.get(domain, None),
 533                         codeset=_localecodesets.get(domain))
 534     except IOError:
 535         if n == 1:
 536             return msgid1
 537         else:
 538             return msgid2
 539     return t.lngettext(msgid1, msgid2, n)
 540
 541 def gettext(message):
 542     return dgettext(_current_domain, message)
 543
 544 def lgettext(message):
 545     return ldgettext(_current_domain, message)
 546
 547 def ngettext(msgid1, msgid2, n):
 548     return dngettext(_current_domain, msgid1, msgid2, n)
 549
 550 def lngettext(msgid1, msgid2, n):
 551     return ldngettext(_current_domain, msgid1, msgid2, n)
 552
 553 # dcgettext() has been deemed unnecessary and is not implemented.
 554
 555 # James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
 556 # was:
 557 #
 558 #    import gettext
 559 #    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
 560 #    _ = cat.gettext
 561 #    print _('Hello World')
 562
  563 # The resulting catalog object does not currently support access through a
  564 # dictionary API, which was supported (but apparently unused) in GNOME
  565 # gettext.
 566
  567 Catalog = translation  # alias: Catalog(...) is the very same callable as translation(...)