solenv/bin/polib.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # License: MIT (see LICENSE file provided)
   4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
   5
   6 """
   7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
   8 mo files).  You can load existing files, iterate through their entries, add,
   9 modify entries, comments or metadata, etc. or create new po files from scratch.
  10
  11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
  12 :func:`~polib.mofile` convenience functions.
  13 """
  14
  15 __author__ = 'David Jean Louis <izimobil@gmail.com>'
  16 __version__ = '1.0.8'
  17 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
  18            'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
  19
  20 import array
  21 import codecs
  22 import os
  23 import re
  24 import struct
  25 import sys
  26 import textwrap
  27 import binascii
  28
  29 try:
  30     import io
  31 except ImportError:
  32     # replacement of io.open() for python < 2.6
  33     # we use codecs instead
  34     class io(object):
  35         @staticmethod
  36         def open(fpath, mode='r', encoding=None):
  37             return codecs.open(fpath, mode, encoding)
  38
  39
  40 # the default encoding to use when encoding cannot be detected
  41 default_encoding = 'utf-8'
  42
  43 # python 2/3 compatibility helpers {{{
  44
  45
  46 if sys.version_info[:2] < (3, 0):
  47     PY3 = False
  48     text_type = unicode
  49
  50     def b(s):
  51         return s
  52
  53     def u(s):
  54         return unicode(s, "unicode_escape")
  55
  56 else:
  57     PY3 = True
  58     text_type = str
  59
  60     def b(s):
  61         return s.encode("latin-1")
  62
  63     def u(s):
  64         return s
  65 # }}}
  66 # _pofile_or_mofile {{{
  67
  68
  69 def _pofile_or_mofile(f, type, **kwargs):
  70     """
  71     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
  72     honor the DRY concept.
  73     """
  74     # get the file encoding
  75     enc = kwargs.get('encoding')
  76     if enc is None:
  77         enc = detect_encoding(f, type == 'mofile')
  78
  79     # parse the file
  80     kls = type == 'pofile' and _POFileParser or _MOFileParser
  81     parser = kls(
  82         f,
  83         encoding=enc,
  84         check_for_duplicates=kwargs.get('check_for_duplicates', False),
  85         klass=kwargs.get('klass')
  86     )
  87     instance = parser.parse()
  88     instance.wrapwidth = kwargs.get('wrapwidth', 78)
  89     return instance
  90 # }}}
  91 # _is_file {{{
  92
  93
  94 def _is_file(filename_or_contents):
  95     """
  96     Safely returns the value of os.path.exists(filename_or_contents).
  97
  98     Arguments:
  99
 100     ``filename_or_contents``
 101         either a filename, or a string holding the contents of some file.
 102         In the latter case, this function will always return False.
 103     """
 104     try:
 105         return os.path.exists(filename_or_contents)
 106     except (ValueError, UnicodeEncodeError):
 107         return False
 108 # }}}
 109 # function pofile() {{{
 110
 111
 112 def pofile(pofile, **kwargs):
 113     """
 114     Convenience function that parses the po or pot file ``pofile`` and returns
 115     a :class:`~polib.POFile` instance.
 116
 117     Arguments:
 118
 119     ``pofile``
 120         string, full or relative path to the po/pot file or its content (data).
 121
 122     ``wrapwidth``
 123         integer, the wrap width, only useful when the ``-w`` option was passed
 124         to xgettext (optional, default: ``78``).
 125
 126     ``encoding``
 127         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 128         encoding will be auto-detected).
 129
 130     ``check_for_duplicates``
 131         whether to check for duplicate entries when adding entries to the
 132         file (optional, default: ``False``).
 133
 134     ``klass``
 135         class which is used to instantiate the return value (optional,
 136         default: ``None``, the return value with be a :class:`~polib.POFile`
 137         instance).
 138     """
 139     return _pofile_or_mofile(pofile, 'pofile', **kwargs)
 140 # }}}
 141 # function mofile() {{{
 142
 143
 144 def mofile(mofile, **kwargs):
 145     """
 146     Convenience function that parses the mo file ``mofile`` and returns a
 147     :class:`~polib.MOFile` instance.
 148
 149     Arguments:
 150
 151     ``mofile``
 152         string, full or relative path to the mo file or its content (data).
 153
 154     ``wrapwidth``
 155         integer, the wrap width, only useful when the ``-w`` option was passed
 156         to xgettext to generate the po file that was used to format the mo file
 157         (optional, default: ``78``).
 158
 159     ``encoding``
 160         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 161         encoding will be auto-detected).
 162
 163     ``check_for_duplicates``
 164         whether to check for duplicate entries when adding entries to the
 165         file (optional, default: ``False``).
 166
 167     ``klass``
 168         class which is used to instantiate the return value (optional,
 169         default: ``None``, the return value with be a :class:`~polib.POFile`
 170         instance).
 171     """
 172     return _pofile_or_mofile(mofile, 'mofile', **kwargs)
 173 # }}}
 174 # function detect_encoding() {{{
 175
 176
 177 def detect_encoding(file, binary_mode=False):
 178     """
 179     Try to detect the encoding used by the ``file``. The ``file`` argument can
 180     be a PO or MO file path or a string containing the contents of the file.
 181     If the encoding cannot be detected, the function will return the value of
 182     ``default_encoding``.
 183
 184     Arguments:
 185
 186     ``file``
 187         string, full or relative path to the po/mo file or its content.
 188
 189     ``binary_mode``
 190         boolean, set this to True if ``file`` is a mo file.
 191     """
 192     PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
 193     rxt = re.compile(u(PATTERN))
 194     rxb = re.compile(b(PATTERN))
 195
 196     def charset_exists(charset):
 197         """Check whether ``charset`` is valid or not."""
 198         try:
 199             codecs.lookup(charset)
 200         except LookupError:
 201             return False
 202         return True
 203
 204     if not _is_file(file):
 205         match = rxt.search(file)
 206         if match:
 207             enc = match.group(1).strip()
 208             if charset_exists(enc):
 209                 return enc
 210     else:
 211         # For PY3, always treat as binary
 212         if binary_mode or PY3:
 213             mode = 'rb'
 214             rx = rxb
 215         else:
 216             mode = 'r'
 217             rx = rxt
 218         f = open(file, mode)
 219         for l in f.readlines():
 220             match = rx.search(l)
 221             if match:
 222                 f.close()
 223                 enc = match.group(1).strip()
 224                 if not isinstance(enc, text_type):
 225                     enc = enc.decode('utf-8')
 226                 if charset_exists(enc):
 227                     return enc
 228         f.close()
 229     return default_encoding
 230 # }}}
 231 # function escape() {{{
 232
 233
 234 def escape(st):
 235     """
 236     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 237     the given string ``st`` and returns it.
 238     """
 239     return st.replace('\\', r'\\')\
 240              .replace('\t', r'\t')\
 241              .replace('\r', r'\r')\
 242              .replace('\n', r'\n')\
 243              .replace('\"', r'\"')
 244 # }}}
 245 # function unescape() {{{
 246
 247
 248 def unescape(st):
 249     """
 250     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 251     the given string ``st`` and returns it.
 252     """
 253     def unescape_repl(m):
 254         m = m.group(1)
 255         if m == 'n':
 256             return '\n'
 257         if m == 't':
 258             return '\t'
 259         if m == 'r':
 260             return '\r'
 261         if m == '\\':
 262             return '\\'
 263         return m  # handles escaped double quote
 264     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
 265 # }}}
 266 # function natural_sort() {{{
 267
 268
 269 def natural_sort(lst):
 270     """
 271     Sort naturally the given list.
 272     Credits: http://stackoverflow.com/a/4836734
 273     """
 274     convert = lambda text: int(text) if text.isdigit() else text.lower()
 275     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
 276     return sorted(lst, key = alphanum_key)
 277 # }}}
 278 # class _BaseFile {{{
 279
 280
 281 class _BaseFile(list):
 282     """
 283     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
 284     classes. This class should **not** be instantiated directly.
 285     """
 286
 287     def __init__(self, *args, **kwargs):
 288         """
 289         Constructor, accepts the following keyword arguments:
 290
 291         ``pofile``
 292             string, the path to the po or mo file, or its content as a string.
 293
 294         ``wrapwidth``
 295             integer, the wrap width, only useful when the ``-w`` option was
 296             passed to xgettext (optional, default: ``78``).
 297
 298         ``encoding``
 299             string, the encoding to use, defaults to ``default_encoding``
 300             global variable (optional).
 301
 302         ``check_for_duplicates``
 303             whether to check for duplicate entries when adding entries to the
 304             file, (optional, default: ``False``).
 305         """
 306         list.__init__(self)
 307         # the opened file handle
 308         pofile = kwargs.get('pofile', None)
 309         if pofile and _is_file(pofile):
 310             self.fpath = pofile
 311         else:
 312             self.fpath = kwargs.get('fpath')
 313         # the width at which lines should be wrapped
 314         self.wrapwidth = kwargs.get('wrapwidth', 78)
 315         # the file encoding
 316         self.encoding = kwargs.get('encoding', default_encoding)
 317         # whether to check for duplicate entries or not
 318         self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
 319         # header
 320         self.header = ''
 321         # both po and mo files have metadata
 322         self.metadata = {}
 323         self.metadata_is_fuzzy = 0
 324
 325     def __unicode__(self):
 326         """
 327         Returns the unicode representation of the file.
 328         """
 329         ret = []
 330         entries = [self.metadata_as_entry()] + \
 331                   [e for e in self if not e.obsolete]
 332         for entry in entries:
 333             ret.append(entry.__unicode__(self.wrapwidth))
 334         for entry in self.obsolete_entries():
 335             ret.append(entry.__unicode__(self.wrapwidth))
 336         ret = u('\n').join(ret)
 337
 338         assert isinstance(ret, text_type)
 339         #if type(ret) != text_type:
 340         #    return unicode(ret, self.encoding)
 341         return ret
 342
 343     if PY3:
 344         def __str__(self):
 345             return self.__unicode__()
 346     else:
 347         def __str__(self):
 348             """
 349             Returns the string representation of the file.
 350             """
 351             return unicode(self).encode(self.encoding)
 352
 353     def __contains__(self, entry):
 354         """
 355         Overridden ``list`` method to implement the membership test (in and
 356         not in).
 357         The method considers that an entry is in the file if it finds an entry
 358         that has the same msgid (the test is **case sensitive**) and the same
 359         msgctxt (or none for both entries).
 360
 361         Argument:
 362
 363         ``entry``
 364             an instance of :class:`~polib._BaseEntry`.
 365         """
 366         return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
 367             is not None
 368
 369     def __eq__(self, other):
 370         return str(self) == str(other)
 371
 372     def append(self, entry):
 373         """
 374         Overridden method to check for duplicates entries, if a user tries to
 375         add an entry that is already in the file, the method will raise a
 376         ``ValueError`` exception.
 377
 378         Argument:
 379
 380         ``entry``
 381             an instance of :class:`~polib._BaseEntry`.
 382         """
 383         # check_for_duplicates may not be defined (yet) when unpickling.
 384         # But if pickling, we never want to check for duplicates anyway.
 385         if getattr(self, 'check_for_duplicates', False) and entry in self:
 386             raise ValueError('Entry "%s" already exists' % entry.msgid)
 387         super(_BaseFile, self).append(entry)
 388
 389     def insert(self, index, entry):
 390         """
 391         Overridden method to check for duplicates entries, if a user tries to
 392         add an entry that is already in the file, the method will raise a
 393         ``ValueError`` exception.
 394
 395         Arguments:
 396
 397         ``index``
 398             index at which the entry should be inserted.
 399
 400         ``entry``
 401             an instance of :class:`~polib._BaseEntry`.
 402         """
 403         if self.check_for_duplicates and entry in self:
 404             raise ValueError('Entry "%s" already exists' % entry.msgid)
 405         super(_BaseFile, self).insert(index, entry)
 406
 407     def metadata_as_entry(self):
 408         """
 409         Returns the file metadata as a :class:`~polib.POFile` instance.
 410         """
 411         e = POEntry(msgid='')
 412         mdata = self.ordered_metadata()
 413         if mdata:
 414             strs = []
 415             for name, value in mdata:
 416                 # Strip whitespace off each line in a multi-line entry
 417                 strs.append('%s: %s' % (name, value))
 418             e.msgstr = '\n'.join(strs) + '\n'
 419         if self.metadata_is_fuzzy:
 420             e.flags.append('fuzzy')
 421         return e
 422
 423     def save(self, fpath=None, repr_method='__unicode__'):
 424         """
 425         Saves the po file to ``fpath``.
 426         If it is an existing file and no ``fpath`` is provided, then the
 427         existing file is rewritten with the modified data.
 428
 429         Keyword arguments:
 430
 431         ``fpath``
 432             string, full or relative path to the file.
 433
 434         ``repr_method``
 435             string, the method to use for output.
 436         """
 437         if self.fpath is None and fpath is None:
 438             raise IOError('You must provide a file path to save() method')
 439         contents = getattr(self, repr_method)()
 440         if fpath is None:
 441             fpath = self.fpath
 442         if repr_method == 'to_binary':
 443             fhandle = open(fpath, 'wb')
 444         else:
 445             fhandle = io.open(fpath, 'w', encoding=self.encoding)
 446             if not isinstance(contents, text_type):
 447                 contents = contents.decode(self.encoding)
 448         fhandle.write(contents)
 449         fhandle.close()
 450         # set the file path if not set
 451         if self.fpath is None and fpath:
 452             self.fpath = fpath
 453
 454     def find(self, st, by='msgid', include_obsolete_entries=False,
 455              msgctxt=False):
 456         """
 457         Find the entry which msgid (or property identified by the ``by``
 458         argument) matches the string ``st``.
 459
 460         Keyword arguments:
 461
 462         ``st``
 463             string, the string to search for.
 464
 465         ``by``
 466             string, the property to use for comparison (default: ``msgid``).
 467
 468         ``include_obsolete_entries``
 469             boolean, whether to also search in entries that are obsolete.
 470
 471         ``msgctxt``
 472             string, allows specifying a specific message context for the
 473             search.
 474         """
 475         if include_obsolete_entries:
 476             entries = self[:]
 477         else:
 478             entries = [e for e in self if not e.obsolete]
 479         for e in entries:
 480             if getattr(e, by) == st:
 481                 if msgctxt is not False and e.msgctxt != msgctxt:
 482                     continue
 483                 return e
 484         return None
 485
 486     def ordered_metadata(self):
 487         """
 488         Convenience method that returns an ordered version of the metadata
 489         dictionary. The return value is list of tuples (metadata name,
 490         metadata_value).
 491         """
 492         # copy the dict first
 493         metadata = self.metadata.copy()
 494         data_order = [
 495             'Project-Id-Version',
 496             'Report-Msgid-Bugs-To',
 497             'POT-Creation-Date',
 498             'PO-Revision-Date',
 499             'Last-Translator',
 500             'Language-Team',
 501             'Language',
 502             'MIME-Version',
 503             'Content-Type',
 504             'Content-Transfer-Encoding',
 505             'Plural-Forms'
 506         ]
 507         ordered_data = []
 508         for data in data_order:
 509             try:
 510                 value = metadata.pop(data)
 511                 ordered_data.append((data, value))
 512             except KeyError:
 513                 pass
 514         # the rest of the metadata will be alphabetically ordered since there
 515         # are no specs for this AFAIK
 516         for data in natural_sort(metadata.keys()):
 517             value = metadata[data]
 518             ordered_data.append((data, value))
 519         return ordered_data
 520
 521     def to_binary(self):
 522         """
 523         Return the binary representation of the file.
 524         """
 525         offsets = []
 526         entries = self.translated_entries()
 527
 528         # the keys are sorted in the .mo file
 529         def cmp(_self, other):
 530             # msgfmt compares entries with msgctxt if it exists
 531             self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid
 532             other_msgid = other.msgctxt and other.msgctxt or other.msgid
 533             if self_msgid > other_msgid:
 534                 return 1
 535             elif self_msgid < other_msgid:
 536                 return -1
 537             else:
 538                 return 0
 539         # add metadata entry
 540         entries.sort(key=lambda o: o.msgctxt or o.msgid)
 541         mentry = self.metadata_as_entry()
 542         #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip()
 543         entries = [mentry] + entries
 544         entries_len = len(entries)
 545         ids, strs = b(''), b('')
 546         for e in entries:
 547             # For each string, we need size and file offset.  Each string is
 548             # NUL terminated; the NUL does not count into the size.
 549             msgid = b('')
 550             if e.msgctxt:
 551                 # Contexts are stored by storing the concatenation of the
 552                 # context, a <EOT> byte, and the original string
 553                 msgid = self._encode(e.msgctxt + '\4')
 554             if e.msgid_plural:
 555                 msgstr = []
 556                 for index in sorted(e.msgstr_plural.keys()):
 557                     msgstr.append(e.msgstr_plural[index])
 558                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
 559                 msgstr = self._encode('\0'.join(msgstr))
 560             else:
 561                 msgid += self._encode(e.msgid)
 562                 msgstr = self._encode(e.msgstr)
 563             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
 564             ids += msgid + b('\0')
 565             strs += msgstr + b('\0')
 566
 567         # The header is 7 32-bit unsigned integers.
 568         keystart = 7 * 4 + 16 * entries_len
 569         # and the values start after the keys
 570         valuestart = keystart + len(ids)
 571         koffsets = []
 572         voffsets = []
 573         # The string table first has the list of keys, then the list of values.
 574         # Each entry has first the size of the string, then the file offset.
 575         for o1, l1, o2, l2 in offsets:
 576             koffsets += [l1, o1 + keystart]
 577             voffsets += [l2, o2 + valuestart]
 578         offsets = koffsets + voffsets
 579
 580         output = struct.pack(
 581             "Iiiiiii",
 582             # Magic number
 583             MOFile.MAGIC,
 584             # Version
 585             0,
 586             # number of entries
 587             entries_len,
 588             # start of key index
 589             7 * 4,
 590             # start of value index
 591             7 * 4 + entries_len * 8,
 592             # size and offset of hash table, we don't use hash tables
 593             0, keystart
 594
 595         )
 596         if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
 597             output += array.array("i", offsets).tobytes()
 598         else:
 599             output += array.array("i", offsets).tostring()
 600         output += ids
 601         output += strs
 602         return output
 603
 604     def _encode(self, mixed):
 605         """
 606         Encodes the given ``mixed`` argument with the file encoding if and
 607         only if it's an unicode string and returns the encoded string.
 608         """
 609         if isinstance(mixed, text_type):
 610             mixed = mixed.encode(self.encoding)
 611         return mixed
 612 # }}}
 613 # class POFile {{{
 614
 615
 616 class POFile(_BaseFile):
 617     """
 618     Po (or Pot) file reader/writer.
 619     This class inherits the :class:`~polib._BaseFile` class and, by extension,
 620     the python ``list`` type.
 621     """
 622
 623     def __unicode__(self):
 624         """
 625         Returns the unicode representation of the po file.
 626         """
 627         ret, headers = '', self.header.split('\n')
 628         for header in headers:
 629             if not len(header):
 630                 ret += "#\n"
 631             elif header[:1] in [',', ':']:
 632                 ret += '#%s\n' % header
 633             else:
 634                 ret += '# %s\n' % header
 635
 636         if not isinstance(ret, text_type):
 637             ret = ret.decode(self.encoding)
 638
 639         return ret + _BaseFile.__unicode__(self)
 640
 641     def save_as_mofile(self, fpath):
 642         """
 643         Saves the binary representation of the file to given ``fpath``.
 644
 645         Keyword argument:
 646
 647         ``fpath``
 648             string, full or relative path to the mo file.
 649         """
 650         _BaseFile.save(self, fpath, 'to_binary')
 651
 652     def percent_translated(self):
 653         """
 654         Convenience method that returns the percentage of translated
 655         messages.
 656         """
 657         total = len([e for e in self if not e.obsolete])
 658         if total == 0:
 659             return 100
 660         translated = len(self.translated_entries())
 661         return int(translated * 100 / float(total))
 662
 663     def translated_entries(self):
 664         """
 665         Convenience method that returns the list of translated entries.
 666         """
 667         return [e for e in self if e.translated()]
 668
 669     def untranslated_entries(self):
 670         """
 671         Convenience method that returns the list of untranslated entries.
 672         """
 673         return [e for e in self if not e.translated() and not e.obsolete
 674                 and not 'fuzzy' in e.flags]
 675
 676     def fuzzy_entries(self):
 677         """
 678         Convenience method that returns the list of fuzzy entries.
 679         """
 680         return [e for e in self if 'fuzzy' in e.flags]
 681
 682     def obsolete_entries(self):
 683         """
 684         Convenience method that returns the list of obsolete entries.
 685         """
 686         return [e for e in self if e.obsolete]
 687
 688     def merge(self, refpot):
 689         """
 690         Convenience method that merges the current pofile with the pot file
 691         provided. It behaves exactly as the gettext msgmerge utility:
 692
 693         * comments of this file will be preserved, but extracted comments and
 694           occurrences will be discarded;
 695         * any translations or comments in the file will be discarded, however,
 696           dot comments and file positions will be preserved;
 697         * the fuzzy flags are preserved.
 698
 699         Keyword argument:
 700
 701         ``refpot``
 702             object POFile, the reference catalog.
 703         """
 704         # Store entries in dict/set for faster access
 705         self_entries = dict((entry.msgid, entry) for entry in self)
 706         refpot_msgids = set(entry.msgid for entry in refpot)
 707         # Merge entries that are in the refpot
 708         for entry in refpot:
 709             e = self_entries.get(entry.msgid)
 710             if e is None:
 711                 e = POEntry()
 712                 self.append(e)
 713             e.merge(entry)
 714         # ok, now we must "obsolete" entries that are not in the refpot anymore
 715         for entry in self:
 716             if entry.msgid not in refpot_msgids:
 717                 entry.obsolete = True
 718 # }}}
 719 # class MOFile {{{
 720
 721
 722 class MOFile(_BaseFile):
 723     """
 724     Mo file reader/writer.
 725     This class inherits the :class:`~polib._BaseFile` class and, by
 726     extension, the python ``list`` type.
 727     """
 728     MAGIC = 0x950412de
 729     MAGIC_SWAPPED = 0xde120495
 730
 731     def __init__(self, *args, **kwargs):
 732         """
 733         Constructor, accepts all keywords arguments accepted by
 734         :class:`~polib._BaseFile` class.
 735         """
 736         _BaseFile.__init__(self, *args, **kwargs)
 737         self.magic_number = None
 738         self.version = 0
 739
 740     def save_as_pofile(self, fpath):
 741         """
 742         Saves the mofile as a pofile to ``fpath``.
 743
 744         Keyword argument:
 745
 746         ``fpath``
 747             string, full or relative path to the file.
 748         """
 749         _BaseFile.save(self, fpath)
 750
 751     def save(self, fpath=None):
 752         """
 753         Saves the mofile to ``fpath``.
 754
 755         Keyword argument:
 756
 757         ``fpath``
 758             string, full or relative path to the file.
 759         """
 760         _BaseFile.save(self, fpath, 'to_binary')
 761
 762     def percent_translated(self):
 763         """
 764         Convenience method to keep the same interface with POFile instances.
 765         """
 766         return 100
 767
 768     def translated_entries(self):
 769         """
 770         Convenience method to keep the same interface with POFile instances.
 771         """
 772         return self
 773
 774     def untranslated_entries(self):
 775         """
 776         Convenience method to keep the same interface with POFile instances.
 777         """
 778         return []
 779
 780     def fuzzy_entries(self):
 781         """
 782         Convenience method to keep the same interface with POFile instances.
 783         """
 784         return []
 785
 786     def obsolete_entries(self):
 787         """
 788         Convenience method to keep the same interface with POFile instances.
 789         """
 790         return []
 791 # }}}
 792 # class _BaseEntry {{{
 793
 794
 795 class _BaseEntry(object):
 796     """
 797     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
 798     This class should **not** be instantiated directly.
 799     """
 800
 801     def __init__(self, *args, **kwargs):
 802         """
 803         Constructor, accepts the following keyword arguments:
 804
 805         ``msgid``
 806             string, the entry msgid.
 807
 808         ``msgstr``
 809             string, the entry msgstr.
 810
 811         ``msgid_plural``
 812             string, the entry msgid_plural.
 813
 814         ``msgstr_plural``
 815             list, the entry msgstr_plural lines.
 816
 817         ``msgctxt``
 818             string, the entry context (msgctxt).
 819
 820         ``obsolete``
 821             bool, whether the entry is "obsolete" or not.
 822
 823         ``encoding``
 824             string, the encoding to use, defaults to ``default_encoding``
 825             global variable (optional).
 826         """
 827         self.msgid = kwargs.get('msgid', '')
 828         self.msgstr = kwargs.get('msgstr', '')
 829         self.msgid_plural = kwargs.get('msgid_plural', '')
 830         self.msgstr_plural = kwargs.get('msgstr_plural', {})
 831         self.msgctxt = kwargs.get('msgctxt', None)
 832         self.obsolete = kwargs.get('obsolete', False)
 833         self.encoding = kwargs.get('encoding', default_encoding)
 834
 835     def __unicode__(self, wrapwidth=78):
 836         """
 837         Returns the unicode representation of the entry.
 838         """
 839         if self.obsolete:
 840             delflag = '#~ '
 841         else:
 842             delflag = ''
 843         ret = []
 844         # write the msgctxt if any
 845         if self.msgctxt is not None:
 846             ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
 847                                    wrapwidth)
 848         # write the msgid
 849         ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
 850         # write the msgid_plural if any
 851         if self.msgid_plural:
 852             ret += self._str_field("msgid_plural", delflag, "",
 853                                    self.msgid_plural, wrapwidth)
 854         if self.msgstr_plural:
 855             # write the msgstr_plural if any
 856             msgstrs = self.msgstr_plural
 857             keys = list(msgstrs)
 858             keys.sort()
 859             for index in keys:
 860                 msgstr = msgstrs[index]
 861                 plural_index = '[%s]' % index
 862                 ret += self._str_field("msgstr", delflag, plural_index, msgstr,
 863                                        wrapwidth)
 864         else:
 865             # otherwise write the msgstr
 866             ret += self._str_field("msgstr", delflag, "", self.msgstr,
 867                                    wrapwidth)
 868         ret.append('')
 869         usedirect = True
 870         if not PY3 and type(ret[0] != unicode):
 871             try:
 872                 usedirect = False
 873                 ret = u('\n').join(x.decode('utf-8') for x in ret)
 874             except:
 875                 usedirect = True
 876         if usedirect:
 877             ret = u('\n').join(ret)
 878         return ret
 879
 880     if PY3:
 881         def __str__(self):
 882             return self.__unicode__()
 883     else:
 884         def __str__(self):
 885             """
 886             Returns the string representation of the entry.
 887             """
 888             return unicode(self).encode(self.encoding)
 889
 890     def __eq__(self, other):
 891         return str(self) == str(other)
 892
 893     def _str_field(self, fieldname, delflag, plural_index, field,
 894                    wrapwidth=78):
 895         lines = field.splitlines(True)
 896         if len(lines) > 1:
 897             lines = [''] + lines  # start with initial empty line
 898         else:
 899             escaped_field = escape(field)
 900             specialchars_count = 0
 901             for c in ['\\', '\n', '\r', '\t', '"']:
 902                 specialchars_count += field.count(c)
 903             # comparison must take into account fieldname length + one space
 904             # + 2 quotes (eg. msgid "<string>")
 905             flength = len(fieldname) + 3
 906             if plural_index:
 907                 flength += len(plural_index)
 908             real_wrapwidth = wrapwidth - flength + specialchars_count
 909             if wrapwidth > 0 and len(field) > real_wrapwidth:
 910                 # Wrap the line but take field name into account
 911                 lines = [''] + [unescape(item) for item in wrap(
 912                     escaped_field,
 913                     wrapwidth - 2,  # 2 for quotes ""
 914                     drop_whitespace=False,
 915                     break_long_words=False
 916                 )]
 917             else:
 918                 lines = [field]
 919         if fieldname.startswith('previous_'):
 920             # quick and dirty trick to get the real field name
 921             fieldname = fieldname[9:]
 922
 923         ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
 924                                 escape(lines.pop(0)))]
 925         for line in lines:
 926             ret.append('%s"%s"' % (delflag, escape(line)))
 927         return ret
 928 # }}}
 929 # class POEntry {{{
 930
 931
 932 class POEntry(_BaseEntry):
 933     """
 934     Represents a po file entry.
 935     """
 936
 937     def __init__(self, *args, **kwargs):
 938         """
 939         Constructor, accepts the following keyword arguments:
 940
 941         ``comment``
 942             string, the entry comment.
 943
 944         ``tcomment``
 945             string, the entry translator comment.
 946
 947         ``occurrences``
 948             list, the entry occurrences.
 949
 950         ``flags``
 951             list, the entry flags.
 952
 953         ``previous_msgctxt``
 954             string, the entry previous context.
 955
 956         ``previous_msgid``
 957             string, the entry previous msgid.
 958
 959         ``previous_msgid_plural``
 960             string, the entry previous msgid_plural.
 961
 962         ``linenum``
 963             integer, the line number of the entry
 964         """
 965         _BaseEntry.__init__(self, *args, **kwargs)
 966         self.comment = kwargs.get('comment', '')
 967         self.tcomment = kwargs.get('tcomment', '')
 968         self.occurrences = kwargs.get('occurrences', [])
 969         self.flags = kwargs.get('flags', [])
 970         self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
 971         self.previous_msgid = kwargs.get('previous_msgid', None)
 972         self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
 973         self.linenum = kwargs.get('linenum', None)
 974
 975     def __unicode__(self, wrapwidth=0):
 976         """
 977         Returns the unicode representation of the entry.
 978         """
 979         ret = []
 980         # comments first, if any (with text wrapping as xgettext does)
 981         if self.obsolete:
 982             comments = [('tcomment', '# ')]
 983         else:
 984             comments = [('comment', '#. '), ('tcomment', '# ')]
 985         for c in comments:
 986             val = getattr(self, c[0])
 987             if val:
 988                 for comment in val.split('\n'):
 989                     if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth:
 990                         ret += wrap(
 991                             comment,
 992                             wrapwidth,
 993                             initial_indent=c[1],
 994                             subsequent_indent=c[1],
 995                             break_long_words=False
 996                         )
 997                     else:
 998                         ret.append('%s%s' % (c[1], comment))
 999
1000         # occurrences (with text wrapping as xgettext does)
1001         if not self.obsolete and self.occurrences:
1002             filelist = []
1003             for fpath, lineno in self.occurrences:
1004                 if lineno:
1005                     filelist.append('%s:%s' % (fpath, lineno))
1006                 else:
1007                     filelist.append(fpath)
1008             filestr = ' '.join(filelist)
1009             if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
1010                 # textwrap split words that contain hyphen, this is not
1011                 # what we want for filenames, so the dirty hack is to
1012                 # temporally replace hyphens with a char that a file cannot
1013                 # contain, like "*"
1014                 ret += [l.replace('*', '-') for l in wrap(
1015                     filestr.replace('-', '*'),
1016                     wrapwidth,
1017                     initial_indent='#: ',
1018                     subsequent_indent='#: ',
1019                     break_long_words=False
1020                 )]
1021             else:
1022                 ret.append('#: ' + filestr)
1023
1024         # flags (TODO: wrapping ?)
1025         if self.flags:
1026             ret.append('#, %s' % ', '.join(self.flags))
1027
1028         # previous context and previous msgid/msgid_plural
1029         fields = ['previous_msgctxt', 'previous_msgid',
1030                   'previous_msgid_plural']
1031         if self.obsolete:
1032             prefix = "#~| "
1033         else:
1034             prefix = "#| "
1035         for f in fields:
1036             val = getattr(self, f)
1037             if val:
1038                 ret += self._str_field(f, prefix, "", val, wrapwidth)
1039
1040         ret.append(_BaseEntry.__unicode__(self, wrapwidth))
1041         ret = u('\n').join(ret)
1042         return ret
1043
1044     def __cmp__(self, other):
1045         """
1046         Called by comparison operations if rich comparison is not defined.
1047         """
1048
1049         # First: Obsolete test
1050         if self.obsolete != other.obsolete:
1051             if self.obsolete:
1052                 return -1
1053             else:
1054                 return 1
1055         # Work on a copy to protect original
1056         occ1 = sorted(self.occurrences[:])
1057         occ2 = sorted(other.occurrences[:])
1058         pos = 0
1059         for entry1 in occ1:
1060             try:
1061                 entry2 = occ2[pos]
1062             except IndexError:
1063                 return 1
1064             pos = pos + 1
1065             if entry1[0] != entry2[0]:
1066                 if entry1[0] > entry2[0]:
1067                     return 1
1068                 else:
1069                     return -1
1070             if entry1[1] != entry2[1]:
1071                 if entry1[1] > entry2[1]:
1072                     return 1
1073                 else:
1074                     return -1
1075         # Compare msgid_plural if set
1076         if self.msgid_plural:
1077             if not other.msgid_plural:
1078                 return 1
1079             for pos in self.msgid_plural:
1080                 if pos not in other.msgid_plural:
1081                     return 1
1082                 if self.msgid_plural[pos] > other.msgid_plural[pos]:
1083                     return 1
1084                 if self.msgid_plural[pos] < other.msgid_plural[pos]:
1085                     return -1
1086         # Finally: Compare message ID
1087         if self.msgid > other.msgid:
1088             return 1
1089         elif self.msgid < other.msgid:
1090             return -1
1091         return 0
1092
1093     def __gt__(self, other):
1094         return self.__cmp__(other) > 0
1095
1096     def __lt__(self, other):
1097         return self.__cmp__(other) < 0
1098
1099     def __ge__(self, other):
1100         return self.__cmp__(other) >= 0
1101
1102     def __le__(self, other):
1103         return self.__cmp__(other) <= 0
1104
1105     def __eq__(self, other):
1106         return self.__cmp__(other) == 0
1107
1108     def __ne__(self, other):
1109         return self.__cmp__(other) != 0
1110
1111     def translated(self):
1112         """
1113         Returns ``True`` if the entry has been translated or ``False``
1114         otherwise.
1115         """
1116         if self.obsolete or 'fuzzy' in self.flags:
1117             return False
1118         if self.msgstr != '':
1119             return True
1120         if self.msgstr_plural:
1121             for pos in self.msgstr_plural:
1122                 if self.msgstr_plural[pos] == '':
1123                     return False
1124             return True
1125         return False
1126
1127     def merge(self, other):
1128         """
1129         Merge the current entry with the given pot entry.
1130         """
1131         self.msgid = other.msgid
1132         self.msgctxt = other.msgctxt
1133         self.occurrences = other.occurrences
1134         self.comment = other.comment
1135         fuzzy = 'fuzzy' in self.flags
1136         self.flags = other.flags[:]  # clone flags
1137         if fuzzy:
1138             self.flags.append('fuzzy')
1139         self.msgid_plural = other.msgid_plural
1140         self.obsolete = other.obsolete
1141         self.previous_msgctxt = other.previous_msgctxt
1142         self.previous_msgid = other.previous_msgid
1143         self.previous_msgid_plural = other.previous_msgid_plural
1144         if other.msgstr_plural:
1145             for pos in other.msgstr_plural:
1146                 try:
1147                     # keep existing translation at pos if any
1148                     self.msgstr_plural[pos]
1149                 except KeyError:
1150                     self.msgstr_plural[pos] = ''
1151
1152     def __hash__(self):
1153         return hash((self.msgid, self.msgstr))
1154 # }}}
1155 # class MOEntry {{{
1156
1157
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """
    def __init__(self, *args, **kwargs):
        """
        Constructor, the keyword arguments below are accepted so that the
        signature stays consistent with :class:`~polib.POEntry`:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: these keyword arguments have no real meaning for MO files;
        whatever is passed, the matching attributes are set to empty
        values.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # po-only attributes always get neutral values
        self.previous_msgctxt = self.previous_msgid = None
        self.previous_msgid_plural = None
        self.occurrences, self.flags = [], []
        self.comment = self.tcomment = ''

    def __hash__(self):
        key = (self.msgid, self.msgstr)
        return hash(key)
1190
1191 # }}}
1192 # class _POFileParser {{{
1193
1194
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.
    """

    def __init__(self, pofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed entries, defaults to
            ``POFile`` (optional).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default one
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            # not a file: the argument is the po content itself
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp',
                      'tc', 'ms', 'mp', 'mx', 'mi']

        self.add('tc', ['st', 'he'],                                     'he')
        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
                        'mp', 'mx', 'mi'],                               'tc')
        self.add('gc', all_states,                                       'gc')
        self.add('oc', all_states,                                       'oc')
        self.add('fl', all_states,                                       'fl')
        self.add('pc', all_states,                                       'pc')
        self.add('pm', all_states,                                       'pm')
        self.add('pp', all_states,                                       'pp')
        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
                        'pp', 'ms', 'mx'],                               'ct')
        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
                 'pm', 'pp', 'ms', 'mx'],                                'mi')
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'],             'mp')
        self.add('ms', ['mi', 'mp', 'tc'],                               'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'],                         'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the file instance filled with the parsed entries and
        metadata. Raises ``IOError`` on syntax errors; an opened file is
        closed in every case.
        """
        try:
            last_tokens = self._parse_lines()
            if self.current_entry and len(last_tokens) > 0 and \
               not last_tokens[0].startswith('#'):
                # since entries are added when another entry is found, we
                # must add the last entry here (only if there are lines).
                # Trailing comments are ignored
                self.instance.append(self.current_entry)
            self._extract_metadata()
        finally:
            # close opened file, also when a syntax error aborted parsing
            if not isinstance(self.fhandle, list):  # must be file
                self.fhandle.close()
        return self.instance

    def _parse_lines(self):
        """
        Feed every non empty line to the state machine and return the
        tokens of the last non empty line (an empty list if there is none).
        """
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        tokens = []
        for line in self.fhandle:
            self.current_line += 1
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            # previous msgid/msgctxt of obsolete entries are skipped
            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete entry: drop the marker and parse the rest as a
                # regular line
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]):].lstrip()
                self._check_unescaped_quotes(line)
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on a occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                self._check_unescaped_quotes(line)
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise IOError('Syntax error in po file %s (line %s)' %
                                  (self.instance.fpath, self.current_line))

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise IOError('Syntax error in po file %s (line %s): '
                                  'invalid continuation line' %
                                  (self.instance.fpath, self.current_line))

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise IOError('Syntax error in po file %s (line %s): '
                                  'unknown keyword %s' %
                                  (self.instance.fpath, self.current_line,
                                   tokens[1]))

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]):].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise IOError('Syntax error in po file %s (line %s)' %
                              (self.instance.fpath, self.current_line))
        return tokens

    def _check_unescaped_quotes(self, quoted):
        """
        Raise ``IOError`` if the text between the first and last character
        of ``quoted`` holds a double quote not preceded by a backslash.
        """
        if re.search(r'([^\\]|^)"', quoted[1:-1]):
            raise IOError('Syntax error in po file %s (line %s): '
                          'unescaped double quote found' %
                          (self.instance.fpath, self.current_line))

    def _extract_metadata(self):
        """
        If an entry with an empty msgid was parsed, remove it from the file
        instance and store its msgstr lines in the instance metadata dict.
        """
        metadataentry = self.instance.find('')
        if metadataentry:  # metadata found
            # remove the entry
            self.instance.remove(metadataentry)
            self.instance.metadata_is_fuzzy = metadataentry.flags
            key = None
            for msg in metadataentry.msgstr.splitlines():
                try:
                    key, val = msg.split(':', 1)
                    self.instance.metadata[key] = val.strip()
                except (ValueError, KeyError):
                    # line without colon: continuation of the previous key
                    if key is not None:
                        self.instance.metadata[key] += '\n' + msg.strip()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        # the handler only depends on the target state
        action = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        The current state only changes when the handler returns a true
        value. A missing transition or any error raised by the handler is
        reported as an ``IOError`` holding the current line number.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            if action():
                self.current_state = state
        except Exception:
            raise IOError('Syntax error in po file (line %s)' %
                          self.current_line)

    # state handlers

    def _flush_entry(self):
        """
        When the previous line belonged to the translation of an entry,
        store that entry and start a new one at the current line.
        """
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return 1

    def handle_tc(self):
        """Handle a translator comment."""
        self._flush_entry()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._flush_entry()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """
        Handle a file:num occurrence.

        Each whitespace separated reference is stored as a ``(file, line)``
        tuple; ``line`` is an empty string when the reference does not end
        with ``:<digits>``.
        """
        self._flush_entry()
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # the colon is part of the file name (e.g. a drive
                        # letter): keep the reference untouched
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    # no colon at all
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._flush_entry()
        self.current_entry.flags += [c.strip() for c in
                                     self.current_token[3:].split(',')]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._flush_entry()
        self.current_entry.previous_msgid_plural = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._flush_entry()
        self.current_entry.previous_msgid = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._flush_entry()
        self.current_entry.previous_msgctxt = \
            unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._flush_entry()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._flush_entry()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        token = self.current_token
        quote = token.find('"')
        # the index is everything between "msgstr[" and the closing bracket
        # located before the quoted string, so that indexes with more than
        # one digit are read entirely
        if quote == -1:
            close = token.find(']', 7)
        else:
            close = token.find(']', 7, quote)
        if close > 7:
            index = int(token[7:close])
        else:
            # no usable closing bracket: fall back to a single character
            index = int(token[7])
        value = token[quote + 1:-1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1617 # }}}
1618 # class _MOFileParser {{{
1619
1620
1621 class _MOFileParser(object):
1622     """
1623     A class to parse binary mo files.
1624     """
1625
1626     def __init__(self, mofile, *args, **kwargs):
1627         """
1628         Constructor.
1629
1630         Keyword arguments:
1631
1632         ``mofile``
1633             string, path to the mo file or its content
1634
1635         ``encoding``
1636             string, the encoding to use, defaults to ``default_encoding``
1637             global variable (optional).
1638
1639         ``check_for_duplicates``
1640             whether to check for duplicate entries when adding entries to the
1641             file (optional, default: ``False``).
1642         """
1643         self.fhandle = open(mofile, 'rb')
1644
1645         klass = kwargs.get('klass')
1646         if klass is None:
1647             klass = MOFile
1648         self.instance = klass(
1649             fpath=mofile,
1650             encoding=kwargs.get('encoding', default_encoding),
1651             check_for_duplicates=kwargs.get('check_for_duplicates', False)
1652         )
1653
1654     def __del__(self):
1655         """
1656         Make sure the file is closed, this prevents warnings on unclosed file
1657         when running tests with python >= 3.2.
1658         """
1659         if self.fhandle:
1660             self.fhandle.close()
1661
1662     def parse(self):
1663         """
1664         Build the instance with the file handle provided in the
1665         constructor.
1666         """
1667         # parse magic number
1668         magic_number = self._readbinary('<I', 4)
1669         if magic_number == MOFile.MAGIC:
1670             ii = '<II'
1671         elif magic_number == MOFile.MAGIC_SWAPPED:
1672             ii = '>II'
1673         else:
1674             raise IOError('Invalid mo file, magic number is incorrect !')
1675         self.instance.magic_number = magic_number
1676         # parse the version number and the number of strings
1677         version, numofstrings = self._readbinary(ii, 8)
1678         # from MO file format specs: "A program seeing an unexpected major
1679         # revision number should stop reading the MO file entirely"
1680         if version not in (0, 1):
1681             raise IOError('Invalid mo file, unexpected major revision number')
1682         self.instance.version = version
1683         # original strings and translation strings hash table offset
1684         msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
1685         # move to msgid hash table and read length and offset of msgids
1686         self.fhandle.seek(msgids_hash_offset)
1687         msgids_index = []
1688         for i in range(numofstrings):
1689             msgids_index.append(self._readbinary(ii, 8))
1690         # move to msgstr hash table and read length and offset of msgstrs
1691         self.fhandle.seek(msgstrs_hash_offset)
1692         msgstrs_index = []
1693         for i in range(numofstrings):
1694             msgstrs_index.append(self._readbinary(ii, 8))
1695         # build entries
1696         encoding = self.instance.encoding
1697         for i in range(numofstrings):
1698             self.fhandle.seek(msgids_index[i][1])
1699             msgid = self.fhandle.read(msgids_index[i][0])
1700
1701             self.fhandle.seek(msgstrs_index[i][1])
1702             msgstr = self.fhandle.read(msgstrs_index[i][0])
1703             if i == 0 and not msgid:  # metadata
1704                 raw_metadata, metadata = msgstr.split(b('\n')), {}
1705                 for line in raw_metadata:
1706                     tokens = line.split(b(':'), 1)
1707                     if tokens[0] != b(''):
1708                         try:
1709                             k = tokens[0].decode(encoding)
1710                             v = tokens[1].decode(encoding)
1711                             metadata[k] = v.strip()
1712                         except IndexError:
1713                             metadata[k] = u('')
1714                 self.instance.metadata = metadata
1715                 continue
1716             # test if we have a plural entry
1717             msgid_tokens = msgid.split(b('\0'))
1718             if len(msgid_tokens) > 1:
1719                 entry = self._build_entry(
1720                     msgid=msgid_tokens[0],
1721                     msgid_plural=msgid_tokens[1],
1722                     msgstr_plural=dict((k, v) for k, v in
1723                                        enumerate(msgstr.split(b('\0'))))
1724                 )
1725             else:
1726                 entry = self._build_entry(msgid=msgid, msgstr=msgstr)
1727             self.instance.append(entry)
1728         # close opened file
1729         self.fhandle.close()
1730         return self.instance
1731
1732     def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
1733                      msgstr_plural=None):
1734         msgctxt_msgid = msgid.split(b('\x04'))
1735         encoding = self.instance.encoding
1736         if len(msgctxt_msgid) > 1:
1737             kwargs = {
1738                 'msgctxt': msgctxt_msgid[0].decode(encoding),
1739                 'msgid': msgctxt_msgid[1].decode(encoding),
1740             }
1741         else:
1742             kwargs = {'msgid': msgid.decode(encoding)}
1743         if msgstr:
1744             kwargs['msgstr'] = msgstr.decode(encoding)
1745         if msgid_plural:
1746             kwargs['msgid_plural'] = msgid_plural.decode(encoding)
1747         if msgstr_plural:
1748             for k in msgstr_plural:
1749                 msgstr_plural[k] = msgstr_plural[k].decode(encoding)
1750             kwargs['msgstr_plural'] = msgstr_plural
1751         return MOEntry(**kwargs)
1752
1753     def _readbinary(self, fmt, numbytes):
1754         """
1755         Private method that unpack n bytes of data using format <fmt>.
1756         It returns a tuple or a mixed value if the tuple length is 1.
1757         """
1758         bytes = self.fhandle.read(numbytes)
1759         tup = struct.unpack(fmt, bytes)
1760         if len(tup) == 1:
1761             return tup[0]
1762         return tup
1763 # }}}
1764 # class TextWrapper {{{
1765
1766
class TextWrapper(textwrap.TextWrapper):
    """
    textwrap.TextWrapper subclass providing the ``drop_whitespace``
    option on top of the base class.
    """
    def __init__(self, *args, **kwargs):
        # The option is taken out of the keyword arguments before they are
        # handed to the base class, and stored on the instance afterwards.
        keep_default = True
        requested = kwargs.pop('drop_whitespace', keep_default)
        textwrap.TextWrapper.__init__(self, *args, **kwargs)
        self.drop_whitespace = requested

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Turn a sequence of text chunks into a list of lines that are at
        most 'self.width' characters long (lines may be longer when
        'break_long_words' is false).  A chunk is either a "word" or a
        run of whitespace and is never split (except for over-long words),
        line breaks are only inserted between two chunks.  When
        'drop_whitespace' is set, whitespace chunks are removed from the
        beginning and the end of the lines; other whitespace is kept.

        The 'chunks' list is consumed by this method.
        """
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        wrapped = []

        # Work on the reversed list so that the next chunk can be popped
        # cheaply from the end of the list.
        chunks.reverse()

        while chunks:
            # The first line and the following ones have their own prefix.
            if not wrapped:
                prefix = self.initial_indent
            else:
                prefix = self.subsequent_indent

            # Room left for the chunks once the prefix is accounted for.
            room = self.width - len(prefix)

            # A whitespace chunk never starts a line, except for the very
            # first line of the text.
            if wrapped and self.drop_whitespace and chunks[-1].strip() == '':
                chunks.pop()

            # Chunks of the line being built and their cumulated length.
            taken = []
            used = 0
            while chunks:
                size = len(chunks[-1])
                if used + size > room:
                    # The line is full.
                    break
                taken.append(chunks.pop())
                used += size

            # The next chunk does not fit on any line at all, let the base
            # class decide what to do with it.
            if chunks and len(chunks[-1]) > room:
                self._handle_long_word(chunks, taken, used, room)

            # A whitespace chunk never ends a line.
            if self.drop_whitespace and taken and not taken[-1].strip():
                taken.pop()

            # Store the line, unless nothing is left of it.
            if taken:
                wrapped.append(prefix + ''.join(taken))

        return wrapped
1846 # }}}
1847 # function wrap() {{{
1848
1849
def wrap(text, width=70, **kwargs):
    """
    Wrap a single paragraph of text and return the wrapped lines as a list.

    The extra keyword arguments are forwarded to the text wrapper. The
    TextWrapper class of this module is only used with python < 2.6, the
    standard textwrap module is used otherwise.
    """
    if sys.version_info >= (2, 6):
        return textwrap.wrap(text, width=width, **kwargs)
    return TextWrapper(width=width, **kwargs).wrap(text)
1857
1858 # }}}
1859
def genKeyId(inkey):
    """
    Return a 5 characters long identifier computed from the CRC-32 checksum
    of ``inkey``.

    ``inkey`` is usually a byte string; a text string is encoded as UTF-8
    before the checksum is computed, so that both forms of the same key
    give the same identifier.
    """
    # bytes() refuses text strings on python 3, encode them explicitly
    if not isinstance(inkey, bytes) and hasattr(inkey, 'encode'):
        inkey = inkey.encode('utf-8')
    crc = binascii.crc32(bytes(inkey)) & 0xffffffff
    # Use simple ASCII characters, exclude I, l, 1 and O, 0 to avoid confusing IDs
    symbols = "ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz23456789"
    chars = []
    for _ in range(5):
        # each character consumes the 6 lowest bits of the checksum; the
        # modulo keeps the index inside the (shorter than 64) symbol table
        chars.append(symbols[(crc & 63) % len(symbols)])
        crc >>= 6
    return "".join(chars)