Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / solenv / bin / polib.py
blobe1511cdb975ffbafe64cc46638e5ffb9a1a6d6c0
1 # -* coding: utf-8 -*-
3 # License: MIT (see LICENSE file provided)
4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
"""
**polib** allows you to manipulate, create and modify gettext files (pot, po
and mo files). You can load existing files, iterate through their entries,
add or modify entries, comments or metadata, etc., or create new po files
from scratch.

**polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
:func:`~polib.mofile` convenience functions.
"""
15 __author__ = 'David Jean Louis <izimobil@gmail.com>'
16 __version__ = '1.0.8'
17 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
18 'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
20 import array
21 import codecs
22 import os
23 import re
24 import struct
25 import sys
26 import textwrap
27 import binascii
try:
    import io
except ImportError:
    # replacement of io.open() for python < 2.6
    # we use codecs instead: the stand-in class below is bound to the name
    # ``io`` so that the rest of the module can keep calling ``io.open()``.
    class io(object):
        @staticmethod
        def open(fpath, mode='r', encoding=None):
            # Same positional meaning as the call sites in this module use:
            # path, mode, then the text encoding.
            return codecs.open(fpath, mode, encoding)
40 # the default encoding to use when encoding cannot be detected
41 default_encoding = 'utf-8'
43 # python 2/3 compatibility helpers {{{
# One flag drives every interpreter-specific definition below.
PY3 = sys.version_info >= (3, 0)

if PY3:
    # native strings are already text on python 3
    text_type = str

    def b(s):
        """Return the bytes obtained by encoding the string ``s`` as latin-1."""
        return s.encode("latin-1")

    def u(s):
        """Return ``s`` untouched (it is already a text string)."""
        return s
else:
    text_type = unicode

    def b(s):
        """Return ``s`` untouched (native strings are byte strings here)."""
        return s

    def u(s):
        """Return ``s`` decoded to unicode with the ``unicode_escape`` codec."""
        return unicode(s, "unicode_escape")
65 # }}}
66 # _pofile_or_mofile {{{
def _pofile_or_mofile(f, type, **kwargs):
    """
    Shared implementation behind :func:`polib.pofile` and
    :func:`polib.mofile`.

    ``f`` is the path or content to parse and ``type`` is either ``'pofile'``
    or ``'mofile'``; it selects both the parser class and whether encoding
    detection runs in binary mode. The remaining keyword arguments
    (``encoding``, ``check_for_duplicates``, ``klass``, ``wrapwidth``) are
    those documented on the two public functions.

    Returns the object produced by the parser's ``parse()`` method, with its
    ``wrapwidth`` attribute set.
    """
    # an explicitly given encoding wins over auto-detection
    encoding = kwargs.get('encoding')
    if encoding is None:
        encoding = detect_encoding(f, type == 'mofile')

    # pick the parser matching the requested file type and run it
    parser_class = _POFileParser if type == 'pofile' else _MOFileParser
    parser = parser_class(
        f,
        encoding=encoding,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
        klass=kwargs.get('klass')
    )
    parsed = parser.parse()
    parsed.wrapwidth = kwargs.get('wrapwidth', 78)
    return parsed
90 # }}}
91 # _is_file {{{
94 def _is_file(filename_or_contents):
95 """
96 Safely returns the value of os.path.exists(filename_or_contents).
98 Arguments:
100 ``filename_or_contents``
101 either a filename, or a string holding the contents of some file.
102 In the latter case, this function will always return False.
104 try:
105 return os.path.exists(filename_or_contents)
106 except (ValueError, UnicodeEncodeError):
107 return False
108 # }}}
109 # function pofile() {{{
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and returns
    a :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    # the 'pofile' tag makes the shared helper use the po parser
    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
140 # }}}
141 # function mofile() {{{
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.MOFile`
        instance).
    """
    # the 'mofile' tag makes the shared helper use the mo parser and detect
    # the encoding in binary mode
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
173 # }}}
174 # function detect_encoding() {{{
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.
        Content may be given either as text or as a byte string.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    if not _is_file(file):
        # ``file`` holds the content itself. Try the text pattern first and
        # fall back to the bytes pattern when the content is a byte string
        # (a text pattern cannot be applied to bytes on python 3).
        used_bytes_pattern = False
        try:
            match = rxt.search(file)
        except TypeError:
            match = rxb.search(file)
            used_bytes_pattern = True
        if match:
            enc = match.group(1).strip()
            if used_bytes_pattern and not isinstance(enc, text_type):
                enc = enc.decode('utf-8')
            if charset_exists(enc):
                return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode = 'rb'
            rx = rxb
        else:
            mode = 'r'
            rx = rxt
        f = open(file, mode)
        try:
            for line in f:
                match = rx.search(line)
                if not match:
                    continue
                enc = match.group(1).strip()
                if not isinstance(enc, text_type):
                    enc = enc.decode('utf-8')
                # an unknown charset name is ignored and the scan goes on
                if charset_exists(enc):
                    return enc
        finally:
            # runs on normal exit, early return and on any read error
            f.close()
    return default_encoding
230 # }}}
231 # function escape() {{{
def escape(st):
    r"""
    Return ``st`` with backslash, tab, carriage return, newline and double
    quote characters replaced by their two-character escaped forms
    (``\\``, ``\t``, ``\r``, ``\n`` and ``\"``).
    """
    # The backslash pair must stay first: every later replacement inserts
    # backslashes that must not be escaped a second time.
    replacements = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    )
    for raw, quoted in replacements:
        st = st.replace(raw, quoted)
    return st
244 # }}}
245 # function unescape() {{{
def unescape(st):
    r"""
    Return ``st`` with the escape sequences ``\\``, ``\t``, ``\n``, ``\r``
    and ``\"`` turned back into the characters they stand for.
    """
    # letters that expand to a control character; the two other escapable
    # characters (backslash and double quote) simply stand for themselves
    control_chars = {'n': '\n', 't': '\t', 'r': '\r'}

    def expand(match):
        escaped_char = match.group(1)
        return control_chars.get(escaped_char, escaped_char)

    return re.sub(r'\\(\\|n|t|r|")', expand, st)
265 # }}}
266 # function natural_sort() {{{
def natural_sort(lst):
    """
    Return a new list with the strings of ``lst`` in natural order: runs of
    digits are compared as numbers and everything else case-insensitively.
    Credits: http://stackoverflow.com/a/4836734
    """
    def chunk_value(text):
        # digit runs become integers so that "2" sorts before "10"
        return int(text) if text.isdigit() else text.lower()

    def sort_key(item):
        # the capturing group keeps the digit runs in the split result
        return [chunk_value(part) for part in re.split('([0-9]+)', item)]

    return sorted(lst, key=sort_key)
277 # }}}
278 # class _BaseFile {{{
class _BaseFile(list):
    """
    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
    classes. This class should **not** be instantiated directly.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``pofile``
            string, the path to the po or mo file, or its content as a string.

        ``fpath``
            string, the file path to remember when ``pofile`` is not the
            path of an existing file (optional).

        ``wrapwidth``
            integer, the wrap width, only useful when the ``-w`` option was
            passed to xgettext (optional, default: ``78``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file, (optional, default: ``False``).
        """
        list.__init__(self)
        # the path of the file, when known
        pofile = kwargs.get('pofile', None)
        if pofile and _is_file(pofile):
            self.fpath = pofile
        else:
            self.fpath = kwargs.get('fpath')
        # the width at which lines should be wrapped
        self.wrapwidth = kwargs.get('wrapwidth', 78)
        # the file encoding
        self.encoding = kwargs.get('encoding', default_encoding)
        # whether to check for duplicate entries or not
        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
        # header
        self.header = ''
        # both po and mo files have metadata
        self.metadata = {}
        self.metadata_is_fuzzy = 0

    def __unicode__(self):
        """
        Returns the unicode representation of the file: the metadata entry,
        then every non-obsolete entry, then the obsolete entries.
        """
        entries = [self.metadata_as_entry()]
        entries += [e for e in self if not e.obsolete]
        entries += self.obsolete_entries()
        ret = u('\n').join(
            [entry.__unicode__(self.wrapwidth) for entry in entries])
        assert isinstance(ret, text_type)
        return ret

    if PY3:
        def __str__(self):
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the file.
            """
            return unicode(self).encode(self.encoding)

    def __contains__(self, entry):
        """
        Overridden ``list`` method to implement the membership test (in and
        not in).
        The method considers that an entry is in the file if it finds an entry
        that has the same msgid (the test is **case sensitive**) and the same
        msgctxt (or none for both entries).

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
            is not None

    def __eq__(self, other):
        # two files are equal when their string representations are equal
        return str(self) == str(other)

    def append(self, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # check_for_duplicates may not be defined (yet) when unpickling.
        # But if pickling, we never want to check for duplicates anyway.
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).append(entry)

    def insert(self, index, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Arguments:

        ``index``
            index at which the entry should be inserted.

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # same tolerant attribute lookup as append()
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).insert(index, entry)

    def metadata_as_entry(self):
        """
        Returns the file metadata as a :class:`~polib.POEntry` instance with
        an empty msgid, one ``name: value`` line per metadata item in its
        msgstr, and the ``fuzzy`` flag when ``metadata_is_fuzzy`` is set.
        """
        e = POEntry(msgid='')
        mdata = self.ordered_metadata()
        if mdata:
            strs = ['%s: %s' % (name, value) for name, value in mdata]
            e.msgstr = '\n'.join(strs) + '\n'
        if self.metadata_is_fuzzy:
            e.flags.append('fuzzy')
        return e

    def save(self, fpath=None, repr_method='__unicode__'):
        """
        Saves the po file to ``fpath``.
        If it is an existing file and no ``fpath`` is provided, then the
        existing file is rewritten with the modified data.

        Keyword arguments:

        ``fpath``
            string, full or relative path to the file.

        ``repr_method``
            string, the method to use for output.

        Raises ``IOError`` when neither ``fpath`` nor ``self.fpath`` is set.
        """
        if self.fpath is None and fpath is None:
            raise IOError('You must provide a file path to save() method')
        contents = getattr(self, repr_method)()
        if fpath is None:
            fpath = self.fpath
        binary = repr_method == 'to_binary'
        # convert before opening, so that a decoding failure cannot leave a
        # truncated file behind
        if not binary and not isinstance(contents, text_type):
            contents = contents.decode(self.encoding)
        if binary:
            fhandle = open(fpath, 'wb')
        else:
            fhandle = io.open(fpath, 'w', encoding=self.encoding)
        try:
            fhandle.write(contents)
        finally:
            # the handle is released even when the write fails
            fhandle.close()
        # set the file path if not set
        if self.fpath is None and fpath:
            self.fpath = fpath

    def find(self, st, by='msgid', include_obsolete_entries=False,
             msgctxt=False):
        """
        Find the entry which msgid (or property identified by the ``by``
        argument) matches the string ``st``. Returns the first matching
        entry, or ``None`` when there is none.

        Keyword arguments:

        ``st``
            string, the string to search for.

        ``by``
            string, the property to use for comparison (default: ``msgid``).

        ``include_obsolete_entries``
            boolean, whether to also search in entries that are obsolete.

        ``msgctxt``
            string, allows specifying a specific message context for the
            search. The default ``False`` means "do not filter on context"
            (``None`` is a real value: an entry without context).
        """
        if include_obsolete_entries:
            entries = self[:]
        else:
            entries = [e for e in self if not e.obsolete]
        for e in entries:
            if getattr(e, by) == st:
                if msgctxt is not False and e.msgctxt != msgctxt:
                    continue
                return e
        return None

    def ordered_metadata(self):
        """
        Convenience method that returns an ordered version of the metadata
        dictionary. The return value is list of tuples (metadata name,
        metadata_value).
        """
        # copy the dict first
        metadata = self.metadata.copy()
        data_order = [
            'Project-Id-Version',
            'Report-Msgid-Bugs-To',
            'POT-Creation-Date',
            'PO-Revision-Date',
            'Last-Translator',
            'Language-Team',
            'Language',
            'MIME-Version',
            'Content-Type',
            'Content-Transfer-Encoding',
            'Plural-Forms'
        ]
        ordered_data = []
        for data in data_order:
            try:
                value = metadata.pop(data)
                ordered_data.append((data, value))
            except KeyError:
                pass
        # the rest of the metadata will be alphabetically ordered since there
        # are no specs for this AFAIK
        for data in natural_sort(metadata.keys()):
            value = metadata[data]
            ordered_data.append((data, value))
        return ordered_data

    def to_binary(self):
        """
        Return the binary representation of the file.
        """
        # the keys are sorted in the .mo file; msgfmt compares entries with
        # msgctxt if it exists. A sorted copy is used so that the file object
        # itself is never reordered, even when translated_entries() returns
        # the file object.
        entries = sorted(self.translated_entries(),
                         key=lambda o: o.msgctxt or o.msgid)
        # add metadata entry
        entries = [self.metadata_as_entry()] + entries
        entries_len = len(entries)
        nul = b('\0')
        offsets = []
        # chunks are collected and joined once at the end; the running
        # lengths give the offset of each string inside its table
        id_chunks, str_chunks = [], []
        ids_len = strs_len = 0
        for e in entries:
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            msgid = b('')
            if e.msgctxt:
                # Contexts are stored by storing the concatenation of the
                # context, a <EOT> byte, and the original string
                msgid = self._encode(e.msgctxt + '\4')
            if e.msgid_plural:
                msgstr = []
                for index in sorted(e.msgstr_plural.keys()):
                    msgstr.append(e.msgstr_plural[index])
                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                msgstr = self._encode('\0'.join(msgstr))
            else:
                msgid += self._encode(e.msgid)
                msgstr = self._encode(e.msgstr)
            offsets.append((ids_len, len(msgid), strs_len, len(msgstr)))
            id_chunks.append(msgid + nul)
            str_chunks.append(msgstr + nul)
            ids_len += len(msgid) + 1
            strs_len += len(msgstr) + 1
        ids = b('').join(id_chunks)
        strs = b('').join(str_chunks)

        # The header is 7 32-bit unsigned integers.
        keystart = 7 * 4 + 16 * entries_len
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets

        output = struct.pack(
            "Iiiiiii",
            # Magic number
            MOFile.MAGIC,
            # Version
            0,
            # number of entries
            entries_len,
            # start of key index
            7 * 4,
            # start of value index
            7 * 4 + entries_len * 8,
            # size and offset of hash table, we don't use hash tables
            0, keystart
        )
        if sys.version_info >= (3, 2):
            output += array.array("i", offsets).tobytes()
        else:
            output += array.array("i", offsets).tostring()
        output += ids
        output += strs
        return output

    def _encode(self, mixed):
        """
        Encodes the given ``mixed`` argument with the file encoding if and
        only if it's a unicode string and returns the encoded string.
        """
        if isinstance(mixed, text_type):
            mixed = mixed.encode(self.encoding)
        return mixed
612 # }}}
613 # class POFile {{{
class POFile(_BaseFile):
    """
    Po (or Pot) file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by extension,
    the python ``list`` type.
    """

    def __unicode__(self):
        """
        Returns the unicode representation of the po file: the header
        comment lines followed by the entries.
        """
        lines = []
        for header in self.header.split('\n'):
            if not len(header):
                lines.append("#\n")
            elif header[:1] in [',', ':']:
                # the header line already carries its comment marker
                lines.append('#%s\n' % header)
            else:
                lines.append('# %s\n' % header)
        ret = ''.join(lines)

        if not isinstance(ret, text_type):
            ret = ret.decode(self.encoding)

        return ret + _BaseFile.__unicode__(self)

    def save_as_mofile(self, fpath):
        """
        Saves the binary representation of the file to given ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the mo file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method that returns the percentage of translated
        messages, as an integer. A file without any non-obsolete entry
        counts as fully translated.
        """
        total = len([e for e in self if not e.obsolete])
        if total == 0:
            return 100
        translated = len(self.translated_entries())
        return int(translated * 100 / float(total))

    def translated_entries(self):
        """
        Convenience method that returns the list of translated entries.
        """
        return [e for e in self if e.translated()]

    def untranslated_entries(self):
        """
        Convenience method that returns the list of untranslated entries
        (obsolete and fuzzy entries are left out).
        """
        return [e for e in self if not e.translated() and not e.obsolete
                and 'fuzzy' not in e.flags]

    def fuzzy_entries(self):
        """
        Convenience method that returns the list of fuzzy entries.
        """
        return [e for e in self if 'fuzzy' in e.flags]

    def obsolete_entries(self):
        """
        Convenience method that returns the list of obsolete entries.
        """
        return [e for e in self if e.obsolete]

    def merge(self, refpot):
        """
        Convenience method that merges the current pofile with the pot file
        provided, in the spirit of the gettext msgmerge utility:

        * entries of ``refpot`` that are already in this file are updated
          through :meth:`POEntry.merge`;
        * entries of ``refpot`` that are missing from this file are added;
        * entries of this file that are not in ``refpot`` anymore are marked
          obsolete.

        Entries are matched on their message context and msgid, so that two
        entries sharing a msgid but having different contexts stay distinct.

        Keyword argument:

        ``refpot``
            object POFile, the reference catalog.
        """
        def entry_key(entry):
            # an empty context is treated like no context at all
            return (entry.msgctxt or None, entry.msgid)

        # Store entries in dict/set for faster access
        self_entries = dict((entry_key(entry), entry) for entry in self)
        refpot_keys = set(entry_key(entry) for entry in refpot)
        # Merge entries that are in the refpot
        for entry in refpot:
            e = self_entries.get(entry_key(entry))
            if e is None:
                e = POEntry()
                self.append(e)
            e.merge(entry)
        # ok, now we must "obsolete" entries that are not in the refpot anymore
        for entry in self:
            if entry_key(entry) not in refpot_keys:
                entry.obsolete = True
718 # }}}
719 # class MOFile {{{
class MOFile(_BaseFile):
    """
    Mo file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by
    extension, the python ``list`` type.
    """
    # magic number written at the start of the binary output, and the same
    # four bytes in reverse order
    MAGIC = 0x950412de
    MAGIC_SWAPPED = 0xde120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts all keywords arguments accepted by
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        self.magic_number = None
        self.version = 0

    def save_as_pofile(self, fpath):
        """
        Saves the mofile as a pofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        # the base class default output method is the text representation
        _BaseFile.save(self, fpath)

    def save(self, fpath=None):
        """
        Saves the mofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        # a mo file is always written through its binary representation
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns ``100``.
        """
        return 100

    def translated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Returns the file object itself (not a copy).
        """
        return self

    def untranslated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def fuzzy_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def obsolete_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []
791 # }}}
792 # class _BaseEntry {{{
class _BaseEntry(object):
    """
    Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
    This class should **not** be instantiated directly.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``msgid``
            string, the entry msgid.

        ``msgstr``
            string, the entry msgstr.

        ``msgid_plural``
            string, the entry msgid_plural.

        ``msgstr_plural``
            dict, the entry msgstr_plural strings keyed by plural index.

        ``msgctxt``
            string, the entry context (msgctxt).

        ``obsolete``
            bool, whether the entry is "obsolete" or not.

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).
        """
        self.msgid = kwargs.get('msgid', '')
        self.msgstr = kwargs.get('msgstr', '')
        self.msgid_plural = kwargs.get('msgid_plural', '')
        self.msgstr_plural = kwargs.get('msgstr_plural', {})
        self.msgctxt = kwargs.get('msgctxt', None)
        self.obsolete = kwargs.get('obsolete', False)
        self.encoding = kwargs.get('encoding', default_encoding)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.
        """
        if self.obsolete:
            delflag = '#~ '
        else:
            delflag = ''
        ret = []
        # write the msgctxt if any
        if self.msgctxt is not None:
            ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
                                   wrapwidth)
        # write the msgid
        ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
        # write the msgid_plural if any
        if self.msgid_plural:
            ret += self._str_field("msgid_plural", delflag, "",
                                   self.msgid_plural, wrapwidth)
        if self.msgstr_plural:
            # write the msgstr_plural if any
            msgstrs = self.msgstr_plural
            for index in sorted(msgstrs):
                msgstr = msgstrs[index]
                plural_index = '[%s]' % index
                ret += self._str_field("msgstr", delflag, plural_index, msgstr,
                                       wrapwidth)
        else:
            # otherwise write the msgstr
            ret += self._str_field("msgstr", delflag, "", self.msgstr,
                                   wrapwidth)
        ret.append('')
        # On python 2 the lines may be byte strings: decode them as utf-8
        # when possible, otherwise join them as they are.
        if not PY3 and not isinstance(ret[0], text_type):
            try:
                return u('\n').join([x.decode('utf-8') for x in ret])
            except (UnicodeError, AttributeError):
                # not valid utf-8 (or not decodable at all): use direct join
                pass
        return u('\n').join(ret)

    if PY3:
        def __str__(self):
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the entry.
            """
            return unicode(self).encode(self.encoding)

    def __eq__(self, other):
        # two entries are equal when their string representations are equal
        return str(self) == str(other)

    def _str_field(self, fieldname, delflag, plural_index, field,
                   wrapwidth=78):
        """
        Returns the list of output lines for one field of the entry.

        ``fieldname`` is the keyword to write (a leading ``previous_`` is
        dropped), ``delflag`` is the prefix put in front of every line,
        ``plural_index`` is appended to the keyword (e.g. ``[0]``), ``field``
        is the value and ``wrapwidth`` the wrap width (no wrapping when it is
        not positive).
        """
        lines = field.splitlines(True)
        if len(lines) > 1:
            lines = [''] + lines  # start with initial empty line
        else:
            escaped_field = escape(field)
            specialchars_count = 0
            for c in ['\\', '\n', '\r', '\t', '"']:
                specialchars_count += field.count(c)
            # comparison must take into account fieldname length + one space
            # + 2 quotes (eg. msgid "<string>")
            flength = len(fieldname) + 3
            if plural_index:
                flength += len(plural_index)
            real_wrapwidth = wrapwidth - flength + specialchars_count
            if wrapwidth > 0 and len(field) > real_wrapwidth:
                # Wrap the line but take field name into account
                lines = [''] + [unescape(item) for item in wrap(
                    escaped_field,
                    wrapwidth - 2,  # 2 for quotes ""
                    drop_whitespace=False,
                    break_long_words=False
                )]
            else:
                lines = [field]
        if fieldname.startswith('previous_'):
            # quick and dirty trick to get the real field name
            fieldname = fieldname[9:]

        ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
                                escape(lines.pop(0)))]
        for line in lines:
            ret.append('%s"%s"' % (delflag, escape(line)))
        return ret
928 # }}}
929 # class POEntry {{{
class POEntry(_BaseEntry):
    """
    Represents a po file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``comment``
            string, the entry comment.

        ``tcomment``
            string, the entry translator comment.

        ``occurrences``
            list, the entry occurrences.

        ``flags``
            list, the entry flags.

        ``previous_msgctxt``
            string, the entry previous context.

        ``previous_msgid``
            string, the entry previous msgid.

        ``previous_msgid_plural``
            string, the entry previous msgid_plural.

        ``linenum``
            integer, the line number of the entry
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
        self.linenum = kwargs.get('linenum', None)

    def __unicode__(self, wrapwidth=0):
        """
        Returns the unicode representation of the entry: comments,
        occurrences, flags and previous fields, followed by the fields
        written by the base class. Nothing is wrapped unless ``wrapwidth``
        is positive.
        """
        ret = []
        # comments first, if any (with text wrapping as xgettext does)
        if self.obsolete:
            comments = [('tcomment', '# ')]
        else:
            comments = [('comment', '#. '), ('tcomment', '# ')]
        for c in comments:
            val = getattr(self, c[0])
            if val:
                for comment in val.split('\n'):
                    if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth:
                        ret += wrap(
                            comment,
                            wrapwidth,
                            initial_indent=c[1],
                            subsequent_indent=c[1],
                            break_long_words=False
                        )
                    else:
                        ret.append('%s%s' % (c[1], comment))

        # occurrences (with text wrapping as xgettext does)
        if not self.obsolete and self.occurrences:
            filelist = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    filelist.append('%s:%s' % (fpath, lineno))
                else:
                    filelist.append(fpath)
            filestr = ' '.join(filelist)
            if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
                # textwrap split words that contain hyphen, this is not
                # what we want for filenames, so the dirty hack is to
                # temporally replace hyphens with a char that a file cannot
                # contain, like "*"
                ret += [l.replace('*', '-') for l in wrap(
                    filestr.replace('-', '*'),
                    wrapwidth,
                    initial_indent='#: ',
                    subsequent_indent='#: ',
                    break_long_words=False
                )]
            else:
                ret.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            ret.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural
        fields = ['previous_msgctxt', 'previous_msgid',
                  'previous_msgid_plural']
        if self.obsolete:
            prefix = "#~| "
        else:
            prefix = "#| "
        for f in fields:
            val = getattr(self, f)
            if val:
                ret += self._str_field(f, prefix, "", val, wrapwidth)

        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
        ret = u('\n').join(ret)
        return ret

    def __cmp__(self, other):
        """
        Three-way comparison used by all the rich comparison methods.

        Returns a negative number, zero or a positive number depending on
        whether this entry sorts before, with or after ``other``. The
        criteria are, in order: the obsolete state (obsolete entries first),
        the sorted occurrences, the msgid_plural and finally the msgid.
        """
        # First: Obsolete test
        if self.obsolete != other.obsolete:
            if self.obsolete:
                return -1
            else:
                return 1
        # Work on a copy to protect original
        occ1 = sorted(self.occurrences[:])
        occ2 = sorted(other.occurrences[:])
        # sequence comparison also orders a list before any longer list it
        # is a prefix of, whichever side it is on
        if occ1 > occ2:
            return 1
        if occ1 < occ2:
            return -1
        # Compare msgid_plural: these are plain strings, an entry without
        # plural form sorts before one that has it
        plural1 = self.msgid_plural or ''
        plural2 = other.msgid_plural or ''
        if plural1 > plural2:
            return 1
        if plural1 < plural2:
            return -1
        # Finally: Compare message ID
        if self.msgid > other.msgid:
            return 1
        elif self.msgid < other.msgid:
            return -1
        return 0

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def translated(self):
        """
        Returns ``True`` if the entry has been translated or ``False``
        otherwise. Obsolete and fuzzy entries are never translated; a plural
        entry is translated only when none of its plural forms is empty.
        """
        if self.obsolete or 'fuzzy' in self.flags:
            return False
        if self.msgstr != '':
            return True
        if self.msgstr_plural:
            for pos in self.msgstr_plural:
                if self.msgstr_plural[pos] == '':
                    return False
            return True
        return False

    def merge(self, other):
        """
        Merge the current entry with the given pot entry.

        The msgid, context, occurrences, comment, flags, msgid_plural,
        obsolete state and previous fields are taken from ``other``. An
        existing ``fuzzy`` flag is kept, and a plural form that already has
        a translation keeps it.
        """
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        self.occurrences = other.occurrences
        self.comment = other.comment
        fuzzy = 'fuzzy' in self.flags
        self.flags = other.flags[:]  # clone flags
        if fuzzy:
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                try:
                    # keep existing translation at pos if any
                    self.msgstr_plural[pos]
                except KeyError:
                    self.msgstr_plural[pos] = ''

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
1154 # }}}
1155 # class MOEntry {{{
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """
    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # The attributes below always get these fixed values, whatever was
        # passed in ``kwargs``.
        self.comment = ''
        self.tcomment = ''
        self.occurrences = []
        self.flags = []
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None

    def __hash__(self):
        # hash on the (msgid, msgstr) pair
        return hash((self.msgid, self.msgstr))
1191 # }}}
1192 # class _POFileParser {{{
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.
    """

    def __init__(self, pofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed entries, defaults to
            ``POFile`` (optional).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default one
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            # not a file: treat the argument as the po content itself
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp',
                      'tc', 'ms', 'mp', 'mx', 'mi']

        self.add('tc', ['st', 'he'], 'he')
        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
                        'mp', 'mx', 'mi'], 'tc')
        self.add('gc', all_states, 'gc')
        self.add('oc', all_states, 'oc')
        self.add('fl', all_states, 'fl')
        self.add('pc', all_states, 'pc')
        self.add('pm', all_states, 'pm')
        self.add('pp', all_states, 'pp')
        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
                        'pp', 'ms', 'mx'], 'ct')
        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
                        'pm', 'pp', 'ms', 'mx'], 'mi')
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
        self.add('ms', ['mi', 'mp', 'tc'], 'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the populated instance built in the constructor. Raises
        ``IOError`` on a syntax error; the underlying file, if any, is closed
        whether parsing succeeds or fails.
        """
        try:
            self._parse_lines()
            self._extract_metadata()
        finally:
            # close opened file, also when a syntax error aborted the parse
            if not isinstance(self.fhandle, list):  # must be file
                self.fhandle.close()
        return self.instance

    def _check_unescaped_quote(self, line):
        """Raise ``IOError`` if the quoted string ``line`` holds a bare '"'."""
        if re.search(r'([^\\]|^)"', line[1:-1]):
            raise IOError('Syntax error in po file %s (line %s): '
                          'unescaped double quote found' %
                          (self.instance.fpath, self.current_line))

    def _parse_lines(self):
        """Classify every non-empty line and feed its symbol to process()."""
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        tokens = []
        for line in self.fhandle:
            self.current_line += 1
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            # previous-message comments of obsolete entries are skipped
            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete entry: drop the marker and parse the remainder
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]):].lstrip()
                self._check_unescaped_quote(line)
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on an occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                self._check_unescaped_quote(line)
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                if line == '#':
                    # NOTE: current_token was already set above, so this
                    # padding does not reach the handlers.
                    line += ' '
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise IOError('Syntax error in po file %s (line %s)' %
                                  (self.instance.fpath, self.current_line))

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise IOError('Syntax error in po file %s (line %s): '
                                  'invalid continuation line' %
                                  (self.instance.fpath, self.current_line))

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise IOError('Syntax error in po file %s (line %s): '
                                  'unknown keyword %s' %
                                  (self.instance.fpath, self.current_line,
                                   tokens[1]))

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]):].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise IOError('Syntax error in po file %s (line %s)' %
                              (self.instance.fpath, self.current_line))

        if self.current_entry and len(tokens) > 0 and \
           not tokens[0].startswith('#'):
            # since entries are added when another entry is found, we must add
            # the last entry here (only if there are lines). Trailing comments
            # are ignored
            self.instance.append(self.current_entry)

    def _extract_metadata(self):
        """Move the entry with an empty msgid into the instance metadata."""
        # before returning the instance, check if there's metadata and if
        # so extract it in a dict
        metadataentry = self.instance.find('')
        if metadataentry:  # metadata found
            # remove the entry
            self.instance.remove(metadataentry)
            self.instance.metadata_is_fuzzy = metadataentry.flags
            key = None
            for msg in metadataentry.msgstr.splitlines():
                try:
                    key, val = msg.split(':', 1)
                    self.instance.metadata[key] = val.strip()
                except (ValueError, KeyError):
                    # no "key: value" on this line: append it to the value of
                    # the last key seen, if any
                    if key is not None:
                        self.instance.metadata[key] += '\n' + msg.strip()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action; the action
            itself is the ``handle_<next_state>`` method.
        """
        action = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        Raises ``IOError`` when no transition exists for the symbol in the
        current state, or when the handler itself fails.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            # handlers return a false value to keep the current state
            if action():
                self.current_state = state
        except Exception:
            raise IOError('Syntax error in po file (line %s)' %
                          self.current_line)

    # state handlers

    def _start_entry_if_needed(self):
        """Store the current entry and open a new one after a msgstr."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return True

    def handle_tc(self):
        """Handle a translator comment."""
        self._start_entry_if_needed()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._start_entry_if_needed()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._start_entry_if_needed()
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # the part after the last colon is not a line
                        # number: keep the reference untouched (including
                        # the colon) as the file name
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    # no colon at all: file name without line number
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._start_entry_if_needed()
        self.current_entry.flags += [c.strip() for c in
                                     self.current_token[3:].split(',')]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid_plural = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgctxt = \
            unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._start_entry_if_needed()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._start_entry_if_needed()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        # read every digit of the index, not only the first one, so that
        # msgstr[10] is not mistaken for msgstr[1]
        match = re.match(r'msgstr\[(\d+)', self.current_token)
        if match is None:
            raise ValueError('invalid msgstr plural index')
        index = int(match.group(1))
        value = self.current_token[self.current_token.find('"') + 1:-1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1617 # }}}
1618 # class _MOFileParser {{{
class _MOFileParser(object):
    """
    A class to parse binary mo files.
    """

    def __init__(self, mofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file (it is opened in binary mode)

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed entries, defaults to
            ``MOFile`` (optional).
        """
        self.fhandle = open(mofile, 'rb')

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # ``fhandle`` is missing when open() failed in the constructor; do
        # not raise from the finalizer in that case
        fhandle = getattr(self, 'fhandle', None)
        if fhandle:
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated instance. Raises ``IOError`` when the magic
        number or the major revision number is not recognised.
        """
        # parse magic number
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.MAGIC:
            ii = '<II'
        elif magic_number == MOFile.MAGIC_SWAPPED:
            ii = '>II'
        else:
            raise IOError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        version, numofstrings = self._readbinary(ii, 8)
        # from MO file format specs: "A program seeing an unexpected major
        # revision number should stop reading the MO file entirely".
        # The major revision is stored in the upper 16 bits, the minor one
        # (which must not block reading) in the lower 16 bits.
        if version >> 16 not in (0, 1):
            raise IOError('Invalid mo file, unexpected major revision number')
        self.instance.version = version
        # original strings and translation strings hash table offset
        msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
        # move to msgid hash table and read length and offset of msgids
        self.fhandle.seek(msgids_hash_offset)
        msgids_index = []
        for i in range(numofstrings):
            msgids_index.append(self._readbinary(ii, 8))
        # move to msgstr hash table and read length and offset of msgstrs
        self.fhandle.seek(msgstrs_hash_offset)
        msgstrs_index = []
        for i in range(numofstrings):
            msgstrs_index.append(self._readbinary(ii, 8))
        # build entries
        encoding = self.instance.encoding
        for i in range(numofstrings):
            # each index item is a (length, offset) pair
            self.fhandle.seek(msgids_index[i][1])
            msgid = self.fhandle.read(msgids_index[i][0])

            self.fhandle.seek(msgstrs_index[i][1])
            msgstr = self.fhandle.read(msgstrs_index[i][0])
            if i == 0 and not msgid:  # metadata
                raw_metadata, metadata = msgstr.split(b('\n')), {}
                for line in raw_metadata:
                    tokens = line.split(b(':'), 1)
                    if tokens[0] != b(''):
                        try:
                            k = tokens[0].decode(encoding)
                            v = tokens[1].decode(encoding)
                            metadata[k] = v.strip()
                        except IndexError:
                            # key without a ':' separator: store it empty
                            metadata[k] = u('')
                self.instance.metadata = metadata
                continue
            # test if we have a plural entry
            msgid_tokens = msgid.split(b('\0'))
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict(enumerate(msgstr.split(b('\0'))))
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)
        # close opened file
        self.fhandle.close()
        return self.instance

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                     msgstr_plural=None):
        """
        Decode the raw byte strings with the instance encoding and return
        the corresponding ``MOEntry``.
        """
        # a msgctxt, when present, precedes the msgid, separated by \x04
        msgctxt_msgid = msgid.split(b('\x04'))
        encoding = self.instance.encoding
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            for k in msgstr_plural:
                msgstr_plural[k] = msgstr_plural[k].decode(encoding)
            kwargs['msgstr_plural'] = msgstr_plural
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.
        """
        data = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, data)
        if len(tup) == 1:
            return tup[0]
        return tup
1763 # }}}
1764 # class TextWrapper {{{
class TextWrapper(textwrap.TextWrapper):
    """
    Subclass of textwrap.TextWrapper that backport the
    drop_whitespace option.
    """

    def __init__(self, *args, **kwargs):
        # take the option out before delegating, then store it on the
        # instance ourselves
        keep_or_drop = kwargs.pop('drop_whitespace', True)
        textwrap.TextWrapper.__init__(self, *args, **kwargs)
        self.drop_whitespace = keep_or_drop

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Assemble text chunks into lines no longer than 'self.width'
        (longer lines are possible when 'break_long_words' is false).
        A chunk is either a run of whitespace or a "word" and is never
        split, except through 'break_long_words'; line breaks fall
        between chunks. When 'drop_whitespace' is set, whitespace chunks
        are removed from the start and the end of the lines; all other
        whitespace is kept.
        """
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        wrapped = []

        # Work on the reversed list: popping from the end is cheap.
        chunks.reverse()

        while chunks:
            # The first line gets the initial indent, the others the
            # subsequent one.
            prefix = self.subsequent_indent if wrapped \
                else self.initial_indent

            # Room left for text once the prefix is accounted for.
            room = self.width - len(prefix)

            # A line (other than the very first one) must not start with
            # whitespace.
            if self.drop_whitespace and wrapped and not chunks[-1].strip():
                chunks.pop()

            # Pieces of the line being built and their cumulated length.
            pieces = []
            used = 0

            # Take chunks as long as they fit on the line.
            while chunks and used + len(chunks[-1]) <= room:
                piece = chunks.pop()
                pieces.append(piece)
                used += len(piece)

            # The next chunk is wider than a whole line: let the base
            # class decide how to deal with it.
            if chunks and len(chunks[-1]) > room:
                self._handle_long_word(chunks, pieces, used, room)

            # A line must not end with whitespace either.
            if self.drop_whitespace and pieces and pieces[-1].strip() == '':
                pieces.pop()

            # Store the finished line, if anything is left of it.
            if pieces:
                wrapped.append(prefix + ''.join(pieces))

        return wrapped
1846 # }}}
1847 # function wrap() {{{
def wrap(text, width=70, **kwargs):
    """
    Wrap a single paragraph of text, returning a list of wrapped lines.
    """
    needs_backport = sys.version_info < (2, 6)
    if not needs_backport:
        return textwrap.wrap(text, width=width, **kwargs)
    # python < 2.6: rely on the local TextWrapper subclass
    return TextWrapper(width=width, **kwargs).wrap(text)
1858 # }}}
def genKeyId(inkey):
    """
    Return a five character identifier derived from the CRC32 checksum of
    the UTF-8 encoded ``inkey``.

    Each character is selected by six bits of the checksum, lowest bits
    first, reduced modulo the size of the symbol table.
    """
    # Use simple ASCII characters, exclude I, l, 1 and O, 0 to avoid confusing IDs
    alphabet = "ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz23456789"
    checksum = binascii.crc32(bytes(inkey, encoding="UTF-8")) & 0xffffffff
    picked = []
    for _ in range(5):
        picked.append(alphabet[(checksum & 63) % len(alphabet)])
        checksum >>= 6
    return "".join(picked)