storage/qm.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2007 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21 #
  22
  23 """Module for parsing Qt .qm files
  24
  25 @note: based on documentation from Gettext's .qm implementation (see write-qt.c) and on observation
  26 of the output of lrelease.
  27 @note: Certain deprecated section tags are not implemented.  These will break and print out
  28 the missing tag.  They are easy to implement and should follow the structure in 03
  29 (Translation).  We could find no examples that use these so we'd rather leave it
  30 unimplemented until we actually have test data.
  31 @note: Many .qm files are unable to be parsed as they do not have the source text.  We assume
  32 that since they use a hash table to lookup the data there is actually no need for the
  33 source text.  It seems however that in Qt4's lrelease all data is included in the resultant .qm
  34 file.
  35 @todo: We can only parse, not create, a .qm file.  The main issue is that we need to
  36 implement the hashing algorithm (which seems to be identical to the Gettext hash algorithm).  Unlike
  37 Gettext it seems that the hash is required, but that has not been validated.
  38 @todo: The code can parse files correctly.  But it could be cleaned up to be more readable, especially
  39 the part that breaks the file into sections.
  40 """
  41
  42 from translate.storage import base
  43 from translate.misc.multistring import multistring
  44 import codecs
  45 import struct
  46 import sys
  47
  48 QM_MAGIC_NUMBER = (0x3CB86418L, 0xCAEF9C95L, 0xCD211CBFL, 0x60A1BDDDL)
  49
  50 def qmunpack(qmfile='messages.mo'):
  51     """Helper to unpack Qt .qm files into a Python string"""
  52     f = open(qmfile)
  53     s = f.read()
  54     print "\\x%02x"*len(s) % tuple(map(ord, s))
  55     f.close()
  56
class qmunit(base.TranslationUnit):
    """A class representing a .qm translation message.

    Adds nothing to base.TranslationUnit; it exists so that the .qm store
    has its own unit class.
    """
    def __init__(self, source=None):
        """Create a unit, passing C{source} on to the base-class initialiser.

        @param source: handed unchanged to the next C{__init__} in the
        method resolution order.
        """
        super(qmunit, self).__init__(source)
  61
  62 class qmfile(base.TranslationStore):
  63     """A class representing a .qm file."""
  64     UnitClass = qmunit
  65     def __init__(self, inputfile=None, unitclass=qmunit):
  66         self.UnitClass = unitclass
  67         base.TranslationStore.__init__(self, unitclass=unitclass)
  68         self.units = []
  69         self.filename = ''
  70         if inputfile is not None:
  71             self.parsestring(inputfile)
  72
  73     def __str__(self):
  74         """Output a string representation of the .qm data file"""
  75         return ""
  76
  77     def parse(self, input):
  78         """parses the given file or file source string"""
  79         if hasattr(input, 'name'):
  80             self.filename = input.name
  81         elif not getattr(self, 'filename', ''):
  82             self.filename = ''
  83         if hasattr(input, "read"):
  84             qmsrc = input.read()
  85             input.close()
  86             input = qmsrc
  87         magic = struct.unpack(">4L", input[:16])
  88         if magic != QM_MAGIC_NUMBER:
  89             raise ValueError("This is not a .qm file")
  90         startsection = 16
  91         sectionheader = 5
  92         while startsection < len(input):
  93             section_type, length = struct.unpack(">bL", input[startsection:startsection+sectionheader])
  94             if section_type == 0x42:
  95                 #print "Section: hash"
  96                 hashash = True
  97                 hash_start = startsection+sectionheader
  98                 hash_data = struct.unpack(">%db" % length, input[startsection+sectionheader:startsection+sectionheader+length])
  99             elif section_type == 0x69:
 100                 #print "Section: messages"
 101                 hasmessages = True
 102                 messages_start = startsection+sectionheader
 103                 messages_data = struct.unpack(">%db" % length, input[startsection+sectionheader:startsection+sectionheader+length])
 104             elif section_type == 0x2f:
 105                 #print "Section: contexts"
 106                 hascontexts = True
 107                 contexts_start = startsection+sectionheader
 108                 contexts_data = struct.unpack(">%db" % length, input[startsection+sectionheader:startsection+sectionheader+length])
 109             startsection = startsection+sectionheader+length
 110         pos = messages_start
 111         source = target = None
 112         while pos < messages_start + len(messages_data):
 113             subsection, = struct.unpack(">b", input[pos:pos+1])
 114             if subsection == 0x01: # End
 115                 #print "End"
 116                 pos = pos+1
 117                 if not source is None and not target is None:
 118                     newunit = self.addsourceunit(source)
 119                     newunit.target = target
 120                     source = target = None
 121                 else:
 122                     raise ValueError("Old .qm format with no source defined")
 123                 continue
 124             #print pos, subsection
 125             pos = pos+1
 126             length, = struct.unpack(">l", input[pos:pos+4])
 127             if subsection == 0x03: # Translation
 128                 if length != -1:
 129                     raw, = struct.unpack(">%ds" % length, input[pos+4:pos+4+length])
 130                     string, templen = codecs.utf_16_be_decode(raw)
 131                     if target:
 132                         target.strings.append(string)
 133                     else:
 134                         target = multistring(string)
 135                     pos = pos+4+length
 136                 else:
 137                     target = ""
 138                     pos = pos+4
 139                 #print "Translation: %s" % target.encode('utf-8')
 140             elif subsection == 0x06: # SourceText
 141                 source = input[pos+4:pos+4+length].decode('iso-8859-1')
 142                 #print "SourceText: %s" % source
 143                 pos = pos+4+length
 144             elif subsection == 0x07: # Context
 145                 context = input[pos+4:pos+4+length].decode('iso-8859-1')
 146                 #print "Context: %s" % context
 147                 pos = pos+4+length
 148             elif subsection == 0x08: # Disambiguating-comment
 149                 comment = input[pos+4:pos+4+length]
 150                 #print "Disambiguating-comment: %s" % comment
 151                 pos = pos+4+length
 152             elif subsection == 0x05: # hash
 153                 hash = input[pos:pos+4]
 154                 #print "Hash: %s" % hash
 155                 pos = pos+4
 156             else:
 157                 if subsection == 0x02: # SourceText16
 158                     subsection_name = "SourceText16"
 159                 elif subsection == 0x04: # Context16
 160                     subsection_name = "Context16"
 161                 else:
 162                     subsection_name = "Unkown"
 163                 print >> sys.stderr, "Unimplemented: %s %s" % (subsection, subsection_name)
 164                 return