storage/mo.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2007 Zuza Software Foundation
   5 #
   6 # the function "__str__" was derived from Python v2.4
   7 #       (Tools/i18n/msgfmt.py - function "generate"):
   8 #   Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
   9 #   Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
  10 #   All rights reserved.
  11 #   original license: Python Software Foundation (version 2)
  12 #
  13 #
  14 # This file is part of translate.
  15 #
  16 # translate is free software; you can redistribute it and/or modify
  17 # it under the terms of the GNU General Public License as published by
  18 # the Free Software Foundation; either version 2 of the License, or
  19 # (at your option) any later version.
  20 #
  21 # translate is distributed in the hope that it will be useful,
  22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  24 # GNU General Public License for more details.
  25 #
  26 # You should have received a copy of the GNU General Public License
  27 # along with translate; if not, write to the Free Software
  28 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  29 #
  30
  31 """Module for parsing Gettext .mo files for translation.
  32
  33 The coding of .mo files was produced from documentation in Gettext 0.16 and
  34 from observation and testing of existing .mo files in the wild.
  35
  36 The class does not use the hashing components of Gettext when reading: the
  37 hash table of a parsed file is ignored.  A hash table is written on output.
  38 """
  39
  40 from translate.storage import base
  41 from translate.storage import po
  42 from translate.misc.multistring import multistring
  43 import struct
  44 import array
  45 import re
  46
  47 MO_MAGIC_NUMBER = 0x950412deL
  48
  49 def mounpack(filename='messages.mo'):
  50     """Helper to unpack Gettext MO files into a Python string"""
  51     f = open(filename)
  52     s = f.read()
  53     print "\\x%02x"*len(s) % tuple(map(ord, s))
  54     f.close()
  55
  56 def my_swap4(result):
  57     c0 = (result >> 0) & 0xff
  58     c1 = (result >> 8) & 0xff
  59     c2 = (result >> 16) & 0xff
  60     c3 = (result >> 24) & 0xff
  61
  62     return (c0 << 24) | (c1 << 16) | (c2 << 8) | c3
  63
  64 def hashpjw(str_param):
  65    HASHWORDBITS = 32
  66    hval = 0
  67    g = None
  68    s = str_param
  69    for s in str_param:
  70        hval = hval << 4
  71        hval += ord(s)
  72        g = hval & 0xf << (HASHWORDBITS - 4)
  73        if (g != 0):
  74            hval = hval ^ g >> (HASHWORDBITS - 8)
  75            hval = hval ^ g
  76    return hval
  77
  78
  79 class mounit(base.TranslationUnit):
  80     """A class representing a .mo translation message."""
  81     def __init__(self, source=None):
  82         self.msgctxt = []
  83         self.msgidcomments = []
  84         super(mounit, self).__init__(source)
  85
  86     def getcontext(self):
  87         """Get the message context"""
  88         # Still need to handle KDE comments
  89         if self.msgctxt is None:
  90             return None
  91         return "".join(self.msgctxt)
  92
  93     def isheader(self):
  94         """Is this a header entry?"""
  95         return self.source == ""
  96
  97     def istranslatable(self):
  98         """Is this message translateable?"""
  99         return bool(self.source)
 100
 101 class mofile(base.TranslationStore):
 102     """A class representing a .mo file."""
 103     UnitClass = mounit
 104     def __init__(self, inputfile=None, unitclass=mounit):
 105         self.UnitClass = unitclass
 106         base.TranslationStore.__init__(self, unitclass=unitclass)
 107         self.units = []
 108         self.filename = ''
 109         if inputfile is not None:
 110             self.parsestring(inputfile)
 111
 112     def __str__(self):
 113         """Output a string representation of the MO data file"""
 114         # check the header of this file for the copyright note of this function
 115         def add_to_hash_table(string, i):
 116             V = hashpjw(string)
 117             S = hash_size <= 2 and 3 or hash_size # Taken from gettext-0.17:gettext-tools/src/wrote-mo.c:408-409
 118             hash_cursor = V % S;
 119             orig_hash_cursor = hash_cursor;
 120             increment = 1 + (V % (S - 2));
 121             while True:
 122                 index = hash_table[hash_cursor]
 123                 if (index == 0):
 124                     hash_table[hash_cursor] = i + 1
 125                     break
 126                 hash_cursor += increment
 127                 hash_cursor = hash_cursor % S
 128                 assert(hash_cursor != orig_hash_cursor)
 129
 130         if len(self.units) == 0:
 131             return ''
 132         hash_size = int(len(self.units) * 1.4)
 133         MESSAGES = {}
 134         for unit in self.units:
 135             if isinstance(unit.source, multistring):
 136                 source = "".join(unit.msgidcomments) + "\0".join(unit.source.strings)
 137             else:
 138                 source = "".join(unit.msgidcomments) + unit.source
 139             if unit.msgctxt:
 140                 source = "".join(unit.msgctxt) + "\x04" + source
 141             if isinstance(unit.target, multistring):
 142                 target = "\0".join(unit.target.strings)
 143             else:
 144                 target = unit.target
 145             if unit.target:
 146                 MESSAGES[source.encode("utf-8")] = target
 147         hash_table = array.array("L", [0] * hash_size)
 148         keys = MESSAGES.keys()
 149         # the keys are sorted in the .mo file
 150         keys.sort()
 151         offsets = []
 152         ids = strs = ''
 153         for i, id in enumerate(keys):
 154             # For each string, we need size and file offset.  Each string is NUL
 155             # terminated; the NUL does not count into the size.
 156             # TODO: We don't do any encoding detection from the PO Header
 157             add_to_hash_table(id, i)
 158             string = MESSAGES[id] # id is already encoded for use as a dictionary key
 159             if isinstance(string, unicode):
 160                 string = string.encode('utf-8')
 161             offsets.append((len(ids), len(id), len(strs), len(string)))
 162             ids = ids + id + '\0'
 163             strs = strs + string + '\0'
 164         output = ''
 165         # The header is 7 32-bit unsigned integers.  We don't use hash tables, so
 166         # the keys start right after the index tables.
 167         # translated string.
 168         keystart = 7*4+16*len(keys)+hash_size*4
 169         # and the values start after the keys
 170         valuestart = keystart + len(ids)
 171         koffsets = []
 172         voffsets = []
 173         # The string table first has the list of keys, then the list of values.
 174         # Each entry has first the size of the string, then the file offset.
 175         for o1, l1, o2, l2 in offsets:
 176             koffsets = koffsets + [l1, o1+keystart]
 177             voffsets = voffsets + [l2, o2+valuestart]
 178         offsets = koffsets + voffsets
 179         output = struct.pack("Iiiiiii",
 180                              MO_MAGIC_NUMBER,   # Magic
 181                              0,                 # Version
 182                              len(keys),         # # of entries
 183                              7*4,               # start of key index
 184                              7*4+len(keys)*8,   # start of value index
 185                              hash_size, 7*4+2*(len(keys)*8))              # size and offset of hash table
 186         output = output + array.array("i", offsets).tostring()
 187         output = output + hash_table.tostring()
 188         output = output + ids
 189         output = output + strs
 190         return output
 191
 192     def parse(self, input):
 193         """parses the given file or file source string"""
 194         if hasattr(input, 'name'):
 195             self.filename = input.name
 196         elif not getattr(self, 'filename', ''):
 197             self.filename = ''
 198         if hasattr(input, "read"):
 199             mosrc = input.read()
 200             input.close()
 201             input = mosrc
 202         little, = struct.unpack("<L", input[:4])
 203         big, = struct.unpack(">L", input[:4])
 204         if little == MO_MAGIC_NUMBER:
 205             endian = "<"
 206         elif big == MO_MAGIC_NUMBER:
 207             endian = ">"
 208         else:
 209             raise ValueError("This is not an MO file")
 210         magic, version, lenkeys, startkey, startvalue, sizehash, offsethash = struct.unpack("%sLiiiiii" % endian, input[:(7*4)])
 211         if version > 1:
 212             raise ValueError("Unable to process MO files with versions > 1.  This is a %d version MO file" % version)
 213         encoding = 'UTF-8'
 214         for i in range(lenkeys):
 215             nextkey = startkey+(i*2*4)
 216             nextvalue = startvalue+(i*2*4)
 217             klength, koffset = struct.unpack("%sii" % endian, input[nextkey:nextkey+(2*4)])
 218             vlength, voffset = struct.unpack("%sii" % endian, input[nextvalue:nextvalue+(2*4)])
 219             source = input[koffset:koffset+klength]
 220             context = None
 221             if "\x04" in source:
 222                 context, source = source.split("\x04")
 223             # Still need to handle KDE comments
 224             source = multistring(source.split("\0"), encoding=encoding)
 225             if source == "":
 226                 charset = re.search("charset=([^\\s]+)", input[voffset:voffset+vlength])
 227                 if charset:
 228                     encoding = po.encodingToUse(charset.group(1))
 229             target = multistring(input[voffset:voffset+vlength].split("\0"), encoding=encoding)
 230             newunit = mounit(source)
 231             newunit.settarget(target)
 232             if context is not None:
 233                 newunit.msgctxt.append(context)
 234             self.addunit(newunit)