fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / storage / mo.py
blob5388cd6d1beb060c639f6fb55aca5941a339855d
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright 2007 Zuza Software Foundation
6 # the function "__str__" was derived from Python v2.4
7 # (Tools/i18n/msgfmt.py - function "generate"):
8 # Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
9 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
10 # All rights reserved.
11 # original license: Python Software Foundation (version 2)
14 # This file is part of translate.
16 # translate is free software; you can redistribute it and/or modify
17 # it under the terms of the GNU General Public License as published by
18 # the Free Software Foundation; either version 2 of the License, or
19 # (at your option) any later version.
21 # translate is distributed in the hope that it will be useful,
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 # GNU General Public License for more details.
26 # You should have received a copy of the GNU General Public License
27 # along with translate; if not, write to the Free Software
28 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31 """Module for parsing Gettext .mo files for translation.
33 The coding of .mo files was produced from documentation in Gettext 0.16 and
34 from observation and testing of existing .mo files in the wild.
36 The class does not implement any of the hashing components of Gettext. This
37 will probably make the output file slower in some instances.
38 """
40 from translate.storage import base
41 from translate.storage import po
42 from translate.misc.multistring import multistring
43 import struct
44 import array
45 import re
# Magic number found in the first four bytes of an MO file.  mofile.parse()
# compares it against those bytes read in both byte orders.  Written without
# the "L" long-literal suffix: the value is identical and the literal stays
# valid on interpreters that no longer accept the suffix.
MO_MAGIC_NUMBER = 0x950412de
def mounpack(filename='messages.mo'):
    """Helper to unpack Gettext MO files into a Python string

    Reads the whole file and prints every byte as a "\\xNN" escape, all on
    one line.

    filename -- path of the file to dump (defaults to 'messages.mo')
    """
    # MO files are binary data: open in binary mode so that no newline
    # translation or text decoding touches the bytes.
    f = open(filename, 'rb')
    try:
        s = f.read()
    finally:
        # release the file handle even if reading fails
        f.close()
    # struct yields the integer value of each byte for any byte string type
    values = struct.unpack("%dB" % len(s), s)
    print("\\x%02x" * len(s) % values)
def my_swap4(result):
    """Return the four low-order bytes of result in reversed order.

    Bits above the lowest 32 are ignored.
    """
    swapped = 0
    # Walk from the lowest byte upwards; each byte taken pushes the ones
    # collected before it one position higher in the output.
    for shift in (0, 8, 16, 24):
        swapped = (swapped << 8) | ((result >> shift) & 0xff)
    return swapped
def hashpjw(str_param):
    """Compute the hash value of a string.

    mofile.__str__ uses this value to place each message in the hash table
    of the MO file.  Presumably this is P. J. Weinberger's string hash, going
    by the name -- confirm against the Gettext sources.

    str_param -- the string to hash, processed one character at a time
    Returns the hash as a non-negative integer (0 for an empty string).
    """
    HASHWORDBITS = 32
    top_nibble = 0xf << (HASHWORDBITS - 4)
    fold_shift = HASHWORDBITS - 8
    hval = 0
    for char in str_param:
        hval = (hval << 4) + ord(char)
        overflow = hval & top_nibble
        if overflow:
            # fold the top four bits back into the low byte, then clear them
            hval ^= overflow >> fold_shift
            hval ^= overflow
    return hval
class mounit(base.TranslationUnit):
    """One translation message of a .mo file."""

    def __init__(self, source=None):
        """Create a unit for source with empty context and msgid comments."""
        # these lists are set up before delegating to the base class
        self.msgctxt = []
        self.msgidcomments = []
        super(mounit, self).__init__(source)

    def getcontext(self):
        """Return the message context as one string, or None if it is unset"""
        # Still need to handle KDE comments
        if self.msgctxt is not None:
            return "".join(self.msgctxt)
        return None

    def isheader(self):
        """Tell whether this unit is the header (its source is empty)"""
        return self.source == ""

    def istranslatable(self):
        """Tell whether this unit has a non-empty source"""
        if self.source:
            return True
        return False
def _next_prime(start):
    """Return the smallest prime number greater than or equal to start."""
    candidate = max(start, 2)
    while True:
        divisor = 2
        is_prime = True
        while divisor * divisor <= candidate:
            if candidate % divisor == 0:
                is_prime = False
                break
            divisor += 1
        if is_prime:
            return candidate
        candidate += 1


class mofile(base.TranslationStore):
    """A class representing a .mo file."""
    UnitClass = mounit

    def __init__(self, inputfile=None, unitclass=mounit):
        """Create a store and optionally load it from inputfile.

        inputfile -- passed to parsestring() when it is not None
        unitclass -- class used for the units of this store
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.units = []
        self.filename = ''
        if inputfile is not None:
            # NOTE(review): parsestring() is inherited and not visible in this
            # file; confirm that it fills this instance rather than building
            # and returning a separate store.
            self.parsestring(inputfile)

    def __str__(self):
        """Output a string representation of the MO data file

        Only units with a non-empty target are written.  The layout is: a
        header of seven 32-bit integers, the key index, the value index, the
        hash table, the NUL-terminated keys and the NUL-terminated values.
        All integers are written in the byte order of the running machine.
        Returns an empty string if the store has no units.
        """
        # check the header of this file for the copyright note of this function
        if len(self.units) == 0:
            return ''

        MESSAGES = {}
        for unit in self.units:
            if isinstance(unit.source, multistring):
                source = "".join(unit.msgidcomments) + "\0".join(unit.source.strings)
            else:
                source = "".join(unit.msgidcomments) + unit.source
            if unit.msgctxt:
                # context and msgid are separated by an EOT byte in the key
                source = "".join(unit.msgctxt) + "\x04" + source
            if isinstance(unit.target, multistring):
                target = "\0".join(unit.target.strings)
            else:
                target = unit.target
            if unit.target:
                MESSAGES[source.encode("utf-8")] = target

        # The table size must be a prime of at least 3 (compare
        # gettext-0.17:gettext-tools/src/write-mo.c): the probe step below is
        # computed modulo (size - 2), and only a prime size guarantees that
        # stepping by it reaches every slot of the table.
        hash_size = _next_prime(max(3, int(len(self.units) * 1.4)))
        hash_table = [0] * hash_size

        def add_to_hash_table(string, i):
            """Store entry number i (1-based in the table) for string."""
            V = hashpjw(string)
            hash_cursor = V % hash_size
            increment = 1 + (V % (hash_size - 2))
            # The table has more slots than entries and its size is prime, so
            # this probe sequence always reaches a free slot.
            while hash_table[hash_cursor] != 0:
                hash_cursor = (hash_cursor + increment) % hash_size
            hash_table[hash_cursor] = i + 1

        # the keys are sorted in the .mo file
        keys = sorted(MESSAGES)
        header_size = 7 * 4
        index_size = len(keys) * 8
        key_index_start = header_size
        value_index_start = key_index_start + index_size
        hash_start = value_index_start + index_size
        # The keys start right after the two index tables and the hash table,
        keystart = hash_start + hash_size * 4
        # and the values start after the keys (each key is NUL terminated)
        valuestart = keystart + sum([len(msgid) + 1 for msgid in keys])

        koffsets = []
        voffsets = []
        id_chunks = []
        str_chunks = []
        id_offset = 0
        str_offset = 0
        for i, msgid in enumerate(keys):
            # For each string, we need size and file offset. Each string is NUL
            # terminated; the NUL does not count into the size.
            # TODO: We don't do any encoding detection from the PO Header
            add_to_hash_table(msgid, i)
            string = MESSAGES[msgid] # msgid is already encoded for use as a dictionary key
            if isinstance(string, unicode):
                string = string.encode('utf-8')
            # Each index entry has first the size of the string, then the
            # file offset.
            koffsets.extend([len(msgid), keystart + id_offset])
            voffsets.extend([len(string), valuestart + str_offset])
            id_chunks.append(msgid + '\0')
            str_chunks.append(string + '\0')
            id_offset += len(msgid) + 1
            str_offset += len(string) + 1

        # "=" selects native byte order with standard sizes, so every integer
        # occupies exactly four bytes on every platform, as the offsets
        # computed above assume.
        output = struct.pack("=Iiiiiii",
                             MO_MAGIC_NUMBER,        # Magic
                             0,                      # Version
                             len(keys),              # # of entries
                             key_index_start,        # start of key index
                             value_index_start,      # start of value index
                             hash_size, hash_start)  # size and offset of hash table
        # The string table first has the list of keys, then the list of values.
        offsets = koffsets + voffsets
        output = output + struct.pack("=%di" % len(offsets), *offsets)
        output = output + struct.pack("=%dI" % hash_size, *hash_table)
        output = output + ''.join(id_chunks)
        output = output + ''.join(str_chunks)
        return output

    def parse(self, input):
        """parses the given file or file source string

        input -- the contents of an MO file, or an object with a read()
                 method that returns them (it is closed after reading)

        Adds one unit per entry of the file to this store.  Raises ValueError
        if the magic number does not match or the format version is above 1.
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            mosrc = input.read()
            input.close()
            input = mosrc
        # Read the magic number in both byte orders to find out which one the
        # file was written in.
        little, = struct.unpack("<L", input[:4])
        big, = struct.unpack(">L", input[:4])
        if little == MO_MAGIC_NUMBER:
            endian = "<"
        elif big == MO_MAGIC_NUMBER:
            endian = ">"
        else:
            raise ValueError("This is not an MO file")
        # The hash table of the file (sizehash, offsethash) is not used here.
        magic, version, lenkeys, startkey, startvalue, sizehash, offsethash = struct.unpack("%sLiiiiii" % endian, input[:(7*4)])
        if version > 1:
            raise ValueError("Unable to process MO files with versions > 1. This is a %d version MO file" % version)
        # Used until a unit with an empty source names another charset.
        encoding = 'UTF-8'
        for i in range(lenkeys):
            # every index entry is a pair of 32-bit integers: length, offset
            nextkey = startkey+(i*2*4)
            nextvalue = startvalue+(i*2*4)
            klength, koffset = struct.unpack("%sii" % endian, input[nextkey:nextkey+(2*4)])
            vlength, voffset = struct.unpack("%sii" % endian, input[nextvalue:nextvalue+(2*4)])
            source = input[koffset:koffset+klength]
            rawtarget = input[voffset:voffset+vlength]
            context = None
            if "\x04" in source:
                # Split at the first separator only, so that a further EOT
                # byte inside the msgid cannot break the unpacking.
                context, source = source.split("\x04", 1)
            # Still need to handle KDE comments
            source = multistring(source.split("\0"), encoding=encoding)
            if source == "":
                charset = re.search("charset=([^\\s]+)", rawtarget)
                if charset:
                    encoding = po.encodingToUse(charset.group(1))
            target = multistring(rawtarget.split("\0"), encoding=encoding)
            newunit = mounit(source)
            newunit.settarget(target)
            if context is not None:
                newunit.msgctxt.append(context)
            self.addunit(newunit)