2 # -*- coding: utf-8 -*-
4 # Copyright 2007 Zuza Software Foundation
6 # the function "__str__" was derived from Python v2.4
7 # (Tools/i18n/msgfmt.py - function "generate"):
8 # Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
9 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
10 # All rights reserved.
11 # original license: Python Software Foundation (version 2)
14 # This file is part of translate.
16 # translate is free software; you can redistribute it and/or modify
17 # it under the terms of the GNU General Public License as published by
18 # the Free Software Foundation; either version 2 of the License, or
19 # (at your option) any later version.
21 # translate is distributed in the hope that it will be useful,
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 # GNU General Public License for more details.
26 # You should have received a copy of the GNU General Public License
27 # along with translate; if not, write to the Free Software
28 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
31 """Module for parsing Gettext .mo files for translation.
33 The coding of .mo files was produced from documentation in Gettext 0.16 and
34 from observation and testing of existing .mo files in the wild.
36 The class does not implement any of the hashing components of Gettext. This
37 will probably make the output file slower in some instances.
40 from translate
.storage
import base
41 from translate
.storage
import po
42 from translate
.misc
.multistring
import multistring
# Magic number that opens every Gettext .mo file; the parser compares it
# against the first four bytes of the input read in both byte orders.
# Written without the Python-2-only "L" suffix: the value is identical, and
# the literal stays valid on interpreters that reject long-integer suffixes.
MO_MAGIC_NUMBER = 0x950412de
49 def mounpack(filename
='messages.mo'):
50 """Helper to unpack Gettext MO files into a Python string"""
53 print "\\x%02x"*len(s
) % tuple(map(ord, s
))
57 c0
= (result
>> 0) & 0xff
58 c1
= (result
>> 8) & 0xff
59 c2
= (result
>> 16) & 0xff
60 c3
= (result
>> 24) & 0xff
62 return (c0
<< 24) |
(c1
<< 16) |
(c2
<< 8) | c3
64 def hashpjw(str_param
):
72 g
= hval
& 0xf << (HASHWORDBITS
- 4)
74 hval
= hval ^ g
>> (HASHWORDBITS
- 8)
79 class mounit(base
.TranslationUnit
):
80 """A class representing a .mo translation message."""
81 def __init__(self
, source
=None):
83 self
.msgidcomments
= []
84 super(mounit
, self
).__init
__(source
)
87 """Get the message context"""
88 # Still need to handle KDE comments
89 if self
.msgctxt
is None:
91 return "".join(self
.msgctxt
)
94 """Is this a header entry?"""
95 return self
.source
== ""
def istranslatable(self):
    """Tell whether this unit can be translated.

    Returns True when the unit's source is non-empty, False otherwise.
    """
    if self.source:
        return True
    return False
101 class mofile(base
.TranslationStore
):
102 """A class representing a .mo file."""
104 def __init__(self
, inputfile
=None, unitclass
=mounit
):
105 self
.UnitClass
= unitclass
106 base
.TranslationStore
.__init
__(self
, unitclass
=unitclass
)
109 if inputfile
is not None:
110 self
.parsestring(inputfile
)
113 """Output a string representation of the MO data file"""
114 # check the header of this file for the copyright note of this function
115 def add_to_hash_table(string
, i
):
117 S
= hash_size
<= 2 and 3 or hash_size
# Taken from gettext-0.17:gettext-tools/src/write-mo.c:408-409
119 orig_hash_cursor
= hash_cursor
;
120 increment
= 1 + (V
% (S
- 2));
122 index
= hash_table
[hash_cursor
]
124 hash_table
[hash_cursor
] = i
+ 1
126 hash_cursor
+= increment
127 hash_cursor
= hash_cursor
% S
128 assert(hash_cursor
!= orig_hash_cursor
)
130 if len(self
.units
) == 0:
132 hash_size
= int(len(self
.units
) * 1.4)
134 for unit
in self
.units
:
135 if isinstance(unit
.source
, multistring
):
136 source
= "".join(unit
.msgidcomments
) + "\0".join(unit
.source
.strings
)
138 source
= "".join(unit
.msgidcomments
) + unit
.source
140 source
= "".join(unit
.msgctxt
) + "\x04" + source
141 if isinstance(unit
.target
, multistring
):
142 target
= "\0".join(unit
.target
.strings
)
146 MESSAGES
[source
.encode("utf-8")] = target
147 hash_table
= array
.array("L", [0] * hash_size
)
148 keys
= MESSAGES
.keys()
149 # the keys are sorted in the .mo file
153 for i
, id in enumerate(keys
):
154 # For each string, we need size and file offset. Each string is NUL
155 # terminated; the NUL does not count into the size.
156 # TODO: We don't do any encoding detection from the PO Header
157 add_to_hash_table(id, i
)
158 string
= MESSAGES
[id] # id is already encoded for use as a dictionary key
159 if isinstance(string
, unicode):
160 string
= string
.encode('utf-8')
161 offsets
.append((len(ids
), len(id), len(strs
), len(string
)))
162 ids
= ids
+ id + '\0'
163 strs
= strs
+ string
+ '\0'
165 # The header is 7 32-bit unsigned integers. The keys start after the two
166 # index tables and the hash table that follow it.
168 keystart
= 7*4+16*len(keys
)+hash_size
*4
169 # and the values start after the keys
170 valuestart
= keystart
+ len(ids
)
173 # The string table first has the list of keys, then the list of values.
174 # Each entry has first the size of the string, then the file offset.
175 for o1
, l1
, o2
, l2
in offsets
:
176 koffsets
= koffsets
+ [l1
, o1
+keystart
]
177 voffsets
= voffsets
+ [l2
, o2
+valuestart
]
178 offsets
= koffsets
+ voffsets
179 output
= struct
.pack("Iiiiiii",
180 MO_MAGIC_NUMBER
, # Magic
182 len(keys
), # # of entries
183 7*4, # start of key index
184 7*4+len(keys
)*8, # start of value index
185 hash_size
, 7*4+2*(len(keys
)*8)) # size and offset of hash table
186 output
= output
+ array
.array("i", offsets
).tostring()
187 output
= output
+ hash_table
.tostring()
188 output
= output
+ ids
189 output
= output
+ strs
192 def parse(self
, input):
193 """parses the given file or file source string"""
194 if hasattr(input, 'name'):
195 self
.filename
= input.name
196 elif not getattr(self
, 'filename', ''):
198 if hasattr(input, "read"):
202 little
, = struct
.unpack("<L", input[:4])
203 big
, = struct
.unpack(">L", input[:4])
204 if little
== MO_MAGIC_NUMBER
:
206 elif big
== MO_MAGIC_NUMBER
:
209 raise ValueError("This is not an MO file")
210 magic
, version
, lenkeys
, startkey
, startvalue
, sizehash
, offsethash
= struct
.unpack("%sLiiiiii" % endian
, input[:(7*4)])
212 raise ValueError("Unable to process MO files with versions > 1. This is a %d version MO file" % version
)
214 for i
in range(lenkeys
):
215 nextkey
= startkey
+(i
*2*4)
216 nextvalue
= startvalue
+(i
*2*4)
217 klength
, koffset
= struct
.unpack("%sii" % endian
, input[nextkey
:nextkey
+(2*4)])
218 vlength
, voffset
= struct
.unpack("%sii" % endian
, input[nextvalue
:nextvalue
+(2*4)])
219 source
= input[koffset
:koffset
+klength
]
222 context
, source
= source
.split("\x04")
223 # Still need to handle KDE comments
224 source
= multistring(source
.split("\0"), encoding
=encoding
)
226 charset
= re
.search("charset=([^\\s]+)", input[voffset
:voffset
+vlength
])
228 encoding
= po
.encodingToUse(charset
.group(1))
229 target
= multistring(input[voffset
:voffset
+vlength
].split("\0"), encoding
=encoding
)
230 newunit
= mounit(source
)
231 newunit
.settarget(target
)
232 if context
is not None:
233 newunit
.msgctxt
.append(context
)
234 self
.addunit(newunit
)