fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / storage / wordfast.py
blob2aec32ca4ff83ea263d3e7f0bf68d5cd79d3d19d
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2007 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """Manage the Wordfast Translation Memory format
23 """
25 import csv
26 import time
27 from translate.storage import base
WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
"""Time format used by Wordfast"""

# Column order of the first (header) line of a Wordfast TM file.
WF_FIELDNAMES_HEADER = [
    "date",
    "userlist",
    "tucount",
    "src-lang",
    "version",
    "target-lang",
    "license",
    "attr1list",
    "attr2list",
    "attr3list",
    "attr4list",
    "attr5list",
]
"""Field names for the Wordfast header"""

# Column order of every translation unit line that follows the header.
WF_FIELDNAMES = [
    "date",
    "user",
    "reuse",
    "src-lang",
    "source",
    "target-lang",
    "target",
    "attr1",
    "attr2",
    "attr3",
    "attr4",
]
"""Field names for a Wordfast TU"""

# Header values are stored with their leading "%" marker exactly as they
# are written to the file; "attr5list" intentionally has no default here.
WF_FIELDNAMES_HEADER_DEFAULTS = {
    "date": "%19000101~121212",
    "userlist": "%User ID,TT,TT Translate-Toolkit",
    "tucount": "%TU=00000001",
    "src-lang": "%EN-US",
    "version": "%Wordfast TM v.5.51w9/00",
    "target-lang": "",
    "license": "%---00000001",
    "attr1list": "",
    "attr2list": "",
    "attr3list": "",
    "attr4list": "",
}
"""Default or minimum header entries for a Wordfast file"""
# TODO Needs validation.  These code points still have to be checked against
# a real Wordfast TM file.  They were taken from the Windows CP1252 and
# Macintosh code pages as found on Linux, and look correct for now.
WF_ESCAPE_MAP = (
    # The ampersand must stay first so that encoding does not escape the
    # "&" that introduces the other escapes.
    ("&'26;", u"\u0026"),  # ampersand
    ("&'82;", u"\u201A"),  # single low-9 quotation mark
    ("&'85;", u"\u2026"),  # horizontal ellipsis
    ("&'91;", u"\u2018"),  # left single quotation mark
    ("&'92;", u"\u2019"),  # right single quotation mark
    ("&'93;", u"\u201C"),  # left double quotation mark
    ("&'94;", u"\u201D"),  # right double quotation mark
    ("&'96;", u"\u2013"),  # en dash (validate)
    ("&'97;", u"\u2014"),  # em dash (validate)
    ("&'99;", u"\u2122"),  # trade mark sign

    # Windows only
    ("&'A0;", u"\u00A0"),  # no-break space
    ("&'A9;", u"\u00A9"),  # copyright sign
    ("&'AE;", u"\u00AE"),  # registered sign
    ("&'BC;", u"\u00BC"),  # vulgar fraction one quarter
    ("&'BD;", u"\u00BD"),  # vulgar fraction one half
    ("&'BE;", u"\u00BE"),  # vulgar fraction three quarters

    # Mac only
    ("&'A8;", u"\u00AE"),  # registered sign
    ("&'AA;", u"\u2122"),  # trade mark sign
    ("&'C7;", u"\u00AB"),  # left-pointing double angle quotation mark
    ("&'C8;", u"\u00BB"),  # right-pointing double angle quotation mark
    ("&'C9;", u"\u2026"),  # horizontal ellipsis
    ("&'CA;", u"\u00A0"),  # no-break space
    ("&'D0;", u"\u2013"),  # en dash (validate)
    ("&'D1;", u"\u2014"),  # em dash (validate)
    ("&'D2;", u"\u201C"),  # left double quotation mark
    ("&'D3;", u"\u201D"),  # right double quotation mark
    ("&'D4;", u"\u2018"),  # left single quotation mark
    ("&'D5;", u"\u2019"),  # right single quotation mark
    ("&'E2;", u"\u201A"),  # single low-9 quotation mark
    ("&'E3;", u"\u201E"),  # double low-9 quotation mark

    # Other markers
    # The soft-break escape "&'B;" (a newline, possibly also representable
    # as \u2028) is deliberately left out: XXX it creates a problem with
    # roundtripping.
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""

# Byte pair searched for in the first line of a file to detect UTF-16 input.
TAB_UTF16 = "\x00\x09"
96 def _char_to_wf(string):
97 """Char -> Wordfast &'XX; escapes
99 @note: Full roundtripping is not possible because of the escaping of \n and \t"""
100 # FIXME there is no platform check to ensure that we use Mac encodings when running on a Mac
101 if string:
102 for code, char in WF_ESCAPE_MAP:
103 string = string.replace(char.encode('utf-8'), code)
104 string = string.replace("\n", "\\n").replace("\t", "\\t")
105 return string
107 def _wf_to_char(string):
108 """Wordfast &'XX; escapes -> Char"""
109 if string:
110 for code, char in WF_ESCAPE_MAP:
111 string = string.replace(code, char.encode('utf-8'))
112 string = string.replace("\\n", "\n").replace("\\t", "\t")
113 return string
class WordfastTime(object):
    """Wrapper around a time stamp in the Wordfast format YYYYMMDD~hhmmss"""

    def __init__(self, newtime=None):
        """Create the time stamp, optionally initialised from newtime

        @param newtime: a Wordfast time string or a time_struct; any other
        value leaves the time stamp unset
        @type newtime: String or time.struct_time
        """
        self._time = None
        if not newtime:
            self.time = None
            return
        if isinstance(newtime, basestring):
            self.timestring = newtime
            return
        if isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Get the time in the Wordfast time format, or None when unset"""
        return time.strftime(WF_TIMEFORMAT, self._time) if self._time else None

    def set_timestring(self, timestring):
        """Set the time_struct object using a Wordfast time formatted string

        @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        @type timestring: String
        """
        parsed = time.strptime(timestring, WF_TIMEFORMAT)
        self._time = parsed
    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Get the time_struct object"""
        return self._time

    def set_time(self, newtime):
        """Set the time_struct object

        Anything that is not a time_struct resets the time stamp to None.

        @param newtime: a new time object
        @type newtime: time.time_struct
        """
        is_valid = bool(newtime) and isinstance(newtime, time.struct_time)
        self._time = newtime if is_valid else None
    time = property(get_time, set_time)

    def __str__(self):
        # An unset time stamp is rendered as the empty string.
        return self.timestring or ""
class WordfastHeader(object):
    """A wordfast translation memory header"""

    def __init__(self, header=None):
        """Create a header, from the given dictionary or from the defaults

        @param header: header fields keyed by the Wordfast header field
        names; when empty or None a default header is created
        @type header: Dict
        """
        # Must be a dictionary (not a list): the property setters below
        # assign by key even when no usable header was supplied.
        self._header_dict = {}
        if not header:
            self.header = self._create_default_header()
        elif isinstance(header, dict):
            self.header = header

    def _create_default_header(self):
        """Create a default Wordfast header with the date set to the current time"""
        # Work on a copy so that the module level defaults are never
        # modified and separate headers do not share one dictionary.
        defaultheader = dict(WF_FIELDNAMES_HEADER_DEFAULTS)
        defaultheader['date'] = '%%%s' % WordfastTime(time.localtime()).timestring
        return defaultheader

    def getheader(self):
        """Get the header dictionary"""
        return self._header_dict

    def setheader(self, newheader):
        """Replace the header dictionary

        @param newheader: the new header fields
        @type newheader: Dict
        """
        self._header_dict = newheader
    header = property(getheader, setheader)

    def settargetlang(self, newlang):
        """Set the target language; it is stored with the leading % marker"""
        self._header_dict['target-lang'] = '%%%s' % newlang
    targetlang = property(None, settargetlang)

    def settucount(self, count):
        """Set the TU count; it is stored as %TU= plus a zero padded 8 digit number"""
        self._header_dict['tucount'] = '%%TU=%08d' % count
    tucount = property(None, settucount)
class WordfastUnit(base.TranslationUnit):
    """A Wordfast translation memory unit"""

    def __init__(self, source=None):
        """Create a unit, optionally with the given source text"""
        self._dict = {}
        if source:
            self.source = source
        super(WordfastUnit, self).__init__(source)

    def _update_timestamp(self):
        """Refresh the timestamp for the unit"""
        self._dict['date'] = WordfastTime(time.localtime()).timestring

    def getdict(self):
        """Get the dictionary of values for a Wordfast line"""
        return self._dict

    def setdict(self, newdict):
        """Set the dictionary of values for a Wordfast line

        @param newdict: a new dictionary with Wordfast line elements
        @type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict
    dict = property(getdict, setdict)

    def _get_source_or_target(self, key):
        """Get the unescaped, decoded text stored under key

        @param key: the field to read ('source' or 'target')
        @type key: String
        @return: None when the field is missing or None, "" when it is
        empty, otherwise the text as unicode
        """
        # A missing field is treated like None, in line with istranslated(),
        # so that a partial dictionary does not raise a KeyError.
        value = self._dict.get(key)
        if value is None:
            return None
        if value:
            return _wf_to_char(value).decode('utf-8')
        return ""

    def _set_source_or_target(self, key, newvalue):
        """Store newvalue under key in its escaped, UTF-8 encoded form

        The timestamp is only refreshed when the stored text changes.

        @param key: the field to write ('source' or 'target')
        @type key: String
        @param newvalue: the new text, or None to clear the field
        @type newvalue: String, Unicode or None
        """
        if newvalue is None:
            # Clearing a field does not touch the timestamp.
            self._dict[key] = None
            return
        if isinstance(newvalue, unicode):
            newvalue = newvalue.encode('utf-8')
        newvalue = _char_to_wf(newvalue)
        if key not in self._dict or newvalue != self._dict[key]:
            self._dict[key] = newvalue
            self._update_timestamp()

    def getsource(self):
        """Get the source text"""
        return self._get_source_or_target('source')

    def setsource(self, newsource):
        """Set the source text"""
        return self._set_source_or_target('source', newsource)
    source = property(getsource, setsource)

    def gettarget(self):
        """Get the target text"""
        return self._get_source_or_target('target')

    def settarget(self, newtarget):
        """Set the target text"""
        return self._set_source_or_target('target', newtarget)
    target = property(gettarget, settarget)

    def settargetlang(self, newlang):
        """Set the target language field of the unit"""
        self._dict['target-lang'] = newlang
    targetlang = property(None, settargetlang)

    def __str__(self):
        return str(self._dict)

    def istranslated(self):
        """Report whether both the source and the target are non-empty"""
        if not self._dict.get('source', None):
            return False
        return bool(self._dict.get('target', None))
class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""

    def __init__(self, inputfile=None, unitclass=WordfastUnit):
        """construct a Wordfast TM, optionally reading in from inputfile."""
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.header = WordfastHeader()
        self._encoding = 'utf-16'
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """parses the given file or file source string

        @param input: an object with a read() method (it is closed after
        reading) or the raw content of a Wordfast file
        @type input: File or String
        @raise ValueError: when the content cannot be decoded with the
        detected encoding
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            tmsrc = input.read()
            input.close()
            input = tmsrc
        # The encoding is guessed from the raw first line: the UTF-16 tab
        # byte pair marks UTF-16, anything else is read as ISO-8859-1.
        if TAB_UTF16 in input.split("\n")[0]:
            self._encoding = 'utf-16'
        else:
            self._encoding = 'iso-8859-1'
        try:
            input = input.decode(self._encoding).encode('utf-8')
        except UnicodeError:
            # Only encoding problems are reported this way; a bare except
            # would also swallow KeyboardInterrupt and SystemExit.
            raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
        # Split once: the first line is the header, the rest are the units.
        rows = input.split("\n")
        for header in csv.DictReader(rows[:1], fieldnames=WF_FIELDNAMES_HEADER,
                                     dialect="excel-tab"):
            self.header = WordfastHeader(header)
        lines = csv.DictReader(rows[1:], fieldnames=WF_FIELDNAMES,
                               dialect="excel-tab")
        for line in lines:
            # Use the configured unit class rather than a hard coded one.
            newunit = self.UnitClass()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the translated units, preceded by the header line

        @return: the file content encoded in the encoding of the parsed file
        (UTF-16 when that encoding cannot represent the text), or "" when
        there are no translated units
        """
        # NOTE(review): csv.StringIO is not a documented part of the csv
        # module; it is only available because csv imports StringIO itself.
        output = csv.StringIO()
        header_output = csv.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="excel-tab")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        if unit_count == 0:
            return ""
        # The header is written last because it carries the unit count.
        self.header.tucount = unit_count
        outheader = csv.DictWriter(header_output, fieldnames=WF_FIELDNAMES_HEADER,
                                   dialect="excel-tab")
        outheader.writerow(self.header.header)
        # getvalue() works on both StringIO implementations, whereas reset()
        # only exists on cStringIO objects.
        decoded = (header_output.getvalue() + output.getvalue()).decode('utf-8')
        try:
            return decoded.encode(self._encoding)
        except UnicodeEncodeError:
            return decoded.encode('utf-16')