# -*- coding: utf-8 -*-
#
# Copyright 2007 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Manage the Wordfast Translation Memory format
"""
27 from translate
.storage
import base
WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
"""Time format used by Wordfast"""

WF_FIELDNAMES_HEADER = [
    "date",
    "userlist",
    "tucount",
    "src-lang",
    "version",
    "target-lang",
    "license",
    "attr1list",
    "attr2list",
    "attr3list",
    "attr4list",
    "attr5list",
]
"""Field names for the Wordfast header"""

WF_FIELDNAMES = [
    "date",
    "user",
    "reuse",
    "src-lang",
    "source",
    "target-lang",
    "target",
    "attr1",
    "attr2",
    "attr3",
    "attr4",
]
"""Field names for a Wordfast TU"""
WF_FIELDNAMES_HEADER_DEFAULTS = {
    "date": "%19000101~121212",
    "userlist": "%User ID,TT,TT Translate-Toolkit",
    "tucount": "%TU=00000001",
    # NOTE(review): the source-language default was not legible in the
    # reviewed text; "%EN-US" is presumed - confirm against a real TM file.
    "src-lang": "%EN-US",
    "version": "%Wordfast TM v.5.51w9/00",
    "target-lang": "",
    "license": "%---00000001",
    # The attribute lists have no default content; an empty string is also
    # what csv.DictWriter writes for a field that is missing from the row.
    "attr1list": "",
    "attr2list": "",
    "attr3list": "",
    "attr4list": "",
}
"""Default or minimum header entries for a Wordfast file"""
# TODO Needs validation. The following need to be checked against a WF TM file
# to ensure that the correct Unicode values have been chosen for the
# characters. For now these look correct and have been taken from Windows
# CP1252 and Macintosh code points found for the respective character sets on
# Linux.
#
# Order matters: the conversion helpers walk this sequence from top to bottom,
# so when two escapes map to the same character the first one listed is the
# one used when escaping.
WF_ESCAPE_MAP = (
    ("&'26;", u"\u0026"),  # & - Ampersand (must be first to prevent escaping of escapes)
    ("&'82;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'85;", u"\u2026"),  # … - Ellipsis
    ("&'91;", u"\u2018"),  # ‘ - left single quotation mark
    ("&'92;", u"\u2019"),  # ’ - right single quotation mark
    ("&'93;", u"\u201C"),  # “ - left double quotation mark
    ("&'94;", u"\u201D"),  # ” - right double quotation mark
    ("&'96;", u"\u2013"),  # – - en dash (validate)
    ("&'97;", u"\u2014"),  # — - em dash (validate)
    ("&'99;", u"\u2122"),  # ™ - Trade mark

    ("&'A0;", u"\u00A0"),  # (U+00A0) - Non breaking space
    ("&'A9;", u"\u00A9"),  # © - Copyright
    ("&'AE;", u"\u00AE"),  # ® - Registered
    ("&'BC;", u"\u00BC"),  # ¼
    ("&'BD;", u"\u00BD"),  # ½
    ("&'BE;", u"\u00BE"),  # ¾

    ("&'A8;", u"\u00AE"),  # ® - Registered
    ("&'AA;", u"\u2122"),  # ™ - Trade mark
    ("&'C7;", u"\u00AB"),  # « - Left-pointing double angle quotation mark
    ("&'C8;", u"\u00BB"),  # » - Right-pointing double angle quotation mark
    ("&'C9;", u"\u2026"),  # … - Horizontal Ellipsis
    ("&'CA;", u"\u00A0"),  # (U+00A0) - Non breaking space
    ("&'D0;", u"\u2013"),  # – - en dash (validate)
    ("&'D1;", u"\u2014"),  # — - em dash (validate)
    ("&'D2;", u"\u201C"),  # “ - left double quotation mark
    ("&'D3;", u"\u201D"),  # ” - right double quotation mark
    ("&'D4;", u"\u2018"),  # ‘ - left single quotation mark
    ("&'D5;", u"\u2019"),  # ’ - right single quotation mark
    ("&'E2;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'E3;", u"\u201E"),  # „ - Double low-9 quotation mark

    #("&'B;", u"\n"), # Soft-break - XXX creates a problem with roundtripping could also be represented by \u2028
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""
# Byte pair searched for in the first line of raw input to decide whether the
# data is UTF-16: a NUL byte followed by a tab (0x09).
TAB_UTF16 = "\x00\x09"
def _char_to_wf(string):
    """Char -> Wordfast &'XX; escapes

    Each character listed in WF_ESCAPE_MAP is replaced by its Wordfast escape
    (the first matching entry wins), then real newlines and tabs are written
    as the two-character sequences \\n and \\t.

    @param string: UTF-8 encoded text; may be None or empty
    @type string: String
    @return: the escaped text; None or an empty string is returned unchanged
    @note: Full roundtripping is not possible because of the escaping of \\n and \\t
    """
    # FIXME there is no platform check to ensure that we use Mac encodings when running on a Mac
    if not string:
        # Nothing to escape; this also lets callers pass None straight through.
        return string
    for code, char in WF_ESCAPE_MAP:
        string = string.replace(char.encode('utf-8'), code)
    return string.replace("\n", "\\n").replace("\t", "\\t")
def _wf_to_char(string):
    """Wordfast &'XX; escapes -> Char

    Each escape listed in WF_ESCAPE_MAP is replaced by the UTF-8 encoding of
    its character, then the two-character sequences \\n and \\t are turned
    back into a real newline and tab.

    @param string: Wordfast escaped text; may be None or empty
    @type string: String
    @return: UTF-8 encoded text; None or an empty string is returned unchanged
    """
    if not string:
        # Nothing to unescape; keeps None/"" intact for the caller.
        return string
    for code, char in WF_ESCAPE_MAP:
        string = string.replace(code, char.encode('utf-8'))
    return string.replace("\\n", "\n").replace("\\t", "\t")
class WordfastTime(object):
    """Manages time stamps in the Wordfast format of YYYYMMDD~hhmmss"""

    def __init__(self, newtime=None):
        """Create a time stamp from a Wordfast time string or a struct_time.

        @param newtime: a Wordfast time string, a time.struct_time, or a
        false value for "no time set"; any other value is ignored
        """
        self._time = None
        if newtime:
            if isinstance(newtime, basestring):
                self.timestring = newtime
            elif isinstance(newtime, time.struct_time):
                self.time = newtime

    def get_timestring(self):
        """Get the time in the Wordfast time format

        @return: the formatted time, or None when no time is set
        """
        if self._time:
            return time.strftime(WF_TIMEFORMAT, self._time)
        return None

    def set_timestring(self, timestring):
        """Set the struct_time object using a Wordfast time formatted string

        @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        @type timestring: String
        """
        self._time = time.strptime(timestring, WF_TIMEFORMAT)
    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Get the struct_time object"""
        return self._time

    def set_time(self, newtime):
        """Set the struct_time object

        Anything that is not a time.struct_time clears the stored time.

        @param newtime: a new time object
        @type newtime: time.struct_time
        """
        usable = bool(newtime) and isinstance(newtime, time.struct_time)
        if not usable:
            newtime = None
        self._time = newtime
    time = property(get_time, set_time)

    def __str__(self):
        """Return the Wordfast time string, or "" when no time is set"""
        return self.timestring or ""
class WordfastHeader(object):
    """A wordfast translation memory header"""

    def __init__(self, header=None):
        """Create a header from a dictionary, or a default one.

        @param header: dictionary of header fields; a false value (None or an
        empty dictionary) produces the default header. A value that is neither
        false nor a dictionary is ignored and leaves the header empty.
        @type header: Dict
        """
        # Must be a dictionary: the property setters below assign by key.
        self._header_dict = {}
        if not header:
            self.header = self._create_default_header()
        elif isinstance(header, dict):
            self.header = header

    def _create_default_header(self):
        """Create a default Wordfast header with the date set to the current time

        @return: a new dictionary; the module-level defaults are copied so
        that later changes to one header never leak into the defaults or into
        other headers
        """
        defaultheader = dict(WF_FIELDNAMES_HEADER_DEFAULTS)
        defaultheader['date'] = '%%%s' % WordfastTime(time.localtime()).timestring
        return defaultheader

    def getheader(self):
        """Get the header dictionary"""
        return self._header_dict

    def setheader(self, newheader):
        """Replace the header dictionary (the object is stored, not copied)"""
        self._header_dict = newheader
    header = property(getheader, setheader)

    def settargetlang(self, newlang):
        """Store the target language, prefixed with '%' as in the other header fields"""
        self._header_dict['target-lang'] = '%%%s' % newlang
    targetlang = property(None, settargetlang)

    def settucount(self, count):
        """Store the unit count as '%TU=' followed by the zero-padded 8 digit number"""
        self._header_dict['tucount'] = '%%TU=%08d' % count
    tucount = property(None, settucount)
class WordfastUnit(base.TranslationUnit):
    """A Wordfast translation memory unit"""

    def __init__(self, source=None):
        """Create a unit, optionally with an initial source text.

        @param source: the source text of the unit
        """
        # The backing dictionary must exist before any property is assigned.
        self._dict = {}
        # NOTE(review): presumably the base class also assigns the source;
        # assigning it here is harmless because an unchanged value is ignored.
        if source:
            self.source = source
        super(WordfastUnit, self).__init__(source)

    def _update_timestamp(self):
        """Refresh the timestamp for the unit"""
        self._dict['date'] = WordfastTime(time.localtime()).timestring

    def getdict(self):
        """Get the dictionary of values for a Wordfast line"""
        return self._dict

    def setdict(self, newdict):
        """Set the dictionary of values for a Wordfast line

        @param newdict: a new dictionary with Wordfast line elements
        @type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict
    dict = property(getdict, setdict)

    def _get_source_or_target(self, key):
        """Return the unescaped, decoded value stored under key.

        @param key: 'source' or 'target'
        @return: None when the value is None or was never set, "" when it is
        empty, otherwise the unicode text with Wordfast escapes resolved
        """
        # .get() so that reading a field that was never set yields None
        # instead of raising KeyError.
        value = self._dict.get(key)
        if value is None:
            return None
        if value:
            return _wf_to_char(value).decode('utf-8')
        return ""

    def _set_source_or_target(self, key, newvalue):
        """Store newvalue under key in Wordfast escaped form.

        The timestamp is refreshed only when the stored value really changes;
        storing None never touches it.

        @param key: 'source' or 'target'
        @param newvalue: unicode or UTF-8 encoded text, or None
        """
        if newvalue is None:
            self._dict[key] = None
            return
        if isinstance(newvalue, unicode):
            newvalue = newvalue.encode('utf-8')
        newvalue = _char_to_wf(newvalue)
        if key not in self._dict or newvalue != self._dict[key]:
            self._dict[key] = newvalue
            self._update_timestamp()

    def getsource(self):
        """Get the source text of the unit"""
        return self._get_source_or_target('source')

    def setsource(self, newsource):
        """Set the source text of the unit"""
        return self._set_source_or_target('source', newsource)
    source = property(getsource, setsource)

    def gettarget(self):
        """Get the target text of the unit"""
        return self._get_source_or_target('target')

    def settarget(self, newtarget):
        """Set the target text of the unit"""
        return self._set_source_or_target('target', newtarget)
    target = property(gettarget, settarget)

    def settargetlang(self, newlang):
        """Store the target language code for this unit"""
        self._dict['target-lang'] = newlang
    targetlang = property(None, settargetlang)

    def __str__(self):
        """Return the string form of the underlying dictionary"""
        return str(self._dict)

    def istranslated(self):
        """Return True only when both source and target are non-empty"""
        if not self._dict.get('source', None):
            return False
        return bool(self._dict.get('target', None))
class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""

    def __init__(self, inputfile=None, unitclass=WordfastUnit):
        """construct a Wordfast TM, optionally reading in from inputfile.

        @param inputfile: a file object or the raw file content to parse
        @param unitclass: the class used for the units of this store
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.header = WordfastHeader()
        self._encoding = 'utf-16'
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """parses the given file or file source string

        The first line is read as the header, every following line as a unit.

        @param input: a file-like object (anything with read()) or the raw,
        still encoded content of a Wordfast file
        @raise ValueError: when the content cannot be decoded
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            tmsrc = input.read()
            # NOTE(review): closing the caller's file here is presumed from
            # the surrounding conventions - confirm it is wanted.
            input.close()
            input = tmsrc
        # A NUL+tab pair in the first line means the data is UTF-16; anything
        # else is treated as ISO-8859-1.
        if TAB_UTF16 in input.split("\n")[0]:
            self._encoding = 'utf-16'
        else:
            self._encoding = 'iso-8859-1'
        try:
            # Internally everything is handled as UTF-8 encoded text.
            input = input.decode(self._encoding).encode('utf-8')
        except UnicodeError:
            raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
        rows = input.split("\n")
        for header in csv.DictReader(rows[:1], fieldnames=WF_FIELDNAMES_HEADER, dialect="excel-tab"):
            self.header = WordfastHeader(header)
        for line in csv.DictReader(rows[1:], fieldnames=WF_FIELDNAMES, dialect="excel-tab"):
            # Honour the unit class the store was configured with.
            newunit = self.UnitClass()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the store: the header line followed by every translated unit.

        The unit count in the header is updated as a side effect. The result
        is encoded with the encoding detected by parse(), falling back to
        UTF-16 when the text cannot be represented in it.
        """
        output = csv.StringIO()
        header_output = csv.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="excel-tab")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        # NOTE(review): presumed behaviour - a store without translated units
        # serialises to an empty string rather than to a lone header; confirm.
        if unit_count == 0:
            return ""
        self.header.tucount = unit_count
        outheader = csv.DictWriter(header_output, fieldnames=WF_FIELDNAMES_HEADER, dialect="excel-tab")
        outheader.writerow(self.header.header)
        # getvalue() returns the whole buffer whatever the current position,
        # so no rewind (reset() exists only on cStringIO objects) is needed.
        decoded = (header_output.getvalue() + output.getvalue()).decode('utf-8')
        try:
            return decoded.encode(self._encoding)
        except UnicodeEncodeError:
            return decoded.encode('utf-16')