storage/wordfast.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2007 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """Manage the Wordfast Translation Memory format
  23 """
  24
  25 import csv
  26 import time
  27 from translate.storage import base
  28
  29 WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
  30 """Time format used by Wordfast"""
  31
  32 WF_FIELDNAMES_HEADER = ["date", "userlist", "tucount", "src-lang", "version", "target-lang", "license", "attr1list", "attr2list", "attr3list", "attr4list", "attr5list"]
  33 """Field names for the Wordfast header"""
  34
  35 WF_FIELDNAMES = ["date", "user", "reuse", "src-lang", "source", "target-lang", "target", "attr1", "attr2", "attr3", "attr4"]
  36 """Field names for a Wordfast TU"""
  37
  38 WF_FIELDNAMES_HEADER_DEFAULTS = {
  39 "date": "%19000101~121212",
  40 "userlist": "%User ID,TT,TT Translate-Toolkit",
  41 "tucount": "%TU=00000001",
  42 "src-lang": "%EN-US",
  43 "version": "%Wordfast TM v.5.51w9/00",
  44 "target-lang": "",
  45 "license": "%---00000001",
  46 "attr1list": "",
  47 "attr2list": "",
  48 "attr3list": "",
  49 "attr4list": "" }
  50 """Default or minimum header entries for a Wordfast file"""
  51
  52 # TODO Needs validation.  The following need to be checked against a WF TM file to ensure
  53 # that the correct Unicode values have been chosen for the characters. For now these look
  54 # correct and have been taken from Windows CP1252 and Macintosh code points found for
  55 # the respective character sets on Linux.
  56 WF_ESCAPE_MAP = (
  57               ("&'26;", u"\u0026"), # & - Ampersand (must be first to prevent escaping of escapes)
  58               ("&'82;", u"\u201A"), # ‚ - Single low-9 quotation mark
  59               ("&'85;", u"\u2026"), # … - Elippsis
  60               ("&'91;", u"\u2018"), # ‘ - left single quotation mark
  61               ("&'92;", u"\u2019"), # ’ - right single quotation mark
  62               ("&'93;", u"\u201C"), # “ - left double quotation mark
  63               ("&'94;", u"\u201D"), # ” - right double quotation mark
  64               ("&'96;", u"\u2013"), # – - en dash (validate)
  65               ("&'97;", u"\u2014"), # — - em dash (validate)
  66               ("&'99;", u"\u2122"), # ™ - Trade mark
  67               # Windows only
  68               ("&'A0;", u"\u00A0"), #   - Non breaking space
  69               ("&'A9;", u"\u00A9"), # © - Copyright
  70               ("&'AE;", u"\u00AE"), # ® - Registered
  71               ("&'BC;", u"\u00BC"), # ¼
  72               ("&'BD;", u"\u00BD"), # ½
  73               ("&'BE;", u"\u00BE"), # ¾
  74               # Mac only
  75               ("&'A8;", u"\u00AE"), # ® - Registered
  76               ("&'AA;", u"\u2122"), # ™ - Trade mark
  77               ("&'C7;", u"\u00AB"), # « - Left-pointing double angle quotation mark
  78               ("&'C8;", u"\u00BB"), # » - Right-pointing double angle quotation mark
  79               ("&'C9;", u"\u2026"), # … - Horizontal Elippsis
  80               ("&'CA;", u"\u00A0"), #   - Non breaking space
  81               ("&'D0;", u"\u2013"), # – - en dash (validate)
  82               ("&'D1;", u"\u2014"), # — - em dash (validate)
  83               ("&'D2;", u"\u201C"), # “ - left double quotation mark
  84               ("&'D3;", u"\u201D"), # ” - right double quotation mark
  85               ("&'D4;", u"\u2018"), # ‘ - left single quotation mark
  86               ("&'D5;", u"\u2019"), # ’ - right single quotation mark
  87               ("&'E2;", u"\u201A"), # ‚ - Single low-9 quotation mark
  88               ("&'E3;", u"\u201E"), # „ - Double low-9 quotation mark
  89               # Other markers
  90               #("&'B;", u"\n"), # Soft-break - XXX creates a problem with roundtripping could also be represented by \u2028
  91              )
  92 """Mapping of Wordfast &'XX; escapes to correct Unicode characters"""
  93
# Byte pair (NUL, TAB) that WordfastTMFile.parse() searches for in the first
# line of the raw input; when it is present the file is treated as UTF-16,
# otherwise as ISO-8859-1.
TAB_UTF16 = "\x00\x09"
  95
  96 def _char_to_wf(string):
  97     """Char -> Wordfast &'XX; escapes
  98
  99     @note: Full roundtripping is not possible because of the escaping of \n and \t"""
 100     # FIXME there is no platform check to ensure that we use Mac encodings when running on a Mac
 101     if string:
 102         for code, char in WF_ESCAPE_MAP:
 103             string = string.replace(char.encode('utf-8'), code)
 104         string = string.replace("\n", "\\n").replace("\t", "\\t")
 105     return string
 106
 107 def _wf_to_char(string):
 108     """Wordfast &'XX; escapes -> Char"""
 109     if string:
 110         for code, char in WF_ESCAPE_MAP:
 111             string = string.replace(code, char.encode('utf-8'))
 112         string = string.replace("\\n", "\n").replace("\\t", "\t")
 113     return string
 114
 115 class WordfastTime(object):
 116     """Manages time stamps in the Wordfast format of YYYYMMDD~hhmmss"""
 117     def __init__(self, newtime=None):
 118         self._time = None
 119         if not newtime:
 120             self.time = None
 121         elif isinstance(newtime, basestring):
 122             self.timestring = newtime
 123         elif isinstance(newtime, time.struct_time):
 124             self.time = newtime
 125
 126     def get_timestring(self):
 127         """Get the time in the Wordfast time format"""
 128         if not self._time:
 129             return None
 130         else:
 131             return time.strftime(WF_TIMEFORMAT, self._time)
 132
 133     def set_timestring(self, timestring):
 134         """Set the time_sturct object using a Wordfast time formated string
 135
 136         @param timestring: A Wordfast time string (YYYMMDD~hhmmss)
 137         @type timestring: String
 138         """
 139         self._time = time.strptime(timestring, WF_TIMEFORMAT)
 140     timestring = property(get_timestring, set_timestring)
 141
 142     def get_time(self):
 143         """Get the time_struct object"""
 144         return self._time
 145
 146     def set_time(self, newtime):
 147         """Set the time_struct object
 148
 149         @param newtime: a new time object
 150         @type newtime: time.time_struct
 151         """
 152         if newtime and isinstance(newtime, time.struct_time):
 153             self._time = newtime
 154         else:
 155             self._time = None
 156     time = property(get_time, set_time)
 157
 158     def __str__(self):
 159         if not self.timestring:
 160             return ""
 161         else:
 162             return self.timestring
 163
 164 class WordfastHeader(object):
 165     """A wordfast translation memory header"""
 166     def __init__(self, header=None):
 167         self._header_dict = []
 168         if not header:
 169             self.header = self._create_default_header()
 170         elif isinstance(header, dict):
 171             self.header = header
 172
 173     def _create_default_header(self):
 174         """Create a default Wordfast header with the date set to the current time"""
 175         defaultheader = WF_FIELDNAMES_HEADER_DEFAULTS
 176         defaultheader['date'] = '%%%s' % WordfastTime(time.localtime()).timestring
 177         return defaultheader
 178
 179     def getheader(self):
 180         """Get the header dictionary"""
 181         return self._header_dict
 182
 183     def setheader(self, newheader):
 184         self._header_dict = newheader
 185     header = property(getheader, setheader)
 186
 187     def settargetlang(self, newlang):
 188         self._header_dict['target-lang'] = '%%%s' % newlang
 189     targetlang = property(None, settargetlang)
 190
 191     def settucount(self, count):
 192         self._header_dict['tucount'] = '%%TU=%08d' % count
 193     tucount = property(None, settucount)
 194
 195 class WordfastUnit(base.TranslationUnit):
 196     """A Wordfast translation memory unit"""
 197     def __init__(self, source=None):
 198         self._dict = {}
 199         if source:
 200             self.source = source
 201         super(WordfastUnit, self).__init__(source)
 202
 203     def _update_timestamp(self):
 204         """Refresh the timestamp for the unit"""
 205         self._dict['date'] = WordfastTime(time.localtime()).timestring
 206
 207     def getdict(self):
 208         """Get the dictionary of values for a Wordfast line"""
 209         return self._dict
 210
 211     def setdict(self, newdict):
 212         """Set the dictionary of values for a Wordfast line
 213
 214         @param newdict: a new dictionary with Wordfast line elements
 215         @type newdict: Dict
 216         """
 217         # TODO First check that the values are OK
 218         self._dict = newdict
 219     dict = property(getdict, setdict)
 220
 221     def _get_source_or_target(self, key):
 222         if self._dict[key] is None:
 223             return None
 224         elif self._dict[key]:
 225             return _wf_to_char(self._dict[key]).decode('utf-8')
 226         else:
 227             return ""
 228
 229     def _set_source_or_target(self, key, newvalue):
 230         if newvalue is None:
 231             self._dict[key] = None
 232         if isinstance(newvalue, unicode):
 233             newvalue = newvalue.encode('utf-8')
 234         newvalue = _char_to_wf(newvalue)
 235         if not key in self._dict or newvalue != self._dict[key]:
 236             self._dict[key] = newvalue
 237             self._update_timestamp()
 238
 239     def getsource(self):
 240         return self._get_source_or_target('source')
 241
 242     def setsource(self, newsource):
 243         return self._set_source_or_target('source', newsource)
 244     source = property(getsource, setsource)
 245
 246     def gettarget(self):
 247         return self._get_source_or_target('target')
 248
 249     def settarget(self, newtarget):
 250         return self._set_source_or_target('target', newtarget)
 251     target = property(gettarget, settarget)
 252
 253     def settargetlang(self, newlang):
 254         self._dict['target-lang'] = newlang
 255     targetlang = property(None, settargetlang)
 256
 257     def __str__(self):
 258         return str(self._dict)
 259
 260     def istranslated(self):
 261         if not self._dict.get('source', None):
 262             return False
 263         return bool(self._dict.get('target', None))
 264
 265
 266 class WordfastTMFile(base.TranslationStore):
 267     """A Wordfast translation memory file"""
 268     def __init__(self, inputfile=None, unitclass=WordfastUnit):
 269         """construct a Wordfast TM, optionally reading in from inputfile."""
 270         self.UnitClass = unitclass
 271         base.TranslationStore.__init__(self, unitclass=unitclass)
 272         self.filename = ''
 273         self.header = WordfastHeader()
 274         self._encoding = 'utf-16'
 275         if inputfile is not None:
 276             self.parse(inputfile)
 277
 278     def parse(self, input):
 279         """parsese the given file or file source string"""
 280         if hasattr(input, 'name'):
 281             self.filename = input.name
 282         elif not getattr(self, 'filename', ''):
 283             self.filename = ''
 284         if hasattr(input, "read"):
 285             tmsrc = input.read()
 286             input.close()
 287             input = tmsrc
 288         if TAB_UTF16 in input.split("\n")[0]:
 289             self._encoding = 'utf-16'
 290         else:
 291             self._encoding = 'iso-8859-1'
 292         try:
 293             input = input.decode(self._encoding).encode('utf-8')
 294         except:
 295             raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
 296         for header in csv.DictReader(input.split("\n")[:1], fieldnames=WF_FIELDNAMES_HEADER, dialect="excel-tab"):
 297             self.header = WordfastHeader(header)
 298         lines = csv.DictReader(input.split("\n")[1:], fieldnames=WF_FIELDNAMES, dialect="excel-tab")
 299         for line in lines:
 300             newunit = WordfastUnit()
 301             newunit.dict = line
 302             self.addunit(newunit)
 303
 304     def __str__(self):
 305         output = csv.StringIO()
 306         header_output = csv.StringIO()
 307         writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="excel-tab")
 308         unit_count = 0
 309         for unit in self.units:
 310             if unit.istranslated():
 311                 unit_count += 1
 312                 writer.writerow(unit.dict)
 313         if unit_count == 0:
 314             return ""
 315         output.reset()
 316         self.header.tucount = unit_count
 317         outheader = csv.DictWriter(header_output, fieldnames=WF_FIELDNAMES_HEADER, dialect="excel-tab")
 318         outheader.writerow(self.header.header)
 319         header_output.reset()
 320         decoded = "".join(header_output.readlines() + output.readlines()).decode('utf-8')
 321         try:
 322             return decoded.encode(self._encoding)
 323         except UnicodeEncodeError:
 324             return decoded.encode('utf-16')
 325
 326