storage/dtd.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Copyright 2002-2006 Zuza Software Foundation
   5 #
   6 # This file is part of translate.
   7 #
   8 # translate is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # translate is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with translate; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile)
  23 these are specific .dtd files for localisation used by mozilla"""
  24
  25 from translate.storage import base
  26 from translate.misc import quote
  27
  28 import re
  29 import sys
  30 import warnings
  31
  32 def quotefordtd(source):
  33     if '"' in source:
  34         if "'" in source:
  35             return "'" + source.replace("'", '&apos;') + "'"
  36         else:
  37             return quote.singlequotestr(source)
  38     else:
  39         return quote.quotestr(source)
  40
  41 def unquotefromdtd(source):
  42     """unquotes a quoted dtd definition"""
  43     # extract the string, get rid of quoting
  44     if len(source) == 0: source = '""'
  45     quotechar = source[0]
  46     extracted, quotefinished = quote.extractwithoutquotes(source, quotechar, quotechar, allowreentry=False)
  47     if quotechar == "'" and "&apos;" in extracted:
  48         extracted = extracted.replace("&apos;", "'")
  49     # the quote characters should be the first and last characters in the string
  50     # of course there could also be quote characters within the string; not handled here
  51     return extracted
  52
class dtdunit(base.TranslationUnit):
    """An entity definition from a dtd file, plus the comments and unparsed
    lines that were read in front of it.

    The quoted entity value is kept in self.definition; the source and target
    properties both read and write that one attribute. parse() fills the unit
    from dtd text and getoutput() turns it back into dtd text.
    """
    def __init__(self, source=""):
        """construct the dtdunit, prepare it for parsing

        source is the unquoted text used as the initial definition.
        """
        super(dtdunit, self).__init__(source)
        # list of (commenttype, commenttext) tuples collected by parse()
        self.comments = []
        # lines seen outside comments that did not contain an <!ENTITY
        self.unparsedlines = []
        # parser state flags: currently inside a comment / inside an entity
        self.incomment = 0
        self.inentity = 0
        # placeholder name; parse() resets this to None before reading
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        # assigning through the property stores the quoted form in self.definition
        self.source = source

    # Note that source and target are equivalent for monolingual units
    def setsource(self, source):
        """Sets the definition to the quoted value of source"""
        self.definition = quotefordtd(source)

    def getsource(self):
        """gets the unquoted source string"""
        return unquotefromdtd(self.definition)
    source = property(getsource, setsource)

    def settarget(self, target):
        """Sets the definition to the quoted value of target

        A target of None is stored as the empty string.
        """
        if target is None:
            target = ""
        self.definition = quotefordtd(target)

    def gettarget(self):
        """gets the unquoted target string"""
        return unquotefromdtd(self.definition)
    target = property(gettarget, settarget)

    def isnull(self):
        """returns whether this dtdunit doesn't actually have an entity definition"""
        # parse() sets self.entity to None and only replaces it once an
        # <!ENTITY declaration has been found
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def parse(self, dtdsrc):
        """read the first dtd element from the source code into this object, return linesprocessed

        dtdsrc is dtd text, possibly several lines. Lines are consumed one at
        a time: comments are collected in self.comments, lines outside
        comments without an <!ENTITY go to self.unparsedlines, and the first
        entity declaration fills self.entity and self.definition (plus
        self.entityparameter for '%' entities). Reading stops after the line
        on which the quoted definition ends.

        Returns the number of lines consumed, or 0 when dtdsrc is empty.
        Raises ValueError when the definition does not start with a single or
        double quote.
        """
        self.comments = []
        # make all the lists the same
        # (every comment type is therefore appended to self.comments, in file order)
        self.locfilenotes = self.comments
        self.locgroupstarts = self.comments
        self.locgroupends = self.comments
        self.locnotes = self.comments
        # self.locfilenotes = []
        # self.locgroupstarts = []
        # self.locgroupends = []
        # self.locnotes = []
        # self.comments = []
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            # split() removed the newlines; put one back on every line
            line += "\n"
            linesprocessed += 1
            # print "line(%d,%d): " % (self.incomment,self.inentity),line[:-1]
            if not self.incomment:
                if (line.find('<!--') != -1):
                    self.incomment = 1
                    self.continuecomment = 0
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        # l is moved to the first non-space character after the marker
                        l = quote.findend(comment,'LOCALIZATION NOTE')
                        while (comment[l] == ' '): l += 1
                        # the keyword directly after the marker selects the note type
                        if comment.find('FILE', l) == l:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', l) == l:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', l) == l:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"

            if self.incomment:
                # some kind of comment
                # the second returned value is stored as the "still inside the comment" flag
                (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
                # print "comment(%d,%d): " % (self.incomment,self.continuecomment),comment
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        # only whitespace is left on the line: keep it with the comment
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # check if there's actually an entity definition that's commented out
                # TODO: parse these, store as obsolete messages
                # if comment.find('<!ENTITY') != -1:
                #     # remove the entity from the comment
                #     comment, dummy = quote.extractwithoutquotes(comment, ">", "<!ENTITY", None, 1)
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self.locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self.locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self.locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self.locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = 1
                    # text in front of the declaration is kept only when it starts with '#'
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                # self.entitypart tracks progress through the declaration:
                # "start" -> "name" -> ("parameter" ->) "definition"
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line,'<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    e = 0
                    while (e < len(line) and line[e].isspace()): e += 1
                    self.entity = ''
                    # a '%' in front of the name marks the entity as "external"
                    if (e < len(line) and line[e] == '%'):
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while (e < len(line) and line[e].isspace()): e += 1
                    # the name runs up to the next whitespace character
                    while (e < len(line) and not line[e].isspace()):
                        self.entity += line[e]
                        e += 1
                    while (e < len(line) and line[e].isspace()): e += 1
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            # nothing else on this line; carry on with the next one
                            self.entityhelp = None
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = 0
                if self.entitypart == "parameter":
                    # NOTE(review): when this branch is entered on a continuation
                    # line (the name ended the previous line), e still holds the
                    # value left over from the previous iteration - confirm this
                    # is intended
                    paramstart = e
                    while (e < len(line) and line[e].isalnum()): e += 1
                    self.entityparameter += line[paramstart:e]
                    while (e < len(line) and line[e].isspace()): e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    # a quote character after the parameter starts the definition
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = 0
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        # the definition starts on a later line than the name:
                        # look for its first non-space character
                        e = 0
                        while (e < len(line) and line[e].isspace()): e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = 0
                    # actually the lines below should remember instring, rather than using it as dummy
                    # self.entityhelp is (start offset in this line, quote character)
                    e = self.entityhelp[0]
                    if (self.entityhelp[1] == "'"):
                        (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                    elif (self.entityhelp[1] == '"'):
                        (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # the closing quote was reached: this unit is complete
                        self.inentity = 0
                        break

        # debugging aid: change the condition below to 1 to record every
        # attribute of the unit as a comment
        if 0:
            for attr in dir(self):
                r = repr(getattr(self, attr))
                if len(r) > 60: r = r[:57]+"..."
                self.comments.append(("comment", "self.%s = %s" % (attr, r) ))
        return linesprocessed

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here

        A unicode result from getoutput() is encoded with self.encoding,
        or with UTF-8 when the unit has no encoding attribute.
        """
        source = self.getoutput()
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the dtd entity back to string form

        The output is the collected comments, then the unparsed lines, then
        the <!ENTITY ...> line. A unit without an entity yields only the
        comments and unparsed lines, with trailing whitespace replaced by a
        single newline.
        """
        lines = []
        lines.extend([comment for commenttype, comment in self.comments])
        lines.extend(self.unparsedlines)
        if self.isnull():
            result = "".join(lines)
            return result.rstrip() + "\n"
        # for f in self.locfilenotes: yield f
        # for ge in self.locgroupends: yield ge
        # for gs in self.locgroupstarts: yield gs
        # for n in self.locnotes: yield n
        if len(self.entity) > 0:
            # entitytype and hashprefix are only set by parse(), hence getattr
            if getattr(self, 'entitytype', None) == 'external':
                entityline = '<!ENTITY % '+self.entity+' '+self.entityparameter+' '+self.definition+'>'
            else:
                entityline = '<!ENTITY '+self.entity+' '+self.definition+'>'
            if getattr(self, 'hashprefix', None):
                entityline = self.hashprefix + " " + entityline
            if isinstance(entityline, unicode):
                entityline = entityline.encode('UTF-8')
            lines.append(entityline+'\n')
        return "".join(lines)
 287
 288 class dtdfile(base.TranslationStore):
 289     """this class represents a .dtd file, made up of dtdunits"""
 290     UnitClass = dtdunit
 291     def __init__(self, inputfile=None):
 292         """construct a dtdfile, optionally reading in from inputfile"""
 293         base.TranslationStore.__init__(self, unitclass = self.UnitClass)
 294         self.units = []
 295         self.filename = getattr(inputfile, 'name', '')
 296         if inputfile is not None:
 297             dtdsrc = inputfile.read()
 298             self.parse(dtdsrc)
 299             self.makeindex()
 300
 301     def parse(self, dtdsrc):
 302         """read the source code of a dtd file in and include them as dtdunits in self.units (any existing units are lost)"""
 303         self.units = []
 304         start = 0
 305         end = 0
 306         lines = dtdsrc.split("\n")
 307         while end < len(lines):
 308             if (start == end): end += 1
 309             foundentity = 0
 310             while end < len(lines):
 311                 if end >= len(lines):
 312                     break
 313                 if lines[end].find('<!ENTITY') > -1:
 314                     foundentity = 1
 315                 if foundentity and re.match("[\"']\s*>", lines[end]):
 316                     end += 1
 317                     break
 318                 end += 1
 319             # print "processing from %d to %d" % (start,end)
 320
 321             linesprocessed = 1 # to initialise loop
 322             while linesprocessed >= 1:
 323                 newdtd = dtdunit()
 324                 try:
 325                     linesprocessed = newdtd.parse("\n".join(lines[start:end]))
 326                     if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines):
 327                         self.units.append(newdtd)
 328                 except Exception, e:
 329                     warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end])))
 330                 start += linesprocessed
 331
 332     def __str__(self):
 333         """convert to a string. double check that unicode is handled somehow here"""
 334         source = self.getoutput()
 335         if isinstance(source, unicode):
 336             return source.encode(getattr(self, "encoding", "UTF-8"))
 337         return source
 338
 339     def getoutput(self):
 340         """convert the units back to source"""
 341         sources = [str(dtd) for dtd in self.units]
 342         return "".join(sources)
 343
 344     def makeindex(self):
 345         """makes self.index dictionary keyed on entities"""
 346         self.index = {}
 347         for dtd in self.units:
 348             if not dtd.isnull():
 349                 self.index[dtd.entity] = dtd
 350
 351     def rewrap(self):
 352         for dtd in self.units:
 353             lines = dtd.definition.split("\n")
 354             if len(lines) > 1:
 355                 definition = lines[0]
 356                 for line in lines[1:]:
 357                     if definition[-1:].isspace() or line[:1].isspace():
 358                         definition += line
 359                     else:
 360                         definition += " " + line
 361                 dtd.definition = definition
 362
 363 if __name__ == "__main__":
 364     import sys
 365     d = dtdfile(sys.stdin)
 366     d.rewrap()
 367     sys.stdout.write(str(d))
 368