fix git support for v1.5.3 (or higher) by setting "--work-tree"
[translate_toolkit.git] / storage / dtd.py
blob b0c4b93047ea5f0c99ac1b3c02bae6b14e8f001a
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2002-2006 Zuza Software Foundation
5 #
6 # This file is part of translate.
8 # translate is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
13 # translate is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with translate; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile)
23 these are specific .dtd files for localisation used by mozilla"""
25 from translate.storage import base
26 from translate.misc import quote
28 import re
29 import sys
30 import warnings
def quotefordtd(source):
    """Quote source so that it can be written out as a dtd entity value.

    - contains both '"' and "'": wrapped in single quotes, with every inner
      single quote escaped as &apos; so it cannot end the value early
    - contains '"' only: delegated to quote.singlequotestr
    - otherwise: delegated to quote.quotestr

    Returns the quoted string.
    """
    if '"' in source:
        if "'" in source:
            # unquotefromdtd() reverses this &apos; escaping
            return "'" + source.replace("'", '&apos;') + "'"
        else:
            return quote.singlequotestr(source)
    else:
        return quote.quotestr(source)
def unquotefromdtd(source):
    """Unquote a quoted dtd definition and return the bare string.

    The first character of source is taken to be the quote character. An
    empty source is treated as an empty double-quoted string. For
    single-quoted values, &apos; escapes (as written by quotefordtd) are
    turned back into single quotes.
    """
    # extract the string, get rid of quoting
    if not source:
        source = '""'
    quotechar = source[0]
    # the second item returned by the helper is not needed here
    extracted, _ = quote.extractwithoutquotes(source, quotechar, quotechar, allowreentry=False)
    if quotechar == "'" and "&apos;" in extracted:
        extracted = extracted.replace("&apos;", "'")
    # the quote characters should be the first and last characters in the string
    # of course there could also be quote characters within the string; not handled here
    return extracted
class dtdunit(base.TranslationUnit):
    """An entity definition from a dtd file (and possibly associated comments).

    The quoted entity value lives in self.definition; the source and target
    properties both read and write that same value in unquoted form.
    """

    def __init__(self, source=""):
        """Construct the dtdunit and prepare it for parsing."""
        super(dtdunit, self).__init__(source)
        self.comments = []
        self.unparsedlines = []
        # parser state flags used by parse()
        self.incomment = 0
        self.inentity = 0
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        # assigned through the property below, so this stores the quoted definition
        self.source = source

    # Note that source and target are equivalent for monolingual units
    def setsource(self, source):
        """Set the definition to the quoted value of source."""
        self.definition = quotefordtd(source)

    def getsource(self):
        """Get the unquoted source string."""
        return unquotefromdtd(self.definition)
    source = property(getsource, setsource)

    def settarget(self, target):
        """Set the definition to the quoted value of target (None counts as "")."""
        if target is None:
            target = ""
        self.definition = quotefordtd(target)

    def gettarget(self):
        """Get the unquoted target string."""
        return unquotefromdtd(self.definition)
    target = property(gettarget, settarget)

    def isnull(self):
        """Return whether this dtdunit doesn't actually have an entity definition."""
        # for dtds, we currently return a blank string if there is no .entity (==location in other files)
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def parse(self, dtdsrc):
        """Read the first dtd element from dtdsrc into this object.

        Lines are consumed one at a time until the closing quote of the first
        entity definition is reached or the input runs out. Comments met on
        the way are stored as (commenttype, text) pairs; lines that are
        outside both comments and entities are kept in self.unparsedlines.

        Returns the number of lines processed (0 if dtdsrc is empty).
        Raises ValueError if an entity value does not start with a quote
        character.
        """
        self.comments = []
        # make all the lists the same: every comment category shares one list
        # object, so all the appends below end up in self.comments
        self.locfilenotes = self.comments
        self.locgroupstarts = self.comments
        self.locgroupends = self.comments
        self.locnotes = self.comments
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            line += "\n"
            linesprocessed += 1
            # Column index into the current line. It is reset for every line so
            # that an entity part continuing on a new line never reuses an
            # offset that belonged to the previous line.
            e = 0
            if not self.incomment:
                if line.find('<!--') != -1:
                    self.incomment = 1
                    self.continuecomment = 0
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        pos = quote.findend(comment, 'LOCALIZATION NOTE')
                        # skip the spaces between the marker and the keyword after it
                        while pos < len(comment) and comment[pos] == ' ':
                            pos += 1
                        if comment.find('FILE', pos) == pos:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', pos) == pos:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', pos) == pos:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"

            if self.incomment:
                # some kind of comment
                (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # TODO: check if there's actually an entity definition that's
                # commented out; parse these, store as obsolete messages
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self.locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self.locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self.locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self.locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = 1
                    # text in front of the entity that starts with "#" is kept
                    # and written back in front of the entity by getoutput()
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line, '<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    e = 0
                    while e < len(line) and line[e].isspace():
                        e += 1
                    self.entity = ''
                    if e < len(line) and line[e] == '%':
                        # a '%' before the name marks the entity as external
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while e < len(line) and line[e].isspace():
                            e += 1
                    while e < len(line) and not line[e].isspace():
                        self.entity += line[e]
                        e += 1
                    while e < len(line) and line[e].isspace():
                        e += 1
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            # nothing else on this line; carry on with the next one
                            self.entityhelp = None
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = 0
                if self.entitypart == "parameter":
                    # skip leading whitespace; this only matters when the
                    # parameter starts a new line, because the name branch has
                    # already skipped it when both are on the same line
                    while e < len(line) and line[e].isspace():
                        e += 1
                    paramstart = e
                    while e < len(line) and line[e].isalnum():
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while e < len(line) and line[e].isspace():
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = 0
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        # the value starts on a later line than the name
                        e = 0
                        while e < len(line) and line[e].isspace():
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = 0
                    # actually the lines below should remember instring, rather than using it as dummy
                    e = self.entityhelp[0]
                    if self.entityhelp[1] == "'":
                        (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                    elif self.entityhelp[1] == '"':
                        (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # closing quote reached: this unit is complete
                        self.inentity = 0
                        break

        # change this condition to a true value to debug processing
        if 0:
            for attr in dir(self):
                r = repr(getattr(self, attr))
                if len(r) > 60:
                    r = r[:57] + "..."
                self.comments.append(("comment", "self.%s = %s" % (attr, r)))
        return linesprocessed

    def __str__(self):
        """Convert to a string. double check that unicode is handled somehow here"""
        source = self.getoutput()
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """Convert the dtd entity back to string form.

        The result is the stored comments and unparsed lines followed by the
        <!ENTITY ...> line. A unit without an entity yields only the comments
        and unparsed lines, right-stripped and ended with one newline.
        """
        lines = []
        lines.extend([comment for commenttype, comment in self.comments])
        lines.extend(self.unparsedlines)
        if self.isnull():
            result = "".join(lines)
            return result.rstrip() + "\n"
        if len(self.entity) > 0:
            if getattr(self, 'entitytype', None) == 'external':
                entityline = '<!ENTITY % '+self.entity+' '+self.entityparameter+' '+self.definition+'>'
            else:
                entityline = '<!ENTITY '+self.entity+' '+self.definition+'>'
            if getattr(self, 'hashprefix', None):
                entityline = self.hashprefix + " " + entityline
            if isinstance(entityline, unicode):
                entityline = entityline.encode('UTF-8')
            lines.append(entityline+'\n')
        return "".join(lines)
class dtdfile(base.TranslationStore):
    """A .dtd file, made up of dtdunits held in self.units."""
    UnitClass = dtdunit

    def __init__(self, inputfile=None):
        """Construct a dtdfile, optionally reading it in from inputfile.

        inputfile, when given, must provide read(); its name attribute (if it
        has one) is stored as self.filename. The entity index is only built
        when an inputfile is given.
        """
        base.TranslationStore.__init__(self, unitclass=self.UnitClass)
        self.units = []
        self.filename = getattr(inputfile, 'name', '')
        if inputfile is not None:
            dtdsrc = inputfile.read()
            self.parse(dtdsrc)
            self.makeindex()

    def parse(self, dtdsrc):
        """Read dtd source text and store its contents as dtdunits in self.units.

        Any existing units are lost. An exception raised while parsing a unit
        is reported through warnings.warn() and parsing carries on.
        """
        self.units = []
        start = 0
        end = 0
        lines = dtdsrc.split("\n")
        while end < len(lines):
            if start == end:
                end += 1
            # extend the window lines[start:end] until a line that closes an
            # entity is seen, or the input runs out
            foundentity = 0
            while end < len(lines):
                if lines[end].find('<!ENTITY') > -1:
                    foundentity = 1
                # NOTE(review): re.match only matches at the start of the line,
                # so only a line that begins with the closing quote ends the
                # window here - confirm that this is intended
                if foundentity and re.match(r'''["']\s*>''', lines[end]):
                    end += 1
                    break
                end += 1

            linesprocessed = 1  # to initialise loop
            while linesprocessed >= 1:
                newdtd = dtdunit()
                try:
                    linesprocessed = newdtd.parse("\n".join(lines[start:end]))
                    if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines):
                        self.units.append(newdtd)
                except Exception as e:
                    # linesprocessed keeps its previous value here, so start
                    # still moves forward past the failing position
                    warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end])))
                start += linesprocessed

    def __str__(self):
        """Convert to a string. double check that unicode is handled somehow here"""
        source = self.getoutput()
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """Convert the units back to source."""
        sources = [str(dtd) for dtd in self.units]
        return "".join(sources)

    def makeindex(self):
        """Make the self.index dictionary, keyed on entity name."""
        self.index = {}
        for dtd in self.units:
            # units without an entity have no key to be indexed under
            if not dtd.isnull():
                self.index[dtd.entity] = dtd

    def rewrap(self):
        """Join each multi-line definition into a single line.

        A space is inserted at a line break unless there is already
        whitespace on one side of it.
        """
        for dtd in self.units:
            lines = dtd.definition.split("\n")
            if len(lines) > 1:
                definition = lines[0]
                for line in lines[1:]:
                    if definition[-1:].isspace() or line[:1].isspace():
                        definition += line
                    else:
                        definition += " " + line
                dtd.definition = definition
if __name__ == "__main__":
    # Script mode: read a dtd file from stdin, join its multi-line
    # definitions with rewrap() and write the result to stdout.
    import sys
    dtdstore = dtdfile(sys.stdin)
    dtdstore.rewrap()
    output = str(dtdstore)
    sys.stdout.write(output)