Whitespace normalization.
[python/dscho.git] / Doc / tools / sgmlconv / latex2esis.py
blob b30aaa5631673fa15c77ca9bf0751a9f9c4bb8cf
1 #! /usr/bin/env python
3 """Generate ESIS events based on a LaTeX source document and
4 configuration data.
6 The conversion is not strong enough to work with arbitrary LaTeX
7 documents; it has only been designed to work with the highly stylized
8 markup used in the standard Python documentation. A lot of
9 information about specific markup is encoded in the control table
10 passed to the convert() function; changing this table can allow this
11 tool to support additional LaTeX markups.
The format of the table is largely undocumented; it is read from the
'conversion.xml' file found in the script's directory (see load_table()
and main()).  There is no provision to load an alternate table from a
different file.
16 """
18 import errno
19 import getopt
20 import os
21 import re
22 import sys
23 import xml.sax
24 import xml.sax.saxutils
26 from esistools import encode
# Debug level; main() adds one for every -D/--debug option.  While it is
# non-zero, dbgmsg() writes its messages to stderr and new_stack() hands
# out a type-checking _Stack instead of a plain list.
DEBUG = 0
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be interpreted by the converter."""
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the open-element stack.

    Attributes:
        found: name of the environment whose close was encountered.
        stack: copy of the element stack at the time of the error.
    """

    def __init__(self, found, stack):
        self.found = found
        # Keep a snapshot so later changes to the live stack do not alter
        # what this exception reports.
        self.stack = stack[:]
        message = "environment close for %s doesn't match;\n stack = %s" % (
            found, stack)
        LaTeXFormatError.__init__(self, message)
# Patterns used by Conversion.subconvert(); all are applied with .match()
# at the start of the remaining input.

# \begin{name} and \end{name}; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A backslash followed by a macro name (letters, optionally ending in "*"),
# an optional space, and then either an opening brace or whitespace;
# group 1 is the macro name.
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# One or more "%", an optional space, the rest of the line (group 1), the
# newline, and any spaces/tabs that start the following line.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of characters other than "]", "~", "%", backslash, "{" and "}".
_text_rx = re.compile(r"[^]~%\\{}]+")
# Optional whitespace followed by [...]; group 1 is the bracketed text.
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]", re.MULTILINE)
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# A letter followed by letters, digits, "." or "-" up to the end of the
# string; dump_attr() uses it to pick between TOKEN and CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Spaces/newlines followed by "{" or "[" respectively.
# NOTE(review): neither pattern is referenced elsewhere in the visible code.
_start_group_rx = re.compile("[ \n]*{")
_start_optional_rx = re.compile("[ \n]*[[]")

# Characters that subconvert() emits literally when they follow a backslash.
ESCAPED_CHARS = "$%#^ {}&~"
def dbgmsg(msg):
    """Write *msg* and a newline to stderr, but only when DEBUG is set."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
def pushing(name, point, depth):
    """Emit a debug message noting that element *name* is opened at *point*.

    *depth* is accepted for symmetry with popping() but is not used.
    """
    message = "pushing <%s> at %s" % (name, point)
    dbgmsg(message)
def popping(name, point, depth):
    """Emit a debug message noting that element *name* is closed at *point*.

    *depth* is accepted for symmetry with pushing() but is not used.
    """
    message = "popping </%s> at %s" % (name, point)
    dbgmsg(message)
73 class _Stack(list):
74 def append(self, entry):
75 if not isinstance(entry, str):
76 raise LaTeXFormatError("cannot push non-string on stack: %r"
77 % (entry, ))
78 #dbgmsg("%s<%s>" % (" "*len(self.data), entry))
79 list.append(self, entry)
81 def pop(self, index=-1):
82 entry = self[index]
83 del self[index]
84 #dbgmsg("%s</%s>" % (" " * len(self), entry))
86 def __delitem__(self, index):
87 entry = self[index]
88 list.__delitem__(self, index)
89 #dbgmsg("%s</%s>" % (" " * len(self), entry))
def new_stack():
    """Return an empty element stack.

    A type-checking _Stack is used while DEBUG is set; otherwise a plain
    list is returned.
    """
    return _Stack() if DEBUG else []
class Conversion:
    """Translate a LaTeX document into ESIS event lines.

    The whole input is read up front into self.line; subconvert() then
    consumes it from the front, writing events through self.write.  The
    conversion table maps macro/environment names to TableEntry objects
    and is extended on the fly with default entries for unknown names.
    """

    def __init__(self, ifp, ofp, table):
        """Read all of *ifp*; events will be written to *ofp*.

        *table* is the name -> TableEntry mapping; it may be mutated.
        """
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        # Drop trailing whitespace from every line; the extra empty
        # string makes the joined text end with a newline.
        L = [s.rstrip() for s in ifp.readlines()]
        L.append("")
        self.line = "\n".join(L)
        self.preamble = 1

    def convert(self):
        """Convert the entire document."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert input starting at self.line.

        Parses content, including sub-structures, until the character
        'endchar' is found (with no open structures), or until the end
        of the input data if endchar is None.

        Returns the remaining input, beginning with 'endchar' (self.line
        is set to the same text), or None when the input ran out first.
        *depth* is passed along to nested calls but not otherwise used.

        Raises LaTeXFormatError (or its subclass LaTeXStackError) when
        the markup cannot be interpreted.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its side effects: it registers a default
                # entry for unknown environments and rejects names that
                # are defined as macros.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                # Implicitly close anything this environment's end is
                # declared to close.
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh! This is a combining character...
                    endpos = m.end()
                    # Slicing (not indexing) so that input ending right
                    # after the macro is reported as a format error.
                    self.combining_char("c", line[endpos:endpos + 1])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end is
                    # emitted as character data without interpretation.
                    endmarker = "\\end{%s}" % macroname
                    pos = line.find(endmarker)
                    if pos < 0:
                        raise LaTeXFormatError(
                            "could not find %s to close verbatim content"
                            % endmarker)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(endmarker):]
                    continue
                # Implicitly close open elements this macro is declared
                # to close.
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)

                if entry.outputname and entry.empty:
                    self.write("e\n")

                params, optional, empty = self.start_macro(macroname)
                # rip off the macroname
                if params:
                    line = line[m.end(1):]
                elif empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %r"
                                    % (pentry.name, macroname, line[:100]))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Skip the opening brace and convert the
                            # group's content recursively.
                            self.line = skip_white(line)[1:]
                            line = self.subconvert(
                                "}", len(stack) + depth + 1)
                            if line is None:
                                # Input ended before the closing brace.
                                raise LaTeXFormatError(
                                    "unterminated parameter %s for %s"
                                    % (pentry.name, macroname))
                            line = line[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            # Slice so that exhausted input is reported
                            # as missing content, not an IndexError.
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    # With implied content the element stays open until
                    # something else closes it.
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unbalanced '}' in input: %r" % line[:100])
                macroname = stack[-1]
                if macroname:
                    conversion = self.table[macroname]
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # Bare group: an empty name marks it on the stack.
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and len(line) > 1 and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2:3])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %r%s"
                                   % (line[:100], extra))
        # Out of input: close whatever may be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here...

    # This is a really limited table of combinations, but it will have
    # to do for now.
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def combining_char(self, prefix, char):
        """Write the character reference for accent *prefix* applied to *char*.

        Raises LaTeXFormatError when the pair is not in _combinations.
        """
        try:
            ordinal = self._combinations[(prefix, char)]
        except KeyError:
            raise LaTeXFormatError(
                "unsupported combining character: \\%s%s" % (prefix, char))
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return (parameters, first-parameter-is-optional, empty) for *name*."""
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty

    def get_entry(self, name):
        """Return the table entry for *name*, creating a default if needed.

        The default is a macro with a single, explicit content parameter;
        it is stored in the table so later lookups return the same entry.
        """
        entry = self.table.get(name)
        if entry is None:
            dbgmsg("get_entry(%r) failing; building default entry!" % (name, ))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for environment *name*.

        An unknown name gets a default environment entry with implied
        content.  Raises LaTeXFormatError when *name* is already defined
        as a macro.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute event for *pentry* with the given *value*.

        Nothing is written when the parameter has no name or the value
        is empty.  The value is typed TOKEN when it matches _token_rx,
        CDATA otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
def convert(ifp, ofp, table):
    """Convert the LaTeX document on *ifp* to ESIS events on *ofp*.

    *table* is the conversion table as returned by load_table().  A
    broken output pipe (EPIPE) ends the conversion quietly; any other
    IOError propagates to the caller.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError as err:
        # Read the errno attribute rather than unpacking the exception,
        # which fails unless it carries exactly two arguments.
        if err.errno != errno.EPIPE:
            raise
def skip_white(line):
    """Return *line* without its leading whitespace and "%" characters.

    As long as the text starts with a space, "%", newline, tab or
    carriage return, that character is dropped together with all
    whitespace that follows it.
    """
    remaining = line
    while True:
        if not remaining or remaining[0] not in " %\n\t\r":
            return remaining
        remaining = remaining[1:].lstrip()
class TableEntry:
    """Conversion-table record describing one LaTeX macro or environment."""

    def __init__(self, name, environment=0):
        # The output element name starts out equal to the LaTeX name.
        self.name = self.outputname = name
        self.environment = environment
        # A macro is considered empty until a parameter says otherwise;
        # an environment never starts out empty.
        self.empty = not environment
        self.has_content = self.verbatim = self.auto_close = 0
        # Each list is a fresh object so entries never share state.
        self.parameters, self.closes, self.endcloses = [], [], []
class Parameter:
    """One parameter of a conversion-table entry.

    *type* is the parameter kind; the converter recognizes "attribute",
    "child", "content", "text" and "entityref".
    """

    def __init__(self, type, name=None, optional=0):
        self.type, self.name, self.optional = type, name, optional
        # Filled in later by whoever builds the table.
        self.implied = 0
        self.text = ''
class TableHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that builds the conversion table.

    Each XML element <tag> is dispatched to the optional methods
    start_<tag>(attrs) and end_<tag>(); elements without such methods
    are ignored.  Call get_table() after parsing to obtain the
    name -> TableEntry mapping.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.__table = {}
        self.__buffer = ''
        # Cache of tag -> (start method, end method); either may be None.
        self.__methods = {}
        # Entry under construction between start_* and end_macro().
        self.__current = None

    def get_table(self):
        """Return the table, giving content-less environments implied content."""
        for entry in self.__table.values():
            if entry.environment and not entry.has_content:
                p = Parameter("content")
                p.implied = 1
                entry.parameters.append(p)
                entry.has_content = 1
        return self.__table

    def startElement(self, tag, attrs):
        try:
            start, end = self.__methods[tag]
        except KeyError:
            start = getattr(self, "start_" + tag, None)
            end = getattr(self, "end_" + tag, None)
            self.__methods[tag] = (start, end)
        if start:
            start(attrs)

    def endElement(self, tag):
        start, end = self.__methods[tag]
        if end:
            end()

    def endDocument(self):
        # Drop the bound-method cache once parsing is finished.
        self.__methods.clear()

    def characters(self, data):
        self.__buffer += data

    def start_environment(self, attrs):
        name = attrs["name"]
        self.__current = TableEntry(name, environment=1)
        self.__current.verbatim = attrs.get("verbatim") == "yes"
        if "outputname" in attrs:
            self.__current.outputname = attrs.get("outputname")
        self.__current.endcloses = attrs.get("endcloses", "").split()

    def end_environment(self):
        self.end_macro()

    def start_macro(self, attrs):
        name = attrs["name"]
        self.__current = TableEntry(name)
        self.__current.closes = attrs.get("closes", "").split()
        if "outputname" in attrs:
            self.__current.outputname = attrs.get("outputname")

    def end_macro(self):
        """Store the finished entry; raise ValueError on a duplicate name."""
        name = self.__current.name
        if name in self.__table:
            raise ValueError("name %r already in use" % (name,))
        self.__table[name] = self.__current
        self.__current = None

    def start_attribute(self, attrs):
        name = attrs.get("name")
        optional = attrs.get("optional") == "yes"
        if name:
            p = Parameter("attribute", name, optional=optional)
        else:
            p = Parameter("attribute", optional=optional)
        self.__current.parameters.append(p)
        # Start collecting the element's character data afresh.
        self.__buffer = ''

    def end_attribute(self):
        # The element's text is the value supplied by the conversion spec.
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        name = attrs["name"]
        p = Parameter("entityref", name)
        self.__current.parameters.append(p)

    def start_child(self, attrs):
        name = attrs["name"]
        p = Parameter("child", name, attrs.get("optional") == "yes")
        self.__current.parameters.append(p)
        self.__current.empty = 0

    def start_content(self, attrs):
        p = Parameter("content")
        p.implied = attrs.get("implied") == "yes"
        # Content of an environment is always implied.
        if self.__current.environment:
            p.implied = 1
        self.__current.parameters.append(p)
        self.__current.has_content = 1
        self.__current.empty = 0

    def start_text(self, attrs):
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        p = Parameter("text")
        p.text = self.__buffer
        self.__current.parameters.append(p)
def load_table(fp):
    """Parse the XML conversion spec on *fp* and return the resulting table."""
    handler = TableHandler()
    xml.sax.parse(fp, handler)
    table = handler.get_table()
    return table
def main():
    """Command-line entry point.

    Usage: latex2esis.py [-D|--debug] [infile [outfile]]

    Input defaults to stdin and output to stdout.  The conversion table
    is read from 'conversion.xml' in the script's directory.  Exits with
    status 2 on a command-line error.
    """
    global DEBUG

    usage = ("usage: %s [-D|--debug] [infile [outfile]]\n"
             % os.path.basename(sys.argv[0]))
    try:
        opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n%s" % (err, usage))
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG += 1
    if len(args) > 2:
        sys.stderr.write(usage)
        sys.exit(2)

    ifp = sys.stdin
    ofp = sys.stdout
    try:
        if len(args) >= 1:
            ifp = open(args[0])
        if len(args) == 2:
            ofp = open(args[1], "w")
        with open(os.path.join(sys.path[0], 'conversion.xml')) as tfp:
            table = load_table(tfp)
        convert(ifp, ofp, table)
    finally:
        # Close only the files opened here, never the standard streams.
        if ofp is not sys.stdout:
            ofp.close()
        if ifp is not sys.stdin:
            ifp.close()
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()