- Got rid of newmodule.c
[python/dscho.git] / Doc / tools / sgmlconv / latex2esis.py
blob3ccd7e700ca744af82b544baacc19dbfa1f8be1b
1 #! /usr/bin/env python
3 """Generate ESIS events based on a LaTeX source document and
4 configuration data.
6 The conversion is not strong enough to work with arbitrary LaTeX
7 documents; it has only been designed to work with the highly stylized
8 markup used in the standard Python documentation. A lot of
9 information about specific markup is encoded in the control table
10 passed to the convert() function; changing this table can allow this
11 tool to support additional LaTeX markups.
The format of the table is largely undocumented.  The table itself is
read by load_table() from the conversion.xml file found in sys.path[0]
(normally the directory containing this script); see main().  There is
no command-line option to load the table from a different file.
16 """
18 import errno
19 import getopt
20 import os
21 import re
22 import string
23 import sys
24 import UserList
25 import xml.sax
26 import xml.sax.saxutils
28 from types import ListType, StringType, TupleType
30 from esistools import encode
# Debug level.  dbgmsg() writes to stderr only when this is non-zero and
# new_stack() returns a checking _Stack instead of a plain list; main()
# adds one for every -D/--debug option given.
DEBUG = 0
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be processed by the converter."""
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the element stack.

    The name that was being closed is stored in ``found`` and a copy of
    the stack, taken when the exception is created, in ``stack``.
    """

    def __init__(self, found, stack):
        self.found = found
        # Keep a snapshot so later changes to the caller's stack do not
        # alter what the exception reports.
        self.stack = stack[:]
        message = ("environment close for %s doesn't match;\n stack = %s"
                   % (found, stack))
        LaTeXFormatError.__init__(self, message)
# \begin{name}; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
# \end{name}; group 1 is the environment name.
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A backslash followed by a macro name (letters plus an optional "*"),
# an optional space, and then either "{" or trailing whitespace;
# group 1 is the macro name.
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# A "%" comment up to and including the newline, plus any spaces or tabs
# that start the next line; group 1 is the comment text.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of ordinary text: anything except "]", "~", "%", "\", "{", "}".
_text_rx = re.compile(r"[^]~%\\{}]+")
# An optional [...] argument, possibly preceded by whitespace; group 1 is
# the text between the brackets.
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]", re.MULTILINE)
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# An attribute value made only of a letter followed by letters, digits,
# "." or "-"; dump_attr() reports such values as TOKEN rather than CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Spaces/newlines followed by an opening brace or bracket respectively.
_start_group_rx = re.compile("[ \n]*{")
_start_optional_rx = re.compile("[ \n]*[[]")

# Characters which, when preceded by a backslash, are copied to the
# output as literal text by Conversion.subconvert().
ESCAPED_CHARS = "$%#^ {}&~"
def dbgmsg(msg):
    """Write *msg* and a newline to stderr, but only when DEBUG is set."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
def pushing(name, point, depth):
    """Emit a debug message noting that element *name* opens at *point*.

    *depth* is accepted but not used.
    """
    message = "pushing <%s> at %s" % (name, point)
    dbgmsg(message)
def popping(name, point, depth):
    """Emit a debug message noting that element *name* closes at *point*.

    *depth* is accepted but not used.
    """
    message = "popping </%s> at %s" % (name, point)
    dbgmsg(message)
77 class _Stack(UserList.UserList):
78 def append(self, entry):
79 if type(entry) is not StringType:
80 raise LaTeXFormatError("cannot push non-string on stack: "
81 + `entry`)
82 #dbgmsg("%s<%s>" % (" "*len(self.data), entry))
83 self.data.append(entry)
85 def pop(self, index=-1):
86 entry = self.data[index]
87 del self.data[index]
88 #dbgmsg("%s</%s>" % (" "*len(self.data), entry))
90 def __delitem__(self, index):
91 entry = self.data[index]
92 del self.data[index]
93 #dbgmsg("%s</%s>" % (" "*len(self.data), entry))
def new_stack():
    """Return an empty element stack.

    A plain list is used normally; with DEBUG set, a _Stack is returned
    instead.
    """
    if not DEBUG:
        return []
    return _Stack()
class Conversion:
    """Convert one LaTeX document into a stream of ESIS events.

    ifp   -- input file object; it is read completely by the constructor.
    ofp   -- output file object; only its write() method is used.
    table -- dictionary mapping macro and environment names to entry
             objects (see get_entry() for the attributes that are used).
             Names missing from the table get a default entry added on
             first use, so the dictionary may be modified.
    """

    def __init__(self, ifp, ofp, table):
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        # Trailing whitespace is dropped from every input line and the
        # text is re-joined so that it always ends with a newline.
        lines = [s.rstrip() for s in ifp.readlines()]
        lines.append("")
        self.line = "\n".join(lines)
        self.preamble = 1

    def convert(self):
        """Convert the whole input, writing the events to the output."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert input starting at self.line.

        Parses content, including sub-structures, until the character
        'endchar' is found (with no open structures), or until the end
        of the input data if endchar is None.

        When 'endchar' is found, the remaining input (still starting
        with that character) is stored in self.line and returned.  When
        the input runs out instead, None is returned.

        Raises LaTeXStackError when an environment close does not match
        the open elements, and LaTeXFormatError for markup that cannot
        be handled or for elements left open at the end of the input.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its side effects: it registers a default
                # entry for unknown environments and rejects names that
                # are defined as macros.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh! This is a combining character...
                    endpos = m.end()
                    # Slicing (not indexing) so that input ending right
                    # after the macro is reported as a format error.
                    self.combining_char("c", line[endpos:endpos + 1])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!
                    endtag = "\\end{%s}" % macroname
                    pos = line.find(endtag)
                    if pos < 0:
                        # Without this check the -1 from find() would be
                        # used as a slice index and silently mangle the
                        # rest of the document.
                        raise LaTeXFormatError(
                            "missing %s for verbatim environment" % endtag)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(endtag):]
                    continue
                # Implicitly close any open elements this macro closes.
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname and entry.empty:
                    self.write("e\n")
                #
                params, optional, empty = self.start_macro(macroname)
                # rip off the macroname
                if params:
                    line = line[m.end(1):]
                elif empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %s"
                                    % (pentry.name, macroname,
                                       repr(line[:100])))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Drop the character that opens the group and
                            # convert its content recursively.
                            self.line = skip_white(line)[1:]
                            rest = self.subconvert(
                                "}", len(stack) + depth + 1)
                            if rest is None:
                                # Input ended before the closing brace.
                                raise LaTeXFormatError(
                                    "unterminated parameter %s for %s"
                                    % (pentry.name, macroname))
                            line = rest[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            # Slice so that running out of input is
                            # reported as missing content as well.
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            # (No second test for endchar is needed here: neither the
            # input nor the stack has changed since the test at the top
            # of the loop, so it could never succeed.)
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unbalanced '}': " + repr(line[:100]))
                macroname = stack[-1]
                if macroname:
                    conversion = self.table[macroname]
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # A bare group is tracked with an empty name.
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2:3])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        # Out of input: close whatever may be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here...

    # This is a really limited table of combinations, but it will have
    # to do for now.
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def combining_char(self, prefix, char):
        """Write the character for accent macro *prefix* applied to *char*.

        The character is written as data containing its decimal code
        from _combinations.  Raises LaTeXFormatError for a combination
        that is not in that table.
        """
        try:
            ordinal = self._combinations[(prefix, char)]
        except KeyError:
            raise LaTeXFormatError(
                "unsupported combining character: \\%s%s" % (prefix, char))
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return (parameters, optional, empty) for the macro *name*.

        'optional' is the optional flag of the first parameter, or the
        empty parameter list itself when there are no parameters.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty

    def get_entry(self, name):
        """Return the table entry for *name*.

        An unknown name gets a default entry with a single "content"
        parameter; the new entry is stored in the table.
        """
        entry = self.table.get(name)
        if entry is None:
            dbgmsg("get_entry(%s) failing; building default entry!"
                   % repr(name))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for the environment *name*.

        An unknown name gets a default environment entry with implied
        content; the new entry is stored in the table.  Raises
        LaTeXFormatError if *name* is defined, but not as an environment.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute event for parameter *pentry* with *value*.

        Nothing is written when the parameter has no name or the value
        is empty.  The type is TOKEN when the value matches _token_rx
        and CDATA otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
def convert(ifp, ofp, table):
    """Convert the LaTeX document on *ifp*, writing ESIS events to *ofp*.

    *table* is the conversion table passed on to Conversion.  An IOError
    whose errno is EPIPE is ignored; any other IOError is re-raised.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Look at the errno attribute instead of unpacking the exception:
        # unpacking fails for an IOError that does not carry exactly two
        # arguments and would hide the original error.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise
def skip_white(line):
    """Return *line* without leading whitespace and "%" characters.

    While the first character is a space, "%", newline, tab or carriage
    return, that character and any whitespace following it are removed.
    """
    remaining = line
    while True:
        if not remaining or remaining[0] not in " %\n\t\r":
            return remaining
        remaining = remaining[1:].lstrip()
class TableEntry:
    """Conversion information for one LaTeX macro or environment."""

    def __init__(self, name, environment=0):
        # Identity: the LaTeX name and the name used in the output,
        # which starts out the same.
        self.name = name
        self.outputname = name
        # Kind of entry; an entry starts out "empty" unless it is an
        # environment.
        self.environment = environment
        self.empty = not environment
        # Flags, all off by default.
        self.has_content = 0
        self.verbatim = 0
        self.auto_close = 0
        # Per-instance lists: the parameter descriptions, the names this
        # entry closes when it starts, and the names it closes when its
        # environment ends.
        self.parameters = []
        self.closes = []
        self.endcloses = []
class Parameter:
    """Description of one parameter of a macro or environment.

    type     -- kind of parameter; the converter handles "attribute",
                "child", "content", "text" and "entityref".
    name     -- name of the parameter, or None.
    optional -- true if the parameter is optional.
    """

    def __init__(self, type, name=None, optional=0):
        self.type = type
        self.name = name
        self.optional = optional
        # Set by whoever builds the table: not implied, no fixed text.
        self.implied = 0
        self.text = ''
class TableHandler(xml.sax.handler.ContentHandler):
    """SAX handler that builds the conversion table from an XML file.

    An element <tag> is handled by the methods start_<tag>(attrs) and
    end_<tag>(), when they exist; elements without such methods are
    ignored.  The finished table is available from get_table().
    """

    def __init__(self):
        self.__methods = {}
        self.__buffer = ''
        self.__table = {}

    def get_table(self):
        """Return the table, mapping names to TableEntry objects.

        Environments for which no content was declared are given an
        implied "content" parameter first.
        """
        for entry in self.__table.values():
            if not entry.environment or entry.has_content:
                continue
            implied = Parameter("content")
            implied.implied = 1
            entry.parameters.append(implied)
            entry.has_content = 1
        return self.__table

    def startElement(self, tag, attrs):
        # Look up the handler pair for this tag once and cache it; the
        # cached pair is used again by endElement().
        handlers = self.__methods.get(tag)
        if handlers is None:
            handlers = (getattr(self, "start_" + tag, None),
                        getattr(self, "end_" + tag, None))
            self.__methods[tag] = handlers
        start = handlers[0]
        if start:
            start(attrs)

    def endElement(self, tag):
        end = self.__methods[tag][1]
        if end:
            end()

    def endDocument(self):
        self.__methods.clear()

    def characters(self, data):
        self.__buffer = self.__buffer + data

    def start_environment(self, attrs):
        entry = TableEntry(attrs["name"], environment=1)
        entry.verbatim = attrs.get("verbatim") == "yes"
        outputname = attrs.get("outputname")
        if outputname is not None:
            entry.outputname = outputname
        entry.endcloses = attrs.get("endcloses", "").split()
        self.__current = entry

    def end_environment(self):
        self.end_macro()

    def start_macro(self, attrs):
        entry = TableEntry(attrs["name"])
        entry.closes = attrs.get("closes", "").split()
        outputname = attrs.get("outputname")
        if outputname is not None:
            entry.outputname = outputname
        self.__current = entry

    def end_macro(self):
        # Store the finished entry; a name may be defined only once.
        entry = self.__current
        if entry.name in self.__table:
            raise ValueError("name %s already in use" % repr(entry.name))
        self.__table[entry.name] = entry
        self.__current = None

    def start_attribute(self, attrs):
        # An absent or empty name leaves the parameter unnamed.
        name = attrs.get("name") or None
        is_optional = attrs.get("optional") == "yes"
        self.__current.parameters.append(
            Parameter("attribute", name, optional=is_optional))
        self.__buffer = ''

    def end_attribute(self):
        # The text inside the element becomes the attribute's value.
        param = self.__current.parameters[-1]
        param.text = self.__buffer

    def start_entityref(self, attrs):
        self.__current.parameters.append(
            Parameter("entityref", attrs["name"]))

    def start_child(self, attrs):
        is_optional = attrs.get("optional") == "yes"
        self.__current.parameters.append(
            Parameter("child", attrs["name"], is_optional))
        self.__current.empty = 0

    def start_content(self, attrs):
        entry = self.__current
        param = Parameter("content")
        # Content of an environment is always implied.
        if entry.environment:
            param.implied = 1
        else:
            param.implied = attrs.get("implied") == "yes"
        entry.parameters.append(param)
        entry.has_content = 1
        entry.empty = 0

    def start_text(self, attrs):
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        param = Parameter("text")
        param.text = self.__buffer
        self.__current.parameters.append(param)
def load_table(fp):
    """Parse the XML conversion specification on *fp*; return the table."""
    handler = TableHandler()
    xml.sax.parse(fp, handler)
    table = handler.get_table()
    return table
def main():
    """Command-line entry point.

    Usage: latex2esis.py [-D | --debug] [infile [outfile]]

    Input defaults to stdin and output to stdout.  Each -D/--debug
    option raises the DEBUG level by one.  The conversion table is read
    from conversion.xml in sys.path[0].  Exits with status 2 on a
    command-line error.
    """
    global DEBUG

    usage = "usage: %s [-D | --debug] [infile [outfile]]\n" % sys.argv[0]
    try:
        opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    except getopt.error:
        sys.stderr.write("%s\n" % sys.exc_info()[1])
        sys.stderr.write(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG = DEBUG + 1
    if len(args) > 2:
        # No usage() function is defined in this file, so the message is
        # written here directly.
        sys.stderr.write(usage)
        sys.exit(2)

    ifp = sys.stdin
    ofp = sys.stdout
    if args:
        ifp = open(args[0])
    if len(args) == 2:
        ofp = open(args[1], "w")
    try:
        table_fp = open(os.path.join(sys.path[0], 'conversion.xml'))
        try:
            table = load_table(table_fp)
        finally:
            table_fp.close()
        convert(ifp, ofp, table)
    finally:
        # Close only the files opened here, never stdin/stdout.
        if ofp is not sys.stdout:
            ofp.close()
        if ifp is not sys.stdin:
            ifp.close()
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()