Files for 2.1b1 distribution.
[python/dscho.git] / Doc / tools / sgmlconv / latex2esis.py
blobd4bfa3f8162d540feaca60571f86a5991844251f
1 #! /usr/bin/env python
3 """Generate ESIS events based on a LaTeX source document and
4 configuration data.
6 The conversion is not strong enough to work with arbitrary LaTeX
7 documents; it has only been designed to work with the highly stylized
8 markup used in the standard Python documentation. A lot of
9 information about specific markup is encoded in the control table
10 passed to the convert() function; changing this table can allow this
11 tool to support additional LaTeX markups.
The format of the table is largely undocumented.  main() loads the
table from the file conversion.xml found in the script's directory
(sys.path[0]) using load_table(); there is no command-line option to
select an alternate table file.
16 """
18 import errno
19 import getopt
20 import os
21 import re
22 import string
23 import sys
24 import UserList
25 import xml.sax.saxutils
27 from types import ListType, StringType, TupleType
29 try:
30 from xml.parsers.xmllib import XMLParser
31 except ImportError:
32 from xmllib import XMLParser
# Debug level.  main() adds one for every -D/--debug option; any non-zero
# value enables dbgmsg()/err_write() output and the tracing _Stack.
DEBUG = 0
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be processed by the converter."""
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the open elements.

    Attributes:
      found -- the environment name that was being closed
      stack -- a copy of the element stack at the time of the error
    """

    def __init__(self, found, stack):
        description = ("environment close for %s doesn't match;\n stack = %s"
                       % (found, stack))
        self.found = found
        # Keep a copy so later changes to the live stack don't alter the
        # state recorded on the exception.
        self.stack = stack[:]
        LaTeXFormatError.__init__(self, description)
def encode(s):
    """Return s prepared for use as the data part of an ESIS line.

    The XML special characters are escaped with xml.sax.saxutils.escape(),
    then every newline is replaced by a literal backslash-n followed by a
    real newline and "-" (so the data continues on a new "-" line).
    """
    escaped = xml.sax.saxutils.escape(s)
    pieces = escaped.split("\n")
    return "\\n\n-".join(pieces)
55 _begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
56 _end_env_rx = re.compile(r"[\\]end{([^}]*)}")
57 _begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
58 _comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
59 _text_rx = re.compile(r"[^]~%\\{}]+")
60 _optional_rx = re.compile(r"\s*[[]([^]]*)[]]")
61 # _parameter_rx is this complicated to allow {...} inside a parameter;
62 # this is useful to match tabular layout specifications like {c|p{24pt}}
63 _parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
64 _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
65 _start_group_rx = re.compile("[ \n]*{")
66 _start_optional_rx = re.compile("[ \n]*[[]")
69 ESCAPED_CHARS = "$%#^ {}&~"
def dbgmsg(msg):
    """Write msg and a newline to stderr, but only when DEBUG is set."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
def pushing(name, point, depth):
    """Emit a debug trace for opening element name at point.

    depth is accepted but not used.
    """
    trace = "pushing <%s> at %s" % (name, point)
    dbgmsg(trace)
def popping(name, point, depth):
    """Emit a debug trace for closing element name at point.

    depth is accepted but not used.
    """
    trace = "popping </%s> at %s" % (name, point)
    dbgmsg(trace)
83 class _Stack(UserList.UserList):
84 def append(self, entry):
85 if type(entry) is not StringType:
86 raise LaTeXFormatError("cannot push non-string on stack: "
87 + `entry`)
88 sys.stderr.write("%s<%s>\n" % (" "*len(self.data), entry))
89 self.data.append(entry)
91 def pop(self, index=-1):
92 entry = self.data[index]
93 del self.data[index]
94 sys.stderr.write("%s</%s>\n" % (" "*len(self.data), entry))
96 def __delitem__(self, index):
97 entry = self.data[index]
98 del self.data[index]
99 sys.stderr.write("%s</%s>\n" % (" "*len(self.data), entry))
def new_stack():
    """Return an empty element stack.

    A tracing _Stack is used when DEBUG is set, otherwise a plain list.
    """
    if not DEBUG:
        return []
    return _Stack()
class Conversion:
    """Translate LaTeX source into ESIS events driven by a conversion table.

    The whole input is read up front and kept in self.line as one string
    (trailing whitespace removed from every input line).  subconvert()
    consumes that string from the front and emits events via self.write.
    """

    def __init__(self, ifp, ofp, table):
        """Read all of ifp and remember the output file and table.

        ifp   -- file-like object providing readlines()
        ofp   -- file-like object providing write()
        table -- mapping of macro/environment names to TableEntry objects;
                 default entries are added to it for unknown names
        """
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        self.line = "\n".join([text.rstrip() for text in ifp.readlines()])
        self.preamble = 1

    def err_write(self, msg):
        """Write str(msg) and a newline to stderr when DEBUG is set."""
        if DEBUG:
            sys.stderr.write(str(msg) + "\n")

    def convert(self):
        """Convert the entire input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Parse content, including sub-structures.

        Parsing continues until the character 'endchar' is found (with no
        open structures), or until the end of the input data if endchar
        is None.

        When endchar is found, the remaining input (still starting with
        endchar) is stored in self.line and returned.  When the input runs
        out, elements left on the stack that declare 'closes' are closed
        and None is returned.  depth is only passed along (increased) to
        nested calls.

        Raises LaTeXFormatError (or its subclass LaTeXStackError) when the
        markup cannot be handled.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its side effects only: it registers a default
                # entry for unknown environments and rejects names that are
                # defined as macros.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh!  This is a combining character...
                    endpos = m.end()
                    # Slice instead of indexing so that input ending right
                    # after the macro is reported as a format error by
                    # combining_char() rather than as an IndexError.
                    self.combining_char("c", line[endpos:endpos + 1])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!
                    terminator = "\\end{%s}" % macroname
                    pos = line.find(terminator)
                    if pos < 0:
                        # Without this check the -1 would be used as a
                        # slice index and silently mangle the output.
                        raise LaTeXFormatError(
                            "missing %s for verbatim environment"
                            % terminator)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(terminator):]
                    continue
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname:
                    if entry.empty:
                        self.write("e\n")
                #
                params, optional, empty, environ = self.start_macro(macroname)
                # rip off the macroname
                if params or empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %s"
                                    % (pentry.name, macroname,
                                       repr(line[:100])))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            self.line = skip_white(line)[1:]
                            remainder = self.subconvert(
                                "}", len(stack) + depth + 1)
                            # The nested call returns None when the input
                            # ran out before the closing brace was seen.
                            line = (remainder or "")[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            # line[:1] also covers input that ends here.
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        self.err_write("--- text: %s\n" % repr(pentry.text))
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unbalanced close brace: %s" % repr(line[:100]))
                macroname = stack[-1]
                if macroname:
                    conversion = self.table.get(macroname)
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # bare group; the empty name marks it on the stack
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and len(line) > 1 and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2:3])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        # Out of input: close whatever may be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here...

    # This is a really limited table of combinations, but it will have
    # to do for now.
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def combining_char(self, prefix, char):
        """Write the character reference for accent prefix applied to char.

        Raises LaTeXFormatError when the pair is not in _combinations.
        """
        try:
            ordinal = self._combinations[(prefix, char)]
        except KeyError:
            raise LaTeXFormatError(
                "unsupported combining character: \\%s%s" % (prefix, char))
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return (parameters, optional, empty, environment) for name.

        'optional' is the optional flag of the first parameter, or the
        empty parameter list itself when there are no parameters.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty, conversion.environment

    def get_entry(self, name):
        """Return the table entry for name.

        An unknown name gets a default macro entry with a single "content"
        parameter, which is also stored in the table.
        """
        entry = self.table.get(name)
        if entry is None:
            self.err_write("get_entry(%s) failing; building default entry!"
                           % repr(name))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for the environment called name.

        An unknown name gets a default environment entry with implied
        content, which is also stored in the table.  Raises
        LaTeXFormatError when name is defined, but as a macro.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute event for pentry.name with the given value.

        Nothing is written when either the name or the value is empty.
        Values matching _token_rx are typed TOKEN, all others CDATA.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
def convert(ifp, ofp, table):
    """Convert the LaTeX read from ifp to ESIS events written to ofp.

    table is the conversion table (see load_table()).  An IOError whose
    errno is EPIPE is silently ignored; every other exception propagates.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Look at the errno attribute instead of unpacking the exception's
        # arguments: unpacking fails for IOErrors that don't carry exactly
        # two arguments and would hide the original error.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise
def skip_white(line):
    """Return line without its leading whitespace and "%" characters."""
    rest = line
    while rest:
        if rest[0] not in " %\n\t\r":
            break
        # Drop the offending character, then any whitespace following it.
        rest = rest[1:].lstrip()
    return rest
class TableEntry:
    """Conversion-table record describing one macro or environment."""

    def __init__(self, name, environment=0):
        # Identification: the output name defaults to the LaTeX name.
        self.name = self.outputname = name
        # Kind of entry; plain macros start out marked as empty.
        self.environment = environment
        self.empty = not environment
        # Flags, all initially off.
        self.has_content = self.verbatim = self.auto_close = 0
        # Parameter objects, in the order they are expected.
        self.parameters = []
        # Names of open elements closed by the start / the end of this one.
        self.closes = []
        self.endcloses = []
class Parameter:
    """One parameter of a table entry.

    type is the kind of parameter ("attribute", "child", "content", "text"
    or "entityref" in this file); text and implied start out as '' and 0.
    """

    def __init__(self, type, name=None, optional=0):
        self.type, self.name, self.optional = type, name, optional
        self.text = ''
        self.implied = 0
class TableParser(XMLParser):
    """Build a conversion table from its XML description.

    The start_*/end_* methods handle the elements environment, macro,
    attribute, entityref, child, content and text; character data is
    collected by handle_data().
    """

    def __init__(self, table=None):
        """Start with the given table, or with an empty one."""
        if table is None:
            table = {}
        self.__table = table
        self.__current = None       # entry currently being defined
        self.__buffer = ''          # character data collected so far
        XMLParser.__init__(self)

    def get_table(self):
        """Return the table; environments lacking content get implied content."""
        for entry in self.__table.values():
            if not entry.environment or entry.has_content:
                continue
            implied = Parameter("content")
            implied.implied = 1
            entry.parameters.append(implied)
            entry.has_content = 1
        return self.__table

    def start_environment(self, attrs):
        """Begin the definition of an environment."""
        name = attrs["name"]
        entry = self.__current = TableEntry(name, environment=1)
        entry.verbatim = attrs.get("verbatim") == "yes"
        # An explicit outputname (even an empty one) overrides the default.
        entry.outputname = attrs.get("outputname", entry.outputname)
        entry.endcloses = attrs.get("endcloses", "").split()

    def end_environment(self):
        """Finish an environment; identical to finishing a macro."""
        self.end_macro()

    def start_macro(self, attrs):
        """Begin the definition of a macro."""
        name = attrs["name"]
        entry = self.__current = TableEntry(name)
        entry.closes = attrs.get("closes", "").split()
        # An explicit outputname (even an empty one) overrides the default.
        entry.outputname = attrs.get("outputname", entry.outputname)

    def end_macro(self):
        """Store the finished entry in the table under its name."""
        finished = self.__current
        self.__table[finished.name] = finished
        self.__current = None

    def start_attribute(self, attrs):
        """Add an attribute parameter; its text is collected until the end tag."""
        is_optional = attrs.get("optional") == "yes"
        # A missing or empty name leaves the parameter name as None.
        param = Parameter("attribute", attrs.get("name") or None,
                          optional=is_optional)
        self.__current.parameters.append(param)
        self.__buffer = ''

    def end_attribute(self):
        """Record the collected text as the attribute's supplied value."""
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        """Add an entity reference parameter."""
        param = Parameter("entityref", attrs["name"])
        self.__current.parameters.append(param)

    def start_child(self, attrs):
        """Add a child element parameter; the entry is no longer empty."""
        is_optional = attrs.get("optional") == "yes"
        param = Parameter("child", attrs["name"], is_optional)
        entry = self.__current
        entry.parameters.append(param)
        entry.empty = 0

    def start_content(self, attrs):
        """Add a content parameter; content of environments is always implied."""
        param = Parameter("content")
        param.implied = attrs.get("implied") == "yes"
        entry = self.__current
        if entry.environment:
            param.implied = 1
        entry.parameters.append(param)
        entry.has_content = 1
        entry.empty = 0

    def start_text(self, attrs):
        """Begin collecting literal text; the entry is no longer empty."""
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        """Add a text parameter holding the collected character data."""
        param = Parameter("text")
        param.text = self.__buffer
        self.__current.parameters.append(param)

    def handle_data(self, data):
        """Append character data to the buffer."""
        self.__buffer += data
def load_table(fp, table=None):
    """Parse the table description read from fp and return the table.

    Entries are added to 'table' when one is given, otherwise a new
    table is created.
    """
    table_parser = TableParser(table=table)
    source = fp.read()
    table_parser.feed(source)
    table_parser.close()
    return table_parser.get_table()
def main():
    """Command-line entry point.

    Usage: latex2esis.py [-D|--debug] [infile [outfile]]

    Each -D/--debug option raises the DEBUG level by one.  Input defaults
    to stdin and output to stdout.  The conversion table is loaded from
    conversion.xml in the script's directory (sys.path[0]).  More than two
    file arguments print a usage message and exit with status 2.
    """
    global DEBUG
    #
    opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG = DEBUG + 1
    if len(args) == 0:
        ifp = sys.stdin
        ofp = sys.stdout
    elif len(args) == 1:
        # open() needs the file name itself, not the argument list.
        ifp = open(args[0])
        ofp = sys.stdout
    elif len(args) == 2:
        ifp = open(args[0])
        ofp = open(args[1], "w")
    else:
        # The file defines no usage() helper, so report the usage here.
        sys.stderr.write("usage: %s [-D|--debug] [infile [outfile]]\n"
                         % os.path.basename(sys.argv[0]))
        sys.exit(2)

    table_fp = open(os.path.join(sys.path[0], 'conversion.xml'))
    try:
        table = load_table(table_fp)
    finally:
        table_fp.close()
    convert(ifp, ofp, table)
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()