Ditched '_find_SET()', since it was a no-value-added wrapper around
[python/dscho.git] / Doc / tools / sgmlconv / latex2esis.py
blob325b0b138fea2eb6f9ddbdb54dbcd99bc4587830
1 #! /usr/bin/env python
3 """Generate ESIS events based on a LaTeX source document and
4 configuration data.
6 The conversion is not strong enough to work with arbitrary LaTeX
7 documents; it has only been designed to work with the highly stylized
8 markup used in the standard Python documentation. A lot of
9 information about specific markup is encoded in the control table
10 passed to the convert() function; changing this table can allow this
11 tool to support additional LaTeX markups.
The format of the table is largely undocumented.  main() loads the table
from the file conversion.xml located in sys.path[0] (normally the directory
containing this script); there is no provision to select an alternate
table file from the command line.
16 """
# Version identifier for this module.  '$Revision$' is presumably a keyword
# placeholder filled in by the version-control system -- TODO confirm.
__version__ = '$Revision$'
19 import copy
20 import errno
21 import getopt
22 import os
23 import re
24 import string
25 import StringIO
26 import sys
27 import UserList
29 from esistools import encode
30 from types import ListType, StringType, TupleType
32 try:
33 from xml.parsers.xmllib import XMLParser
34 except ImportError:
35 from xmllib import XMLParser
# Debug level.  main() adds one for every -D/--debug option; any nonzero
# value enables the stderr diagnostics written by dbgmsg(),
# Conversion.err_write() and the tracing _Stack returned by new_stack().
DEBUG = 0
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be handled by the converter."""
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the open-element stack.

    Attributes:
        found -- the name given in the mismatched close
        stack -- a copy of the element stack at the time of the error
    """

    def __init__(self, found, stack):
        text = ("environment close for %s doesn't match;\n stack = %s"
                % (found, stack))
        # Copy the stack so later pushes/pops do not alter the report.
        snapshot = stack[:]
        self.found = found
        self.stack = snapshot
        LaTeXFormatError.__init__(self, text)
# Patterns used by Conversion.subconvert(); each is applied with .match(),
# i.e. anchored at the start of the remaining input.

# "\begin{NAME}"; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
# "\end{NAME}"; group 1 is the environment name.
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A macro: backslash, letters and an optional "*" (group 1), an optional
# space, then either "{" or a run of whitespace (group 2).
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# A comment: one or more "%", an optional space, the rest of the line
# (group 1), the newline, and any spaces/tabs that start the next line.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of ordinary text: anything except "]", "%", backslash, "{" and "}".
_text_rx = re.compile(r"[^]%\\{}]+")
# An optional "[...]" argument, possibly preceded by whitespace; group 1 is
# the text between the brackets.
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]")
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# Attribute values matching this are written as TOKEN rather than CDATA
# (see Conversion.dump_attr()).
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Optional spaces/newlines followed by an opening brace.
_start_group_rx = re.compile("[ \n]*{")
# Optional spaces/newlines followed by an opening square bracket.
_start_optional_rx = re.compile("[ \n]*[[]")

# Characters that subconvert() emits literally when they follow a backslash.
ESCAPED_CHARS = "$%#^ {}&~"
def dbgmsg(msg):
    """Write *msg* and a newline to stderr, but only when DEBUG is nonzero."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
def pushing(name, point, depth):
    """Report via dbgmsg() that element *name* is opened at *point*.

    *depth* is accepted but not used.
    """
    message = "pushing <%s> at %s" % (name, point)
    dbgmsg(message)
def popping(name, point, depth):
    """Report via dbgmsg() that element *name* is closed at *point*.

    *depth* is accepted but not used.
    """
    message = "popping </%s> at %s" % (name, point)
    dbgmsg(message)
class _Stack(UserList.UserList):
    """Element stack that traces every push and removal on stderr.

    new_stack() hands out instances of this class when DEBUG is set; the
    converter otherwise uses a plain list, so the methods here must behave
    like their list counterparts.
    """

    def append(self, entry):
        """Push *entry* and trace it, indented by the current stack depth.

        Raises LaTeXFormatError if *entry* is not a string.
        """
        if type(entry) is not StringType:
            raise LaTeXFormatError("cannot push non-string on stack: "
                                   + repr(entry))
        sys.stderr.write("%s<%s>\n" % (" "*len(self.data), entry))
        self.data.append(entry)

    def pop(self, index=-1):
        """Remove and return the entry at *index* (the top by default).

        Callers use the returned name (e.g. to write the closing event), so
        the entry must be returned just as list.pop() would.
        """
        entry = self.data[index]
        # Delegate to __delitem__ so the removal is traced in one place.
        del self[index]
        return entry

    def __delitem__(self, index):
        """Delete the entry at *index*, tracing the close on stderr."""
        entry = self.data[index]
        del self.data[index]
        sys.stderr.write("%s</%s>\n" % (" "*len(self.data), entry))
def new_stack():
    """Return an empty element stack.

    A plain list is used normally; when DEBUG is nonzero a tracing _Stack
    is returned instead.
    """
    if not DEBUG:
        return []
    return _Stack()
class Conversion:
    """Translate a LaTeX document into ESIS events.

    The complete input is read when the object is created; convert() then
    walks the text and writes events through the output file's write()
    method.  The event lines produced here are "(NAME" and ")NAME" for
    element open/close, "-TEXT" for character data, "ANAME TYPE VALUE" for
    attributes, "&NAME" for entity references, and "e" ahead of an element
    whose table entry is flagged empty.
    """

    def __init__(self, ifp, ofp, table):
        """Read all of *ifp*; events go to *ofp*, driven by *table*.

        *table* maps macro/environment names to TableEntry objects; entries
        for unknown names are added to it on demand.
        """
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        # Trailing whitespace is removed from every line before rejoining.
        self.line = string.join(map(string.rstrip, ifp.readlines()), "\n")
        self.preamble = 1

    def err_write(self, msg):
        """Write *msg* to stderr when DEBUG is nonzero."""
        if DEBUG:
            sys.stderr.write(str(msg) + "\n")

    def convert(self):
        """Convert the whole input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert input until *endchar* is seen with no open structures.

        Parses content, including sub-structures, until the character
        'endchar' is found (with no open structures), or until the end
        of the input data if endchar is None.

        When *endchar* is found, self.line is set to the remaining input
        (starting with *endchar*) and that string is returned.  When the
        input runs out the method returns None.

        Raises LaTeXStackError for a mismatched \\end{...} and
        LaTeXFormatError for markup that cannot be interpreted.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its side effects: registers a default entry
                # for unknown environments and rejects names defined as
                # macros.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end{...}
                    # is passed through as character data.
                    endtag = "\\end{%s}" % macroname
                    pos = string.find(line, endtag)
                    if pos < 0:
                        # Without this check the -1 would be used as a
                        # slice index and silently corrupt the output.
                        raise LaTeXFormatError(
                            "missing \\end{%s} for verbatim environment"
                            % macroname)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(endtag):]
                    continue
                # Implicitly close any open elements this macro closes.
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname:
                    if entry.empty:
                        self.write("e\n")
                #
                params, optional, empty, environ = self.start_macro(macroname)
                # rip off the macroname
                if params or empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %s"
                                    % (pentry.name, macroname,
                                       repr(line[:100])))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Skip the opening brace, convert the child's
                            # content, then skip the closing brace.
                            self.line = skip_white(line)[1:]
                            rest = self.subconvert(
                                "}", len(stack) + depth + 1)
                            # subconvert() returns None when the input ran
                            # out before the closing brace was seen.
                            line = (rest or "")[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            # Slice rather than index so that running out
                            # of input is reported as a format error.
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        self.err_write("--- text: %s\n" % repr(pentry.text))
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    # With implied content the element stays open until a
                    # later close event.
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == endchar and not stack:
                self.line = line[1:]
                return self.line
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unmatched '}': %s" % repr(line[:100]))
                macroname = stack[-1]
                if macroname:
                    conversion = self.table.get(macroname)
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "{":
                # A bare group is tracked with an empty name.
                stack.append("")
                line = line[1:]
                continue
            # The length check keeps a lone trailing backslash from raising
            # IndexError; it is reported as unidentified markup below.
            if line[0] == "\\" and len(line) > 1 \
               and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        # Input exhausted: close whatever may be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + string.join(stack, ", "))
        # otherwise we just ran out of input here...

    def start_macro(self, name):
        """Return (parameters, optional, empty, environment) for *name*.

        *optional* is true when the first parameter is optional.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty, conversion.environment

    def get_entry(self, name):
        """Return the table entry for *name*, creating a default if needed.

        The default is a macro entry with a single "content" parameter; it
        is stored in the table so later lookups find it.
        """
        entry = self.table.get(name)
        if entry is None:
            self.err_write("get_entry(%s) failing; building default entry!"
                           % repr(name))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for environment *name*.

        An unknown name gets a default environment entry with implied
        content.  Raises LaTeXFormatError if *name* is defined as a macro.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute event for *pentry* with *value*.

        Nothing is written when the parameter has no name or the value is
        empty.  Values matching _token_rx are typed TOKEN, others CDATA.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
def convert(ifp, ofp, table):
    """Convert the LaTeX read from *ifp* to ESIS events written to *ofp*.

    *table* is the conversion table (name -> TableEntry).  An IOError whose
    errno is EPIPE is swallowed, so a reader closing the output pipe early
    ends the conversion quietly; every other exception propagates.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Inspect .errno instead of unpacking the exception: unpacking
        # fails for an IOError that does not carry exactly two arguments
        # and would hide the original error.
        err = sys.exc_info()[1]
        if err.errno != errno.EPIPE:
            raise
def skip_white(line):
    """Return *line* with leading whitespace and "%" characters removed.

    While the first character is a space, "%", newline, tab or carriage
    return, that character and any whitespace following it are dropped.
    """
    while 1:
        if not line or line[0] not in " %\n\t\r":
            return line
        line = line[1:].lstrip()
class TableEntry:
    """Conversion-table record for one macro or environment.

    A new entry uses its own name as the output name, is flagged empty
    unless it is an environment, and starts with no parameters and no
    closes/endcloses names.
    """

    def __init__(self, name, environment=0):
        self.name = self.outputname = name
        self.environment = environment
        # Macros start out empty; environments never do.
        self.empty = not environment
        self.has_content = self.verbatim = self.auto_close = 0
        # Three distinct lists, one per attribute.
        self.parameters, self.closes, self.endcloses = [], [], []
class Parameter:
    """One parameter of a table entry.

    *type* is the parameter kind (the code in this file uses "attribute",
    "child", "content", "text" and "entityref"); *name* and *optional* are
    stored as given.  The text starts empty and the implied flag is off.
    """

    def __init__(self, type, name=None, optional=0):
        self.type, self.name, self.optional = type, name, optional
        self.text = ''
        self.implied = 0
class TableParser(XMLParser):
    """XML parser that builds the conversion table.

    <environment> and <macro> elements each create a TableEntry; the
    <attribute>, <entityref>, <child>, <content> and <text> elements inside
    them append Parameter objects to the entry being built.
    """

    def __init__(self, table=None):
        """Start with *table* (a dict to add to) or a new empty dict."""
        if table is None:
            table = {}
        self.__table = table
        self.__current = None
        self.__buffer = ''
        XMLParser.__init__(self)

    def get_table(self):
        """Return the table, giving content-less environments implied content."""
        for entry in self.__table.values():
            if not entry.environment or entry.has_content:
                continue
            implied = Parameter("content")
            implied.implied = 1
            entry.parameters.append(implied)
            entry.has_content = 1
        return self.__table

    def start_environment(self, attrs):
        """Begin a new environment entry from the element's attributes."""
        entry = self.__current = TableEntry(attrs["name"], environment=1)
        entry.verbatim = attrs.get("verbatim") == "yes"
        if attrs.has_key("outputname"):
            entry.outputname = attrs.get("outputname")
        # "endcloses" is a whitespace-separated list of names.
        entry.endcloses = string.split(attrs.get("endcloses", ""))

    def end_environment(self):
        """Finish an environment entry; identical to finishing a macro."""
        self.end_macro()

    def start_macro(self, attrs):
        """Begin a new macro entry from the element's attributes."""
        entry = self.__current = TableEntry(attrs["name"])
        # "closes" is a whitespace-separated list of names.
        entry.closes = string.split(attrs.get("closes", ""))
        if attrs.has_key("outputname"):
            entry.outputname = attrs.get("outputname")

    def end_macro(self):
        """Store the finished entry in the table under its name."""
        finished = self.__current
        self.__table[finished.name] = finished
        self.__current = None

    def start_attribute(self, attrs):
        """Append an attribute parameter and start collecting its text."""
        is_optional = attrs.get("optional") == "yes"
        # An absent or empty name leaves the parameter unnamed.
        param = Parameter("attribute", attrs.get("name") or None,
                          optional=is_optional)
        self.__current.parameters.append(param)
        self.__buffer = ''

    def end_attribute(self):
        """Record the collected character data as the attribute's text."""
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        """Append an entity-reference parameter named by the element."""
        self.__current.parameters.append(Parameter("entityref", attrs["name"]))

    def start_child(self, attrs):
        """Append a child parameter; the entry is then no longer empty."""
        entry = self.__current
        child = Parameter("child", attrs["name"],
                          attrs.get("optional") == "yes")
        entry.parameters.append(child)
        entry.empty = 0

    def start_content(self, attrs):
        """Append a content parameter; environments always imply content."""
        entry = self.__current
        content = Parameter("content")
        if entry.environment:
            content.implied = 1
        else:
            content.implied = attrs.get("implied") == "yes"
        entry.parameters.append(content)
        entry.has_content = 1
        entry.empty = 0

    def start_text(self, attrs):
        """Mark the entry non-empty and start collecting literal text."""
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        """Append a text parameter holding the collected character data."""
        literal = Parameter("text")
        literal.text = self.__buffer
        self.__current.parameters.append(literal)

    def handle_data(self, data):
        """Accumulate character data for the element being read."""
        self.__buffer += data
def load_table(fp, table=None):
    """Parse the XML conversion table read from *fp* and return it.

    When *table* is given, the parsed entries are added to that dict.
    """
    parser = TableParser(table=table)
    document = fp.read()
    parser.feed(document)
    parser.close()
    result = parser.get_table()
    return result
def main():
    """Command-line entry point.

    Arguments: [-D|--debug] [infile [outfile]].  Input defaults to stdin
    and output to stdout.  Each -D/--debug raises the DEBUG level.  The
    conversion table is read from conversion.xml in sys.path[0].  More
    than two file arguments print a usage message and exit with status 2.
    """
    global DEBUG

    opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG = DEBUG + 1
    if len(args) > 2:
        sys.stderr.write("usage: %s [-D|--debug] [infile [outfile]]\n"
                         % os.path.basename(sys.argv[0]))
        sys.exit(2)

    ifp = sys.stdin
    ofp = sys.stdout
    if args:
        # args is a list; the input file name is its first element.
        ifp = open(args[0])
    if len(args) == 2:
        ofp = open(args[1], "w")
    try:
        table_fp = open(os.path.join(sys.path[0], 'conversion.xml'))
        try:
            table = load_table(table_fp)
        finally:
            table_fp.close()
        convert(ifp, ofp, table)
    finally:
        # Close only the files opened here, never the standard streams.
        if ofp is not sys.stdout:
            ofp.close()
        if ifp is not sys.stdin:
            ifp.close()
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()