3 """Generate ESIS events based on a LaTeX source document and
6 The conversion is not strong enough to work with arbitrary LaTeX
7 documents; it has only been designed to work with the highly stylized
8 markup used in the standard Python documentation. A lot of
9 information about specific markup is encoded in the control table
10 passed to the convert() function; changing this table can allow this
11 tool to support additional LaTeX markups.
13 The format of the table is largely undocumented; see the commented
14 headers where the table is specified in main(). There is no provision
15 to load an alternate table from an external file.
24 import xml
.sax
.saxutils
26 from esistools
import encode
32 class LaTeXFormatError(Exception):
36 class LaTeXStackError(LaTeXFormatError
):
37 def __init__(self
, found
, stack
):
38 msg
= "environment close for %s doesn't match;\n stack = %s" \
42 LaTeXFormatError
.__init
__(self
, msg
)
# Regular expressions used to pick LaTeX constructs off the front of the
# remaining input.  All of them are applied with .match(), i.e. anchored at
# the current position.

# "\begin{name}" / "\end{name}"; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A macro invocation: group 1 is the macro name (optionally starred),
# group 2 is either the "{" opening the first argument or the whitespace
# that follows the name.
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# A "%" comment up to the end of the line, plus any blanks that start the
# next line; group 1 is the comment text.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of characters that contains none of "]", "~", "%", "\", "{", "}".
_text_rx = re.compile(r"[^]~%\\{}]+")
# An optional "[...]" argument; group 1 is the bracketed text.  The
# brackets are written as "\[" and "\]" rather than "[[]" and "[]]":
# a character set that starts with a literal "[" triggers a
# "possible nested set" FutureWarning in newer versions of the re module.
_optional_rx = re.compile(r"\s*\[([^]]*)\]", re.MULTILINE)
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}]|{[^}]*})*)}")
# A value consisting of a letter followed by letters, digits, "." or "-".
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# The opening "{" of a group, allowing leading blanks/newlines.
_start_group_rx = re.compile("[ \n]*{")
# The opening "[" of an optional argument, allowing leading blanks/newlines.
_start_optional_rx = re.compile(r"[ \n]*\[")

# Characters that may follow a backslash to stand for themselves.
ESCAPED_CHARS = "$%#^ {}&~"
64 sys
.stderr
.write(msg
+ "\n")
def pushing(name, point, depth):
    """Report through dbgmsg() that element *name* is being pushed.

    name  -- element name, shown between angle brackets in the message.
    point -- value interpolated after "at" in the message.
    depth -- accepted for the callers' benefit; not used in the message.
    """
    message = "pushing <%s> at %s" % (name, point)
    dbgmsg(message)
def popping(name, point, depth):
    """Report through dbgmsg() that element *name* is being popped.

    name  -- element name, shown as a closing tag in the message.
    point -- value interpolated after "at" in the message.
    depth -- accepted for the callers' benefit; not used in the message.
    """
    message = "popping </%s> at %s" % (name, point)
    dbgmsg(message)
74 def append(self
, entry
):
75 if not isinstance(entry
, str):
76 raise LaTeXFormatError("cannot push non-string on stack: "
78 #dbgmsg("%s<%s>" % (" "*len(self.data), entry))
79 list.append(self
, entry
)
81 def pop(self
, index
=-1):
84 #dbgmsg("%s</%s>" % (" " * len(self), entry))
86 def __delitem__(self
, index
):
88 list.__delitem
__(self
, index
)
89 #dbgmsg("%s</%s>" % (" " * len(self), entry))
100 def __init__(self
, ifp
, ofp
, table
):
101 self
.write
= ofp
.write
104 L
= [s
.rstrip() for s
in ifp
.readlines()]
106 self
.line
= "\n".join(L
)
112 def subconvert(self
, endchar
=None, depth
=0):
114 # Parses content, including sub-structures, until the character
115 # 'endchar' is found (with no open structures), or until the end
# of the input data if endchar is None.
121 if line
[0] == endchar
and not stack
:
124 m
= _comment_rx
.match(line
)
128 self
.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
130 line
= line
[m
.end():]
132 m
= _begin_env_rx
.match(line
)
135 entry
= self
.get_env_entry(name
)
136 # re-write to use the macro handler
137 line
= r
"\%s %s" % (name
, line
[m
.end():])
139 m
= _end_env_rx
.match(line
)
143 entry
= self
.get_entry(envname
)
144 while stack
and envname
!= stack
[-1] \
145 and stack
[-1] in entry
.endcloses
:
146 self
.write(")%s\n" % stack
.pop())
147 if stack
and envname
== stack
[-1]:
148 self
.write(")%s\n" % entry
.outputname
)
151 raise LaTeXStackError(envname
, stack
)
152 line
= line
[m
.end():]
154 m
= _begin_macro_rx
.match(line
)
157 macroname
= m
.group(1)
159 # Ugh! This is a combining character...
161 self
.combining_char("c", line
[endpos
])
162 line
= line
[endpos
+ 1:]
164 entry
= self
.get_entry(macroname
)
167 pos
= line
.find("\\end{%s}" % macroname
)
168 text
= line
[m
.end(1):pos
]
169 stack
.append(entry
.name
)
170 self
.write("(%s\n" % entry
.outputname
)
171 self
.write("-%s\n" % encode(text
))
172 self
.write(")%s\n" % entry
.outputname
)
174 line
= line
[pos
+ len("\\end{%s}" % macroname
):]
176 while stack
and stack
[-1] in entry
.closes
:
178 topentry
= self
.get_entry(top
)
179 if topentry
.outputname
:
180 self
.write(")%s\n-\\n\n" % topentry
.outputname
)
182 if entry
.outputname
and entry
.empty
:
185 params
, optional
, empty
= self
.start_macro(macroname
)
186 # rip off the macroname
188 line
= line
[m
.end(1):]
190 line
= line
[m
.end(1):]
192 line
= line
[m
.end():]
196 # handle attribute mappings here:
197 for pentry
in params
:
198 if pentry
.type == "attribute":
200 m
= _optional_rx
.match(line
)
201 if m
and entry
.outputname
:
202 line
= line
[m
.end():]
203 self
.dump_attr(pentry
, m
.group(1))
204 elif pentry
.text
and entry
.outputname
:
205 # value supplied by conversion spec:
206 self
.dump_attr(pentry
, pentry
.text
)
208 m
= _parameter_rx
.match(line
)
210 raise LaTeXFormatError(
211 "could not extract parameter %s for %s: %s"
212 % (pentry
.name
, macroname
, `line
[:100]`
))
214 self
.dump_attr(pentry
, m
.group(1))
215 line
= line
[m
.end():]
216 elif pentry
.type == "child":
218 m
= _optional_rx
.match(line
)
220 line
= line
[m
.end():]
221 if entry
.outputname
and not opened
:
223 self
.write("(%s\n" % entry
.outputname
)
224 stack
.append(macroname
)
225 stack
.append(pentry
.name
)
226 self
.write("(%s\n" % pentry
.name
)
227 self
.write("-%s\n" % encode(m
.group(1)))
228 self
.write(")%s\n" % pentry
.name
)
231 if entry
.outputname
and not opened
:
233 self
.write("(%s\n" % entry
.outputname
)
234 stack
.append(entry
.name
)
235 self
.write("(%s\n" % pentry
.name
)
236 stack
.append(pentry
.name
)
237 self
.line
= skip_white(line
)[1:]
238 line
= self
.subconvert(
239 "}", len(stack
) + depth
+ 1)[1:]
240 self
.write(")%s\n" % stack
.pop())
241 elif pentry
.type == "content":
245 if entry
.outputname
and not opened
:
247 self
.write("(%s\n" % entry
.outputname
)
248 stack
.append(entry
.name
)
249 line
= skip_white(line
)
251 raise LaTeXFormatError(
252 "missing content for " + macroname
)
254 line
= self
.subconvert("}", len(stack
) + depth
+ 1)
255 if line
and line
[0] == "}":
257 elif pentry
.type == "text" and pentry
.text
:
258 if entry
.outputname
and not opened
:
260 stack
.append(entry
.name
)
261 self
.write("(%s\n" % entry
.outputname
)
262 #dbgmsg("--- text: %s" % `pentry.text`)
263 self
.write("-%s\n" % encode(pentry
.text
))
264 elif pentry
.type == "entityref":
265 self
.write("&%s\n" % pentry
.name
)
268 self
.write("(%s\n" % entry
.outputname
)
269 stack
.append(entry
.name
)
270 if not implied_content
:
271 self
.write(")%s\n" % entry
.outputname
)
274 if line
[0] == endchar
and not stack
:
278 # end of macro or group
279 macroname
= stack
[-1]
281 conversion
= self
.table
[macroname
]
282 if conversion
.outputname
:
283 # otherwise, it was just a bare group
284 self
.write(")%s\n" % conversion
.outputname
)
289 # don't worry about the "tie" aspect of this command
297 if line
[0] == "\\" and line
[1] in ESCAPED_CHARS
:
298 self
.write("-%s\n" % encode(line
[1]))
301 if line
[:2] == r
"\\":
302 self
.write("(BREAK\n)BREAK\n")
305 if line
[:2] == r
"\_":
306 line
= "_" + line
[2:]
308 if line
[:2] in (r
"\'", r
'\"'):
309 # combining characters...
310 self
.combining_char(line
[1], line
[2])
313 m
= _text_rx
.match(line
)
315 text
= encode(m
.group())
316 self
.write("-%s\n" % text
)
317 line
= line
[m
.end():]
319 # special case because of \item[]
320 # XXX can we axe this???
325 # avoid infinite loops
329 raise LaTeXFormatError("could not identify markup: %s%s"
330 % (`line
[:100]`
, extra
))
332 entry
= self
.get_entry(stack
[-1])
334 self
.write(")%s\n-%s\n" % (entry
.outputname
, encode("\n")))
339 raise LaTeXFormatError("elements remain on stack: "
341 # otherwise we just ran out of input here...
343 # This is a really limited table of combinations, but it will have
def combining_char(self, prefix, char):
    """Write the numeric reference for a combined (accented) character.

    The pair (prefix, char) is looked up in self._combinations and the
    number found there is written as a "-" data line holding a backslash,
    a percent sign, the decimal number and a semicolon.  A pair that is
    not in the table propagates the lookup error unchanged.
    """
    key = (prefix, char)
    self.write("-\\%%%d;\n" % self._combinations[key])
def start_macro(self, name):
    """Return the tuple (parameters, optional, empty) for macro *name*.

    The table entry is obtained from self.get_entry().  `parameters` is
    the entry's parameter list and `empty` its `empty` flag.  `optional`
    is the `optional` flag of the first parameter; when the entry has no
    parameters the (falsy) parameter list itself is returned in its place.
    """
    entry = self.get_entry(name)
    params = entry.parameters
    if params:
        first_optional = params[0].optional
    else:
        # Same result as "params and ...": hand back the falsy value as is.
        first_optional = params
    return params, first_optional, entry.empty
361 def get_entry(self
, name
):
362 entry
= self
.table
.get(name
)
364 dbgmsg("get_entry(%s) failing; building default entry!" % `name`
)
365 # not defined; build a default entry:
366 entry
= TableEntry(name
)
367 entry
.has_content
= 1
368 entry
.parameters
.append(Parameter("content"))
369 self
.table
[name
] = entry
372 def get_env_entry(self
, name
):
373 entry
= self
.table
.get(name
)
375 # not defined; build a default entry:
376 entry
= TableEntry(name
, 1)
377 entry
.has_content
= 1
378 entry
.parameters
.append(Parameter("content"))
379 entry
.parameters
[-1].implied
= 1
380 self
.table
[name
] = entry
381 elif not entry
.environment
:
382 raise LaTeXFormatError(
383 name
+ " is defined as a macro; expected environment")
386 def dump_attr(self
, pentry
, value
):
387 if not (pentry
.name
and value
):
389 if _token_rx
.match(value
):
393 self
.write("A%s %s %s\n" % (pentry
.name
, dtype
, encode(value
)))
396 def convert(ifp
, ofp
, table
):
397 c
= Conversion(ifp
, ofp
, table
)
400 except IOError, (err
, msg
):
401 if err
!= errno
.EPIPE
:
405 def skip_white(line
):
406 while line
and line
[0] in " %\n\t\r":
407 line
= line
[1:].lstrip()
413 def __init__(self
, name
, environment
=0):
415 self
.outputname
= name
416 self
.environment
= environment
417 self
.empty
= not environment
426 def __init__(self
, type, name
=None, optional
=0):
429 self
.optional
= optional
434 class TableHandler(xml
.sax
.handler
.ContentHandler
):
441 for entry
in self
.__table
.values():
442 if entry
.environment
and not entry
.has_content
:
443 p
= Parameter("content")
445 entry
.parameters
.append(p
)
446 entry
.has_content
= 1
449 def startElement(self
, tag
, attrs
):
451 start
, end
= self
.__methods
[tag
]
453 start
= getattr(self
, "start_" + tag
, None)
454 end
= getattr(self
, "end_" + tag
, None)
455 self
.__methods
[tag
] = (start
, end
)
459 def endElement(self
, tag
):
460 start
, end
= self
.__methods
[tag
]
def endDocument(self):
    """Empty the tag -> (start, end) handler mapping when the document ends."""
    handlers = self.__methods
    handlers.clear()
def characters(self, data):
    """Append the character data reported by the parser to the buffer.

    The accumulated buffer is what end_attribute() later stores as the
    text of the most recently added parameter.
    """
    self.__buffer += data
470 def start_environment(self
, attrs
):
472 self
.__current
= TableEntry(name
, environment
=1)
473 self
.__current
.verbatim
= attrs
.get("verbatim") == "yes"
474 if attrs
.has_key("outputname"):
475 self
.__current
.outputname
= attrs
.get("outputname")
476 self
.__current
.endcloses
= attrs
.get("endcloses", "").split()
477 def end_environment(self
):
480 def start_macro(self
, attrs
):
482 self
.__current
= TableEntry(name
)
483 self
.__current
.closes
= attrs
.get("closes", "").split()
484 if attrs
.has_key("outputname"):
485 self
.__current
.outputname
= attrs
.get("outputname")
487 name
= self
.__current
.name
488 if self
.__table
.has_key(name
):
489 raise ValueError("name %s already in use" % `name`
)
490 self
.__table
[name
] = self
.__current
491 self
.__current
= None
493 def start_attribute(self
, attrs
):
494 name
= attrs
.get("name")
495 optional
= attrs
.get("optional") == "yes"
497 p
= Parameter("attribute", name
, optional
=optional
)
499 p
= Parameter("attribute", optional
=optional
)
500 self
.__current
.parameters
.append(p
)
def end_attribute(self):
    """Store the buffered character data as the text of the last parameter.

    The parameter updated is the final one in the current entry's
    parameter list.
    """
    buffered = self.__buffer
    entry = self.__current
    entry.parameters[-1].text = buffered
505 def start_entityref(self
, attrs
):
507 p
= Parameter("entityref", name
)
508 self
.__current
.parameters
.append(p
)
510 def start_child(self
, attrs
):
512 p
= Parameter("child", name
, attrs
.get("optional") == "yes")
513 self
.__current
.parameters
.append(p
)
514 self
.__current
.empty
= 0
516 def start_content(self
, attrs
):
517 p
= Parameter("content")
518 p
.implied
= attrs
.get("implied") == "yes"
519 if self
.__current
.environment
:
521 self
.__current
.parameters
.append(p
)
522 self
.__current
.has_content
= 1
523 self
.__current
.empty
= 0
def start_text(self, attrs):
    """Handle the start of a "text" element.

    The entry currently being built is marked as not empty; *attrs* is
    not consulted.
    """
    current = self.__current
    current.empty = 0
529 p
= Parameter("text")
530 p
.text
= self
.__buffer
531 self
.__current
.parameters
.append(p
)
536 xml
.sax
.parse(fp
, ch
)
537 return ch
.get_table()
543 opts
, args
= getopt
.getopt(sys
.argv
[1:], "D", ["debug"])
544 for opt
, arg
in opts
:
545 if opt
in ("-D", "--debug"):
555 ofp
= open(args
[1], "w")
560 table
= load_table(open(os
.path
.join(sys
.path
[0], 'conversion.xml')))
561 convert(ifp
, ofp
, table
)
564 if __name__
== "__main__":