Ditched '_find_SET()', since it was a no-value-added wrapper around
[python/dscho.git] / Doc / tools / sgmlconv / docfixer.py
blob9d99c63f2c2a784ff403fba3ae3d76637b6c4df1
1 #! /usr/bin/env python
3 """Perform massive transformations on a document tree created from the LaTeX
4 of the Python documentation, and dump the ESIS data for the transformed tree.
5 """
6 __version__ = '$Revision$'
9 import errno
10 import esistools
11 import re
12 import string
13 import sys
14 import xml.dom.core
16 from xml.dom.core import \
17 ELEMENT, \
18 ENTITY_REFERENCE, \
19 TEXT
class ConversionError(Exception):
    """Raised when the document tree cannot be converted as expected."""
# ewrite() writes plain text to stderr; bwrite() writes the same text in
# bold when the terminal supports it, and is simply ewrite() otherwise.
ewrite = sys.stderr.write
try:
    # We can only do this trick on Unix (if tput is on $PATH)!
    # os.name is the portable "is this POSIX?" test; sys.platform holds
    # values such as "linux2" or "sunos5" and never equals "posix", so
    # testing it would disable bold output everywhere.
    import os
    if os.name != "posix" or not sys.stderr.isatty():
        raise ImportError
    import curses
    import commands
except ImportError:
    bwrite = ewrite
else:
    # The escape sequences are fetched once, when the function is defined.
    def bwrite(s, BOLDON=commands.getoutput("tput bold"),
               BOLDOFF=commands.getoutput("tput sgr0")):
        """Write s to stderr wrapped in the terminal's bold on/off codes."""
        ewrite("%s%s%s" % (BOLDON, s, BOLDOFF))
PARA_ELEMENT = "para"

DEBUG_PARA_FIXER = 0

# para_msg() is selected once, at import time, from the flag above.
if not DEBUG_PARA_FIXER:
    def para_msg(s):
        """Ignore s; paragraph-fixer tracing is switched off."""
        pass
else:
    def para_msg(s):
        """Write s to stderr as a '*** '-prefixed trace line."""
        ewrite("*** %s\n" % s)
# Workaround to deal with invalid documents (multiple root elements).  This
# does not indicate a bug in the DOM implementation.
#
def get_documentElement(doc):
    """Return the last element-typed child of doc, or None if there is none."""
    last_element = None
    for candidate in doc.childNodes:
        if candidate.nodeType != ELEMENT:
            continue
        # keep overwriting so the final element child wins
        last_element = candidate
    return last_element

xml.dom.core.Document.get_documentElement = get_documentElement
# Replace get_childNodes for the Document class; without this, children
# accessed from the Document object via .childNodes (no matter how many
# levels of access are used) will be given an ownerDocument of None.
#
def get_childNodes(doc):
    """Return a NodeList built from doc's underlying node and its children."""
    raw = doc._node
    return xml.dom.core.NodeList(raw.children, raw)

xml.dom.core.Document.get_childNodes = get_childNodes
def get_first_element(doc, gi):
    """Return the first direct child of doc whose node name is gi, else None."""
    for candidate in doc.childNodes:
        if candidate.get_nodeName() != gi:
            continue
        return candidate
    return None
def extract_first_element(doc, gi):
    """Detach and return the first direct child of doc named gi.

    Returns None, leaving doc untouched, when no such child exists.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found
def find_all_elements(doc, gi):
    """Collect doc itself (if named gi) plus every matching element below it.

    For each element child, the child is tested first and then everything
    its getElementsByTagName(gi) reports is appended.
    """
    matches = []
    if doc.get_nodeName() == gi:
        matches.append(doc)
    for kid in doc.childNodes:
        if kid.nodeType != ELEMENT:
            continue
        if kid.get_tagName() == gi:
            matches.append(kid)
        for descendant in kid.getElementsByTagName(gi):
            matches.append(descendant)
    return matches
def find_all_child_elements(doc, gi):
    """Return the direct children of doc whose node name is gi, in order."""
    matches = []
    for kid in doc.childNodes:
        if kid.get_nodeName() != gi:
            continue
        matches.append(kid)
    return matches
def find_all_elements_from_set(doc, gi_set):
    """Return doc and all element descendants whose node name is in gi_set."""
    collected = []
    __find_all_elements_from_set(doc, gi_set, collected)
    return collected
def __find_all_elements_from_set(doc, gi_set, nodes):
    """Recursive worker: append matches to nodes (pre-order) and return it."""
    if doc.get_nodeName() in gi_set:
        nodes.append(doc)
    for kid in doc.childNodes:
        if kid.get_nodeType() != ELEMENT:
            continue
        __find_all_elements_from_set(kid, gi_set, nodes)
    return nodes
def simplify(doc, fragment):
    """Rearrange the top level of fragment into a single-rooted shape.

    doc is used only as a factory for new text nodes; fragment is modified
    in place.  Nothing is returned.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    # <documentclass classname=...> supplies the new name for the root.
    node = extract_first_element(fragment, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
    # A top-level <title> is queued ahead of the <input> elements so that
    # it ends up first inside the root element.
    node = extract_first_element(fragment, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
    node = get_first_element(fragment, "document")
    if node is not None:
        node._node.name = documentclass
    # Pull every top-level <input> out of the fragment, in document order.
    while 1:
        node = extract_first_element(fragment, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
        docelem = get_documentElement(fragment)
        # Each node is inserted at the front, so walk the list backwards
        # to preserve the original order.
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    # Drop any text nodes left at the very start of the fragment.
    while fragment.firstChild and fragment.firstChild.get_nodeType() == TEXT:
        fragment.removeChild(fragment.firstChild)
def cleanup_root_text(doc):
    """Remove top-level text nodes, except one directly after a COMMENT node."""
    doomed = []
    after_comment = 0
    for kid in doc.childNodes:
        keep_text = after_comment
        after_comment = 0
        if kid.get_nodeType() == TEXT and not keep_text:
            doomed.append(kid)
        elif kid.get_nodeName() == "COMMENT":
            after_comment = 1
    # Removal is deferred so the child list is not changed while iterating.
    for kid in doomed:
        doc.removeChild(kid)
# Tag names that find_and_fix_descriptors() hands to rewrite_descriptor().
DESCRIPTOR_ELEMENTS = (
    "cfuncdesc",
    "cvardesc",
    "ctypedesc",
    "classdesc",
    "memberdesc",
    "memberdescni",
    "methoddesc",
    "methoddescni",
    "excdesc",
    "funcdesc",
    "funcdescni",
    "opcodedesc",
    "datadesc",
    "datadescni",
    )
def fixup_descriptors(doc, fragment):
    """Rewrite the descriptor elements found inside every <section>."""
    for section in find_all_elements(fragment, "section"):
        find_and_fix_descriptors(doc, section)
def find_and_fix_descriptors(doc, container):
    """Rewrite descriptor children of container, descending into subsections."""
    for kid in container.childNodes:
        if kid.get_nodeType() != ELEMENT:
            continue
        tag = kid.get_tagName()
        if tag in DESCRIPTOR_ELEMENTS:
            rewrite_descriptor(doc, kid)
        elif tag == "subsection":
            find_and_fix_descriptors(doc, kid)
def rewrite_descriptor(doc, descriptor):
    """Restructure one descriptor element in place.

    doc is the factory for new elements and text nodes; descriptor is the
    element being rewritten.  Raises RuntimeError when a 'var' attribute
    appears on anything other than an <opcodedesc>.
    """
    #
    # Do these things:
    #   1. Add an "index='no'" attribute to the element if the tagName
    #      ends in 'ni', removing the 'ni' from the name.
    #   2. Create a <signature> from the name attribute
    #   2a.Create an <args> if it appears to be available.
    #   3. Create additional <signature>s from <*line{,ni}> elements,
    #      if found.
    #   4. If a <versionadded> is found, move it to an attribute on the
    #      descriptor.
    #   5. Move remaining child nodes to a <description> element.
    #   6. Put it back together.
    #
    # 1.
    descname = descriptor.get_tagName()
    index = 1
    if descname[-2:] == "ni":
        descname = descname[:-2]
        descriptor.setAttribute("index", "no")
        descriptor._node.name = descname
        index = 0
    desctype = descname[:-4] # remove 'desc'
    # Name of the supplemental-signature element that goes with this
    # descriptor, e.g. "funcline" or "funclineni".
    linename = desctype + "line"
    if not index:
        linename = linename + "ni"
    # 2.
    signature = doc.createElement("signature")
    name = doc.createElement("name")
    signature.appendChild(doc.createTextNode("\n "))
    signature.appendChild(name)
    name.appendChild(doc.createTextNode(descriptor.getAttribute("name")))
    descriptor.removeAttribute("name")
    # 2a.
    if descriptor.attributes.has_key("var"):
        if descname != "opcodedesc":
            raise RuntimeError, \
                  "got 'var' attribute on descriptor other than opcodedesc"
        variable = descriptor.getAttribute("var")
        if variable:
            args = doc.createElement("args")
            args.appendChild(doc.createTextNode(variable))
            signature.appendChild(doc.createTextNode("\n "))
            signature.appendChild(args)
        descriptor.removeAttribute("var")
    newchildren = [signature]
    children = descriptor.childNodes
    pos = skip_leading_nodes(children)
    if pos < len(children):
        child = children[pos]
        if child.nodeName == "args":
            # move <args> to <signature>, or remove if empty:
            child.parentNode.removeChild(child)
            if len(child.childNodes):
                signature.appendChild(doc.createTextNode("\n "))
                signature.appendChild(child)
    signature.appendChild(doc.createTextNode("\n "))
    # 3, 4.
    # These nodes are only inspected here, not removed; step 6 discards
    # whatever is still attached to the descriptor.
    pos = skip_leading_nodes(children, pos)
    while pos < len(children) \
          and children[pos].get_nodeName() in (linename, "versionadded"):
        if children[pos].get_tagName() == linename:
            # this is really a supplemental signature, create <signature>
            sig = methodline_to_signature(doc, children[pos])
            newchildren.append(sig)
        else:
            # <versionadded added=...>
            descriptor.setAttribute(
                "added", children[pos].getAttribute("version"))
        pos = skip_leading_nodes(children, pos + 1)
    # 5.
    description = doc.createElement("description")
    description.appendChild(doc.createTextNode("\n"))
    newchildren.append(description)
    move_children(descriptor, description, pos)
    # description always holds at least the "\n" text node added above.
    last = description.childNodes[-1]
    if last.nodeType == TEXT:
        last.data = string.rstrip(last.data) + "\n "
    # 6.
    # should have nothing but whitespace and signature lines in <descriptor>;
    # discard them
    while descriptor.childNodes:
        descriptor.removeChild(descriptor.childNodes[0])
    for node in newchildren:
        descriptor.appendChild(doc.createTextNode("\n "))
        descriptor.appendChild(node)
    descriptor.appendChild(doc.createTextNode("\n"))
def methodline_to_signature(doc, methodline):
    """Build and return a <signature> element from a <*line> element.

    The 'name' attribute becomes a <name> child (and is removed from
    methodline); any children of methodline are moved into an <args> child.
    """
    signature = doc.createElement("signature")
    signature.appendChild(doc.createTextNode("\n "))
    name_text = doc.createTextNode(methodline.getAttribute("name"))
    name = doc.createElement("name")
    name.appendChild(name_text)
    methodline.removeAttribute("name")
    signature.appendChild(name)
    if len(methodline.childNodes):
        args = doc.createElement("args")
        signature.appendChild(doc.createTextNode("\n "))
        signature.appendChild(args)
        move_children(methodline, args)
    signature.appendChild(doc.createTextNode("\n "))
    return signature
def move_children(origin, dest, start=0):
    """Move origin's children from index start onward to the end of dest.

    Relies on origin.childNodes shrinking as children are removed.
    """
    kids = origin.childNodes
    while len(kids) > start:
        moving = kids[start]
        origin.removeChild(moving)
        dest.appendChild(moving)
def handle_appendix(doc, fragment):
    """Move everything after the <appendix> marker into a <back-matter>.

    doc is the factory for new nodes; fragment is modified in place.
    """
    # must be called after simplify() if document is multi-rooted to begin with
    docelem = get_documentElement(fragment)
    # NOTE(review): toplevel is computed but never used in this function.
    toplevel = docelem.get_tagName() == "manual" and "chapter" or "section"
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            # Every sibling after the one holding <appendix> is collected.
            nodes.append(node)
        elif node.nodeType == ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                # Drop the <appendix> marker itself and tidy its parent.
                parent = appnodes[0].parentNode
                parent.removeChild(appnodes[0])
                parent.normalize()
    if nodes:
        # map() is used here only for its side effects.
        map(docelem.removeChild, nodes)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        # Skip leading whitespace-only text nodes.
        while nodes and nodes[0].nodeType == TEXT \
              and not string.strip(nodes[0].data):
            del nodes[0]
        map(back.appendChild, nodes)
        docelem.appendChild(doc.createTextNode("\n"))
def handle_labels(doc, fragment):
    """Turn each <label id="..."/> into an 'id' attribute and remove it.

    The attribute is set on the label's parent, or on the grandparent when
    the parent is a <title>.  Labels with an empty id are left alone.
    doc is accepted for signature parity with the other passes and is not
    used.
    """
    for label in find_all_elements(fragment, "label"):
        label_id = label.getAttribute("id")
        if not label_id:
            continue
        parent = label.parentNode
        in_title = parent.get_tagName() == "title"
        if in_title:
            parent.parentNode.setAttribute("id", label_id)
        else:
            parent.setAttribute("id", label_id)
        # now, remove <label id="..."/> from parent:
        parent.removeChild(label)
        if in_title:
            parent.normalize()
            children = parent.childNodes
            # The label may have been the title's only child; indexing an
            # empty child list would raise IndexError.
            if len(children) and children[-1].nodeType == TEXT:
                children[-1].data = string.rstrip(children[-1].data)
def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the whitespace that ends selected elements.

    Walks the tree breadth-first from doc.  For every node whose name is a
    key of wsmap, a trailing text child has its trailing whitespace
    replaced by the mapped string.
    """
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        if wsmap.has_key(node.get_nodeName()):
            ws = wsmap[node.get_tagName()]
            children = node.childNodes
            # Look at the last child directly and tolerate elements with
            # no children at all.
            if len(children) and children[-1].nodeType == TEXT:
                children[-1].data = string.rstrip(children[-1].data) + ws
            # hack to get the title in place:
            if node.get_tagName() == "title" \
               and node.parentNode.firstChild.get_nodeType() == ELEMENT:
                # createTextNode() is the factory used everywhere else in
                # this file; the previous createText() call is not.
                node.parentNode.insertBefore(doc.createTextNode("\n "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == ELEMENT:
                queue.append(child)
def normalize(doc):
    """Call normalize() on every element that is a direct child of doc."""
    for kid in doc.childNodes:
        if kid.nodeType != ELEMENT:
            continue
        kid.normalize()
def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    Only elements whose sole child is a text node are changed.  Matching
    elements are not descended into; all other elements are.
    """
    targets = {}
    for gi in element_names:
        targets[gi] = gi
    pending = []
    for node in doc.childNodes:
        if node.nodeType == ELEMENT:
            pending.append(node)
    while pending:
        node = pending[0]
        del pending[0]
        if not targets.has_key(node.get_tagName()):
            # Not a target: keep searching below it.
            for kid in node.childNodes:
                if kid.nodeType == ELEMENT:
                    pending.append(kid)
            continue
        kids = node.childNodes
        if len(kids) != 1 or kids[0].nodeType != TEXT:
            continue
        text = kids[0].data
        if text[-2:] == "()":
            kids[0].data = text[:-2]
def contents_match(left, right):
    """Return 1 if left and right have structurally equal children, else 0.

    Text nodes compare by data and elements by tag name plus a recursive
    check; attributes are not compared.  Any other node type counts as a
    mismatch.
    """
    lkids = left.childNodes
    rkids = right.childNodes
    count = len(lkids)
    if count != len(rkids):
        return 0
    for i in range(count):
        a = lkids[i]
        b = rkids[i]
        kind = a.nodeType
        if kind != b.nodeType:
            return 0
        if kind == TEXT:
            if a.data != b.data:
                return 0
        elif kind == ELEMENT:
            if a.get_tagName() != b.get_tagName():
                return 0
            # should check attributes, but that's not a problem here
            if not contents_match(a, b):
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1
def create_module_info(doc, section):
    """Gather a section's module metadata into one <moduleinfo> element.

    Does nothing when section has no <modulesynopsis> child.  doc is the
    factory for new nodes; section is modified in place.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    node._node.name = "synopsis"
    # Drop a trailing period from the synopsis text.
    lastchild = node.childNodes[-1]
    if lastchild.nodeType == TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
    if modauthor:
        # <moduleauthor name="X"/> becomes <author>X</author>.
        modauthor._node.name = "author"
        modauthor.appendChild(doc.createTextNode(
            modauthor.getAttribute("name")))
        modauthor.removeAttribute("name")
    platform = extract_first_element(section, "platform")
    if section.get_tagName() == "section":
        # Index in section.childNodes before which <moduleinfo> is placed;
        # reset to 0 below when the title is removed from the section.
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            type = moddecl.attributes.get("type")
            if type:
                type = type.value
                modinfo.appendChild(doc.createTextNode("\n "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(type))
                modinfo.appendChild(typenode)
        versionadded = extract_first_element(section, "versionadded")
        if versionadded:
            modinfo.setAttribute("added", versionadded.getAttribute("version"))
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].get_nodeName() == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = string.lstrip(first_data.data[4:])
                title._node.name = "short-synopsis"
                if children[-1].nodeType == TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                ewrite("module name in title doesn't match"
                       " <declaremodule/>; no <short-synopsis/>\n")
        else:
            ewrite("Unexpected condition: <section/> without <title/>\n")
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(title)
        if modauthor:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(modauthor)
        if platform:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(platform)
        modinfo.appendChild(doc.createTextNode("\n "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n "), modinfo)
        #
        # The rest of this removes extra newlines from where we cut out
        # a lot of elements.  A lot of code for minimal value, but it
        # keeps the generated *ML from being too funny looking.
        #
        section.normalize()
        children = section.childNodes
        for i in range(len(children)):
            node = children[i]
            if node.get_nodeName() == "moduleinfo":
                # <moduleinfo> was inserted before an existing child, so a
                # following sibling exists.
                nextnode = children[i+1]
                if nextnode.nodeType == TEXT:
                    data = nextnode.data
                    if len(string.lstrip(data)) < (len(data) - 4):
                        nextnode.data = "\n\n\n" + string.lstrip(data)
def cleanup_synopses(doc, fragment):
    """Run create_module_info() on every <section> found in fragment."""
    sections = find_all_elements(fragment, "section")
    for section in sections:
        create_module_info(doc, section)
def fixup_table_structures(doc, fragment):
    """Run fixup_table() on every <table> found in fragment."""
    tables = find_all_elements(fragment, "table")
    for table in tables:
        fixup_table(doc, table)
def fixup_table(doc, table):
    """Rebuild table as <tgroup> containing a <thead> and a <tbody>.

    Direct <entry> children become the header row and direct <row>
    children become the body.  Raises ConversionError if anything other
    than whitespace text or <hline> elements is left over.
    """
    # create the table head
    thead = doc.createElement("thead")
    row = doc.createElement("row")
    move_elements_by_name(doc, table, row, "entry")
    thead.appendChild(doc.createTextNode("\n "))
    thead.appendChild(row)
    thead.appendChild(doc.createTextNode("\n "))
    # create the table body
    tbody = doc.createElement("tbody")
    prev_row = None
    # NOTE(review): last_was_hline is assigned but never read.
    last_was_hline = 0
    children = table.childNodes
    # An <hline> marks the row before it with rowsep="1".
    for child in children:
        if child.nodeType == ELEMENT:
            tagName = child.get_tagName()
            if tagName == "hline" and prev_row is not None:
                prev_row.setAttribute("rowsep", "1")
            elif tagName == "row":
                prev_row = child
    # save the rows:
    tbody.appendChild(doc.createTextNode("\n "))
    move_elements_by_name(doc, table, tbody, "row", sep="\n ")
    # and toss the rest:
    while children:
        child = children[0]
        nodeType = child.nodeType
        if nodeType == TEXT:
            if string.strip(child.data):
                raise ConversionError("unexpected free data in table")
            table.removeChild(child)
            continue
        if nodeType == ELEMENT:
            if child.get_tagName() != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.get_tagName())
            table.removeChild(child)
            continue
        raise ConversionError(
            "unexpected %s node in table" % child.__class__.__name__)
    # nothing left in the <table>; add the <thead> and <tbody>
    tgroup = doc.createElement("tgroup")
    tgroup.appendChild(doc.createTextNode("\n "))
    tgroup.appendChild(thead)
    tgroup.appendChild(doc.createTextNode("\n "))
    tgroup.appendChild(tbody)
    tgroup.appendChild(doc.createTextNode("\n "))
    table.appendChild(tgroup)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)
def fixup_row(doc, row):
    """Insert a "\\n " text node before every child of row except the first."""
    # Snapshot the children first; the row is modified in the second loop.
    trailing = []
    for entry in row.childNodes[1:]:
        trailing.append(entry)
    for entry in trailing:
        row.insertBefore(doc.createTextNode("\n "), entry)
#    row.appendChild(doc.createTextNode("\n "))
def move_elements_by_name(doc, source, dest, name, sep=None):
    """Move the direct children of source named name to the end of dest.

    When sep is non-empty, a text node holding sep is appended to dest
    after each moved child.
    """
    matches = []
    for candidate in source.childNodes:
        if candidate.get_nodeName() != name:
            continue
        matches.append(candidate)
    for element in matches:
        source.removeChild(element)
        dest.appendChild(element)
        if not sep:
            continue
        dest.appendChild(doc.createTextNode(sep))
# Elements that the paragraph fixer descends into rather than wrapping.
RECURSE_INTO_PARA_CONTAINERS = (
    "chapter",
    "abstract",
    "enumerate",
    "section",
    "subsection",
    "subsubsection",
    "paragraph",
    "subparagraph",
    "back-matter",
    "howto",
    "manual",
    "item",
    "itemize",
    "fulllineitems",
    "enumeration",
    "descriptionlist",
    "definitionlist",
    "definition",
    )
# Elements that end a run of paragraph material in build_para() and that
# skip_leading_nodes() steps over.
PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "verbatim", "enumerate", "item",
    "interpreter-session", "back-matter", "interactive-session",
    "opcodedesc", "classdesc", "datadesc",
    # "memberdescni" was previously misspelled "membderdescni", which can
    # never match the tag name listed in DESCRIPTOR_ELEMENTS.
    "funcdesc", "methoddesc", "excdesc", "memberdesc", "memberdescni",
    "funcdescni", "methoddescni", "excdescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor", "seealso", "itemize",
    # include <para>, so we can just do it again to get subsequent paras:
    PARA_ELEMENT,
    )
# Elements that skip_leading_nodes() steps over when looking for the
# start of paragraph material.
PARA_LEVEL_PRECEEDERS = (
    "setindexsubitem",
    "stindex",
    "obindex",
    "COMMENT",
    "label",
    "input",
    "title",
    "versionadded",
    "versionchanged",
    "declaremodule",
    "modulesynopsis",
    "moduleauthor",
    "indexterm",
    "leader",
    )
def fixup_paras(doc, fragment):
    """Build paragraphs in the top-level containers and all <description>s."""
    for kid in fragment.childNodes:
        if kid.get_nodeName() not in RECURSE_INTO_PARA_CONTAINERS:
            continue
        fixup_paras_helper(doc, kid)
    for description in find_all_elements(fragment, "description"):
        fixup_paras_helper(doc, description)
def fixup_paras_helper(doc, container, depth=0):
    """Wrap paragraph material in container, recursing into sub-containers.

    NOTE(review): the recursive call does not pass depth, so the debug
    exit below can only fire for a caller that passes depth=10 itself.
    """
    # document is already normalized
    kids = container.childNodes
    pos = skip_leading_nodes(kids)
    while pos < len(kids):
        current = kids[pos]
        if current.get_nodeName() in RECURSE_INTO_PARA_CONTAINERS:
            # Something to recurse into:
            fixup_paras_helper(doc, current)
        else:
            # Paragraph material:
            build_para(doc, container, pos, len(kids))
            if DEBUG_PARA_FIXER and depth == 10:
                sys.exit(1)
        pos = skip_leading_nodes(kids, pos + 1)
def build_para(doc, parent, start, i):
    """Wrap a run of parent's children, beginning at start, in a paragraph.

    i is the exclusive upper bound of the children considered.  Returns an
    index into parent's children that lies past the new paragraph.  Raises
    ConversionError when no content can be collected.
    """
    children = parent.childNodes
    after = start + 1
    have_last = 0
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS
    # Collect all children until \n\n+ is found in a text node or a
    # member of BREAK_ELEMENTS is found.
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == ELEMENT:
            if child.get_tagName() in BREAK_ELEMENTS:
                after = j
                break
        elif nodeType == TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                after = j
                break
            if pos >= 1:
                # The part before the blank line stays in the paragraph.
                child.splitText(pos)
                break
    else:
        # The loop ran to completion: the paragraph reaches the last child.
        have_last = 1
    if (start + 1) > after:
        raise ConversionError(
            "build_para() could not identify content to turn into a paragraph")
    if children[after - 1].nodeType == TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if string.rstrip(data) != data:
            have_last = 0
            child.splitText(len(string.rstrip(data)))
    para = doc.createElement(PARA_ELEMENT)
    prev = None
    # Move the collected nodes back to front, each one inserted before the
    # previously moved node, which preserves their order.
    indexes = range(start, after)
    indexes.reverse()
    for j in indexes:
        node = parent.childNodes[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if have_last:
        parent.appendChild(para)
        parent.appendChild(doc.createTextNode("\n\n"))
        return len(parent.childNodes)
    else:
        # Make sure a newline separates the paragraph from what follows.
        nextnode = parent.childNodes[start]
        if nextnode.nodeType == TEXT:
            if nextnode.data and nextnode.data[0] != "\n":
                nextnode.data = "\n" + nextnode.data
        else:
            newnode = doc.createTextNode("\n")
            parent.insertBefore(newnode, nextnode)
            nextnode = newnode
            start = start + 1
        parent.insertBefore(para, nextnode)
        return start + 1
def skip_leading_nodes(children, start=0):
    """Return index into children of a node at which paragraph building should
    begin or a recursive call to fixup_paras_helper() should be made (for
    subsections, etc.).

    When the return value >= len(children), we've built all the paras we can
    from this list of children.
    """
    skippable = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    limit = len(children)
    while start < limit:
        # skip over leading comments and whitespace:
        node = children[start]
        kind = node.nodeType
        if kind == TEXT:
            text = node.data
            stripped = string.lstrip(text)
            if stripped:
                if text == stripped:
                    return start
                # break into two nodes: whitespace and non-whitespace
                node.splitText(len(text) - len(stripped))
                return start + 1
            # all whitespace, just skip
        elif kind == ELEMENT:
            tag = node.get_tagName()
            if tag in RECURSE_INTO_PARA_CONTAINERS or tag not in skippable:
                return start
        start = start + 1
    return start
def fixup_rfc_references(doc, fragment):
    """Append the text "RFC <num>" to every <rfc> element in fragment."""
    for rfc in find_all_elements(fragment, "rfc"):
        label = "RFC " + rfc.getAttribute("num")
        rfc.appendChild(doc.createTextNode(label))
def fixup_signatures(doc, fragment):
    """Flatten <optional> markup inside <args> and <constructor-args>.

    Every argument list under each top-level element is passed through
    fixup_args() and then normalized.
    """
    for kid in fragment.childNodes:
        if kid.nodeType != ELEMENT:
            continue
        for tag in ("args", "constructor-args"):
            for arglist in kid.getElementsByTagName(tag):
                fixup_args(doc, arglist)
                arglist.normalize()
def fixup_args(doc, arglist):
    """Replace each <optional> child of arglist with "[" ... "]" text.

    The contents of an <optional> are spliced into arglist between the
    brackets.  The scan restarts after every replacement, so an <optional>
    that was nested inside another is handled too.  Returns None.
    """
    while 1:
        optional = None
        for candidate in arglist.childNodes:
            if candidate.get_nodeName() == "optional":
                optional = candidate
                break
        if optional is None:
            return None
        # found it; fix and rescan
        arglist.insertBefore(doc.createTextNode("["), optional)
        inner = optional.childNodes
        while inner:
            first = inner[0]
            optional.removeChild(first)
            arglist.insertBefore(first, optional)
        arglist.insertBefore(doc.createTextNode("]"), optional)
        arglist.removeChild(optional)
def fixup_sectionauthors(doc, fragment):
    """Turn each <sectionauthor name="X"/> into <author>X</author>.

    The renamed element is re-inserted before the section's child at
    index 2 when the child at index 1 is a <title>, otherwise before the
    first child.
    """
    for author in find_all_elements(fragment, "sectionauthor"):
        section = author.parentNode
        section.removeChild(author)
        author._node.name = "author"
        name_text = doc.createTextNode(author.getAttribute("name"))
        author.appendChild(name_text)
        author.removeAttribute("name")
        kids = section.childNodes
        anchor = kids[2]
        if kids[1].get_nodeName() != "title":
            anchor = kids[0]
        section.insertBefore(doc.createTextNode("\n "), anchor)
        section.insertBefore(author, anchor)
def fixup_verbatims(doc):
    """Rename <verbatim> elements whose text starts with ">>>".

    Such elements become <interactive-session>; leading whitespace is
    ignored when testing for the prompt.
    """
    for verbatim in find_all_elements(doc, "verbatim"):
        first = verbatim.childNodes[0]
        if first.nodeType != TEXT:
            continue
        if string.lstrip(first.data)[:3] == ">>>":
            verbatim._node.name = "interactive-session"
def add_node_ids(fragment, counter=0):
    """Store a node_id on every node's _node, numbering in document order.

    fragment gets counter; its descendants get successive numbers.
    Returns an id one past the last value used.
    """
    fragment._node.node_id = counter
    for kid in fragment.childNodes:
        next_id = counter + 1
        if kid.nodeType == ELEMENT:
            counter = add_node_ids(kid, next_id)
        else:
            kid._node.node_id = next_id
            counter = next_id
    return counter + 1
# Tag names of the module index entries handled by fixup_refmodindexes().
REFMODINDEX_ELEMENTS = (
    'refmodindex',
    'refbimodindex',
    'refexmodindex',
    'refstmodindex',
    )
def fixup_refmodindexes(fragment):
    """Process each distinct parent of a <ref*modindex> element once."""
    # Locate <ref*modindex>...</> co-located with <module>...</>, and
    # remove the <ref*modindex>, replacing it with index=index on the
    # <module> element.
    parents = {}
    for entry in find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS):
        container = entry.parentNode
        # keyed by node_id so each parent appears only once
        parents[container._node.node_id] = container
    for container in parents.values():
        fixup_refmodindexes_chunk(container)
def fixup_refmodindexes_chunk(container):
    """Fold <ref*modindex> entries under container into matching <module>s.

    A <module> whose single text child equals an entry's 'module'
    attribute gets index="yes", and the entry is then removed.  Entries
    that have children are reported on stderr and left in place.
    """
    # node is probably a <para>; let's see how often it isn't:
    if container.get_tagName() != PARA_ELEMENT:
        bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container)
    module_entries = find_all_elements(container, "module")
    if not module_entries:
        return
    index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS)
    removes = []
    for entry in index_entries:
        children = entry.childNodes
        if len(children) != 0:
            bwrite("--- unexpected number of children for %s node:\n"
                   % entry.get_tagName())
            ewrite(entry.toxml() + "\n")
            continue
        found = 0
        module_name = entry.getAttribute("module")
        for node in module_entries:
            if len(node.childNodes) != 1:
                continue
            this_name = node.childNodes[0].data
            if this_name == module_name:
                found = 1
                node.setAttribute("index", "yes")
        if found:
            removes.append(entry)
    for node in removes:
        # The entries were collected from the whole subtree, so an entry
        # is not necessarily a direct child of container; detach it from
        # its own parent instead.
        node.parentNode.removeChild(node)
def fixup_bifuncindexes(fragment):
    """Process each distinct parent of a <bifuncindex> element once."""
    parents = {}
    # make sure that each parent is only processed once:
    for entry in find_all_elements(fragment, 'bifuncindex'):
        container = entry.parentNode
        parents[container._node.node_id] = container
    for container in parents.values():
        fixup_bifuncindexes_chunk(container)
def fixup_bifuncindexes_chunk(container):
    """Fold <bifuncindex> children of container into sibling <function>s.

    A <function> whose text is "<name>()" for an entry's 'name' attribute
    gets index="yes" and module="__builtin__"; an entry with at least one
    match is removed from container.
    """
    doomed = []
    index_entries = find_all_child_elements(container, "bifuncindex")
    functions = find_all_child_elements(container, "function")
    for entry in index_entries:
        wanted = entry.getAttribute("name")
        matched = 0
        for function in functions:
            text = function.childNodes[0].data
            if text[-2:] != "()":
                continue
            if text[:-2] != wanted:
                continue
            function.setAttribute("index", "yes")
            function.setAttribute("module", "__builtin__")
            if not matched:
                # queue the entry for removal only once
                matched = 1
                doomed.append(entry)
    for entry in doomed:
        container.removeChild(entry)
def join_adjacent_elements(container, gi):
    """Merge directly adjacent sibling elements named gi, tree-wide.

    The children of the second element are appended to the first and the
    second element is removed; the merge is reported on stderr.
    """
    queue = [container]
    while queue:
        parent = queue.pop()
        i = 0
        children = parent.get_childNodes()
        # The length is re-read on every pass because merging removes a
        # child; a cached count would let children[i+1] run past the end.
        # Running to the last index (not last - 1) also lets the final
        # child be queued for descent.
        while i < len(children):
            child = children[i]
            if child.nodeName == gi \
               and (i + 1) < len(children) \
               and children[i+1].nodeName == gi:
                ewrite("--- merging two <%s/> elements\n" % gi)
                nextchild = children[i+1]
                nextchildren = nextchild.get_childNodes()
                while len(nextchildren):
                    node = nextchildren[0]
                    nextchild.removeChild(node)
                    child.appendChild(node)
                parent.removeChild(nextchild)
                # stay on the same index: the new neighbour may match too
                continue
            if child.nodeType == ELEMENT:
                queue.append(child)
            i = i + 1
# Attribute values matching this pattern are written as TOKEN; all others
# are written as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")

def write_esis(doc, ofp, knownempty):
    """Write the children of doc to ofp as ESIS event lines, recursively.

    knownempty is called with a tag name and returns true for elements
    declared empty.  Raises ValueError if such an element has children,
    and RuntimeError for a node type other than element, text or entity
    reference.
    """
    emit = ofp.write
    for node in doc.childNodes:
        kind = node.nodeType
        if kind == TEXT:
            emit("-%s\n" % esistools.encode(node.data))
        elif kind == ENTITY_REFERENCE:
            emit("&%s\n" % node.get_nodeName())
        elif kind == ELEMENT:
            gi = node.get_tagName()
            if knownempty(gi):
                if node.hasChildNodes():
                    raise ValueError(
                        "declared-empty node <%s> has children" % gi)
                emit("e\n")
            for attrname, attr in node.attributes.items():
                value = attr.value
                dtype = "CDATA"
                if _token_rx.match(value):
                    dtype = "TOKEN"
                emit("A%s %s %s\n"
                     % (attrname, dtype, esistools.encode(value)))
            emit("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            emit(")%s\n" % gi)
        else:
            raise RuntimeError("unsupported node type: %s" % kind)
def convert(ifp, ofp):
    """Read ESIS data from ifp, transform the tree, write ESIS to ofp.

    The fix-up passes below run in a fixed order.  An IOError carrying
    EPIPE while writing is ignored; any other IOError propagates.
    """
    p = esistools.ExtendedEsisBuilder()
    p.feed(ifp.read())
    doc = p.document
    fragment = p.fragment
    normalize(fragment)
    simplify(doc, fragment)
    handle_labels(doc, fragment)
    handle_appendix(doc, fragment)
    # Maps element name -> whitespace that should end that element.
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(fragment, ["function", "method", "cfunction"])
    cleanup_synopses(doc, fragment)
    fixup_descriptors(doc, fragment)
    fixup_verbatims(fragment)
    normalize(fragment)
    fixup_paras(doc, fragment)
    fixup_sectionauthors(doc, fragment)
    fixup_table_structures(doc, fragment)
    fixup_rfc_references(doc, fragment)
    fixup_signatures(doc, fragment)
    # The two index passes below key their parent lookups on node_id, so
    # the ids must be assigned first.
    add_node_ids(fragment)
    fixup_refmodindexes(fragment)
    fixup_bifuncindexes(fragment)
    # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and
    # LaTeX2HTML screwing with GNU-style long options (the '--' problem).
    join_adjacent_elements(fragment, "option")
    #
    # Build the "declared empty" test; <rfc> is taken out of the set
    # because fixup_rfc_references() gives it a text child.
    d = {}
    for gi in p.get_empties():
        d[gi] = gi
    if d.has_key("rfc"):
        del d["rfc"]
    knownempty = d.has_key
    #
    try:
        write_esis(fragment, ofp, knownempty)
    except IOError, (err, msg):
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other errors
        # should still be reported,
        if err != errno.EPIPE:
            raise
def main():
    """Command-line entry point: docfixer.py [infile [outfile]].

    With no arguments, reads stdin and writes stdout; with one, reads the
    named file; with two, also writes the named output file.  Any other
    argument count prints a usage line to stderr and exits with status 2.
    """
    nargs = len(sys.argv)
    if nargs not in (1, 2, 3):
        # There is no usage() helper in this file; report directly.
        sys.stderr.write("usage: docfixer.py [infile [outfile]]\n")
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    if nargs >= 2:
        ifp = open(sys.argv[1])
    if nargs == 3:
        ofp = open(sys.argv[2], "w")
    try:
        convert(ifp, ofp)
    finally:
        # Close only the files opened here, never the standard streams.
        if ofp is not sys.stdout:
            ofp.close()
        if ifp is not sys.stdin:
            ifp.close()


if __name__ == "__main__":
    main()