Doc/tools/sgmlconv/docfixer.py

   1 #! /usr/bin/env python
   2
   3 """Perform massive transformations on a document tree created from the LaTeX
   4 of the Python documentation, and dump the ESIS data for the transformed tree.
   5 """
   6 __version__ = '$Revision$'
   7
   8
   9 import errno
  10 import esistools
  11 import re
  12 import string
  13 import sys
  14 import xml.dom.core
  15
  16 from xml.dom.core import \
  17      ELEMENT, \
  18      ENTITY_REFERENCE, \
  19      TEXT
  20
  21
  22 class ConversionError(Exception):
  23     pass
  24
  25
  26 ewrite = sys.stderr.write
  27 try:
  28     # We can only do this trick on Unix (if tput is on $PATH)!
  29     if sys.platform != "posix" or not sys.stderr.isatty():
  30         raise ImportError
  31     import curses
  32     import commands
  33 except ImportError:
  34     bwrite = ewrite
  35 else:
  36     def bwrite(s, BOLDON=commands.getoutput("tput bold"),
  37                BOLDOFF=commands.getoutput("tput sgr0")):
  38         ewrite("%s%s%s" % (BOLDON, s, BOLDOFF))
  39
  40
  41 PARA_ELEMENT = "para"
  42
  43 DEBUG_PARA_FIXER = 0
  44
  45 if DEBUG_PARA_FIXER:
  46     def para_msg(s):
  47         ewrite("*** %s\n" % s)
  48 else:
  49     def para_msg(s):
  50         pass
  51
  52
  53 # Workaround to deal with invalid documents (multiple root elements).  This
  54 # does not indicate a bug in the DOM implementation.
  55 #
  56 def get_documentElement(doc):
  57     docelem = None
  58     for n in doc.childNodes:
  59         if n.nodeType == ELEMENT:
  60             docelem = n
  61     return docelem
  62
# Install get_documentElement() as the accessor on the Document class.
xml.dom.core.Document.get_documentElement = get_documentElement
  64
  65
  66 # Replace get_childNodes for the Document class; without this, children
  67 # accessed from the Document object via .childNodes (no matter how many
  68 # levels of access are used) will be given an ownerDocument of None.
  69 #
  70 def get_childNodes(doc):
  71     return xml.dom.core.NodeList(doc._node.children, doc._node)
  72
# Install get_childNodes() as the accessor on the Document class.
xml.dom.core.Document.get_childNodes = get_childNodes
  74
  75
  76 def get_first_element(doc, gi):
  77     for n in doc.childNodes:
  78         if n.get_nodeName() == gi:
  79             return n
  80
  81 def extract_first_element(doc, gi):
  82     node = get_first_element(doc, gi)
  83     if node is not None:
  84         doc.removeChild(node)
  85     return node
  86
  87
  88 def find_all_elements(doc, gi):
  89     nodes = []
  90     if doc.get_nodeName() == gi:
  91         nodes.append(doc)
  92     for child in doc.childNodes:
  93         if child.nodeType == ELEMENT:
  94             if child.get_tagName() == gi:
  95                 nodes.append(child)
  96             for node in child.getElementsByTagName(gi):
  97                 nodes.append(node)
  98     return nodes
  99
 100 def find_all_child_elements(doc, gi):
 101     nodes = []
 102     for child in doc.childNodes:
 103         if child.get_nodeName() == gi:
 104             nodes.append(child)
 105     return nodes
 106
 107 def find_all_elements_from_set(doc, gi_set):
 108     return __find_all_elements_from_set(doc, gi_set, [])
 109
 110 def __find_all_elements_from_set(doc, gi_set, nodes):
 111     if doc.get_nodeName() in gi_set:
 112         nodes.append(doc)
 113     for child in doc.childNodes:
 114         if child.get_nodeType() == ELEMENT:
 115             __find_all_elements_from_set(child, gi_set, nodes)
 116     return nodes
 117
 118
def simplify(doc, fragment):
    """Reshape the top level of *fragment* around its document element.

    The top-level <documentclass> is removed, and its "classname" attribute
    (default "document") becomes the new name of the top-level <document>
    element.  The top-level <title> and every top-level <input> are moved
    to the front of the document element, each followed by a newline text
    node.  Finally, leading text nodes are removed from *fragment*.

    *doc* is only used to create the new text nodes.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    node = extract_first_element(fragment, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
    node = extract_first_element(fragment, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
    node = get_first_element(fragment, "document")
    if node is not None:
        node._node.name = documentclass
    # pull out every top-level <input>, in document order
    while 1:
        node = extract_first_element(fragment, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
        docelem = get_documentElement(fragment)
        # Each node is pushed onto the front, so walk the list backwards to
        # end up with the original order (title first, then the inputs).
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    # drop whatever text precedes the first non-text node of the fragment
    while fragment.firstChild and fragment.firstChild.get_nodeType() == TEXT:
        fragment.removeChild(fragment.firstChild)
 149
 150
 151 def cleanup_root_text(doc):
 152     discards = []
 153     skip = 0
 154     for n in doc.childNodes:
 155         prevskip = skip
 156         skip = 0
 157         if n.get_nodeType() == TEXT and not prevskip:
 158             discards.append(n)
 159         elif n.get_nodeName() == "COMMENT":
 160             skip = 1
 161     for node in discards:
 162         doc.removeChild(node)
 163
 164
 165 DESCRIPTOR_ELEMENTS = (
 166     "cfuncdesc", "cvardesc", "ctypedesc",
 167     "classdesc", "memberdesc", "memberdescni", "methoddesc", "methoddescni",
 168     "excdesc", "funcdesc", "funcdescni", "opcodedesc",
 169     "datadesc", "datadescni",
 170     )
 171
 172 def fixup_descriptors(doc, fragment):
 173     sections = find_all_elements(fragment, "section")
 174     for section in sections:
 175         find_and_fix_descriptors(doc, section)
 176
 177
 178 def find_and_fix_descriptors(doc, container):
 179     children = container.childNodes
 180     for child in children:
 181         if child.get_nodeType() == ELEMENT:
 182             tagName = child.get_tagName()
 183             if tagName in DESCRIPTOR_ELEMENTS:
 184                 rewrite_descriptor(doc, child)
 185             elif tagName == "subsection":
 186                 find_and_fix_descriptors(doc, child)
 187
 188
 189 def rewrite_descriptor(doc, descriptor):
 190     #
 191     # Do these things:
 192     #   1. Add an "index='no'" attribute to the element if the tagName
 193     #      ends in 'ni', removing the 'ni' from the name.
 194     #   2. Create a <signature> from the name attribute
 195     #   2a.Create an <args> if it appears to be available.
 196     #   3. Create additional <signature>s from <*line{,ni}> elements,
 197     #      if found.
 198     #   4. If a <versionadded> is found, move it to an attribute on the
 199     #      descriptor.
 200     #   5. Move remaining child nodes to a <description> element.
 201     #   6. Put it back together.
 202     #
 203     # 1.
 204     descname = descriptor.get_tagName()
 205     index = 1
 206     if descname[-2:] == "ni":
 207         descname = descname[:-2]
 208         descriptor.setAttribute("index", "no")
 209         descriptor._node.name = descname
 210         index = 0
 211     desctype = descname[:-4] # remove 'desc'
 212     linename = desctype + "line"
 213     if not index:
 214         linename = linename + "ni"
 215     # 2.
 216     signature = doc.createElement("signature")
 217     name = doc.createElement("name")
 218     signature.appendChild(doc.createTextNode("\n    "))
 219     signature.appendChild(name)
 220     name.appendChild(doc.createTextNode(descriptor.getAttribute("name")))
 221     descriptor.removeAttribute("name")
 222     # 2a.
 223     if descriptor.attributes.has_key("var"):
 224         if descname != "opcodedesc":
 225             raise RuntimeError, \
 226                   "got 'var' attribute on descriptor other than opcodedesc"
 227         variable = descriptor.getAttribute("var")
 228         if variable:
 229             args = doc.createElement("args")
 230             args.appendChild(doc.createTextNode(variable))
 231             signature.appendChild(doc.createTextNode("\n    "))
 232             signature.appendChild(args)
 233         descriptor.removeAttribute("var")
 234     newchildren = [signature]
 235     children = descriptor.childNodes
 236     pos = skip_leading_nodes(children)
 237     if pos < len(children):
 238         child = children[pos]
 239         if child.nodeName == "args":
 240             # move <args> to <signature>, or remove if empty:
 241             child.parentNode.removeChild(child)
 242             if len(child.childNodes):
 243                 signature.appendChild(doc.createTextNode("\n    "))
 244                 signature.appendChild(child)
 245     signature.appendChild(doc.createTextNode("\n  "))
 246     # 3, 4.
 247     pos = skip_leading_nodes(children, pos)
 248     while pos < len(children) \
 249           and children[pos].get_nodeName() in (linename, "versionadded"):
 250         if children[pos].get_tagName() == linename:
 251             # this is really a supplemental signature, create <signature>
 252             sig = methodline_to_signature(doc, children[pos])
 253             newchildren.append(sig)
 254         else:
 255             # <versionadded added=...>
 256             descriptor.setAttribute(
 257                 "added", children[pos].getAttribute("version"))
 258         pos = skip_leading_nodes(children, pos + 1)
 259     # 5.
 260     description = doc.createElement("description")
 261     description.appendChild(doc.createTextNode("\n"))
 262     newchildren.append(description)
 263     move_children(descriptor, description, pos)
 264     last = description.childNodes[-1]
 265     if last.nodeType == TEXT:
 266         last.data = string.rstrip(last.data) + "\n  "
 267     # 6.
 268     # should have nothing but whitespace and signature lines in <descriptor>;
 269     # discard them
 270     while descriptor.childNodes:
 271         descriptor.removeChild(descriptor.childNodes[0])
 272     for node in newchildren:
 273         descriptor.appendChild(doc.createTextNode("\n  "))
 274         descriptor.appendChild(node)
 275     descriptor.appendChild(doc.createTextNode("\n"))
 276
 277
 278 def methodline_to_signature(doc, methodline):
 279     signature = doc.createElement("signature")
 280     signature.appendChild(doc.createTextNode("\n    "))
 281     name = doc.createElement("name")
 282     name.appendChild(doc.createTextNode(methodline.getAttribute("name")))
 283     methodline.removeAttribute("name")
 284     signature.appendChild(name)
 285     if len(methodline.childNodes):
 286         args = doc.createElement("args")
 287         signature.appendChild(doc.createTextNode("\n    "))
 288         signature.appendChild(args)
 289         move_children(methodline, args)
 290     signature.appendChild(doc.createTextNode("\n  "))
 291     return signature
 292
 293
 294 def move_children(origin, dest, start=0):
 295     children = origin.childNodes
 296     while start < len(children):
 297         node = children[start]
 298         origin.removeChild(node)
 299         dest.appendChild(node)
 300
 301
 302 def handle_appendix(doc, fragment):
 303     # must be called after simplfy() if document is multi-rooted to begin with
 304     docelem = get_documentElement(fragment)
 305     toplevel = docelem.get_tagName() == "manual" and "chapter" or "section"
 306     appendices = 0
 307     nodes = []
 308     for node in docelem.childNodes:
 309         if appendices:
 310             nodes.append(node)
 311         elif node.nodeType == ELEMENT:
 312             appnodes = node.getElementsByTagName("appendix")
 313             if appnodes:
 314                 appendices = 1
 315                 parent = appnodes[0].parentNode
 316                 parent.removeChild(appnodes[0])
 317                 parent.normalize()
 318     if nodes:
 319         map(docelem.removeChild, nodes)
 320         docelem.appendChild(doc.createTextNode("\n\n\n"))
 321         back = doc.createElement("back-matter")
 322         docelem.appendChild(back)
 323         back.appendChild(doc.createTextNode("\n"))
 324         while nodes and nodes[0].nodeType == TEXT \
 325               and not string.strip(nodes[0].data):
 326             del nodes[0]
 327         map(back.appendChild, nodes)
 328         docelem.appendChild(doc.createTextNode("\n"))
 329
 330
 331 def handle_labels(doc, fragment):
 332     for label in find_all_elements(fragment, "label"):
 333         id = label.getAttribute("id")
 334         if not id:
 335             continue
 336         parent = label.parentNode
 337         parentTagName = parent.get_tagName()
 338         if parentTagName == "title":
 339             parent.parentNode.setAttribute("id", id)
 340         else:
 341             parent.setAttribute("id", id)
 342         # now, remove <label id="..."/> from parent:
 343         parent.removeChild(label)
 344         if parentTagName == "title":
 345             parent.normalize()
 346             children = parent.childNodes
 347             if children[-1].nodeType == TEXT:
 348                 children[-1].data = string.rstrip(children[-1].data)
 349
 350
 351 def fixup_trailing_whitespace(doc, wsmap):
 352     queue = [doc]
 353     while queue:
 354         node = queue[0]
 355         del queue[0]
 356         if wsmap.has_key(node.get_nodeName()):
 357             ws = wsmap[node.get_tagName()]
 358             children = node.childNodes
 359             children.reverse()
 360             if children[0].nodeType == TEXT:
 361                 data = string.rstrip(children[0].data) + ws
 362                 children[0].data = data
 363             children.reverse()
 364             # hack to get the title in place:
 365             if node.get_tagName() == "title" \
 366                and node.parentNode.firstChild.get_nodeType() == ELEMENT:
 367                 node.parentNode.insertBefore(doc.createText("\n  "),
 368                                              node.parentNode.firstChild)
 369         for child in node.childNodes:
 370             if child.nodeType == ELEMENT:
 371                 queue.append(child)
 372
 373
 374 def normalize(doc):
 375     for node in doc.childNodes:
 376         if node.nodeType == ELEMENT:
 377             node.normalize()
 378
 379
 380 def cleanup_trailing_parens(doc, element_names):
 381     d = {}
 382     for gi in element_names:
 383         d[gi] = gi
 384     rewrite_element = d.has_key
 385     queue = []
 386     for node in doc.childNodes:
 387         if node.nodeType == ELEMENT:
 388             queue.append(node)
 389     while queue:
 390         node = queue[0]
 391         del queue[0]
 392         if rewrite_element(node.get_tagName()):
 393             children = node.childNodes
 394             if len(children) == 1 \
 395                and children[0].nodeType == TEXT:
 396                 data = children[0].data
 397                 if data[-2:] == "()":
 398                     children[0].data = data[:-2]
 399         else:
 400             for child in node.childNodes:
 401                 if child.nodeType == ELEMENT:
 402                     queue.append(child)
 403
 404
 405 def contents_match(left, right):
 406     left_children = left.childNodes
 407     right_children = right.childNodes
 408     if len(left_children) != len(right_children):
 409         return 0
 410     for l, r in map(None, left_children, right_children):
 411         nodeType = l.nodeType
 412         if nodeType != r.nodeType:
 413             return 0
 414         if nodeType == ELEMENT:
 415             if l.get_tagName() != r.get_tagName():
 416                 return 0
 417             # should check attributes, but that's not a problem here
 418             if not contents_match(l, r):
 419                 return 0
 420         elif nodeType == TEXT:
 421             if l.data != r.data:
 422                 return 0
 423         else:
 424             # not quite right, but good enough
 425             return 0
 426     return 1
 427
 428
def create_module_info(doc, section):
    """Collect the module metadata of *section* into a <moduleinfo>.

    Nothing happens unless *section* has a <modulesynopsis> child.  That
    child and any <moduleauthor> and <platform> children are removed from
    *section*.  Only when the tag name of *section* is "section" is a
    <moduleinfo> element built from them (plus <declaremodule>,
    <versionadded> and, where it matches, the <title>) and inserted back
    into *section*.

    *doc* is used to create the new elements and text nodes.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    node._node.name = "synopsis"
    # NOTE(review): assumes the synopsis has at least one child node.
    lastchild = node.childNodes[-1]
    if lastchild.nodeType == TEXT \
       and lastchild.data[-1:] == ".":
        # drop the trailing period
        lastchild.data = lastchild.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
    if modauthor:
        # <moduleauthor name="X"/> becomes <author>X</author>
        modauthor._node.name = "author"
        modauthor.appendChild(doc.createTextNode(
            modauthor.getAttribute("name")))
        modauthor.removeAttribute("name")
    platform = extract_first_element(section, "platform")
    if section.get_tagName() == "section":
        # index in section.childNodes at which <moduleinfo> gets inserted;
        # reset to 0 below when the <title> is taken out of the section
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n    "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            type = moddecl.attributes.get("type")
            if type:
                type = type.value
                modinfo.appendChild(doc.createTextNode("\n    "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(type))
                modinfo.appendChild(typenode)
        versionadded = extract_first_element(section, "versionadded")
        if versionadded:
            modinfo.setAttribute("added", versionadded.getAttribute("version"))
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].get_nodeName() == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = string.lstrip(first_data.data[4:])
                title._node.name = "short-synopsis"
                if children[-1].nodeType == TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                # also remove the node now first in the section
                section.removeChild(section.childNodes[0])
                # drop the leading <module> from the short synopsis
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                ewrite("module name in title doesn't match"
                       " <declaremodule/>; no <short-synopsis/>\n")
        else:
            ewrite("Unexpected condition: <section/> without <title/>\n")
        modinfo.appendChild(doc.createTextNode("\n    "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(title)
        if modauthor:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(modauthor)
        if platform:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(platform)
        modinfo.appendChild(doc.createTextNode("\n  "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n  "), modinfo)
        #
        # The rest of this removes extra newlines from where we cut out
        # a lot of elements.  A lot of code for minimal value, but it
        # keeps the generated *ML from being too funny looking.
        #
        section.normalize()
        children = section.childNodes
        for i in range(len(children)):
            node = children[i]
            if node.get_nodeName() == "moduleinfo":
                # NOTE(review): assumes <moduleinfo> is never the last
                # child of the section.
                nextnode = children[i+1]
                if nextnode.nodeType == TEXT:
                    data = nextnode.data
                    # more than four leading whitespace characters:
                    # replace them with exactly three newlines
                    if len(string.lstrip(data)) < (len(data) - 4):
                        nextnode.data = "\n\n\n" + string.lstrip(data)
 521
 522
 523 def cleanup_synopses(doc, fragment):
 524     for node in find_all_elements(fragment, "section"):
 525         create_module_info(doc, node)
 526
 527
 528 def fixup_table_structures(doc, fragment):
 529     for table in find_all_elements(fragment, "table"):
 530         fixup_table(doc, table)
 531
 532
def fixup_table(doc, table):
    """Rebuild *table* as a <tgroup> holding a <thead> and a <tbody>.

    The <entry> children of *table* become a single header <row> inside
    <thead>; the <row> children move to <tbody>.  A <row> followed by an
    <hline> gets rowsep="1".  Whatever is left must be whitespace-only
    text or <hline> elements, which are discarded.

    Raises ConversionError if anything else remains in the table.
    """
    # create the table head
    thead = doc.createElement("thead")
    row = doc.createElement("row")
    move_elements_by_name(doc, table, row, "entry")
    thead.appendChild(doc.createTextNode("\n    "))
    thead.appendChild(row)
    thead.appendChild(doc.createTextNode("\n    "))
    # create the table body
    tbody = doc.createElement("tbody")
    prev_row = None
    # NOTE(review): last_was_hline is assigned but never read in this function.
    last_was_hline = 0
    children = table.childNodes
    for child in children:
        if child.nodeType == ELEMENT:
            tagName = child.get_tagName()
            if tagName == "hline" and prev_row is not None:
                # mark the most recent row as followed by a rule
                prev_row.setAttribute("rowsep", "1")
            elif tagName == "row":
                prev_row = child
    # save the rows:
    tbody.appendChild(doc.createTextNode("\n    "))
    move_elements_by_name(doc, table, tbody, "row", sep="\n    ")
    # and toss the rest:
    # (this loop ends only if `children` reflects the removals made below)
    while children:
        child = children[0]
        nodeType = child.nodeType
        if nodeType == TEXT:
            if string.strip(child.data):
                raise ConversionError("unexpected free data in table")
            table.removeChild(child)
            continue
        if nodeType == ELEMENT:
            if child.get_tagName() != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.get_tagName())
            table.removeChild(child)
            continue
        raise ConversionError(
            "unexpected %s node in table" % child.__class__.__name__)
    # nothing left in the <table>; add the <thead> and <tbody>
    tgroup = doc.createElement("tgroup")
    tgroup.appendChild(doc.createTextNode("\n  "))
    tgroup.appendChild(thead)
    tgroup.appendChild(doc.createTextNode("\n  "))
    tgroup.appendChild(tbody)
    tgroup.appendChild(doc.createTextNode("\n  "))
    table.appendChild(tgroup)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)
 584
 585
 586 def fixup_row(doc, row):
 587     entries = []
 588     map(entries.append, row.childNodes[1:])
 589     for entry in entries:
 590         row.insertBefore(doc.createTextNode("\n         "), entry)
 591 #    row.appendChild(doc.createTextNode("\n      "))
 592
 593
 594 def move_elements_by_name(doc, source, dest, name, sep=None):
 595     nodes = []
 596     for child in source.childNodes:
 597         if child.get_nodeName() == name:
 598             nodes.append(child)
 599     for node in nodes:
 600         source.removeChild(node)
 601         dest.appendChild(node)
 602         if sep:
 603             dest.appendChild(doc.createTextNode(sep))
 604
 605
 606 RECURSE_INTO_PARA_CONTAINERS = (
 607     "chapter", "abstract", "enumerate",
 608     "section", "subsection", "subsubsection",
 609     "paragraph", "subparagraph", "back-matter",
 610     "howto", "manual",
 611     "item", "itemize", "fulllineitems", "enumeration", "descriptionlist",
 612     "definitionlist", "definition",
 613     )
 614
 615 PARA_LEVEL_ELEMENTS = (
 616     "moduleinfo", "title", "verbatim", "enumerate", "item",
 617     "interpreter-session", "back-matter", "interactive-session",
 618     "opcodedesc", "classdesc", "datadesc",
 619     "funcdesc", "methoddesc", "excdesc", "memberdesc", "membderdescni",
 620     "funcdescni", "methoddescni", "excdescni",
 621     "tableii", "tableiii", "tableiv", "localmoduletable",
 622     "sectionauthor", "seealso", "itemize",
 623     # include <para>, so we can just do it again to get subsequent paras:
 624     PARA_ELEMENT,
 625     )
 626
 627 PARA_LEVEL_PRECEEDERS = (
 628     "setindexsubitem",
 629     "stindex", "obindex", "COMMENT", "label", "input", "title",
 630     "versionadded", "versionchanged", "declaremodule", "modulesynopsis",
 631     "moduleauthor", "indexterm", "leader",
 632     )
 633
 634
 635 def fixup_paras(doc, fragment):
 636     for child in fragment.childNodes:
 637         if child.get_nodeName() in RECURSE_INTO_PARA_CONTAINERS:
 638             fixup_paras_helper(doc, child)
 639     descriptions = find_all_elements(fragment, "description")
 640     for description in descriptions:
 641         fixup_paras_helper(doc, description)
 642
 643
def fixup_paras_helper(doc, container, depth=0):
    """Wrap the loose content of *container* in paragraph elements.

    Children named in RECURSE_INTO_PARA_CONTAINERS are processed
    recursively; other content found by skip_leading_nodes() is handed to
    build_para().

    *depth* is only consulted by the DEBUG_PARA_FIXER exit check; the
    recursive call below does not pass it on, so it keeps its default
    there.
    """
    # document is already normalized
    children = container.childNodes
    start = skip_leading_nodes(children)
    # len(children) is re-read on every pass because build_para() changes
    # the container's children
    while len(children) > start:
        if children[start].get_nodeName() in RECURSE_INTO_PARA_CONTAINERS:
            # Something to recurse into:
            fixup_paras_helper(doc, children[start])
        else:
            # Paragraph material:
            # (the index returned by build_para() is not used here)
            build_para(doc, container, start, len(children))
            if DEBUG_PARA_FIXER and depth == 10:
                sys.exit(1)
        # step past the node just handled, then past anything skippable
        start = skip_leading_nodes(children, start + 1)
 658
 659
def build_para(doc, parent, start, i):
    """Wrap a run of children of *parent* in a new paragraph element.

    The run begins at index *start* and is looked for up to, but not
    including, index *i*.  It ends before the first element named in
    PARA_LEVEL_ELEMENTS or RECURSE_INTO_PARA_CONTAINERS, or at the first
    "\\n\\n" found in a text node (the text node is split there unless the
    blank line is at its very start).  Trailing whitespace in a final text
    node is split off and left outside the paragraph.

    Returns an index into the updated children of *parent*: their length
    when the paragraph was appended at the end, otherwise the position
    just after the inserted paragraph.

    Raises ConversionError if no content could be collected.
    """
    children = parent.childNodes
    # `after` is the index one past the last node that goes into the para
    after = start + 1
    # true when the run extends through the last scanned child
    have_last = 0
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS
    # Collect all children until \n\n+ is found in a text node or a
    # member of BREAK_ELEMENTS is found.
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == ELEMENT:
            if child.get_tagName() in BREAK_ELEMENTS:
                # the breaking element itself stays outside the para
                after = j
                break
        elif nodeType == TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                # blank line right at the start: exclude this text node
                after = j
                break
            if pos >= 1:
                # keep only the text ahead of the blank line
                child.splitText(pos)
                break
    else:
        # no break: every scanned child belongs to the paragraph
        have_last = 1
    if (start + 1) > after:
        raise ConversionError(
            "build_para() could not identify content to turn into a paragraph")
    if children[after - 1].nodeType == TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if string.rstrip(data) != data:
            # the split-off whitespace now follows the paragraph content
            have_last = 0
            child.splitText(len(string.rstrip(data)))
    para = doc.createElement(PARA_ELEMENT)
    prev = None
    # Move the nodes last-to-first, inserting each one ahead of the node
    # moved just before it, so the original order is kept.
    indexes = range(start, after)
    indexes.reverse()
    for j in indexes:
        node = parent.childNodes[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if have_last:
        parent.appendChild(para)
        parent.appendChild(doc.createTextNode("\n\n"))
        return len(parent.childNodes)
    else:
        # the node that followed the run is now at index `start`
        nextnode = parent.childNodes[start]
        if nextnode.nodeType == TEXT:
            # make sure the text after the para starts with a newline
            if nextnode.data and nextnode.data[0] != "\n":
                nextnode.data = "\n" + nextnode.data
        else:
            # put a newline text node between the para and what follows
            newnode = doc.createTextNode("\n")
            parent.insertBefore(newnode, nextnode)
            nextnode = newnode
            start = start + 1
        parent.insertBefore(para, nextnode)
        return start + 1
 720
 721
 722 def skip_leading_nodes(children, start=0):
 723     """Return index into children of a node at which paragraph building should
 724     begin or a recursive call to fixup_paras_helper() should be made (for
 725     subsections, etc.).
 726
 727     When the return value >= len(children), we've built all the paras we can
 728     from this list of children.
 729     """
 730     i = len(children)
 731     while i > start:
 732         # skip over leading comments and whitespace:
 733         child = children[start]
 734         nodeType = child.nodeType
 735         if nodeType == TEXT:
 736             data = child.data
 737             shortened = string.lstrip(data)
 738             if shortened:
 739                 if data != shortened:
 740                     # break into two nodes: whitespace and non-whitespace
 741                     child.splitText(len(data) - len(shortened))
 742                     return start + 1
 743                 return start
 744             # all whitespace, just skip
 745         elif nodeType == ELEMENT:
 746             tagName = child.get_tagName()
 747             if tagName in RECURSE_INTO_PARA_CONTAINERS:
 748                 return start
 749             if tagName not in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS:
 750                 return start
 751         start = start + 1
 752     return start
 753
 754
 755 def fixup_rfc_references(doc, fragment):
 756     for rfcnode in find_all_elements(fragment, "rfc"):
 757         rfcnode.appendChild(doc.createTextNode(
 758             "RFC " + rfcnode.getAttribute("num")))
 759
 760
 761 def fixup_signatures(doc, fragment):
 762     for child in fragment.childNodes:
 763         if child.nodeType == ELEMENT:
 764             args = child.getElementsByTagName("args")
 765             for arg in args:
 766                 fixup_args(doc, arg)
 767                 arg.normalize()
 768             args = child.getElementsByTagName("constructor-args")
 769             for arg in args:
 770                 fixup_args(doc, arg)
 771                 arg.normalize()
 772
 773
 774 def fixup_args(doc, arglist):
 775     for child in arglist.childNodes:
 776         if child.get_nodeName() == "optional":
 777             # found it; fix and return
 778             arglist.insertBefore(doc.createTextNode("["), child)
 779             optkids = child.childNodes
 780             while optkids:
 781                 k = optkids[0]
 782                 child.removeChild(k)
 783                 arglist.insertBefore(k, child)
 784             arglist.insertBefore(doc.createTextNode("]"), child)
 785             arglist.removeChild(child)
 786             return fixup_args(doc, arglist)
 787
 788
 789 def fixup_sectionauthors(doc, fragment):
 790     for sectauth in find_all_elements(fragment, "sectionauthor"):
 791         section = sectauth.parentNode
 792         section.removeChild(sectauth)
 793         sectauth._node.name = "author"
 794         sectauth.appendChild(doc.createTextNode(
 795             sectauth.getAttribute("name")))
 796         sectauth.removeAttribute("name")
 797         after = section.childNodes[2]
 798         title = section.childNodes[1]
 799         if title.get_nodeName() != "title":
 800             after = section.childNodes[0]
 801         section.insertBefore(doc.createTextNode("\n  "), after)
 802         section.insertBefore(sectauth, after)
 803
 804
 805 def fixup_verbatims(doc):
 806     for verbatim in find_all_elements(doc, "verbatim"):
 807         child = verbatim.childNodes[0]
 808         if child.nodeType == TEXT \
 809            and string.lstrip(child.data)[:3] == ">>>":
 810             verbatim._node.name = "interactive-session"
 811
 812
 813 def add_node_ids(fragment, counter=0):
 814     fragment._node.node_id = counter
 815     for node in fragment.childNodes:
 816         counter = counter + 1
 817         if node.nodeType == ELEMENT:
 818             counter = add_node_ids(node, counter)
 819         else:
 820             node._node.node_id = counter
 821     return counter + 1
 822
 823
 824 REFMODINDEX_ELEMENTS = ('refmodindex', 'refbimodindex',
 825                         'refexmodindex', 'refstmodindex')
 826
 827 def fixup_refmodindexes(fragment):
 828     # Locate <ref*modindex>...</> co-located with <module>...</>, and
 829     # remove the <ref*modindex>, replacing it with index=index on the
 830     # <module> element.
 831     nodes = find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS)
 832     d = {}
 833     for node in nodes:
 834         parent = node.parentNode
 835         d[parent._node.node_id] = parent
 836     del nodes
 837     map(fixup_refmodindexes_chunk, d.values())
 838
 839
 840 def fixup_refmodindexes_chunk(container):
 841     # node is probably a <para>; let's see how often it isn't:
 842     if container.get_tagName() != PARA_ELEMENT:
 843         bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container)
 844     module_entries = find_all_elements(container, "module")
 845     if not module_entries:
 846         return
 847     index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS)
 848     removes = []
 849     for entry in index_entries:
 850         children = entry.childNodes
 851         if len(children) != 0:
 852             bwrite("--- unexpected number of children for %s node:\n"
 853                    % entry.get_tagName())
 854             ewrite(entry.toxml() + "\n")
 855             continue
 856         found = 0
 857         module_name = entry.getAttribute("module")
 858         for node in module_entries:
 859             if len(node.childNodes) != 1:
 860                 continue
 861             this_name = node.childNodes[0].data
 862             if this_name == module_name:
 863                 found = 1
 864                 node.setAttribute("index", "yes")
 865         if found:
 866             removes.append(entry)
 867     for node in removes:
 868         container.removeChild(node)
 869
 870
 871 def fixup_bifuncindexes(fragment):
 872     nodes = find_all_elements(fragment, 'bifuncindex')
 873     d = {}
 874     # make sure that each parent is only processed once:
 875     for node in nodes:
 876         parent = node.parentNode
 877         d[parent._node.node_id] = parent
 878     del nodes
 879     map(fixup_bifuncindexes_chunk, d.values())
 880
 881
 882 def fixup_bifuncindexes_chunk(container):
 883     removes = []
 884     entries = find_all_child_elements(container, "bifuncindex")
 885     function_entries = find_all_child_elements(container, "function")
 886     for entry in entries:
 887         function_name = entry.getAttribute("name")
 888         found = 0
 889         for func_entry in function_entries:
 890             t2 = func_entry.childNodes[0].data
 891             if t2[-2:] != "()":
 892                 continue
 893             t2 = t2[:-2]
 894             if t2 == function_name:
 895                 func_entry.setAttribute("index", "yes")
 896                 func_entry.setAttribute("module", "__builtin__")
 897                 if not found:
 898                     found = 1
 899                     removes.append(entry)
 900     for entry in removes:
 901         container.removeChild(entry)
 902
 903
 904 def join_adjacent_elements(container, gi):
 905     queue = [container]
 906     while queue:
 907         parent = queue.pop()
 908         i = 0
 909         children = parent.get_childNodes()
 910         nchildren = len(children)
 911         while i < (nchildren - 1):
 912             child = children[i]
 913             if child.nodeName == gi:
 914                 if children[i+1].nodeName == gi:
 915                     ewrite("--- merging two <%s/> elements\n" % gi)
 916                     child = children[i]
 917                     nextchild = children[i+1]
 918                     nextchildren = nextchild.get_childNodes()
 919                     while len(nextchildren):
 920                         node = nextchildren[0]
 921                         nextchild.removeChild(node)
 922                         child.appendChild(node)
 923                     parent.removeChild(nextchild)
 924                     continue
 925             if child.nodeType == ELEMENT:
 926                 queue.append(child)
 927             i = i + 1
 928
 929
# Attribute values matching this pattern (anchored at the start by match()
# and at the end by "$") are written with type TOKEN by write_esis(); all
# other values are written with type CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
 931
 932 def write_esis(doc, ofp, knownempty):
 933     for node in doc.childNodes:
 934         nodeType = node.nodeType
 935         if nodeType == ELEMENT:
 936             gi = node.get_tagName()
 937             if knownempty(gi):
 938                 if node.hasChildNodes():
 939                     raise ValueError, \
 940                           "declared-empty node <%s> has children" % gi
 941                 ofp.write("e\n")
 942             for k, v in node.attributes.items():
 943                 value = v.value
 944                 if _token_rx.match(value):
 945                     dtype = "TOKEN"
 946                 else:
 947                     dtype = "CDATA"
 948                 ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))
 949             ofp.write("(%s\n" % gi)
 950             write_esis(node, ofp, knownempty)
 951             ofp.write(")%s\n" % gi)
 952         elif nodeType == TEXT:
 953             ofp.write("-%s\n" % esistools.encode(node.data))
 954         elif nodeType == ENTITY_REFERENCE:
 955             ofp.write("&%s\n" % node.get_nodeName())
 956         else:
 957             raise RuntimeError, "unsupported node type: %s" % nodeType
 958
 959
 960 def convert(ifp, ofp):
 961     p = esistools.ExtendedEsisBuilder()
 962     p.feed(ifp.read())
 963     doc = p.document
 964     fragment = p.fragment
 965     normalize(fragment)
 966     simplify(doc, fragment)
 967     handle_labels(doc, fragment)
 968     handle_appendix(doc, fragment)
 969     fixup_trailing_whitespace(doc, {
 970         "abstract": "\n",
 971         "title": "",
 972         "chapter": "\n\n",
 973         "section": "\n\n",
 974         "subsection": "\n\n",
 975         "subsubsection": "\n\n",
 976         "paragraph": "\n\n",
 977         "subparagraph": "\n\n",
 978         })
 979     cleanup_root_text(doc)
 980     cleanup_trailing_parens(fragment, ["function", "method", "cfunction"])
 981     cleanup_synopses(doc, fragment)
 982     fixup_descriptors(doc, fragment)
 983     fixup_verbatims(fragment)
 984     normalize(fragment)
 985     fixup_paras(doc, fragment)
 986     fixup_sectionauthors(doc, fragment)
 987     fixup_table_structures(doc, fragment)
 988     fixup_rfc_references(doc, fragment)
 989     fixup_signatures(doc, fragment)
 990     add_node_ids(fragment)
 991     fixup_refmodindexes(fragment)
 992     fixup_bifuncindexes(fragment)
 993     # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and
 994     # LaTeX2HTML screwing with GNU-style long options (the '--' problem).
 995     join_adjacent_elements(fragment, "option")
 996     #
 997     d = {}
 998     for gi in p.get_empties():
 999         d[gi] = gi
1000     if d.has_key("rfc"):
1001         del d["rfc"]
1002     knownempty = d.has_key
1003     #
1004     try:
1005         write_esis(fragment, ofp, knownempty)
1006     except IOError, (err, msg):
1007         # Ignore EPIPE; it just means that whoever we're writing to stopped
1008         # reading.  The rest of the output would be ignored.  All other errors
1009         # should still be reported,
1010         if err != errno.EPIPE:
1011             raise
1012
1013
def main():
    """Command-line entry point: docfixer [infile [outfile]].

    Without arguments the conversion reads stdin and writes stdout; one
    argument names the input file, a second one the output file.  Any
    other argument count calls usage() and exits with status 2.

    Files opened here are closed again once convert() has finished, also
    when it raises; sys.stdin and sys.stdout are never closed.
    """
    argc = len(sys.argv)
    if argc not in (1, 2, 3):
        usage()
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    opened = []
    try:
        if argc >= 2:
            ifp = open(sys.argv[1])
            opened.append(ifp)
        if argc == 3:
            ofp = open(sys.argv[2], "w")
            opened.append(ofp)
        convert(ifp, ofp)
    finally:
        # Closing explicitly flushes the output file instead of relying
        # on interpreter shutdown to do so.
        for fp in opened:
            fp.close()
1028
1029
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()