5 from xml
.dom
import Node
6 from Ft
.Xml
.Domlette
import implementation
7 from Ft
.Xml
import XMLNS_NAMESPACE
9 from string
import find
, lower
, join
10 from socket
import gethostbyaddr
, gethostname
# Matches an entity reference: '&', one non-digit character, a run of
# non-whitespace characters, then ';' (for example '&nbsp;').  The name
# between '&' and ';' is captured as group 1.
entrefpattern = re.compile(r'&(\D\S+);')
def node_to_xml(node):
    """Takes an XML node and returns an XML document suitable for saving.

    A new document is created with a placeholder 'root' element.  A deep
    clone of *node* is imported into that document and then swapped in
    for the placeholder, so the clone becomes the documentElement.

    Returns the newly created document.
    """
    root = implementation.createDocument(None, 'root', None)
    # Deep-clone the node, then import the clone into the new document.
    new = node.cloneNode(1)
    new = root.importNode(new, 1)
    # Replace the placeholder element with the imported copy.
    root.replaceChild(new, root.documentElement)
    return root
def node_to_html(node):
    """Takes an XML node and returns an HTML document suitable for saving.

    A new HTML document titled 'HTML document' is created.  *node* is then
    copied into it node by node and the copy replaces the document's
    original documentElement.

    Returns the newly created HTML document.
    """
    root = implementation.createHTMLDocument('HTML document')

    def html(doc, node, html):
        # The helper receives itself as its third argument and recurses
        # through that parameter.
        # Shallow copy only: attributes and children are rebuilt below.
        new = doc.importNode(node.cloneNode(deep=0), deep=0)
        if node.nodeType == Node.ELEMENT_NODE:
            for a in node.attributes:
                # Attributes are re-added under their local name.
                new.setAttribute(a.localName, a.value)
            # NOTE(review): children are copied for element nodes only;
            # confirm this nesting matches the intended behaviour.
            for k in node.childNodes:
                new.appendChild(html(doc, k, html))
        return new

    new = html(root, node, html)
    root.replaceChild(new, root.documentElement)
    return root
# send_to_file(data, path): opens `path` in binary write mode ('wb').
# NOTE(review): the statements that write `data`, close the file and wrap
# the operation in error handling are not visible in this part of the
# file -- confirm the full control flow before changing anything here.
38 def send_to_file(data
, path
):
40 file = open(path
, 'wb')
# Presumably reached from an exception handler around the file
# operations -- TODO confirm.
46 rox
.report_exception()
def fix_broken_html(data):
    """Pre-parse the data before sending to tidy to fix really really broken
    stuff (eg, MS Word output).

    Returns the cleaned-up text, or None if data is OK and needs no fixing
    (that is, it contains no '<o:p>' tag).
    """
    if data.find('<o:p>') == -1:
        return None  # Doesn't need fixing?

    # Drop empty <o:p></o:p> pairs.
    data = data.replace('<o:p></o:p>', '')
    # Strip '<![...]>' sections such as '<![if ...]>' and '<![endif]>'.
    data = re.sub(r'<!\[[^]]*\]>', '', data)
    return data
# to_html_doc(data): feed `data` to the external `tidy` program.
# Visible steps: a fix_broken_html() pre-pass, an os.popen() write pipe to
# `tidy` (two command lines appear, one with -utf8 and one without), a
# write of the text to that pipe, and a read of the result from file
# descriptor `r`.
# NOTE(review): the code that creates `r`, chooses between the two popen
# calls, and returns the result is not visible in this part of the file --
# confirm against the complete function.
61 def to_html_doc(data
):
62 "Run data though tidy and return the resulting XML text"
65 #data = data.replace(' ', ' ')
66 #data = data.replace('©', '(c)')
67 #data = data.replace('ä', '(auml)')
68 #data = data.replace('ö', '(ouml)')
# fix_broken_html() result; the write below falls back to the original
# text when this value is false (for example None).
69 fixed
= fix_broken_html(data
)
# tidy invocation with -utf8; stderr is discarded by the shell redirect.
77 tin
= os
.popen('tidy --force-output yes -q -utf8 -asxml 2>/dev/null', 'w')
# Same invocation without -utf8.
79 tin
= os
.popen('tidy --force-output yes -q -asxml 2>/dev/null', 'w')
# Send the pre-fixed text if there is one, otherwise the original data.
80 tin
.write(fixed
or data
)
# Read everything available from descriptor `r` into `data`.
86 data
= os
.fdopen(r
).read()
# parse_data(data, path): parse the XML text `data` with FtMiniDom's
# nonvalParse; `path` is passed as the second argument to
# isrc.fromString() (presumably the source URI -- TODO confirm).
# Two parse attempts are visible: one on the text as given, and a retry
# after rewriting every entrefpattern match '&name;' to '&amp;name;'.
# NOTE(review): the try/except lines tying these steps together and the
# end of the function are not visible in this part of the file -- confirm
# the control flow and return value against the complete function.
91 def parse_data(data
, path
):
92 """Convert and XML document into a DOM Document."""
93 from Ft
.Xml
.InputSource
import InputSourceFactory
94 #from Ft.Xml.cDomlette import nonvalParse
95 from Ft
.Xml
.FtMiniDom
import nonvalParse
96 isrc
= InputSourceFactory()
# First attempt: parse the text unchanged.
100 print "Parsing (with entities)..."
101 doc
= nonvalParse(isrc
.fromString(data
, path
))
# Retry: escape the '&' of each entity reference, then parse again.
103 print "Parse failed.. retry without entities..."
104 data
= entrefpattern
.sub('&amp;\\1;',data
)
105 doc
= nonvalParse(isrc
.fromString(data
, path
))
# Print the traceback of the exception currently being handled.
# Note that the local name `type` shadows the builtin.
107 type, val
, tb
= sys
.exc_info()
108 traceback
.print_exception(type, val
, tb
)
109 print "parsing failed!"
112 #rox.report_exception()