# $Id: TidyTools.py 1862 2004-06-18 07:31:02Z Fredrik $
#
# tools to run the "tidy" command on an HTML or XHTML file, and return
# the contents as an XHTML element tree.
#
# 2002-10-19 fl added to ElementTree library; added getzonebody function
#
# Copyright (c) 1999-2004 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# Tools to build element trees from HTML, using the external <b>tidy</b>
# command line utility.
import glob
import os
import string
import subprocess
import sys

from ElementTree import ElementTree, Element
# Namespace prefix, in "{uri}" form, that getbody strips from the start of
# element tags in the tree produced by tidy.
NS_XHTML = "{http://www.w3.org/1999/xhtml}"
# Convert an HTML or HTML-like file to XHTML, using the <b>tidy</b>
# command line utility.
#
# @param file Filename.
# @param new_inline_tags An optional list of valid but non-standard
#     inline tags.
# @return An element tree, or None if not successful.
def tidy(file, new_inline_tags=None):
    """Convert an HTML or HTML-like file to XHTML with the tidy utility.

    Runs ``tidy -qn -asxml`` on *file*, writing its standard output to
    ``<file>.out`` and its diagnostics to ``<file>.err``, then parses the
    output file as XML.

    @param file Filename.
    @param new_inline_tags An optional list of valid but non-standard
        inline tag names, passed to tidy as ``--new-inline-tags``.
    @return An element tree, or None if not successful.  On success the
        ``.out`` and ``.err`` files are removed; on failure they are left
        in place so the ``.err`` file can be inspected.
    """
    out_name = file + ".out"
    err_name = file + ".err"

    command = ["tidy", "-qn", "-asxml"]
    # Only add the option when tags were actually supplied; joining the
    # default None would raise.
    if new_inline_tags:
        command.append("--new-inline-tags")
        command.append(",".join(new_inline_tags))
    command.append(file)

    # FIXME: support more tidy options!

    # Run tidy without going through a shell, so file names containing
    # spaces or shell metacharacters reach it unchanged.  The exit status
    # is deliberately ignored; success is judged by whether the output
    # parses as XML.
    try:
        with open(out_name, "wb") as out:
            with open(err_name, "wb") as err:
                subprocess.call(command, stdout=out, stderr=err)
    except (IOError, OSError):
        print("*** %s:%s" % sys.exc_info()[:2])
        return None

    # check that the result is valid XML
    try:
        tree = ElementTree()
        tree.parse(out_name)
    except Exception:
        print("*** %s:%s" % sys.exc_info()[:2])
        print("*** %s is not valid XML "
              "(check %s.err for info)" % (file, file))
        return None

    # Conversion succeeded: the intermediate files are no longer needed.
    for name in (out_name, err_name):
        if os.path.isfile(name):
            os.remove(name)

    return tree
# Get document body from an HTML or HTML-like file. This function
# uses the <b>tidy</b> function to convert HTML to XHTML, and cleans
# up the resulting XML tree.
#
# @param file Filename.
# @return A <b>body</b> element, or None if not successful.
def getbody(file, **options):
    """Get a clean document body from an HTML or HTML-like file.

    Converts the file with tidy(), strips the XHTML namespace prefix from
    every element tag that carries it, and looks up the ``body`` element
    under the root.

    @param file Filename.
    @param options Extra keyword arguments, passed through to tidy().
    @return A body element, or None if not successful (conversion failed
        or the root has no ``body`` child).
    """
    # get xhtml tree
    tree = tidy(file, **options)
    if tree is None:
        return None

    # remove namespace uris
    prefix_len = len(NS_XHTML)
    for node in tree.getiterator():
        if node.tag.startswith(NS_XHTML):
            node.tag = node.tag[prefix_len:]

    return tree.getroot().find("body")
# Same as <b>getbody</b>, but turns plain text at the start of the
# document into an H1 tag. This function can be used to parse zone
# documents.
#
# @param file Filename.
# @return A <b>body</b> element, or None if not successful.
def getzonebody(file, **options):
    """Like getbody(), but turn leading plain text into an H1 element.

    If the body starts with non-blank text, that text (with surrounding
    whitespace stripped) becomes the content of a new ``h1`` element
    inserted as the first child of the body.

    @param file Filename.
    @param options Extra keyword arguments, passed through to getbody().
    @return A body element, or None if not successful.
    """
    body = getbody(file, **options)
    if body is None:
        return None

    heading = (body.text or "").strip()
    if heading:
        title = Element("h1")
        title.text = heading
        body.insert(0, title)
        # The text now lives in the heading; clear it so it is not
        # present twice in the document.
        body.text = None

    return body
if __name__ == "__main__":
    # Command-line use: treat every argument as a glob pattern, run tidy
    # on each matching file and report the result next to the file name.
    for arg in sys.argv[1:]:
        for file in glob.glob(arg):
            print("%s ... %s" % (file, tidy(file)))