getting file size for all dict files to be downloaded. coming to be 400mb or so.
[worddb.git] / libs / elementtree / TidyTools.py
blobf3a07415b0193df97fdce237ce1868538cce63a3
2 # ElementTree
3 # $Id: TidyTools.py 1862 2004-06-18 07:31:02Z Fredrik $
5 # tools to run the "tidy" command on an HTML or XHTML file, and return
6 # the contents as an XHTML element tree.
8 # history:
9 # 2002-10-19 fl added to ElementTree library; added getzonebody function
11 # Copyright (c) 1999-2004 by Fredrik Lundh. All rights reserved.
13 # fredrik@pythonware.com
14 # http://www.pythonware.com
18 # Tools to build element trees from HTML, using the external <b>tidy</b>
19 # utility.
22 import glob, string, os, sys
24 from ElementTree import ElementTree, Element
26 NS_XHTML = "{http://www.w3.org/1999/xhtml}"
29 # Convert an HTML or HTML-like file to XHTML, using the <b>tidy</b>
30 # command line utility.
32 # @param file Filename.
33 # @param new_inline_tags An optional list of valid but non-standard
34 # inline tags.
35 # @return An element tree, or None if not successful.
def tidy(file, new_inline_tags=None):
    """Convert an HTML or HTML-like file to XHTML using the tidy utility.

    The converted document is written to ``<file>.out`` and whatever tidy
    prints on stderr to ``<file>.err``.  Both files are removed when the
    output parses as XML; otherwise they are kept so the failure can be
    inspected.

    Arguments:
        file: Filename of the document to convert.
        new_inline_tags: An optional list of valid but non-standard
            inline tags, passed to tidy as ``--new-inline-tags``.

    Returns:
        An element tree, or None if not successful.
    """
    import subprocess

    command = ["tidy", "-qn", "-asxml"]

    if new_inline_tags:
        command.append("--new-inline-tags")
        command.append(",".join(new_inline_tags))

    # FIXME: support more tidy options!

    # The filename is passed as its own argument (no shell involved), so
    # names containing spaces or shell metacharacters are handled safely.
    command.append(file)

    out_name = file + ".out"
    err_name = file + ".err"

    # convert
    try:
        with open(out_name, "wb") as out:
            with open(err_name, "wb") as err:
                subprocess.call(command, stdout=out, stderr=err)
    except EnvironmentError:
        # tidy could not be started, or the output files could not be
        # created.  Report it and carry on: the XML check below fails on
        # the missing/empty output and makes the function return None.
        print("*** %s:%s" % sys.exc_info()[:2])

    # check that the result is valid XML
    try:
        tree = ElementTree()
        tree.parse(out_name)
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit propagate.
        print("*** %s:%s" % sys.exc_info()[:2])
        print("*** %s is not valid XML "
              "(check %s.err for info)" % (file, file))
        tree = None
    else:
        # success: the intermediate files are no longer needed
        if os.path.isfile(out_name):
            os.remove(out_name)
        if os.path.isfile(err_name):
            os.remove(err_name)

    return tree
69 # Get document body from an HTML or HTML-like file. This function
70 # uses the <b>tidy</b> function to convert HTML to XHTML, and cleans
71 # up the resulting XML tree.
73 # @param file Filename.
74 # @return A <b>body</b> element, or None if not successful.
def getbody(file, **options):
    """Get a clean document body from an HTML or HTML-like file.

    Runs tidy() on the file, strips the XHTML namespace prefix from every
    tag in the resulting tree, and returns the body element.

    Arguments:
        file: Filename.
        **options: Extra keyword arguments, passed through to tidy().

    Returns:
        The result of looking up "body" under the root element, or None
        if the conversion was not successful.
    """
    # get xhtml tree
    try:
        tree = tidy(file, **options)
    except IOError as v:
        print("*** %s" % (v,))
        return None
    if tree is None:
        return None

    NS = NS_XHTML

    # remove namespace uris
    # Prefer iter(); fall back to getiterator() for ElementTree versions
    # that only provide the older name.
    walk = getattr(tree, "iter", None) or tree.getiterator
    for node in walk():
        if node.tag.startswith(NS):
            node.tag = node.tag[len(NS):]

    return tree.getroot().find("body")
100 # Same as <b>getbody</b>, but turns plain text at the start of the
101 # document into an H1 tag. This function can be used to parse zone
102 # documents.
104 # @param file Filename.
105 # @return A <b>body</b> element, or None if not successful.
def getzonebody(file, **options):
    """Same as getbody(), but turn leading plain text into an H1 element.

    If the body starts with non-blank text, that text (stripped of
    surrounding whitespace) becomes the content of a new "h1" element
    inserted as the first child of the body.  The body's own leading text
    is cleared in every case.

    Arguments:
        file: Filename.
        **options: Extra keyword arguments, passed through to getbody().

    Returns:
        A body element, or None if not successful.
    """
    body = getbody(file, **options)
    if body is None:
        return None

    # Strip once and reuse; an empty or whitespace-only lead-in adds no title.
    heading = (body.text or "").strip()
    if heading:
        title = Element("h1")
        title.text = heading
        title.tail = "\n\n"
        body.insert(0, title)

    body.text = None

    return body
if __name__ == "__main__":

    # Command-line use: run tidy() on every file matching each argument
    # (arguments are expanded as glob patterns) and print the result.
    for arg in sys.argv[1:]:
        for filename in glob.glob(arg):
            # Write the filename before converting so that any diagnostics
            # printed by tidy() appear after the name of the file they
            # belong to.
            sys.stdout.write("%s ... " % filename)
            sys.stdout.write("%s\n" % (tidy(filename),))