libs/elementtree/TidyTools.py

   1 #
   2 # ElementTree
   3 # $Id: TidyTools.py 1862 2004-06-18 07:31:02Z Fredrik $
   4 #
   5 # tools to run the "tidy" command on an HTML or XHTML file, and return
   6 # the contents as an XHTML element tree.
   7 #
   8 # history:
   9 # 2002-10-19 fl   added to ElementTree library; added getzonebody function
  10 #
  11 # Copyright (c) 1999-2004 by Fredrik Lundh.  All rights reserved.
  12 #
  13 # fredrik@pythonware.com
  14 # http://www.pythonware.com
  15 #
  16
  17 ##
  18 # Tools to build element trees from HTML, using the external <b>tidy</b>
  19 # utility.
  20 ##
  21
  22 import glob, string, os, sys
  23
  24 from ElementTree import ElementTree, Element
  25
# Namespace prefix, in "{uri}" form, found at the start of element tags in
# the XHTML namespace.  getbody() strips this prefix from every tag.
NS_XHTML = "{http://www.w3.org/1999/xhtml}"
  27
  28 ##
  29 # Convert an HTML or HTML-like file to XHTML, using the <b>tidy</b>
  30 # command line utility.
  31 #
  32 # @param file Filename.
  33 # @param new_inline_tags An optional list of valid but non-standard
  34 #     inline tags.
  35 # @return An element tree, or None if not successful.
  36
  37 def tidy(file, new_inline_tags=None):
  38
  39     command = ["tidy", "-qn", "-asxml"]
  40
  41     if new_inline_tags:
  42         command.append("--new-inline-tags")
  43         command.append(string.join(new_inline_tags, ","))
  44
  45     # FIXME: support more tidy options!
  46
  47     # convert
  48     os.system(
  49         "%s %s >%s.out 2>%s.err" % (string.join(command), file, file, file)
  50         )
  51     # check that the result is valid XML
  52     try:
  53         tree = ElementTree()
  54         tree.parse(file + ".out")
  55     except:
  56         print "*** %s:%s" % sys.exc_info()[:2]
  57         print ("*** %s is not valid XML "
  58                "(check %s.err for info)" % (file, file))
  59         tree = None
  60     else:
  61         if os.path.isfile(file + ".out"):
  62             os.remove(file + ".out")
  63         if os.path.isfile(file + ".err"):
  64             os.remove(file + ".err")
  65
  66     return tree
  67
  68 ##
  69 # Get document body from an HTML or HTML-like file.  This function
  70 # uses the <b>tidy</b> function to convert HTML to XHTML, and cleans
  71 # up the resulting XML tree.
  72 #
  73 # @param file Filename.
  74 # @return A <b>body</b> element, or None if not successful.
  75
  76 def getbody(file, **options):
  77     # get clean body from text file
  78
  79     # get xhtml tree
  80     try:
  81         tree = apply(tidy, (file,), options)
  82         if tree is None:
  83             return
  84     except IOError, v:
  85         print "***", v
  86         return None
  87
  88     NS = NS_XHTML
  89
  90     # remove namespace uris
  91     for node in tree.getiterator():
  92         if node.tag.startswith(NS):
  93             node.tag = node.tag[len(NS):]
  94
  95     body = tree.getroot().find("body")
  96
  97     return body
  98
  99 ##
 100 # Same as <b>getbody</b>, but turns plain text at the start of the
 101 # document into an H1 tag.  This function can be used to parse zone
 102 # documents.
 103 #
 104 # @param file Filename.
 105 # @return A <b>body</b> element, or None if not successful.
 106
 107 def getzonebody(file, **options):
 108
 109     body = getbody(file, **options)
 110     if body is None:
 111         return
 112
 113     if body.text and string.strip(body.text):
 114         title = Element("h1")
 115         title.text = string.strip(body.text)
 116         title.tail = "\n\n"
 117         body.insert(0, title)
 118
 119     body.text = None
 120
 121     return body
 122
 123 if __name__ == "__main__":
 124
 125     import sys
 126     for arg in sys.argv[1:]:
 127         for file in glob.glob(arg):
 128             print file, "...", tidy(file)