Doc/lib/libhtmlparser.tex

   1 \section{\module{HTMLParser} ---
   2          Simple HTML and XHTML parser}
   3
   4 \declaremodule{standard}{HTMLParser}
   5 \modulesynopsis{A simple parser that can handle HTML and XHTML.}
   6
   7 This module defines a class \class{HTMLParser} which serves as the
   8 basis for parsing text files formatted in HTML\index{HTML} (HyperText
   9 Mark-up Language) and XHTML.\index{XHTML}  Unlike the parser in
  10 \refmodule{htmllib}, this parser is not based on the SGML parser in
  11 \refmodule{sgmllib}.
  12
  13
  14 \begin{classdesc}{HTMLParser}{}
  15 The \class{HTMLParser} class is instantiated without arguments.
  16
  17 An \class{HTMLParser} instance is fed HTML data and calls handler functions
  18 when tags begin and end.  The \class{HTMLParser} class is meant to be
  19 overridden by the user to provide a desired behavior.
  20
  21 Unlike the parser in \refmodule{htmllib}, this parser does not check
  22 that end tags match start tags or call the end-tag handler for
  23 elements which are closed implicitly by closing an outer element.
  24 \end{classdesc}
  25
  26
  27 \class{HTMLParser} instances have the following methods:
  28
  29 \begin{methoddesc}{reset}{}
  30 Reset the instance.  Loses all unprocessed data.  This is called
  31 implicitly at instantiation time.
  32 \end{methoddesc}
  33
  34 \begin{methoddesc}{feed}{data}
  35 Feed some text to the parser.  It is processed insofar as it consists
  36 of complete elements; incomplete data is buffered until more data is
  37 fed or \method{close()} is called.
  38 \end{methoddesc}
  39
  40 \begin{methoddesc}{close}{}
  41 Force processing of all buffered data as if it were followed by an
  42 end-of-file mark.  This method may be redefined by a derived class to
  43 define additional processing at the end of the input, but the
  44 redefined version should always call the \class{HTMLParser} base class
  45 method \method{close()}.
  46 \end{methoddesc}
  47
  48 \begin{methoddesc}{getpos}{}
  49 Return current line number and offset.
  50 \end{methoddesc}
  51
  52 \begin{methoddesc}{get_starttag_text}{}
  53 Return the text of the most recently opened start tag.  This should
  54 not normally be needed for structured processing, but may be useful in
  55 dealing with HTML ``as deployed'' or for re-generating input with
  56 minimal changes (whitespace between attributes can be preserved,
  57 etc.).
  58 \end{methoddesc}
  59
  60 \begin{methoddesc}{handle_starttag}{tag, attrs}
  61 This method is called to handle the start of a tag.  It is intended to
  62 be overridden by a derived class; the base class implementation does
  63 nothing.
  64
  65 The \var{tag} argument is the name of the tag converted to
  66 lower case.  The \var{attrs} argument is a list of \code{(\var{name},
  67 \var{value})} pairs containing the attributes found inside the tag's
  68 \code{<>} brackets.  The \var{name} will be translated to lower case
  69 and double quotes and backslashes in the \var{value} have been
  70 interpreted.  For instance, for the tag \code{<A
  71 HREF="http://www.cwi.nl/">}, this method would be called as
  72 \samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
  73 \end{methoddesc}
  74
  75 \begin{methoddesc}{handle_startendtag}{tag, attrs}
  76 Similar to \method{handle_starttag()}, but called when the parser
  77 encounters an XHTML-style empty tag (\code{<a .../>}).  This method
  78 may be overridden by subclasses which require this particular lexical
  79 information; the default implementation simply calls
  80 \method{handle_starttag()} and \method{handle_endtag()}.
  81 \end{methoddesc}
  82
  83 \begin{methoddesc}{handle_endtag}{tag}
  84 This method is called to handle the end tag of an element.  It is
  85 intended to be overridden by a derived class; the base class
  86 implementation does nothing.  The \var{tag} argument is the name of
  87 the tag converted to lower case.
  88 \end{methoddesc}
  89
  90 \begin{methoddesc}{handle_data}{data}
  91 This method is called to process arbitrary data.  It is intended to be
  92 overridden by a derived class; the base class implementation does
  93 nothing.
  94 \end{methoddesc}
  95
  96 \begin{methoddesc}{handle_charref}{name} This method is called to
  97 process a character reference of the form \samp{\&\#\var{name};}.  It
  98 is intended to be overridden by a derived class; the base class
  99 implementation does nothing.
 100 \end{methoddesc}
 101
 102 \begin{methoddesc}{handle_entityref}{name}
 103 This method is called to process a general entity reference of the
 104 form \samp{\&\var{name};} where \var{name} is a general entity
 105 reference.  It is intended to be overridden by a derived class; the
 106 base class implementation does nothing.
 107 \end{methoddesc}
 108
 109 \begin{methoddesc}{handle_comment}{data}
 110 This method is called when a comment is encountered.  The
 111 \var{data} argument is a string containing the text between the
 112 \samp{<!--} and \samp{-->} delimiters, but not the delimiters
 113 themselves.  For example, the comment \samp{<!--text-->} will cause
 114 this method to be called with the argument \code{'text'}.  It is
 115 intended to be overridden by a derived class; the base class
 116 implementation does nothing.
 117 \end{methoddesc}
 118
 119 \begin{methoddesc}{handle_decl}{decl}
 120 Method called when an SGML declaration is read by the parser.  The
 121 \var{decl} parameter will be the entire contents of the declaration
 122 inside the \code{<!}...\code{>} markup.  It is intended to be overridden
 123 by a derived class; the base class implementation does nothing.
 124 \end{methoddesc}
 125
 126
 127 \subsection{Example HTML Parser \label{htmlparser-example}}
 128
 129 As a basic example, below is a simple HTML parser that uses the
 130 \class{HTMLParser} class to print out tags as they are encountered:
 131
 132 \begin{verbatim}
 133 from HTMLParser import HTMLParser
 134
 135 class MyHTMLParser(HTMLParser):
 136
 137     def handle_starttag(self, tag, attrs):
 138         print "Encountered the beginning of a %s tag" % tag
 139
 140     def handle_endtag(self, tag):
 141         print "Encountered the end of a %s tag" % tag
 142 \end{verbatim}