This commit was manufactured by cvs2svn to create tag 'r241c1'.
[python/dscho.git] / Doc / lib / libhtmlparser.tex
blobb85ba564d6a4eb5375e7ae71558185f9ea4d3980
1 \section{\module{HTMLParser} ---
2 Simple HTML and XHTML parser}
4 \declaremodule{standard}{HTMLParser}
5 \modulesynopsis{A simple parser that can handle HTML and XHTML.}
7 \versionadded{2.2}
9 This module defines a class \class{HTMLParser} which serves as the
10 basis for parsing text files formatted in HTML\index{HTML} (HyperText
11 Mark-up Language) and XHTML.\index{XHTML} Unlike the parser in
12 \refmodule{htmllib}, this parser is not based on the SGML parser in
13 \refmodule{sgmllib}.
16 \begin{classdesc}{HTMLParser}{}
17 The \class{HTMLParser} class is instantiated without arguments.
19 An \class{HTMLParser} instance is fed HTML data and calls handler functions
20 when tags begin and end. The \class{HTMLParser} class is meant to be
21 overridden by the user to provide a desired behavior.
23 Unlike the parser in \refmodule{htmllib}, this parser does not check
24 that end tags match start tags or call the end-tag handler for
25 elements which are closed implicitly by closing an outer element.
26 \end{classdesc}
28 An exception is defined as well:
30 \begin{excdesc}{HTMLParseError}
31 Exception raised by the \class{HTMLParser} class when it encounters an
32 error while parsing. This exception provides three attributes:
33 \member{msg} is a brief message explaining the error, \member{lineno}
34 is the number of the line on which the broken construct was detected,
35 and \member{offset} is the number of characters into the line at which
36 the construct starts.
37 \end{excdesc}
40 \class{HTMLParser} instances have the following methods:
42 \begin{methoddesc}{reset}{}
43 Reset the instance. Loses all unprocessed data. This is called
44 implicitly at instantiation time.
45 \end{methoddesc}
47 \begin{methoddesc}{feed}{data}
48 Feed some text to the parser. It is processed insofar as it consists
49 of complete elements; incomplete data is buffered until more data is
50 fed or \method{close()} is called.
51 \end{methoddesc}
53 \begin{methoddesc}{close}{}
54 Force processing of all buffered data as if it were followed by an
55 end-of-file mark. This method may be redefined by a derived class to
56 define additional processing at the end of the input, but the
57 redefined version should always call the \class{HTMLParser} base class
58 method \method{close()}.
59 \end{methoddesc}
61 \begin{methoddesc}{getpos}{}
62 Return current line number and offset.
63 \end{methoddesc}
65 \begin{methoddesc}{get_starttag_text}{}
66 Return the text of the most recently opened start tag. This should
67 not normally be needed for structured processing, but may be useful in
68 dealing with HTML ``as deployed'' or for re-generating input with
69 minimal changes (whitespace between attributes can be preserved,
70 etc.).
71 \end{methoddesc}
73 \begin{methoddesc}{handle_starttag}{tag, attrs}
74 This method is called to handle the start of a tag. It is intended to
75 be overridden by a derived class; the base class implementation does
76 nothing.
78 The \var{tag} argument is the name of the tag converted to
79 lower case. The \var{attrs} argument is a list of \code{(\var{name},
80 \var{value})} pairs containing the attributes found inside the tag's
81 \code{<>} brackets. The \var{name} will be translated to lower case
82 and double quotes and backslashes in the \var{value} have been
83 interpreted. For instance, for the tag \code{<A
84 HREF="http://www.cwi.nl/">}, this method would be called as
85 \samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
86 \end{methoddesc}
88 \begin{methoddesc}{handle_startendtag}{tag, attrs}
89 Similar to \method{handle_starttag()}, but called when the parser
90 encounters an XHTML-style empty tag (\code{<a .../>}). This method
91 may be overridden by subclasses which require this particular lexical
92 information; the default implementation simply calls
93 \method{handle_starttag()} and \method{handle_endtag()}.
94 \end{methoddesc}
96 \begin{methoddesc}{handle_endtag}{tag}
97 This method is called to handle the end tag of an element. It is
98 intended to be overridden by a derived class; the base class
99 implementation does nothing. The \var{tag} argument is the name of
100 the tag converted to lower case.
101 \end{methoddesc}
103 \begin{methoddesc}{handle_data}{data}
104 This method is called to process arbitrary data. It is intended to be
105 overridden by a derived class; the base class implementation does
106 nothing.
107 \end{methoddesc}
109 \begin{methoddesc}{handle_charref}{name} This method is called to
110 process a character reference of the form \samp{\&\#\var{name};}. It
111 is intended to be overridden by a derived class; the base class
112 implementation does nothing.
113 \end{methoddesc}
115 \begin{methoddesc}{handle_entityref}{name}
116 This method is called to process a general entity reference of the
117 form \samp{\&\var{name};} where \var{name} is a general entity
118 reference. It is intended to be overridden by a derived class; the
119 base class implementation does nothing.
120 \end{methoddesc}
122 \begin{methoddesc}{handle_comment}{data}
123 This method is called when a comment is encountered. The
124 \var{data} argument is a string containing the text between the
125 \samp{--} and \samp{--} delimiters, but not the delimiters
126 themselves. For example, the comment \samp{<!--text-->} will
127 cause this method to be called with the argument \code{'text'}. It is
128 intended to be overridden by a derived class; the base class
129 implementation does nothing.
130 \end{methoddesc}
132 \begin{methoddesc}{handle_decl}{decl}
133 Method called when an SGML declaration is read by the parser. The
134 \var{decl} parameter will be the entire contents of the declaration
135 inside the \code{<!}...\code{>} markup. It is intended to be overridden
136 by a derived class; the base class implementation does nothing.
137 \end{methoddesc}
139 \begin{methoddesc}{handle_pi}{data}
140 Method called when a processing instruction is encountered. The
141 \var{data} parameter will contain the entire processing instruction.
142 For example, for the processing instruction \code{<?proc color='red'>},
143 this method would be called as \code{handle_pi("proc color='red'")}. It
144 is intended to be overridden by a derived class; the base class
145 implementation does nothing.
147 \note{The \class{HTMLParser} class uses the SGML syntactic rules for
148 processing instructions. An XHTML processing instruction using the
149 trailing \character{?} will cause the \character{?} to be included in
150 \var{data}.}
151 \end{methoddesc}
154 \subsection{Example HTML Parser Application \label{htmlparser-example}}
156 As a basic example, below is a simple HTML parser that uses the
157 \class{HTMLParser} class to print out tags as they are encountered:
159 \begin{verbatim}
160 from HTMLParser import HTMLParser
162 class MyHTMLParser(HTMLParser):
164     def handle_starttag(self, tag, attrs):
165         print "Encountered the beginning of a %s tag" % tag
167     def handle_endtag(self, tag):
168         print "Encountered the end of a %s tag" % tag
169 \end{verbatim}