Null commit with -f option to force an uprev and put HEADs firmly on the trunk.
[python/dscho.git] / Doc / lib / libhtmlparser.tex
blobe8b4dd92c0e1bf6c93438ca60a0228253cc84433
1 \section{\module{HTMLParser} ---
2 Simple HTML and XHTML parser}
4 \declaremodule{standard}{HTMLParser}
5 \modulesynopsis{A simple parser that can handle HTML and XHTML.}
7 This module defines a class \class{HTMLParser} which serves as the
8 basis for parsing text files formatted in HTML\index{HTML} (HyperText
9 Mark-up Language) and XHTML.\index{XHTML}
12 \begin{classdesc}{HTMLParser}{}
13 The \class{HTMLParser} class is instantiated without arguments.
15 An \class{HTMLParser} instance is fed HTML data and calls handler functions
16 when tags begin and end. The \class{HTMLParser} class is meant to be
17 overridden by the user to provide a desired behavior.
18 \end{classdesc}
21 \class{HTMLParser} instances have the following methods:
23 \begin{methoddesc}{reset}{}
24 Reset the instance. Loses all unprocessed data. This is called
25 implicitly at instantiation time.
26 \end{methoddesc}
28 \begin{methoddesc}{feed}{data}
29 Feed some text to the parser. It is processed insofar as it consists
30 of complete elements; incomplete data is buffered until more data is
31 fed or \method{close()} is called.
32 \end{methoddesc}
34 \begin{methoddesc}{close}{}
35 Force processing of all buffered data as if it were followed by an
36 end-of-file mark. This method may be redefined by a derived class to
37 define additional processing at the end of the input, but the
38 redefined version should always call the \class{HTMLParser} base class
39 method \method{close()}.
40 \end{methoddesc}
42 \begin{methoddesc}{getpos}{}
43 Return current line number and offset.
44 \end{methoddesc}
46 \begin{methoddesc}{get_starttag_text}{}
47 Return the text of the most recently opened start tag. This should
48 not normally be needed for structured processing, but may be useful in
49 dealing with HTML ``as deployed'' or for re-generating input with
50 minimal changes (whitespace between attributes can be preserved,
51 etc.).
52 \end{methoddesc}
54 \begin{methoddesc}{handle_starttag}{tag, attrs}
55 This method is called to handle the start of a tag. It is intended to
56 be overridden by a derived class; the base class implementation does
57 nothing.
59 The \var{tag} argument is the name of the tag converted to
60 lower case. The \var{attrs} argument is a list of \code{(\var{name},
61 \var{value})} pairs containing the attributes found inside the tag's
62 \code{<>} brackets. The \var{name} will be translated to lower case,
63 and double quotes and backslashes in the \var{value} will have been
64 interpreted. For instance, for the tag \code{<A
65 HREF="http://www.cwi.nl/">}, this method would be called as
66 \samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}.
67 \end{methoddesc}
69 \begin{methoddesc}{handle_startendtag}{tag, attrs}
70 Similar to \method{handle_starttag()}, but called when the parser
71 encounters an XHTML-style empty tag (\code{<a .../>}). This method
72 may be overridden by subclasses which require this particular lexical
73 information; the default implementation simply calls
74 \method{handle_starttag()} and \method{handle_endtag()}.
75 \end{methoddesc}
77 \begin{methoddesc}{handle_endtag}{tag}
78 This method is called to handle the end tag of an element. It is
79 intended to be overridden by a derived class; the base class
80 implementation does nothing. The \var{tag} argument is the name of
81 the tag converted to lower case.
82 \end{methoddesc}
84 \begin{methoddesc}{handle_data}{data}
85 This method is called to process arbitrary data. It is intended to be
86 overridden by a derived class; the base class implementation does
87 nothing.
88 \end{methoddesc}
90 \begin{methoddesc}{handle_charref}{name} This method is called to
91 process a character reference of the form \samp{\&\#\var{name};}. It
92 is intended to be overridden by a derived class; the base class
93 implementation does nothing.
94 \end{methoddesc}
96 \begin{methoddesc}{handle_entityref}{name}
97 This method is called to process a general entity reference of the
98 form \samp{\&\var{name};} where \var{name} is a general entity
99 reference. It is intended to be overridden by a derived class; the
100 base class implementation does nothing.
101 \end{methoddesc}
103 \begin{methoddesc}{handle_comment}{data}
104 This method is called when a comment is encountered. The
105 \var{data} argument is a string containing the text between the
106 \samp{<!--} and \samp{-->} delimiters, but not the delimiters
107 themselves. For example, the comment \samp{<!--text-->} will cause
108 this method to be called with the argument \code{'text'}. It is
109 intended to be overridden by a derived class; the base class
110 implementation does nothing.
111 \end{methoddesc}
113 \begin{methoddesc}{handle_decl}{decl}
114 Method called when an SGML declaration is read by the parser. The
115 \var{decl} parameter will be the entire contents of the declaration
116 inside the \code{<!}...\code{>} markup. It is intended to be overridden
117 by a derived class; the base class implementation does nothing.
118 \end{methoddesc}
121 \subsection{Example HTML Parser \label{htmlparser-example}}
123 As a basic example, below is a simple HTML parser that uses the
124 \class{HTMLParser} class to print out tags as they are encountered:
126 \begin{verbatim}
127 from HTMLParser import HTMLParser
129 class MyHTMLParser(HTMLParser):
131 def handle_starttag(self, tag, attrs):
132 print "Encountered the beginning of a %s tag" % tag
134 def handle_endtag(self, tag):
135 print "Encountered the end of a %s tag" % tag
136 \end{verbatim}