3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
8 from sgmllib
import SGMLParser
9 from formatter
import AS_IS
11 __all__
= ["HTMLParser"]
13 class HTMLParser(SGMLParser
):
14 """This is the basic HTML parser class.
16 It supports all entity names required by the XHTML 1.0 Recommendation.
17 It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
22 from htmlentitydefs
import entitydefs
def __init__(self, formatter, verbose=0):
    """Create an instance of the HTMLParser class.

    formatter -- the formatter instance associated with this parser;
                 it is stored as ``self.formatter``.
    verbose   -- passed through unchanged to SGMLParser.__init__().
    """
    SGMLParser.__init__(self, verbose)
    self.formatter = formatter
35 SGMLParser
.reset(self
)
45 # ------ Methods used internally; some may be overridden
47 # --- Formatter interface, taking care of 'savedata' mode;
48 # shouldn't need to be overridden
50 def handle_data(self
, data
):
51 if self
.savedata
is not None:
52 self
.savedata
= self
.savedata
+ data
55 self
.formatter
.add_literal_data(data
)
57 self
.formatter
.add_flowing_data(data
)
59 # --- Hooks to save data; shouldn't need to be overridden
62 """Begins saving character data in a buffer instead of sending it
63 to the formatter object.
65 Retrieve the stored data via the save_end() method. Use of the
66 save_bgn() / save_end() pair may not be nested.
72 """Ends buffering character data and returns all data saved since
73 the preceding call to the save_bgn() method.
75 If the nofill flag is false, whitespace is collapsed to single
76 spaces. A call to this method without a preceding call to the
77 save_bgn() method will raise a TypeError exception.
83 data
= ' '.join(data
.split())
86 # --- Hooks for anchors; should probably be overridden
88 def anchor_bgn(self
, href
, name
, type):
89 """This method is called at the start of an anchor region.
91 The arguments correspond to the attributes of the <A> tag with
92 the same names. The default implementation maintains a list of
93 hyperlinks (defined by the HREF attribute for <A> tags) within
94 the document. The list of hyperlinks is available as the data
100 self
.anchorlist
.append(href
)
102 def anchor_end(self
):
103 """This method is called at the end of an anchor region.
105 The default implementation adds a textual footnote marker using an
106 index into the list of hyperlinks created by the anchor_bgn() method.
110 self
.handle_data("[%d]" % len(self
.anchorlist
))
113 # --- Hook for images; should probably be overridden
def handle_image(self, src, alt, *args):
    """Handle an image.

    The default implementation simply passes the alt value to the
    handle_data() method; src and any extra positional arguments are
    accepted but not used here.
    """
    self.handle_data(alt)
124 # --------- Top level elements
def start_html(self, attrs):
    """No-op hook for the opening <html> tag; attrs is ignored."""
    pass
def end_html(self):
    """No-op hook for the closing </html> tag."""
    pass
def start_head(self, attrs):
    """No-op hook for the opening <head> tag; attrs is ignored."""
    pass
def end_head(self):
    """No-op hook for the closing </head> tag."""
    pass
def start_body(self, attrs):
    """No-op hook for the opening <body> tag; attrs is ignored."""
    pass
def end_body(self):
    """No-op hook for the closing </body> tag."""
    pass
135 # ------ Head elements
137 def start_title(self
, attrs
):
141 self
.title
= self
.save_end()
143 def do_base(self
, attrs
):
148 def do_isindex(self
, attrs
):
151 def do_link(self
, attrs
):
154 def do_meta(self
, attrs
):
157 def do_nextid(self
, attrs
): # Deprecated
160 # ------ Body elements
def start_h1(self, attrs):
    """Open an <h1> heading.

    Ends the current paragraph via end_paragraph(1), then pushes the
    font spec ('h1', 0, 1, 0) on the formatter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h1', 0, 1, 0))
169 self
.formatter
.end_paragraph(1)
170 self
.formatter
.pop_font()
def start_h2(self, attrs):
    """Open an <h2> heading.

    Ends the current paragraph via end_paragraph(1), then pushes the
    font spec ('h2', 0, 1, 0) on the formatter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h2', 0, 1, 0))
177 self
.formatter
.end_paragraph(1)
178 self
.formatter
.pop_font()
def start_h3(self, attrs):
    """Open an <h3> heading.

    Ends the current paragraph via end_paragraph(1), then pushes the
    font spec ('h3', 0, 1, 0) on the formatter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h3', 0, 1, 0))
185 self
.formatter
.end_paragraph(1)
186 self
.formatter
.pop_font()
def start_h4(self, attrs):
    """Open an <h4> heading.

    Ends the current paragraph via end_paragraph(1), then pushes the
    font spec ('h4', 0, 1, 0) on the formatter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h4', 0, 1, 0))
193 self
.formatter
.end_paragraph(1)
194 self
.formatter
.pop_font()
def start_h5(self, attrs):
    """Open an <h5> heading.

    Ends the current paragraph via end_paragraph(1), then pushes the
    font spec ('h5', 0, 1, 0) on the formatter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h5', 0, 1, 0))
201 self
.formatter
.end_paragraph(1)
202 self
.formatter
.pop_font()
def start_h6(self, attrs):
    """Open an <h6> heading.

    Ends the current paragraph via end_paragraph(1), then pushes the
    font spec ('h6', 0, 1, 0) on the formatter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h6', 0, 1, 0))
209 self
.formatter
.end_paragraph(1)
210 self
.formatter
.pop_font()
212 # --- Block Structuring Elements
def do_p(self, attrs):
    """Handle a <p> tag by ending the current paragraph.

    Calls end_paragraph(1) on the formatter; attrs is not used.
    """
    self.formatter.end_paragraph(1)
def start_pre(self, attrs):
    """Open a <pre> region.

    Ends the current paragraph via end_paragraph(1), pushes a font spec
    whose first three fields are AS_IS and whose last field is 1, and
    increments the self.nofill counter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    # nofill is a counter rather than a flag: each start_pre adds one.
    self.nofill += 1
223 self
.formatter
.end_paragraph(1)
224 self
.formatter
.pop_font()
225 self
.nofill
= max(0, self
.nofill
- 1)
def start_xmp(self, attrs):
    """Open an <xmp> region.

    Delegates to start_pre(attrs), then calls setliteral('xmp').
    """
    self.start_pre(attrs)
    self.setliteral('xmp')  # Tell SGML parser
def start_listing(self, attrs):
    """Open a <listing> region.

    Delegates to start_pre(attrs), then calls setliteral('listing').
    """
    self.start_pre(attrs)
    self.setliteral('listing')  # Tell SGML parser
238 def end_listing(self
):
def start_address(self, attrs):
    """Open an <address> region.

    Ends the current paragraph via end_paragraph(0), then pushes a font
    spec whose second field is 1 and whose other fields are AS_IS.
    attrs is not used.
    """
    self.formatter.end_paragraph(0)
    self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
def end_address(self):
    """Close an <address> region.

    Ends the current paragraph via end_paragraph(0), then pops the most
    recently pushed font from the formatter.
    """
    self.formatter.end_paragraph(0)
    self.formatter.pop_font()
def start_blockquote(self, attrs):
    """Open a <blockquote> region.

    Ends the current paragraph via end_paragraph(1), then pushes the
    'blockquote' margin on the formatter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_margin('blockquote')
def end_blockquote(self):
    """Close a <blockquote> region.

    Ends the current paragraph via end_paragraph(1), then pops the most
    recently pushed margin from the formatter.
    """
    self.formatter.end_paragraph(1)
    self.formatter.pop_margin()
def start_ul(self, attrs):
    """Open a <ul> list.

    Ends the current paragraph, pushes the 'ul' margin, and appends the
    entry ['ul', '*', 0] to self.list_stack.  attrs is not used.
    """
    # end_paragraph receives True only while list_stack is still empty,
    # so the value differs between an outermost and a nested list.
    self.formatter.end_paragraph(not self.list_stack)
    self.formatter.push_margin('ul')
    self.list_stack.append(['ul', '*', 0])
265 if self
.list_stack
: del self
.list_stack
[-1]
266 self
.formatter
.end_paragraph(not self
.list_stack
)
267 self
.formatter
.pop_margin()
269 def do_li(self
, attrs
):
270 self
.formatter
.end_paragraph(0)
272 [dummy
, label
, counter
] = top
= self
.list_stack
[-1]
273 top
[2] = counter
= counter
+1
275 label
, counter
= '*', 0
276 self
.formatter
.add_label_data(label
, counter
)
278 def start_ol(self
, attrs
):
279 self
.formatter
.end_paragraph(not self
.list_stack
)
280 self
.formatter
.push_margin('ol')
284 if len(v
) == 1: v
= v
+ '.'
286 self
.list_stack
.append(['ol', label
, 0])
289 if self
.list_stack
: del self
.list_stack
[-1]
290 self
.formatter
.end_paragraph(not self
.list_stack
)
291 self
.formatter
.pop_margin()
293 def start_menu(self
, attrs
):
299 def start_dir(self
, attrs
):
def start_dl(self, attrs):
    """Open a <dl> list.

    Ends the current paragraph via end_paragraph(1) and appends the
    entry ['dl', '', 0] to self.list_stack.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.list_stack.append(['dl', '', 0])
311 if self
.list_stack
: del self
.list_stack
[-1]
313 def do_dt(self
, attrs
):
316 def do_dd(self
, attrs
):
318 self
.formatter
.push_margin('dd')
319 self
.list_stack
.append(['dd', '', 0])
321 def ddpop(self
, bl
=0):
322 self
.formatter
.end_paragraph(bl
)
324 if self
.list_stack
[-1][0] == 'dd':
325 del self
.list_stack
[-1]
326 self
.formatter
.pop_margin()
def start_cite(self, attrs):
    """Open a <cite> region by delegating to start_i(attrs)."""
    self.start_i(attrs)
def end_cite(self):
    """Close a <cite> region by delegating to end_i()."""
    self.end_i()
def start_code(self, attrs):
    """Open a <code> region by delegating to start_tt(attrs)."""
    self.start_tt(attrs)
def end_code(self):
    """Close a <code> region by delegating to end_tt()."""
    self.end_tt()
def start_em(self, attrs):
    """Open an <em> region by delegating to start_i(attrs)."""
    self.start_i(attrs)
def end_em(self):
    """Close an <em> region by delegating to end_i()."""
    self.end_i()
def start_kbd(self, attrs):
    """Open a <kbd> region by delegating to start_tt(attrs)."""
    self.start_tt(attrs)
def end_kbd(self):
    """Close a <kbd> region by delegating to end_tt()."""
    self.end_tt()
def start_samp(self, attrs):
    """Open a <samp> region by delegating to start_tt(attrs)."""
    self.start_tt(attrs)
def end_samp(self):
    """Close a <samp> region by delegating to end_tt()."""
    self.end_tt()
def start_strong(self, attrs):
    """Open a <strong> region by delegating to start_b(attrs)."""
    self.start_b(attrs)
def end_strong(self):
    """Close a <strong> region by delegating to end_b()."""
    self.end_b()
def start_var(self, attrs):
    """Open a <var> region by delegating to start_i(attrs)."""
    self.start_i(attrs)
def end_var(self):
    """Close a <var> region by delegating to end_i()."""
    self.end_i()
353 # Typographic Elements
def start_i(self, attrs):
    """Open an <i> region.

    Pushes a font spec whose second field is 1 and whose other fields
    are AS_IS.  attrs is not used.
    """
    self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
358 self
.formatter
.pop_font()
def start_b(self, attrs):
    """Open a <b> region.

    Pushes a font spec whose third field is 1 and whose other fields
    are AS_IS.  attrs is not used.
    """
    self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
363 self
.formatter
.pop_font()
def start_tt(self, attrs):
    """Open a <tt> region.

    Pushes a font spec whose fourth field is 1 and whose other fields
    are AS_IS.  attrs is not used.
    """
    self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
368 self
.formatter
.pop_font()
370 def start_a(self
, attrs
):
374 for attrname
, value
in attrs
:
375 value
= value
.strip()
376 if attrname
== 'href':
378 if attrname
== 'name':
380 if attrname
== 'type':
382 self
.anchor_bgn(href
, name
, type)
def do_br(self, attrs):
    """Handle a <br> tag by asking the formatter for a line break.

    attrs is not used.
    """
    self.formatter.add_line_break()
392 # --- Horizontal Rule
def do_hr(self, attrs):
    """Handle an <hr> tag by asking the formatter for a horizontal rule.

    attrs is not used.
    """
    self.formatter.add_hor_rule()
399 def do_img(self
, attrs
):
406 for attrname
, value
in attrs
:
407 if attrname
== 'align':
409 if attrname
== 'alt':
411 if attrname
== 'ismap':
413 if attrname
== 'src':
415 if attrname
== 'width':
416 try: width
= int(value
)
417 except ValueError: pass
418 if attrname
== 'height':
419 try: height
= int(value
)
420 except ValueError: pass
421 self
.handle_image(src
, alt
, ismap
, align
, width
, height
)
423 # --- Really Old Unofficial Deprecated Stuff
def do_plaintext(self, attrs):
    """Handle a <plaintext> tag.

    Delegates to start_pre(attrs), then calls setnomoretags().
    """
    self.start_pre(attrs)
    self.setnomoretags()  # Tell SGML parser
431 def unknown_starttag(self
, tag
, attrs
):
434 def unknown_endtag(self
, tag
):
438 def test(args
= None):
439 import sys
, formatter
444 silent
= args
and args
[0] == '-s'
464 if f
is not sys
.stdin
:
468 f
= formatter
.NullFormatter()
470 f
= formatter
.AbstractFormatter(formatter
.DumbWriter())
477 if __name__
== '__main__':