3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
8 from sgmllib
import SGMLParser
9 from formatter
import AS_IS
11 __all__
= ["HTMLParser"]
13 class HTMLParser(SGMLParser
):
15 from htmlentitydefs
import entitydefs
17 def __init__(self
, formatter
, verbose
=0):
18 SGMLParser
.__init
__(self
, verbose
)
19 self
.formatter
= formatter
29 # ------ Methods used internally; some may be overridden
31 # --- Formatter interface, taking care of 'savedata' mode;
32 # shouldn't need to be overridden
34 def handle_data(self
, data
):
35 if self
.savedata
is not None:
36 self
.savedata
= self
.savedata
+ data
39 self
.formatter
.add_literal_data(data
)
41 self
.formatter
.add_flowing_data(data
)
43 # --- Hooks to save data; shouldn't need to be overridden
52 data
= ' '.join(data
.split())
55 # --- Hooks for anchors; should probably be overridden
57 def anchor_bgn(self
, href
, name
, type):
60 self
.anchorlist
.append(href
)
64 self
.handle_data("[%d]" % len(self
.anchorlist
))
67 # --- Hook for images; should probably be overridden
69 def handle_image(self
, src
, alt
, *args
):
# --------- Top level elements
def start_html(self, attrs):
    """Handler for the ``html`` start tag; intentionally does nothing."""
def end_html(self):
    """Handler for the ``html`` end tag; intentionally does nothing."""
def start_head(self, attrs):
    """Handler for the ``head`` start tag; intentionally does nothing."""
def end_head(self):
    """Handler for the ``head`` end tag; intentionally does nothing."""
def start_body(self, attrs):
    """Handler for the ``body`` start tag; intentionally does nothing."""
def end_body(self):
    """Handler for the ``body`` end tag; intentionally does nothing."""
83 # ------ Head elements
85 def start_title(self
, attrs
):
89 self
.title
= self
.save_end()
91 def do_base(self
, attrs
):
96 def do_isindex(self
, attrs
):
99 def do_link(self
, attrs
):
102 def do_meta(self
, attrs
):
105 def do_nextid(self
, attrs
): # Deprecated
108 # ------ Body elements
def start_h1(self, attrs):
    """Handler for the ``h1`` start tag.

    Ends the current paragraph (argument 1) and pushes the font
    specification ``('h1', 0, 1, 0)`` onto the formatter.  ``attrs`` is
    accepted but not used.
    """
    fmt = self.formatter
    heading_font = ('h1', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
117 self
.formatter
.end_paragraph(1)
118 self
.formatter
.pop_font()
def start_h2(self, attrs):
    """Handler for the ``h2`` start tag.

    Ends the current paragraph (argument 1) and pushes the font
    specification ``('h2', 0, 1, 0)`` onto the formatter.  ``attrs`` is
    accepted but not used.
    """
    fmt = self.formatter
    heading_font = ('h2', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
125 self
.formatter
.end_paragraph(1)
126 self
.formatter
.pop_font()
def start_h3(self, attrs):
    """Handler for the ``h3`` start tag.

    Ends the current paragraph (argument 1) and pushes the font
    specification ``('h3', 0, 1, 0)`` onto the formatter.  ``attrs`` is
    accepted but not used.
    """
    fmt = self.formatter
    heading_font = ('h3', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
133 self
.formatter
.end_paragraph(1)
134 self
.formatter
.pop_font()
def start_h4(self, attrs):
    """Handler for the ``h4`` start tag.

    Ends the current paragraph (argument 1) and pushes the font
    specification ``('h4', 0, 1, 0)`` onto the formatter.  ``attrs`` is
    accepted but not used.
    """
    fmt = self.formatter
    heading_font = ('h4', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
141 self
.formatter
.end_paragraph(1)
142 self
.formatter
.pop_font()
def start_h5(self, attrs):
    """Handler for the ``h5`` start tag.

    Ends the current paragraph (argument 1) and pushes the font
    specification ``('h5', 0, 1, 0)`` onto the formatter.  ``attrs`` is
    accepted but not used.
    """
    fmt = self.formatter
    heading_font = ('h5', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
149 self
.formatter
.end_paragraph(1)
150 self
.formatter
.pop_font()
def start_h6(self, attrs):
    """Handler for the ``h6`` start tag.

    Ends the current paragraph (argument 1) and pushes the font
    specification ``('h6', 0, 1, 0)`` onto the formatter.  ``attrs`` is
    accepted but not used.
    """
    fmt = self.formatter
    heading_font = ('h6', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
157 self
.formatter
.end_paragraph(1)
158 self
.formatter
.pop_font()
160 # --- Block Structuring Elements
def do_p(self, attrs):
    """Handler for the ``p`` tag: end the current paragraph (argument 1).

    ``attrs`` is accepted but not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
def start_pre(self, attrs):
    """Handler for the ``pre`` start tag.

    Ends the current paragraph (argument 1), pushes a font specification
    whose last field is 1 and whose other fields are ``AS_IS``, and
    increments the ``nofill`` counter.  ``attrs`` is accepted but not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
    # Only the fourth field is set; presumably the fixed-width flag --
    # confirm against the formatter's font tuple layout.
    pre_font = (AS_IS, AS_IS, AS_IS, 1)
    fmt.push_font(pre_font)
    self.nofill += 1
171 self
.formatter
.end_paragraph(1)
172 self
.formatter
.pop_font()
173 self
.nofill
= max(0, self
.nofill
- 1)
def start_xmp(self, attrs):
    """Handler for the ``xmp`` start tag.

    Reuses ``start_pre`` for the formatter setup, then calls
    ``setliteral('xmp')``.
    """
    literal_tag = 'xmp'
    self.start_pre(attrs)
    self.setliteral(literal_tag)  # Tell SGML parser
def start_listing(self, attrs):
    """Handler for the ``listing`` start tag.

    Reuses ``start_pre`` for the formatter setup, then calls
    ``setliteral('listing')``.
    """
    literal_tag = 'listing'
    self.start_pre(attrs)
    self.setliteral(literal_tag)  # Tell SGML parser
186 def end_listing(self
):
def start_address(self, attrs):
    """Handler for the ``address`` start tag.

    Ends the current paragraph (argument 0) and pushes a font
    specification whose second field is 1 and whose other fields are
    ``AS_IS``.  ``attrs`` is accepted but not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(0)
    # Only the second field is set; presumably the italic flag -- confirm
    # against the formatter's font tuple layout.
    address_font = (AS_IS, 1, AS_IS, AS_IS)
    fmt.push_font(address_font)
def end_address(self):
    """Handler for the ``address`` end tag.

    Ends the current paragraph (argument 0) and pops the most recently
    pushed font from the formatter.
    """
    fmt = self.formatter
    fmt.end_paragraph(0)
    fmt.pop_font()
def start_blockquote(self, attrs):
    """Handler for the ``blockquote`` start tag.

    Ends the current paragraph (argument 1) and pushes a margin labelled
    ``'blockquote'``.  ``attrs`` is accepted but not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
    fmt.push_margin('blockquote')
def end_blockquote(self):
    """Handler for the ``blockquote`` end tag.

    Ends the current paragraph (argument 1) and pops the most recently
    pushed margin from the formatter.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
    fmt.pop_margin()
def start_ul(self, attrs):
    """Handler for the ``ul`` start tag.

    Ends the current paragraph, passing a true value only when no list is
    currently open (``list_stack`` is empty), pushes a ``'ul'`` margin, and
    records a new ``['ul', '*', 0]`` entry on ``list_stack``.  ``attrs`` is
    accepted but not used.
    """
    fmt = self.formatter
    no_open_list = not self.list_stack
    fmt.end_paragraph(no_open_list)
    fmt.push_margin('ul')
    new_entry = ['ul', '*', 0]
    self.list_stack.append(new_entry)
213 if self
.list_stack
: del self
.list_stack
[-1]
214 self
.formatter
.end_paragraph(not self
.list_stack
)
215 self
.formatter
.pop_margin()
217 def do_li(self
, attrs
):
218 self
.formatter
.end_paragraph(0)
220 [dummy
, label
, counter
] = top
= self
.list_stack
[-1]
221 top
[2] = counter
= counter
+1
223 label
, counter
= '*', 0
224 self
.formatter
.add_label_data(label
, counter
)
226 def start_ol(self
, attrs
):
227 self
.formatter
.end_paragraph(not self
.list_stack
)
228 self
.formatter
.push_margin('ol')
232 if len(v
) == 1: v
= v
+ '.'
234 self
.list_stack
.append(['ol', label
, 0])
237 if self
.list_stack
: del self
.list_stack
[-1]
238 self
.formatter
.end_paragraph(not self
.list_stack
)
239 self
.formatter
.pop_margin()
241 def start_menu(self
, attrs
):
247 def start_dir(self
, attrs
):
def start_dl(self, attrs):
    """Handler for the ``dl`` start tag.

    Ends the current paragraph (argument 1) and records a new
    ``['dl', '', 0]`` entry on ``list_stack``.  ``attrs`` is accepted but
    not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
    new_entry = ['dl', '', 0]
    self.list_stack.append(new_entry)
259 if self
.list_stack
: del self
.list_stack
[-1]
261 def do_dt(self
, attrs
):
264 def do_dd(self
, attrs
):
266 self
.formatter
.push_margin('dd')
267 self
.list_stack
.append(['dd', '', 0])
269 def ddpop(self
, bl
=0):
270 self
.formatter
.end_paragraph(bl
)
272 if self
.list_stack
[-1][0] == 'dd':
273 del self
.list_stack
[-1]
274 self
.formatter
.pop_margin()
def start_cite(self, attrs):
    """Handler for the ``cite`` start tag; delegates to ``start_i``."""
    self.start_i(attrs)
def end_cite(self):
    """Handler for the ``cite`` end tag; delegates to ``end_i``."""
    self.end_i()
def start_code(self, attrs):
    """Handler for the ``code`` start tag; delegates to ``start_tt``."""
    self.start_tt(attrs)
def end_code(self):
    """Handler for the ``code`` end tag; delegates to ``end_tt``."""
    self.end_tt()
def start_em(self, attrs):
    """Handler for the ``em`` start tag; delegates to ``start_i``."""
    self.start_i(attrs)
def end_em(self):
    """Handler for the ``em`` end tag; delegates to ``end_i``."""
    self.end_i()
def start_kbd(self, attrs):
    """Handler for the ``kbd`` start tag; delegates to ``start_tt``."""
    self.start_tt(attrs)
def end_kbd(self):
    """Handler for the ``kbd`` end tag; delegates to ``end_tt``."""
    self.end_tt()
def start_samp(self, attrs):
    """Handler for the ``samp`` start tag; delegates to ``start_tt``."""
    self.start_tt(attrs)
def end_samp(self):
    """Handler for the ``samp`` end tag; delegates to ``end_tt``."""
    self.end_tt()
def start_strong(self, attrs):
    """Handler for the ``strong`` start tag; delegates to ``start_b``."""
    self.start_b(attrs)
def end_strong(self):
    """Handler for the ``strong`` end tag; delegates to ``end_b``."""
    self.end_b()
def start_var(self, attrs):
    """Handler for the ``var`` start tag; delegates to ``start_i``."""
    self.start_i(attrs)
def end_var(self):
    """Handler for the ``var`` end tag; delegates to ``end_i``."""
    self.end_i()
301 # Typographic Elements
def start_i(self, attrs):
    """Handler for the ``i`` start tag.

    Pushes a font specification whose second field is 1 and whose other
    fields are ``AS_IS``.  ``attrs`` is accepted but not used.
    """
    # Presumably the second field is the italic flag -- confirm against
    # the formatter's font tuple layout.
    italic_font = (AS_IS, 1, AS_IS, AS_IS)
    self.formatter.push_font(italic_font)
306 self
.formatter
.pop_font()
def start_b(self, attrs):
    """Handler for the ``b`` start tag.

    Pushes a font specification whose third field is 1 and whose other
    fields are ``AS_IS``.  ``attrs`` is accepted but not used.
    """
    # Presumably the third field is the bold flag -- confirm against the
    # formatter's font tuple layout.
    bold_font = (AS_IS, AS_IS, 1, AS_IS)
    self.formatter.push_font(bold_font)
311 self
.formatter
.pop_font()
def start_tt(self, attrs):
    """Handler for the ``tt`` start tag.

    Pushes a font specification whose fourth field is 1 and whose other
    fields are ``AS_IS``.  ``attrs`` is accepted but not used.
    """
    # Presumably the fourth field is the fixed-width flag -- confirm
    # against the formatter's font tuple layout.
    teletype_font = (AS_IS, AS_IS, AS_IS, 1)
    self.formatter.push_font(teletype_font)
316 self
.formatter
.pop_font()
318 def start_a(self
, attrs
):
322 for attrname
, value
in attrs
:
323 value
= value
.strip()
324 if attrname
== 'href':
326 if attrname
== 'name':
328 if attrname
== 'type':
330 self
.anchor_bgn(href
, name
, type)
def do_br(self, attrs):
    """Handler for the ``br`` tag: ask the formatter for a line break.

    ``attrs`` is accepted but not used.
    """
    fmt = self.formatter
    fmt.add_line_break()
340 # --- Horizontal Rule
def do_hr(self, attrs):
    """Handler for the ``hr`` tag: ask the formatter for a horizontal rule.

    ``attrs`` is accepted but not used.
    """
    fmt = self.formatter
    fmt.add_hor_rule()
347 def do_img(self
, attrs
):
354 for attrname
, value
in attrs
:
355 if attrname
== 'align':
357 if attrname
== 'alt':
359 if attrname
== 'ismap':
361 if attrname
== 'src':
363 if attrname
== 'width':
364 try: width
= int(value
)
365 except ValueError: pass
366 if attrname
== 'height':
367 try: height
= int(value
)
368 except ValueError: pass
369 self
.handle_image(src
, alt
, ismap
, align
, width
, height
)
371 # --- Really Old Unofficial Deprecated Stuff
def do_plaintext(self, attrs):
    """Handler for the ``plaintext`` tag.

    Reuses ``start_pre`` for the formatter setup, then calls
    ``setnomoretags()``.
    """
    self.start_pre(attrs)
    stop_tag_parsing = self.setnomoretags
    stop_tag_parsing()  # Tell SGML parser
379 def unknown_starttag(self
, tag
, attrs
):
382 def unknown_endtag(self
, tag
):
386 def test(args
= None):
387 import sys
, formatter
392 silent
= args
and args
[0] == '-s'
412 if f
is not sys
.stdin
:
416 f
= formatter
.NullFormatter()
418 f
= formatter
.AbstractFormatter(formatter
.DumbWriter())
425 if __name__
== '__main__':