3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
10 from sgmllib
import SGMLParser
11 from formatter
import AS_IS
14 class HTMLParser(SGMLParser
):
16 from htmlentitydefs
import entitydefs
# Constructor: passes `verbose` on to SGMLParser.__init__ and stores the
# `formatter` argument on self.formatter.
# NOTE(review): only these statements are present in this view; other
# methods read attributes such as self.savedata, self.nofill and
# self.list_stack, so further initialisation presumably follows — confirm
# against the full file.
18 def __init__(self
, formatter
, verbose
=0):
19 SGMLParser
.__init
__(self
, verbose
)
20 self
.formatter
= formatter
30 # ------ Methods used internally; some may be overridden
32 # --- Formatter interface, taking care of 'savedata' mode;
33 # shouldn't need to be overridden
# Receives a chunk of character data.  When self.savedata is not None the
# chunk is appended to it; the remaining visible statements pass the chunk
# to the formatter via add_literal_data() or add_flowing_data().
# NOTE(review): the lines that choose between the two formatter calls are
# not present in this view — confirm the selecting condition against the
# full file before relying on this description.
35 def handle_data(self
, data
):
36 if self
.savedata
is not None:
37 self
.savedata
= self
.savedata
+ data
40 self
.formatter
.add_literal_data(data
)
42 self
.formatter
.add_flowing_data(data
)
44 # --- Hooks to save data; shouldn't need to be overridden
53 data
= string
.join(string
.split(data
))
56 # --- Hooks for anchors; should probably be overridden
# Anchor-begin hook: the visible statement appends `href` to
# self.anchorlist.  `name` and `type` are not used by the visible code.
# NOTE(review): the lines between the header and the append are not present
# in this view, so the append may be conditional — confirm against the full
# file.
58 def anchor_bgn(self
, href
, name
, type):
61 self
.anchorlist
.append(href
)
65 self
.handle_data("[%d]" % len(self
.anchorlist
))
68 # --- Hook for images; should probably be overridden
70 def handle_image(self
, src
, alt
, *args
):
73 # --------- Top level elements
def start_html(self, attrs):
    """Start-tag hook for ``html``; takes no action and returns None."""
    pass
def end_html(self):
    """End-tag hook for ``html``; takes no action and returns None."""
    pass
def start_head(self, attrs):
    """Start-tag hook for ``head``; takes no action and returns None."""
    pass
def end_head(self):
    """End-tag hook for ``head``; takes no action and returns None."""
    pass
def start_body(self, attrs):
    """Start-tag hook for ``body``; takes no action and returns None."""
    pass
def end_body(self):
    """End-tag hook for ``body``; takes no action and returns None."""
    pass
84 # ------ Head elements
86 def start_title(self
, attrs
):
90 self
.title
= self
.save_end()
92 def do_base(self
, attrs
):
97 def do_isindex(self
, attrs
):
100 def do_link(self
, attrs
):
103 def do_meta(self
, attrs
):
106 def do_nextid(self
, attrs
): # Deprecated
109 # ------ Body elements
def start_h1(self, attrs):
    """Start-tag hook for ``h1``.

    Calls ``end_paragraph(1)`` on the formatter, then pushes the font
    tuple ``('h1', 0, 1, 0)``.  ``attrs`` is accepted but not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h1', 0, 1, 0))
118 self
.formatter
.end_paragraph(1)
119 self
.formatter
.pop_font()
def start_h2(self, attrs):
    """Start-tag hook for ``h2``.

    Calls ``end_paragraph(1)`` on the formatter, then pushes the font
    tuple ``('h2', 0, 1, 0)``.  ``attrs`` is accepted but not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h2', 0, 1, 0))
126 self
.formatter
.end_paragraph(1)
127 self
.formatter
.pop_font()
def start_h3(self, attrs):
    """Start-tag hook for ``h3``.

    Calls ``end_paragraph(1)`` on the formatter, then pushes the font
    tuple ``('h3', 0, 1, 0)``.  ``attrs`` is accepted but not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h3', 0, 1, 0))
134 self
.formatter
.end_paragraph(1)
135 self
.formatter
.pop_font()
def start_h4(self, attrs):
    """Start-tag hook for ``h4``.

    Calls ``end_paragraph(1)`` on the formatter, then pushes the font
    tuple ``('h4', 0, 1, 0)``.  ``attrs`` is accepted but not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h4', 0, 1, 0))
142 self
.formatter
.end_paragraph(1)
143 self
.formatter
.pop_font()
def start_h5(self, attrs):
    """Start-tag hook for ``h5``.

    Calls ``end_paragraph(1)`` on the formatter, then pushes the font
    tuple ``('h5', 0, 1, 0)``.  ``attrs`` is accepted but not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h5', 0, 1, 0))
150 self
.formatter
.end_paragraph(1)
151 self
.formatter
.pop_font()
def start_h6(self, attrs):
    """Start-tag hook for ``h6``.

    Calls ``end_paragraph(1)`` on the formatter, then pushes the font
    tuple ``('h6', 0, 1, 0)``.  ``attrs`` is accepted but not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h6', 0, 1, 0))
158 self
.formatter
.end_paragraph(1)
159 self
.formatter
.pop_font()
161 # --- Block Structuring Elements
def do_p(self, attrs):
    """Hook for ``p``: calls ``end_paragraph(1)`` on the formatter.

    ``attrs`` is accepted but not used.
    """
    self.formatter.end_paragraph(1)
def start_pre(self, attrs):
    """Start-tag hook for ``pre``.

    Calls ``end_paragraph(1)`` on the formatter, pushes the font tuple
    ``(AS_IS, AS_IS, AS_IS, 1)`` and increments ``self.nofill`` by one.
    ``attrs`` is accepted but not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    # Counter rather than a flag, so nested callers each add one level.
    self.nofill += 1
172 self
.formatter
.end_paragraph(1)
173 self
.formatter
.pop_font()
174 self
.nofill
= max(0, self
.nofill
- 1)
def start_xmp(self, attrs):
    """Start-tag hook for ``xmp``.

    Delegates to ``start_pre(attrs)`` and then calls
    ``setliteral('xmp')``.
    """
    self.start_pre(attrs)
    self.setliteral('xmp')  # Tell SGML parser
def start_listing(self, attrs):
    """Start-tag hook for ``listing``.

    Delegates to ``start_pre(attrs)`` and then calls
    ``setliteral('listing')``.
    """
    self.start_pre(attrs)
    self.setliteral('listing')  # Tell SGML parser
187 def end_listing(self
):
def start_address(self, attrs):
    """Start-tag hook for ``address``.

    Calls ``end_paragraph(0)`` on the formatter, then pushes the font
    tuple ``(AS_IS, 1, AS_IS, AS_IS)``.  ``attrs`` is accepted but not
    used.
    """
    self.formatter.end_paragraph(0)
    self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
def end_address(self):
    """End-tag hook for ``address``.

    Calls ``end_paragraph(0)`` on the formatter, then ``pop_font()``.
    """
    self.formatter.end_paragraph(0)
    self.formatter.pop_font()
def start_blockquote(self, attrs):
    """Start-tag hook for ``blockquote``.

    Calls ``end_paragraph(1)`` on the formatter, then
    ``push_margin('blockquote')``.  ``attrs`` is accepted but not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_margin('blockquote')
def end_blockquote(self):
    """End-tag hook for ``blockquote``.

    Calls ``end_paragraph(1)`` on the formatter, then ``pop_margin()``.
    """
    self.formatter.end_paragraph(1)
    self.formatter.pop_margin()
def start_ul(self, attrs):
    """Start-tag hook for ``ul``.

    Calls ``end_paragraph`` on the formatter with a true argument only
    when ``self.list_stack`` is empty, then ``push_margin('ul')``, and
    finally appends the entry ``['ul', '*', 0]`` to ``self.list_stack``.
    ``attrs`` is accepted but not used.
    """
    # Evaluated before the append below, so it reflects the stack as it
    # was when the tag was encountered.
    self.formatter.end_paragraph(not self.list_stack)
    self.formatter.push_margin('ul')
    self.list_stack.append(['ul', '*', 0])
214 if self
.list_stack
: del self
.list_stack
[-1]
215 self
.formatter
.end_paragraph(not self
.list_stack
)
216 self
.formatter
.pop_margin()
# Hook for a list item.  Calls end_paragraph(0) on the formatter and ends
# by passing (label, counter) to the formatter's add_label_data().
# Two ways of obtaining label/counter are visible:
#   * unpack the top entry of self.list_stack and store the incremented
#     counter back into that entry (top[2]), or
#   * use the fixed pair '*', 0.
# NOTE(review): the lines that choose between these two are not present in
# this view — presumably the fixed pair is the fallback for an empty
# list_stack; confirm against the full file.
218 def do_li(self
, attrs
):
219 self
.formatter
.end_paragraph(0)
221 [dummy
, label
, counter
] = top
= self
.list_stack
[-1]
222 top
[2] = counter
= counter
+1
224 label
, counter
= '*', 0
225 self
.formatter
.add_label_data(label
, counter
)
# Start-tag hook for `ol`.  Calls end_paragraph() on the formatter with a
# true argument only when self.list_stack is empty, then push_margin('ol'),
# and finally appends ['ol', label, 0] to self.list_stack.
# The visible `if len(v) == 1` line appends '.' to a one-character value v.
# NOTE(review): the lines that define `label` and `v` are not present in
# this view — confirm how they are derived against the full file.
227 def start_ol(self
, attrs
):
228 self
.formatter
.end_paragraph(not self
.list_stack
)
229 self
.formatter
.push_margin('ol')
233 if len(v
) == 1: v
= v
+ '.'
235 self
.list_stack
.append(['ol', label
, 0])
238 if self
.list_stack
: del self
.list_stack
[-1]
239 self
.formatter
.end_paragraph(not self
.list_stack
)
240 self
.formatter
.pop_margin()
242 def start_menu(self
, attrs
):
248 def start_dir(self
, attrs
):
def start_dl(self, attrs):
    """Start-tag hook for ``dl``.

    Calls ``end_paragraph(1)`` on the formatter and appends the entry
    ``['dl', '', 0]`` to ``self.list_stack``.  ``attrs`` is accepted but
    not used.
    """
    self.formatter.end_paragraph(1)
    self.list_stack.append(['dl', '', 0])
260 if self
.list_stack
: del self
.list_stack
[-1]
262 def do_dt(self
, attrs
):
# Hook for `dd`.  The visible statements call push_margin('dd') on the
# formatter and append ['dd', '', 0] to self.list_stack.
# NOTE(review): the line between the header and push_margin is not present
# in this view — confirm against the full file what runs first.
265 def do_dd(self
, attrs
):
267 self
.formatter
.push_margin('dd')
268 self
.list_stack
.append(['dd', '', 0])
# Helper taking an optional `bl` (default 0), which is passed straight to
# the formatter's end_paragraph().  When the top entry of self.list_stack
# is tagged 'dd', the visible statements delete that entry and call
# pop_margin() on the formatter.
# NOTE(review): the line just before the 'dd' test is not present in this
# view — presumably a guard against an empty list_stack; confirm against
# the full file.
270 def ddpop(self
, bl
=0):
271 self
.formatter
.end_paragraph(bl
)
273 if self
.list_stack
[-1][0] == 'dd':
274 del self
.list_stack
[-1]
275 self
.formatter
.pop_margin()
def start_cite(self, attrs):
    """Start-tag hook for ``cite``; delegates to ``start_i(attrs)``."""
    self.start_i(attrs)
def end_cite(self):
    """End-tag hook for ``cite``; delegates to ``end_i()``."""
    self.end_i()
def start_code(self, attrs):
    """Start-tag hook for ``code``; delegates to ``start_tt(attrs)``."""
    self.start_tt(attrs)
def end_code(self):
    """End-tag hook for ``code``; delegates to ``end_tt()``."""
    self.end_tt()
def start_em(self, attrs):
    """Start-tag hook for ``em``; delegates to ``start_i(attrs)``."""
    self.start_i(attrs)
def end_em(self):
    """End-tag hook for ``em``; delegates to ``end_i()``."""
    self.end_i()
def start_kbd(self, attrs):
    """Start-tag hook for ``kbd``; delegates to ``start_tt(attrs)``."""
    self.start_tt(attrs)
def end_kbd(self):
    """End-tag hook for ``kbd``; delegates to ``end_tt()``."""
    self.end_tt()
def start_samp(self, attrs):
    """Start-tag hook for ``samp``; delegates to ``start_tt(attrs)``."""
    self.start_tt(attrs)
def end_samp(self):
    """End-tag hook for ``samp``; delegates to ``end_tt()``."""
    self.end_tt()
def start_strong(self, attrs):
    """Start-tag hook for ``strong``; delegates to ``start_b(attrs)``."""
    self.start_b(attrs)
def end_strong(self):
    """End-tag hook for ``strong``; delegates to ``end_b()``."""
    self.end_b()
def start_var(self, attrs):
    """Start-tag hook for ``var``; delegates to ``start_i(attrs)``."""
    self.start_i(attrs)
def end_var(self):
    """End-tag hook for ``var``; delegates to ``end_i()``."""
    self.end_i()
302 # Typographic Elements
def start_i(self, attrs):
    """Start-tag hook for ``i``.

    Pushes the font tuple ``(AS_IS, 1, AS_IS, AS_IS)`` on the formatter,
    i.e. only the second slot is set.  ``attrs`` is accepted but not used.
    """
    self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
307 self
.formatter
.pop_font()
def start_b(self, attrs):
    """Start-tag hook for ``b``.

    Pushes the font tuple ``(AS_IS, AS_IS, 1, AS_IS)`` on the formatter,
    i.e. only the third slot is set.  ``attrs`` is accepted but not used.
    """
    self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
312 self
.formatter
.pop_font()
def start_tt(self, attrs):
    """Start-tag hook for ``tt``.

    Pushes the font tuple ``(AS_IS, AS_IS, AS_IS, 1)`` on the formatter,
    i.e. only the fourth slot is set.  ``attrs`` is accepted but not used.
    """
    self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
317 self
.formatter
.pop_font()
# Start-tag hook for `a`.  Iterates over the (attrname, value) pairs in
# `attrs`, stripping whitespace from each value with string.strip(), and
# tests the attribute name against 'href', 'name' and 'type'; a 'type'
# value is lower-cased with string.lower().  Finishes by calling
# self.anchor_bgn(href, name, type).
# NOTE(review): the initial values of href/name/type and the bodies of the
# 'href' and 'name' branches are not present in this view — confirm against
# the full file.
319 def start_a(self
, attrs
):
323 for attrname
, value
in attrs
:
324 value
= string
.strip(value
)
325 if attrname
== 'href':
327 if attrname
== 'name':
329 if attrname
== 'type':
330 type = string
.lower(value
)
331 self
.anchor_bgn(href
, name
, type)
def do_br(self, attrs):
    """Hook for ``br``: calls ``add_line_break()`` on the formatter.

    ``attrs`` is accepted but not used.
    """
    self.formatter.add_line_break()
341 # --- Horizontal Rule
def do_hr(self, attrs):
    """Hook for ``hr``: calls ``add_hor_rule()`` on the formatter.

    ``attrs`` is accepted but not used.
    """
    self.formatter.add_hor_rule()
# Hook for `img`.  Iterates over the (attrname, value) pairs in `attrs`,
# testing the name against 'align', 'alt', 'ismap', 'src', 'width' and
# 'height'; width and height values are converted with string.atoi()
# inside a `try`.  Finishes by calling
# self.handle_image(src, alt, ismap, align, width, height).
# NOTE(review): the default values of these six locals, the bodies of the
# first four branches and the `except` clauses of the two `try` statements
# are not present in this view — confirm against the full file.
348 def do_img(self
, attrs
):
355 for attrname
, value
in attrs
:
356 if attrname
== 'align':
358 if attrname
== 'alt':
360 if attrname
== 'ismap':
362 if attrname
== 'src':
364 if attrname
== 'width':
365 try: width
= string
.atoi(value
)
367 if attrname
== 'height':
368 try: height
= string
.atoi(value
)
370 self
.handle_image(src
, alt
, ismap
, align
, width
, height
)
372 # --- Really Old Unofficial Deprecated Stuff
def do_plaintext(self, attrs):
    """Hook for ``plaintext``.

    Delegates to ``start_pre(attrs)`` and then calls ``setnomoretags()``.
    """
    self.start_pre(attrs)
    self.setnomoretags()  # Tell SGML parser
380 def unknown_starttag(self
, tag
, attrs
):
383 def unknown_endtag(self
, tag
):
# Module test driver taking an optional `args` sequence (default None).
# `silent` is truthy only when args is non-empty and its first element is
# '-s'.  The visible statements build either a formatter.NullFormatter()
# or a formatter.AbstractFormatter wrapping a formatter.DumbWriter(), and
# compare a variable `f` against sys.stdin.
# NOTE(review): the name `f` is bound both in the stdin comparison and to
# the formatter objects; the intervening lines (input handling, the branch
# that picks the formatter, and the parser run) are not present in this
# view — confirm against the full file.
387 def test(args
= None):
388 import sys
, formatter
393 silent
= args
and args
[0] == '-s'
413 if f
is not sys
.stdin
:
417 f
= formatter
.NullFormatter()
419 f
= formatter
.AbstractFormatter(formatter
.DumbWriter())
426 if __name__
== '__main__':