Merged release21-maint changes.
[python/dscho.git] / Lib / htmllib.py
blob 446192f5513ff027f298085edcaaf01914cec6d0
1 """HTML 2.0 parser.
3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
5 """
8 from sgmllib import SGMLParser
9 from formatter import AS_IS
# Public API: only the parser class is exported by "from htmllib import *".
__all__ = ["HTMLParser"]
class HTMLParser(SGMLParser):
    """HTML 2.0 parser that renders a document through a formatter object.

    Tag handlers follow the sgmllib naming scheme (start_TAG / end_TAG for
    container elements, do_TAG for empty elements) and translate markup
    into calls on self.formatter.

    Instance attributes:
      formatter  -- the formatter passed to the constructor
      savedata   -- None, or the text buffered since save_bgn()
      isindex    -- set to 1 once an <isindex> tag has been seen
      title      -- text of the <title> element, or None
      base       -- href of the <base> element, or None
      anchor     -- href of the anchor currently open, or None
      anchorlist -- every non-empty href seen so far, in document order
      nofill     -- nesting depth of literal (<pre>-like) elements
      list_stack -- one [kind, label, counter] entry per open list
    """

    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Create a parser that sends its output to *formatter*.

        *verbose* is handed to SGMLParser unchanged.
        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter
        # SGMLParser.__init__ is documented to call reset(), but a subclass
        # may override reset() without chaining to ours; initialise the
        # HTML state explicitly so it is always present after construction.
        self._init_html_state()

    def reset(self):
        """Reset the parser, including all HTML-specific document state.

        Without this override a reused parser would keep the title, base,
        anchor list, list nesting and literal-mode depth of the previous
        document.
        """
        SGMLParser.reset(self)
        self._init_html_state()

    def _init_html_state(self):
        # Set every per-document attribute to its initial value.
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Buffer *data* while saving, otherwise send it to the formatter."""
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Start buffering character data instead of formatting it."""
        self.savedata = ''

    def save_end(self):
        """Stop buffering and return the text saved since save_bgn().

        Outside literal mode (nofill == 0) runs of whitespace are
        collapsed to single spaces.  Calling this without a preceding
        save_bgn() fails, because there is no buffer to split.
        """
        data = self.savedata
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """Note the start of an anchor; non-empty hrefs go on anchorlist."""
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """Close the current anchor, emitting a "[n]" footnote marker.

        n is the current length of anchorlist.  Nothing is emitted for
        anchors that had no href.
        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """Render an image; this default just emits the *alt* text.

        do_img() also passes ismap, align, width and height, which are
        accepted through *args and ignored here.
        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        # The title text is captured rather than rendered.
        self.save_bgn()

    def end_title(self):
        self.title = self.save_end()

    def do_base(self, attrs):
        # If several href attributes are present, the last one wins.
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings
    # Each heading ends the current paragraph and pushes a font tuple whose
    # first item is the heading name and whose third item is 1.  The
    # formatter module documents the tuple as (size, italic, bold,
    # teletype).

    def start_h1(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h1', 0, 1, 0))

    def end_h1(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h2(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h2', 0, 1, 0))

    def end_h2(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h3(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h3', 0, 1, 0))

    def end_h3(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h4(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h4', 0, 1, 0))

    def end_h4(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h5(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h5', 0, 1, 0))

    def end_h5(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h6(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h6', 0, 1, 0))

    def end_h6(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter literal mode: handle_data() stops reflowing text."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leave one level of literal mode."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        # Clamp at zero so a stray </pre> cannot make the depth negative.
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        # "not self.list_stack" is true only for an outermost list, so
        # nested lists end the paragraph with 0 instead of 1.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for a list item and advance the list counter."""
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            # The entry is mutated in place so the count persists.
            top[2] = top[2] + 1
            label, counter = top[1], top[2]
        else:
            # <li> outside any list: bullet label, counter 0.
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                # A one-character type such as "a" or "I" gets a dot.
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        # Close a pending <dd> first, then drop the 'dl' entry itself.
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        self.ddpop()

    def do_dd(self, attrs):
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close the open <dd>, if there is one.

        *bl* is passed straight to formatter.end_paragraph().
        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collect href, name and type, then call anchor_bgn()."""
        href = ''
        name = ''
        type = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                type = value.lower()
        self.anchor_bgn(href, name, type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the image attributes and pass them to handle_image()."""
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                # Non-numeric sizes are ignored; the default 0 is kept.
                try: width = int(value)
                except ValueError: pass
            elif attrname == 'height':
                try: height = int(value)
                except ValueError: pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
def test(args = None):
    """Parse an HTML file and render it with the formatter module.

    *args* is a sequence of command-line style arguments; when it is empty
    or None, sys.argv[1:] is used.  An optional leading '-s' selects a
    NullFormatter instead of a DumbWriter-backed AbstractFormatter.  The
    next argument names the input file (default 'test.html'); '-' reads
    from standard input.

    If the file cannot be opened, "<file> : <error>" is printed and the
    process exits with status 1.
    """
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]

    silent = args and args[0] == '-s'
    if silent:
        # Slice instead of "del args[0]" so the caller's sequence is never
        # mutated (and so a tuple is accepted as well).
        args = args[1:]

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            stream = open(path, 'r')
        except IOError:
            # sys.exc_info() avoids the version-specific "except X, e"
            # spelling; the printed text is the same as before.
            msg = sys.exc_info()[1]
            sys.stdout.write("%s : %s\n" % (path, msg))
            sys.exit(1)
        try:
            data = stream.read()
        finally:
            # Close the file even if read() raises.
            stream.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    p = HTMLParser(fmt)
    p.feed(data)
    p.close()
# When executed as a script, run the demo driver on the command-line
# arguments.
if __name__ == '__main__':
    test()