Whitespace normalization.
[python/dscho.git] / Lib / htmllib.py
blob94492a6da2417be91b79a8d04f6d73f6b567c29f
1 """HTML 2.0 parser.
3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
5 """
8 from sgmllib import SGMLParser
9 from formatter import AS_IS
11 __all__ = ["HTMLParser"]
class HTMLParser(SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Parsed content is forwarded to the formatter object given to the
    constructor.  State kept on the instance while parsing:

    title       -- text saved from the <TITLE> element, or None
    base        -- HREF value of the <BASE> element, or None
    isindex     -- set to 1 once an <ISINDEX> tag has been seen
    anchor      -- HREF of the anchor currently open, or None
    anchorlist  -- HREF values collected by anchor_bgn()
    nofill      -- nesting depth of literal (<PRE>-style) regions
    list_stack  -- [kind, label, counter] entries for the open lists
    savedata    -- buffer used by save_bgn() / save_end(), or None

    """

    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.  The verbose parameter is passed on unchanged to
        SGMLParser.__init__().

        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def reset(self):
        """Resets the SGML parser and clears all HTML-specific state."""
        SGMLParser.reset(self)
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Routes character data to the save buffer or to the formatter.

        While a save_bgn() / save_end() pair is active the data is
        appended to the buffer.  Otherwise it is handed to the formatter
        as literal data when nofill is set, and as flowing data when it
        is not.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        data = self.savedata
        if data is None:
            # Raise the documented exception explicitly.  Without this
            # check the call failed with an AttributeError from the
            # string operation below, or silently returned None when
            # nofill was set.
            raise TypeError(
                "save_end() called without a preceding save_bgn()")
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.

        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Starts buffering the document title."""
        self.save_bgn()

    def end_title(self):
        """Stores the buffered title text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Records the HREF attribute of <BASE> in the base attribute."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Notes that the document contains an <ISINDEX> tag."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings
    # Each heading ends the current paragraph and pushes a font whose
    # first field names the heading level; the end tag undoes both.
    # Font specs are (size, italic, bold, teletype) tuples; see the
    # formatter module.

    def start_h1(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h1', 0, 1, 0))

    def end_h1(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h2(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h2', 0, 1, 0))

    def end_h2(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h3(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h3', 0, 1, 0))

    def end_h3(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h4(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h4', 0, 1, 0))

    def end_h4(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h5(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h5', 0, 1, 0))

    def end_h5(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h6(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h6', 0, 1, 0))

    def end_h6(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enters a literal region: teletype font, no filling."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        # nofill is a counter so that literal regions may nest.
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leaves a literal region opened by start_pre()."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        # Clamp at zero so a stray end tag cannot drive the count negative.
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements
    # list_stack holds one [kind, label, counter] entry per open list.
    # end_paragraph() is given a true value only for an outermost list
    # (that is, when list_stack is empty).

    def start_ul(self, attrs):
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emits the label for the next item of the innermost list.

        The item counter of the innermost list is incremented first.
        Outside of any list the label '*' with counter 0 is used.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            label = top[1]
            counter = top[2] + 1
            top[2] = counter
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Opens an ordered list.

        The label format defaults to '1.' and may be replaced by the
        TYPE attribute; a one-character TYPE gets a '.' appended.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        self.ddpop()

    def do_dd(self, attrs):
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """Ends the current paragraph and closes an open <DD>, if any.

        The bl argument is passed to the formatter's end_paragraph()
        method.  When the innermost list_stack entry is a 'dd' entry it
        is removed and its margin is popped.

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collects the HREF, NAME and TYPE attributes and calls
        anchor_bgn().

        Every attribute value is stripped of surrounding whitespace; the
        TYPE value is also lowercased.  Missing attributes are passed as
        empty strings.

        """
        href = ''
        name = ''
        type_ = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                type_ = value.lower()
        self.anchor_bgn(href, name, type_)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collects the <IMG> attributes and calls handle_image().

        Defaults are '(image)' for ALT, the empty string for ALIGN,
        ISMAP and SRC, and 0 for WIDTH and HEIGHT.  A WIDTH or HEIGHT
        value that is not an integer is ignored.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
def test(args=None):
    """Parses an HTML file and renders it through a formatter.

    args is a list of command-line style arguments; when it is empty or
    None, sys.argv[1:] is used instead.  A leading '-s' selects silent
    mode, in which a NullFormatter is used in place of an
    AbstractFormatter wrapped around a DumbWriter.  The next argument
    names the file to read: '-' means standard input, and 'test.html' is
    used when no file is named.

    If the file cannot be opened, a message is printed and sys.exit(1) is
    called.  A list passed in by the caller is never modified.

    """
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]

    silent = args and args[0] == '-s'
    if silent:
        # Slice instead of 'del args[0]' so that the caller's list is
        # left untouched.
        args = args[1:]

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            source = open(path, 'r')
        except IOError:
            # Fetch the exception via sys.exc_info() so that this handler
            # parses under every Python version.
            msg = sys.exc_info()[1]
            print("%s : %s" % (path, msg))
            sys.exit(1)
        # Close the file even if read() raises.
        try:
            data = source.read()
        finally:
            source.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
# Run the command-line test driver when this file is executed as a script.
if __name__ == '__main__':
    test()