Improved some error messages for command line processing.
[python/dscho.git] / Lib / htmllib.py
blob77be4709ce54a9ea6d78e9390c8c10a2965ecf24
1 """HTML 2.0 parser.
3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
5 """
8 import sys
9 import string
10 from sgmllib import SGMLParser
11 from formatter import AS_IS
14 class HTMLParser(SGMLParser):
16 from htmlentitydefs import entitydefs
18 def __init__(self, formatter, verbose=0):
19 SGMLParser.__init__(self, verbose)
20 self.formatter = formatter
21 self.savedata = None
22 self.isindex = 0
23 self.title = None
24 self.base = None
25 self.anchor = None
26 self.anchorlist = []
27 self.nofill = 0
28 self.list_stack = []
30 # ------ Methods used internally; some may be overridden
32 # --- Formatter interface, taking care of 'savedata' mode;
33 # shouldn't need to be overridden
35 def handle_data(self, data):
36 if self.savedata is not None:
37 self.savedata = self.savedata + data
38 else:
39 if self.nofill:
40 self.formatter.add_literal_data(data)
41 else:
42 self.formatter.add_flowing_data(data)
44 # --- Hooks to save data; shouldn't need to be overridden
46 def save_bgn(self):
47 self.savedata = ''
49 def save_end(self):
50 data = self.savedata
51 self.savedata = None
52 if not self.nofill:
53 data = string.join(string.split(data))
54 return data
56 # --- Hooks for anchors; should probably be overridden
58 def anchor_bgn(self, href, name, type):
59 self.anchor = href
60 if self.anchor:
61 self.anchorlist.append(href)
63 def anchor_end(self):
64 if self.anchor:
65 self.handle_data("[%d]" % len(self.anchorlist))
66 self.anchor = None
68 # --- Hook for images; should probably be overridden
70 def handle_image(self, src, alt, *args):
71 self.handle_data(alt)
73 # --------- Top level elememts
75 def start_html(self, attrs): pass
76 def end_html(self): pass
78 def start_head(self, attrs): pass
79 def end_head(self): pass
81 def start_body(self, attrs): pass
82 def end_body(self): pass
84 # ------ Head elements
86 def start_title(self, attrs):
87 self.save_bgn()
89 def end_title(self):
90 self.title = self.save_end()
92 def do_base(self, attrs):
93 for a, v in attrs:
94 if a == 'href':
95 self.base = v
97 def do_isindex(self, attrs):
98 self.isindex = 1
100 def do_link(self, attrs):
101 pass
103 def do_meta(self, attrs):
104 pass
106 def do_nextid(self, attrs): # Deprecated
107 pass
109 # ------ Body elements
111 # --- Headings
113 def start_h1(self, attrs):
114 self.formatter.end_paragraph(1)
115 self.formatter.push_font(('h1', 0, 1, 0))
117 def end_h1(self):
118 self.formatter.end_paragraph(1)
119 self.formatter.pop_font()
121 def start_h2(self, attrs):
122 self.formatter.end_paragraph(1)
123 self.formatter.push_font(('h2', 0, 1, 0))
125 def end_h2(self):
126 self.formatter.end_paragraph(1)
127 self.formatter.pop_font()
129 def start_h3(self, attrs):
130 self.formatter.end_paragraph(1)
131 self.formatter.push_font(('h3', 0, 1, 0))
133 def end_h3(self):
134 self.formatter.end_paragraph(1)
135 self.formatter.pop_font()
137 def start_h4(self, attrs):
138 self.formatter.end_paragraph(1)
139 self.formatter.push_font(('h4', 0, 1, 0))
141 def end_h4(self):
142 self.formatter.end_paragraph(1)
143 self.formatter.pop_font()
145 def start_h5(self, attrs):
146 self.formatter.end_paragraph(1)
147 self.formatter.push_font(('h5', 0, 1, 0))
149 def end_h5(self):
150 self.formatter.end_paragraph(1)
151 self.formatter.pop_font()
153 def start_h6(self, attrs):
154 self.formatter.end_paragraph(1)
155 self.formatter.push_font(('h6', 0, 1, 0))
157 def end_h6(self):
158 self.formatter.end_paragraph(1)
159 self.formatter.pop_font()
161 # --- Block Structuring Elements
163 def do_p(self, attrs):
164 self.formatter.end_paragraph(1)
166 def start_pre(self, attrs):
167 self.formatter.end_paragraph(1)
168 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
169 self.nofill = self.nofill + 1
171 def end_pre(self):
172 self.formatter.end_paragraph(1)
173 self.formatter.pop_font()
174 self.nofill = max(0, self.nofill - 1)
176 def start_xmp(self, attrs):
177 self.start_pre(attrs)
178 self.setliteral('xmp') # Tell SGML parser
180 def end_xmp(self):
181 self.end_pre()
183 def start_listing(self, attrs):
184 self.start_pre(attrs)
185 self.setliteral('listing') # Tell SGML parser
187 def end_listing(self):
188 self.end_pre()
190 def start_address(self, attrs):
191 self.formatter.end_paragraph(0)
192 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
194 def end_address(self):
195 self.formatter.end_paragraph(0)
196 self.formatter.pop_font()
198 def start_blockquote(self, attrs):
199 self.formatter.end_paragraph(1)
200 self.formatter.push_margin('blockquote')
202 def end_blockquote(self):
203 self.formatter.end_paragraph(1)
204 self.formatter.pop_margin()
206 # --- List Elements
208 def start_ul(self, attrs):
209 self.formatter.end_paragraph(not self.list_stack)
210 self.formatter.push_margin('ul')
211 self.list_stack.append(['ul', '*', 0])
213 def end_ul(self):
214 if self.list_stack: del self.list_stack[-1]
215 self.formatter.end_paragraph(not self.list_stack)
216 self.formatter.pop_margin()
218 def do_li(self, attrs):
219 self.formatter.end_paragraph(0)
220 if self.list_stack:
221 [dummy, label, counter] = top = self.list_stack[-1]
222 top[2] = counter = counter+1
223 else:
224 label, counter = '*', 0
225 self.formatter.add_label_data(label, counter)
227 def start_ol(self, attrs):
228 self.formatter.end_paragraph(not self.list_stack)
229 self.formatter.push_margin('ol')
230 label = '1.'
231 for a, v in attrs:
232 if a == 'type':
233 if len(v) == 1: v = v + '.'
234 label = v
235 self.list_stack.append(['ol', label, 0])
237 def end_ol(self):
238 if self.list_stack: del self.list_stack[-1]
239 self.formatter.end_paragraph(not self.list_stack)
240 self.formatter.pop_margin()
242 def start_menu(self, attrs):
243 self.start_ul(attrs)
245 def end_menu(self):
246 self.end_ul()
248 def start_dir(self, attrs):
249 self.start_ul(attrs)
251 def end_dir(self):
252 self.end_ul()
254 def start_dl(self, attrs):
255 self.formatter.end_paragraph(1)
256 self.list_stack.append(['dl', '', 0])
258 def end_dl(self):
259 self.ddpop(1)
260 if self.list_stack: del self.list_stack[-1]
262 def do_dt(self, attrs):
263 self.ddpop()
265 def do_dd(self, attrs):
266 self.ddpop()
267 self.formatter.push_margin('dd')
268 self.list_stack.append(['dd', '', 0])
270 def ddpop(self, bl=0):
271 self.formatter.end_paragraph(bl)
272 if self.list_stack:
273 if self.list_stack[-1][0] == 'dd':
274 del self.list_stack[-1]
275 self.formatter.pop_margin()
277 # --- Phrase Markup
279 # Idiomatic Elements
281 def start_cite(self, attrs): self.start_i(attrs)
282 def end_cite(self): self.end_i()
284 def start_code(self, attrs): self.start_tt(attrs)
285 def end_code(self): self.end_tt()
287 def start_em(self, attrs): self.start_i(attrs)
288 def end_em(self): self.end_i()
290 def start_kbd(self, attrs): self.start_tt(attrs)
291 def end_kbd(self): self.end_tt()
293 def start_samp(self, attrs): self.start_tt(attrs)
294 def end_samp(self): self.end_tt()
296 def start_strong(self, attrs): self.start_b(attrs)
297 def end_strong(self): self.end_b()
299 def start_var(self, attrs): self.start_i(attrs)
300 def end_var(self): self.end_i()
302 # Typographic Elements
304 def start_i(self, attrs):
305 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
306 def end_i(self):
307 self.formatter.pop_font()
309 def start_b(self, attrs):
310 self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
311 def end_b(self):
312 self.formatter.pop_font()
314 def start_tt(self, attrs):
315 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
316 def end_tt(self):
317 self.formatter.pop_font()
319 def start_a(self, attrs):
320 href = ''
321 name = ''
322 type = ''
323 for attrname, value in attrs:
324 value = string.strip(value)
325 if attrname == 'href':
326 href = value
327 if attrname == 'name':
328 name = value
329 if attrname == 'type':
330 type = string.lower(value)
331 self.anchor_bgn(href, name, type)
333 def end_a(self):
334 self.anchor_end()
336 # --- Line Break
338 def do_br(self, attrs):
339 self.formatter.add_line_break()
341 # --- Horizontal Rule
343 def do_hr(self, attrs):
344 self.formatter.add_hor_rule()
346 # --- Image
348 def do_img(self, attrs):
349 align = ''
350 alt = '(image)'
351 ismap = ''
352 src = ''
353 width = 0
354 height = 0
355 for attrname, value in attrs:
356 if attrname == 'align':
357 align = value
358 if attrname == 'alt':
359 alt = value
360 if attrname == 'ismap':
361 ismap = value
362 if attrname == 'src':
363 src = value
364 if attrname == 'width':
365 try: width = string.atoi(value)
366 except: pass
367 if attrname == 'height':
368 try: height = string.atoi(value)
369 except: pass
370 self.handle_image(src, alt, ismap, align, width, height)
372 # --- Really Old Unofficial Deprecated Stuff
374 def do_plaintext(self, attrs):
375 self.start_pre(attrs)
376 self.setnomoretags() # Tell SGML parser
378 # --- Unhandled tags
380 def unknown_starttag(self, tag, attrs):
381 pass
383 def unknown_endtag(self, tag):
384 pass
387 def test(args = None):
388 import sys, formatter
390 if not args:
391 args = sys.argv[1:]
393 silent = args and args[0] == '-s'
394 if silent:
395 del args[0]
397 if args:
398 file = args[0]
399 else:
400 file = 'test.html'
402 if file == '-':
403 f = sys.stdin
404 else:
405 try:
406 f = open(file, 'r')
407 except IOError, msg:
408 print file, ":", msg
409 sys.exit(1)
411 data = f.read()
413 if f is not sys.stdin:
414 f.close()
416 if silent:
417 f = formatter.NullFormatter()
418 else:
419 f = formatter.AbstractFormatter(formatter.DumbWriter())
421 p = HTMLParser(f)
422 p.feed(data)
423 p.close()
# Run the self-test driver when this module is executed as a script.
if __name__ == "__main__":
    test()