Class around PixMap objects that allows more python-like access. By Joe Strout.
[python/dscho.git] / Demo / tkinter / www / htmllib.py
blobf45657f645df12575afabc0531000adf27f5018b
1 # A parser for HTML documents
4 # HTML: HyperText Markup Language; an SGML-like syntax used by WWW to
5 # describe hypertext documents
7 # SGML: Standard Generalized Markup Language
9 # WWW: World-Wide Web; a distributed hypertext system developed at CERN
11 # CERN: European Particle Physics Laboratory in Geneva, Switzerland
14 # This file is only concerned with parsing and formatting HTML
15 # documents, not with the other (hypertext and networking) aspects of
16 # the WWW project. (It does support highlighting of anchors.)
19 import os
20 import sys
21 import regex
22 import string
23 import sgmllib
class HTMLParser(sgmllib.SGMLParser):
    """SGML parser with handlers for HTML's literal-text tags.

    <LISTING> and <XMP> switch the parser into literal mode for the
    duration of the element; <PLAINTEXT> turns off tag recognition for
    the rest of the input.  Subclasses hook into these events by
    overriding literal_bgn() and literal_end().
    """

    # Private copy of the base class entity table, plus one extra entity
    entitydefs = sgmllib.SGMLParser.entitydefs.copy()
    entitydefs['bullet'] = '*'

    # Provided -- handlers for tags introducing literal text

    def _enter_literal(self, tag, attrs):
        # Shared by the tags that have a matching end tag
        self.setliteral(tag)
        self.literal_bgn(tag, attrs)

    def start_listing(self, attrs):
        self._enter_literal('listing', attrs)

    def end_listing(self):
        self.literal_end('listing')

    def start_xmp(self, attrs):
        self._enter_literal('xmp', attrs)

    def end_xmp(self):
        self.literal_end('xmp')

    def do_plaintext(self, attrs):
        # No end tag: everything that follows is treated as text
        self.setnomoretags()
        self.literal_bgn('plaintext', attrs)

    # To be overridden -- begin/end literal mode

    def literal_bgn(self, tag, attrs):
        pass

    def literal_end(self, tag):
        pass
59 # Next level of sophistication -- collect anchors, title, nextid and isindex
class CollectingParser(HTMLParser):
    """HTML parser that collects anchors, title, NEXTID and ISINDEX.

    Attributes filled in while parsing:

    nextid       -- attribute list of the last <NEXTID> tag ('' if none)
    isindex      -- 1 once an <ISINDEX> tag has been seen, else 0
    title        -- text found between <TITLE> and </TITLE>
    anchors      -- href of each recorded anchor ('' if it has none)
    anchornames  -- name attribute of each recorded anchor
    anchortypes  -- lowercased type attribute of each recorded anchor
    inanchor     -- 0 outside an anchor; inside one, its 1-based index
                    into `anchors`, negated when the anchor has no href
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # While not None, handle_data() appends text here (used for title)
        self.savetext = None
        self.nextid = ''
        self.isindex = 0
        self.title = ''
        self.inanchor = 0
        self.anchors = []
        self.anchornames = []
        self.anchortypes = []

    def start_a(self, attrs):
        """Record an anchor that has an href and/or a name attribute."""
        self.inanchor = 0
        href = ''
        name = ''
        anchortype = ''
        for attrname, value in attrs:
            if attrname == 'href':
                href = value
            # The original compared against 'name=' and 'type=', which a
            # bare attribute name never equals; the old spellings are
            # still accepted in case some caller supplies them.
            if attrname in ('name', 'name='):
                name = value
            if attrname in ('type', 'type='):
                anchortype = value.lower()
        if not (href or name):
            return
        self.anchors.append(href)
        self.anchornames.append(name)
        self.anchortypes.append(anchortype)
        self.inanchor = len(self.anchors)
        if not href:
            # A negative index marks a name-only anchor
            self.inanchor = -self.inanchor

    def end_a(self):
        """Emit a '[n]' reference mark after an anchor with an href."""
        if self.inanchor > 0:
            # Don't show anchors pointing into the current document
            if self.anchors[self.inanchor - 1][:1] != '#':
                self.handle_data('[' + repr(self.inanchor) + ']')
        self.inanchor = 0

    def start_header(self, attrs): pass
    def end_header(self): pass

    # (head is the same as header)
    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    def do_nextid(self, attrs):
        self.nextid = attrs

    def do_isindex(self, attrs):
        self.isindex = 1

    def start_title(self, attrs):
        # Start collecting text for the title
        self.savetext = ''

    def end_title(self):
        if self.savetext is not None:
            self.title = self.savetext
            self.savetext = None

    def handle_data(self, text):
        # Text is kept only while something (the title) is being collected
        if self.savetext is not None:
            self.savetext = self.savetext + text
130 # Formatting parser -- takes a formatter and a style sheet as arguments
132 # XXX The use of style sheets should change: for each tag and end tag
133 # there should be a style definition, and a style definition should
134 # encompass many more parameters: font, justification, indentation,
135 # vspace before, vspace after, hanging tag...
# Characters treated as word separators by the two patterns below
_WHITESPACE = ' \t\n'

# A (possibly empty) run of non-whitespace, and a (possibly empty) run
# of whitespace
wordprog = regex.compile('[^' + _WHITESPACE + ']*')
spaceprog = regex.compile('[' + _WHITESPACE + ']*')
class FormattingParser(CollectingParser):
    """Parser that drives a formatter according to a style sheet.

    formatter  -- object receiving the output; this class calls its
                  setfont(), flush(), setleftindent(), needvspace(),
                  addword(), hrule() and setjust() methods and assigns
                  its `nospace` attribute
    stylesheet -- object (normally a class) providing the font sets
                  stdfontset, h1fontset, h2fontset, h3fontset and the
                  indents stdindent, ddindent, ulindent, h1indent,
                  h2indent, literalindent

    A font set is indexed by the style constants ROMAN, ITALIC, BOLD
    and FIXED; a style beyond the end of the current font set is
    clamped to the set's last entry.
    """

    def __init__(self, formatter, stylesheet):
        CollectingParser.__init__(self)
        self.fmt = formatter
        self.stl = stylesheet
        self.savetext = None
        self.compact = 0        # inside a COMPACT list or an address
        self.nofill = 0         # nesting depth of <PRE>-like elements
        self.resetfont()
        self.setindent(self.stl.stdindent)

    # Font and style bookkeeping

    def resetfont(self):
        """Forget all saved fonts/styles and select the standard roman font."""
        self.fontstack = []
        self.stylestack = []
        self.fontset = self.stl.stdfontset
        self.style = ROMAN
        self.passfont()

    def passfont(self):
        """Tell the formatter which font the current set/style selects."""
        font = self.fontset[self.style]
        self.fmt.setfont(font)

    def pushstyle(self, style):
        """Save the current style and switch to `style` (clamped)."""
        self.stylestack.append(self.style)
        self.style = min(style, len(self.fontset) - 1)
        self.passfont()

    def popstyle(self):
        """Restore the style saved by the matching pushstyle()."""
        if not self.stylestack:
            # Stray end tag without a start tag: nothing to restore
            return
        self.style = self.stylestack.pop()
        self.passfont()

    def pushfontset(self, fontset, style):
        """Save the current font set and style, then switch to new ones."""
        self.fontstack.append(self.fontset)
        self.fontset = fontset
        self.pushstyle(style)

    def popfontset(self):
        """Restore the font set and style saved by pushfontset()."""
        if not self.fontstack:
            # Stray end tag without a start tag: nothing to restore
            return
        self.fontset = self.fontstack.pop()
        self.popstyle()

    # Thin wrappers around the formatter

    def flush(self):
        self.fmt.flush()

    def setindent(self, n):
        self.fmt.setleftindent(n)

    def needvspace(self, n):
        self.fmt.needvspace(n)

    def close(self):
        HTMLParser.close(self)
        self.fmt.flush()

    # Text handling

    def handle_literal(self, text):
        """Pass literal text to the formatter, one line at a time."""
        lines = text.split('\n')
        # Tabs are expanded on every line except the first
        for i in range(1, len(lines)):
            lines[i] = lines[i].expandtabs(8)
        # Every line that was followed by a newline ends the output line
        for line in lines[:-1]:
            self.fmt.addword(line, 0)
            self.fmt.flush()
            self.fmt.nospace = 0
        # The remainder stays on the current output line
        for line in lines[-1:]:
            self.fmt.addword(line, 0)

    def handle_data(self, text):
        """Collect title text, or split text into words for the formatter."""
        if self.savetext is not None:
            self.savetext = self.savetext + text
            return
        if self.literal:
            self.handle_literal(text)
            return
        i = 0
        n = len(text)
        while i < n:
            # The match() results are used as match lengths here
            j = i + wordprog.match(text, i)
            word = text[i:j]
            i = j + spaceprog.match(text, j)
            self.fmt.addword(word, i - j)
            if self.nofill and '\n' in text[j:i]:
                # Unfilled text: break the line at the first newline and
                # resume scanning just after it, so the whitespace that
                # follows is not skipped
                self.fmt.flush()
                self.fmt.nospace = 0
                i = j + 1
                while text[i - 1] != '\n':
                    i = i + 1

    def literal_bgn(self, tag, attrs):
        if tag == 'plaintext':
            self.flush()
        else:
            self.needvspace(1)
        self.pushfontset(self.stl.stdfontset, FIXED)
        self.setindent(self.stl.literalindent)

    def literal_end(self, tag):
        self.needvspace(1)
        self.popfontset()
        self.setindent(self.stl.stdindent)

    def start_title(self, attrs):
        self.flush()
        self.savetext = ''
    # NB end_title is unchanged

    # Paragraphs and rules

    def do_p(self, attrs):
        if self.compact:
            self.flush()
        else:
            self.needvspace(1)

    def do_hr(self, attrs):
        self.fmt.hrule()

    # Headings

    def start_h1(self, attrs):
        self.needvspace(2)
        self.setindent(self.stl.h1indent)
        self.pushfontset(self.stl.h1fontset, BOLD)
        self.fmt.setjust('c')

    def end_h1(self):
        self.popfontset()
        self.needvspace(2)
        self.setindent(self.stl.stdindent)
        self.fmt.setjust('l')

    def start_h2(self, attrs):
        self.needvspace(1)
        self.setindent(self.stl.h2indent)
        self.pushfontset(self.stl.h2fontset, BOLD)

    def end_h2(self):
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def start_h3(self, attrs):
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.pushfontset(self.stl.h3fontset, BOLD)

    def end_h3(self):
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def start_h4(self, attrs):
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.pushfontset(self.stl.stdfontset, BOLD)

    def end_h4(self):
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    start_h5 = start_h4
    end_h5 = end_h4

    start_h6 = start_h5
    end_h6 = end_h5

    start_h7 = start_h6
    end_h7 = end_h6

    # Lists

    def start_ul(self, attrs):
        self.needvspace(1)
        for attrname, value in attrs:
            if attrname == 'compact':
                self.compact = 1
                self.setindent(0)
                break
        else:
            # No COMPACT attribute: use the normal list indent
            self.setindent(self.stl.ulindent)

    start_dir = start_menu = start_ol = start_ul

    do_li = do_p

    def end_ul(self):
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    end_dir = end_menu = end_ol = end_ul

    def start_dl(self, attrs):
        for attrname, value in attrs:
            if attrname == 'compact':
                self.compact = 1
        self.needvspace(1)

    def end_dl(self):
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def do_dt(self, attrs):
        if self.compact:
            self.flush()
        else:
            self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def do_dd(self, attrs):
        self.fmt.addword('', 1)
        self.setindent(self.stl.ddindent)

    # Other block elements

    def start_address(self, attrs):
        self.compact = 1
        self.needvspace(1)
        self.fmt.setjust('r')

    def end_address(self):
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.fmt.setjust('l')

    def start_pre(self, attrs):
        self.needvspace(1)
        self.nofill = self.nofill + 1
        self.pushstyle(FIXED)

    def end_pre(self):
        self.popstyle()
        self.nofill = self.nofill - 1
        self.needvspace(1)

    start_typewriter = start_pre
    end_typewriter = end_pre

    def do_img(self, attrs):
        # Images are not rendered; a placeholder word is shown instead
        self.fmt.addword('(image)', 0)

    # Physical styles

    def start_tt(self, attrs): self.pushstyle(FIXED)
    def end_tt(self): self.popstyle()

    def start_b(self, attrs): self.pushstyle(BOLD)
    def end_b(self): self.popstyle()

    def start_i(self, attrs): self.pushstyle(ITALIC)
    def end_i(self): self.popstyle()

    def start_u(self, attrs): self.pushstyle(ITALIC) # Underline???
    def end_u(self): self.popstyle()

    def start_r(self, attrs): self.pushstyle(ROMAN) # Not official
    def end_r(self): self.popstyle()

    # Logical styles

    start_em = start_i
    end_em = end_i

    start_strong = start_b
    end_strong = end_b

    start_code = start_tt
    end_code = end_tt

    start_samp = start_tt
    end_samp = end_tt

    start_kbd = start_tt
    end_kbd = end_tt

    start_file = start_tt # unofficial
    end_file = end_tt

    start_var = start_i
    end_var = end_i

    start_dfn = start_i
    end_dfn = end_i

    start_cite = start_i
    end_cite = end_i

    start_hp1 = start_i
    # Was `start_i`, which pushed a second style instead of popping
    end_hp1 = end_i

    start_hp2 = start_b
    end_hp2 = end_b

    # Unknown tags are reported on standard output

    def unknown_starttag(self, tag, attrs):
        print('*** unknown <' + tag + '>')

    def unknown_endtag(self, tag):
        print('*** unknown </' + tag + '>')
434 # An extension of the formatting parser which formats anchors differently.
class AnchoringParser(FormattingParser):
    """Formatting parser that reports anchor boundaries to the formatter.

    The formatter must provide bgn_anchor() and end_anchor(); both
    receive the value of `inanchor` set by the inherited start_a().
    """

    def start_a(self, attrs):
        FormattingParser.start_a(self, attrs)
        anchor = self.inanchor
        if anchor:
            self.fmt.bgn_anchor(anchor)

    def end_a(self):
        # Replaces the inherited end_a(), which is not called here
        anchor = self.inanchor
        if anchor:
            self.fmt.end_anchor(anchor)
            self.inanchor = 0
448 # Style sheet -- this is never instantiated, but the attributes
449 # of the class object itself are used to specify fonts to be used
450 # for various paragraph styles.
451 # A font set is a non-empty list of fonts, in the order:
452 # [roman, italic, bold, fixed].
453 # When a style is not available the nearest lower style is used
# Style constants; each is the index of that style's font in a font set
ROMAN, ITALIC, BOLD, FIXED = 0, 1, 2, 3
class NullStylesheet:
    """Style sheet without real fonts; base class for the others.

    Only the class attributes are used.
    """

    # Indents
    stdindent = 2
    ulindent = 4
    ddindent = 25
    h1indent = 0
    h2indent = 0
    literalindent = 0

    # Fonts -- none; each set holds a single None entry
    stdfontset = [None]
    h1fontset = [None]
    h2fontset = [None]
    h3fontset = [None]
class X11Stylesheet(NullStylesheet):
    """Style sheet using X11 font names (Helvetica and Courier).

    The heading font sets have no fixed-width entry.
    """

    stdfontset = [
        '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*',
        ]
    h1fontset = [
        '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*',
        ]
    h2fontset = [
        '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*',
        ]
    h3fontset = [
        '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*',
        ]
    ddindent = 40
class MacStylesheet(NullStylesheet):
    """Style sheet for the Macintosh (Geneva and Monaco).

    Each font is a (name, style, size) tuple.
    """

    stdfontset = [
        ('Geneva', 'p', 10),
        ('Geneva', 'i', 10),
        ('Geneva', 'b', 10),
        ('Monaco', 'p', 10),
        ]
    h1fontset = [
        ('Geneva', 'p', 18),
        ('Geneva', 'i', 18),
        ('Geneva', 'b', 18),
        ('Monaco', 'p', 18),
        ]
    # This 14-point set was originally also assigned to h3fontset and
    # then overwritten by the 12-point set below, leaving h2fontset at
    # the inherited [None].
    h2fontset = [
        ('Geneva', 'p', 14),
        ('Geneva', 'i', 14),
        ('Geneva', 'b', 14),
        ('Monaco', 'p', 14),
        ]
    h3fontset = [
        ('Geneva', 'p', 12),
        ('Geneva', 'i', 12),
        ('Geneva', 'b', 12),
        ('Monaco', 'p', 12),
        ]
# Style sheet for stdwin: Macintosh fonts on the Mac, X11 fonts elsewhere
StdwinStylesheet = X11Stylesheet
if os.name == 'mac':
    StdwinStylesheet = MacStylesheet
class GLStylesheet(NullStylesheet):
    """Style sheet for GL; each font is a 'name size' string."""

    stdfontset = [
        'Helvetica 10',
        'Helvetica-Italic 10',
        'Helvetica-Bold 10',
        'Courier 10',
        ]
    h1fontset = [
        'Helvetica 18',
        'Helvetica-Italic 18',
        'Helvetica-Bold 18',
        'Courier 18',
        ]
    h2fontset = [
        'Helvetica 14',
        'Helvetica-Italic 14',
        'Helvetica-Bold 14',
        'Courier 14',
        ]
    h3fontset = [
        'Helvetica 12',
        'Helvetica-Italic 12',
        'Helvetica-Bold 12',
        'Courier 12',
        ]
560 # Test program -- produces no output but times how long it takes
561 # to send a document to a null formatter, exclusive of I/O
def test():
    """Format one document with NullStylesheet and report the time taken.

    The document is sys.argv[1] if given, else 'test.html'.  It is read
    in full before the clock starts, so the reported time covers only
    feeding the parser and closing it.
    """
    import fmt
    import time
    import urllib
    if sys.argv[1:]:
        source = sys.argv[1]
    else:
        source = 'test.html'
    # Close the handle even if reading fails
    fp = urllib.urlopen(source)
    try:
        data = fp.read()
    finally:
        fp.close()
    t0 = time.time()
    fmtr = fmt.WritingFormatter(sys.stdout, 79)
    p = FormattingParser(fmtr, NullStylesheet)
    p.feed(data)
    p.close()
    t1 = time.time()
    print('')
    print('*** Formatting time: %s seconds.' % round(t1 - t0, 3))
580 # Test program using stdwin
def testStdwin():
    """Display one document in a stdwin window until the window is closed.

    The document is sys.argv[1] if given, else 'test.html'.  It is
    formatted on the first draw event; later draw events only redraw.
    """
    import stdwin
    import stdwinevents
    import fmt
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    # Close the file even if reading fails
    fp = open(filename, 'r')
    try:
        data = fp.read()
    finally:
        fp.close()
    window = stdwin.open('testStdwin')
    b = None
    while 1:
        etype, ewin, edetail = stdwin.getevent()
        if etype == stdwinevents.WE_CLOSE:
            break
        if etype == stdwinevents.WE_SIZE:
            window.setdocsize(0, 0)
            window.setorigin(0, 0)
            window.change((0, 0), (10000, 30000)) # XXX
        if etype == stdwinevents.WE_DRAW:
            if not b:
                b = fmt.StdwinBackEnd(window, 1)
                f = fmt.BaseFormatter(b.d, b)
                # NOTE(review): MacStylesheet is hard-coded here although
                # StdwinStylesheet selects a sheet per platform -- confirm
                # which one is intended.
                p = FormattingParser(f, MacStylesheet)
                p.feed(data)
                p.close()
                b.finish()
            else:
                b.redraw(edetail)
    window.close()
612 # Test program using GL
def testGL():
    """Display one document in a 600x600 GL window for five seconds.

    The document is sys.argv[1] if given, else 'test.html'.
    """
    import gl
    import GL
    import fmt
    import time
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    # Close the file even if reading fails
    fp = open(filename, 'r')
    try:
        data = fp.read()
    finally:
        fp.close()
    W, H = 600, 600
    gl.foreground()
    gl.prefsize(W, H)
    wid = gl.winopen('testGL')
    gl.ortho2(0, W, H, 0)
    gl.color(GL.WHITE)
    gl.clear()
    gl.color(GL.BLACK)
    b = fmt.GLBackEnd(wid)
    f = fmt.BaseFormatter(b.d, b)
    p = FormattingParser(f, GLStylesheet)
    p.feed(data)
    p.close()
    b.finish()
    # Keep the window on screen for a while before returning
    time.sleep(5)
# Run the timing test when this file is executed as a script
if '__main__' == __name__:
    test()