1 # A parser for HTML documents
4 # HTML: HyperText Markup Language; an SGML-like syntax used by WWW to
5 # describe hypertext documents
7 # SGML: Standard Generalized Markup Language
9 # WWW: World-Wide Web; a distributed hypertext system developed at CERN
11 # CERN: European Particle Physics Laboratory in Geneva, Switzerland
14 # This file is only concerned with parsing and formatting HTML
15 # documents, not with the other (hypertext and networking) aspects of
16 # the WWW project. (It does support highlighting of anchors.)
26 class HTMLParser(sgmllib
.SGMLParser
):
28 # Copy base class entities and add some
30 for key
in sgmllib
.SGMLParser
.entitydefs
.keys():
31 entitydefs
[key
] = sgmllib
.SGMLParser
.entitydefs
[key
]
32 entitydefs
['bullet'] = '*'
34 # Provided -- handlers for tags introducing literal text
def start_listing(self, attrs):
    """Handle <listing>: enter literal mode and notify literal_bgn."""
    tag = 'listing'
    self.setliteral(tag)
    self.literal_bgn(tag, attrs)
def end_listing(self):
    """Handle </listing>: notify literal_end that the literal text is over."""
    tag = 'listing'
    self.literal_end(tag)
def start_xmp(self, attrs):
    """Handle <xmp>: enter literal mode and notify literal_bgn."""
    tag = 'xmp'
    self.setliteral(tag)
    self.literal_bgn(tag, attrs)
48 self
.literal_end('xmp')
50 def do_plaintext(self
, attrs
):
52 self
.literal_bgn('plaintext', attrs
)
54 # To be overridden -- begin/end literal mode
def literal_bgn(self, tag, attrs):
    """Hook called when literal text for *tag* begins.

    Does nothing at this level; it exists to be overridden.
    """
    return None
def literal_end(self, tag):
    """Hook called when literal text for *tag* ends.

    Does nothing at this level; it exists to be overridden.
    """
    return None
59 # Next level of sophistication -- collect anchors, title, nextid and isindex
60 class CollectingParser(HTMLParser
):
63 HTMLParser
.__init
__(self
)
73 def start_a(self
, attrs
):
78 for attrname
, value
in attrs
:
79 if attrname
== 'href':
81 if attrname
== 'name=':
83 if attrname
== 'type=':
84 type = string
.lower(value
)
85 if not (href
or name
):
87 self
.anchors
.append(href
)
88 self
.anchornames
.append(name
)
89 self
.anchortypes
.append(type)
90 self
.inanchor
= len(self
.anchors
)
92 self
.inanchor
= -self
.inanchor
96 # Don't show anchors pointing into the current document
97 if self
.anchors
[self
.inanchor
-1][:1] <> '#':
98 self
.handle_data('[' + `self
.inanchor`
+ ']')
def start_header(self, attrs):
    """Accept <header> without taking any action."""
    return None
def end_header(self):
    """Accept </header> without taking any action."""
    return None
104 # (head is the same as header)
def start_head(self, attrs):
    """Accept <head> without taking any action."""
    return None
def end_head(self):
    """Accept </head> without taking any action."""
    return None
def start_body(self, attrs):
    """Accept <body> without taking any action."""
    return None
def end_body(self):
    """Accept </body> without taking any action."""
    return None
111 def do_nextid(self
, attrs
):
114 def do_isindex(self
, attrs
):
117 def start_title(self
, attrs
):
121 if self
.savetext
<> None:
122 self
.title
= self
.savetext
def handle_data(self, text):
    """Append *text* to self.savetext while text is being saved.

    When self.savetext is None nothing is being collected and the
    text is dropped.
    """
    collected = self.savetext
    if collected is None:
        return
    self.savetext = collected + text
130 # Formatting parser -- takes a formatter and a style sheet as arguments
132 # XXX The use of style sheets should change: for each tag and end tag
133 # there should be a style definition, and a style definition should
134 # encompass many more parameters: font, justification, indentation,
135 # vspace before, vspace after, hanging tag...
# Matches a (possibly empty) run of characters other than space, tab
# and newline, i.e. one word.
wordprog = regex.compile('[^ \t\n]*')
# Matches a (possibly empty) run of space, tab and newline characters.
spaceprog = regex.compile('[ \t\n]*')
140 class FormattingParser(CollectingParser
):
142 def __init__(self
, formatter
, stylesheet
):
143 CollectingParser
.__init
__(self
)
145 self
.stl
= stylesheet
150 self
.setindent(self
.stl
.stdindent
)
155 self
.fontset
= self
.stl
.stdfontset
160 font
= self
.fontset
[self
.style
]
161 self
.fmt
.setfont(font
)
163 def pushstyle(self
, style
):
164 self
.stylestack
.append(self
.style
)
165 self
.style
= min(style
, len(self
.fontset
)-1)
169 self
.style
= self
.stylestack
[-1]
170 del self
.stylestack
[-1]
def pushfontset(self, fontset, style):
    """Make *fontset* the current font set and push *style*.

    The font set in effect before the call is saved on self.fontstack.
    """
    previous = self.fontset
    self.fontstack.append(previous)
    self.fontset = fontset
    self.pushstyle(style)
178 def popfontset(self
):
179 self
.fontset
= self
.fontstack
[-1]
180 del self
.fontstack
[-1]
def setindent(self, n):
    """Forward the left indentation *n* to the formatter."""
    formatter = self.fmt
    formatter.setleftindent(n)
def needvspace(self, n):
    """Forward a request for *n* units of vertical space to the formatter."""
    formatter = self.fmt
    formatter.needvspace(n)
193 HTMLParser
.close(self
)
196 def handle_literal(self
, text
):
197 lines
= string
.splitfields(text
, '\n')
198 for i
in range(1, len(lines
)):
199 lines
[i
] = string
.expandtabs(lines
[i
], 8)
200 for line
in lines
[:-1]:
201 self
.fmt
.addword(line
, 0)
204 for line
in lines
[-1:]:
205 self
.fmt
.addword(line
, 0)
207 def handle_data(self
, text
):
208 if self
.savetext
is not None:
209 self
.savetext
= self
.savetext
+ text
212 self
.handle_literal(text
)
217 j
= i
+ wordprog
.match(text
, i
)
219 i
= j
+ spaceprog
.match(text
, j
)
220 self
.fmt
.addword(word
, i
-j
)
221 if self
.nofill
and '\n' in text
[j
:i
]:
225 while text
[i
-1] <> '\n': i
= i
+1
227 def literal_bgn(self
, tag
, attrs
):
228 if tag
== 'plaintext':
232 self
.pushfontset(self
.stl
.stdfontset
, FIXED
)
233 self
.setindent(self
.stl
.literalindent
)
235 def literal_end(self
, tag
):
238 self
.setindent(self
.stl
.stdindent
)
240 def start_title(self
, attrs
):
243 # NB end_title is unchanged
245 def do_p(self
, attrs
):
251 def do_hr(self
, attrs
):
254 def start_h1(self
, attrs
):
256 self
.setindent(self
.stl
.h1indent
)
257 self
.pushfontset(self
.stl
.h1fontset
, BOLD
)
258 self
.fmt
.setjust('c')
263 self
.setindent(self
.stl
.stdindent
)
264 self
.fmt
.setjust('l')
266 def start_h2(self
, attrs
):
268 self
.setindent(self
.stl
.h2indent
)
269 self
.pushfontset(self
.stl
.h2fontset
, BOLD
)
274 self
.setindent(self
.stl
.stdindent
)
276 def start_h3(self
, attrs
):
278 self
.setindent(self
.stl
.stdindent
)
279 self
.pushfontset(self
.stl
.h3fontset
, BOLD
)
284 self
.setindent(self
.stl
.stdindent
)
286 def start_h4(self
, attrs
):
288 self
.setindent(self
.stl
.stdindent
)
289 self
.pushfontset(self
.stl
.stdfontset
, BOLD
)
294 self
.setindent(self
.stl
.stdindent
)
305 def start_ul(self
, attrs
):
307 for attrname
, value
in attrs
:
308 if attrname
== 'compact':
313 self
.setindent(self
.stl
.ulindent
)
315 start_dir
= start_menu
= start_ol
= start_ul
322 self
.setindent(self
.stl
.stdindent
)
324 end_dir
= end_menu
= end_ol
= end_ul
326 def start_dl(self
, attrs
):
327 for attrname
, value
in attrs
:
328 if attrname
== 'compact':
335 self
.setindent(self
.stl
.stdindent
)
337 def do_dt(self
, attrs
):
342 self
.setindent(self
.stl
.stdindent
)
def do_dd(self, attrs):
    """Handle <dd>: add an empty word, then switch to the dd indent."""
    formatter = self.fmt
    formatter.addword('', 1)
    # Definition text is indented by the style sheet's ddindent.
    self.setindent(self.stl.ddindent)
348 def start_address(self
, attrs
):
351 self
.fmt
.setjust('r')
353 def end_address(self
):
356 self
.setindent(self
.stl
.stdindent
)
357 self
.fmt
.setjust('l')
359 def start_pre(self
, attrs
):
361 self
.nofill
= self
.nofill
+ 1
362 self
.pushstyle(FIXED
)
366 self
.nofill
= self
.nofill
- 1
369 start_typewriter
= start_pre
370 end_typewriter
= end_pre
def do_img(self, attrs):
    """Handle <img>: emit the placeholder word '(image)'; attrs is unused."""
    placeholder = '(image)'
    self.fmt.addword(placeholder, 0)
def start_tt(self, attrs):
    """Handle <tt>: push the FIXED style."""
    self.pushstyle(FIXED)
def end_tt(self):
    """Handle </tt>: pop the style pushed by the start tag."""
    self.popstyle()
def start_b(self, attrs):
    """Handle <b>: push the BOLD style."""
    self.pushstyle(BOLD)
def end_b(self):
    """Handle </b>: pop the style pushed by the start tag."""
    self.popstyle()
def start_i(self, attrs):
    """Handle <i>: push the ITALIC style."""
    self.pushstyle(ITALIC)
def end_i(self):
    """Handle </i>: pop the style pushed by the start tag."""
    self.popstyle()
def start_u(self, attrs):
    """Handle <u>: push the ITALIC style."""
    # Underline???  -- ITALIC is used as a stand-in for underlining.
    self.pushstyle(ITALIC)
def end_u(self):
    """Handle </u>: pop the style pushed by the start tag."""
    self.popstyle()
def start_r(self, attrs):
    """Handle <r>: push the ROMAN style."""
    # Not official -- <r> is not a standard tag.
    self.pushstyle(ROMAN)
def end_r(self):
    """Handle </r>: pop the style pushed by the start tag."""
    self.popstyle()
397 start_strong
= start_b
400 start_code
= start_tt
403 start_samp
= start_tt
409 start_file
= start_tt
# unofficial
def unknown_starttag(self, tag, attrs):
    """Report a start tag for which no handler exists.

    Writes the line '*** unknown <tag>' to standard output.  The
    attribute list *attrs* is ignored.
    """
    # Single-argument parenthesised form: produces the same output under
    # the print statement and is also valid where print is a function.
    print('*** unknown <' + tag + '>')
def unknown_endtag(self, tag):
    """Report an end tag for which no handler exists.

    Writes the line '*** unknown </tag>' to standard output.
    """
    # Single-argument parenthesised form: produces the same output under
    # the print statement and is also valid where print is a function.
    print('*** unknown </' + tag + '>')
434 # An extension of the formatting parser which formats anchors differently.
435 class AnchoringParser(FormattingParser
):
437 def start_a(self
, attrs
):
438 FormattingParser
.start_a(self
, attrs
)
440 self
.fmt
.bgn_anchor(self
.inanchor
)
444 self
.fmt
.end_anchor(self
.inanchor
)
448 # Style sheet -- this is never instantiated, but the attributes
449 # of the class object itself are used to specify fonts to be used
450 # for various paragraph styles.
451 # A font set is a non-empty list of fonts, in the order:
452 # [roman, italic, bold, fixed].
453 # When a style is not available the nearest lower style is used
460 class NullStylesheet
:
475 class X11Stylesheet(NullStylesheet
):
477 '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*', \
478 '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*', \
479 '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*', \
480 '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*', \
483 '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*', \
484 '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*', \
485 '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*', \
488 '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*', \
489 '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*', \
490 '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*', \
493 '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*', \
494 '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*', \
495 '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*', \
500 class MacStylesheet(NullStylesheet
):
502 ('Geneva', 'p', 10), \
503 ('Geneva', 'i', 10), \
504 ('Geneva', 'b', 10), \
505 ('Monaco', 'p', 10), \
508 ('Geneva', 'p', 18), \
509 ('Geneva', 'i', 18), \
510 ('Geneva', 'b', 18), \
511 ('Monaco', 'p', 18), \
514 ('Geneva', 'p', 14), \
515 ('Geneva', 'i', 14), \
516 ('Geneva', 'b', 14), \
517 ('Monaco', 'p', 14), \
520 ('Geneva', 'p', 12), \
521 ('Geneva', 'i', 12), \
522 ('Geneva', 'b', 12), \
523 ('Monaco', 'p', 12), \
528 StdwinStylesheet
= MacStylesheet
530 StdwinStylesheet
= X11Stylesheet
533 class GLStylesheet(NullStylesheet
):
536 'Helvetica-Italic 10', \
537 'Helvetica-Bold 10', \
542 'Helvetica-Italic 18', \
543 'Helvetica-Bold 18', \
548 'Helvetica-Italic 14', \
549 'Helvetica-Bold 14', \
554 'Helvetica-Italic 12', \
555 'Helvetica-Bold 12', \
560 # Test program -- produces no output but times how long it takes
561 # to send a document to a null formatter, exclusive of I/O
567 if sys
.argv
[1:]: file = sys
.argv
[1]
568 else: file = 'test.html'
569 data
= urllib
.urlopen(file).read()
571 fmtr
= fmt
.WritingFormatter(sys
.stdout
, 79)
572 p
= FormattingParser(fmtr
, NullStylesheet
)
577 print '*** Formatting time:', round(t1
-t0
, 3), 'seconds.'
580 # Test program using stdwin
584 from stdwinevents
import *
585 if sys
.argv
[1:]: file = sys
.argv
[1]
586 else: file = 'test.html'
587 data
= open(file, 'r').read()
588 window
= stdwin
.open('testStdwin')
591 etype
, ewin
, edetail
= stdwin
.getevent()
592 if etype
== WE_CLOSE
:
595 window
.setdocsize(0, 0)
596 window
.setorigin(0, 0)
597 window
.change((0, 0), (10000, 30000)) # XXX
600 b
= fmt
.StdwinBackEnd(window
, 1)
601 f
= fmt
.BaseFormatter(b
.d
, b
)
602 p
= FormattingParser(f
, \
612 # Test program using GL
616 if sys
.argv
[1:]: file = sys
.argv
[1]
617 else: file = 'test.html'
618 data
= open(file, 'r').read()
622 wid
= gl
.winopen('testGL')
623 gl
.ortho2(0, W
, H
, 0)
627 b
= fmt
.GLBackEnd(wid
)
628 f
= fmt
.BaseFormatter(b
.d
, b
)
629 p
= FormattingParser(f
, GLStylesheet
)
638 if __name__
== '__main__':