1 # A parser for HTML documents
4 # HTML: HyperText Markup Language; an SGML-like syntax used by WWW to
5 # describe hypertext documents
7 # SGML: Standard Generalized Markup Language
9 # WWW: World-Wide Web; a distributed hypertext system developed at CERN
11 # CERN: European Particle Physics Laboratory in Geneva, Switzerland
14 # This file is only concerned with parsing and formatting HTML
15 # documents, not with the other (hypertext and networking) aspects of
16 # the WWW project. (It does support highlighting of anchors.)
26 class HTMLParser(sgmllib
.SGMLParser
):
28 # Copy base class entities and add some
30 for key
in sgmllib
.SGMLParser
.entitydefs
.keys():
31 entitydefs
[key
] = sgmllib
.SGMLParser
.entitydefs
[key
]
32 entitydefs
['bullet'] = '*'
34 # Provided -- handlers for tags introducing literal text
def start_listing(self, attrs):
    """Handle the start of a 'listing' literal-text element.

    Calls self.setliteral('listing') and then notifies the
    literal_bgn() hook with the tag name and the attributes.

    attrs -- attribute list, passed through unchanged to literal_bgn().
    """
    self.setliteral('listing')
    self.literal_bgn('listing', attrs)
def end_listing(self):
    """Handle the end of a 'listing' element by calling literal_end('listing')."""
    self.literal_end('listing')
def start_xmp(self, attrs):
    """Handle the start of an 'xmp' literal-text element.

    Calls self.setliteral('xmp') and then notifies the literal_bgn()
    hook with the tag name and the attributes.

    attrs -- attribute list, passed through unchanged to literal_bgn().
    """
    self.setliteral('xmp')
    self.literal_bgn('xmp', attrs)
48 self
.literal_end('xmp')
50 def do_plaintext(self
, attrs
):
52 self
.literal_bgn('plaintext', attrs
)
54 # To be overridden -- begin/end literal mode
def literal_bgn(self, tag, attrs):
    """Hook invoked when a literal-text element begins; does nothing here.

    Meant to be overridden.

    tag   -- name of the literal element (e.g. 'listing', 'xmp', 'plaintext').
    attrs -- attribute list of the start tag.
    """
    pass
def literal_end(self, tag):
    """Hook invoked when a literal-text element ends; does nothing here.

    Meant to be overridden.

    tag -- name of the literal element that is ending.
    """
    pass
59 # Next level of sophistication -- collect anchors, title, nextid and isindex
60 class CollectingParser(HTMLParser
):
63 HTMLParser
.__init
__(self
)
73 def start_a(self
, attrs
):
78 for attrname
, value
in attrs
:
79 if attrname
== 'href':
81 if attrname
== 'name=':
83 if attrname
== 'type=':
84 type = string
.lower(value
)
85 if not (href
or name
):
87 self
.anchors
.append(href
)
88 self
.anchornames
.append(name
)
89 self
.anchortypes
.append(type)
90 self
.inanchor
= len(self
.anchors
)
92 self
.inanchor
= -self
.inanchor
96 # Don't show anchors pointing into the current document
97 if self
.anchors
[self
.inanchor
-1][:1] <> '#':
98 self
.handle_data('[' + `self
.inanchor`
+ ']')
def start_header(self, attrs):
    """Start handler for 'header': accepted and ignored (no-op)."""
    pass
def end_header(self):
    """End handler for 'header': accepted and ignored (no-op)."""
    pass
104 # (head is the same as header)
def start_head(self, attrs):
    """Start handler for 'head': accepted and ignored (no-op)."""
    pass
def end_head(self):
    """End handler for 'head': accepted and ignored (no-op)."""
    pass
def start_body(self, attrs):
    """Start handler for 'body': accepted and ignored (no-op)."""
    pass
def end_body(self):
    """End handler for 'body': accepted and ignored (no-op)."""
    pass
111 def do_nextid(self
, attrs
):
114 def do_isindex(self
, attrs
):
117 def start_title(self
, attrs
):
121 if self
.savetext
<> None:
122 self
.title
= self
.savetext
def handle_data(self, text):
    """Append text to self.savetext while text collection is active.

    Collection is active whenever self.savetext is not None (an empty
    string counts as active); otherwise the text is dropped.

    text -- the character data to append.
    """
    if self.savetext is not None:
        self.savetext += text
130 # Formatting parser -- takes a formatter and a style sheet as arguments
132 # XXX The use of style sheets should change: for each tag and end tag
133 # there should be a style definition, and a style definition should
134 # encompass many more parameters: font, justification, indentation,
135 # vspace before, vspace after, hanging tag...
# wordprog matches a (possibly empty) run of characters that are not
# space, tab or newline; spaceprog matches a (possibly empty) run of
# space, tab and newline characters.
# NOTE(review): these use the old `regex` module; callers in this file
# add the result of .match() to an index, so switching to `re` would
# require changing those callers too -- confirm before migrating.
wordprog = regex.compile('[^ \t\n]*')
spaceprog = regex.compile('[ \t\n]*')
140 class FormattingParser(CollectingParser
):
142 def __init__(self
, formatter
, stylesheet
):
143 CollectingParser
.__init
__(self
)
145 self
.stl
= stylesheet
150 self
.setindent(self
.stl
.stdindent
)
155 self
.fontset
= self
.stl
.stdfontset
160 font
= self
.fontset
[self
.style
]
161 self
.fmt
.setfont(font
)
163 def pushstyle(self
, style
):
164 self
.stylestack
.append(self
.style
)
165 self
.style
= min(style
, len(self
.fontset
)-1)
169 self
.style
= self
.stylestack
[-1]
170 del self
.stylestack
[-1]
def pushfontset(self, fontset, style):
    """Make fontset the current font set and push style on top of it.

    The previous font set is saved on self.fontstack first; the new
    font set is installed before pushstyle() is called.

    fontset -- the font set to make current.
    style   -- style value handed to self.pushstyle().
    """
    self.fontstack.append(self.fontset)
    self.fontset = fontset
    self.pushstyle(style)
178 def popfontset(self
):
179 self
.fontset
= self
.fontstack
[-1]
180 del self
.fontstack
[-1]
def setindent(self, n):
    """Set the formatter's left indent to n (delegates to self.fmt.setleftindent)."""
    self.fmt.setleftindent(n)
def needvspace(self, n):
    """Forward a vertical-space request of n to the formatter (self.fmt.needvspace)."""
    self.fmt.needvspace(n)
193 HTMLParser
.close(self
)
196 def handle_literal(self
, text
):
197 lines
= string
.splitfields(text
, '\n')
198 for i
in range(1, len(lines
)):
199 lines
[i
] = string
.expandtabs(lines
[i
], 8)
200 for line
in lines
[:-1]:
201 self
.fmt
.addword(line
, 0)
204 for line
in lines
[-1:]:
205 self
.fmt
.addword(line
, 0)
207 def handle_data(self
, text
):
208 if self
.savetext
is not None:
209 self
.savetext
= self
.savetext
+ text
212 self
.handle_literal(text
)
217 j
= i
+ wordprog
.match(text
, i
)
219 i
= j
+ spaceprog
.match(text
, j
)
220 self
.fmt
.addword(word
, i
-j
)
221 if self
.nofill
and '\n' in text
[j
:i
]:
225 while text
[i
-1] <> '\n': i
= i
+1
227 def literal_bgn(self
, tag
, attrs
):
228 if tag
== 'plaintext':
232 self
.pushfontset(self
.stl
.stdfontset
, FIXED
)
233 self
.setindent(self
.stl
.literalindent
)
235 def literal_end(self
, tag
):
238 self
.setindent(self
.stl
.stdindent
)
240 def start_title(self
, attrs
):
243 # NB end_title is unchanged
245 def do_p(self
, attrs
):
251 def start_h1(self
, attrs
):
253 self
.setindent(self
.stl
.h1indent
)
254 self
.pushfontset(self
.stl
.h1fontset
, BOLD
)
255 self
.fmt
.setjust('c')
260 self
.setindent(self
.stl
.stdindent
)
261 self
.fmt
.setjust('l')
263 def start_h2(self
, attrs
):
265 self
.setindent(self
.stl
.h2indent
)
266 self
.pushfontset(self
.stl
.h2fontset
, BOLD
)
271 self
.setindent(self
.stl
.stdindent
)
273 def start_h3(self
, attrs
):
275 self
.setindent(self
.stl
.stdindent
)
276 self
.pushfontset(self
.stl
.h3fontset
, BOLD
)
281 self
.setindent(self
.stl
.stdindent
)
283 def start_h4(self
, attrs
):
285 self
.setindent(self
.stl
.stdindent
)
286 self
.pushfontset(self
.stl
.stdfontset
, BOLD
)
291 self
.setindent(self
.stl
.stdindent
)
302 def start_ul(self
, attrs
):
304 for attrname
, value
in attrs
:
305 if attrname
== 'compact':
310 self
.setindent(self
.stl
.ulindent
)
312 start_dir
= start_menu
= start_ol
= start_ul
319 self
.setindent(self
.stl
.stdindent
)
321 end_dir
= end_menu
= end_ol
= end_ul
323 def start_dl(self
, attrs
):
324 for attrname
, value
in attrs
:
325 if attrname
== 'compact':
332 self
.setindent(self
.stl
.stdindent
)
334 def do_dt(self
, attrs
):
339 self
.setindent(self
.stl
.stdindent
)
def do_dd(self, attrs):
    """Handle a 'dd' tag.

    Adds an empty word to the formatter via addword('', 1), then sets
    the indent to the style sheet's ddindent.

    attrs -- attribute list of the tag; not used.
    """
    self.fmt.addword('', 1)
    self.setindent(self.stl.ddindent)
345 def start_address(self
, attrs
):
348 self
.fmt
.setjust('r')
350 def end_address(self
):
353 self
.setindent(self
.stl
.stdindent
)
354 self
.fmt
.setjust('l')
356 def start_pre(self
, attrs
):
358 self
.nofill
= self
.nofill
+ 1
359 self
.pushstyle(FIXED
)
363 self
.nofill
= self
.nofill
- 1
366 start_typewriter
= start_pre
367 end_typewriter
= end_pre
def do_img(self, attrs):
    """Handle an 'img' tag by emitting the placeholder word '(image)'.

    attrs -- attribute list of the tag; not used.
    """
    self.fmt.addword('(image)', 0)
def start_tt(self, attrs):
    """Start of a 'tt' element: push the FIXED style. attrs is not used."""
    self.pushstyle(FIXED)
def end_tt(self):
    """End of a 'tt' element: pop the style pushed at its start."""
    self.popstyle()
def start_b(self, attrs):
    """Start of a 'b' element: push the BOLD style. attrs is not used."""
    self.pushstyle(BOLD)
def end_b(self):
    """End of a 'b' element: pop the style pushed at its start."""
    self.popstyle()
def start_i(self, attrs):
    """Start of an 'i' element: push the ITALIC style. attrs is not used."""
    self.pushstyle(ITALIC)
def end_i(self):
    """End of an 'i' element: pop the style pushed at its start."""
    self.popstyle()
def start_u(self, attrs):
    """Start of a 'u' element: push the ITALIC style. attrs is not used."""
    # Underline???  (rendered with the ITALIC style for now)
    self.pushstyle(ITALIC)
def end_u(self):
    """End of a 'u' element: pop the style pushed at its start."""
    self.popstyle()
def start_r(self, attrs):
    """Start of an 'r' element: push the ROMAN style. attrs is not used."""
    # Not official
    self.pushstyle(ROMAN)
def end_r(self):
    """End of an 'r' element: pop the style pushed at its start."""
    self.popstyle()
394 start_strong
= start_b
397 start_code
= start_tt
400 start_samp
= start_tt
406 start_file
= start_tt
# unofficial
def unknown_starttag(self, tag, attrs):
    """Report a start tag with no handler by printing a diagnostic line.

    tag   -- the tag name, inserted between '<' and '>' in the message.
    attrs -- attribute list of the tag; not used.
    """
    # A single parenthesised argument prints identically whether print
    # is a statement or a function.
    print('*** unknown <' + tag + '>')
def unknown_endtag(self, tag):
    """Report an end tag with no handler by printing a diagnostic line.

    tag -- the tag name, inserted between '</' and '>' in the message.
    """
    # A single parenthesised argument prints identically whether print
    # is a statement or a function.
    print('*** unknown </' + tag + '>')
431 # An extension of the formatting parser which formats anchors differently.
432 class AnchoringParser(FormattingParser
):
434 def start_a(self
, attrs
):
435 FormattingParser
.start_a(self
, attrs
)
437 self
.fmt
.bgn_anchor(self
.inanchor
)
441 self
.fmt
.end_anchor(self
.inanchor
)
445 # Style sheet -- this is never instantiated, but the attributes
446 # of the class object itself are used to specify fonts to be used
447 # for various paragraph styles.
448 # A font set is a non-empty list of fonts, in the order:
449 # [roman, italic, bold, fixed].
450 # When a style is not available the nearest lower style is used
457 class NullStylesheet
:
472 class X11Stylesheet(NullStylesheet
):
474 '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*',
475 '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*',
476 '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*',
477 '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*',
480 '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*',
481 '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*',
482 '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*',
485 '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*',
486 '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*',
487 '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*',
490 '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*',
491 '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*',
492 '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*',
497 class MacStylesheet(NullStylesheet
):
525 StdwinStylesheet
= MacStylesheet
527 StdwinStylesheet
= X11Stylesheet
530 class GLStylesheet(NullStylesheet
):
533 'Helvetica-Italic 10',
539 'Helvetica-Italic 18',
545 'Helvetica-Italic 14',
551 'Helvetica-Italic 12',
557 # Test program -- produces no output but times how long it takes
558 # to send a document to a null formatter, exclusive of I/O
563 if sys
.argv
[1:]: file = sys
.argv
[1]
564 else: file = 'test.html'
565 data
= open(file, 'r').read()
567 fmtr
= fmt
.WritingFormatter(sys
.stdout
, 79)
568 p
= FormattingParser(fmtr
, NullStylesheet
)
573 print '*** Formatting time:', round(t1
-t0
, 3), 'seconds.'
576 # Test program using stdwin
580 from stdwinevents
import *
581 if sys
.argv
[1:]: file = sys
.argv
[1]
582 else: file = 'test.html'
583 data
= open(file, 'r').read()
584 window
= stdwin
.open('testStdwin')
587 etype
, ewin
, edetail
= stdwin
.getevent()
588 if etype
== WE_CLOSE
:
591 window
.setdocsize(0, 0)
592 window
.setorigin(0, 0)
593 window
.change((0, 0), (10000, 30000)) # XXX
596 b
= fmt
.StdwinBackEnd(window
, 1)
597 f
= fmt
.BaseFormatter(b
.d
, b
)
598 p
= FormattingParser(f
, MacStylesheet
)
607 # Test program using GL
611 if sys
.argv
[1:]: file = sys
.argv
[1]
612 else: file = 'test.html'
613 data
= open(file, 'r').read()
617 wid
= gl
.winopen('testGL')
618 gl
.ortho2(0, W
, H
, 0)
622 b
= fmt
.GLBackEnd(wid
)
623 f
= fmt
.BaseFormatter(b
.d
, b
)
624 p
= FormattingParser(f
, GLStylesheet
)
633 if __name__
== '__main__':