1 """Shared support for scanning document type declarations in HTML and XHTML."""
6 _declname_match
= re
.compile(r
'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
7 _declstringlit_match
= re
.compile(r
'(\'[^
\']*\'|
"[^"]*")\s*').match
13 """Parser base class which provides some common support methods used
14 by the SGML/HTML and XHTML parsers."""
17 if self.__class__ is ParserBase:
19 "markupbase
.ParserBase must be subclassed
")
def error(self, message):
    """Placeholder error hook that concrete parsers must replace.

    This base version ignores *message* and always raises
    NotImplementedError.
    """
    complaint = "subclasses of ParserBase must override error()"
    raise NotImplementedError(complaint)
30 """Return current line number and offset."""
31 return self.lineno, self.offset
33 # Internal -- update line number and offset. This should be
34 # called for each piece of data exactly once, in order -- in other
35 # words the concatenation of all the input strings to this
36 # function should be exactly the entire input.
37 def updatepos(self, i, j):
40 rawdata = self.rawdata
41 nlines = string.count(rawdata, "\n", i, j)
43 self.lineno = self.lineno + nlines
44 pos = string.rindex(rawdata, "\n", i, j) # Should not fail
45 self.offset = j-(pos+1)
47 self.offset = self.offset + j-i
52 # Internal -- parse declaration (for use by subclasses).
53 def parse_declaration(self, i):
54 # This is some sort of declaration; in "HTML
as
55 # deployed," this should only be the document type
56 # declaration ("<!DOCTYPE html...>").
57 rawdata
= self
.rawdata
59 assert rawdata
[i
:j
] == "<!", "unexpected call to parse_declaration"
60 if rawdata
[j
:j
+1] in ("-", ""):
61 # Start of comment followed by buffer boundary,
62 # or just a buffer boundary.
64 # in practice, this should look like: ((name|stringlit) S*)+ '>'
66 decltype
, j
= self
._scan
_name
(j
, i
)
69 if decltype
== "doctype":
70 self
._decl
_otherchars
= ''
74 # end of declaration syntax
76 if decltype
== "doctype":
77 self
.handle_decl(data
)
79 self
.unknown_decl(data
)
82 m
= _declstringlit_match(rawdata
, j
)
84 return -1 # incomplete
86 elif c
in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
87 name
, j
= self
._scan
_name
(j
, i
)
88 elif c
in self
._decl
_otherchars
:
91 if decltype
== "doctype":
92 j
= self
._parse
_doctype
_subset
(j
+ 1, i
)
94 self
.error("unexpected '[' char in declaration")
97 "unexpected %s char in declaration" % `rawdata
[j
]`
)
100 return -1 # incomplete
102 # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
103 # returning the index just past any whitespace following the trailing ']'.
104 def _parse_doctype_subset(self
, i
, declstartpos
):
105 rawdata
= self
.rawdata
113 # end of buffer; incomplete
116 self
.updatepos(declstartpos
, j
+ 1)
117 self
.error("unexpected char in internal subset (in %s)"
120 # end of buffer; incomplete
123 # end of buffer; incomplete
125 if rawdata
[j
:j
+4] == "<!--":
126 j
= self
.parse_comment(j
, report
=0)
130 name
, j
= self
._scan
_name
(j
+ 2, declstartpos
)
133 if name
not in ("attlist", "element", "entity", "notation"):
134 self
.updatepos(declstartpos
, j
+ 2)
136 "unknown declaration %s in internal subset" % `name`
)
137 # handle the individual names
138 meth
= getattr(self
, "_parse_doctype_" + name
)
139 j
= meth(j
, declstartpos
)
143 # parameter entity reference
145 # end of buffer; incomplete
147 s
, j
= self
._scan
_name
(j
+ 1, declstartpos
)
150 if rawdata
[j
] == ";":
154 while j
< n
and rawdata
[j
] in string
.whitespace
:
157 if rawdata
[j
] == ">":
159 self
.updatepos(declstartpos
, j
)
160 self
.error("unexpected char after internal subset")
163 elif c
in string
.whitespace
:
166 self
.updatepos(declstartpos
, j
)
167 self
.error("unexpected char %s in internal subset" % `c`
)
168 # end of buffer reached
171 # Internal -- scan past <!ELEMENT declarations
172 def _parse_doctype_element(self
, i
, declstartpos
):
173 name
, j
= self
._scan
_name
(i
, declstartpos
)
176 # style content model; just skip until '>'
177 rawdata
= self
.rawdata
178 if '>' in rawdata
[j
:]:
179 return string
.find(rawdata
, ">", j
) + 1
182 # Internal -- scan past <!ATTLIST declarations
183 def _parse_doctype_attlist(self
, i
, declstartpos
):
184 rawdata
= self
.rawdata
185 name
, j
= self
._scan
_name
(i
, declstartpos
)
192 # scan a series of attribute descriptions; simplified:
193 # name type [value] [#constraint]
194 name
, j
= self
._scan
_name
(j
, declstartpos
)
201 # an enumerated type; look for ')'
202 if ")" in rawdata
[j
:]:
203 j
= string
.find(rawdata
, ")", j
) + 1
206 while rawdata
[j
:j
+1] in string
.whitespace
:
209 # end of buffer, incomplete
212 name
, j
= self
._scan
_name
(j
, declstartpos
)
217 m
= _declstringlit_match(rawdata
, j
)
226 if rawdata
[j
:] == "#":
229 name
, j
= self
._scan
_name
(j
+ 1, declstartpos
)
239 # Internal -- scan past <!NOTATION declarations
240 def _parse_doctype_notation(self
, i
, declstartpos
):
241 name
, j
= self
._scan
_name
(i
, declstartpos
)
244 rawdata
= self
.rawdata
248 # end of buffer; incomplete
253 m
= _declstringlit_match(rawdata
, j
)
258 name
, j
= self
._scan
_name
(j
, declstartpos
)
262 # Internal -- scan past <!ENTITY declarations
263 def _parse_doctype_entity(self
, i
, declstartpos
):
264 rawdata
= self
.rawdata
265 if rawdata
[i
:i
+1] == "%":
271 if c
in string
.whitespace
:
277 name
, j
= self
._scan
_name
(j
, declstartpos
)
281 c
= self
.rawdata
[j
:j
+1]
285 m
= _declstringlit_match(rawdata
, j
)
289 return -1 # incomplete
293 name
, j
= self
._scan
_name
(j
, declstartpos
)
297 # Internal -- scan a name token; return the lowercased token and the new
298 # position, or (None, -1) if we've reached the end of the buffer.
299 def _scan_name(self
, i
, declstartpos
):
300 rawdata
= self
.rawdata
304 m
= _declname_match(rawdata
, i
)
308 if (i
+ len(s
)) == n
:
309 return None, -1 # end of buffer
310 return string
.lower(name
), m
.end()
312 self
.updatepos(declstartpos
, i
)
313 self
.error("expected name token")
315 # To be overridden -- handlers for unknown objects
316 def unknown_decl(self
, data
):