1 """Shared support for scanning document type declarations in HTML and XHTML."""
5 _declname_match
= re
.compile(r
'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
6 _declstringlit_match
= re
.compile(r
'(\'[^
\']*\'|
"[^"]*")\s*').match
7 _commentclose = re.compile(r'--\s*>')
8 _markedsectionclose = re.compile(r']\s*]\s*>')
10 # An analysis of the MS-Word extensions is available at
11 # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
13 _msmarkedsectionclose = re.compile(r']\s*>')
19 """Parser base class which provides some common support methods used
20 by the SGML/HTML and XHTML parsers."""
23 if self.__class__ is ParserBase:
25 "markupbase
.ParserBase must be subclassed
")
def error(self, message):
    """Abstract error hook that concrete parsers must override.

    ``message`` is the description supplied by the caller; this base
    implementation does not use it.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError(
        "subclasses of ParserBase must override error()")
36 """Return current line number and offset."""
37 return self.lineno, self.offset
39 # Internal -- update line number and offset. This should be
40 # called for each piece of data exactly once, in order -- in other
41 # words the concatenation of all the input strings to this
42 # function should be exactly the entire input.
43 def updatepos(self, i, j):
46 rawdata = self.rawdata
47 nlines = rawdata.count("\n", i, j)
49 self.lineno = self.lineno + nlines
50 pos = rawdata.rindex("\n", i, j) # Should not fail
51 self.offset = j-(pos+1)
53 self.offset = self.offset + j-i
58 # Internal -- parse declaration (for use by subclasses).
59 def parse_declaration(self, i):
60 # This is some sort of declaration; in "HTML
as
61 # deployed," this should only be the document type
62 # declaration ("<!DOCTYPE html...>").
63 # ISO 8879:1986, however, has more complex
64 # declaration syntax for elements in <!...>, including:
67 # name in the following list: ENTITY, DOCTYPE, ELEMENT,
68 # ATTLIST, NOTATION, SHORTREF, USEMAP,
69 # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
70 rawdata
= self
.rawdata
72 assert rawdata
[i
:j
] == "<!", "unexpected call to parse_declaration"
73 if rawdata
[j
:j
+1] in ("-", ""):
74 # Start of comment followed by buffer boundary,
75 # or just a buffer boundary.
77 # A simple, practical version could look like: ((name|stringlit) S*) + '>'
79 if rawdata
[j
:j
+1] == '--': #comment
80 # Locate --.*-- as the body of the comment
81 return self
.parse_comment(i
)
82 elif rawdata
[j
] == '[': #marked section
83 # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
84 # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
85 # Note that this is extended by Microsoft Office "Save as Web" function
86 # to include [if...] and [endif].
87 return self
.parse_marked_section(i
)
88 else: #all other declaration elements
89 decltype
, j
= self
._scan
_name
(j
, i
)
92 if decltype
== "doctype":
93 self
._decl
_otherchars
= ''
97 # end of declaration syntax
99 if decltype
== "doctype":
100 self
.handle_decl(data
)
102 self
.unknown_decl(data
)
105 m
= _declstringlit_match(rawdata
, j
)
107 return -1 # incomplete
109 elif c
in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
110 name
, j
= self
._scan
_name
(j
, i
)
111 elif c
in self
._decl
_otherchars
:
114 # this could be handled in a separate doctype parser
115 if decltype
== "doctype":
116 j
= self
._parse
_doctype
_subset
(j
+ 1, i
)
117 elif decltype
in ("attlist", "linktype", "link", "element"):
118 # must tolerate []'d groups in a content model in an element declaration
119 # also in data attribute specifications of attlist declaration
120 # also link type declaration subsets in linktype declarations
121 # also link attribute specification lists in link declarations
122 self
.error("unsupported '[' char in %s declaration" % decltype
)
124 self
.error("unexpected '[' char in declaration")
127 "unexpected %s char in declaration" % `rawdata
[j
]`
)
130 return -1 # incomplete
132 # Internal -- parse a marked section
133 # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
134 def parse_marked_section( self
, i
, report
=1 ):
135 rawdata
= self
.rawdata
136 assert rawdata
[i
:i
+3] == '<![', "unexpected call to parse_marked_section()"
137 sectName
, j
= self
._scan
_name
( i
+3, i
)
140 if sectName
in ("temp", "cdata", "ignore", "include", "rcdata"):
141 # look for standard ]]> ending
142 match
= _markedsectionclose
.search(rawdata
, i
+3)
143 elif sectName
in ("if", "else", "endif"):
144 # look for MS Office ]> ending
145 match
= _msmarkedsectionclose
.search(rawdata
, i
+3)
147 self
.error('unknown status keyword %s in marked section' % `rawdata
[i
+3:j
]`
)
152 self
.unknown_decl(rawdata
[i
+3: j
])
155 # Internal -- parse comment, return length or -1 if not terminated
156 def parse_comment(self
, i
, report
=1):
157 rawdata
= self
.rawdata
158 if rawdata
[i
:i
+4] != '<!--':
159 self
.error('unexpected call to parse_comment()')
160 match
= _commentclose
.search(rawdata
, i
+4)
165 self
.handle_comment(rawdata
[i
+4: j
])
168 # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
169 # returning the index just past any whitespace following the trailing ']'.
170 def _parse_doctype_subset(self
, i
, declstartpos
):
171 rawdata
= self
.rawdata
179 # end of buffer; incomplete
182 self
.updatepos(declstartpos
, j
+ 1)
183 self
.error("unexpected char in internal subset (in %s)"
186 # end of buffer; incomplete
189 # end of buffer; incomplete
191 if rawdata
[j
:j
+4] == "<!--":
192 j
= self
.parse_comment(j
, report
=0)
196 name
, j
= self
._scan
_name
(j
+ 2, declstartpos
)
199 if name
not in ("attlist", "element", "entity", "notation"):
200 self
.updatepos(declstartpos
, j
+ 2)
202 "unknown declaration %s in internal subset" % `name`
)
203 # handle the individual names
204 meth
= getattr(self
, "_parse_doctype_" + name
)
205 j
= meth(j
, declstartpos
)
209 # parameter entity reference
211 # end of buffer; incomplete
213 s
, j
= self
._scan
_name
(j
+ 1, declstartpos
)
216 if rawdata
[j
] == ";":
220 while j
< n
and rawdata
[j
].isspace():
223 if rawdata
[j
] == ">":
225 self
.updatepos(declstartpos
, j
)
226 self
.error("unexpected char after internal subset")
232 self
.updatepos(declstartpos
, j
)
233 self
.error("unexpected char %s in internal subset" % `c`
)
234 # end of buffer reached
237 # Internal -- scan past <!ELEMENT declarations
238 def _parse_doctype_element(self
, i
, declstartpos
):
239 name
, j
= self
._scan
_name
(i
, declstartpos
)
242 # style content model; just skip until '>'
243 rawdata
= self
.rawdata
244 if '>' in rawdata
[j
:]:
245 return rawdata
.find(">", j
) + 1
248 # Internal -- scan past <!ATTLIST declarations
249 def _parse_doctype_attlist(self
, i
, declstartpos
):
250 rawdata
= self
.rawdata
251 name
, j
= self
._scan
_name
(i
, declstartpos
)
258 # scan a series of attribute descriptions; simplified:
259 # name type [value] [#constraint]
260 name
, j
= self
._scan
_name
(j
, declstartpos
)
267 # an enumerated type; look for ')'
268 if ")" in rawdata
[j
:]:
269 j
= rawdata
.find(")", j
) + 1
272 while rawdata
[j
:j
+1].isspace():
275 # end of buffer, incomplete
278 name
, j
= self
._scan
_name
(j
, declstartpos
)
283 m
= _declstringlit_match(rawdata
, j
)
292 if rawdata
[j
:] == "#":
295 name
, j
= self
._scan
_name
(j
+ 1, declstartpos
)
305 # Internal -- scan past <!NOTATION declarations
306 def _parse_doctype_notation(self
, i
, declstartpos
):
307 name
, j
= self
._scan
_name
(i
, declstartpos
)
310 rawdata
= self
.rawdata
314 # end of buffer; incomplete
319 m
= _declstringlit_match(rawdata
, j
)
324 name
, j
= self
._scan
_name
(j
, declstartpos
)
328 # Internal -- scan past <!ENTITY declarations
329 def _parse_doctype_entity(self
, i
, declstartpos
):
330 rawdata
= self
.rawdata
331 if rawdata
[i
:i
+1] == "%":
343 name
, j
= self
._scan
_name
(j
, declstartpos
)
347 c
= self
.rawdata
[j
:j
+1]
351 m
= _declstringlit_match(rawdata
, j
)
355 return -1 # incomplete
359 name
, j
= self
._scan
_name
(j
, declstartpos
)
# Internal -- scan a name token; return the lowercased token and the new
# position, or (None, -1) if we've reached the end of the buffer.
365 def _scan_name(self
, i
, declstartpos
):
366 rawdata
= self
.rawdata
370 m
= _declname_match(rawdata
, i
)
374 if (i
+ len(s
)) == n
:
375 return None, -1 # end of buffer
376 return name
.lower(), m
.end()
378 self
.updatepos(declstartpos
, i
)
379 self
.error("expected name token")
381 # To be overridden -- handlers for unknown objects
382 def unknown_decl(self
, data
):