Lib/markupbase.py

   1 """Shared support for scanning document type declarations in HTML and XHTML."""
   2
   3 import re
   4
# Match a declaration name token (a letter followed by letters, digits,
# '-', '_' or '.') plus any trailing whitespace.  Bound to .match, so it
# is anchored at the position passed by the caller.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Match a single- or double-quoted string literal plus any trailing
# whitespace.  Also bound to .match (anchored at the given position).
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Close of a comment: "--", optional whitespace, ">".
_commentclose = re.compile(r'--\s*>')
# Close of a standard marked section: "]", "]", ">" with optional
# whitespace between them.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# Close of an MS Office style marked section: "]", optional whitespace, ">".
_msmarkedsectionclose = re.compile(r']\s*>')

# Everything needed from the module has been compiled above; remove the
# name "re" from this module's namespace.
del re
  16
  17
  18 class ParserBase:
  19     """Parser base class which provides some common support methods used
  20     by the SGML/HTML and XHTML parsers."""
  21
  22     def __init__(self):
  23         if self.__class__ is ParserBase:
  24             raise RuntimeError(
  25                 "markupbase.ParserBase must be subclassed")
  26
  27     def error(self, message):
  28         raise NotImplementedError(
  29             "subclasses of ParserBase must override error()")
  30
  31     def reset(self):
  32         self.lineno = 1
  33         self.offset = 0
  34
  35     def getpos(self):
  36         """Return current line number and offset."""
  37         return self.lineno, self.offset
  38
  39     # Internal -- update line number and offset.  This should be
  40     # called for each piece of data exactly once, in order -- in other
  41     # words the concatenation of all the input strings to this
  42     # function should be exactly the entire input.
  43     def updatepos(self, i, j):
  44         if i >= j:
  45             return j
  46         rawdata = self.rawdata
  47         nlines = rawdata.count("\n", i, j)
  48         if nlines:
  49             self.lineno = self.lineno + nlines
  50             pos = rawdata.rindex("\n", i, j) # Should not fail
  51             self.offset = j-(pos+1)
  52         else:
  53             self.offset = self.offset + j-i
  54         return j
  55
  56     _decl_otherchars = ''
  57
  58     # Internal -- parse declaration (for use by subclasses).
  59     def parse_declaration(self, i):
  60         # This is some sort of declaration; in "HTML as
  61         # deployed," this should only be the document type
  62         # declaration ("<!DOCTYPE html...>").
  63         # ISO 8879:1986, however, has more complex
  64         # declaration syntax for elements in <!...>, including:
  65         # --comment--
  66         # [marked section]
  67         # name in the following list: ENTITY, DOCTYPE, ELEMENT,
  68         # ATTLIST, NOTATION, SHORTREF, USEMAP,
  69         # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
  70         rawdata = self.rawdata
  71         j = i + 2
  72         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
  73         if rawdata[j:j+1] in ("-", ""):
  74             # Start of comment followed by buffer boundary,
  75             # or just a buffer boundary.
  76             return -1
  77         # A simple, practical version could look like: ((name|stringlit) S*) + '>'
  78         n = len(rawdata)
  79         if rawdata[j:j+1] == '--': #comment
  80             # Locate --.*-- as the body of the comment
  81             return self.parse_comment(i)
  82         elif rawdata[j] == '[': #marked section
  83             # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
  84             # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
  85             # Note that this is extended by Microsoft Office "Save as Web" function
  86             # to include [if...] and [endif].
  87             return self.parse_marked_section(i)
  88         else: #all other declaration elements
  89             decltype, j = self._scan_name(j, i)
  90         if j < 0:
  91             return j
  92         if decltype == "doctype":
  93             self._decl_otherchars = ''
  94         while j < n:
  95             c = rawdata[j]
  96             if c == ">":
  97                 # end of declaration syntax
  98                 data = rawdata[i+2:j]
  99                 if decltype == "doctype":
 100                     self.handle_decl(data)
 101                 else:
 102                     self.unknown_decl(data)
 103                 return j + 1
 104             if c in "\"'":
 105                 m = _declstringlit_match(rawdata, j)
 106                 if not m:
 107                     return -1 # incomplete
 108                 j = m.end()
 109             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
 110                 name, j = self._scan_name(j, i)
 111             elif c in self._decl_otherchars:
 112                 j = j + 1
 113             elif c == "[":
 114                 # this could be handled in a separate doctype parser
 115                 if decltype == "doctype":
 116                     j = self._parse_doctype_subset(j + 1, i)
 117                 elif decltype in ("attlist", "linktype", "link", "element"):
 118                     # must tolerate []'d groups in a content model in an element declaration
 119                     # also in data attribute specifications of attlist declaration
 120                     # also link type declaration subsets in linktype declarations
 121                     # also link attribute specification lists in link declarations
 122                     self.error("unsupported '[' char in %s declaration" % decltype)
 123                 else:
 124                     self.error("unexpected '[' char in declaration")
 125             else:
 126                 self.error(
 127                     "unexpected %s char in declaration" % `rawdata[j]`)
 128             if j < 0:
 129                 return j
 130         return -1 # incomplete
 131
 132     # Internal -- parse a marked section
 133     # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
 134     def parse_marked_section( self, i, report=1 ):
 135         rawdata= self.rawdata
 136         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
 137         sectName, j = self._scan_name( i+3, i )
 138         if j < 0:
 139             return j
 140         if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
 141             # look for standard ]]> ending
 142             match= _markedsectionclose.search(rawdata, i+3)
 143         elif sectName in ("if", "else", "endif"):
 144             # look for MS Office ]> ending
 145             match= _msmarkedsectionclose.search(rawdata, i+3)
 146         else:
 147             self.error('unknown status keyword %s in marked section' % `rawdata[i+3:j]`)
 148         if not match:
 149             return -1
 150         if report:
 151             j = match.start(0)
 152             self.unknown_decl(rawdata[i+3: j])
 153         return match.end(0)
 154
 155     # Internal -- parse comment, return length or -1 if not terminated
 156     def parse_comment(self, i, report=1):
 157         rawdata = self.rawdata
 158         if rawdata[i:i+4] != '<!--':
 159             self.error('unexpected call to parse_comment()')
 160         match = _commentclose.search(rawdata, i+4)
 161         if not match:
 162             return -1
 163         if report:
 164             j = match.start(0)
 165             self.handle_comment(rawdata[i+4: j])
 166         return match.end(0)
 167
 168     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
 169     # returning the index just past any whitespace following the trailing ']'.
 170     def _parse_doctype_subset(self, i, declstartpos):
 171         rawdata = self.rawdata
 172         n = len(rawdata)
 173         j = i
 174         while j < n:
 175             c = rawdata[j]
 176             if c == "<":
 177                 s = rawdata[j:j+2]
 178                 if s == "<":
 179                     # end of buffer; incomplete
 180                     return -1
 181                 if s != "<!":
 182                     self.updatepos(declstartpos, j + 1)
 183                     self.error("unexpected char in internal subset (in %s)"
 184                                % `s`)
 185                 if (j + 2) == n:
 186                     # end of buffer; incomplete
 187                     return -1
 188                 if (j + 4) > n:
 189                     # end of buffer; incomplete
 190                     return -1
 191                 if rawdata[j:j+4] == "<!--":
 192                     j = self.parse_comment(j, report=0)
 193                     if j < 0:
 194                         return j
 195                     continue
 196                 name, j = self._scan_name(j + 2, declstartpos)
 197                 if j == -1:
 198                     return -1
 199                 if name not in ("attlist", "element", "entity", "notation"):
 200                     self.updatepos(declstartpos, j + 2)
 201                     self.error(
 202                         "unknown declaration %s in internal subset" % `name`)
 203                 # handle the individual names
 204                 meth = getattr(self, "_parse_doctype_" + name)
 205                 j = meth(j, declstartpos)
 206                 if j < 0:
 207                     return j
 208             elif c == "%":
 209                 # parameter entity reference
 210                 if (j + 1) == n:
 211                     # end of buffer; incomplete
 212                     return -1
 213                 s, j = self._scan_name(j + 1, declstartpos)
 214                 if j < 0:
 215                     return j
 216                 if rawdata[j] == ";":
 217                     j = j + 1
 218             elif c == "]":
 219                 j = j + 1
 220                 while j < n and rawdata[j].isspace():
 221                     j = j + 1
 222                 if j < n:
 223                     if rawdata[j] == ">":
 224                         return j
 225                     self.updatepos(declstartpos, j)
 226                     self.error("unexpected char after internal subset")
 227                 else:
 228                     return -1
 229             elif c.isspace():
 230                 j = j + 1
 231             else:
 232                 self.updatepos(declstartpos, j)
 233                 self.error("unexpected char %s in internal subset" % `c`)
 234         # end of buffer reached
 235         return -1
 236
 237     # Internal -- scan past <!ELEMENT declarations
 238     def _parse_doctype_element(self, i, declstartpos):
 239         name, j = self._scan_name(i, declstartpos)
 240         if j == -1:
 241             return -1
 242         # style content model; just skip until '>'
 243         rawdata = self.rawdata
 244         if '>' in rawdata[j:]:
 245             return rawdata.find(">", j) + 1
 246         return -1
 247
 248     # Internal -- scan past <!ATTLIST declarations
 249     def _parse_doctype_attlist(self, i, declstartpos):
 250         rawdata = self.rawdata
 251         name, j = self._scan_name(i, declstartpos)
 252         c = rawdata[j:j+1]
 253         if c == "":
 254             return -1
 255         if c == ">":
 256             return j + 1
 257         while 1:
 258             # scan a series of attribute descriptions; simplified:
 259             #   name type [value] [#constraint]
 260             name, j = self._scan_name(j, declstartpos)
 261             if j < 0:
 262                 return j
 263             c = rawdata[j:j+1]
 264             if c == "":
 265                 return -1
 266             if c == "(":
 267                 # an enumerated type; look for ')'
 268                 if ")" in rawdata[j:]:
 269                     j = rawdata.find(")", j) + 1
 270                 else:
 271                     return -1
 272                 while rawdata[j:j+1].isspace():
 273                     j = j + 1
 274                 if not rawdata[j:]:
 275                     # end of buffer, incomplete
 276                     return -1
 277             else:
 278                 name, j = self._scan_name(j, declstartpos)
 279             c = rawdata[j:j+1]
 280             if not c:
 281                 return -1
 282             if c in "'\"":
 283                 m = _declstringlit_match(rawdata, j)
 284                 if m:
 285                     j = m.end()
 286                 else:
 287                     return -1
 288                 c = rawdata[j:j+1]
 289                 if not c:
 290                     return -1
 291             if c == "#":
 292                 if rawdata[j:] == "#":
 293                     # end of buffer
 294                     return -1
 295                 name, j = self._scan_name(j + 1, declstartpos)
 296                 if j < 0:
 297                     return j
 298                 c = rawdata[j:j+1]
 299                 if not c:
 300                     return -1
 301             if c == '>':
 302                 # all done
 303                 return j + 1
 304
 305     # Internal -- scan past <!NOTATION declarations
 306     def _parse_doctype_notation(self, i, declstartpos):
 307         name, j = self._scan_name(i, declstartpos)
 308         if j < 0:
 309             return j
 310         rawdata = self.rawdata
 311         while 1:
 312             c = rawdata[j:j+1]
 313             if not c:
 314                 # end of buffer; incomplete
 315                 return -1
 316             if c == '>':
 317                 return j + 1
 318             if c in "'\"":
 319                 m = _declstringlit_match(rawdata, j)
 320                 if not m:
 321                     return -1
 322                 j = m.end()
 323             else:
 324                 name, j = self._scan_name(j, declstartpos)
 325                 if j < 0:
 326                     return j
 327
 328     # Internal -- scan past <!ENTITY declarations
 329     def _parse_doctype_entity(self, i, declstartpos):
 330         rawdata = self.rawdata
 331         if rawdata[i:i+1] == "%":
 332             j = i + 1
 333             while 1:
 334                 c = rawdata[j:j+1]
 335                 if not c:
 336                     return -1
 337                 if c.isspace():
 338                     j = j + 1
 339                 else:
 340                     break
 341         else:
 342             j = i
 343         name, j = self._scan_name(j, declstartpos)
 344         if j < 0:
 345             return j
 346         while 1:
 347             c = self.rawdata[j:j+1]
 348             if not c:
 349                 return -1
 350             if c in "'\"":
 351                 m = _declstringlit_match(rawdata, j)
 352                 if m:
 353                     j = m.end()
 354                 else:
 355                     return -1    # incomplete
 356             elif c == ">":
 357                 return j + 1
 358             else:
 359                 name, j = self._scan_name(j, declstartpos)
 360                 if j < 0:
 361                     return j
 362
 363     # Internal -- scan a name token and the new position and the token, or
 364     # return -1 if we've reached the end of the buffer.
 365     def _scan_name(self, i, declstartpos):
 366         rawdata = self.rawdata
 367         n = len(rawdata)
 368         if i == n:
 369             return None, -1
 370         m = _declname_match(rawdata, i)
 371         if m:
 372             s = m.group()
 373             name = s.strip()
 374             if (i + len(s)) == n:
 375                 return None, -1  # end of buffer
 376             return name.lower(), m.end()
 377         else:
 378             self.updatepos(declstartpos, i)
 379             self.error("expected name token")
 380
 381     # To be overridden -- handlers for unknown objects
 382     def unknown_decl(self, data):
 383         pass