Lib/HTMLParser.py

   1 """A parser for HTML and XHTML."""
   2
   3 # This file is based on sgmllib.py, but the API is slightly different.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import markupbase
  12 import re
  13
  14 # Regular expressions used for parsing
  15
  16 interesting_normal = re.compile('[&<]')
  17 interesting_cdata = re.compile(r'<(/|\Z)')
  18 incomplete = re.compile('&[a-zA-Z#]')
  19
  20 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  21 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  22
  23 starttagopen = re.compile('<[a-zA-Z]')
  24 piclose = re.compile('>')
  25 commentclose = re.compile(r'--\s*>')
  26 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  27 attrfind = re.compile(
  28     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  29     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
  30
  31 locatestarttagend = re.compile(r"""
  32   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  33   (?:\s+                             # whitespace before attribute name
  34     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
  35       (?:\s*=\s*                     # value indicator
  36         (?:'[^']*'                   # LITA-enclosed value
  37           |\"[^\"]*\"                # LIT-enclosed value
  38           |[^'\">\s]+                # bare value
  39          )
  40        )?
  41      )
  42    )*
  43   \s*                                # trailing whitespace
  44 """, re.VERBOSE)
  45 endendtag = re.compile('>')
  46 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  47
  48
  49 class HTMLParseError(Exception):
  50     """Exception raised for all parse errors."""
  51
  52     def __init__(self, msg, position=(None, None)):
  53         assert msg
  54         self.msg = msg
  55         self.lineno = position[0]
  56         self.offset = position[1]
  57
  58     def __str__(self):
  59         result = self.msg
  60         if self.lineno is not None:
  61             result = result + ", at line %d" % self.lineno
  62         if self.offset is not None:
  63             result = result + ", column %d" % (self.offset + 1)
  64         return result
  65
  66
  67 class HTMLParser(markupbase.ParserBase):
  68     """Find tags and other markup and call handler functions.
  69
  70     Usage:
  71         p = HTMLParser()
  72         p.feed(data)
  73         ...
  74         p.close()
  75
  76     Start tags are handled by calling self.handle_starttag() or
  77     self.handle_startendtag(); end tags by self.handle_endtag().  The
  78     data between tags is passed from the parser to the derived class
  79     by calling self.handle_data() with the data as argument (the data
  80     may be split up in arbitrary chunks).  Entity references are
  81     passed by calling self.handle_entityref() with the entity
  82     reference as the argument.  Numeric character references are
  83     passed to self.handle_charref() with the string containing the
  84     reference as the argument.
  85     """
  86
  87     CDATA_CONTENT_ELEMENTS = ("script", "style")
  88
  89
  90     def __init__(self):
  91         """Initialize and reset this instance."""
  92         self.reset()
  93
  94     def reset(self):
  95         """Reset this instance.  Loses all unprocessed data."""
  96         self.rawdata = ''
  97         self.lasttag = '???'
  98         self.interesting = interesting_normal
  99         markupbase.ParserBase.reset(self)
 100
 101     def feed(self, data):
 102         """Feed data to the parser.
 103
 104         Call this as often as you want, with as little or as much text
 105         as you want (may include '\n').
 106         """
 107         self.rawdata = self.rawdata + data
 108         self.goahead(0)
 109
 110     def close(self):
 111         """Handle any buffered data."""
 112         self.goahead(1)
 113
 114     def error(self, message):
 115         raise HTMLParseError(message, self.getpos())
 116
 117     __starttag_text = None
 118
 119     def get_starttag_text(self):
 120         """Return full source of start tag: '<...>'."""
 121         return self.__starttag_text
 122
 123     def set_cdata_mode(self):
 124         self.interesting = interesting_cdata
 125
 126     def clear_cdata_mode(self):
 127         self.interesting = interesting_normal
 128
 129     # Internal -- handle data as far as reasonable.  May leave state
 130     # and data to be processed by a subsequent call.  If 'end' is
 131     # true, force handling all data as if followed by EOF marker.
 132     def goahead(self, end):
 133         rawdata = self.rawdata
 134         i = 0
 135         n = len(rawdata)
 136         while i < n:
 137             match = self.interesting.search(rawdata, i) # < or &
 138             if match:
 139                 j = match.start()
 140             else:
 141                 j = n
 142             if i < j: self.handle_data(rawdata[i:j])
 143             i = self.updatepos(i, j)
 144             if i == n: break
 145             startswith = rawdata.startswith
 146             if startswith('<', i):
 147                 if starttagopen.match(rawdata, i): # < + letter
 148                     k = self.parse_starttag(i)
 149                 elif startswith("</", i):
 150                     k = self.parse_endtag(i)
 151                 elif startswith("<!--", i):
 152                     k = self.parse_comment(i)
 153                 elif startswith("<?", i):
 154                     k = self.parse_pi(i)
 155                 elif startswith("<!", i):
 156                     k = self.parse_declaration(i)
 157                 elif (i + 1) < n:
 158                     self.handle_data("<")
 159                     k = i + 1
 160                 else:
 161                     break
 162                 if k < 0:
 163                     if end:
 164                         self.error("EOF in middle of construct")
 165                     break
 166                 i = self.updatepos(i, k)
 167             elif startswith("&#", i):
 168                 match = charref.match(rawdata, i)
 169                 if match:
 170                     name = match.group()[2:-1]
 171                     self.handle_charref(name)
 172                     k = match.end()
 173                     if not startswith(';', k-1):
 174                         k = k - 1
 175                     i = self.updatepos(i, k)
 176                     continue
 177                 else:
 178                     break
 179             elif startswith('&', i):
 180                 match = entityref.match(rawdata, i)
 181                 if match:
 182                     name = match.group(1)
 183                     self.handle_entityref(name)
 184                     k = match.end()
 185                     if not startswith(';', k-1):
 186                         k = k - 1
 187                     i = self.updatepos(i, k)
 188                     continue
 189                 match = incomplete.match(rawdata, i)
 190                 if match:
 191                     # match.group() will contain at least 2 chars
 192                     if end and match.group() == rawdata[i:]:
 193                         self.error("EOF in middle of entity or char ref")
 194                     # incomplete
 195                     break
 196                 elif (i + 1) < n:
 197                     # not the end of the buffer, and can't be confused
 198                     # with some other construct
 199                     self.handle_data("&")
 200                     i = self.updatepos(i, i + 1)
 201                 else:
 202                     break
 203             else:
 204                 assert 0, "interesting.search() lied"
 205         # end while
 206         if end and i < n:
 207             self.handle_data(rawdata[i:n])
 208             i = self.updatepos(i, n)
 209         self.rawdata = rawdata[i:]
 210
 211     # Internal -- parse comment, return end or -1 if not terminated
 212     def parse_comment(self, i, report=1):
 213         rawdata = self.rawdata
 214         assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
 215         match = commentclose.search(rawdata, i+4)
 216         if not match:
 217             return -1
 218         if report:
 219             j = match.start()
 220             self.handle_comment(rawdata[i+4: j])
 221         j = match.end()
 222         return j
 223
 224     # Internal -- parse processing instr, return end or -1 if not terminated
 225     def parse_pi(self, i):
 226         rawdata = self.rawdata
 227         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
 228         match = piclose.search(rawdata, i+2) # >
 229         if not match:
 230             return -1
 231         j = match.start()
 232         self.handle_pi(rawdata[i+2: j])
 233         j = match.end()
 234         return j
 235
 236     # Internal -- handle starttag, return end or -1 if not terminated
 237     def parse_starttag(self, i):
 238         self.__starttag_text = None
 239         endpos = self.check_for_whole_start_tag(i)
 240         if endpos < 0:
 241             return endpos
 242         rawdata = self.rawdata
 243         self.__starttag_text = rawdata[i:endpos]
 244
 245         # Now parse the data between i+1 and j into a tag and attrs
 246         attrs = []
 247         match = tagfind.match(rawdata, i+1)
 248         assert match, 'unexpected call to parse_starttag()'
 249         k = match.end()
 250         self.lasttag = tag = rawdata[i+1:k].lower()
 251
 252         while k < endpos:
 253             m = attrfind.match(rawdata, k)
 254             if not m:
 255                 break
 256             attrname, rest, attrvalue = m.group(1, 2, 3)
 257             if not rest:
 258                 attrvalue = None
 259             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 260                  attrvalue[:1] == '"' == attrvalue[-1:]:
 261                 attrvalue = attrvalue[1:-1]
 262                 attrvalue = self.unescape(attrvalue)
 263             attrs.append((attrname.lower(), attrvalue))
 264             k = m.end()
 265
 266         end = rawdata[k:endpos].strip()
 267         if end not in (">", "/>"):
 268             lineno, offset = self.getpos()
 269             if "\n" in self.__starttag_text:
 270                 lineno = lineno + self.__starttag_text.count("\n")
 271                 offset = len(self.__starttag_text) \
 272                          - self.__starttag_text.rfind("\n")
 273             else:
 274                 offset = offset + len(self.__starttag_text)
 275             self.error("junk characters in start tag: %s"
 276                        % `rawdata[k:endpos][:20]`)
 277         if end.endswith('/>'):
 278             # XHTML-style empty tag: <span attr="value" />
 279             self.handle_startendtag(tag, attrs)
 280         else:
 281             self.handle_starttag(tag, attrs)
 282             if tag in self.CDATA_CONTENT_ELEMENTS:
 283                 self.set_cdata_mode()
 284         return endpos
 285
 286     # Internal -- check to see if we have a complete starttag; return end
 287     # or -1 if incomplete.
 288     def check_for_whole_start_tag(self, i):
 289         rawdata = self.rawdata
 290         m = locatestarttagend.match(rawdata, i)
 291         if m:
 292             j = m.end()
 293             next = rawdata[j:j+1]
 294             if next == ">":
 295                 return j + 1
 296             if next == "/":
 297                 if rawdata.startswith("/>", j):
 298                     return j + 2
 299                 if rawdata.startswith("/", j):
 300                     # buffer boundary
 301                     return -1
 302                 # else bogus input
 303                 self.updatepos(i, j + 1)
 304                 self.error("malformed empty start tag")
 305             if next == "":
 306                 # end of input
 307                 return -1
 308             if next in ("abcdefghijklmnopqrstuvwxyz=/"
 309                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
 310                 # end of input in or before attribute value, or we have the
 311                 # '/' from a '/>' ending
 312                 return -1
 313             self.updatepos(i, j)
 314             self.error("malformed start tag")
 315         raise AssertionError("we should not get here!")
 316
 317     # Internal -- parse endtag, return end or -1 if incomplete
 318     def parse_endtag(self, i):
 319         rawdata = self.rawdata
 320         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
 321         match = endendtag.search(rawdata, i+1) # >
 322         if not match:
 323             return -1
 324         j = match.end()
 325         match = endtagfind.match(rawdata, i) # </ + tag + >
 326         if not match:
 327             self.error("bad end tag: %s" % `rawdata[i:j]`)
 328         tag = match.group(1)
 329         self.handle_endtag(tag.lower())
 330         self.clear_cdata_mode()
 331         return j
 332
 333     # Overridable -- finish processing of start+end tag: <tag.../>
 334     def handle_startendtag(self, tag, attrs):
 335         self.handle_starttag(tag, attrs)
 336         self.handle_endtag(tag)
 337
 338     # Overridable -- handle start tag
 339     def handle_starttag(self, tag, attrs):
 340         pass
 341
 342     # Overridable -- handle end tag
 343     def handle_endtag(self, tag):
 344         pass
 345
 346     # Overridable -- handle character reference
 347     def handle_charref(self, name):
 348         pass
 349
 350     # Overridable -- handle entity reference
 351     def handle_entityref(self, name):
 352         pass
 353
 354     # Overridable -- handle data
 355     def handle_data(self, data):
 356         pass
 357
 358     # Overridable -- handle comment
 359     def handle_comment(self, data):
 360         pass
 361
 362     # Overridable -- handle declaration
 363     def handle_decl(self, decl):
 364         pass
 365
 366     # Overridable -- handle processing instruction
 367     def handle_pi(self, data):
 368         pass
 369
 370     def unknown_decl(self, data):
 371         self.error("unknown declaration: " + `data`)
 372
 373     # Internal -- helper to remove special character quoting
 374     def unescape(self, s):
 375         if '&' not in s:
 376             return s
 377         s = s.replace("&lt;", "<")
 378         s = s.replace("&gt;", ">")
 379         s = s.replace("&apos;", "'")
 380         s = s.replace("&quot;", '"')
 381         s = s.replace("&amp;", "&") # Must be last
 382         return s