Lib/HTMLParser.py

   1 """A parser for HTML and XHTML."""
   2
   3 # This file is based on sgmllib.py, but the API is slightly different.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import markupbase
  12 import re
  13
  14 # Regular expressions used for parsing
  15
  16 interesting_normal = re.compile('[&<]')
  17 interesting_cdata = re.compile(r'<(/|\Z)')
  18 incomplete = re.compile('&[a-zA-Z#]')
  19
  20 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  21 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  22
  23 starttagopen = re.compile('<[a-zA-Z]')
  24 piclose = re.compile('>')
  25 commentclose = re.compile(r'--\s*>')
  26 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  27 attrfind = re.compile(
  28     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  29     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
  30
  31 locatestarttagend = re.compile(r"""
  32   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  33   (?:\s+                             # whitespace before attribute name
  34     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
  35       (?:\s*=\s*                     # value indicator
  36         (?:'[^']*'                   # LITA-enclosed value
  37           |\"[^\"]*\"                # LIT-enclosed value
  38           |[^'\">\s]+                # bare value
  39          )
  40        )?
  41      )
  42    )*
  43   \s*                                # trailing whitespace
  44 """, re.VERBOSE)
  45 endendtag = re.compile('>')
  46 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  47
  48
  49 class HTMLParseError(Exception):
  50     """Exception raised for all parse errors."""
  51
  52     def __init__(self, msg, position=(None, None)):
  53         assert msg
  54         self.msg = msg
  55         self.lineno = position[0]
  56         self.offset = position[1]
  57
  58     def __str__(self):
  59         result = self.msg
  60         if self.lineno is not None:
  61             result = result + ", at line %d" % self.lineno
  62         if self.offset is not None:
  63             result = result + ", column %d" % (self.offset + 1)
  64         return result
  65
  66
  67 class HTMLParser(markupbase.ParserBase):
  68     """Find tags and other markup and call handler functions.
  69
  70     Usage:
  71         p = HTMLParser()
  72         p.feed(data)
  73         ...
  74         p.close()
  75
  76     Start tags are handled by calling self.handle_starttag() or
  77     self.handle_startendtag(); end tags by self.handle_endtag().  The
  78     data between tags is passed from the parser to the derived class
  79     by calling self.handle_data() with the data as argument (the data
  80     may be split up in arbitrary chunks).  Entity references are
  81     passed by calling self.handle_entityref() with the entity
  82     reference as the argument.  Numeric character references are
  83     passed to self.handle_charref() with the string containing the
  84     reference as the argument.
  85     """
  86
  87     CDATA_CONTENT_ELEMENTS = ("script", "style")
  88
  89
  90     def __init__(self):
  91         """Initialize and reset this instance."""
  92         self.reset()
  93
  94     def reset(self):
  95         """Reset this instance.  Loses all unprocessed data."""
  96         self.rawdata = ''
  97         self.lasttag = '???'
  98         self.interesting = interesting_normal
  99         markupbase.ParserBase.reset(self)
 100
 101     def feed(self, data):
 102         """Feed data to the parser.
 103
 104         Call this as often as you want, with as little or as much text
 105         as you want (may include '\n').
 106         """
 107         self.rawdata = self.rawdata + data
 108         self.goahead(0)
 109
 110     def close(self):
 111         """Handle any buffered data."""
 112         self.goahead(1)
 113
 114     def error(self, message):
 115         raise HTMLParseError(message, self.getpos())
 116
 117     __starttag_text = None
 118
 119     def get_starttag_text(self):
 120         """Return full source of start tag: '<...>'."""
 121         return self.__starttag_text
 122
 123     def set_cdata_mode(self):
 124         self.interesting = interesting_cdata
 125
 126     def clear_cdata_mode(self):
 127         self.interesting = interesting_normal
 128
 129     # Internal -- handle data as far as reasonable.  May leave state
 130     # and data to be processed by a subsequent call.  If 'end' is
 131     # true, force handling all data as if followed by EOF marker.
 132     def goahead(self, end):
 133         rawdata = self.rawdata
 134         i = 0
 135         n = len(rawdata)
 136         while i < n:
 137             match = self.interesting.search(rawdata, i) # < or &
 138             if match:
 139                 j = match.start()
 140             else:
 141                 j = n
 142             if i < j: self.handle_data(rawdata[i:j])
 143             i = self.updatepos(i, j)
 144             if i == n: break
 145             startswith = rawdata.startswith
 146             if startswith('<', i):
 147                 if starttagopen.match(rawdata, i): # < + letter
 148                     k = self.parse_starttag(i)
 149                 elif startswith("</", i):
 150                     k = self.parse_endtag(i)
 151                     if k >= 0:
 152                         self.clear_cdata_mode()
 153                 elif startswith("<!--", i):
 154                     k = self.parse_comment(i)
 155                 elif startswith("<?", i):
 156                     k = self.parse_pi(i)
 157                 elif startswith("<!", i):
 158                     k = self.parse_declaration(i)
 159                 elif (i + 1) < n:
 160                     self.handle_data("<")
 161                     k = i + 1
 162                 else:
 163                     break
 164                 if k < 0:
 165                     if end:
 166                         self.error("EOF in middle of construct")
 167                     break
 168                 i = self.updatepos(i, k)
 169             elif startswith("&#", i):
 170                 match = charref.match(rawdata, i)
 171                 if match:
 172                     name = match.group()[2:-1]
 173                     self.handle_charref(name)
 174                     k = match.end()
 175                     if not startswith(';', k-1):
 176                         k = k - 1
 177                     i = self.updatepos(i, k)
 178                     continue
 179                 else:
 180                     break
 181             elif startswith('&', i):
 182                 match = entityref.match(rawdata, i)
 183                 if match:
 184                     name = match.group(1)
 185                     self.handle_entityref(name)
 186                     k = match.end()
 187                     if not startswith(';', k-1):
 188                         k = k - 1
 189                     i = self.updatepos(i, k)
 190                     continue
 191                 match = incomplete.match(rawdata, i)
 192                 if match:
 193                     # match.group() will contain at least 2 chars
 194                     if end and match.group() == rawdata[i:]:
 195                         self.error("EOF in middle of entity or char ref")
 196                     # incomplete
 197                     break
 198                 elif (i + 1) < n:
 199                     # not the end of the buffer, and can't be confused
 200                     # with some other construct
 201                     self.handle_data("&")
 202                     i = self.updatepos(i, i + 1)
 203                 else:
 204                     break
 205             else:
 206                 assert 0, "interesting.search() lied"
 207         # end while
 208         if end and i < n:
 209             self.handle_data(rawdata[i:n])
 210             i = self.updatepos(i, n)
 211         self.rawdata = rawdata[i:]
 212
 213     # Internal -- parse comment, return end or -1 if not terminated
 214     def parse_comment(self, i, report=1):
 215         rawdata = self.rawdata
 216         assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
 217         match = commentclose.search(rawdata, i+4)
 218         if not match:
 219             return -1
 220         if report:
 221             j = match.start()
 222             self.handle_comment(rawdata[i+4: j])
 223         j = match.end()
 224         return j
 225
 226     # Internal -- parse processing instr, return end or -1 if not terminated
 227     def parse_pi(self, i):
 228         rawdata = self.rawdata
 229         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
 230         match = piclose.search(rawdata, i+2) # >
 231         if not match:
 232             return -1
 233         j = match.start()
 234         self.handle_pi(rawdata[i+2: j])
 235         j = match.end()
 236         return j
 237
 238     # Internal -- handle starttag, return end or -1 if not terminated
 239     def parse_starttag(self, i):
 240         self.__starttag_text = None
 241         endpos = self.check_for_whole_start_tag(i)
 242         if endpos < 0:
 243             return endpos
 244         rawdata = self.rawdata
 245         self.__starttag_text = rawdata[i:endpos]
 246
 247         # Now parse the data between i+1 and j into a tag and attrs
 248         attrs = []
 249         match = tagfind.match(rawdata, i+1)
 250         assert match, 'unexpected call to parse_starttag()'
 251         k = match.end()
 252         self.lasttag = tag = rawdata[i+1:k].lower()
 253
 254         while k < endpos:
 255             m = attrfind.match(rawdata, k)
 256             if not m:
 257                 break
 258             attrname, rest, attrvalue = m.group(1, 2, 3)
 259             if not rest:
 260                 attrvalue = None
 261             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 262                  attrvalue[:1] == '"' == attrvalue[-1:]:
 263                 attrvalue = attrvalue[1:-1]
 264                 attrvalue = self.unescape(attrvalue)
 265             attrs.append((attrname.lower(), attrvalue))
 266             k = m.end()
 267
 268         end = rawdata[k:endpos].strip()
 269         if end not in (">", "/>"):
 270             lineno, offset = self.getpos()
 271             if "\n" in self.__starttag_text:
 272                 lineno = lineno + self.__starttag_text.count("\n")
 273                 offset = len(self.__starttag_text) \
 274                          - self.__starttag_text.rfind("\n")
 275             else:
 276                 offset = offset + len(self.__starttag_text)
 277             self.error("junk characters in start tag: %s"
 278                        % `rawdata[k:endpos][:20]`)
 279         if end.endswith('/>'):
 280             # XHTML-style empty tag: <span attr="value" />
 281             self.handle_startendtag(tag, attrs)
 282         else:
 283             self.handle_starttag(tag, attrs)
 284             if tag in self.CDATA_CONTENT_ELEMENTS:
 285                 self.set_cdata_mode()
 286         return endpos
 287
 288     # Internal -- check to see if we have a complete starttag; return end
 289     # or -1 if incomplete.
 290     def check_for_whole_start_tag(self, i):
 291         rawdata = self.rawdata
 292         m = locatestarttagend.match(rawdata, i)
 293         if m:
 294             j = m.end()
 295             next = rawdata[j:j+1]
 296             if next == ">":
 297                 return j + 1
 298             if next == "/":
 299                 if rawdata.startswith("/>", j):
 300                     return j + 2
 301                 if rawdata.startswith("/", j):
 302                     # buffer boundary
 303                     return -1
 304                 # else bogus input
 305                 self.updatepos(i, j + 1)
 306                 self.error("malformed empty start tag")
 307             if next == "":
 308                 # end of input
 309                 return -1
 310             if next in ("abcdefghijklmnopqrstuvwxyz=/"
 311                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
 312                 # end of input in or before attribute value, or we have the
 313                 # '/' from a '/>' ending
 314                 return -1
 315             self.updatepos(i, j)
 316             self.error("malformed start tag")
 317         raise AssertionError("we should not get here!")
 318
 319     # Internal -- parse endtag, return end or -1 if incomplete
 320     def parse_endtag(self, i):
 321         rawdata = self.rawdata
 322         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
 323         match = endendtag.search(rawdata, i+1) # >
 324         if not match:
 325             return -1
 326         j = match.end()
 327         match = endtagfind.match(rawdata, i) # </ + tag + >
 328         if not match:
 329             self.error("bad end tag: %s" % `rawdata[i:j]`)
 330         tag = match.group(1)
 331         self.handle_endtag(tag.lower())
 332         return j
 333
 334     # Overridable -- finish processing of start+end tag: <tag.../>
 335     def handle_startendtag(self, tag, attrs):
 336         self.handle_starttag(tag, attrs)
 337         self.handle_endtag(tag)
 338
 339     # Overridable -- handle start tag
 340     def handle_starttag(self, tag, attrs):
 341         pass
 342
 343     # Overridable -- handle end tag
 344     def handle_endtag(self, tag):
 345         pass
 346
 347     # Overridable -- handle character reference
 348     def handle_charref(self, name):
 349         pass
 350
 351     # Overridable -- handle entity reference
 352     def handle_entityref(self, name):
 353         pass
 354
 355     # Overridable -- handle data
 356     def handle_data(self, data):
 357         pass
 358
 359     # Overridable -- handle comment
 360     def handle_comment(self, data):
 361         pass
 362
 363     # Overridable -- handle declaration
 364     def handle_decl(self, decl):
 365         pass
 366
 367     # Overridable -- handle processing instruction
 368     def handle_pi(self, data):
 369         pass
 370
 371     def unknown_decl(self, data):
 372         self.error("unknown declaration: " + `data`)
 373
 374     # Internal -- helper to remove special character quoting
 375     def unescape(self, s):
 376         if '&' not in s:
 377             return s
 378         s = s.replace("&lt;", "<")
 379         s = s.replace("&gt;", ">")
 380         s = s.replace("&apos;", "'")
 381         s = s.replace("&quot;", '"')
 382         s = s.replace("&amp;", "&") # Must be last
 383         return s