Lib/HTMLParser.py

   1 """A parser for HTML and XHTML."""
   2
   3 # This file is based on sgmllib.py, but the API is slightly different.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import markupbase
  12 import re
  13 import string
  14
  15 # Regular expressions used for parsing
  16
  17 interesting_normal = re.compile('[&<]')
  18 interesting_cdata = re.compile(r'<(/|\Z)')
  19 incomplete = re.compile('&[a-zA-Z#]')
  20
  21 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  22 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  23
  24 starttagopen = re.compile('<[a-zA-Z]')
  25 piclose = re.compile('>')
  26 endtagopen = re.compile('</')
  27 commentclose = re.compile(r'--\s*>')
  28 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  29 attrfind = re.compile(
  30     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  31     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
  32
  33 locatestarttagend = re.compile(r"""
  34   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  35   (?:\s+                             # whitespace before attribute name
  36     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
  37       (?:\s*=\s*                     # value indicator
  38         (?:'[^']*'                   # LITA-enclosed value
  39           |\"[^\"]*\"                # LIT-enclosed value
  40           |[^'\">\s]+                # bare value
  41          )
  42        )?
  43      )
  44    )*
  45   \s*                                # trailing whitespace
  46 """, re.VERBOSE)
  47 endendtag = re.compile('>')
  48 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  49
  50
  51 class HTMLParseError(Exception):
  52     """Exception raised for all parse errors."""
  53
  54     def __init__(self, msg, position=(None, None)):
  55         assert msg
  56         self.msg = msg
  57         self.lineno = position[0]
  58         self.offset = position[1]
  59
  60     def __str__(self):
  61         result = self.msg
  62         if self.lineno is not None:
  63             result = result + ", at line %d" % self.lineno
  64         if self.offset is not None:
  65             result = result + ", column %d" % (self.offset + 1)
  66         return result
  67
  68
  69 class HTMLParser(markupbase.ParserBase):
  70     """Find tags and other markup and call handler functions.
  71
  72     Usage:
  73         p = HTMLParser()
  74         p.feed(data)
  75         ...
  76         p.close()
  77
  78     Start tags are handled by calling self.handle_starttag() or
  79     self.handle_startendtag(); end tags by self.handle_endtag().  The
  80     data between tags is passed from the parser to the derived class
  81     by calling self.handle_data() with the data as argument (the data
  82     may be split up in arbitrary chunks).  Entity references are
  83     passed by calling self.handle_entityref() with the entity
  84     reference as the argument.  Numeric character references are
  85     passed to self.handle_charref() with the string containing the
  86     reference as the argument.
  87     """
  88
  89     CDATA_CONTENT_ELEMENTS = ("script", "style")
  90
  91
  92     def __init__(self):
  93         """Initialize and reset this instance."""
  94         self.reset()
  95
  96     def reset(self):
  97         """Reset this instance.  Loses all unprocessed data."""
  98         self.rawdata = ''
  99         self.stack = []
 100         self.lasttag = '???'
 101         self.interesting = interesting_normal
 102         markupbase.ParserBase.reset(self)
 103
 104     def feed(self, data):
 105         """Feed data to the parser.
 106
 107         Call this as often as you want, with as little or as much text
 108         as you want (may include '\n').
 109         """
 110         self.rawdata = self.rawdata + data
 111         self.goahead(0)
 112
 113     def close(self):
 114         """Handle any buffered data."""
 115         self.goahead(1)
 116
 117     def error(self, message):
 118         raise HTMLParseError(message, self.getpos())
 119
 120     __starttag_text = None
 121
 122     def get_starttag_text(self):
 123         """Return full source of start tag: '<...>'."""
 124         return self.__starttag_text
 125
 126     def set_cdata_mode(self):
 127         self.interesting = interesting_cdata
 128
 129     def clear_cdata_mode(self):
 130         self.interesting = interesting_normal
 131
 132     # Internal -- handle data as far as reasonable.  May leave state
 133     # and data to be processed by a subsequent call.  If 'end' is
 134     # true, force handling all data as if followed by EOF marker.
 135     def goahead(self, end):
 136         rawdata = self.rawdata
 137         i = 0
 138         n = len(rawdata)
 139         while i < n:
 140             match = self.interesting.search(rawdata, i) # < or &
 141             if match:
 142                 j = match.start()
 143             else:
 144                 j = n
 145             if i < j: self.handle_data(rawdata[i:j])
 146             i = self.updatepos(i, j)
 147             if i == n: break
 148             if rawdata[i] == '<':
 149                 if starttagopen.match(rawdata, i): # < + letter
 150                     k = self.parse_starttag(i)
 151                 elif endtagopen.match(rawdata, i): # </
 152                     k = self.parse_endtag(i)
 153                     if k >= 0:
 154                         self.clear_cdata_mode()
 155                 elif rawdata.startswith("<!--", i): # <!--
 156                     k = self.parse_comment(i)
 157                 elif rawdata.startswith("<?", i): # <?
 158                     k = self.parse_pi(i)
 159                 elif rawdata.startswith("<!", i): # <!
 160                     k = self.parse_declaration(i)
 161                 elif (i + 1) < n:
 162                     self.handle_data("<")
 163                     k = i + 1
 164                 else:
 165                     break
 166                 if k < 0:
 167                     if end:
 168                         self.error("EOF in middle of construct")
 169                     break
 170                 i = self.updatepos(i, k)
 171             elif rawdata[i:i+2] == "&#":
 172                 match = charref.match(rawdata, i)
 173                 if match:
 174                     name = match.group()[2:-1]
 175                     self.handle_charref(name)
 176                     k = match.end()
 177                     if rawdata[k-1] != ';':
 178                         k = k - 1
 179                     i = self.updatepos(i, k)
 180                     continue
 181                 else:
 182                     break
 183             elif rawdata[i] == '&':
 184                 match = entityref.match(rawdata, i)
 185                 if match:
 186                     name = match.group(1)
 187                     self.handle_entityref(name)
 188                     k = match.end()
 189                     if rawdata[k-1] != ';':
 190                         k = k - 1
 191                     i = self.updatepos(i, k)
 192                     continue
 193                 match = incomplete.match(rawdata, i)
 194                 if match:
 195                     # match.group() will contain at least 2 chars
 196                     rest = rawdata[i:]
 197                     if end and match.group() == rest:
 198                         self.error("EOF in middle of entity or char ref")
 199                     # incomplete
 200                     break
 201                 elif (i + 1) < n:
 202                     # not the end of the buffer, and can't be confused
 203                     # with some other construct
 204                     self.handle_data("&")
 205                     i = self.updatepos(i, i + 1)
 206                 else:
 207                     break
 208             else:
 209                 assert 0, "interesting.search() lied"
 210         # end while
 211         if end and i < n:
 212             self.handle_data(rawdata[i:n])
 213             i = self.updatepos(i, n)
 214         self.rawdata = rawdata[i:]
 215
 216     # Internal -- parse comment, return end or -1 if not terminated
 217     def parse_comment(self, i, report=1):
 218         rawdata = self.rawdata
 219         assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
 220         match = commentclose.search(rawdata, i+4)
 221         if not match:
 222             return -1
 223         if report:
 224             j = match.start()
 225             self.handle_comment(rawdata[i+4: j])
 226         j = match.end()
 227         return j
 228
 229     # Internal -- parse processing instr, return end or -1 if not terminated
 230     def parse_pi(self, i):
 231         rawdata = self.rawdata
 232         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
 233         match = piclose.search(rawdata, i+2) # >
 234         if not match:
 235             return -1
 236         j = match.start()
 237         self.handle_pi(rawdata[i+2: j])
 238         j = match.end()
 239         return j
 240
 241     # Internal -- handle starttag, return end or -1 if not terminated
 242     def parse_starttag(self, i):
 243         self.__starttag_text = None
 244         endpos = self.check_for_whole_start_tag(i)
 245         if endpos < 0:
 246             return endpos
 247         rawdata = self.rawdata
 248         self.__starttag_text = rawdata[i:endpos]
 249
 250         # Now parse the data between i+1 and j into a tag and attrs
 251         attrs = []
 252         match = tagfind.match(rawdata, i+1)
 253         assert match, 'unexpected call to parse_starttag()'
 254         k = match.end()
 255         self.lasttag = tag = string.lower(rawdata[i+1:k])
 256
 257         while k < endpos:
 258             m = attrfind.match(rawdata, k)
 259             if not m:
 260                 break
 261             attrname, rest, attrvalue = m.group(1, 2, 3)
 262             if not rest:
 263                 attrvalue = None
 264             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 265                  attrvalue[:1] == '"' == attrvalue[-1:]:
 266                 attrvalue = attrvalue[1:-1]
 267                 attrvalue = self.unescape(attrvalue)
 268             attrs.append((string.lower(attrname), attrvalue))
 269             k = m.end()
 270
 271         end = string.strip(rawdata[k:endpos])
 272         if end not in (">", "/>"):
 273             lineno, offset = self.getpos()
 274             if "\n" in self.__starttag_text:
 275                 lineno = lineno + string.count(self.__starttag_text, "\n")
 276                 offset = len(self.__starttag_text) \
 277                          - string.rfind(self.__starttag_text, "\n")
 278             else:
 279                 offset = offset + len(self.__starttag_text)
 280             self.error("junk characters in start tag: %s"
 281                        % `rawdata[k:endpos][:20]`)
 282         if end[-2:] == '/>':
 283             # XHTML-style empty tag: <span attr="value" />
 284             self.handle_startendtag(tag, attrs)
 285         else:
 286             self.handle_starttag(tag, attrs)
 287             if tag in self.CDATA_CONTENT_ELEMENTS:
 288                 self.set_cdata_mode()
 289         return endpos
 290
 291     # Internal -- check to see if we have a complete starttag; return end
 292     # or -1 if incomplete.
 293     def check_for_whole_start_tag(self, i):
 294         rawdata = self.rawdata
 295         m = locatestarttagend.match(rawdata, i)
 296         if m:
 297             j = m.end()
 298             next = rawdata[j:j+1]
 299             if next == ">":
 300                 return j + 1
 301             if next == "/":
 302                 s = rawdata[j:j+2]
 303                 if s == "/>":
 304                     return j + 2
 305                 if s == "/":
 306                     # buffer boundary
 307                     return -1
 308                 # else bogus input
 309                 self.updatepos(i, j + 1)
 310                 self.error("malformed empty start tag")
 311             if next == "":
 312                 # end of input
 313                 return -1
 314             if next in ("abcdefghijklmnopqrstuvwxyz=/"
 315                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
 316                 # end of input in or before attribute value, or we have the
 317                 # '/' from a '/>' ending
 318                 return -1
 319             self.updatepos(i, j)
 320             self.error("malformed start tag")
 321         raise AssertionError("we should not get here!")
 322
 323     # Internal -- parse endtag, return end or -1 if incomplete
 324     def parse_endtag(self, i):
 325         rawdata = self.rawdata
 326         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
 327         match = endendtag.search(rawdata, i+1) # >
 328         if not match:
 329             return -1
 330         j = match.end()
 331         match = endtagfind.match(rawdata, i) # </ + tag + >
 332         if not match:
 333             self.error("bad end tag: %s" % `rawdata[i:j]`)
 334         tag = match.group(1)
 335         self.handle_endtag(string.lower(tag))
 336         return j
 337
 338     # Overridable -- finish processing of start+end tag: <tag.../>
 339     def handle_startendtag(self, tag, attrs):
 340         self.handle_starttag(tag, attrs)
 341         self.handle_endtag(tag)
 342
 343     # Overridable -- handle start tag
 344     def handle_starttag(self, tag, attrs):
 345         pass
 346
 347     # Overridable -- handle end tag
 348     def handle_endtag(self, tag):
 349         pass
 350
 351     # Overridable -- handle character reference
 352     def handle_charref(self, name):
 353         pass
 354
 355     # Overridable -- handle entity reference
 356     def handle_entityref(self, name):
 357         pass
 358
 359     # Overridable -- handle data
 360     def handle_data(self, data):
 361         pass
 362
 363     # Overridable -- handle comment
 364     def handle_comment(self, data):
 365         pass
 366
 367     # Overridable -- handle declaration
 368     def handle_decl(self, decl):
 369         pass
 370
 371     # Overridable -- handle processing instruction
 372     def handle_pi(self, data):
 373         pass
 374
 375     def unknown_decl(self, data):
 376         self.error("unknown declaration: " + `data`)
 377
 378     # Internal -- helper to remove special character quoting
 379     def unescape(self, s):
 380         if '&' not in s:
 381             return s
 382         s = string.replace(s, "&lt;", "<")
 383         s = string.replace(s, "&gt;", ">")
 384         s = string.replace(s, "&apos;", "'")
 385         s = string.replace(s, "&quot;", '"')
 386         s = string.replace(s, "&amp;", "&") # Must be last
 387         return s