Lib/HTMLParser.py

   1 """A parser for HTML and XHTML."""
   2
   3 # This file is based on sgmllib.py, but the API is slightly different.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import markupbase
  12 import re
  13
  14 # Regular expressions used for parsing
  15
  16 interesting_normal = re.compile('[&<]')
  17 interesting_cdata = re.compile(r'<(/|\Z)')
  18 incomplete = re.compile('&[a-zA-Z#]')
  19
  20 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  21 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  22
  23 starttagopen = re.compile('<[a-zA-Z]')
  24 piclose = re.compile('>')
  25 commentclose = re.compile(r'--\s*>')
  26 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  27 attrfind = re.compile(
  28     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  29     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
  30
  31 locatestarttagend = re.compile(r"""
  32   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  33   (?:\s+                             # whitespace before attribute name
  34     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
  35       (?:\s*=\s*                     # value indicator
  36         (?:'[^']*'                   # LITA-enclosed value
  37           |\"[^\"]*\"                # LIT-enclosed value
  38           |[^'\">\s]+                # bare value
  39          )
  40        )?
  41      )
  42    )*
  43   \s*                                # trailing whitespace
  44 """, re.VERBOSE)
  45 endendtag = re.compile('>')
  46 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  47
  48
  49 class HTMLParseError(Exception):
  50     """Exception raised for all parse errors."""
  51
  52     def __init__(self, msg, position=(None, None)):
  53         assert msg
  54         self.msg = msg
  55         self.lineno = position[0]
  56         self.offset = position[1]
  57
  58     def __str__(self):
  59         result = self.msg
  60         if self.lineno is not None:
  61             result = result + ", at line %d" % self.lineno
  62         if self.offset is not None:
  63             result = result + ", column %d" % (self.offset + 1)
  64         return result
  65
  66
  67 class HTMLParser(markupbase.ParserBase):
  68     """Find tags and other markup and call handler functions.
  69
  70     Usage:
  71         p = HTMLParser()
  72         p.feed(data)
  73         ...
  74         p.close()
  75
  76     Start tags are handled by calling self.handle_starttag() or
  77     self.handle_startendtag(); end tags by self.handle_endtag().  The
  78     data between tags is passed from the parser to the derived class
  79     by calling self.handle_data() with the data as argument (the data
  80     may be split up in arbitrary chunks).  Entity references are
  81     passed by calling self.handle_entityref() with the entity
  82     reference as the argument.  Numeric character references are
  83     passed to self.handle_charref() with the string containing the
  84     reference as the argument.
  85     """
  86
  87     CDATA_CONTENT_ELEMENTS = ("script", "style")
  88
  89
    def __init__(self):
        """Initialize and reset this instance.

        All state is set up by reset(), so a subclass that overrides reset()
        has its version called from here.
        """
        self.reset()
  93
    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        # Discard any input that was buffered but not yet parsed.
        self.rawdata = ''
        # Placeholder value; parse_starttag() overwrites it with the name of
        # each start tag it handles.
        self.lasttag = '???'
        # Scan for '&' and '<' (normal mode) rather than CDATA mode.
        self.interesting = interesting_normal
        # Let the base class reset whatever state it keeps.
        markupbase.ParserBase.reset(self)
 100
 101     def feed(self, data):
 102         """Feed data to the parser.
 103
 104         Call this as often as you want, with as little or as much text
 105         as you want (may include '\n').
 106         """
 107         self.rawdata = self.rawdata + data
 108         self.goahead(0)
 109
    def close(self):
        """Handle any buffered data.

        Runs goahead() with a true `end` flag, so whatever is still buffered
        is processed as if it were followed by end of input.
        """
        self.goahead(1)
 113
    def error(self, message):
        """Raise HTMLParseError for `message` at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())
 116
 117     __starttag_text = None
 118
    def get_starttag_text(self):
        """Return full source of start tag: '<...>'.

        The value is None until a start tag has been parsed; parse_starttag()
        clears it on entry and stores the tag text once the tag is complete.
        """
        return self.__starttag_text
 122
    def set_cdata_mode(self):
        """Make goahead() scan with `interesting_cdata` ('</' or '<' at end)."""
        self.interesting = interesting_cdata
 125
    def clear_cdata_mode(self):
        """Make goahead() scan with `interesting_normal` ('&' or '<') again."""
        self.interesting = interesting_normal
 128
 129     # Internal -- handle data as far as reasonable.  May leave state
 130     # and data to be processed by a subsequent call.  If 'end' is
 131     # true, force handling all data as if followed by EOF marker.
 132     def goahead(self, end):
 133         rawdata = self.rawdata
 134         i = 0
 135         n = len(rawdata)
 136         while i < n:
 137             match = self.interesting.search(rawdata, i) # < or &
 138             if match:
 139                 j = match.start()
 140             else:
 141                 j = n
 142             if i < j: self.handle_data(rawdata[i:j])
 143             i = self.updatepos(i, j)
 144             if i == n: break
 145             startswith = rawdata.startswith
 146             if startswith('<', i):
 147                 if starttagopen.match(rawdata, i): # < + letter
 148                     k = self.parse_starttag(i)
 149                 elif startswith("</", i):
 150                     k = self.parse_endtag(i)
 151                 elif startswith("<!--", i):
 152                     k = self.parse_comment(i)
 153                 elif startswith("<?", i):
 154                     k = self.parse_pi(i)
 155                 elif startswith("<!", i):
 156                     k = self.parse_declaration(i)
 157                 elif (i + 1) < n:
 158                     self.handle_data("<")
 159                     k = i + 1
 160                 else:
 161                     break
 162                 if k < 0:
 163                     if end:
 164                         self.error("EOF in middle of construct")
 165                     break
 166                 i = self.updatepos(i, k)
 167             elif startswith("&#", i):
 168                 match = charref.match(rawdata, i)
 169                 if match:
 170                     name = match.group()[2:-1]
 171                     self.handle_charref(name)
 172                     k = match.end()
 173                     if not startswith(';', k-1):
 174                         k = k - 1
 175                     i = self.updatepos(i, k)
 176                     continue
 177                 else:
 178                     if ";" in rawdata[i:]: #bail by consuming &#
 179                         self.handle_data(rawdata[0:2])
 180                         i = self.updatepos(i, 2)
 181                     break
 182             elif startswith('&', i):
 183                 match = entityref.match(rawdata, i)
 184                 if match:
 185                     name = match.group(1)
 186                     self.handle_entityref(name)
 187                     k = match.end()
 188                     if not startswith(';', k-1):
 189                         k = k - 1
 190                     i = self.updatepos(i, k)
 191                     continue
 192                 match = incomplete.match(rawdata, i)
 193                 if match:
 194                     # match.group() will contain at least 2 chars
 195                     if end and match.group() == rawdata[i:]:
 196                         self.error("EOF in middle of entity or char ref")
 197                     # incomplete
 198                     break
 199                 elif (i + 1) < n:
 200                     # not the end of the buffer, and can't be confused
 201                     # with some other construct
 202                     self.handle_data("&")
 203                     i = self.updatepos(i, i + 1)
 204                 else:
 205                     break
 206             else:
 207                 assert 0, "interesting.search() lied"
 208         # end while
 209         if end and i < n:
 210             self.handle_data(rawdata[i:n])
 211             i = self.updatepos(i, n)
 212         self.rawdata = rawdata[i:]
 213
 214     # Internal -- parse processing instr, return end or -1 if not terminated
 215     def parse_pi(self, i):
 216         rawdata = self.rawdata
 217         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
 218         match = piclose.search(rawdata, i+2) # >
 219         if not match:
 220             return -1
 221         j = match.start()
 222         self.handle_pi(rawdata[i+2: j])
 223         j = match.end()
 224         return j
 225
 226     # Internal -- handle starttag, return end or -1 if not terminated
 227     def parse_starttag(self, i):
 228         self.__starttag_text = None
 229         endpos = self.check_for_whole_start_tag(i)
 230         if endpos < 0:
 231             return endpos
 232         rawdata = self.rawdata
 233         self.__starttag_text = rawdata[i:endpos]
 234
 235         # Now parse the data between i+1 and j into a tag and attrs
 236         attrs = []
 237         match = tagfind.match(rawdata, i+1)
 238         assert match, 'unexpected call to parse_starttag()'
 239         k = match.end()
 240         self.lasttag = tag = rawdata[i+1:k].lower()
 241
 242         while k < endpos:
 243             m = attrfind.match(rawdata, k)
 244             if not m:
 245                 break
 246             attrname, rest, attrvalue = m.group(1, 2, 3)
 247             if not rest:
 248                 attrvalue = None
 249             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 250                  attrvalue[:1] == '"' == attrvalue[-1:]:
 251                 attrvalue = attrvalue[1:-1]
 252                 attrvalue = self.unescape(attrvalue)
 253             attrs.append((attrname.lower(), attrvalue))
 254             k = m.end()
 255
 256         end = rawdata[k:endpos].strip()
 257         if end not in (">", "/>"):
 258             lineno, offset = self.getpos()
 259             if "\n" in self.__starttag_text:
 260                 lineno = lineno + self.__starttag_text.count("\n")
 261                 offset = len(self.__starttag_text) \
 262                          - self.__starttag_text.rfind("\n")
 263             else:
 264                 offset = offset + len(self.__starttag_text)
 265             self.error("junk characters in start tag: %r"
 266                        % (rawdata[k:endpos][:20],))
 267         if end.endswith('/>'):
 268             # XHTML-style empty tag: <span attr="value" />
 269             self.handle_startendtag(tag, attrs)
 270         else:
 271             self.handle_starttag(tag, attrs)
 272             if tag in self.CDATA_CONTENT_ELEMENTS:
 273                 self.set_cdata_mode()
 274         return endpos
 275
 276     # Internal -- check to see if we have a complete starttag; return end
 277     # or -1 if incomplete.
 278     def check_for_whole_start_tag(self, i):
 279         rawdata = self.rawdata
 280         m = locatestarttagend.match(rawdata, i)
 281         if m:
 282             j = m.end()
 283             next = rawdata[j:j+1]
 284             if next == ">":
 285                 return j + 1
 286             if next == "/":
 287                 if rawdata.startswith("/>", j):
 288                     return j + 2
 289                 if rawdata.startswith("/", j):
 290                     # buffer boundary
 291                     return -1
 292                 # else bogus input
 293                 self.updatepos(i, j + 1)
 294                 self.error("malformed empty start tag")
 295             if next == "":
 296                 # end of input
 297                 return -1
 298             if next in ("abcdefghijklmnopqrstuvwxyz=/"
 299                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
 300                 # end of input in or before attribute value, or we have the
 301                 # '/' from a '/>' ending
 302                 return -1
 303             self.updatepos(i, j)
 304             self.error("malformed start tag")
 305         raise AssertionError("we should not get here!")
 306
 307     # Internal -- parse endtag, return end or -1 if incomplete
 308     def parse_endtag(self, i):
 309         rawdata = self.rawdata
 310         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
 311         match = endendtag.search(rawdata, i+1) # >
 312         if not match:
 313             return -1
 314         j = match.end()
 315         match = endtagfind.match(rawdata, i) # </ + tag + >
 316         if not match:
 317             self.error("bad end tag: %r" % (rawdata[i:j],))
 318         tag = match.group(1)
 319         self.handle_endtag(tag.lower())
 320         self.clear_cdata_mode()
 321         return j
 322
 323     # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an empty-element tag such as '<tag ... />'.

        The default reports it as a start tag immediately followed by the
        matching end tag.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)
 327
 328     # Overridable -- handle start tag
 329     def handle_starttag(self, tag, attrs):
 330         pass
 331
 332     # Overridable -- handle end tag
 333     def handle_endtag(self, tag):
 334         pass
 335
 336     # Overridable -- handle character reference
 337     def handle_charref(self, name):
 338         pass
 339
 340     # Overridable -- handle entity reference
 341     def handle_entityref(self, name):
 342         pass
 343
 344     # Overridable -- handle data
 345     def handle_data(self, data):
 346         pass
 347
 348     # Overridable -- handle comment
 349     def handle_comment(self, data):
 350         pass
 351
 352     # Overridable -- handle declaration
 353     def handle_decl(self, decl):
 354         pass
 355
 356     # Overridable -- handle processing instruction
 357     def handle_pi(self, data):
 358         pass
 359
    def unknown_decl(self, data):
        """Report an unrecognised declaration through error().

        NOTE(review): presumably invoked by the declaration parser inherited
        from markupbase.ParserBase -- confirm against that module.
        """
        self.error("unknown declaration: %r" % (data,))
 362
 363     # Internal -- helper to remove special character quoting
 364     entitydefs = None
 365     def unescape(self, s):
 366         if '&' not in s:
 367             return s
 368         def replaceEntities(s):
 369             s = s.groups()[0]
 370             if s[0] == "#":
 371                 s = s[1:]
 372                 if s[0] in ['x','X']:
 373                     c = int(s[1:], 16)
 374                 else:
 375                     c = int(s)
 376                 return unichr(c)
 377             else:
 378                 # Cannot use name2codepoint directly, because HTMLParser supports apos,
 379                 # which is not part of HTML 4
 380                 import htmlentitydefs
 381                 if HTMLParser.entitydefs is None:
 382                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
 383                     for k, v in htmlentitydefs.name2codepoint.iteritems():
 384                         entitydefs[k] = unichr(v)
 385                 try:
 386                     return self.entitydefs[s]
 387                 except KeyError:
 388                     return '&'+s+';'
 389
 390         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)