mechanize/_beautifulsoup.py

   1 """Beautiful Soup
   2 Elixir and Tonic
   3 "The Screen-Scraper's Friend"
   4 v2.1.1
   5 http://www.crummy.com/software/BeautifulSoup/
   6
   7 Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
   8 into a tree representation. It provides methods and Pythonic idioms
   9 that make it easy to search and modify the tree.
  10
  11 A well-formed XML/HTML document will yield a well-formed data
  12 structure. An ill-formed XML/HTML document will yield a
  13 correspondingly ill-formed data structure. If your document is only
  14 locally well-formed, you can use this library to find and process the
  15 well-formed part of it. The BeautifulSoup class has heuristics for
  16 obtaining a sensible parse tree in the face of common HTML errors.
  17
  18 Beautiful Soup has no external dependencies. It works with Python 2.2
  19 and up.
  20
  21 Beautiful Soup defines classes for four different parsing strategies:
  22
  23  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
  24    language that kind of looks like XML.
  25
  26  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
  27    or invalid.
  28
  29  * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
  30    that trips up BeautifulSoup.
  31
  32  * BeautifulSOAP, for making it easier to parse XML documents that use
  33    lots of subelements containing a single string, where you'd prefer
  34    they put that string into an attribute (such as SOAP messages).
  35
  36 You can subclass BeautifulStoneSoup or BeautifulSoup to create a
  37 parsing strategy specific to an XML schema or a particular bizarre
  38 HTML document. Typically your subclass would just override
  39 SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
  40 """
  41 from __future__ import generators
  42
  43 __author__ = "Leonard Richardson (leonardr@segfault.org)"
  44 __version__ = "2.1.1"
  45 __date__ = "$Date: 2004/10/18 00:14:20 $"
  46 __copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
  47 __license__ = "PSF"
  48
  49 from sgmllib import SGMLParser, SGMLParseError
  50 import types
  51 import re
  52 import sgmllib
  53
  54 #This code makes Beautiful Soup able to parse XML with namespaces
  55 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
  56
  57 class NullType(object):
  58
  59     """Similar to NoneType with a corresponding singleton instance
  60     'Null' that, unlike None, accepts any message and returns itself.
  61
  62     Examples:
  63     >>> Null("send", "a", "message")("and one more",
  64     ...      "and what you get still") is Null
  65     True
  66     """
  67
  68     def __new__(cls):                    return Null
  69     def __call__(self, *args, **kwargs): return Null
  70 ##    def __getstate__(self, *args):       return Null
  71     def __getattr__(self, attr):         return Null
  72     def __getitem__(self, item):         return Null
  73     def __setattr__(self, attr, value):  pass
  74     def __setitem__(self, item, value):  pass
  75     def __len__(self):                   return 0
  76     # FIXME: is this a python bug? otherwise ``for x in Null: pass``
  77     #        never terminates...
  78     def __iter__(self):                  return iter([])
  79     def __contains__(self, item):        return False
  80     def __repr__(self):                  return "Null"
  81 Null = object.__new__(NullType)
  82
  83 class PageElement:
  84     """Contains the navigational information for some part of the page
  85     (either a tag or a piece of text)"""
  86
  87     def setup(self, parent=Null, previous=Null):
  88         """Sets up the initial relations between this element and
  89         other elements."""
  90         self.parent = parent
  91         self.previous = previous
  92         self.next = Null
  93         self.previousSibling = Null
  94         self.nextSibling = Null
  95         if self.parent and self.parent.contents:
  96             self.previousSibling = self.parent.contents[-1]
  97             self.previousSibling.nextSibling = self
  98
  99     def findNext(self, name=None, attrs={}, text=None):
 100         """Returns the first item that matches the given criteria and
 101         appears after this Tag in the document."""
 102         return self._first(self.fetchNext, name, attrs, text)
 103     firstNext = findNext
 104
 105     def fetchNext(self, name=None, attrs={}, text=None, limit=None):
 106         """Returns all items that match the given criteria and appear
 107         before after Tag in the document."""
 108         return self._fetch(name, attrs, text, limit, self.nextGenerator)
 109
 110     def findNextSibling(self, name=None, attrs={}, text=None):
 111         """Returns the closest sibling to this Tag that matches the
 112         given criteria and appears after this Tag in the document."""
 113         return self._first(self.fetchNextSiblings, name, attrs, text)
 114     firstNextSibling = findNextSibling
 115
 116     def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
 117         """Returns the siblings of this Tag that match the given
 118         criteria and appear after this Tag in the document."""
 119         return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)
 120
 121     def findPrevious(self, name=None, attrs={}, text=None):
 122         """Returns the first item that matches the given criteria and
 123         appears before this Tag in the document."""
 124         return self._first(self.fetchPrevious, name, attrs, text)
 125
 126     def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
 127         """Returns all items that match the given criteria and appear
 128         before this Tag in the document."""
 129         return self._fetch(name, attrs, text, limit, self.previousGenerator)
 130     firstPrevious = findPrevious
 131
 132     def findPreviousSibling(self, name=None, attrs={}, text=None):
 133         """Returns the closest sibling to this Tag that matches the
 134         given criteria and appears before this Tag in the document."""
 135         return self._first(self.fetchPreviousSiblings, name, attrs, text)
 136     firstPreviousSibling = findPreviousSibling
 137
 138     def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
 139                               limit=None):
 140         """Returns the siblings of this Tag that match the given
 141         criteria and appear before this Tag in the document."""
 142         return self._fetch(name, attrs, text, limit,
 143                            self.previousSiblingGenerator)
 144
 145     def findParent(self, name=None, attrs={}):
 146         """Returns the closest parent of this Tag that matches the given
 147         criteria."""
 148         r = Null
 149         l = self.fetchParents(name, attrs, 1)
 150         if l:
 151             r = l[0]
 152         return r
 153     firstParent = findParent
 154
 155     def fetchParents(self, name=None, attrs={}, limit=None):
 156         """Returns the parents of this Tag that match the given
 157         criteria."""
 158         return self._fetch(name, attrs, None, limit, self.parentGenerator)
 159
 160     #These methods do the real heavy lifting.
 161
 162     def _first(self, method, name, attrs, text):
 163         r = Null
 164         l = method(name, attrs, text, 1)
 165         if l:
 166             r = l[0]
 167         return r
 168
 169     def _fetch(self, name, attrs, text, limit, generator):
 170         "Iterates over a generator looking for things that match."
 171         if not hasattr(attrs, 'items'):
 172             attrs = {'class' : attrs}
 173
 174         results = []
 175         g = generator()
 176         while True:
 177             try:
 178                 i = g.next()
 179             except StopIteration:
 180                 break
 181             found = None
 182             if isinstance(i, Tag):
 183                 if not text:
 184                     if not name or self._matches(i, name):
 185                         match = True
 186                         for attr, matchAgainst in attrs.items():
 187                             check = i.get(attr)
 188                             if not self._matches(check, matchAgainst):
 189                                 match = False
 190                                 break
 191                         if match:
 192                             found = i
 193             elif text:
 194                 if self._matches(i, text):
 195                     found = i
 196             if found:
 197                 results.append(found)
 198                 if limit and len(results) >= limit:
 199                     break
 200         return results
 201
 202     #Generators that can be used to navigate starting from both
 203     #NavigableTexts and Tags.
 204     def nextGenerator(self):
 205         i = self
 206         while i:
 207             i = i.next
 208             yield i
 209
 210     def nextSiblingGenerator(self):
 211         i = self
 212         while i:
 213             i = i.nextSibling
 214             yield i
 215
 216     def previousGenerator(self):
 217         i = self
 218         while i:
 219             i = i.previous
 220             yield i
 221
 222     def previousSiblingGenerator(self):
 223         i = self
 224         while i:
 225             i = i.previousSibling
 226             yield i
 227
 228     def parentGenerator(self):
 229         i = self
 230         while i:
 231             i = i.parent
 232             yield i
 233
 234     def _matches(self, chunk, howToMatch):
 235         #print 'looking for %s in %s' % (howToMatch, chunk)
 236         #
 237         # If given a list of items, return true if the list contains a
 238         # text element that matches.
 239         if isList(chunk) and not isinstance(chunk, Tag):
 240             for tag in chunk:
 241                 if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
 242                     return True
 243             return False
 244         if callable(howToMatch):
 245             return howToMatch(chunk)
 246         if isinstance(chunk, Tag):
 247             #Custom match methods take the tag as an argument, but all other
 248             #ways of matching match the tag name as a string
 249             chunk = chunk.name
 250         #Now we know that chunk is a string
 251         if not isinstance(chunk, basestring):
 252             chunk = str(chunk)
 253         if hasattr(howToMatch, 'match'):
 254             # It's a regexp object.
 255             return howToMatch.search(chunk)
 256         if isList(howToMatch):
 257             return chunk in howToMatch
 258         if hasattr(howToMatch, 'items'):
 259             return howToMatch.has_key(chunk)
 260         #It's just a string
 261         return str(howToMatch) == chunk
 262
 263 class NavigableText(PageElement):
 264
 265     def __getattr__(self, attr):
 266         "For backwards compatibility, text.string gives you text"
 267         if attr == 'string':
 268             return self
 269         else:
 270             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 271
 272 class NavigableString(str, NavigableText):
 273     pass
 274
 275 class NavigableUnicodeString(unicode, NavigableText):
 276     pass
 277
 278 class Tag(PageElement):
 279
 280     """Represents a found HTML tag with its attributes and contents."""
 281
 282     def __init__(self, name, attrs=None, parent=Null, previous=Null):
 283         "Basic constructor."
 284         self.name = name
 285         if attrs == None:
 286             attrs = []
 287         self.attrs = attrs
 288         self.contents = []
 289         self.setup(parent, previous)
 290         self.hidden = False
 291
 292     def get(self, key, default=None):
 293         """Returns the value of the 'key' attribute for the tag, or
 294         the value given for 'default' if it doesn't have that
 295         attribute."""
 296         return self._getAttrMap().get(key, default)
 297
 298     def __getitem__(self, key):
 299         """tag[key] returns the value of the 'key' attribute for the tag,
 300         and throws an exception if it's not there."""
 301         return self._getAttrMap()[key]
 302
 303     def __iter__(self):
 304         "Iterating over a tag iterates over its contents."
 305         return iter(self.contents)
 306
 307     def __len__(self):
 308         "The length of a tag is the length of its list of contents."
 309         return len(self.contents)
 310
 311     def __contains__(self, x):
 312         return x in self.contents
 313
 314     def __nonzero__(self):
 315         "A tag is non-None even if it has no contents."
 316         return True
 317
 318     def __setitem__(self, key, value):
 319         """Setting tag[key] sets the value of the 'key' attribute for the
 320         tag."""
 321         self._getAttrMap()
 322         self.attrMap[key] = value
 323         found = False
 324         for i in range(0, len(self.attrs)):
 325             if self.attrs[i][0] == key:
 326                 self.attrs[i] = (key, value)
 327                 found = True
 328         if not found:
 329             self.attrs.append((key, value))
 330         self._getAttrMap()[key] = value
 331
 332     def __delitem__(self, key):
 333         "Deleting tag[key] deletes all 'key' attributes for the tag."
 334         for item in self.attrs:
 335             if item[0] == key:
 336                 self.attrs.remove(item)
 337                 #We don't break because bad HTML can define the same
 338                 #attribute multiple times.
 339             self._getAttrMap()
 340             if self.attrMap.has_key(key):
 341                 del self.attrMap[key]
 342
 343     def __call__(self, *args, **kwargs):
 344         """Calling a tag like a function is the same as calling its
 345         fetch() method. Eg. tag('a') returns a list of all the A tags
 346         found within this tag."""
 347         return apply(self.fetch, args, kwargs)
 348
 349     def __getattr__(self, tag):
 350         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
 351             return self.first(tag[:-3])
 352         elif tag.find('__') != 0:
 353             return self.first(tag)
 354
 355     def __eq__(self, other):
 356         """Returns true iff this tag has the same name, the same attributes,
 357         and the same contents (recursively) as the given tag.
 358
 359         NOTE: right now this will return false if two tags have the
 360         same attributes in a different order. Should this be fixed?"""
 361         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
 362             return False
 363         for i in range(0, len(self.contents)):
 364             if self.contents[i] != other.contents[i]:
 365                 return False
 366         return True
 367
 368     def __ne__(self, other):
 369         """Returns true iff this tag is not identical to the other tag,
 370         as defined in __eq__."""
 371         return not self == other
 372
 373     def __repr__(self):
 374         """Renders this tag as a string."""
 375         return str(self)
 376
 377     def __unicode__(self):
 378         return self.__str__(1)
 379
 380     def __str__(self, needUnicode=None, showStructureIndent=None):
 381         """Returns a string or Unicode representation of this tag and
 382         its contents.
 383
 384         NOTE: since Python's HTML parser consumes whitespace, this
 385         method is not certain to reproduce the whitespace present in
 386         the original string."""
 387
 388         attrs = []
 389         if self.attrs:
 390             for key, val in self.attrs:
 391                 attrs.append('%s="%s"' % (key, val))
 392         close = ''
 393         closeTag = ''
 394         if self.isSelfClosing():
 395             close = ' /'
 396         else:
 397             closeTag = '</%s>' % self.name
 398         indentIncrement = None
 399         if showStructureIndent != None:
 400             indentIncrement = showStructureIndent
 401             if not self.hidden:
 402                 indentIncrement += 1
 403         contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
 404         if showStructureIndent:
 405             space = '\n%s' % (' ' * showStructureIndent)
 406         if self.hidden:
 407             s = contents
 408         else:
 409             s = []
 410             attributeString = ''
 411             if attrs:
 412                 attributeString = ' ' + ' '.join(attrs)
 413             if showStructureIndent:
 414                 s.append(space)
 415             s.append('<%s%s%s>' % (self.name, attributeString, close))
 416             s.append(contents)
 417             if closeTag and showStructureIndent != None:
 418                 s.append(space)
 419             s.append(closeTag)
 420             s = ''.join(s)
 421         isUnicode = type(s) == types.UnicodeType
 422         if needUnicode and not isUnicode:
 423             s = unicode(s)
 424         elif isUnicode and needUnicode==False:
 425             s = str(s)
 426         return s
 427
 428     def prettify(self, needUnicode=None):
 429         return self.__str__(needUnicode, showStructureIndent=True)
 430
 431     def renderContents(self, showStructureIndent=None, needUnicode=None):
 432         """Renders the contents of this tag as a (possibly Unicode)
 433         string."""
 434         s=[]
 435         for c in self:
 436             text = None
 437             if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
 438                 text = unicode(c)
 439             elif isinstance(c, Tag):
 440                 s.append(c.__str__(needUnicode, showStructureIndent))
 441             elif needUnicode:
 442                 text = unicode(c)
 443             else:
 444                 text = str(c)
 445             if text:
 446                 if showStructureIndent != None:
 447                     if text[-1] == '\n':
 448                         text = text[:-1]
 449                 s.append(text)
 450         return ''.join(s)
 451
 452     #Soup methods
 453
 454     def firstText(self, text, recursive=True):
 455         """Convenience method to retrieve the first piece of text matching the
 456         given criteria. 'text' can be a string, a regular expression object,
 457         a callable that takes a string and returns whether or not the
 458         string 'matches', etc."""
 459         return self.first(recursive=recursive, text=text)
 460
 461     def fetchText(self, text, recursive=True, limit=None):
 462         """Convenience method to retrieve all pieces of text matching the
 463         given criteria. 'text' can be a string, a regular expression object,
 464         a callable that takes a string and returns whether or not the
 465         string 'matches', etc."""
 466         return self.fetch(recursive=recursive, text=text, limit=limit)
 467
 468     def first(self, name=None, attrs={}, recursive=True, text=None):
 469         """Return only the first child of this
 470         Tag matching the given criteria."""
 471         r = Null
 472         l = self.fetch(name, attrs, recursive, text, 1)
 473         if l:
 474             r = l[0]
 475         return r
 476     findChild = first
 477
 478     def fetch(self, name=None, attrs={}, recursive=True, text=None,
 479               limit=None):
 480         """Extracts a list of Tag objects that match the given
 481         criteria.  You can specify the name of the Tag and any
 482         attributes you want the Tag to have.
 483
 484         The value of a key-value pair in the 'attrs' map can be a
 485         string, a list of strings, a regular expression object, or a
 486         callable that takes a string and returns whether or not the
 487         string matches for some custom definition of 'matches'. The
 488         same is true of the tag name."""
 489         generator = self.recursiveChildGenerator
 490         if not recursive:
 491             generator = self.childGenerator
 492         return self._fetch(name, attrs, text, limit, generator)
 493     fetchChildren = fetch
 494
 495     #Utility methods
 496
 497     def isSelfClosing(self):
 498         """Returns true iff this is a self-closing tag as defined in the HTML
 499         standard.
 500
 501         TODO: This is specific to BeautifulSoup and its subclasses, but it's
 502         used by __str__"""
 503         return self.name in BeautifulSoup.SELF_CLOSING_TAGS
 504
 505     def append(self, tag):
 506         """Appends the given tag to the contents of this tag."""
 507         self.contents.append(tag)
 508
 509     #Private methods
 510
 511     def _getAttrMap(self):
 512         """Initializes a map representation of this tag's attributes,
 513         if not already initialized."""
 514         if not getattr(self, 'attrMap'):
 515             self.attrMap = {}
 516             for (key, value) in self.attrs:
 517                 self.attrMap[key] = value
 518         return self.attrMap
 519
 520     #Generator methods
 521     def childGenerator(self):
 522         for i in range(0, len(self.contents)):
 523             yield self.contents[i]
 524         raise StopIteration
 525
 526     def recursiveChildGenerator(self):
 527         stack = [(self, 0)]
 528         while stack:
 529             tag, start = stack.pop()
 530             if isinstance(tag, Tag):
 531                 for i in range(start, len(tag.contents)):
 532                     a = tag.contents[i]
 533                     yield a
 534                     if isinstance(a, Tag) and tag.contents:
 535                         if i < len(tag.contents) - 1:
 536                             stack.append((tag, i+1))
 537                         stack.append((a, 0))
 538                         break
 539         raise StopIteration
 540
 541
 542 def isList(l):
 543     """Convenience method that works with all 2.x versions of Python
 544     to determine whether or not something is listlike."""
 545     return hasattr(l, '__iter__') \
 546            or (type(l) in (types.ListType, types.TupleType))
 547
 548 def buildTagMap(default, *args):
 549     """Turns a list of maps, lists, or scalars into a single map.
 550     Used to build the SELF_CLOSING_TAGS and NESTABLE_TAGS maps out
 551     of lists and partial maps."""
 552     built = {}
 553     for portion in args:
 554         if hasattr(portion, 'items'):
 555             #It's a map. Merge it.
 556             for k,v in portion.items():
 557                 built[k] = v
 558         elif isList(portion):
 559             #It's a list. Map each item to the default.
 560             for k in portion:
 561                 built[k] = default
 562         else:
 563             #It's a scalar. Map it to the default.
 564             built[portion] = default
 565     return built
 566
 567 class BeautifulStoneSoup(Tag, SGMLParser):
 568
 569     """This class contains the basic parser and fetch code. It defines
 570     a parser that knows nothing about tag behavior except for the
 571     following:
 572
 573       You can't close a tag without closing all the tags it encloses.
 574       That is, "<foo><bar></foo>" actually means
 575       "<foo><bar></bar></foo>".
 576
 577     [Another possible explanation is "<foo><bar /></foo>", but since
 578     this class defines no SELF_CLOSING_TAGS, it will never use that
 579     explanation.]
 580
 581     This class is useful for parsing XML or made-up markup languages,
 582     or when BeautifulSoup makes an assumption counter to what you were
 583     expecting."""
 584
 585     SELF_CLOSING_TAGS = {}
 586     NESTABLE_TAGS = {}
 587     RESET_NESTING_TAGS = {}
 588     QUOTE_TAGS = {}
 589
 590     #As a public service we will by default silently replace MS smart quotes
 591     #and similar characters with their HTML or ASCII equivalents.
 592     MS_CHARS = { '\x80' : '&euro;',
 593                  '\x81' : ' ',
 594                  '\x82' : '&sbquo;',
 595                  '\x83' : '&fnof;',
 596                  '\x84' : '&bdquo;',
 597                  '\x85' : '&hellip;',
 598                  '\x86' : '&dagger;',
 599                  '\x87' : '&Dagger;',
 600                  '\x88' : '&caret;',
 601                  '\x89' : '%',
 602                  '\x8A' : '&Scaron;',
 603                  '\x8B' : '&lt;',
 604                  '\x8C' : '&OElig;',
 605                  '\x8D' : '?',
 606                  '\x8E' : 'Z',
 607                  '\x8F' : '?',
 608                  '\x90' : '?',
 609                  '\x91' : '&lsquo;',
 610                  '\x92' : '&rsquo;',
 611                  '\x93' : '&ldquo;',
 612                  '\x94' : '&rdquo;',
 613                  '\x95' : '&bull;',
 614                  '\x96' : '&ndash;',
 615                  '\x97' : '&mdash;',
 616                  '\x98' : '&tilde;',
 617                  '\x99' : '&trade;',
 618                  '\x9a' : '&scaron;',
 619                  '\x9b' : '&gt;',
 620                  '\x9c' : '&oelig;',
 621                  '\x9d' : '?',
 622                  '\x9e' : 'z',
 623                  '\x9f' : '&Yuml;',}
 624
 625     PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
 626                        lambda(x):x.group(1) + ' />'),
 627                       (re.compile('<!\s+([^<>]*)>'),
 628                        lambda(x):'<!' + x.group(1) + '>'),
 629                       (re.compile("([\x80-\x9f])"),
 630                        lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
 631                       ]
 632
 633     ROOT_TAG_NAME = '[document]'
 634
 635     def __init__(self, text=None, avoidParserProblems=True,
 636                  initialTextIsEverything=True):
 637         """Initialize this as the 'root tag' and feed in any text to
 638         the parser.
 639
 640         NOTE about avoidParserProblems: sgmllib will process most bad
 641         HTML, and BeautifulSoup has tricks for dealing with some HTML
 642         that kills sgmllib, but Beautiful Soup can nonetheless choke
 643         or lose data if your data uses self-closing tags or
 644         declarations incorrectly. By default, Beautiful Soup sanitizes
 645         its input to avoid the vast majority of these problems. The
 646         problems are relatively rare, even in bad HTML, so feel free
 647         to pass in False to avoidParserProblems if they don't apply to
 648         you, and you'll get better performance. The only reason I have
 649         this turned on by default is so I don't get so many tech
 650         support questions.
 651
 652         The two most common instances of invalid HTML that will choke
 653         sgmllib are fixed by the default parser massage techniques:
 654
 655          <br/> (No space between name of closing tag and tag close)
 656          <! --Comment--> (Extraneous whitespace in declaration)
 657
 658         You can pass in a custom list of (RE object, replace method)
 659         tuples to get Beautiful Soup to scrub your input the way you
 660         want."""
 661         Tag.__init__(self, self.ROOT_TAG_NAME)
 662         if avoidParserProblems \
 663            and not isList(avoidParserProblems):
 664             avoidParserProblems = self.PARSER_MASSAGE
 665         self.avoidParserProblems = avoidParserProblems
 666         SGMLParser.__init__(self)
 667         self.quoteStack = []
 668         self.hidden = 1
 669         self.reset()
 670         if hasattr(text, 'read'):
 671             #It's a file-type object.
 672             text = text.read()
 673         if text:
 674             self.feed(text)
 675         if initialTextIsEverything:
 676             self.done()
 677
 678     def __getattr__(self, methodName):
 679         """This method routes method call requests to either the SGMLParser
 680         superclass or the Tag superclass, depending on the method name."""
 681         if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
 682                or methodName.find('do_') == 0:
 683             return SGMLParser.__getattr__(self, methodName)
 684         elif methodName.find('__') != 0:
 685             return Tag.__getattr__(self, methodName)
 686         else:
 687             raise AttributeError
 688
 689     def feed(self, text):
 690         if self.avoidParserProblems:
 691             for fix, m in self.avoidParserProblems:
 692                 text = fix.sub(m, text)
 693         SGMLParser.feed(self, text)
 694
 695     def done(self):
 696         """Called when you're done parsing, so that the unclosed tags can be
 697         correctly processed."""
 698         self.endData() #NEW
 699         while self.currentTag.name != self.ROOT_TAG_NAME:
 700             self.popTag()
 701
 702     def reset(self):
 703         SGMLParser.reset(self)
 704         self.currentData = []
 705         self.currentTag = None
 706         self.tagStack = []
 707         self.pushTag(self)
 708
 709     def popTag(self):
 710         tag = self.tagStack.pop()
 711         # Tags with just one string-owning child get the child as a
 712         # 'string' property, so that soup.tag.string is shorthand for
 713         # soup.tag.contents[0]
 714         if len(self.currentTag.contents) == 1 and \
 715            isinstance(self.currentTag.contents[0], NavigableText):
 716             self.currentTag.string = self.currentTag.contents[0]
 717
 718         #print "Pop", tag.name
 719         if self.tagStack:
 720             self.currentTag = self.tagStack[-1]
 721         return self.currentTag
 722
 723     def pushTag(self, tag):
 724         #print "Push", tag.name
 725         if self.currentTag:
 726             self.currentTag.append(tag)
 727         self.tagStack.append(tag)
 728         self.currentTag = self.tagStack[-1]
 729
 730     def endData(self):
 731         currentData = ''.join(self.currentData)
 732         if currentData:
 733             if not currentData.strip():
 734                 if '\n' in currentData:
 735                     currentData = '\n'
 736                 else:
 737                     currentData = ' '
 738             c = NavigableString
 739             if type(currentData) == types.UnicodeType:
 740                 c = NavigableUnicodeString
 741             o = c(currentData)
 742             o.setup(self.currentTag, self.previous)
 743             if self.previous:
 744                 self.previous.next = o
 745             self.previous = o
 746             self.currentTag.contents.append(o)
 747         self.currentData = []
 748
 749     def _popToTag(self, name, inclusivePop=True):
 750         """Pops the tag stack up to and including the most recent
 751         instance of the given tag. If inclusivePop is false, pops the tag
 752         stack up to but *not* including the most recent instqance of
 753         the given tag."""
 754         if name == self.ROOT_TAG_NAME:
 755             return
 756
 757         numPops = 0
 758         mostRecentTag = None
 759         for i in range(len(self.tagStack)-1, 0, -1):
 760             if name == self.tagStack[i].name:
 761                 numPops = len(self.tagStack)-i
 762                 break
 763         if not inclusivePop:
 764             numPops = numPops - 1
 765
 766         for i in range(0, numPops):
 767             mostRecentTag = self.popTag()
 768         return mostRecentTag
 769
 770     def _smartPop(self, name):
 771
 772         """We need to pop up to the previous tag of this type, unless
 773         one of this tag's nesting reset triggers comes between this
 774         tag and the previous tag of this type, OR unless this tag is a
 775         generic nesting trigger and another generic nesting trigger
 776         comes between this tag and the previous tag of this type.
 777
 778         Examples:
 779          <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
 780          <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
 781          <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
 782          <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
 783
 784          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
 785          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
 786          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
 787         """
 788
 789         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
 790         isNestable = nestingResetTriggers != None
 791         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
 792         popTo = None
 793         inclusive = True
 794         for i in range(len(self.tagStack)-1, 0, -1):
 795             p = self.tagStack[i]
 796             if (not p or p.name == name) and not isNestable:
 797                 #Non-nestable tags get popped to the top or to their
 798                 #last occurance.
 799                 popTo = name
 800                 break
 801             if (nestingResetTriggers != None
 802                 and p.name in nestingResetTriggers) \
 803                 or (nestingResetTriggers == None and isResetNesting
 804                     and self.RESET_NESTING_TAGS.has_key(p.name)):
 805
 806                 #If we encounter one of the nesting reset triggers
 807                 #peculiar to this tag, or we encounter another tag
 808                 #that causes nesting to reset, pop up to but not
 809                 #including that tag.
 810
 811                 popTo = p.name
 812                 inclusive = False
 813                 break
 814             p = p.parent
 815         if popTo:
 816             self._popToTag(popTo, inclusive)
 817
 818     def unknown_starttag(self, name, attrs, selfClosing=0):
 819         #print "Start tag %s" % name
 820         if self.quoteStack:
 821             #This is not a real tag.
 822             #print "<%s> is not real!" % name
 823             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
 824             self.handle_data('<%s%s>' % (name, attrs))
 825             return
 826         self.endData()
 827         if not name in self.SELF_CLOSING_TAGS and not selfClosing:
 828             self._smartPop(name)
 829         tag = Tag(name, attrs, self.currentTag, self.previous)
 830         if self.previous:
 831             self.previous.next = tag
 832         self.previous = tag
 833         self.pushTag(tag)
 834         if selfClosing or name in self.SELF_CLOSING_TAGS:
 835             self.popTag()
 836         if name in self.QUOTE_TAGS:
 837             #print "Beginning quote (%s)" % name
 838             self.quoteStack.append(name)
 839             self.literal = 1
 840
 841     def unknown_endtag(self, name):
 842         if self.quoteStack and self.quoteStack[-1] != name:
 843             #This is not a real end tag.
 844             #print "</%s> is not real!" % name
 845             self.handle_data('</%s>' % name)
 846             return
 847         self.endData()
 848         self._popToTag(name)
 849         if self.quoteStack and self.quoteStack[-1] == name:
 850             self.quoteStack.pop()
 851             self.literal = (len(self.quoteStack) > 0)
 852
 853     def handle_data(self, data):
 854         self.currentData.append(data)
 855
 856     def handle_pi(self, text):
 857         "Propagate processing instructions right through."
 858         self.handle_data("<?%s>" % text)
 859
 860     def handle_comment(self, text):
 861         "Propagate comments right through."
 862         self.handle_data("<!--%s-->" % text)
 863
 864     def handle_charref(self, ref):
 865         "Propagate char refs right through."
 866         self.handle_data('&#%s;' % ref)
 867
 868     def handle_entityref(self, ref):
 869         "Propagate entity refs right through."
 870         self.handle_data('&%s;' % ref)
 871
 872     def handle_decl(self, data):
 873         "Propagate DOCTYPEs and the like right through."
 874         self.handle_data('<!%s>' % data)
 875
 876     def parse_declaration(self, i):
 877         """Treat a bogus SGML declaration as raw data. Treat a CDATA
 878         declaration as regular data."""
 879         j = None
 880         if self.rawdata[i:i+9] == '<![CDATA[':
 881              k = self.rawdata.find(']]>', i)
 882              if k == -1:
 883                  k = len(self.rawdata)
 884              self.handle_data(self.rawdata[i+9:k])
 885              j = k+3
 886         else:
 887             try:
 888                 j = SGMLParser.parse_declaration(self, i)
 889             except SGMLParseError:
 890                 toHandle = self.rawdata[i:]
 891                 self.handle_data(toHandle)
 892                 j = i + len(toHandle)
 893         return j
 894
 895 class BeautifulSoup(BeautifulStoneSoup):
 896
 897     """This parser knows the following facts about HTML:
 898
 899     * Some tags have no closing tag and should be interpreted as being
 900       closed as soon as they are encountered.
 901
 902     * The text inside some tags (ie. 'script') may contain tags which
 903       are not really part of the document and which should be parsed
 904       as text, not tags. If you want to parse the text as tags, you can
 905       always fetch it and parse it explicitly.
 906
 907     * Tag nesting rules:
 908
 909       Most tags can't be nested at all. For instance, the occurance of
 910       a <p> tag should implicitly close the previous <p> tag.
 911
 912        <p>Para1<p>Para2
 913         should be transformed into:
 914        <p>Para1</p><p>Para2
 915
 916       Some tags can be nested arbitrarily. For instance, the occurance
 917       of a <blockquote> tag should _not_ implicitly close the previous
 918       <blockquote> tag.
 919
 920        Alice said: <blockquote>Bob said: <blockquote>Blah
 921         should NOT be transformed into:
 922        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
 923
 924       Some tags can be nested, but the nesting is reset by the
 925       interposition of other tags. For instance, a <tr> tag should
 926       implicitly close the previous <tr> tag within the same <table>,
 927       but not close a <tr> tag in another table.
 928
 929        <table><tr>Blah<tr>Blah
 930         should be transformed into:
 931        <table><tr>Blah</tr><tr>Blah
 932         but,
 933        <tr>Blah<table><tr>Blah
 934         should NOT be transformed into
 935        <tr>Blah<table></tr><tr>Blah
 936
 937     Differing assumptions about tag nesting rules are a major source
 938     of problems with the BeautifulSoup class. If BeautifulSoup is not
 939     treating as nestable a tag your page author treats as nestable,
 940     try ICantBelieveItsBeautifulSoup before writing your own
 941     subclass."""
 942
 943     SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta',
 944                                            'spacer', 'link', 'frame', 'base'])
 945
 946     QUOTE_TAGS = {'script': None}
 947
 948     #According to the HTML standard, each of these inline tags can
 949     #contain another tag of the same type. Furthermore, it's common
 950     #to actually use these tags this way.
 951     NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
 952                             'center']
 953
 954     #According to the HTML standard, these block tags can contain
 955     #another tag of the same type. Furthermore, it's common
 956     #to actually use these tags this way.
 957     NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
 958
 959     #Lists can contain other lists, but there are restrictions.
 960     NESTABLE_LIST_TAGS = { 'ol' : [],
 961                            'ul' : [],
 962                            'li' : ['ul', 'ol'],
 963                            'dl' : [],
 964                            'dd' : ['dl'],
 965                            'dt' : ['dl'] }
 966
 967     #Tables can contain other tables, but there are restrictions.
 968     NESTABLE_TABLE_TAGS = {'table' : [],
 969                            'tr' : ['table', 'tbody', 'tfoot', 'thead'],
 970                            'td' : ['tr'],
 971                            'th' : ['tr'],
 972                            }
 973
 974     NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
 975
 976     #If one of these tags is encountered, all tags up to the next tag of
 977     #this type are popped.
 978     RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
 979                                      NON_NESTABLE_BLOCK_TAGS,
 980                                      NESTABLE_LIST_TAGS,
 981                                      NESTABLE_TABLE_TAGS)
 982
 983     NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
 984                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
 985
 986 class ICantBelieveItsBeautifulSoup(BeautifulSoup):
 987
 988     """The BeautifulSoup class is oriented towards skipping over
 989     common HTML errors like unclosed tags. However, sometimes it makes
 990     errors of its own. For instance, consider this fragment:
 991
 992      <b>Foo<b>Bar</b></b>
 993
 994     This is perfectly valid (if bizarre) HTML. However, the
 995     BeautifulSoup class will implicitly close the first b tag when it
 996     encounters the second 'b'. It will think the author wrote
 997     "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
 998     there's no real-world reason to bold something that's already
 999     bold. When it encounters '</b></b>' it will close two more 'b'
1000     tags, for a grand total of three tags closed instead of two. This
1001     can throw off the rest of your document structure. The same is
1002     true of a number of other tags, listed below.
1003
1004     It's much more common for someone to forget to close (eg.) a 'b'
1005     tag than to actually use nested 'b' tags, and the BeautifulSoup
1006     class handles the common case. This class handles the
1007     not-co-common case: where you can't believe someone wrote what
1008     they did, but it's valid HTML and BeautifulSoup screwed up by
1009     assuming it wouldn't be.
1010
1011     If this doesn't do what you need, try subclassing this class or
1012     BeautifulSoup, and providing your own list of NESTABLE_TAGS."""
1013
1014     I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1015      ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1016       'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1017       'big']
1018
1019     I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1020
1021     NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1022                                 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1023                                 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1024
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Pop the innermost tag, first copying a lone string child
        onto the parent tag as an attribute.

        The copy is made only when the tag being popped has exactly
        one child, that child is a NavigableText, and the parent does
        not already have an attribute named after the tag.

        Returns the result of BeautifulStoneSoup.popTag, so callers
        that use the return value of popTag() behave as they do on
        the base class."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            #Called before parent.attrMap is read below.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableText) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        return BeautifulStoneSoup.popTag(self)
1055
1056 #Enterprise class names! It has come to our attention that some people
1057 #think the names of the Beautiful Soup parser classes are too silly
1058 #and "unprofessional" for use in enterprise screen-scraping. We feel
1059 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1060 #All-Night Kosher Bakery recommends renaming this file to
1061 #"RobustParser.py" (or, in cases of extreme enterprisitude,
1062 #"RobustParserBeanInterface.class") and using the following
1063 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    "Enterprise-friendly alias for BeautifulStoneSoup."
class RobustHTMLParser(BeautifulSoup):
    "Enterprise-friendly alias for BeautifulSoup."
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    "Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."
class SimplifyingSOAPParser(BeautifulSOAP):
    "Enterprise-friendly alias for BeautifulSOAP."
1072
1073 ###
1074
1075
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    #Read the document from standard input and parse it with the
    #HTML parser class, then write the prettified tree to standard
    #output.
    soup = BeautifulSoup(sys.stdin.read())
    print(soup.prettify())