3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2008, Leonard Richardson
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
from __future__ import generators

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.0.7a"
__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
__license__ = "New-style BSD"

from sgmllib import SGMLParser, SGMLParseError
from htmlentitydefs import name2codepoint
# NOTE(review): `re`, `sgmllib`, and `markupbase` are referenced below but
# their plain `import` lines are not visible in this chunk -- presumably they
# appear in the elided lines nearby; confirm before relying on this block alone.
# NOTE(review): in the released source this import is guarded by
# try: set / except NameError: -- the guard lines are not visible here.
from sets import Set as set

#These hacks make Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used whenever a tag or string is rendered to a byte string and the
# caller does not specify one.
DEFAULT_OUTPUT_ENCODING = "utf-8"
107 # First, the classes that represent markup elements.
110 """Contains the navigational information for some part of the page
111 (either a tag or a piece of text)"""
113 def setup(self
, parent
=None, previous
=None):
114 """Sets up the initial relations between this element and
117 self
.previous
= previous
119 self
.previousSibling
= None
120 self
.nextSibling
= None
121 if self
.parent
and self
.parent
.contents
:
122 self
.previousSibling
= self
.parent
.contents
[-1]
123 self
.previousSibling
.nextSibling
= self
125 def replaceWith(self
, replaceWith
):
126 oldParent
= self
.parent
127 myIndex
= self
.parent
.contents
.index(self
)
128 if hasattr(replaceWith
, 'parent') and replaceWith
.parent
== self
.parent
:
129 # We're replacing this element with one of its siblings.
130 index
= self
.parent
.contents
.index(replaceWith
)
131 if index
and index
< myIndex
:
132 # Furthermore, it comes before this element. That
133 # means that when we extract it, the index of this
134 # element will change.
135 myIndex
= myIndex
- 1
137 oldParent
.insert(myIndex
, replaceWith
)
140 """Destructively rips this element out of the tree."""
143 self
.parent
.contents
.remove(self
)
147 #Find the two elements that would be next to each other if
148 #this element (and any children) hadn't been parsed. Connect
150 lastChild
= self
._lastRecursiveChild
()
151 nextElement
= lastChild
.next
154 self
.previous
.next
= nextElement
156 nextElement
.previous
= self
.previous
158 lastChild
.next
= None
161 if self
.previousSibling
:
162 self
.previousSibling
.nextSibling
= self
.nextSibling
164 self
.nextSibling
.previousSibling
= self
.previousSibling
165 self
.previousSibling
= self
.nextSibling
= None
168 def _lastRecursiveChild(self
):
169 "Finds the last element beneath this object to be parsed."
171 while hasattr(lastChild
, 'contents') and lastChild
.contents
:
172 lastChild
= lastChild
.contents
[-1]
175 def insert(self
, position
, newChild
):
176 if (isinstance(newChild
, basestring
)
177 or isinstance(newChild
, unicode)) \
178 and not isinstance(newChild
, NavigableString
):
179 newChild
= NavigableString(newChild
)
181 position
= min(position
, len(self
.contents
))
182 if hasattr(newChild
, 'parent') and newChild
.parent
!= None:
183 # We're 'inserting' an element that's already one
184 # of this object's children.
185 if newChild
.parent
== self
:
186 index
= self
.find(newChild
)
187 if index
and index
< position
:
188 # Furthermore we're moving it further down the
189 # list of this object's children. That means that
190 # when we extract this element, our target index
191 # will jump down one.
192 position
= position
- 1
195 newChild
.parent
= self
198 newChild
.previousSibling
= None
199 newChild
.previous
= self
201 previousChild
= self
.contents
[position
-1]
202 newChild
.previousSibling
= previousChild
203 newChild
.previousSibling
.nextSibling
= newChild
204 newChild
.previous
= previousChild
._lastRecursiveChild
()
205 if newChild
.previous
:
206 newChild
.previous
.next
= newChild
208 newChildsLastElement
= newChild
._lastRecursiveChild
()
210 if position
>= len(self
.contents
):
211 newChild
.nextSibling
= None
214 parentsNextSibling
= None
215 while not parentsNextSibling
:
216 parentsNextSibling
= parent
.nextSibling
217 parent
= parent
.parent
218 if not parent
: # This is the last element in the document.
220 if parentsNextSibling
:
221 newChildsLastElement
.next
= parentsNextSibling
223 newChildsLastElement
.next
= None
225 nextChild
= self
.contents
[position
]
226 newChild
.nextSibling
= nextChild
227 if newChild
.nextSibling
:
228 newChild
.nextSibling
.previousSibling
= newChild
229 newChildsLastElement
.next
= nextChild
231 if newChildsLastElement
.next
:
232 newChildsLastElement
.next
.previous
= newChildsLastElement
233 self
.contents
.insert(position
, newChild
)
def append(self, tag):
    """Adds the given tag to the end of this tag's contents."""
    position = len(self.contents)
    self.insert(position, tag)
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that appears
    after this element in the document."""
    finder = self.findAllNext
    return self._findOne(finder, name, attrs, text, **kwargs)
244 def findAllNext(self
, name
=None, attrs
={}, text
=None, limit
=None,
246 """Returns all items that match the given criteria and appear
247 after this Tag in the document."""
248 return self
._findAll
(name
, attrs
, text
, limit
, self
.nextGenerator
,
251 def findNextSibling(self
, name
=None, attrs
={}, text
=None, **kwargs
):
252 """Returns the closest sibling to this Tag that matches the
253 given criteria and appears after this Tag in the document."""
254 return self
._findOne
(self
.findNextSiblings
, name
, attrs
, text
,
257 def findNextSiblings(self
, name
=None, attrs
={}, text
=None, limit
=None,
259 """Returns the siblings of this Tag that match the given
260 criteria and appear after this Tag in the document."""
261 return self
._findAll
(name
, attrs
, text
, limit
,
262 self
.nextSiblingGenerator
, **kwargs
)
263 fetchNextSiblings
= findNextSiblings
# Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that appears
    before this element in the document."""
    finder = self.findAllPrevious
    return self._findOne(finder, name, attrs, text, **kwargs)
270 def findAllPrevious(self
, name
=None, attrs
={}, text
=None, limit
=None,
272 """Returns all items that match the given criteria and appear
273 before this Tag in the document."""
274 return self
._findAll
(name
, attrs
, text
, limit
, self
.previousGenerator
,
276 fetchPrevious
= findAllPrevious
# Compatibility with pre-3.x
278 def findPreviousSibling(self
, name
=None, attrs
={}, text
=None, **kwargs
):
279 """Returns the closest sibling to this Tag that matches the
280 given criteria and appears before this Tag in the document."""
281 return self
._findOne
(self
.findPreviousSiblings
, name
, attrs
, text
,
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return every earlier sibling of this element that matches the
    given criteria."""
    generator = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, generator, **kwargs)
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
292 def findParent(self
, name
=None, attrs
={}, **kwargs
):
293 """Returns the closest parent of this Tag that matches the given
295 # NOTE: We can't use _findOne because findParents takes a different
298 l
= self
.findParents(name
, attrs
, 1)
303 def findParents(self
, name
=None, attrs
={}, limit
=None, **kwargs
):
304 """Returns the parents of this Tag that match the given
307 return self
._findAll
(name
, attrs
, None, limit
, self
.parentGenerator
,
309 fetchParents
= findParents
# Compatibility with pre-3.x
311 #These methods do the real heavy lifting.
313 def _findOne(self
, method
, name
, attrs
, text
, **kwargs
):
315 l
= method(name
, attrs
, text
, 1, **kwargs
)
320 def _findAll(self
, name
, attrs
, text
, limit
, generator
, **kwargs
):
321 "Iterates over a generator looking for things that match."
323 if isinstance(name
, SoupStrainer
):
326 # Build a SoupStrainer
327 strainer
= SoupStrainer(name
, attrs
, text
, **kwargs
)
328 results
= ResultSet(strainer
)
333 except StopIteration:
336 found
= strainer
.search(i
)
338 results
.append(found
)
339 if limit
and len(results
) >= limit
:
343 #These Generators can be used to navigate starting from both
344 #NavigableStrings and Tags.
345 def nextGenerator(self
):
351 def nextSiblingGenerator(self
):
357 def previousGenerator(self
):
363 def previousSiblingGenerator(self
):
366 i
= i
.previousSibling
369 def parentGenerator(self
):
def substituteEncoding(self, str, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in *str* with the given
    encoding name, falling back to "utf-8" when none is given."""
    if not encoding:
        encoding = "utf-8"
    return str.replace("%SOUP-ENCODING%", encoding)
380 def toEncoding(self
, s
, encoding
=None):
381 """Encodes an object to a string in some encoding, or to Unicode.
383 if isinstance(s
, unicode):
385 s
= s
.encode(encoding
)
386 elif isinstance(s
, str):
388 s
= s
.encode(encoding
)
393 s
= self
.toEncoding(str(s
), encoding
)
398 class NavigableString(unicode, PageElement
):
def __new__(cls, value):
    """Create a new NavigableString.

    When unpickling a NavigableString, this method is called with
    the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
    passed in to the superclass's __new__ or the superclass won't know
    how to handle non-ASCII characters.
    """
    if not isinstance(value, unicode):
        # A byte string: let the superclass decode it with the default.
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
    return unicode.__new__(cls, value)
def __getnewargs__(self):
    """Pickle support: rebuild this object from its encoded byte form."""
    encoded = NavigableString.__str__(self)
    return (encoded,)
415 def __getattr__(self
, attr
):
416 """text.string gives you text. This is for backwards
417 compatibility for Navigable*String, but for CData* it lets you
418 get the string without the CData wrapper."""
422 raise AttributeError, "'%s' object has no attribute '%s'" % (self
.__class
__.__name
__, attr
)
def __unicode__(self):
    """Return this string as Unicode by decoding its byte form with the
    module default encoding."""
    encoded = str(self)
    return encoded.decode(DEFAULT_OUTPUT_ENCODING)
427 def __str__(self
, encoding
=DEFAULT_OUTPUT_ENCODING
):
429 return self
.encode(encoding
)
class CData(NavigableString):
    """A CDATA section; renders its text wrapped in <![CDATA[...]]>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % inner
438 class ProcessingInstruction(NavigableString
):
439 def __str__(self
, encoding
=DEFAULT_OUTPUT_ENCODING
):
441 if "%SOUP-ENCODING%" in output
:
442 output
= self
.substituteEncoding(output
, encoding
)
443 return "<?%s?>" % self
.toEncoding(output
, encoding
)
class Comment(NavigableString):
    """A comment; renders its text wrapped in <!--...-->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % inner
class Declaration(NavigableString):
    """A declaration (e.g. a doctype); renders its text wrapped in <!...>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!%s>" % inner
453 class Tag(PageElement
):
455 """Represents a found HTML tag with its attributes and contents."""
458 "Cheap function to invert a hash."
460 for k
,v
in h
.items():
464 XML_ENTITIES_TO_SPECIAL_CHARS
= { "apos" : "'",
470 XML_SPECIAL_CHARS_TO_ENTITIES
= _invert(XML_ENTITIES_TO_SPECIAL_CHARS
)
472 def _convertEntities(self
, match
):
473 """Used in a call to re.sub to replace HTML, XML, and numeric
474 entities with the appropriate Unicode characters. If HTML
475 entities are being converted, any unrecognized entities are
478 if self
.convertHTMLEntities
and x
in name2codepoint
:
479 return unichr(name2codepoint
[x
])
480 elif x
in self
.XML_ENTITIES_TO_SPECIAL_CHARS
:
481 if self
.convertXMLEntities
:
482 return self
.XML_ENTITIES_TO_SPECIAL_CHARS
[x
]
485 elif len(x
) > 0 and x
[0] == '#':
486 # Handle numeric entities
487 if len(x
) > 1 and x
[1] == 'x':
488 return unichr(int(x
[2:], 16))
490 return unichr(int(x
[1:]))
492 elif self
.escapeUnrecognizedEntities
:
493 return u
'&%s;' % x
497 def __init__(self
, parser
, name
, attrs
=None, parent
=None,
501 # We don't actually store the parser object: that lets extracted
502 # chunks be garbage-collected
503 self
.parserClass
= parser
.__class
__
504 self
.isSelfClosing
= parser
.isSelfClosingTag(name
)
510 self
.setup(parent
, previous
)
512 self
.containsSubstitutions
= False
513 self
.convertHTMLEntities
= parser
.convertHTMLEntities
514 self
.convertXMLEntities
= parser
.convertXMLEntities
515 self
.escapeUnrecognizedEntities
= parser
.escapeUnrecognizedEntities
517 # Convert any HTML, XML, or numeric entities in the attribute values.
518 convert
= lambda(k
, val
): (k
,
519 re
.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
520 self
._convertEntities
,
522 self
.attrs
= map(convert
, self
.attrs
)
def get(self, key, default=None):
    """Return the value of the 'key' attribute for this tag, or *default*
    if the tag has no such attribute."""
    attr_map = self._getAttrMap()
    return attr_map.get(key, default)
def has_key(self, key):
    """Return true if this tag has an attribute named *key*."""
    attr_map = self._getAttrMap()
    return attr_map.has_key(key)
533 def __getitem__(self
, key
):
534 """tag[key] returns the value of the 'key' attribute for the tag,
535 and throws an exception if it's not there."""
536 return self
._getAttrMap
()[key
]
539 "Iterating over a tag iterates over its contents."
540 return iter(self
.contents
)
543 "The length of a tag is the length of its list of contents."
544 return len(self
.contents
)
546 def __contains__(self
, x
):
547 return x
in self
.contents
549 def __nonzero__(self
):
550 "A tag is non-None even if it has no contents."
553 def __setitem__(self
, key
, value
):
554 """Setting tag[key] sets the value of the 'key' attribute for the
557 self
.attrMap
[key
] = value
559 for i
in range(0, len(self
.attrs
)):
560 if self
.attrs
[i
][0] == key
:
561 self
.attrs
[i
] = (key
, value
)
564 self
.attrs
.append((key
, value
))
565 self
._getAttrMap
()[key
] = value
567 def __delitem__(self
, key
):
568 "Deleting tag[key] deletes all 'key' attributes for the tag."
569 for item
in self
.attrs
:
571 self
.attrs
.remove(item
)
572 #We don't break because bad HTML can define the same
573 #attribute multiple times.
575 if self
.attrMap
.has_key(key
):
576 del self
.attrMap
[key
]
578 def __call__(self
, *args
, **kwargs
):
579 """Calling a tag like a function is the same as calling its
580 findAll() method. Eg. tag('a') returns a list of all the A tags
581 found within this tag."""
582 return apply(self
.findAll
, args
, kwargs
)
584 def __getattr__(self
, tag
):
585 #print "Getattr %s.%s" % (self.__class__, tag)
586 if len(tag
) > 3 and tag
.rfind('Tag') == len(tag
)-3:
587 return self
.find(tag
[:-3])
588 elif tag
.find('__') != 0:
589 return self
.find(tag
)
590 raise AttributeError, "'%s' object has no attribute '%s'" % (self
.__class
__, tag
)
592 def __eq__(self
, other
):
593 """Returns true iff this tag has the same name, the same attributes,
594 and the same contents (recursively) as the given tag.
596 NOTE: right now this will return false if two tags have the
597 same attributes in a different order. Should this be fixed?"""
598 if not hasattr(other
, 'name') or not hasattr(other
, 'attrs') or not hasattr(other
, 'contents') or self
.name
!= other
.name
or self
.attrs
!= other
.attrs
or len(self
) != len(other
):
600 for i
in range(0, len(self
.contents
)):
601 if self
.contents
[i
] != other
.contents
[i
]:
605 def __ne__(self
, other
):
606 """Returns true iff this tag is not identical to the other tag,
607 as defined in __eq__."""
608 return not self
== other
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Renders this tag as an encoded string via __str__."""
    rendered = self.__str__(encoding)
    return rendered
614 def __unicode__(self
):
615 return self
.__str
__(None)
617 BARE_AMPERSAND_OR_BRACKET
= re
.compile("([<>]|"
618 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
621 def _sub_entity(self
, x
):
622 """Used with a regular expression to substitute the
623 appropriate XML entity for an XML special character."""
624 return "&" + self
.XML_SPECIAL_CHARS_TO_ENTITIES
[x
.group(0)[0]] + ";"
626 def __str__(self
, encoding
=DEFAULT_OUTPUT_ENCODING
,
627 prettyPrint
=False, indentLevel
=0):
628 """Returns a string or Unicode representation of this tag and
629 its contents. To get Unicode, pass None for encoding.
631 NOTE: since Python's HTML parser consumes whitespace, this
632 method is not certain to reproduce the whitespace present in
633 the original string."""
635 encodedName
= self
.toEncoding(self
.name
, encoding
)
639 for key
, val
in self
.attrs
:
642 if self
.containsSubstitutions
and '%SOUP-ENCODING%' in val
:
643 val
= self
.substituteEncoding(val
, encoding
)
645 # The attribute value either:
647 # * Contains no embedded double quotes or single quotes.
648 # No problem: we enclose it in double quotes.
649 # * Contains embedded single quotes. No problem:
650 # double quotes work here too.
651 # * Contains embedded double quotes. No problem:
652 # we enclose it in single quotes.
653 # * Embeds both single _and_ double quotes. This
654 # can't happen naturally, but it can happen if
655 # you modify an attribute value after parsing
656 # the document. Now we have a bit of a
657 # problem. We solve it by enclosing the
658 # attribute in single quotes, and escaping any
659 # embedded single quotes to XML entities.
663 # TODO: replace with apos when
665 val
= val
.replace("'", "&squot;")
667 # Now we're okay w/r/t quotes. But the attribute
668 # value might also contain angle brackets, or
669 # ampersands that aren't part of entities. We need
670 # to escape those to XML entities too.
671 val
= self
.BARE_AMPERSAND_OR_BRACKET
.sub(self
._sub
_entity
, val
)
673 attrs
.append(fmt
% (self
.toEncoding(key
, encoding
),
674 self
.toEncoding(val
, encoding
)))
677 if self
.isSelfClosing
:
680 closeTag
= '</%s>' % encodedName
682 indentTag
, indentContents
= 0, 0
684 indentTag
= indentLevel
685 space
= (' ' * (indentTag
-1))
686 indentContents
= indentTag
+ 1
687 contents
= self
.renderContents(encoding
, prettyPrint
, indentContents
)
694 attributeString
= ' ' + ' '.join(attrs
)
697 s
.append('<%s%s%s>' % (encodedName
, attributeString
, close
))
701 if prettyPrint
and contents
and contents
[-1] != "\n":
703 if prettyPrint
and closeTag
:
706 if prettyPrint
and closeTag
and self
.nextSibling
:
712 """Recursively destroys the contents of this tree."""
713 contents
= [i
for i
in self
.contents
]
715 if isinstance(i
, Tag
):
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Return this tag rendered with pretty-printing turned on."""
    return self.__str__(encoding, True)
724 def renderContents(self
, encoding
=DEFAULT_OUTPUT_ENCODING
,
725 prettyPrint
=False, indentLevel
=0):
726 """Renders the contents of this tag as a string in the given
727 encoding. If encoding is None, returns a Unicode string.."""
731 if isinstance(c
, NavigableString
):
732 text
= c
.__str
__(encoding
)
733 elif isinstance(c
, Tag
):
734 s
.append(c
.__str
__(encoding
, prettyPrint
, indentLevel
))
735 if text
and prettyPrint
:
739 s
.append(" " * (indentLevel
-1))
747 def find(self
, name
=None, attrs
={}, recursive
=True, text
=None,
749 """Return only the first child of this Tag matching the given
752 l
= self
.findAll(name
, attrs
, recursive
, text
, 1, **kwargs
)
758 def findAll(self
, name
=None, attrs
={}, recursive
=True, text
=None,
759 limit
=None, **kwargs
):
760 """Extracts a list of Tag objects that match the given
761 criteria. You can specify the name of the Tag and any
762 attributes you want the Tag to have.
764 The value of a key-value pair in the 'attrs' map can be a
765 string, a list of strings, a regular expression object, or a
766 callable that takes a string and returns whether or not the
767 string matches for some custom definition of 'matches'. The
768 same is true of the tag name."""
769 generator
= self
.recursiveChildGenerator
771 generator
= self
.childGenerator
772 return self
._findAll
(name
, attrs
, text
, limit
, generator
, **kwargs
)
773 findChildren
= findAll
775 # Pre-3.x compatibility methods
def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x compatibility alias: findAll() restricted to text matching."""
    return self.findAll(text=text, recursive=recursive, limit=limit)
def firstText(self, text=None, recursive=True):
    """Pre-3.x compatibility alias: find() restricted to text matching."""
    return self.find(text=text, recursive=recursive)
787 def _getAttrMap(self
):
788 """Initializes a map representation of this tag's attributes,
789 if not already initialized."""
790 if not getattr(self
, 'attrMap'):
792 for (key
, value
) in self
.attrs
:
793 self
.attrMap
[key
] = value
def childGenerator(self):
    """Yields this tag's direct children in document order.

    Iterates by index (length captured up front) rather than over the
    list object itself, preserving the original's behavior if contents
    are mutated mid-iteration.
    """
    for index in range(0, len(self.contents)):
        yield self.contents[index]
802 def recursiveChildGenerator(self
):
805 tag
, start
= stack
.pop()
806 if isinstance(tag
, Tag
):
807 for i
in range(start
, len(tag
.contents
)):
810 if isinstance(a
, Tag
) and tag
.contents
:
811 if i
< len(tag
.contents
) - 1:
812 stack
.append((tag
, i
+1))
817 # Next, a couple classes to represent queries and their results.
819 """Encapsulates a number of ways of matching a markup element (tag or
822 def __init__(self
, name
=None, attrs
={}, text
=None, **kwargs
):
825 kwargs
['class'] = attrs
840 return "%s|%s" % (self
.name
, self
.attrs
)
842 def searchTag(self
, markupName
=None, markupAttrs
={}):
845 if isinstance(markupName
, Tag
):
848 callFunctionWithTagData
= callable(self
.name
) \
849 and not isinstance(markupName
, Tag
)
852 or callFunctionWithTagData \
853 or (markup
and self
._matches
(markup
, self
.name
)) \
854 or (not markup
and self
._matches
(markupName
, self
.name
)):
855 if callFunctionWithTagData
:
856 match
= self
.name(markupName
, markupAttrs
)
860 for attr
, matchAgainst
in self
.attrs
.items():
861 if not markupAttrMap
:
862 if hasattr(markupAttrs
, 'get'):
863 markupAttrMap
= markupAttrs
866 for k
,v
in markupAttrs
:
868 attrValue
= markupAttrMap
.get(attr
)
869 if not self
._matches
(attrValue
, matchAgainst
):
879 def search(self
, markup
):
880 #print 'looking for %s in %s' % (self, markup)
882 # If given a list of items, scan it for a text element that
884 if isList(markup
) and not isinstance(markup
, Tag
):
885 for element
in markup
:
886 if isinstance(element
, NavigableString
) \
887 and self
.search(element
):
890 # If it's a Tag, make sure its name or attributes match.
891 # Don't bother with Tags if we're searching for text.
892 elif isinstance(markup
, Tag
):
894 found
= self
.searchTag(markup
)
895 # If it's text, make sure the text matches.
896 elif isinstance(markup
, NavigableString
) or \
898 if self
._matches
(markup
, self
.text
):
901 raise Exception, "I don't know how to match against a %s" \
905 def _matches(self
, markup
, matchAgainst
):
906 #print "Matching %s against %s" % (markup, matchAgainst)
908 if matchAgainst
== True and type(matchAgainst
) == types
.BooleanType
:
909 result
= markup
!= None
910 elif callable(matchAgainst
):
911 result
= matchAgainst(markup
)
913 #Custom match methods take the tag as an argument, but all
914 #other ways of matching match the tag name as a string.
915 if isinstance(markup
, Tag
):
917 if markup
and not isString(markup
):
918 markup
= unicode(markup
)
919 #Now we know that chunk is either a string, or None.
920 if hasattr(matchAgainst
, 'match'):
921 # It's a regexp object.
922 result
= markup
and matchAgainst
.search(markup
)
923 elif isList(matchAgainst
):
924 result
= markup
in matchAgainst
925 elif hasattr(matchAgainst
, 'items'):
926 result
= markup
.has_key(matchAgainst
)
927 elif matchAgainst
and isString(markup
):
928 if isinstance(markup
, unicode):
929 matchAgainst
= unicode(matchAgainst
)
931 matchAgainst
= str(matchAgainst
)
934 result
= matchAgainst
== markup
937 class ResultSet(list):
938 """A ResultSet is just a list that keeps track of the SoupStrainer
940 def __init__(self
, source
):
944 # Now, some helper functions.
947 """Convenience method that works with all 2.x versions of Python
948 to determine whether or not something is listlike."""
949 return hasattr(l
, '__iter__') \
950 or (type(l
) in (types
.ListType
, types
.TupleType
))
953 """Convenience method that works with all 2.x versions of Python
954 to determine whether or not something is stringlike."""
956 return isinstance(s
, unicode) or isinstance(s
, basestring
)
958 return isinstance(s
, str)
960 def buildTagMap(default
, *args
):
961 """Turns a list of maps, lists, or scalars into a single map.
962 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
963 NESTING_RESET_TAGS maps out of lists and partial maps."""
966 if hasattr(portion
, 'items'):
967 #It's a map. Merge it.
968 for k
,v
in portion
.items():
970 elif isList(portion
):
971 #It's a list. Map each item to the default.
975 #It's a scalar. Map it to the default.
976 built
[portion
] = default
979 # Now, the parser classes.
981 class BeautifulStoneSoup(Tag
, SGMLParser
):
983 """This class contains the basic parser and search code. It defines
984 a parser that knows nothing about tag behavior except for the
987 You can't close a tag without closing all the tags it encloses.
988 That is, "<foo><bar></foo>" actually means
989 "<foo><bar></bar></foo>".
991 [Another possible explanation is "<foo><bar /></foo>", but since
992 this class defines no SELF_CLOSING_TAGS, it will never use that
995 This class is useful for parsing XML or made-up markup languages,
996 or when BeautifulSoup makes an assumption counter to what you were
999 SELF_CLOSING_TAGS
= {}
1001 RESET_NESTING_TAGS
= {}
1003 PRESERVE_WHITESPACE_TAGS
= []
1005 MARKUP_MASSAGE
= [(re
.compile('(<[^<>]*)/>'),
1006 lambda x
: x
.group(1) + ' />'),
1007 (re
.compile('<!\s+([^<>]*)>'),
1008 lambda x
: '<!' + x
.group(1) + '>')
1011 ROOT_TAG_NAME
= u
'[document]'
1013 HTML_ENTITIES
= "html"
1014 XML_ENTITIES
= "xml"
1015 XHTML_ENTITIES
= "xhtml"
1016 # TODO: This only exists for backwards-compatibility
1017 ALL_ENTITIES
= XHTML_ENTITIES
1019 # Used when determining whether a text node is all whitespace and
1020 # can be replaced with a single space. A text node that contains
1021 # fancy Unicode spaces (usually non-breaking) should be left
1023 STRIP_ASCII_SPACES
= { 9: None, 10: None, 12: None, 13: None, 32: None, }
1025 def __init__(self
, markup
="", parseOnlyThese
=None, fromEncoding
=None,
1026 markupMassage
=True, smartQuotesTo
=XML_ENTITIES
,
1027 convertEntities
=None, selfClosingTags
=None, isHTML
=False):
1028 """The Soup object is initialized as the 'root tag', and the
1029 provided markup (which can be a string or a file-like object)
1030 is fed into the underlying parser.
1032 sgmllib will process most bad HTML, and the BeautifulSoup
1033 class has some tricks for dealing with some HTML that kills
1034 sgmllib, but Beautiful Soup can nonetheless choke or lose data
1035 if your data uses self-closing tags or declarations
1038 By default, Beautiful Soup uses regexes to sanitize input,
1039 avoiding the vast majority of these problems. If the problems
1040 don't apply to you, pass in False for markupMassage, and
1041 you'll get better performance.
1043 The default parser massage techniques fix the two most common
1044 instances of invalid HTML that choke sgmllib:
1046 <br/> (No space between name of closing tag and tag close)
1047 <! --Comment--> (Extraneous whitespace in declaration)
1049 You can pass in a custom list of (RE object, replace method)
1050 tuples to get Beautiful Soup to scrub your input the way you
1053 self
.parseOnlyThese
= parseOnlyThese
1054 self
.fromEncoding
= fromEncoding
1055 self
.smartQuotesTo
= smartQuotesTo
1056 self
.convertEntities
= convertEntities
1057 # Set the rules for how we'll deal with the entities we
1059 if self
.convertEntities
:
1060 # It doesn't make sense to convert encoded characters to
1061 # entities even while you're converting entities to Unicode.
1062 # Just convert it all to Unicode.
1063 self
.smartQuotesTo
= None
1064 if convertEntities
== self
.HTML_ENTITIES
:
1065 self
.convertXMLEntities
= False
1066 self
.convertHTMLEntities
= True
1067 self
.escapeUnrecognizedEntities
= True
1068 elif convertEntities
== self
.XHTML_ENTITIES
:
1069 self
.convertXMLEntities
= True
1070 self
.convertHTMLEntities
= True
1071 self
.escapeUnrecognizedEntities
= False
1072 elif convertEntities
== self
.XML_ENTITIES
:
1073 self
.convertXMLEntities
= True
1074 self
.convertHTMLEntities
= False
1075 self
.escapeUnrecognizedEntities
= False
1077 self
.convertXMLEntities
= False
1078 self
.convertHTMLEntities
= False
1079 self
.escapeUnrecognizedEntities
= False
1081 self
.instanceSelfClosingTags
= buildTagMap(None, selfClosingTags
)
1082 SGMLParser
.__init
__(self
)
1084 if hasattr(markup
, 'read'): # It's a file-type object.
1085 markup
= markup
.read()
1086 self
.markup
= markup
1087 self
.markupMassage
= markupMassage
1089 self
._feed
(isHTML
=isHTML
)
1092 self
.markup
= None # The markup can now be GCed
def convert_charref(self, name):
    """This method fixes a bug in Python's SGMLParser."""
    try:
        codepoint = int(name)
    except ValueError:
        return
    # Only plain ASCII references are converted; ASCII ends at 127, not 255.
    if codepoint < 0 or codepoint > 127:
        return
    return self.convert_codepoint(codepoint)
def _feed(self, inDocumentEncoding=None, isHTML=False):
    """Runs the markup through UnicodeDammit and the massage regexes,
    then feeds it to the underlying SGMLParser and closes any tags
    left open at end of input."""
    # Convert the document to Unicode.
    markup = self.markup
    if isinstance(markup, unicode):
        if not hasattr(self, 'originalEncoding'):
            self.originalEncoding = None
    else:
        dammit = UnicodeDammit\
                 (markup, [self.fromEncoding, inDocumentEncoding],
                  smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
        markup = dammit.unicode
        self.originalEncoding = dammit.originalEncoding
        self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
    if markup:
        if self.markupMassage:
            if not isList(self.markupMassage):
                self.markupMassage = self.MARKUP_MASSAGE
            for pattern, replacement in self.markupMassage:
                markup = pattern.sub(replacement, markup)
            # TODO: We get rid of markupMassage so that the
            # soup object can be deepcopied later on. Some
            # Python installations can't copy regexes. If anyone
            # was relying on the existence of markupMassage, this
            # might cause problems.
            del(self.markupMassage)
    self.reset()

    SGMLParser.feed(self, markup)
    # Close out any unfinished strings and close all the open tags.
    self.endData()
    while self.currentTag.name != self.ROOT_TAG_NAME:
        self.popTag()
def __getattr__(self, methodName):
    """This method routes method call requests to either the SGMLParser
    superclass or the Tag superclass, depending on the method name."""
    #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

    # Parser event handlers belong to SGMLParser; dunder names are
    # never forwarded; everything else is treated as tree navigation.
    if methodName.startswith('start_') or methodName.startswith('end_') \
           or methodName.startswith('do_'):
        return SGMLParser.__getattr__(self, methodName)
    if not methodName.startswith('__'):
        return Tag.__getattr__(self, methodName)
    raise AttributeError
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser."""
    # Check the class-level table first, then the per-instance one
    # supplied to the constructor.
    if name in self.SELF_CLOSING_TAGS:
        return True
    return name in self.instanceSelfClosingTags
def reset(self):
    """Resets the parser to a pristine state: an empty tag stack with
    this soup object itself as the root tag."""
    # The soup object doubles as the root Tag of the tree.
    Tag.__init__(self, self, self.ROOT_TAG_NAME)
    self.hidden = 1
    SGMLParser.reset(self)
    self.currentData = []
    self.currentTag = None
    self.tagStack = []
    self.quoteStack = []
    self.pushTag(self)
def popTag(self):
    """Pops the topmost tag off the stack and returns the new current
    tag."""
    tag = self.tagStack.pop()
    # Tags with just one string-owning child get the child as a
    # 'string' property, so that soup.tag.string is shorthand for
    # soup.tag.contents[0]
    if len(self.currentTag.contents) == 1 and \
       isinstance(self.currentTag.contents[0], NavigableString):
        self.currentTag.string = self.currentTag.contents[0]

    #print "Pop", tag.name
    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
def pushTag(self, tag):
    """Makes the given tag the new current tag, attaching it to the
    previous current tag's contents."""
    #print "Push", tag.name
    if self.currentTag:
        self.currentTag.contents.append(tag)
    self.tagStack.append(tag)
    self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
    """Flushes any buffered character data, wrapping it in an instance
    of containerClass and attaching it to the tree at the current tag."""
    if self.currentData:
        currentData = u''.join(self.currentData)
        if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
            not set([tag.name for tag in self.tagStack]).intersection(
                self.PRESERVE_WHITESPACE_TAGS)):
            # Pure whitespace outside of a whitespace-preserving tag
            # collapses to a single newline or space.
            if '\n' in currentData:
                currentData = '\n'
            else:
                currentData = ' '
        self.currentData = []
        # Under a SoupStrainer, top-level text that doesn't match is
        # simply dropped.
        if self.parseOnlyThese and len(self.tagStack) <= 1 and \
               (not self.parseOnlyThese.text or \
                not self.parseOnlyThese.search(currentData)):
            return
        o = containerClass(currentData)
        o.setup(self.currentTag, self.previous)
        if self.previous:
            self.previous.next = o
        self.previous = o
        self.currentTag.contents.append(o)
1210 def _popToTag(self
, name
, inclusivePop
=True):
1211 """Pops the tag stack up to and including the most recent
1212 instance of the given tag. If inclusivePop is false, pops the tag
1213 stack up to but *not* including the most recent instqance of
1215 #print "Popping to %s" % name
1216 if name
== self
.ROOT_TAG_NAME
:
1220 mostRecentTag
= None
1221 for i
in range(len(self
.tagStack
)-1, 0, -1):
1222 if name
== self
.tagStack
[i
].name
:
1223 numPops
= len(self
.tagStack
)-i
1225 if not inclusivePop
:
1226 numPops
= numPops
- 1
1228 for i
in range(0, numPops
):
1229 mostRecentTag
= self
.popTag()
1230 return mostRecentTag
def _smartPop(self, name):

    """We need to pop up to the previous tag of this type, unless
    one of this tag's nesting reset triggers comes between this
    tag and the previous tag of this type, OR unless this tag is a
    generic nesting trigger and another generic nesting trigger
    comes between this tag and the previous tag of this type.

    Examples:
     <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
     <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
     <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

     <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
     <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
     <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
    """

    nestingResetTriggers = self.NESTABLE_TAGS.get(name)
    isNestable = nestingResetTriggers != None
    isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
    popTo = None
    inclusive = True
    for i in range(len(self.tagStack)-1, 0, -1):
        p = self.tagStack[i]
        if (not p or p.name == name) and not isNestable:
            #Non-nestable tags get popped to the top or to their
            #first parental instance.
            popTo = name
            break
        if (nestingResetTriggers != None
            and p.name in nestingResetTriggers) \
            or (nestingResetTriggers == None and isResetNesting
                and self.RESET_NESTING_TAGS.has_key(p.name)):

            #If we encounter one of the nesting reset triggers
            #peculiar to this tag, or we encounter another tag
            #that causes nesting to reset, pop up to but not
            #including that tag.
            popTo = p.name
            inclusive = False
            break
        p = p.parent
    self._popToTag(popTo, inclusive)
def unknown_starttag(self, name, attrs, selfClosing=0):
    """Creates a Tag for an opening tag, first popping the stack as
    dictated by the nesting rules.  Inside a quoted (literal) section
    the tag text is treated as plain character data."""
    #print "Start tag %s: %s" % (name, attrs)
    if self.quoteStack:
        #This is not a real tag.
        #print "<%s> is not real!" % name
        attrs = ''.join(map(lambda pair: ' %s="%s"' % pair, attrs))
        self.handle_data('<%s%s>' % (name, attrs))
        return
    self.endData()

    if not self.isSelfClosingTag(name) and not selfClosing:
        self._smartPop(name)

    # Under a SoupStrainer, top-level tags that don't match are skipped.
    if self.parseOnlyThese and len(self.tagStack) <= 1 \
           and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
        return

    tag = Tag(self, name, attrs, self.currentTag, self.previous)
    if self.previous:
        self.previous.next = tag
    self.previous = tag
    self.pushTag(tag)
    if selfClosing or self.isSelfClosingTag(name):
        self.popTag()
    if name in self.QUOTE_TAGS:
        #print "Beginning quote (%s)" % name
        self.quoteStack.append(name)
        self.literal = 1
    return tag
def unknown_endtag(self, name):
    """Closes a tag, unless we are inside a quoted (literal) section
    opened by some other tag, in which case the end tag is character
    data."""
    #print "End tag %s" % name
    if self.quoteStack and self.quoteStack[-1] != name:
        #This is not a real end tag.
        #print "</%s> is not real!" % name
        self.handle_data('</%s>' % name)
        return
    self.endData()
    self._popToTag(name)
    if self.quoteStack and self.quoteStack[-1] == name:
        self.quoteStack.pop()
        self.literal = (len(self.quoteStack) > 0)
def handle_data(self, data):
    # Buffer character data; endData() later joins the buffered chunks
    # into a single string node and attaches it to the tree.
    self.currentData.append(data)
1324 def _toStringSubclass(self
, text
, subclass
):
1325 """Adds a certain piece of text to the tree as a NavigableString
1328 self
.handle_data(text
)
1329 self
.endData(subclass
)
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later."""
    if text.startswith("xml"):
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
    """Handle comments as Comment objects."""
    self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
    """Handle character references as data."""
    if self.convertEntities:
        # Convert the numeric reference to the actual character.
        data = unichr(int(ref))
    else:
        # Pass the reference through untouched.
        data = '&#%s;' % ref
    self.handle_data(data)
def handle_entityref(self, ref):
    """Handle entity references as data, possibly converting known
    HTML and/or XML entity references to the corresponding Unicode
    characters."""
    data = None
    if self.convertHTMLEntities:
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            pass

    if not data and self.convertXMLEntities:
        data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

    if not data and self.convertHTMLEntities and \
        not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
        # TODO: We've got a problem here. We're told this is
        # an entity reference, but it's not an XML entity
        # reference or an HTML entity reference. Nonetheless,
        # the logical thing to do is to pass it through as an
        # unrecognized entity reference.
        #
        # Except: when the input is "&carol;" this function
        # will be called with input "carol". When the input is
        # "AT&T", this function will be called with input
        # "T". We have no way of knowing whether a semicolon
        # was present originally, so we don't know whether
        # this is an unknown entity or just a misplaced
        # ampersand.
        #
        # The more common case is a misplaced ampersand, so I
        # escape the ampersand and omit the trailing semicolon.
        data = "&%s" % ref
    if not data:
        # This case is different from the one above, because we
        # haven't already gone through a supposedly comprehensive
        # mapping of entities to Unicode characters. We might not
        # have gone through any mapping at all. So the chances are
        # very high that this is a real entity, and not a
        # misplaced ampersand.
        data = "&%s;" % ref
    self.handle_data(data)
def handle_decl(self, data):
    """Handle DOCTYPEs and the like as Declaration objects."""
    self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object."""
    j = None
    if self.rawdata[i:i+9] == '<![CDATA[':
        k = self.rawdata.find(']]>', i)
        if k == -1:
            # Unterminated CDATA section: take everything to the end.
            k = len(self.rawdata)
        data = self.rawdata[i+9:k]
        j = k + 3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            # A declaration sgmllib can't parse becomes plain data.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # Smart quotes become HTML entities unless the caller says
        # otherwise, and the markup is flagged as HTML for sniffing.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
class StopParsing(Exception):
    """Raised to abort parsing; used by start_meta after re-feeding the
    document with a newly discovered encoding."""
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the standard pop, mirror a single-string child tag as
        # an attribute of its parent (unless the parent already has an
        # attribute by that name).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1654 #Enterprise class names! It has come to our attention that some people
1655 #think the names of the Beautiful Soup parser classes are too silly
1656 #and "unprofessional" for use in enterprise screen-scraping. We feel
1657 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1658 #All-Night Kosher Bakery recommends renaming this file to
1659 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1660 #"RobustParserBeanInterface.class") and using the following
1661 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1673 ######################################################
1675 # Bonus library: Unicode, Dammit
1677 # This class forces XML data into a standard format (usually to UTF-8
1678 # or Unicode). It is heavily based on code from Mark Pilgrim's
1679 # Universal Feed Parser. It does not rewrite the XML or HTML to
1680 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1681 # (XML) and BeautifulSoup.start_meta (HTML).
1683 # Autodetects character encodings.
1684 # Download from http://chardet.feedparser.org/
1687 # import chardet.constants
1688 # chardet.constants._debug = 1
1692 # cjkcodecs and iconv_codec make Python know about more character encodings.
1693 # Both are available from http://cjkpython.i18n.org/
1694 # They're built in if you use Python 2.4.
1696 import cjkcodecs
.aliases
1704 class UnicodeDammit
:
1705 """A class for detecting the encoding of a *ML document and
1706 converting it to a Unicode string. If the source encoding is
1707 windows-1252, can replace MS smart quotes with their HTML or XML
1710 # This dictionary maps commonly seen values for "charset" in HTML
1711 # meta tags to the corresponding Python codec names. It only covers
1712 # values that aren't in Python's aliases and can't be determined
1713 # by the heuristics in find_codec.
1714 CHARSET_ALIASES
= { "macintosh" : "mac-roman",
1715 "x-sjis" : "shift-jis" }
1717 def __init__(self
, markup
, overrideEncodings
=[],
1718 smartQuotesTo
='xml', isHTML
=False):
1719 self
.declaredHTMLEncoding
= None
1720 self
.markup
, documentEncoding
, sniffedEncoding
= \
1721 self
._detectEncoding
(markup
, isHTML
)
1722 self
.smartQuotesTo
= smartQuotesTo
1723 self
.triedEncodings
= []
1724 if markup
== '' or isinstance(markup
, unicode):
1725 self
.originalEncoding
= None
1726 self
.unicode = unicode(markup
)
1730 for proposedEncoding
in overrideEncodings
:
1731 u
= self
._convertFrom
(proposedEncoding
)
1734 for proposedEncoding
in (documentEncoding
, sniffedEncoding
):
1735 u
= self
._convertFrom
(proposedEncoding
)
1738 # If no luck and we have auto-detection library, try that:
1739 if not u
and chardet
and not isinstance(self
.markup
, unicode):
1740 u
= self
._convertFrom
(chardet
.detect(self
.markup
)['encoding'])
1742 # As a last resort, try utf-8 and windows-1252:
1744 for proposed_encoding
in ("utf-8", "windows-1252"):
1745 u
= self
._convertFrom
(proposed_encoding
)
1749 if not u
: self
.originalEncoding
= None
1751 def _subMSChar(self
, orig
):
1752 """Changes a MS smart quote character to an XML or HTML
1754 sub
= self
.MS_CHARS
.get(orig
)
1755 if type(sub
) == types
.TupleType
:
1756 if self
.smartQuotesTo
== 'xml':
1757 sub
= '&#x%s;' % sub
[1]
1759 sub
= '&%s;' % sub
[0]
1762 def _convertFrom(self
, proposed
):
1763 proposed
= self
.find_codec(proposed
)
1764 if not proposed
or proposed
in self
.triedEncodings
:
1766 self
.triedEncodings
.append(proposed
)
1767 markup
= self
.markup
1769 # Convert smart quotes to HTML if coming from an encoding
1770 # that might have them.
1771 if self
.smartQuotesTo
and proposed
.lower() in("windows-1252",
1774 markup
= re
.compile("([\x80-\x9f])").sub \
1775 (lambda(x
): self
._subMSChar
(x
.group(1)),
1779 # print "Trying to convert document to %s" % proposed
1780 u
= self
._toUnicode
(markup
, proposed
)
1782 self
.originalEncoding
= proposed
1783 except Exception, e
:
1784 # print "That didn't work!"
1787 #print "Correct encoding: %s" % proposed
1790 def _toUnicode(self
, data
, encoding
):
1791 '''Given a string and its encoding, decodes the string into Unicode.
1792 %encoding is a string recognized by encodings.aliases'''
1794 # strip Byte Order Mark (if present)
1795 if (len(data
) >= 4) and (data
[:2] == '\xfe\xff') \
1796 and (data
[2:4] != '\x00\x00'):
1797 encoding
= 'utf-16be'
1799 elif (len(data
) >= 4) and (data
[:2] == '\xff\xfe') \
1800 and (data
[2:4] != '\x00\x00'):
1801 encoding
= 'utf-16le'
1803 elif data
[:3] == '\xef\xbb\xbf':
1806 elif data
[:4] == '\x00\x00\xfe\xff':
1807 encoding
= 'utf-32be'
1809 elif data
[:4] == '\xff\xfe\x00\x00':
1810 encoding
= 'utf-32le'
1812 newdata
= unicode(data
, encoding
)
1815 def _detectEncoding(self
, xml_data
, isHTML
=False):
1816 """Given a document, tries to detect its XML encoding."""
1817 xml_encoding
= sniffed_xml_encoding
= None
1819 if xml_data
[:4] == '\x4c\x6f\xa7\x94':
1821 xml_data
= self
._ebcdic
_to
_ascii
(xml_data
)
1822 elif xml_data
[:4] == '\x00\x3c\x00\x3f':
1824 sniffed_xml_encoding
= 'utf-16be'
1825 xml_data
= unicode(xml_data
, 'utf-16be').encode('utf-8')
1826 elif (len(xml_data
) >= 4) and (xml_data
[:2] == '\xfe\xff') \
1827 and (xml_data
[2:4] != '\x00\x00'):
1829 sniffed_xml_encoding
= 'utf-16be'
1830 xml_data
= unicode(xml_data
[2:], 'utf-16be').encode('utf-8')
1831 elif xml_data
[:4] == '\x3c\x00\x3f\x00':
1833 sniffed_xml_encoding
= 'utf-16le'
1834 xml_data
= unicode(xml_data
, 'utf-16le').encode('utf-8')
1835 elif (len(xml_data
) >= 4) and (xml_data
[:2] == '\xff\xfe') and \
1836 (xml_data
[2:4] != '\x00\x00'):
1838 sniffed_xml_encoding
= 'utf-16le'
1839 xml_data
= unicode(xml_data
[2:], 'utf-16le').encode('utf-8')
1840 elif xml_data
[:4] == '\x00\x00\x00\x3c':
1842 sniffed_xml_encoding
= 'utf-32be'
1843 xml_data
= unicode(xml_data
, 'utf-32be').encode('utf-8')
1844 elif xml_data
[:4] == '\x3c\x00\x00\x00':
1846 sniffed_xml_encoding
= 'utf-32le'
1847 xml_data
= unicode(xml_data
, 'utf-32le').encode('utf-8')
1848 elif xml_data
[:4] == '\x00\x00\xfe\xff':
1850 sniffed_xml_encoding
= 'utf-32be'
1851 xml_data
= unicode(xml_data
[4:], 'utf-32be').encode('utf-8')
1852 elif xml_data
[:4] == '\xff\xfe\x00\x00':
1854 sniffed_xml_encoding
= 'utf-32le'
1855 xml_data
= unicode(xml_data
[4:], 'utf-32le').encode('utf-8')
1856 elif xml_data
[:3] == '\xef\xbb\xbf':
1858 sniffed_xml_encoding
= 'utf-8'
1859 xml_data
= unicode(xml_data
[3:], 'utf-8').encode('utf-8')
1861 sniffed_xml_encoding
= 'ascii'
1864 xml_encoding_match
= None
1865 xml_encoding_match
= re
.compile(
1866 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data
)
1867 if not xml_encoding_match
and isHTML
:
1868 regexp
= re
.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re
.I
)
1869 xml_encoding_match
= regexp
.search(xml_data
)
1870 if xml_encoding_match
is not None:
1871 xml_encoding
= xml_encoding_match
.groups()[0].lower()
1873 self
.declaredHTMLEncoding
= xml_encoding
1874 if sniffed_xml_encoding
and \
1875 (xml_encoding
in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1876 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1877 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1879 xml_encoding
= sniffed_xml_encoding
1880 return xml_data
, xml_encoding
, sniffed_xml_encoding
1883 def find_codec(self
, charset
):
1884 return self
._codec
(self
.CHARSET_ALIASES
.get(charset
, charset
)) \
1885 or (charset
and self
._codec
(charset
.replace("-", ""))) \
1886 or (charset
and self
._codec
(charset
.replace("-", "_"))) \
1889 def _codec(self
, charset
):
1890 if not charset
: return charset
1893 codecs
.lookup(charset
)
1895 except (LookupError, ValueError):
1899 EBCDIC_TO_ASCII_MAP
= None
1900 def _ebcdic_to_ascii(self
, s
):
1902 if not c
.EBCDIC_TO_ASCII_MAP
:
1903 emap
= (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1904 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1905 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1906 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1907 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1908 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1909 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1910 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1911 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1912 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1913 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1914 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1915 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1916 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1917 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1918 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1919 250,251,252,253,254,255)
1921 c
.EBCDIC_TO_ASCII_MAP
= string
.maketrans( \
1922 ''.join(map(chr, range(256))), ''.join(map(chr, emap
)))
1923 return s
.translate(c
.EBCDIC_TO_ASCII_MAP
)
1925 MS_CHARS
= { '\x80' : ('euro', '20AC'),
1927 '\x82' : ('sbquo', '201A'),
1928 '\x83' : ('fnof', '192'),
1929 '\x84' : ('bdquo', '201E'),
1930 '\x85' : ('hellip', '2026'),
1931 '\x86' : ('dagger', '2020'),
1932 '\x87' : ('Dagger', '2021'),
1933 '\x88' : ('circ', '2C6'),
1934 '\x89' : ('permil', '2030'),
1935 '\x8A' : ('Scaron', '160'),
1936 '\x8B' : ('lsaquo', '2039'),
1937 '\x8C' : ('OElig', '152'),
1939 '\x8E' : ('#x17D', '17D'),
1942 '\x91' : ('lsquo', '2018'),
1943 '\x92' : ('rsquo', '2019'),
1944 '\x93' : ('ldquo', '201C'),
1945 '\x94' : ('rdquo', '201D'),
1946 '\x95' : ('bull', '2022'),
1947 '\x96' : ('ndash', '2013'),
1948 '\x97' : ('mdash', '2014'),
1949 '\x98' : ('tilde', '2DC'),
1950 '\x99' : ('trade', '2122'),
1951 '\x9a' : ('scaron', '161'),
1952 '\x9b' : ('rsaquo', '203A'),
1953 '\x9c' : ('oelig', '153'),
1955 '\x9e' : ('#x17E', '17E'),
1956 '\x9f' : ('Yuml', ''),}
1958 #######################################################################
1961 #By default, act as an HTML pretty-printer.
1962 if __name__
== '__main__':
1964 soup
= BeautifulSoup(sys
.stdin
)
1965 print soup
.prettify()