3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2008, Leonard Richardson
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
from __future__ import generators

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.0.7a"
__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
__license__ = "New-style BSD"

from sgmllib import SGMLParser, SGMLParseError
from htmlentitydefs import name2codepoint
# NOTE(review): `re`, `sgmllib`, and `markupbase` are referenced below but
# their plain `import` lines are not visible in this chunk -- presumably they
# appear in the elided lines nearby; confirm before relying on this block alone.
# NOTE(review): in the released source this import is guarded by
# try: set / except NameError: -- the guard lines are not visible here.
from sets import Set as set

#These hacks make Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used whenever a tag or string is rendered to a byte string and the
# caller does not specify one.
DEFAULT_OUTPUT_ENCODING = "utf-8"
107 # First, the classes that represent markup elements.
110 """Contains the navigational information for some part of the page
111 (either a tag or a piece of text)"""
113 def setup(self
, parent
=None, previous
=None):
114 """Sets up the initial relations between this element and
117 self
.previous
= previous
119 self
.previousSibling
= None
120 self
.nextSibling
= None
121 if self
.parent
and self
.parent
.contents
:
122 self
.previousSibling
= self
.parent
.contents
[-1]
123 self
.previousSibling
.nextSibling
= self
125 def replaceWith(self
, replaceWith
):
126 oldParent
= self
.parent
127 myIndex
= self
.parent
.contents
.index(self
)
128 if hasattr(replaceWith
, 'parent') and replaceWith
.parent
== self
.parent
:
129 # We're replacing this element with one of its siblings.
130 index
= self
.parent
.contents
.index(replaceWith
)
131 if index
and index
< myIndex
:
132 # Furthermore, it comes before this element. That
133 # means that when we extract it, the index of this
134 # element will change.
135 myIndex
= myIndex
- 1
137 oldParent
.insert(myIndex
, replaceWith
)
140 """Destructively rips this element out of the tree."""
143 self
.parent
.contents
.remove(self
)
147 #Find the two elements that would be next to each other if
148 #this element (and any children) hadn't been parsed. Connect
150 lastChild
= self
._lastRecursiveChild
()
151 nextElement
= lastChild
.next
154 self
.previous
.next
= nextElement
156 nextElement
.previous
= self
.previous
158 lastChild
.next
= None
161 if self
.previousSibling
:
162 self
.previousSibling
.nextSibling
= self
.nextSibling
164 self
.nextSibling
.previousSibling
= self
.previousSibling
165 self
.previousSibling
= self
.nextSibling
= None
168 def _lastRecursiveChild(self
):
169 "Finds the last element beneath this object to be parsed."
171 while hasattr(lastChild
, 'contents') and lastChild
.contents
:
172 lastChild
= lastChild
.contents
[-1]
175 def insert(self
, position
, newChild
):
176 if (isinstance(newChild
, basestring
)
177 or isinstance(newChild
, unicode)) \
178 and not isinstance(newChild
, NavigableString
):
179 newChild
= NavigableString(newChild
)
181 position
= min(position
, len(self
.contents
))
182 if hasattr(newChild
, 'parent') and newChild
.parent
!= None:
183 # We're 'inserting' an element that's already one
184 # of this object's children.
185 if newChild
.parent
== self
:
186 index
= self
.find(newChild
)
187 if index
and index
< position
:
188 # Furthermore we're moving it further down the
189 # list of this object's children. That means that
190 # when we extract this element, our target index
191 # will jump down one.
192 position
= position
- 1
195 newChild
.parent
= self
198 newChild
.previousSibling
= None
199 newChild
.previous
= self
201 previousChild
= self
.contents
[position
-1]
202 newChild
.previousSibling
= previousChild
203 newChild
.previousSibling
.nextSibling
= newChild
204 newChild
.previous
= previousChild
._lastRecursiveChild
()
205 if newChild
.previous
:
206 newChild
.previous
.next
= newChild
208 newChildsLastElement
= newChild
._lastRecursiveChild
()
210 if position
>= len(self
.contents
):
211 newChild
.nextSibling
= None
214 parentsNextSibling
= None
215 while not parentsNextSibling
:
216 parentsNextSibling
= parent
.nextSibling
217 parent
= parent
.parent
218 if not parent
: # This is the last element in the document.
220 if parentsNextSibling
:
221 newChildsLastElement
.next
= parentsNextSibling
223 newChildsLastElement
.next
= None
225 nextChild
= self
.contents
[position
]
226 newChild
.nextSibling
= nextChild
227 if newChild
.nextSibling
:
228 newChild
.nextSibling
.previousSibling
= newChild
229 newChildsLastElement
.next
= nextChild
231 if newChildsLastElement
.next
:
232 newChildsLastElement
.next
.previous
= newChildsLastElement
233 self
.contents
.insert(position
, newChild
)
def append(self, tag):
    """Adds the given tag to the end of this tag's contents."""
    position = len(self.contents)
    self.insert(position, tag)
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that appears
    after this element in the document."""
    finder = self.findAllNext
    return self._findOne(finder, name, attrs, text, **kwargs)
244 def findAllNext(self
, name
=None, attrs
={}, text
=None, limit
=None,
246 """Returns all items that match the given criteria and appear
247 after this Tag in the document."""
248 return self
._findAll
(name
, attrs
, text
, limit
, self
.nextGenerator
,
251 def findNextSibling(self
, name
=None, attrs
={}, text
=None, **kwargs
):
252 """Returns the closest sibling to this Tag that matches the
253 given criteria and appears after this Tag in the document."""
254 return self
._findOne
(self
.findNextSiblings
, name
, attrs
, text
,
257 def findNextSiblings(self
, name
=None, attrs
={}, text
=None, limit
=None,
259 """Returns the siblings of this Tag that match the given
260 criteria and appear after this Tag in the document."""
261 return self
._findAll
(name
, attrs
, text
, limit
,
262 self
.nextSiblingGenerator
, **kwargs
)
263 fetchNextSiblings
= findNextSiblings
# Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that appears
    before this element in the document."""
    finder = self.findAllPrevious
    return self._findOne(finder, name, attrs, text, **kwargs)
270 def findAllPrevious(self
, name
=None, attrs
={}, text
=None, limit
=None,
272 """Returns all items that match the given criteria and appear
273 before this Tag in the document."""
274 return self
._findAll
(name
, attrs
, text
, limit
, self
.previousGenerator
,
276 fetchPrevious
= findAllPrevious
# Compatibility with pre-3.x
278 def findPreviousSibling(self
, name
=None, attrs
={}, text
=None, **kwargs
):
279 """Returns the closest sibling to this Tag that matches the
280 given criteria and appears before this Tag in the document."""
281 return self
._findOne
(self
.findPreviousSiblings
, name
, attrs
, text
,
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return every earlier sibling of this element that matches the
    given criteria."""
    generator = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, generator, **kwargs)
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
292 def findParent(self
, name
=None, attrs
={}, **kwargs
):
293 """Returns the closest parent of this Tag that matches the given
295 # NOTE: We can't use _findOne because findParents takes a different
298 l
= self
.findParents(name
, attrs
, 1)
303 def findParents(self
, name
=None, attrs
={}, limit
=None, **kwargs
):
304 """Returns the parents of this Tag that match the given
307 return self
._findAll
(name
, attrs
, None, limit
, self
.parentGenerator
,
309 fetchParents
= findParents
# Compatibility with pre-3.x
311 #These methods do the real heavy lifting.
313 def _findOne(self
, method
, name
, attrs
, text
, **kwargs
):
315 l
= method(name
, attrs
, text
, 1, **kwargs
)
320 def _findAll(self
, name
, attrs
, text
, limit
, generator
, **kwargs
):
321 "Iterates over a generator looking for things that match."
323 if isinstance(name
, SoupStrainer
):
326 # Build a SoupStrainer
327 strainer
= SoupStrainer(name
, attrs
, text
, **kwargs
)
328 results
= ResultSet(strainer
)
333 except StopIteration:
336 found
= strainer
.search(i
)
338 results
.append(found
)
339 if limit
and len(results
) >= limit
:
343 #These Generators can be used to navigate starting from both
344 #NavigableStrings and Tags.
345 def nextGenerator(self
):
351 def nextSiblingGenerator(self
):
357 def previousGenerator(self
):
363 def previousSiblingGenerator(self
):
366 i
= i
.previousSibling
369 def parentGenerator(self
):
def substituteEncoding(self, str, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in *str* with the given
    encoding name, falling back to "utf-8" when none is given."""
    if not encoding:
        encoding = "utf-8"
    return str.replace("%SOUP-ENCODING%", encoding)
380 def toEncoding(self
, s
, encoding
=None):
381 """Encodes an object to a string in some encoding, or to Unicode.
383 if isinstance(s
, unicode):
385 s
= s
.encode(encoding
)
386 elif isinstance(s
, str):
388 s
= s
.encode(encoding
)
393 s
= self
.toEncoding(str(s
), encoding
)
398 class NavigableString(unicode, PageElement
):
def __new__(cls, value):
    """Create a new NavigableString.

    When unpickling a NavigableString, this method is called with
    the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
    passed in to the superclass's __new__ or the superclass won't know
    how to handle non-ASCII characters.
    """
    if not isinstance(value, unicode):
        # A byte string: let the superclass decode it with the default.
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
    return unicode.__new__(cls, value)
def __getnewargs__(self):
    """Pickle support: rebuild this object from its encoded byte form."""
    encoded = NavigableString.__str__(self)
    return (encoded,)
415 def __getattr__(self
, attr
):
416 """text.string gives you text. This is for backwards
417 compatibility for Navigable*String, but for CData* it lets you
418 get the string without the CData wrapper."""
422 raise AttributeError, "'%s' object has no attribute '%s'" % (self
.__class
__.__name
__, attr
)
def __unicode__(self):
    """Return this string as Unicode by decoding its byte form with the
    module default encoding."""
    encoded = str(self)
    return encoded.decode(DEFAULT_OUTPUT_ENCODING)
427 def __str__(self
, encoding
=DEFAULT_OUTPUT_ENCODING
):
429 return self
.encode(encoding
)
class CData(NavigableString):
    """A CDATA section; renders its text wrapped in <![CDATA[...]]>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % inner
438 class ProcessingInstruction(NavigableString
):
439 def __str__(self
, encoding
=DEFAULT_OUTPUT_ENCODING
):
441 if "%SOUP-ENCODING%" in output
:
442 output
= self
.substituteEncoding(output
, encoding
)
443 return "<?%s?>" % self
.toEncoding(output
, encoding
)
class Comment(NavigableString):
    """A comment; renders its text wrapped in <!--...-->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % inner
class Declaration(NavigableString):
    """A declaration (e.g. a doctype); renders its text wrapped in <!...>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        inner = NavigableString.__str__(self, encoding)
        return "<!%s>" % inner
453 class Tag(PageElement
):
455 """Represents a found HTML tag with its attributes and contents."""
458 "Cheap function to invert a hash."
460 for k
,v
in h
.items():
464 XML_ENTITIES_TO_SPECIAL_CHARS
= { "apos" : "'",
470 XML_SPECIAL_CHARS_TO_ENTITIES
= _invert(XML_ENTITIES_TO_SPECIAL_CHARS
)
472 def _convertEntities(self
, match
):
473 """Used in a call to re.sub to replace HTML, XML, and numeric
474 entities with the appropriate Unicode characters. If HTML
475 entities are being converted, any unrecognized entities are
478 if self
.convertHTMLEntities
and x
in name2codepoint
:
479 return unichr(name2codepoint
[x
])
480 elif x
in self
.XML_ENTITIES_TO_SPECIAL_CHARS
:
481 if self
.convertXMLEntities
:
482 return self
.XML_ENTITIES_TO_SPECIAL_CHARS
[x
]
485 elif len(x
) > 0 and x
[0] == '#':
486 # Handle numeric entities
487 if len(x
) > 1 and x
[1] == 'x':
488 return unichr(int(x
[2:], 16))
490 return unichr(int(x
[1:]))
492 elif self
.escapeUnrecognizedEntities
:
493 return u
'&%s;' % x
497 def __init__(self
, parser
, name
, attrs
=None, parent
=None,
501 # We don't actually store the parser object: that lets extracted
502 # chunks be garbage-collected
503 self
.parserClass
= parser
.__class
__
504 self
.isSelfClosing
= parser
.isSelfClosingTag(name
)
510 self
.setup(parent
, previous
)
512 self
.containsSubstitutions
= False
513 self
.convertHTMLEntities
= parser
.convertHTMLEntities
514 self
.convertXMLEntities
= parser
.convertXMLEntities
515 self
.escapeUnrecognizedEntities
= parser
.escapeUnrecognizedEntities
517 # Convert any HTML, XML, or numeric entities in the attribute values.
518 convert
= lambda(k
, val
): (k
,
519 re
.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
520 self
._convertEntities
,
522 self
.attrs
= map(convert
, self
.attrs
)
def get(self, key, default=None):
    """Return the value of the 'key' attribute for this tag, or *default*
    if the tag has no such attribute."""
    attr_map = self._getAttrMap()
    return attr_map.get(key, default)
def has_key(self, key):
    """Return true if this tag has an attribute named *key*."""
    attr_map = self._getAttrMap()
    return attr_map.has_key(key)
533 def __getitem__(self
, key
):
534 """tag[key] returns the value of the 'key' attribute for the tag,
535 and throws an exception if it's not there."""
536 return self
._getAttrMap
()[key
]
539 "Iterating over a tag iterates over its contents."
540 return iter(self
.contents
)
543 "The length of a tag is the length of its list of contents."
544 return len(self
.contents
)
546 def __contains__(self
, x
):
547 return x
in self
.contents
549 def __nonzero__(self
):
550 "A tag is non-None even if it has no contents."
553 def __setitem__(self
, key
, value
):
554 """Setting tag[key] sets the value of the 'key' attribute for the
557 self
.attrMap
[key
] = value
559 for i
in range(0, len(self
.attrs
)):
560 if self
.attrs
[i
][0] == key
:
561 self
.attrs
[i
] = (key
, value
)
564 self
.attrs
.append((key
, value
))
565 self
._getAttrMap
()[key
] = value
567 def __delitem__(self
, key
):
568 "Deleting tag[key] deletes all 'key' attributes for the tag."
569 for item
in self
.attrs
:
571 self
.attrs
.remove(item
)
572 #We don't break because bad HTML can define the same
573 #attribute multiple times.
575 if self
.attrMap
.has_key(key
):
576 del self
.attrMap
[key
]
578 def __call__(self
, *args
, **kwargs
):
579 """Calling a tag like a function is the same as calling its
580 findAll() method. Eg. tag('a') returns a list of all the A tags
581 found within this tag."""
582 return apply(self
.findAll
, args
, kwargs
)
584 def __getattr__(self
, tag
):
585 #print "Getattr %s.%s" % (self.__class__, tag)
586 if len(tag
) > 3 and tag
.rfind('Tag') == len(tag
)-3:
587 return self
.find(tag
[:-3])
588 elif tag
.find('__') != 0:
589 return self
.find(tag
)
590 raise AttributeError, "'%s' object has no attribute '%s'" % (self
.__class
__, tag
)
592 def __eq__(self
, other
):
593 """Returns true iff this tag has the same name, the same attributes,
594 and the same contents (recursively) as the given tag.
596 NOTE: right now this will return false if two tags have the
597 same attributes in a different order. Should this be fixed?"""
598 if not hasattr(other
, 'name') or not hasattr(other
, 'attrs') or not hasattr(other
, 'contents') or self
.name
!= other
.name
or self
.attrs
!= other
.attrs
or len(self
) != len(other
):
600 for i
in range(0, len(self
.contents
)):
601 if self
.contents
[i
] != other
.contents
[i
]:
605 def __ne__(self
, other
):
606 """Returns true iff this tag is not identical to the other tag,
607 as defined in __eq__."""
608 return not self
== other
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Renders this tag as an encoded string via __str__."""
    rendered = self.__str__(encoding)
    return rendered
614 def __unicode__(self
):
615 return self
.__str
__(None)
617 BARE_AMPERSAND_OR_BRACKET
= re
.compile("([<>]|"
618 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
621 def _sub_entity(self
, x
):
622 """Used with a regular expression to substitute the
623 appropriate XML entity for an XML special character."""
624 return "&" + self
.XML_SPECIAL_CHARS_TO_ENTITIES
[x
.group(0)[0]] + ";"
626 def __str__(self
, encoding
=DEFAULT_OUTPUT_ENCODING
,
627 prettyPrint
=False, indentLevel
=0):
628 """Returns a string or Unicode representation of this tag and
629 its contents. To get Unicode, pass None for encoding.
631 NOTE: since Python's HTML parser consumes whitespace, this
632 method is not certain to reproduce the whitespace present in
633 the original string."""
635 encodedName
= self
.toEncoding(self
.name
, encoding
)
639 for key
, val
in self
.attrs
:
642 if self
.containsSubstitutions
and '%SOUP-ENCODING%' in val
:
643 val
= self
.substituteEncoding(val
, encoding
)
645 # The attribute value either:
647 # * Contains no embedded double quotes or single quotes.
648 # No problem: we enclose it in double quotes.
649 # * Contains embedded single quotes. No problem:
650 # double quotes work here too.
651 # * Contains embedded double quotes. No problem:
652 # we enclose it in single quotes.
653 # * Embeds both single _and_ double quotes. This
654 # can't happen naturally, but it can happen if
655 # you modify an attribute value after parsing
656 # the document. Now we have a bit of a
657 # problem. We solve it by enclosing the
658 # attribute in single quotes, and escaping any
659 # embedded single quotes to XML entities.
663 # TODO: replace with apos when
665 val
= val
.replace("'", "&squot;")
667 # Now we're okay w/r/t quotes. But the attribute
668 # value might also contain angle brackets, or
669 # ampersands that aren't part of entities. We need
670 # to escape those to XML entities too.
671 val
= self
.BARE_AMPERSAND_OR_BRACKET
.sub(self
._sub
_entity
, val
)
673 attrs
.append(fmt
% (self
.toEncoding(key
, encoding
),
674 self
.toEncoding(val
, encoding
)))
677 if self
.isSelfClosing
:
680 closeTag
= '</%s>' % encodedName
682 indentTag
, indentContents
= 0, 0
684 indentTag
= indentLevel
685 space
= (' ' * (indentTag
-1))
686 indentContents
= indentTag
+ 1
687 contents
= self
.renderContents(encoding
, prettyPrint
, indentContents
)
694 attributeString
= ' ' + ' '.join(attrs
)
697 s
.append('<%s%s%s>' % (encodedName
, attributeString
, close
))
701 if prettyPrint
and contents
and contents
[-1] != "\n":
703 if prettyPrint
and closeTag
:
706 if prettyPrint
and closeTag
and self
.nextSibling
:
712 """Recursively destroys the contents of this tree."""
713 contents
= [i
for i
in self
.contents
]
715 if isinstance(i
, Tag
):
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Return this tag rendered with pretty-printing turned on."""
    return self.__str__(encoding, True)
724 def renderContents(self
, encoding
=DEFAULT_OUTPUT_ENCODING
,
725 prettyPrint
=False, indentLevel
=0):
726 """Renders the contents of this tag as a string in the given
727 encoding. If encoding is None, returns a Unicode string.."""
731 if isinstance(c
, NavigableString
):
732 text
= c
.__str
__(encoding
)
733 elif isinstance(c
, Tag
):
734 s
.append(c
.__str
__(encoding
, prettyPrint
, indentLevel
))
735 if text
and prettyPrint
:
739 s
.append(" " * (indentLevel
-1))
747 def find(self
, name
=None, attrs
={}, recursive
=True, text
=None,
749 """Return only the first child of this Tag matching the given
752 l
= self
.findAll(name
, attrs
, recursive
, text
, 1, **kwargs
)
758 def findAll(self
, name
=None, attrs
={}, recursive
=True, text
=None,
759 limit
=None, **kwargs
):
760 """Extracts a list of Tag objects that match the given
761 criteria. You can specify the name of the Tag and any
762 attributes you want the Tag to have.
764 The value of a key-value pair in the 'attrs' map can be a
765 string, a list of strings, a regular expression object, or a
766 callable that takes a string and returns whether or not the
767 string matches for some custom definition of 'matches'. The
768 same is true of the tag name."""
769 generator
= self
.recursiveChildGenerator
771 generator
= self
.childGenerator
772 return self
._findAll
(name
, attrs
, text
, limit
, generator
, **kwargs
)
773 findChildren
= findAll
775 # Pre-3.x compatibility methods
def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x compatibility alias: findAll() restricted to text matching."""
    return self.findAll(text=text, recursive=recursive, limit=limit)
def firstText(self, text=None, recursive=True):
    """Pre-3.x compatibility alias: find() restricted to text matching."""
    return self.find(text=text, recursive=recursive)
787 def _getAttrMap(self
):
788 """Initializes a map representation of this tag's attributes,
789 if not already initialized."""
790 if not getattr(self
, 'attrMap'):
792 for (key
, value
) in self
.attrs
:
793 self
.attrMap
[key
] = value
def childGenerator(self):
    """Yields this tag's direct children in document order.

    Iterates by index (length captured up front) rather than over the
    list object itself, preserving the original's behavior if contents
    are mutated mid-iteration.
    """
    for index in range(0, len(self.contents)):
        yield self.contents[index]
802 def recursiveChildGenerator(self
):
805 tag
, start
= stack
.pop()
806 if isinstance(tag
, Tag
):
807 for i
in range(start
, len(tag
.contents
)):
810 if isinstance(a
, Tag
) and tag
.contents
:
811 if i
< len(tag
.contents
) - 1:
812 stack
.append((tag
, i
+1))
817 # Next, a couple classes to represent queries and their results.
819 """Encapsulates a number of ways of matching a markup element (tag or
822 def __init__(self
, name
=None, attrs
={}, text
=None, **kwargs
):
825 kwargs
['class'] = attrs
840 return "%s|%s" % (self
.name
, self
.attrs
)
842 def searchTag(self
, markupName
=None, markupAttrs
={}):
845 if isinstance(markupName
, Tag
):
848 callFunctionWithTagData
= callable(self
.name
) \
849 and not isinstance(markupName
, Tag
)
852 or callFunctionWithTagData \
853 or (markup
and self
._matches
(markup
, self
.name
)) \
854 or (not markup
and self
._matches
(markupName
, self
.name
)):
855 if callFunctionWithTagData
:
856 match
= self
.name(markupName
, markupAttrs
)
860 for attr
, matchAgainst
in self
.attrs
.items():
861 if not markupAttrMap
:
862 if hasattr(markupAttrs
, 'get'):
863 markupAttrMap
= markupAttrs
866 for k
,v
in markupAttrs
:
868 attrValue
= markupAttrMap
.get(attr
)
869 if not self
._matches
(attrValue
, matchAgainst
):
879 def search(self
, markup
):
880 #print 'looking for %s in %s' % (self, markup)
882 # If given a list of items, scan it for a text element that
884 if isList(markup
) and not isinstance(markup
, Tag
):
885 for element
in markup
:
886 if isinstance(element
, NavigableString
) \
887 and self
.search(element
):
890 # If it's a Tag, make sure its name or attributes match.
891 # Don't bother with Tags if we're searching for text.
892 elif isinstance(markup
, Tag
):
894 found
= self
.searchTag(markup
)
895 # If it's text, make sure the text matches.
896 elif isinstance(markup
, NavigableString
) or \
898 if self
._matches
(markup
, self
.text
):
901 raise Exception, "I don't know how to match against a %s" \
905 def _matches(self
, markup
, matchAgainst
):
906 #print "Matching %s against %s" % (markup, matchAgainst)
908 if matchAgainst
== True and type(matchAgainst
) == types
.BooleanType
:
909 result
= markup
!= None
910 elif callable(matchAgainst
):
911 result
= matchAgainst(markup
)
913 #Custom match methods take the tag as an argument, but all
914 #other ways of matching match the tag name as a string.
915 if isinstance(markup
, Tag
):
917 if markup
and not isString(markup
):
918 markup
= unicode(markup
)
919 #Now we know that chunk is either a string, or None.
920 if hasattr(matchAgainst
, 'match'):
921 # It's a regexp object.
922 result
= markup
and matchAgainst
.search(markup
)
923 elif isList(matchAgainst
):
924 result
= markup
in matchAgainst
925 elif hasattr(matchAgainst
, 'items'):
926 result
= markup
.has_key(matchAgainst
)
927 elif matchAgainst
and isString(markup
):
928 if isinstance(markup
, unicode):
929 matchAgainst
= unicode(matchAgainst
)
931 matchAgainst
= str(matchAgainst
)
934 result
= matchAgainst
== markup
937 class ResultSet(list):
938 """A ResultSet is just a list that keeps track of the SoupStrainer
940 def __init__(self
, source
):
944 # Now, some helper functions.
947 """Convenience method that works with all 2.x versions of Python
948 to determine whether or not something is listlike."""
949 return hasattr(l
, '__iter__') \
950 or (type(l
) in (types
.ListType
, types
.TupleType
))
953 """Convenience method that works with all 2.x versions of Python
954 to determine whether or not something is stringlike."""
956 return isinstance(s
, unicode) or isinstance(s
, basestring
)
958 return isinstance(s
, str)
960 def buildTagMap(default
, *args
):
961 """Turns a list of maps, lists, or scalars into a single map.
962 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
963 NESTING_RESET_TAGS maps out of lists and partial maps."""
966 if hasattr(portion
, 'items'):
967 #It's a map. Merge it.
968 for k
,v
in portion
.items():
970 elif isList(portion
):
971 #It's a list. Map each item to the default.
975 #It's a scalar. Map it to the default.
976 built
[portion
] = default
979 # Now, the parser classes.
981 class BeautifulStoneSoup(Tag
, SGMLParser
):
983 """This class contains the basic parser and search code. It defines
984 a parser that knows nothing about tag behavior except for the
987 You can't close a tag without closing all the tags it encloses.
988 That is, "<foo><bar></foo>" actually means
989 "<foo><bar></bar></foo>".
991 [Another possible explanation is "<foo><bar /></foo>", but since
992 this class defines no SELF_CLOSING_TAGS, it will never use that
995 This class is useful for parsing XML or made-up markup languages,
996 or when BeautifulSoup makes an assumption counter to what you were
999 SELF_CLOSING_TAGS
= {}
1001 RESET_NESTING_TAGS
= {}
1003 PRESERVE_WHITESPACE_TAGS
= []
1005 MARKUP_MASSAGE
= [(re
.compile('(<[^<>]*)/>'),
1006 lambda x
: x
.group(1) + ' />'),
1007 (re
.compile('<!\s+([^<>]*)>'),
1008 lambda x
: '<!' + x
.group(1) + '>')
1011 ROOT_TAG_NAME
= u
'[document]'
1013 HTML_ENTITIES
= "html"
1014 XML_ENTITIES
= "xml"
1015 XHTML_ENTITIES
= "xhtml"
1016 # TODO: This only exists for backwards-compatibility
1017 ALL_ENTITIES
= XHTML_ENTITIES
1019 # Used when determining whether a text node is all whitespace and
1020 # can be replaced with a single space. A text node that contains
1021 # fancy Unicode spaces (usually non-breaking) should be left
1023 STRIP_ASCII_SPACES
= { 9: None, 10: None, 12: None, 13: None, 32: None, }
1025 def __init__(self
, markup
="", parseOnlyThese
=None, fromEncoding
=None,
1026 markupMassage
=True, smartQuotesTo
=XML_ENTITIES
,
1027 convertEntities
=None, selfClosingTags
=None, isHTML
=False):
1028 """The Soup object is initialized as the 'root tag', and the
1029 provided markup (which can be a string or a file-like object)
1030 is fed into the underlying parser.
1032 sgmllib will process most bad HTML, and the BeautifulSoup
1033 class has some tricks for dealing with some HTML that kills
1034 sgmllib, but Beautiful Soup can nonetheless choke or lose data
1035 if your data uses self-closing tags or declarations
1038 By default, Beautiful Soup uses regexes to sanitize input,
1039 avoiding the vast majority of these problems. If the problems
1040 don't apply to you, pass in False for markupMassage, and
1041 you'll get better performance.
1043 The default parser massage techniques fix the two most common
1044 instances of invalid HTML that choke sgmllib:
1046 <br/> (No space between name of closing tag and tag close)
1047 <! --Comment--> (Extraneous whitespace in declaration)
1049 You can pass in a custom list of (RE object, replace method)
1050 tuples to get Beautiful Soup to scrub your input the way you
1053 self
.parseOnlyThese
= parseOnlyThese
1054 self
.fromEncoding
= fromEncoding
1055 self
.smartQuotesTo
= smartQuotesTo
1056 self
.convertEntities
= convertEntities
1057 # Set the rules for how we'll deal with the entities we
1059 if self
.convertEntities
:
1060 # It doesn't make sense to convert encoded characters to
1061 # entities even while you're converting entities to Unicode.
1062 # Just convert it all to Unicode.
1063 self
.smartQuotesTo
= None
1064 if convertEntities
== self
.HTML_ENTITIES
:
1065 self
.convertXMLEntities
= False
1066 self
.convertHTMLEntities
= True
1067 self
.escapeUnrecognizedEntities
= True
1068 elif convertEntities
== self
.XHTML_ENTITIES
:
1069 self
.convertXMLEntities
= True
1070 self
.convertHTMLEntities
= True
1071 self
.escapeUnrecognizedEntities
= False
1072 elif convertEntities
== self
.XML_ENTITIES
:
1073 self
.convertXMLEntities
= True
1074 self
.convertHTMLEntities
= False
1075 self
.escapeUnrecognizedEntities
= False
1077 self
.convertXMLEntities
= False
1078 self
.convertHTMLEntities
= False
1079 self
.escapeUnrecognizedEntities
= False
1081 self
.instanceSelfClosingTags
= buildTagMap(None, selfClosingTags
)
1082 SGMLParser
.__init
__(self
)
1084 if hasattr(markup
, 'read'): # It's a file-type object.
1085 markup
= markup
.read()
1086 self
.markup
= markup
1087 self
.markupMassage
= markupMassage
1089 self
._feed
(isHTML
=isHTML
)
1092 self
.markup
= None # The markup can now be GCed
def convert_charref(self, name):
    """This method fixes a bug in Python's SGMLParser."""
    try:
        codepoint = int(name)
    except ValueError:
        return
    # Only plain ASCII references are converted; ASCII ends at 127, not 255.
    if codepoint < 0 or codepoint > 127:
        return
    return self.convert_codepoint(codepoint)
def _feed(self, inDocumentEncoding=None, isHTML=False):
    """Runs the markup through UnicodeDammit and the massage regexes,
    then feeds it to the underlying SGMLParser and closes any tags
    left open at end of input."""
    # Convert the document to Unicode.
    markup = self.markup
    if isinstance(markup, unicode):
        if not hasattr(self, 'originalEncoding'):
            self.originalEncoding = None
    else:
        dammit = UnicodeDammit\
                 (markup, [self.fromEncoding, inDocumentEncoding],
                  smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
        markup = dammit.unicode
        self.originalEncoding = dammit.originalEncoding
        self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
    if markup:
        if self.markupMassage:
            if not isList(self.markupMassage):
                self.markupMassage = self.MARKUP_MASSAGE
            for pattern, replacement in self.markupMassage:
                markup = pattern.sub(replacement, markup)
            # TODO: We get rid of markupMassage so that the
            # soup object can be deepcopied later on. Some
            # Python installations can't copy regexes. If anyone
            # was relying on the existence of markupMassage, this
            # might cause problems.
            del(self.markupMassage)
    self.reset()

    SGMLParser.feed(self, markup)
    # Close out any unfinished strings and close all the open tags.
    self.endData()
    while self.currentTag.name != self.ROOT_TAG_NAME:
        self.popTag()
def __getattr__(self, methodName):
    """This method routes method call requests to either the SGMLParser
    superclass or the Tag superclass, depending on the method name."""
    #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

    # Parser event handlers belong to SGMLParser; dunder names are
    # never forwarded; everything else is treated as tree navigation.
    if methodName.startswith('start_') or methodName.startswith('end_') \
           or methodName.startswith('do_'):
        return SGMLParser.__getattr__(self, methodName)
    if not methodName.startswith('__'):
        return Tag.__getattr__(self, methodName)
    raise AttributeError
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser."""
    # Check the class-level table first, then the per-instance one
    # supplied to the constructor.
    if name in self.SELF_CLOSING_TAGS:
        return True
    return name in self.instanceSelfClosingTags
def reset(self):
    """Resets the parser to a pristine state: an empty tag stack with
    this soup object itself as the root tag."""
    # The soup object doubles as the root Tag of the tree.
    Tag.__init__(self, self, self.ROOT_TAG_NAME)
    self.hidden = 1
    SGMLParser.reset(self)
    self.currentData = []
    self.currentTag = None
    self.tagStack = []
    self.quoteStack = []
    self.pushTag(self)
def popTag(self):
    """Pops the topmost tag off the stack and returns the new current
    tag."""
    tag = self.tagStack.pop()
    # Tags with just one string-owning child get the child as a
    # 'string' property, so that soup.tag.string is shorthand for
    # soup.tag.contents[0]
    if len(self.currentTag.contents) == 1 and \
       isinstance(self.currentTag.contents[0], NavigableString):
        self.currentTag.string = self.currentTag.contents[0]

    #print "Pop", tag.name
    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
def pushTag(self, tag):
    """Makes the given tag the new current tag, attaching it to the
    previous current tag's contents."""
    #print "Push", tag.name
    if self.currentTag:
        self.currentTag.contents.append(tag)
    self.tagStack.append(tag)
    self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
    """Flushes any buffered character data, wrapping it in an instance
    of containerClass and attaching it to the tree at the current tag."""
    if self.currentData:
        currentData = u''.join(self.currentData)
        if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
            not set([tag.name for tag in self.tagStack]).intersection(
                self.PRESERVE_WHITESPACE_TAGS)):
            # Pure whitespace outside of a whitespace-preserving tag
            # collapses to a single newline or space.
            if '\n' in currentData:
                currentData = '\n'
            else:
                currentData = ' '
        self.currentData = []
        # Under a SoupStrainer, top-level text that doesn't match is
        # simply dropped.
        if self.parseOnlyThese and len(self.tagStack) <= 1 and \
               (not self.parseOnlyThese.text or \
                not self.parseOnlyThese.search(currentData)):
            return
        o = containerClass(currentData)
        o.setup(self.currentTag, self.previous)
        if self.previous:
            self.previous.next = o
        self.previous = o
        self.currentTag.contents.append(o)
1210 def _popToTag(self
, name
, inclusivePop
=True):
1211 """Pops the tag stack up to and including the most recent
1212 instance of the given tag. If inclusivePop is false, pops the tag
1213 stack up to but *not* including the most recent instqance of
1215 #print "Popping to %s" % name
1216 if name
== self
.ROOT_TAG_NAME
:
1220 mostRecentTag
= None
1221 for i
in range(len(self
.tagStack
)-1, 0, -1):
1222 if name
== self
.tagStack
[i
].name
:
1223 numPops
= len(self
.tagStack
)-i
1225 if not inclusivePop
:
1226 numPops
= numPops
- 1
1228 for i
in range(0, numPops
):
1229 mostRecentTag
= self
.popTag()
1230 return mostRecentTag
def _smartPop(self, name):

    """We need to pop up to the previous tag of this type, unless
    one of this tag's nesting reset triggers comes between this
    tag and the previous tag of this type, OR unless this tag is a
    generic nesting trigger and another generic nesting trigger
    comes between this tag and the previous tag of this type.

    Examples:
     <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
     <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
     <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

     <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
     <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
     <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
    """

    nestingResetTriggers = self.NESTABLE_TAGS.get(name)
    isNestable = nestingResetTriggers != None
    isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
    popTo = None
    inclusive = True
    for i in range(len(self.tagStack)-1, 0, -1):
        p = self.tagStack[i]
        if (not p or p.name == name) and not isNestable:
            #Non-nestable tags get popped to the top or to their
            #first parental instance.
            popTo = name
            break
        if (nestingResetTriggers != None
            and p.name in nestingResetTriggers) \
            or (nestingResetTriggers == None and isResetNesting
                and self.RESET_NESTING_TAGS.has_key(p.name)):

            #If we encounter one of the nesting reset triggers
            #peculiar to this tag, or we encounter another tag
            #that causes nesting to reset, pop up to but not
            #including that tag.
            popTo = p.name
            inclusive = False
            break
        p = p.parent
    self._popToTag(popTo, inclusive)
def unknown_starttag(self, name, attrs, selfClosing=0):
    """Creates a Tag for an opening tag, first popping the stack as
    dictated by the nesting rules.  Inside a quoted (literal) section
    the tag text is treated as plain character data."""
    #print "Start tag %s: %s" % (name, attrs)
    if self.quoteStack:
        #This is not a real tag.
        #print "<%s> is not real!" % name
        attrs = ''.join(map(lambda pair: ' %s="%s"' % pair, attrs))
        self.handle_data('<%s%s>' % (name, attrs))
        return
    self.endData()

    if not self.isSelfClosingTag(name) and not selfClosing:
        self._smartPop(name)

    # Under a SoupStrainer, top-level tags that don't match are skipped.
    if self.parseOnlyThese and len(self.tagStack) <= 1 \
           and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
        return

    tag = Tag(self, name, attrs, self.currentTag, self.previous)
    if self.previous:
        self.previous.next = tag
    self.previous = tag
    self.pushTag(tag)
    if selfClosing or self.isSelfClosingTag(name):
        self.popTag()
    if name in self.QUOTE_TAGS:
        #print "Beginning quote (%s)" % name
        self.quoteStack.append(name)
        self.literal = 1
    return tag
def unknown_endtag(self, name):
    """Closes a tag, unless we are inside a quoted (literal) section
    opened by some other tag, in which case the end tag is character
    data."""
    #print "End tag %s" % name
    if self.quoteStack and self.quoteStack[-1] != name:
        #This is not a real end tag.
        #print "</%s> is not real!" % name
        self.handle_data('</%s>' % name)
        return
    self.endData()
    self._popToTag(name)
    if self.quoteStack and self.quoteStack[-1] == name:
        self.quoteStack.pop()
        self.literal = (len(self.quoteStack) > 0)
def handle_data(self, data):
    # Buffer character data; endData() later joins the buffered chunks
    # into a single string node and attaches it to the tree.
    self.currentData.append(data)
1324 def _toStringSubclass(self
, text
, subclass
):
1325 """Adds a certain piece of text to the tree as a NavigableString
1328 self
.handle_data(text
)
1329 self
.endData(subclass
)
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later."""
    if text.startswith("xml"):
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
    """Handle comments as Comment objects."""
    self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
    """Handle character references as data."""
    if self.convertEntities:
        # Convert the numeric reference to the actual character.
        data = unichr(int(ref))
    else:
        # Pass the reference through untouched.
        data = '&#%s;' % ref
    self.handle_data(data)
def handle_entityref(self, ref):
    """Handle entity references as data, possibly converting known
    HTML and/or XML entity references to the corresponding Unicode
    characters."""
    data = None
    if self.convertHTMLEntities:
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            pass

    if not data and self.convertXMLEntities:
        data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

    if not data and self.convertHTMLEntities and \
        not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
        # TODO: We've got a problem here. We're told this is
        # an entity reference, but it's not an XML entity
        # reference or an HTML entity reference. Nonetheless,
        # the logical thing to do is to pass it through as an
        # unrecognized entity reference.
        #
        # Except: when the input is "&carol;" this function
        # will be called with input "carol". When the input is
        # "AT&T", this function will be called with input
        # "T". We have no way of knowing whether a semicolon
        # was present originally, so we don't know whether
        # this is an unknown entity or just a misplaced
        # ampersand.
        #
        # The more common case is a misplaced ampersand, so I
        # escape the ampersand and omit the trailing semicolon.
        data = "&%s" % ref
    if not data:
        # This case is different from the one above, because we
        # haven't already gone through a supposedly comprehensive
        # mapping of entities to Unicode characters. We might not
        # have gone through any mapping at all. So the chances are
        # very high that this is a real entity, and not a
        # misplaced ampersand.
        data = "&%s;" % ref
    self.handle_data(data)
def handle_decl(self, data):
    """Handle DOCTYPEs and the like as Declaration objects."""
    self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object."""
    j = None
    if self.rawdata[i:i+9] == '<![CDATA[':
        k = self.rawdata.find(']]>', i)
        if k == -1:
            # Unterminated CDATA section: take everything to the end.
            k = len(self.rawdata)
        data = self.rawdata[i+9:k]
        j = k + 3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            # A declaration sgmllib can't parse becomes plain data.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # Smart quotes become HTML entities unless the caller says
        # otherwise, and the markup is flagged as HTML for sniffing.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
class StopParsing(Exception):
    """Raised to abort parsing; used by start_meta after re-feeding the
    document with a newly discovered encoding."""
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the standard pop, mirror a single-string child tag as
        # an attribute of its parent (unless the parent already has an
        # attribute by that name).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1654 #Enterprise class names! It has come to our attention that some people
1655 #think the names of the Beautiful Soup parser classes are too silly
1656 #and "unprofessional" for use in enterprise screen-scraping. We feel
1657 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1658 #All-Night Kosher Bakery recommends renaming this file to
1659 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1660 #"RobustParserBeanInterface.class") and using the following
1661 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1673 ######################################################
1675 # Bonus library: Unicode, Dammit
1677 # This class forces XML data into a standard format (usually to UTF-8
1678 # or Unicode). It is heavily based on code from Mark Pilgrim's
1679 # Universal Feed Parser. It does not rewrite the XML or HTML to
1680 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1681 # (XML) and BeautifulSoup.start_meta (HTML).
1683 # Autodetects character encodings.
1684 # Download from http://chardet.feedparser.org/
1687 # import chardet.constants
1688 # chardet.constants._debug = 1
1692 # cjkcodecs and iconv_codec make Python know about more character encodings.
1693 # Both are available from http://cjkpython.i18n.org/
1694 # They're built in if you use Python 2.4.
1696 import cjkcodecs
.aliases
1704 class UnicodeDammit
:
1705 """A class for detecting the encoding of a *ML document and
1706 converting it to a Unicode string. If the source encoding is
1707 windows-1252, can replace MS smart quotes with their HTML or XML
1710 # This dictionary maps commonly seen values for "charset" in HTML
1711 # meta tags to the corresponding Python codec names. It only covers
1712 # values that aren't in Python's aliases and can't be determined
1713 # by the heuristics in find_codec.
1714 CHARSET_ALIASES
= { "macintosh" : "mac-roman",
1715 "x-sjis" : "shift-jis" }
1717 def __init__(self
, markup
, overrideEncodings
=[],
1718 smartQuotesTo
='xml', isHTML
=False):
1719 self
.declaredHTMLEncoding
= None
1720 self
.markup
, documentEncoding
, sniffedEncoding
= \
1721 self
._detectEncoding
(markup
, isHTML
)
1722 self
.smartQuotesTo
= smartQuotesTo
1723 self
.triedEncodings
= []
1724 if markup
== '' or isinstance(markup
, unicode):
1725 self
.originalEncoding
= None
1726 self
.unicode = unicode(markup
)
1730 for proposedEncoding
in overrideEncodings
:
1731 u
= self
._convertFrom
(proposedEncoding
)
1734 for proposedEncoding
in (documentEncoding
, sniffedEncoding
):
1735 u
= self
._convertFrom
(proposedEncoding
)
1738 # If no luck and we have auto-detection library, try that:
1739 if not u
and chardet
and not isinstance(self
.markup
, unicode):
1740 u
= self
._convertFrom
(chardet
.detect(self
.markup
)['encoding'])
1742 # As a last resort, try utf-8 and windows-1252:
1744 for proposed_encoding
in ("utf-8", "windows-1252"):
1745 u
= self
._convertFrom
(proposed_encoding
)
1749 if not u
: self
.originalEncoding
= None
1751 def _subMSChar(self
, orig
):
1752 """Changes a MS smart quote character to an XML or HTML
1754 sub
= self
.MS_CHARS
.get(orig
)
1755 if type(sub
) == types
.TupleType
:
1756 if self
.smartQuotesTo
== 'xml':
1757 sub
= '&#x%s;' % sub
[1]
1759 sub
= '&%s;' % sub
[0]
1762 def _convertFrom(self
, proposed
):
1763 proposed
= self
.find_codec(proposed
)
1764 if not proposed
or proposed
in self
.triedEncodings
:
1766 self
.triedEncodings
.append(proposed
)
1767 markup
= self
.markup
1769 # Convert smart quotes to HTML if coming from an encoding
1770 # that might have them.
1771 if self
.smartQuotesTo
and proposed
.lower() in("windows-1252",
1774 markup
= re
.compile("([\x80-\x9f])").sub \
1775 (lambda(x
): self
._subMSChar
(x
.group(1)),
1779 # print "Trying to convert document to %s" % proposed
1780 u
= self
._toUnicode
(markup
, proposed
)
1782 self
.originalEncoding
= proposed
1783 except Exception, e
:
1784 # print "That didn't work!"
1787 #print "Correct encoding: %s" % proposed
1790 def _toUnicode(self
, data
, encoding
):
1791 '''Given a string and its encoding, decodes the string into Unicode.
1792 %encoding is a string recognized by encodings.aliases'''
1794 # strip Byte Order Mark (if present)
1795 if (len(data
) >= 4) and (data
[:2] == '\xfe\xff') \
1796 and (data
[2:4] != '\x00\x00'):
1797 encoding
= 'utf-16be'
1799 elif (len(data
) >= 4) and (data
[:2] == '\xff\xfe') \
1800 and (data
[2:4] != '\x00\x00'):
1801 encoding
= 'utf-16le'
1803 elif data
[:3] == '\xef\xbb\xbf':
1806 elif data
[:4] == '\x00\x00\xfe\xff':
1807 encoding
= 'utf-32be'
1809 elif data
[:4] == '\xff\xfe\x00\x00':
1810 encoding
= 'utf-32le'
1812 newdata
= unicode(data
, encoding
)
1815 def _detectEncoding(self
, xml_data
, isHTML
=False):
1816 """Given a document, tries to detect its XML encoding."""
1817 xml_encoding
= sniffed_xml_encoding
= None
1819 if xml_data
[:4] == '\x4c\x6f\xa7\x94':
1821 xml_data
= self
._ebcdic
_to
_ascii
(xml_data
)
1822 elif xml_data
[:4] == '\x00\x3c\x00\x3f':
1824 sniffed_xml_encoding
= 'utf-16be'
1825 xml_data
= unicode(xml_data
, 'utf-16be').encode('utf-8')
1826 elif (len(xml_data
) >= 4) and (xml_data
[:2] == '\xfe\xff') \
1827 and (xml_data
[2:4] != '\x00\x00'):
1829 sniffed_xml_encoding
= 'utf-16be'
1830 xml_data
= unicode(xml_data
[2:], 'utf-16be').encode('utf-8')
1831 elif xml_data
[:4] == '\x3c\x00\x3f\x00':
1833 sniffed_xml_encoding
= 'utf-16le'
1834 xml_data
= unicode(xml_data
, 'utf-16le').encode('utf-8')
1835 elif (len(xml_data
) >= 4) and (xml_data
[:2] == '\xff\xfe') and \
1836 (xml_data
[2:4] != '\x00\x00'):
1838 sniffed_xml_encoding
= 'utf-16le'
1839 xml_data
= unicode(xml_data
[2:], 'utf-16le').encode('utf-8')
1840 elif xml_data
[:4] == '\x00\x00\x00\x3c':
1842 sniffed_xml_encoding
= 'utf-32be'
1843 xml_data
= unicode(xml_data
, 'utf-32be').encode('utf-8')
1844 elif xml_data
[:4] == '\x3c\x00\x00\x00':
1846 sniffed_xml_encoding
= 'utf-32le'
1847 xml_data
= unicode(xml_data
, 'utf-32le').encode('utf-8')
1848 elif xml_data
[:4] == '\x00\x00\xfe\xff':
1850 sniffed_xml_encoding
= 'utf-32be'
1851 xml_data
= unicode(xml_data
[4:], 'utf-32be').encode('utf-8')
1852 elif xml_data
[:4] == '\xff\xfe\x00\x00':
1854 sniffed_xml_encoding
= 'utf-32le'
1855 xml_data
= unicode(xml_data
[4:], 'utf-32le').encode('utf-8')
1856 elif xml_data
[:3] == '\xef\xbb\xbf':
1858 sniffed_xml_encoding
= 'utf-8'
1859 xml_data
= unicode(xml_data
[3:], 'utf-8').encode('utf-8')
1861 sniffed_xml_encoding
= 'ascii'
1864 xml_encoding_match
= None
1865 xml_encoding_match
= re
.compile(
1866 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data
)
1867 if not xml_encoding_match
and isHTML
:
1868 regexp
= re
.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re
.I
)
1869 xml_encoding_match
= regexp
.search(xml_data
)
1870 if xml_encoding_match
is not None:
1871 xml_encoding
= xml_encoding_match
.groups()[0].lower()
1873 self
.declaredHTMLEncoding
= xml_encoding
1874 if sniffed_xml_encoding
and \
1875 (xml_encoding
in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1876 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1877 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1879 xml_encoding
= sniffed_xml_encoding
1880 return xml_data
, xml_encoding
, sniffed_xml_encoding
1883 def find_codec(self
, charset
):
1884 return self
._codec
(self
.CHARSET_ALIASES
.get(charset
, charset
)) \
1885 or (charset
and self
._codec
(charset
.replace("-", ""))) \
1886 or (charset
and self
._codec
(charset
.replace("-", "_"))) \
1889 def _codec(self
, charset
):
1890 if not charset
: return charset
1893 codecs
.lookup(charset
)
1895 except (LookupError, ValueError):
1899 EBCDIC_TO_ASCII_MAP
= None
1900 def _ebcdic_to_ascii(self
, s
):
1902 if not c
.EBCDIC_TO_ASCII_MAP
:
1903 emap
= (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1904 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1905 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1906 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1907 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1908 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1909 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1910 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1911 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1912 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1913 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1914 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1915 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1916 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1917 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1918 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1919 250,251,252,253,254,255)
1921 c
.EBCDIC_TO_ASCII_MAP
= string
.maketrans( \
1922 ''.join(map(chr, range(256))), ''.join(map(chr, emap
)))
1923 return s
.translate(c
.EBCDIC_TO_ASCII_MAP
)
1925 MS_CHARS
= { '\x80' : ('euro', '20AC'),
1927 '\x82' : ('sbquo', '201A'),
1928 '\x83' : ('fnof', '192'),
1929 '\x84' : ('bdquo', '201E'),
1930 '\x85' : ('hellip', '2026'),
1931 '\x86' : ('dagger', '2020'),
1932 '\x87' : ('Dagger', '2021'),
1933 '\x88' : ('circ', '2C6'),
1934 '\x89' : ('permil', '2030'),
1935 '\x8A' : ('Scaron', '160'),
1936 '\x8B' : ('lsaquo', '2039'),
1937 '\x8C' : ('OElig', '152'),
1939 '\x8E' : ('#x17D', '17D'),
1942 '\x91' : ('lsquo', '2018'),
1943 '\x92' : ('rsquo', '2019'),
1944 '\x93' : ('ldquo', '201C'),
1945 '\x94' : ('rdquo', '201D'),
1946 '\x95' : ('bull', '2022'),
1947 '\x96' : ('ndash', '2013'),
1948 '\x97' : ('mdash', '2014'),
1949 '\x98' : ('tilde', '2DC'),
1950 '\x99' : ('trade', '2122'),
1951 '\x9a' : ('scaron', '161'),
1952 '\x9b' : ('rsaquo', '203A'),
1953 '\x9c' : ('oelig', '153'),
1955 '\x9e' : ('#x17E', '17E'),
1956 '\x9f' : ('Yuml', ''),}
1958 #######################################################################
1961 #By default, act as an HTML pretty-printer.
1962 if __name__
== '__main__':
1964 soup
= BeautifulSoup(sys
.stdin
)
1965 print soup
.prettify()