3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
   natively by Python
   http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
For more than you ever wanted to know about Beautiful Soup, see the
online documentation:
http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2009, Leonard Richardson
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79 from __future__
import generators
81 __author__
= "Leonard Richardson (leonardr@segfault.org)"
82 __version__
= "3.1.0.1"
83 __copyright__
= "Copyright (c) 2004-2009 Leonard Richardson"
84 __license__
= "New-style BSD"
90 from HTMLParser
import HTMLParser
, HTMLParseError
92 from htmlentitydefs
import name2codepoint
98 from sets
import Set
as set
#These hacks make Beautiful Soup able to parse XML with namespaces
# (the stock markupbase declaration-name pattern does not allow ':').
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Byte encoding used by default whenever a tree is rendered to a string.
DEFAULT_OUTPUT_ENCODING = "utf-8"
105 # First, the classes that represent markup elements.
107 def sob(unicode, encoding
):
108 """Returns either the given Unicode string or its encoding."""
112 return unicode.encode(encoding
)
115 """Contains the navigational information for some part of the page
116 (either a tag or a piece of text)"""
118 def setup(self
, parent
=None, previous
=None):
119 """Sets up the initial relations between this element and
122 self
.previous
= previous
124 self
.previousSibling
= None
125 self
.nextSibling
= None
126 if self
.parent
and self
.parent
.contents
:
127 self
.previousSibling
= self
.parent
.contents
[-1]
128 self
.previousSibling
.nextSibling
= self
130 def replaceWith(self
, replaceWith
):
131 oldParent
= self
.parent
132 myIndex
= self
.parent
.contents
.index(self
)
133 if hasattr(replaceWith
, 'parent') and replaceWith
.parent
== self
.parent
:
134 # We're replacing this element with one of its siblings.
135 index
= self
.parent
.contents
.index(replaceWith
)
136 if index
and index
< myIndex
:
137 # Furthermore, it comes before this element. That
138 # means that when we extract it, the index of this
139 # element will change.
140 myIndex
= myIndex
- 1
142 oldParent
.insert(myIndex
, replaceWith
)
145 """Destructively rips this element out of the tree."""
148 self
.parent
.contents
.remove(self
)
152 #Find the two elements that would be next to each other if
153 #this element (and any children) hadn't been parsed. Connect
155 lastChild
= self
._lastRecursiveChild
()
156 nextElement
= lastChild
.next
159 self
.previous
.next
= nextElement
161 nextElement
.previous
= self
.previous
163 lastChild
.next
= None
166 if self
.previousSibling
:
167 self
.previousSibling
.nextSibling
= self
.nextSibling
169 self
.nextSibling
.previousSibling
= self
.previousSibling
170 self
.previousSibling
= self
.nextSibling
= None
173 def _lastRecursiveChild(self
):
174 "Finds the last element beneath this object to be parsed."
176 while hasattr(lastChild
, 'contents') and lastChild
.contents
:
177 lastChild
= lastChild
.contents
[-1]
180 def insert(self
, position
, newChild
):
181 if (isinstance(newChild
, basestring
)
182 or isinstance(newChild
, unicode)) \
183 and not isinstance(newChild
, NavigableString
):
184 newChild
= NavigableString(newChild
)
186 position
= min(position
, len(self
.contents
))
187 if hasattr(newChild
, 'parent') and newChild
.parent
!= None:
188 # We're 'inserting' an element that's already one
189 # of this object's children.
190 if newChild
.parent
== self
:
191 index
= self
.find(newChild
)
192 if index
and index
< position
:
193 # Furthermore we're moving it further down the
194 # list of this object's children. That means that
195 # when we extract this element, our target index
196 # will jump down one.
197 position
= position
- 1
200 newChild
.parent
= self
203 newChild
.previousSibling
= None
204 newChild
.previous
= self
206 previousChild
= self
.contents
[position
-1]
207 newChild
.previousSibling
= previousChild
208 newChild
.previousSibling
.nextSibling
= newChild
209 newChild
.previous
= previousChild
._lastRecursiveChild
()
210 if newChild
.previous
:
211 newChild
.previous
.next
= newChild
213 newChildsLastElement
= newChild
._lastRecursiveChild
()
215 if position
>= len(self
.contents
):
216 newChild
.nextSibling
= None
219 parentsNextSibling
= None
220 while not parentsNextSibling
:
221 parentsNextSibling
= parent
.nextSibling
222 parent
= parent
.parent
223 if not parent
: # This is the last element in the document.
225 if parentsNextSibling
:
226 newChildsLastElement
.next
= parentsNextSibling
228 newChildsLastElement
.next
= None
230 nextChild
= self
.contents
[position
]
231 newChild
.nextSibling
= nextChild
232 if newChild
.nextSibling
:
233 newChild
.nextSibling
.previousSibling
= newChild
234 newChildsLastElement
.next
= nextChild
236 if newChildsLastElement
.next
:
237 newChildsLastElement
.next
.previous
= newChildsLastElement
238 self
.contents
.insert(position
, newChild
)
def append(self, tag):
    """Appends the given tag to the contents of this tag."""
    tail = len(self.contents)
    self.insert(tail, tag)
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears after this Tag in the document."""
    # Delegate to the forward-searching bulk finder, limited to one hit.
    searcher = self.findAllNext
    return self._findOne(searcher, name, attrs, text, **kwargs)
249 def findAllNext(self
, name
=None, attrs
={}, text
=None, limit
=None,
251 """Returns all items that match the given criteria and appear
252 after this Tag in the document."""
253 return self
._findAll
(name
, attrs
, text
, limit
, self
.nextGenerator
,
256 def findNextSibling(self
, name
=None, attrs
={}, text
=None, **kwargs
):
257 """Returns the closest sibling to this Tag that matches the
258 given criteria and appears after this Tag in the document."""
259 return self
._findOne
(self
.findNextSiblings
, name
, attrs
, text
,
262 def findNextSiblings(self
, name
=None, attrs
={}, text
=None, limit
=None,
264 """Returns the siblings of this Tag that match the given
265 criteria and appear after this Tag in the document."""
266 return self
._findAll
(name
, attrs
, text
, limit
,
267 self
.nextSiblingGenerator
, **kwargs
)
268 fetchNextSiblings
= findNextSiblings
# Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears before this Tag in the document."""
    # Delegate to the backward-searching bulk finder, limited to one hit.
    searcher = self.findAllPrevious
    return self._findOne(searcher, name, attrs, text, **kwargs)
275 def findAllPrevious(self
, name
=None, attrs
={}, text
=None, limit
=None,
277 """Returns all items that match the given criteria and appear
278 before this Tag in the document."""
279 return self
._findAll
(name
, attrs
, text
, limit
, self
.previousGenerator
,
281 fetchPrevious
= findAllPrevious
# Compatibility with pre-3.x
283 def findPreviousSibling(self
, name
=None, attrs
={}, text
=None, **kwargs
):
284 """Returns the closest sibling to this Tag that matches the
285 given criteria and appears before this Tag in the document."""
286 return self
._findOne
(self
.findPreviousSiblings
, name
, attrs
, text
,
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Returns the siblings of this Tag that match the given
    criteria and appear before this Tag in the document."""
    sibling_source = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit,
                         sibling_source, **kwargs)
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
297 def findParent(self
, name
=None, attrs
={}, **kwargs
):
298 """Returns the closest parent of this Tag that matches the given
300 # NOTE: We can't use _findOne because findParents takes a different
303 l
= self
.findParents(name
, attrs
, 1)
308 def findParents(self
, name
=None, attrs
={}, limit
=None, **kwargs
):
309 """Returns the parents of this Tag that match the given
312 return self
._findAll
(name
, attrs
, None, limit
, self
.parentGenerator
,
314 fetchParents
= findParents
# Compatibility with pre-3.x
316 #These methods do the real heavy lifting.
318 def _findOne(self
, method
, name
, attrs
, text
, **kwargs
):
320 l
= method(name
, attrs
, text
, 1, **kwargs
)
325 def _findAll(self
, name
, attrs
, text
, limit
, generator
, **kwargs
):
326 "Iterates over a generator looking for things that match."
328 if isinstance(name
, SoupStrainer
):
331 # Build a SoupStrainer
332 strainer
= SoupStrainer(name
, attrs
, text
, **kwargs
)
333 results
= ResultSet(strainer
)
338 except StopIteration:
341 found
= strainer
.search(i
)
343 results
.append(found
)
344 if limit
and len(results
) >= limit
:
348 #These Generators can be used to navigate starting from both
349 #NavigableStrings and Tags.
350 def nextGenerator(self
):
356 def nextSiblingGenerator(self
):
362 def previousGenerator(self
):
368 def previousSiblingGenerator(self
):
371 i
= i
.previousSibling
374 def parentGenerator(self
):
def substituteEncoding(self, str, encoding=None):
    """Fills the %SOUP-ENCODING% placeholder in the given string
    with the encoding name, defaulting to utf-8."""
    # NOTE: the parameter shadows the builtin 'str'; kept for
    # interface compatibility with existing callers.
    if not encoding:
        encoding = "utf-8"
    return str.replace("%SOUP-ENCODING%", encoding)
385 def toEncoding(self
, s
, encoding
=None):
386 """Encodes an object to a string in some encoding, or to Unicode.
388 if isinstance(s
, unicode):
390 s
= s
.encode(encoding
)
391 elif isinstance(s
, str):
393 s
= s
.encode(encoding
)
398 s
= self
.toEncoding(str(s
), encoding
)
403 class NavigableString(unicode, PageElement
):
def __new__(cls, value):
    """Create a new NavigableString.

    When unpickling a NavigableString, this method is called with
    the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
    passed in to the superclass's __new__ or the superclass won't know
    how to handle non-ASCII characters.
    """
    if isinstance(value, unicode):
        return unicode.__new__(cls, value)
    return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self):
    # Pickling support: reconstruct from the plain unicode value.
    return (unicode(self),)
420 def __getattr__(self
, attr
):
421 """text.string gives you text. This is for backwards
422 compatibility for Navigable*String, but for CData* it lets you
423 get the string without the CData wrapper."""
427 raise AttributeError, "'%s' object has no attribute '%s'" % (self
.__class
__.__name
__, attr
)
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
    # Render to Unicode markup first, then encode the result to bytes.
    return self.decode().encode(encoding)
432 def decodeGivenEventualEncoding(self
, eventualEncoding
):
class CData(NavigableString):
    """A NavigableString rendered inside a CDATA section."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        return u''.join([u'<![CDATA[', self, u']]>'])
class ProcessingInstruction(NavigableString):
    """A NavigableString rendered as a processing instruction."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        # A PI may carry a %SOUP-ENCODING% placeholder that is filled
        # in once the eventual output encoding is known.
        output = self
        if u'%SOUP-ENCODING%' in output:
            output = self.substituteEncoding(output, eventualEncoding)
        return u'<?' + output + u'?>'
class Comment(NavigableString):
    """A NavigableString rendered inside comment delimiters."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        return u''.join([u'<!--', self, u'-->'])
class Declaration(NavigableString):
    """A NavigableString rendered as an SGML declaration."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        return u''.join([u'<!', self, u'>'])
456 class Tag(PageElement
):
458 """Represents a found HTML tag with its attributes and contents."""
461 "Cheap function to invert a hash."
463 for k
,v
in h
.items():
467 XML_ENTITIES_TO_SPECIAL_CHARS
= { "apos" : "'",
473 XML_SPECIAL_CHARS_TO_ENTITIES
= _invert(XML_ENTITIES_TO_SPECIAL_CHARS
)
475 def _convertEntities(self
, match
):
476 """Used in a call to re.sub to replace HTML, XML, and numeric
477 entities with the appropriate Unicode characters. If HTML
478 entities are being converted, any unrecognized entities are
481 if self
.convertHTMLEntities
and x
in name2codepoint
:
482 return unichr(name2codepoint
[x
])
483 elif x
in self
.XML_ENTITIES_TO_SPECIAL_CHARS
:
484 if self
.convertXMLEntities
:
485 return self
.XML_ENTITIES_TO_SPECIAL_CHARS
[x
]
488 elif len(x
) > 0 and x
[0] == '#':
489 # Handle numeric entities
490 if len(x
) > 1 and x
[1] == 'x':
491 return unichr(int(x
[2:], 16))
493 return unichr(int(x
[1:]))
495 elif self
.escapeUnrecognizedEntities
:
496 return u
'&%s;' % x
500 def __init__(self
, parser
, name
, attrs
=None, parent
=None,
504 # We don't actually store the parser object: that lets extracted
505 # chunks be garbage-collected
506 self
.parserClass
= parser
.__class
__
507 self
.isSelfClosing
= parser
.isSelfClosingTag(name
)
513 self
.setup(parent
, previous
)
515 self
.containsSubstitutions
= False
516 self
.convertHTMLEntities
= parser
.convertHTMLEntities
517 self
.convertXMLEntities
= parser
.convertXMLEntities
518 self
.escapeUnrecognizedEntities
= parser
.escapeUnrecognizedEntities
521 "Converts HTML, XML and numeric entities in the attribute value."
525 return (k
, re
.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
526 self
._convertEntities
, val
))
527 self
.attrs
= map(convert
, self
.attrs
)
def get(self, key, default=None):
    """Returns the value of the 'key' attribute for the tag, or
    the value given for 'default' if it doesn't have that
    attribute."""
    attr_map = self._getAttrMap()
    return attr_map.get(key, default)
def has_key(self, key):
    """Returns true if the tag has an attribute named 'key'."""
    # dict.has_key is Python-2-only and deprecated; the 'in'
    # operator is equivalent and forward-compatible.
    return key in self._getAttrMap()
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the tag,
    and throws an exception if it's not there."""
    attr_map = self._getAttrMap()
    return attr_map[key]
544 "Iterating over a tag iterates over its contents."
545 return iter(self
.contents
)
548 "The length of a tag is the length of its list of contents."
549 return len(self
.contents
)
def __contains__(self, x):
    # 'x in tag' checks direct children only (no recursion).
    return x in self.contents
554 def __nonzero__(self
):
555 "A tag is non-None even if it has no contents."
558 def __setitem__(self
, key
, value
):
559 """Setting tag[key] sets the value of the 'key' attribute for the
562 self
.attrMap
[key
] = value
564 for i
in range(0, len(self
.attrs
)):
565 if self
.attrs
[i
][0] == key
:
566 self
.attrs
[i
] = (key
, value
)
569 self
.attrs
.append((key
, value
))
570 self
._getAttrMap
()[key
] = value
572 def __delitem__(self
, key
):
573 "Deleting tag[key] deletes all 'key' attributes for the tag."
574 for item
in self
.attrs
:
576 self
.attrs
.remove(item
)
577 #We don't break because bad HTML can define the same
578 #attribute multiple times.
580 if self
.attrMap
.has_key(key
):
581 del self
.attrMap
[key
]
def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    findAll() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    # apply() is deprecated (and removed in Python 3); the extended
    # call syntax is exactly equivalent.
    return self.findAll(*args, **kwargs)
589 def __getattr__(self
, tag
):
590 #print "Getattr %s.%s" % (self.__class__, tag)
591 if len(tag
) > 3 and tag
.rfind('Tag') == len(tag
)-3:
592 return self
.find(tag
[:-3])
593 elif tag
.find('__') != 0:
594 return self
.find(tag
)
595 raise AttributeError, "'%s' object has no attribute '%s'" % (self
.__class
__, tag
)
597 def __eq__(self
, other
):
598 """Returns true iff this tag has the same name, the same attributes,
599 and the same contents (recursively) as the given tag.
601 NOTE: right now this will return false if two tags have the
602 same attributes in a different order. Should this be fixed?"""
603 if not hasattr(other
, 'name') or not hasattr(other
, 'attrs') or not hasattr(other
, 'contents') or self
.name
!= other
.name
or self
.attrs
!= other
.attrs
or len(self
) != len(other
):
605 for i
in range(0, len(self
.contents
)):
606 if self
.contents
[i
] != other
.contents
[i
]:
def __ne__(self, other):
    """Returns true iff this tag is not identical to the other tag,
    as defined in __eq__."""
    # Delegates to __eq__ via the == operator.
    return not self == other
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Renders this tag as a string."""
    # NOTE(review): decode() returns Unicode; 'encoding' here only
    # feeds the %SOUP-ENCODING% substitution, not the return type.
    return self.decode(eventualEncoding=encoding)
619 BARE_AMPERSAND_OR_BRACKET
= re
.compile("([<>]|"
620 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
623 def _sub_entity(self
, x
):
624 """Used with a regular expression to substitute the
625 appropriate XML entity for an XML special character."""
626 return "&" + self
.XML_SPECIAL_CHARS_TO_ENTITIES
[x
.group(0)[0]] + ";"
628 def __unicode__(self
):
634 def encode(self
, encoding
=DEFAULT_OUTPUT_ENCODING
,
635 prettyPrint
=False, indentLevel
=0):
636 return self
.decode(prettyPrint
, indentLevel
, encoding
).encode(encoding
)
638 def decode(self
, prettyPrint
=False, indentLevel
=0,
639 eventualEncoding
=DEFAULT_OUTPUT_ENCODING
):
640 """Returns a string or Unicode representation of this tag and
641 its contents. To get Unicode, pass None for encoding."""
645 for key
, val
in self
.attrs
:
648 if (self
.containsSubstitutions
649 and eventualEncoding
is not None
650 and '%SOUP-ENCODING%' in val
):
651 val
= self
.substituteEncoding(val
, eventualEncoding
)
653 # The attribute value either:
655 # * Contains no embedded double quotes or single quotes.
656 # No problem: we enclose it in double quotes.
657 # * Contains embedded single quotes. No problem:
658 # double quotes work here too.
659 # * Contains embedded double quotes. No problem:
660 # we enclose it in single quotes.
661 # * Embeds both single _and_ double quotes. This
662 # can't happen naturally, but it can happen if
663 # you modify an attribute value after parsing
664 # the document. Now we have a bit of a
665 # problem. We solve it by enclosing the
666 # attribute in single quotes, and escaping any
667 # embedded single quotes to XML entities.
671 # TODO: replace with apos when
673 val
= val
.replace("'", "&squot;")
675 # Now we're okay w/r/t quotes. But the attribute
676 # value might also contain angle brackets, or
677 # ampersands that aren't part of entities. We need
678 # to escape those to XML entities too.
679 val
= self
.BARE_AMPERSAND_OR_BRACKET
.sub(self
._sub
_entity
, val
)
681 # Handle boolean attributes.
684 decoded
= fmt
% (key
, val
)
685 attrs
.append(decoded
)
688 if self
.isSelfClosing
:
691 closeTag
= '</%s>' % self
.name
693 indentTag
, indentContents
= 0, 0
695 indentTag
= indentLevel
696 space
= (' ' * (indentTag
-1))
697 indentContents
= indentTag
+ 1
698 contents
= self
.decodeContents(prettyPrint
, indentContents
,
706 attributeString
= ' ' + ' '.join(attrs
)
709 s
.append('<%s%s%s>' % (self
.name
, attributeString
, close
))
713 if prettyPrint
and contents
and contents
[-1] != "\n":
715 if prettyPrint
and closeTag
:
718 if prettyPrint
and closeTag
and self
.nextSibling
:
724 """Recursively destroys the contents of this tree."""
725 contents
= [i
for i
in self
.contents
]
727 if isinstance(i
, Tag
):
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    # encode() with prettyPrint=True: indented, encoded rendering.
    return self.encode(encoding, True)
def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Renders the contents of this tag as a byte string in the
    given encoding."""
    # Pass the encoding through as eventualEncoding so any
    # %SOUP-ENCODING% substitution names the encoding actually used,
    # matching the behavior of encode().
    return self.decodeContents(prettyPrint, indentLevel,
                               encoding).encode(encoding)
740 def decodeContents(self
, prettyPrint
=False, indentLevel
=0,
741 eventualEncoding
=DEFAULT_OUTPUT_ENCODING
):
742 """Renders the contents of this tag as a string in the given
743 encoding. If encoding is None, returns a Unicode string.."""
747 if isinstance(c
, NavigableString
):
748 text
= c
.decodeGivenEventualEncoding(eventualEncoding
)
749 elif isinstance(c
, Tag
):
750 s
.append(c
.decode(prettyPrint
, indentLevel
, eventualEncoding
))
751 if text
and prettyPrint
:
755 s
.append(" " * (indentLevel
-1))
763 def find(self
, name
=None, attrs
={}, recursive
=True, text
=None,
765 """Return only the first child of this Tag matching the given
768 l
= self
.findAll(name
, attrs
, recursive
, text
, 1, **kwargs
)
774 def findAll(self
, name
=None, attrs
={}, recursive
=True, text
=None,
775 limit
=None, **kwargs
):
776 """Extracts a list of Tag objects that match the given
777 criteria. You can specify the name of the Tag and any
778 attributes you want the Tag to have.
780 The value of a key-value pair in the 'attrs' map can be a
781 string, a list of strings, a regular expression object, or a
782 callable that takes a string and returns whether or not the
783 string matches for some custom definition of 'matches'. The
784 same is true of the tag name."""
785 generator
= self
.recursiveChildGenerator
787 generator
= self
.childGenerator
788 return self
._findAll
(name
, attrs
, text
, limit
, generator
, **kwargs
)
789 findChildren
= findAll
791 # Pre-3.x compatibility methods. Will go away in 4.0.
def fetchText(self, text=None, recursive=True, limit=None):
    # Pre-3.x compatibility alias for findAll(text=...).
    return self.findAll(text=text, recursive=recursive, limit=limit)
def firstText(self, text=None, recursive=True):
    # Pre-3.x compatibility alias for find(text=...).
    return self.find(text=text, recursive=recursive)
801 # 3.x compatibility methods. Will go away in 4.0.
802 def renderContents(self
, encoding
=DEFAULT_OUTPUT_ENCODING
,
803 prettyPrint
=False, indentLevel
=0):
805 return self
.decodeContents(prettyPrint
, indentLevel
, encoding
)
807 return self
.encodeContents(encoding
, prettyPrint
, indentLevel
)
812 def _getAttrMap(self
):
813 """Initializes a map representation of this tag's attributes,
814 if not already initialized."""
815 if not getattr(self
, 'attrMap'):
817 for (key
, value
) in self
.attrs
:
818 self
.attrMap
[key
] = value
822 def recursiveChildGenerator(self
):
823 if not len(self
.contents
):
825 stopNode
= self
._lastRecursiveChild
().next
826 current
= self
.contents
[0]
827 while current
is not stopNode
:
829 current
= current
.next
831 def childGenerator(self
):
832 if not len(self
.contents
):
834 current
= self
.contents
[0]
837 current
= current
.nextSibling
840 # Next, a couple classes to represent queries and their results.
842 """Encapsulates a number of ways of matching a markup element (tag or
845 def __init__(self
, name
=None, attrs
={}, text
=None, **kwargs
):
848 kwargs
['class'] = attrs
863 return "%s|%s" % (self
.name
, self
.attrs
)
865 def searchTag(self
, markupName
=None, markupAttrs
={}):
868 if isinstance(markupName
, Tag
):
871 callFunctionWithTagData
= callable(self
.name
) \
872 and not isinstance(markupName
, Tag
)
875 or callFunctionWithTagData \
876 or (markup
and self
._matches
(markup
, self
.name
)) \
877 or (not markup
and self
._matches
(markupName
, self
.name
)):
878 if callFunctionWithTagData
:
879 match
= self
.name(markupName
, markupAttrs
)
883 for attr
, matchAgainst
in self
.attrs
.items():
884 if not markupAttrMap
:
885 if hasattr(markupAttrs
, 'get'):
886 markupAttrMap
= markupAttrs
889 for k
,v
in markupAttrs
:
891 attrValue
= markupAttrMap
.get(attr
)
892 if not self
._matches
(attrValue
, matchAgainst
):
902 def search(self
, markup
):
903 #print 'looking for %s in %s' % (self, markup)
905 # If given a list of items, scan it for a text element that
907 if isList(markup
) and not isinstance(markup
, Tag
):
908 for element
in markup
:
909 if isinstance(element
, NavigableString
) \
910 and self
.search(element
):
913 # If it's a Tag, make sure its name or attributes match.
914 # Don't bother with Tags if we're searching for text.
915 elif isinstance(markup
, Tag
):
917 found
= self
.searchTag(markup
)
918 # If it's text, make sure the text matches.
919 elif isinstance(markup
, NavigableString
) or \
921 if self
._matches
(markup
, self
.text
):
924 raise Exception, "I don't know how to match against a %s" \
928 def _matches(self
, markup
, matchAgainst
):
929 #print "Matching %s against %s" % (markup, matchAgainst)
931 if matchAgainst
== True and type(matchAgainst
) == types
.BooleanType
:
932 result
= markup
!= None
933 elif callable(matchAgainst
):
934 result
= matchAgainst(markup
)
936 #Custom match methods take the tag as an argument, but all
937 #other ways of matching match the tag name as a string.
938 if isinstance(markup
, Tag
):
940 if markup
is not None and not isString(markup
):
941 markup
= unicode(markup
)
942 #Now we know that chunk is either a string, or None.
943 if hasattr(matchAgainst
, 'match'):
944 # It's a regexp object.
945 result
= markup
and matchAgainst
.search(markup
)
946 elif (isList(matchAgainst
)
947 and (markup
is not None or not isString(matchAgainst
))):
948 result
= markup
in matchAgainst
949 elif hasattr(matchAgainst
, 'items'):
950 result
= markup
.has_key(matchAgainst
)
951 elif matchAgainst
and isString(markup
):
952 if isinstance(markup
, unicode):
953 matchAgainst
= unicode(matchAgainst
)
955 matchAgainst
= str(matchAgainst
)
958 result
= matchAgainst
== markup
961 class ResultSet(list):
962 """A ResultSet is just a list that keeps track of the SoupStrainer
964 def __init__(self
, source
):
968 # Now, some helper functions.
971 """Convenience method that works with all 2.x versions of Python
972 to determine whether or not something is listlike."""
973 return ((hasattr(l
, '__iter__') and not isString(l
))
974 or (type(l
) in (types
.ListType
, types
.TupleType
)))
977 """Convenience method that works with all 2.x versions of Python
978 to determine whether or not something is stringlike."""
980 return isinstance(s
, unicode) or isinstance(s
, basestring
)
982 return isinstance(s
, str)
984 def buildTagMap(default
, *args
):
985 """Turns a list of maps, lists, or scalars into a single map.
986 Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
987 NESTING_RESET_TAGS maps out of lists and partial maps."""
990 if hasattr(portion
, 'items'):
991 #It's a map. Merge it.
992 for k
,v
in portion
.items():
994 elif isList(portion
) and not isString(portion
):
995 #It's a list. Map each item to the default.
999 #It's a scalar. Map it to the default.
1000 built
[portion
] = default
1003 # Now, the parser classes.
1005 class HTMLParserBuilder(HTMLParser
):
1007 def __init__(self
, soup
):
1008 HTMLParser
.__init
__(self
)
1011 # We inherit feed() and reset().
1013 def handle_starttag(self
, name
, attrs
):
1015 self
.soup
.extractCharsetFromMeta(attrs
)
1017 self
.soup
.unknown_starttag(name
, attrs
)
def handle_endtag(self, name):
    # Forward end tags to the soup's tree-building logic.
    self.soup.unknown_endtag(name)
def handle_data(self, content):
    # Forward character data to the soup's tree-building logic.
    self.soup.handle_data(content)
1025 def _toStringSubclass(self
, text
, subclass
):
1026 """Adds a certain piece of text to the tree as a NavigableString
1029 self
.handle_data(text
)
1030 self
.soup
.endData(subclass
)
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later."""
    if text.startswith("xml"):
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
    "Handle comments as Comment objects."
    # The Comment subclass supplies the <!-- --> delimiters on output.
    self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
    """Handle character references as data.

    HTMLParser delivers 'ref' without the '&#' prefix, so it may be
    decimal ("160") or hexadecimal ("xa0"/"XA0"); the original code
    crashed on hex references because int(ref) cannot parse them.
    """
    if self.soup.convertEntities:
        if ref.lower().startswith('x'):
            data = unichr(int(ref[1:], 16))
        else:
            data = unichr(int(ref))
    else:
        data = '&#%s;' % ref
    self.handle_data(data)
1052 def handle_entityref(self
, ref
):
1053 """Handle entity references as data, possibly converting known
1054 HTML and/or XML entity references to the corresponding Unicode
1057 if self
.soup
.convertHTMLEntities
:
1059 data
= unichr(name2codepoint
[ref
])
1063 if not data
and self
.soup
.convertXMLEntities
:
1064 data
= self
.soup
.XML_ENTITIES_TO_SPECIAL_CHARS
.get(ref
)
1066 if not data
and self
.soup
.convertHTMLEntities
and \
1067 not self
.soup
.XML_ENTITIES_TO_SPECIAL_CHARS
.get(ref
):
1068 # TODO: We've got a problem here. We're told this is
1069 # an entity reference, but it's not an XML entity
1070 # reference or an HTML entity reference. Nonetheless,
1071 # the logical thing to do is to pass it through as an
1072 # unrecognized entity reference.
1074 # Except: when the input is "&carol;" this function
1075 # will be called with input "carol". When the input is
1076 # "AT&T", this function will be called with input
1077 # "T". We have no way of knowing whether a semicolon
1078 # was present originally, so we don't know whether
1079 # this is an unknown entity or just a misplaced
1082 # The more common case is a misplaced ampersand, so I
1083 # escape the ampersand and omit the trailing semicolon.
1084 data
= "&%s" % ref
1086 # This case is different from the one above, because we
1087 # haven't already gone through a supposedly comprehensive
1088 # mapping of entities to Unicode characters. We might not
1089 # have gone through any mapping at all. So the chances are
1090 # very high that this is a real entity, and not a
1091 # misplaced ampersand.
1093 self
.handle_data(data
)
def handle_decl(self, data):
    "Handle DOCTYPEs and the like as Declaration objects."
    # The Declaration subclass supplies the <! > delimiters on output.
    self._toStringSubclass(data, Declaration)
1099 def parse_declaration(self
, i
):
1100 """Treat a bogus SGML declaration as raw data. Treat a CDATA
1101 declaration as a CData object."""
1103 if self
.rawdata
[i
:i
+9] == '<![CDATA[':
1104 k
= self
.rawdata
.find(']]>', i
)
1106 k
= len(self
.rawdata
)
1107 data
= self
.rawdata
[i
+9:k
]
1109 self
._toStringSubclass
(data
, CData
)
1112 j
= HTMLParser
.parse_declaration(self
, i
)
1113 except HTMLParseError
:
1114 toHandle
= self
.rawdata
[i
:]
1115 self
.handle_data(toHandle
)
1116 j
= i
+ len(toHandle
)
class BeautifulStoneSoup(Tag):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Per-parser tag-behavior tables; all empty here so the XML parser
    # makes no HTML-specific assumptions. Subclasses override these.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Regex scrubbers applied to the raw markup before parsing; they fix
    # the two constructs most likely to choke the underlying parser.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1164 def __init__(self
, markup
="", parseOnlyThese
=None, fromEncoding
=None,
1165 markupMassage
=True, smartQuotesTo
=XML_ENTITIES
,
1166 convertEntities
=None, selfClosingTags
=None, isHTML
=False,
1167 builder
=HTMLParserBuilder
):
1168 """The Soup object is initialized as the 'root tag', and the
1169 provided markup (which can be a string or a file-like object)
1170 is fed into the underlying parser.
1172 HTMLParser will process most bad HTML, and the BeautifulSoup
1173 class has some tricks for dealing with some HTML that kills
1174 HTMLParser, but Beautiful Soup can nonetheless choke or lose data
1175 if your data uses self-closing tags or declarations
1178 By default, Beautiful Soup uses regexes to sanitize input,
1179 avoiding the vast majority of these problems. If the problems
1180 don't apply to you, pass in False for markupMassage, and
1181 you'll get better performance.
1183 The default parser massage techniques fix the two most common
1184 instances of invalid HTML that choke HTMLParser:
1186 <br/> (No space between name of closing tag and tag close)
1187 <! --Comment--> (Extraneous whitespace in declaration)
1189 You can pass in a custom list of (RE object, replace method)
1190 tuples to get Beautiful Soup to scrub your input the way you
1193 self
.parseOnlyThese
= parseOnlyThese
1194 self
.fromEncoding
= fromEncoding
1195 self
.smartQuotesTo
= smartQuotesTo
1196 self
.convertEntities
= convertEntities
1197 # Set the rules for how we'll deal with the entities we
1199 if self
.convertEntities
:
1200 # It doesn't make sense to convert encoded characters to
1201 # entities even while you're converting entities to Unicode.
1202 # Just convert it all to Unicode.
1203 self
.smartQuotesTo
= None
1204 if convertEntities
== self
.HTML_ENTITIES
:
1205 self
.convertXMLEntities
= False
1206 self
.convertHTMLEntities
= True
1207 self
.escapeUnrecognizedEntities
= True
1208 elif convertEntities
== self
.XHTML_ENTITIES
:
1209 self
.convertXMLEntities
= True
1210 self
.convertHTMLEntities
= True
1211 self
.escapeUnrecognizedEntities
= False
1212 elif convertEntities
== self
.XML_ENTITIES
:
1213 self
.convertXMLEntities
= True
1214 self
.convertHTMLEntities
= False
1215 self
.escapeUnrecognizedEntities
= False
1217 self
.convertXMLEntities
= False
1218 self
.convertHTMLEntities
= False
1219 self
.escapeUnrecognizedEntities
= False
1221 self
.instanceSelfClosingTags
= buildTagMap(None, selfClosingTags
)
1222 self
.builder
= builder(self
)
1225 if hasattr(markup
, 'read'): # It's a file-type object.
1226 markup
= markup
.read()
1227 self
.markup
= markup
1228 self
.markupMassage
= markupMassage
1230 self
._feed
(isHTML
=isHTML
)
1233 self
.markup
= None # The markup can now be GCed.
1234 self
.builder
= None # So can the builder.
1236 def _feed(self
, inDocumentEncoding
=None, isHTML
=False):
1237 # Convert the document to Unicode.
1238 markup
= self
.markup
1239 if isinstance(markup
, unicode):
1240 if not hasattr(self
, 'originalEncoding'):
1241 self
.originalEncoding
= None
1243 dammit
= UnicodeDammit\
1244 (markup
, [self
.fromEncoding
, inDocumentEncoding
],
1245 smartQuotesTo
=self
.smartQuotesTo
, isHTML
=isHTML
)
1246 markup
= dammit
.unicode
1247 self
.originalEncoding
= dammit
.originalEncoding
1248 self
.declaredHTMLEncoding
= dammit
.declaredHTMLEncoding
1250 if self
.markupMassage
:
1251 if not isList(self
.markupMassage
):
1252 self
.markupMassage
= self
.MARKUP_MASSAGE
1253 for fix
, m
in self
.markupMassage
:
1254 markup
= fix
.sub(m
, markup
)
1255 # TODO: We get rid of markupMassage so that the
1256 # soup object can be deepcopied later on. Some
1257 # Python installations can't copy regexes. If anyone
1258 # was relying on the existence of markupMassage, this
1259 # might cause problems.
1260 del(self
.markupMassage
)
1261 self
.builder
.reset()
1263 self
.builder
.feed(markup
)
1264 # Close out any unfinished strings and close all the open tags.
1266 while self
.currentTag
.name
!= self
.ROOT_TAG_NAME
:
1269 def isSelfClosingTag(self
, name
):
1270 """Returns true iff the given string is the name of a
1271 self-closing tag according to this parser."""
1272 return self
.SELF_CLOSING_TAGS
.has_key(name
) \
1273 or self
.instanceSelfClosingTags
.has_key(name
)
1276 Tag
.__init
__(self
, self
, self
.ROOT_TAG_NAME
)
1278 self
.builder
.reset()
1279 self
.currentData
= []
1280 self
.currentTag
= None
1282 self
.quoteStack
= []
1286 tag
= self
.tagStack
.pop()
1287 # Tags with just one string-owning child get the child as a
1288 # 'string' property, so that soup.tag.string is shorthand for
1289 # soup.tag.contents[0]
1290 if len(self
.currentTag
.contents
) == 1 and \
1291 isinstance(self
.currentTag
.contents
[0], NavigableString
):
1292 self
.currentTag
.string
= self
.currentTag
.contents
[0]
1294 #print "Pop", tag.name
1296 self
.currentTag
= self
.tagStack
[-1]
1297 return self
.currentTag
1299 def pushTag(self
, tag
):
1300 #print "Push", tag.name
1302 self
.currentTag
.contents
.append(tag
)
1303 self
.tagStack
.append(tag
)
1304 self
.currentTag
= self
.tagStack
[-1]
1306 def endData(self
, containerClass
=NavigableString
):
1307 if self
.currentData
:
1308 currentData
= u
''.join(self
.currentData
)
1309 if (currentData
.translate(self
.STRIP_ASCII_SPACES
) == '' and
1310 not set([tag
.name
for tag
in self
.tagStack
]).intersection(
1311 self
.PRESERVE_WHITESPACE_TAGS
)):
1312 if '\n' in currentData
:
1316 self
.currentData
= []
1317 if self
.parseOnlyThese
and len(self
.tagStack
) <= 1 and \
1318 (not self
.parseOnlyThese
.text
or \
1319 not self
.parseOnlyThese
.search(currentData
)):
1321 o
= containerClass(currentData
)
1322 o
.setup(self
.currentTag
, self
.previous
)
1324 self
.previous
.next
= o
1326 self
.currentTag
.contents
.append(o
)
1329 def _popToTag(self
, name
, inclusivePop
=True):
1330 """Pops the tag stack up to and including the most recent
1331 instance of the given tag. If inclusivePop is false, pops the tag
1332 stack up to but *not* including the most recent instqance of
1334 #print "Popping to %s" % name
1335 if name
== self
.ROOT_TAG_NAME
:
1339 mostRecentTag
= None
1340 for i
in range(len(self
.tagStack
)-1, 0, -1):
1341 if name
== self
.tagStack
[i
].name
:
1342 numPops
= len(self
.tagStack
)-i
1344 if not inclusivePop
:
1345 numPops
= numPops
- 1
1347 for i
in range(0, numPops
):
1348 mostRecentTag
= self
.popTag()
1349 return mostRecentTag
1351 def _smartPop(self
, name
):
1353 """We need to pop up to the previous tag of this type, unless
1354 one of this tag's nesting reset triggers comes between this
1355 tag and the previous tag of this type, OR unless this tag is a
1356 generic nesting trigger and another generic nesting trigger
1357 comes between this tag and the previous tag of this type.
1360 <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1361 <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1362 <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1364 <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1365 <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1366 <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1369 nestingResetTriggers
= self
.NESTABLE_TAGS
.get(name
)
1370 isNestable
= nestingResetTriggers
!= None
1371 isResetNesting
= self
.RESET_NESTING_TAGS
.has_key(name
)
1374 for i
in range(len(self
.tagStack
)-1, 0, -1):
1375 p
= self
.tagStack
[i
]
1376 if (not p
or p
.name
== name
) and not isNestable
:
1377 #Non-nestable tags get popped to the top or to their
1381 if (nestingResetTriggers
!= None
1382 and p
.name
in nestingResetTriggers
) \
1383 or (nestingResetTriggers
== None and isResetNesting
1384 and self
.RESET_NESTING_TAGS
.has_key(p
.name
)):
1386 #If we encounter one of the nesting reset triggers
1387 #peculiar to this tag, or we encounter another tag
1388 #that causes nesting to reset, pop up to but not
1389 #including that tag.
1395 self
._popToTag
(popTo
, inclusive
)
1397 def unknown_starttag(self
, name
, attrs
, selfClosing
=0):
1398 #print "Start tag %s: %s" % (name, attrs)
1400 #This is not a real tag.
1401 #print "<%s> is not real!" % name
1402 attrs
= ''.join(map(lambda(x
, y
): ' %s="%s"' % (x
, y
), attrs
))
1403 self
.handle_data('<%s%s>' % (name
, attrs
))
1407 if not self
.isSelfClosingTag(name
) and not selfClosing
:
1408 self
._smartPop
(name
)
1410 if self
.parseOnlyThese
and len(self
.tagStack
) <= 1 \
1411 and (self
.parseOnlyThese
.text
or not self
.parseOnlyThese
.searchTag(name
, attrs
)):
1414 tag
= Tag(self
, name
, attrs
, self
.currentTag
, self
.previous
)
1416 self
.previous
.next
= tag
1419 if selfClosing
or self
.isSelfClosingTag(name
):
1421 if name
in self
.QUOTE_TAGS
:
1422 #print "Beginning quote (%s)" % name
1423 self
.quoteStack
.append(name
)
1427 def unknown_endtag(self
, name
):
1428 #print "End tag %s" % name
1429 if self
.quoteStack
and self
.quoteStack
[-1] != name
:
1430 #This is not a real end tag.
1431 #print "</%s> is not real!" % name
1432 self
.handle_data('</%s>' % name
)
1435 self
._popToTag
(name
)
1436 if self
.quoteStack
and self
.quoteStack
[-1] == name
:
1437 self
.quoteStack
.pop()
1438 self
.literal
= (len(self
.quoteStack
) > 0)
1440 def handle_data(self
, data
):
1441 self
.currentData
.append(data
)
1443 def extractCharsetFromMeta(self
, attrs
):
1444 self
.unknown_starttag('meta', attrs
)
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
      should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
      should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
      should be transformed into:
       <table><tr>Blah</tr><tr>Blah
      but,
       <tr>Blah<table><tr>Blah
      should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # Default smart-quote handling to HTML entities, and mark the
        # input as HTML so charset sniffing looks at <meta> tags.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see extractCharsetFromMeta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
1553 def extractCharsetFromMeta(self
, attrs
):
1554 """Beautiful Soup can detect a charset included in a META tag,
1555 try to convert the document to that charset, and re-parse the
1556 document from the beginning."""
1559 contentTypeIndex
= None
1560 tagNeedsEncodingSubstitution
= False
1562 for i
in range(0, len(attrs
)):
1563 key
, value
= attrs
[i
]
1565 if key
== 'http-equiv':
1567 elif key
== 'content':
1569 contentTypeIndex
= i
1571 if httpEquiv
and contentType
: # It's an interesting meta tag.
1572 match
= self
.CHARSET_RE
.search(contentType
)
1574 if (self
.declaredHTMLEncoding
is not None or
1575 self
.originalEncoding
== self
.fromEncoding
):
1576 # An HTML encoding was sniffed while converting
1577 # the document to Unicode, or an HTML encoding was
1578 # sniffed during a previous pass through the
1579 # document, or an encoding was specified
1580 # explicitly and it worked. Rewrite the meta tag.
1582 return match
.group(1) + "%SOUP-ENCODING%"
1583 newAttr
= self
.CHARSET_RE
.sub(rewrite
, contentType
)
1584 attrs
[contentTypeIndex
] = (attrs
[contentTypeIndex
][0],
1586 tagNeedsEncodingSubstitution
= True
1588 # This is our first pass through the document.
1589 # Go through it again with the encoding information.
1590 newCharset
= match
.group(3)
1591 if newCharset
and newCharset
!= self
.originalEncoding
:
1592 self
.declaredHTMLEncoding
= newCharset
1593 self
._feed
(self
.declaredHTMLEncoding
)
1596 tag
= self
.unknown_starttag("meta", attrs
)
1597 if tag
and tagNeedsEncodingSubstitution
:
1598 tag
.containsSubstitutions
= True
class StopParsing(Exception):
    """Internal control-flow exception used to abort the current parse."""
    pass
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = buildTagMap([])
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before popping, mirror a single-string child up into the
        # parent as an attribute (unless that attribute already exists).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1684 #Enterprise class names! It has come to our attention that some people
1685 #think the names of the Beautiful Soup parser classes are too silly
1686 #and "unprofessional" for use in enterprise screen-scraping. We feel
1687 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1688 #All-Night Kosher Bakery recommends renaming this file to
1689 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1690 #"RobustParserBeanInterface.class") and using the following
1691 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass

class RobustHTMLParser(BeautifulSoup):
    pass

class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass

class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass

class SimplifyingSOAPParser(BeautifulSOAP):
    pass
1703 ######################################################
1705 # Bonus library: Unicode, Dammit
1707 # This class forces XML data into a standard format (usually to UTF-8
1708 # or Unicode). It is heavily based on code from Mark Pilgrim's
1709 # Universal Feed Parser. It does not rewrite the XML or HTML to
1710 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1711 # (XML) and BeautifulSoup.start_meta (HTML).
1713 # Autodetects character encodings.
1714 # Download from http://chardet.feedparser.org/
1717 # import chardet.constants
1718 # chardet.constants._debug = 1
1722 # cjkcodecs and iconv_codec make Python know about more character encodings.
1723 # Both are available from http://cjkpython.i18n.org/
1724 # They're built in if you use Python 2.4.
1726 import cjkcodecs
.aliases
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }
1747 def __init__(self
, markup
, overrideEncodings
=[],
1748 smartQuotesTo
='xml', isHTML
=False):
1749 self
.declaredHTMLEncoding
= None
1750 self
.markup
, documentEncoding
, sniffedEncoding
= \
1751 self
._detectEncoding
(markup
, isHTML
)
1752 self
.smartQuotesTo
= smartQuotesTo
1753 self
.triedEncodings
= []
1754 if markup
== '' or isinstance(markup
, unicode):
1755 self
.originalEncoding
= None
1756 self
.unicode = unicode(markup
)
1760 for proposedEncoding
in overrideEncodings
:
1761 u
= self
._convertFrom
(proposedEncoding
)
1764 for proposedEncoding
in (documentEncoding
, sniffedEncoding
):
1765 u
= self
._convertFrom
(proposedEncoding
)
1768 # If no luck and we have auto-detection library, try that:
1769 if not u
and chardet
and not isinstance(self
.markup
, unicode):
1770 u
= self
._convertFrom
(chardet
.detect(self
.markup
)['encoding'])
1772 # As a last resort, try utf-8 and windows-1252:
1774 for proposed_encoding
in ("utf-8", "windows-1252"):
1775 u
= self
._convertFrom
(proposed_encoding
)
1779 if not u
: self
.originalEncoding
= None
1781 def _subMSChar(self
, match
):
1782 """Changes a MS smart quote character to an XML or HTML
1784 orig
= match
.group(1)
1785 sub
= self
.MS_CHARS
.get(orig
)
1786 if type(sub
) == types
.TupleType
:
1787 if self
.smartQuotesTo
== 'xml':
1788 sub
= '&#x'.encode() + sub
[1].encode() + ';'.encode()
1790 sub
= '&'.encode() + sub
[0].encode() + ';'.encode()
1795 def _convertFrom(self
, proposed
):
1796 proposed
= self
.find_codec(proposed
)
1797 if not proposed
or proposed
in self
.triedEncodings
:
1799 self
.triedEncodings
.append(proposed
)
1800 markup
= self
.markup
1802 # Convert smart quotes to HTML if coming from an encoding
1803 # that might have them.
1804 if self
.smartQuotesTo
and proposed
.lower() in("windows-1252",
1807 smart_quotes_re
= "([\x80-\x9f])"
1808 smart_quotes_compiled
= re
.compile(smart_quotes_re
)
1809 markup
= smart_quotes_compiled
.sub(self
._subMSChar
, markup
)
1812 # print "Trying to convert document to %s" % proposed
1813 u
= self
._toUnicode
(markup
, proposed
)
1815 self
.originalEncoding
= proposed
1816 except Exception, e
:
1817 # print "That didn't work!"
1820 #print "Correct encoding: %s" % proposed
1823 def _toUnicode(self
, data
, encoding
):
1824 '''Given a string and its encoding, decodes the string into Unicode.
1825 %encoding is a string recognized by encodings.aliases'''
1827 # strip Byte Order Mark (if present)
1828 if (len(data
) >= 4) and (data
[:2] == '\xfe\xff') \
1829 and (data
[2:4] != '\x00\x00'):
1830 encoding
= 'utf-16be'
1832 elif (len(data
) >= 4) and (data
[:2] == '\xff\xfe') \
1833 and (data
[2:4] != '\x00\x00'):
1834 encoding
= 'utf-16le'
1836 elif data
[:3] == '\xef\xbb\xbf':
1839 elif data
[:4] == '\x00\x00\xfe\xff':
1840 encoding
= 'utf-32be'
1842 elif data
[:4] == '\xff\xfe\x00\x00':
1843 encoding
= 'utf-32le'
1845 newdata
= unicode(data
, encoding
)
1848 def _detectEncoding(self
, xml_data
, isHTML
=False):
1849 """Given a document, tries to detect its XML encoding."""
1850 xml_encoding
= sniffed_xml_encoding
= None
1852 if xml_data
[:4] == '\x4c\x6f\xa7\x94':
1854 xml_data
= self
._ebcdic
_to
_ascii
(xml_data
)
1855 elif xml_data
[:4] == '\x00\x3c\x00\x3f':
1857 sniffed_xml_encoding
= 'utf-16be'
1858 xml_data
= unicode(xml_data
, 'utf-16be').encode('utf-8')
1859 elif (len(xml_data
) >= 4) and (xml_data
[:2] == '\xfe\xff') \
1860 and (xml_data
[2:4] != '\x00\x00'):
1862 sniffed_xml_encoding
= 'utf-16be'
1863 xml_data
= unicode(xml_data
[2:], 'utf-16be').encode('utf-8')
1864 elif xml_data
[:4] == '\x3c\x00\x3f\x00':
1866 sniffed_xml_encoding
= 'utf-16le'
1867 xml_data
= unicode(xml_data
, 'utf-16le').encode('utf-8')
1868 elif (len(xml_data
) >= 4) and (xml_data
[:2] == '\xff\xfe') and \
1869 (xml_data
[2:4] != '\x00\x00'):
1871 sniffed_xml_encoding
= 'utf-16le'
1872 xml_data
= unicode(xml_data
[2:], 'utf-16le').encode('utf-8')
1873 elif xml_data
[:4] == '\x00\x00\x00\x3c':
1875 sniffed_xml_encoding
= 'utf-32be'
1876 xml_data
= unicode(xml_data
, 'utf-32be').encode('utf-8')
1877 elif xml_data
[:4] == '\x3c\x00\x00\x00':
1879 sniffed_xml_encoding
= 'utf-32le'
1880 xml_data
= unicode(xml_data
, 'utf-32le').encode('utf-8')
1881 elif xml_data
[:4] == '\x00\x00\xfe\xff':
1883 sniffed_xml_encoding
= 'utf-32be'
1884 xml_data
= unicode(xml_data
[4:], 'utf-32be').encode('utf-8')
1885 elif xml_data
[:4] == '\xff\xfe\x00\x00':
1887 sniffed_xml_encoding
= 'utf-32le'
1888 xml_data
= unicode(xml_data
[4:], 'utf-32le').encode('utf-8')
1889 elif xml_data
[:3] == '\xef\xbb\xbf':
1891 sniffed_xml_encoding
= 'utf-8'
1892 xml_data
= unicode(xml_data
[3:], 'utf-8').encode('utf-8')
1894 sniffed_xml_encoding
= 'ascii'
1897 xml_encoding_match
= None
1898 xml_encoding_re
= '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
1899 xml_encoding_match
= re
.compile(xml_encoding_re
).match(xml_data
)
1900 if not xml_encoding_match
and isHTML
:
1901 meta_re
= '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
1902 regexp
= re
.compile(meta_re
, re
.I
)
1903 xml_encoding_match
= regexp
.search(xml_data
)
1904 if xml_encoding_match
is not None:
1905 xml_encoding
= xml_encoding_match
.groups()[0].decode(
1908 self
.declaredHTMLEncoding
= xml_encoding
1909 if sniffed_xml_encoding
and \
1910 (xml_encoding
in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1911 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1912 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1914 xml_encoding
= sniffed_xml_encoding
1915 return xml_data
, xml_encoding
, sniffed_xml_encoding
1918 def find_codec(self
, charset
):
1919 return self
._codec
(self
.CHARSET_ALIASES
.get(charset
, charset
)) \
1920 or (charset
and self
._codec
(charset
.replace("-", ""))) \
1921 or (charset
and self
._codec
(charset
.replace("-", "_"))) \
1924 def _codec(self
, charset
):
1925 if not charset
: return charset
1928 codecs
.lookup(charset
)
1930 except (LookupError, ValueError):
1934 EBCDIC_TO_ASCII_MAP
= None
1935 def _ebcdic_to_ascii(self
, s
):
1937 if not c
.EBCDIC_TO_ASCII_MAP
:
1938 emap
= (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1939 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1940 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1941 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1942 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1943 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1944 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1945 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1946 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1947 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1948 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1949 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1950 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1951 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1952 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1953 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1954 250,251,252,253,254,255)
1956 c
.EBCDIC_TO_ASCII_MAP
= string
.maketrans( \
1957 ''.join(map(chr, range(256))), ''.join(map(chr, emap
)))
1958 return s
.translate(c
.EBCDIC_TO_ASCII_MAP
)
1960 MS_CHARS
= { '\x80' : ('euro', '20AC'),
1962 '\x82' : ('sbquo', '201A'),
1963 '\x83' : ('fnof', '192'),
1964 '\x84' : ('bdquo', '201E'),
1965 '\x85' : ('hellip', '2026'),
1966 '\x86' : ('dagger', '2020'),
1967 '\x87' : ('Dagger', '2021'),
1968 '\x88' : ('circ', '2C6'),
1969 '\x89' : ('permil', '2030'),
1970 '\x8A' : ('Scaron', '160'),
1971 '\x8B' : ('lsaquo', '2039'),
1972 '\x8C' : ('OElig', '152'),
1974 '\x8E' : ('#x17D', '17D'),
1977 '\x91' : ('lsquo', '2018'),
1978 '\x92' : ('rsquo', '2019'),
1979 '\x93' : ('ldquo', '201C'),
1980 '\x94' : ('rdquo', '201D'),
1981 '\x95' : ('bull', '2022'),
1982 '\x96' : ('ndash', '2013'),
1983 '\x97' : ('mdash', '2014'),
1984 '\x98' : ('tilde', '2DC'),
1985 '\x99' : ('trade', '2122'),
1986 '\x9a' : ('scaron', '161'),
1987 '\x9b' : ('rsaquo', '203A'),
1988 '\x9c' : ('oelig', '153'),
1990 '\x9e' : ('#x17E', '17E'),
1991 '\x9f' : ('Yuml', ''),}
1993 #######################################################################
1996 #By default, act as an HTML pretty-printer.
1997 if __name__
== '__main__':
1999 soup
= BeautifulSoup(sys
.stdin
)
2000 print soup
.prettify()