some project update
[worddb.git] / libs / BeautifulSoup.py
blob0e214630c80b0ac44bab534eb5777ebc8e5608c2
1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the
40 documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2008, Leonard Richardson
47 All rights reserved.
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
51 met:
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
78 """
79 from __future__ import generators
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 __version__ = "3.0.7a"
83 __copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
84 __license__ = "New-style BSD"
86 from sgmllib import SGMLParser, SGMLParseError
87 import codecs
88 import markupbase
89 import types
90 import re
91 import sgmllib
92 try:
93 from htmlentitydefs import name2codepoint
94 except ImportError:
95 name2codepoint = {}
96 try:
97 set
98 except NameError:
99 from sets import Set as set
101 #These hacks make Beautiful Soup able to parse XML with namespaces
102 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
103 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
105 DEFAULT_OUTPUT_ENCODING = "utf-8"
107 # First, the classes that represent markup elements.
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element in the parse tree carries five pointers maintained by
    this class: parent, previous/next (document order), and
    previousSibling/nextSibling (same-parent order)."""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # We become the newest child: hook ourselves up after the
            # parent's current last child.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Removes this element from the tree and puts replaceWith in
        its place under the same parent."""
        oldParent = self.parent
        myIndex = self.parent.contents.index(self)
        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
            # We're replacing this element with one of its siblings.
            index = self.parent.contents.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                self.parent.contents.remove(self)
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        # Returning self lets callers keep using the extracted subtree.
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild into this element's contents at the given
        position, rewiring all navigation pointers. Plain strings are
        promoted to NavigableString first."""
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent != None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent == self:
                # NOTE(review): find() returns the matched element, not
                # a numeric index; the comparison below relies on
                # Python 2's cross-type ordering — confirm intent.
                index = self.find(newChild)
                if index and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            # Appending at the end: the element after newChild (in
            # document order) is the next sibling of the nearest
            # ancestor that has one.
            newChild.nextSibling = None

            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        # NOTE(review): **kwargs is accepted but not forwarded to
        # findParents — confirm whether keyword criteria should apply.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Runs a find-all style method with limit 1 and unwraps the
        single result (or None)."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            # The generators below yield a trailing None; skip it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    def nextGenerator(self):
        """Yields each element after this one in document order
        (ends by yielding None once)."""
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        """Yields each following sibling (ends by yielding None once)."""
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        """Yields each element before this one in document order
        (ends by yielding None once)."""
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        """Yields each preceding sibling (ends by yielding None once)."""
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        """Yields each ancestor up to the root (ends by yielding None once)."""
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder with the given
        encoding (defaulting to utf-8)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a byte string in some encoding, or to
        Unicode if encoding is None."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
class NavigableString(unicode, PageElement):
    """A Unicode string that also carries PageElement navigation
    pointers, so text nodes can be traversed like tags."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle support: reconstruct from the encoded str() form.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def __unicode__(self):
        # Round-trips through str() so subclasses' wrappers are included.
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # With an encoding, returns an encoded byte string; with None,
        # returns the Unicode object itself.
        if encoding:
            return self.encode(encoding)
        else:
            return self
class CData(NavigableString):
    """A CDATA section: text rendered inside a <![CDATA[...]]> wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Encode the underlying text first, then wrap it.
        inner = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % inner
class ProcessingInstruction(NavigableString):
    """A processing instruction, rendered as <?...?>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = self
        # PIs may carry the encoding placeholder; substitute it in.
        if "%SOUP-ENCODING%" in rendered:
            rendered = self.substituteEncoding(rendered, encoding)
        return "<?%s?>" % self.toEncoding(rendered, encoding)
class Comment(NavigableString):
    """An HTML/XML comment node, rendered as <!--...-->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % body
class Declaration(NavigableString):
    """A markup declaration (e.g. a DOCTYPE), rendered as <!...>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!%s>" % body
453 class Tag(PageElement):
455 """Represents a found HTML tag with its attributes and contents."""
457 def _invert(h):
458 "Cheap function to invert a hash."
459 i = {}
460 for k,v in h.items():
461 i[v] = k
462 return i
464 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
465 "quot" : '"',
466 "amp" : "&",
467 "lt" : "<",
468 "gt" : ">" }
470 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
472 def _convertEntities(self, match):
473 """Used in a call to re.sub to replace HTML, XML, and numeric
474 entities with the appropriate Unicode characters. If HTML
475 entities are being converted, any unrecognized entities are
476 escaped."""
477 x = match.group(1)
478 if self.convertHTMLEntities and x in name2codepoint:
479 return unichr(name2codepoint[x])
480 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
481 if self.convertXMLEntities:
482 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
483 else:
484 return u'&%s;' % x
485 elif len(x) > 0 and x[0] == '#':
486 # Handle numeric entities
487 if len(x) > 1 and x[1] == 'x':
488 return unichr(int(x[2:], 16))
489 else:
490 return unichr(int(x[1:]))
492 elif self.escapeUnrecognizedEntities:
493 return u'&amp;%s;' % x
494 else:
495 return u'&%s;' % x
497 def __init__(self, parser, name, attrs=None, parent=None,
498 previous=None):
499 "Basic constructor."
501 # We don't actually store the parser object: that lets extracted
502 # chunks be garbage-collected
503 self.parserClass = parser.__class__
504 self.isSelfClosing = parser.isSelfClosingTag(name)
505 self.name = name
506 if attrs == None:
507 attrs = []
508 self.attrs = attrs
509 self.contents = []
510 self.setup(parent, previous)
511 self.hidden = False
512 self.containsSubstitutions = False
513 self.convertHTMLEntities = parser.convertHTMLEntities
514 self.convertXMLEntities = parser.convertXMLEntities
515 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
517 # Convert any HTML, XML, or numeric entities in the attribute values.
518 convert = lambda(k, val): (k,
519 re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
520 self._convertEntities,
521 val))
522 self.attrs = map(convert, self.attrs)
524 def get(self, key, default=None):
525 """Returns the value of the 'key' attribute for the tag, or
526 the value given for 'default' if it doesn't have that
527 attribute."""
528 return self._getAttrMap().get(key, default)
530 def has_key(self, key):
531 return self._getAttrMap().has_key(key)
533 def __getitem__(self, key):
534 """tag[key] returns the value of the 'key' attribute for the tag,
535 and throws an exception if it's not there."""
536 return self._getAttrMap()[key]
538 def __iter__(self):
539 "Iterating over a tag iterates over its contents."
540 return iter(self.contents)
542 def __len__(self):
543 "The length of a tag is the length of its list of contents."
544 return len(self.contents)
546 def __contains__(self, x):
547 return x in self.contents
549 def __nonzero__(self):
550 "A tag is non-None even if it has no contents."
551 return True
553 def __setitem__(self, key, value):
554 """Setting tag[key] sets the value of the 'key' attribute for the
555 tag."""
556 self._getAttrMap()
557 self.attrMap[key] = value
558 found = False
559 for i in range(0, len(self.attrs)):
560 if self.attrs[i][0] == key:
561 self.attrs[i] = (key, value)
562 found = True
563 if not found:
564 self.attrs.append((key, value))
565 self._getAttrMap()[key] = value
567 def __delitem__(self, key):
568 "Deleting tag[key] deletes all 'key' attributes for the tag."
569 for item in self.attrs:
570 if item[0] == key:
571 self.attrs.remove(item)
572 #We don't break because bad HTML can define the same
573 #attribute multiple times.
574 self._getAttrMap()
575 if self.attrMap.has_key(key):
576 del self.attrMap[key]
578 def __call__(self, *args, **kwargs):
579 """Calling a tag like a function is the same as calling its
580 findAll() method. Eg. tag('a') returns a list of all the A tags
581 found within this tag."""
582 return apply(self.findAll, args, kwargs)
584 def __getattr__(self, tag):
585 #print "Getattr %s.%s" % (self.__class__, tag)
586 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
587 return self.find(tag[:-3])
588 elif tag.find('__') != 0:
589 return self.find(tag)
590 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
592 def __eq__(self, other):
593 """Returns true iff this tag has the same name, the same attributes,
594 and the same contents (recursively) as the given tag.
596 NOTE: right now this will return false if two tags have the
597 same attributes in a different order. Should this be fixed?"""
598 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
599 return False
600 for i in range(0, len(self.contents)):
601 if self.contents[i] != other.contents[i]:
602 return False
603 return True
605 def __ne__(self, other):
606 """Returns true iff this tag is not identical to the other tag,
607 as defined in __eq__."""
608 return not self == other
610 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
611 """Renders this tag as a string."""
612 return self.__str__(encoding)
614 def __unicode__(self):
615 return self.__str__(None)
617 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
618 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
619 + ")")
621 def _sub_entity(self, x):
622 """Used with a regular expression to substitute the
623 appropriate XML entity for an XML special character."""
624 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
626 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
627 prettyPrint=False, indentLevel=0):
628 """Returns a string or Unicode representation of this tag and
629 its contents. To get Unicode, pass None for encoding.
631 NOTE: since Python's HTML parser consumes whitespace, this
632 method is not certain to reproduce the whitespace present in
633 the original string."""
635 encodedName = self.toEncoding(self.name, encoding)
637 attrs = []
638 if self.attrs:
639 for key, val in self.attrs:
640 fmt = '%s="%s"'
641 if isString(val):
642 if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
643 val = self.substituteEncoding(val, encoding)
645 # The attribute value either:
647 # * Contains no embedded double quotes or single quotes.
648 # No problem: we enclose it in double quotes.
649 # * Contains embedded single quotes. No problem:
650 # double quotes work here too.
651 # * Contains embedded double quotes. No problem:
652 # we enclose it in single quotes.
653 # * Embeds both single _and_ double quotes. This
654 # can't happen naturally, but it can happen if
655 # you modify an attribute value after parsing
656 # the document. Now we have a bit of a
657 # problem. We solve it by enclosing the
658 # attribute in single quotes, and escaping any
659 # embedded single quotes to XML entities.
660 if '"' in val:
661 fmt = "%s='%s'"
662 if "'" in val:
663 # TODO: replace with apos when
664 # appropriate.
665 val = val.replace("'", "&squot;")
667 # Now we're okay w/r/t quotes. But the attribute
668 # value might also contain angle brackets, or
669 # ampersands that aren't part of entities. We need
670 # to escape those to XML entities too.
671 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
673 attrs.append(fmt % (self.toEncoding(key, encoding),
674 self.toEncoding(val, encoding)))
675 close = ''
676 closeTag = ''
677 if self.isSelfClosing:
678 close = ' /'
679 else:
680 closeTag = '</%s>' % encodedName
682 indentTag, indentContents = 0, 0
683 if prettyPrint:
684 indentTag = indentLevel
685 space = (' ' * (indentTag-1))
686 indentContents = indentTag + 1
687 contents = self.renderContents(encoding, prettyPrint, indentContents)
688 if self.hidden:
689 s = contents
690 else:
691 s = []
692 attributeString = ''
693 if attrs:
694 attributeString = ' ' + ' '.join(attrs)
695 if prettyPrint:
696 s.append(space)
697 s.append('<%s%s%s>' % (encodedName, attributeString, close))
698 if prettyPrint:
699 s.append("\n")
700 s.append(contents)
701 if prettyPrint and contents and contents[-1] != "\n":
702 s.append("\n")
703 if prettyPrint and closeTag:
704 s.append(space)
705 s.append(closeTag)
706 if prettyPrint and closeTag and self.nextSibling:
707 s.append("\n")
708 s = ''.join(s)
709 return s
711 def decompose(self):
712 """Recursively destroys the contents of this tree."""
713 contents = [i for i in self.contents]
714 for i in contents:
715 if isinstance(i, Tag):
716 i.decompose()
717 else:
718 i.extract()
719 self.extract()
721 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
722 return self.__str__(encoding, True)
724 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
725 prettyPrint=False, indentLevel=0):
726 """Renders the contents of this tag as a string in the given
727 encoding. If encoding is None, returns a Unicode string.."""
728 s=[]
729 for c in self:
730 text = None
731 if isinstance(c, NavigableString):
732 text = c.__str__(encoding)
733 elif isinstance(c, Tag):
734 s.append(c.__str__(encoding, prettyPrint, indentLevel))
735 if text and prettyPrint:
736 text = text.strip()
737 if text:
738 if prettyPrint:
739 s.append(" " * (indentLevel-1))
740 s.append(text)
741 if prettyPrint:
742 s.append("\n")
743 return ''.join(s)
745 #Soup methods
747 def find(self, name=None, attrs={}, recursive=True, text=None,
748 **kwargs):
749 """Return only the first child of this Tag matching the given
750 criteria."""
751 r = None
752 l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
753 if l:
754 r = l[0]
755 return r
756 findChild = find
758 def findAll(self, name=None, attrs={}, recursive=True, text=None,
759 limit=None, **kwargs):
760 """Extracts a list of Tag objects that match the given
761 criteria. You can specify the name of the Tag and any
762 attributes you want the Tag to have.
764 The value of a key-value pair in the 'attrs' map can be a
765 string, a list of strings, a regular expression object, or a
766 callable that takes a string and returns whether or not the
767 string matches for some custom definition of 'matches'. The
768 same is true of the tag name."""
769 generator = self.recursiveChildGenerator
770 if not recursive:
771 generator = self.childGenerator
772 return self._findAll(name, attrs, text, limit, generator, **kwargs)
773 findChildren = findAll
775 # Pre-3.x compatibility methods
776 first = find
777 fetch = findAll
779 def fetchText(self, text=None, recursive=True, limit=None):
780 return self.findAll(text=text, recursive=recursive, limit=limit)
782 def firstText(self, text=None, recursive=True):
783 return self.find(text=text, recursive=recursive)
785 #Private methods
787 def _getAttrMap(self):
788 """Initializes a map representation of this tag's attributes,
789 if not already initialized."""
790 if not getattr(self, 'attrMap'):
791 self.attrMap = {}
792 for (key, value) in self.attrs:
793 self.attrMap[key] = value
794 return self.attrMap
796 #Generator methods
797 def childGenerator(self):
798 for i in range(0, len(self.contents)):
799 yield self.contents[i]
800 raise StopIteration
802 def recursiveChildGenerator(self):
803 stack = [(self, 0)]
804 while stack:
805 tag, start = stack.pop()
806 if isinstance(tag, Tag):
807 for i in range(start, len(tag.contents)):
808 a = tag.contents[i]
809 yield a
810 if isinstance(a, Tag) and tag.contents:
811 if i < len(tag.contents) - 1:
812 stack.append((tag, i+1))
813 stack.append((a, 0))
814 break
815 raise StopIteration
817 # Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        # name/attrs/text may each be a string, regexp, list, callable,
        # or True; see _matches for the supported forms.
        self.name = name
        if isString(attrs):
            # A bare string for attrs is shorthand for a CSS class match.
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Matches this strainer against a tag (or a tag name plus an
        attribute list) and returns the matched object, or None."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            # A Tag supports .get(), so it can serve as its own attr map.
            markupAttrs = markup
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Lazily build a dict view of the attributes.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Matches this strainer against any kind of markup (list, Tag,
        or text) and returns the matched object, or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found

    def _matches(self, markup, matchAgainst):
        """Returns whether a single value matches a single criterion
        (True, callable, regexp, list, map, or string)."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            # True matches anything that exists.
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): markup is a string or None here, which
                # has no has_key — this branch looks unreachable without
                # raising; confirm intended semantics.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                # Coerce the criterion to the markup's string type
                # before the equality check below.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Creates an empty result list remembering 'source', the
        SoupStrainer that produced it."""
        # BUG FIX: the original called list.__init__([]), initializing a
        # throwaway list instead of this instance.
        list.__init__(self)
        self.source = source
944 # Now, some helper functions.
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    # Anything advertising the iterator protocol counts as listlike.
    if hasattr(l, '__iter__'):
        return True
    # Fall back to an explicit type check for older sequence objects.
    return type(l) in (types.ListType, types.TupleType)
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        # Unicode objects and anything derived from basestring qualify.
        if isinstance(s, unicode):
            return True
        return isinstance(s, basestring)
    except NameError:
        # Builds without the unicode type: plain str is the only option.
        return isinstance(s, str)
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    result = {}
    for piece in args:
        if hasattr(piece, 'items'):
            # A map: copy its key-value pairs straight through.
            for key, value in piece.items():
                result[key] = value
        elif isList(piece):
            # A list: every entry maps to the shared default.
            for key in piece:
                result[key] = default
        else:
            # A lone scalar: it too maps to the default.
            result[piece] = default
    return result
979 # Now, the parser classes.
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Default input-sanitizing transforms applied by _feed when
    # markupMassage is true: each entry is a (compiled regex,
    # replacement callable) pair.  The list's closing bracket was lost
    # in transit and is restored here.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter: which of the XML/HTML entity tables to consult,
        # and whether unrecognized entities get their '&' escaped.
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            # No entity conversion at all: entities pass through as-is.
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        # Per-instance self-closing tags, merged on top of the
        # class-level SELF_CLOSING_TAGS by isSelfClosingTag.
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised deliberately (e.g. by start_meta) to abort a pass
            # over the document; the tree built so far stands.
            pass
        self.markup = None                 # The markup can now be GCed
1094 def convert_charref(self, name):
1095 """This method fixes a bug in Python's SGMLParser."""
1096 try:
1097 n = int(name)
1098 except ValueError:
1099 return
1100 if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1101 return
1102 return self.convert_codepoint(n)
    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert the markup to Unicode, optionally scrub it with the
        markupMassage regexes, and run it through SGMLParser, closing
        any tags left open at the end."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            # Already Unicode; nothing was sniffed.
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage may be True (use the class default
                # transform list) or a caller-supplied list of
                # (regex, replacement) pairs.
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()
1137 def __getattr__(self, methodName):
1138 """This method routes method call requests to either the SGMLParser
1139 superclass or the Tag superclass, depending on the method name."""
1140 #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1142 if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1143 or methodName.find('do_') == 0:
1144 return SGMLParser.__getattr__(self, methodName)
1145 elif methodName.find('__') != 0:
1146 return Tag.__getattr__(self, methodName)
1147 else:
1148 raise AttributeError
1150 def isSelfClosingTag(self, name):
1151 """Returns true iff the given string is the name of a
1152 self-closing tag according to this parser."""
1153 return self.SELF_CLOSING_TAGS.has_key(name) \
1154 or self.instanceSelfClosingTags.has_key(name)
    def reset(self):
        """Reset both parent classes and the parse state so the soup
        object itself becomes the root of a fresh, empty tree."""
        # The soup object doubles as the root tag of the document.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        # Push the root (self) so there is always a current tag.
        self.pushTag(self)
1166 def popTag(self):
1167 tag = self.tagStack.pop()
1168 # Tags with just one string-owning child get the child as a
1169 # 'string' property, so that soup.tag.string is shorthand for
1170 # soup.tag.contents[0]
1171 if len(self.currentTag.contents) == 1 and \
1172 isinstance(self.currentTag.contents[0], NavigableString):
1173 self.currentTag.string = self.currentTag.contents[0]
1175 #print "Pop", tag.name
1176 if self.tagStack:
1177 self.currentTag = self.tagStack[-1]
1178 return self.currentTag
1180 def pushTag(self, tag):
1181 #print "Push", tag.name
1182 if self.currentTag:
1183 self.currentTag.contents.append(tag)
1184 self.tagStack.append(tag)
1185 self.currentTag = self.tagStack[-1]
    def endData(self, containerClass=NavigableString):
        """Flush the buffered character data into the tree as a single
        containerClass node, collapsing all-ASCII-whitespace runs to one
        space (or newline) unless we're inside a whitespace-preserving
        tag."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            # All-ASCII-whitespace text collapses to ' ' or '\n' --
            # but fancy Unicode spaces are left alone (STRIP_ASCII_SPACES
            # only strips the ASCII ones), and nothing is collapsed
            # inside PRESERVE_WHITESPACE_TAGS (e.g. <pre>).
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # With a SoupStrainer active and no open tag of interest,
            # top-level text that the strainer rejects is dropped.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            # Link the new node into the tree and the next/previous chain.
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)
1210 def _popToTag(self, name, inclusivePop=True):
1211 """Pops the tag stack up to and including the most recent
1212 instance of the given tag. If inclusivePop is false, pops the tag
1213 stack up to but *not* including the most recent instqance of
1214 the given tag."""
1215 #print "Popping to %s" % name
1216 if name == self.ROOT_TAG_NAME:
1217 return
1219 numPops = 0
1220 mostRecentTag = None
1221 for i in range(len(self.tagStack)-1, 0, -1):
1222 if name == self.tagStack[i].name:
1223 numPops = len(self.tagStack)-i
1224 break
1225 if not inclusivePop:
1226 numPops = numPops - 1
1228 for i in range(0, numPops):
1229 mostRecentTag = self.popTag()
1230 return mostRecentTag
    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        # NESTABLE_TAGS maps a tag name to the list of tags that reset
        # its nesting; absence means the tag is not nestable at all.
        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment is dead -- p is rebound from
            # the tag stack at the top of the next iteration.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)
    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle an opening tag: inside a quoted section it becomes
        literal text; otherwise smart-pop to a sensible parent, honor
        any SoupStrainer, and push a new Tag onto the tree."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        # Self-closing tags never enclose anything, so nothing to pop.
        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # With a SoupStrainer active, drop top-level tags it rejects.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            # Entering a literal section (e.g. <script>): buffer
            # everything as text until the matching end tag.
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag
1308 def unknown_endtag(self, name):
1309 #print "End tag %s" % name
1310 if self.quoteStack and self.quoteStack[-1] != name:
1311 #This is not a real end tag.
1312 #print "</%s> is not real!" % name
1313 self.handle_data('</%s>' % name)
1314 return
1315 self.endData()
1316 self._popToTag(name)
1317 if self.quoteStack and self.quoteStack[-1] == name:
1318 self.quoteStack.pop()
1319 self.literal = (len(self.quoteStack) > 0)
    def handle_data(self, data):
        # Buffer raw character data; endData() joins the buffer into a
        # single NavigableString and attaches it to the tree.
        self.currentData.append(data)
    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        # Flush any pending data first, then flush *text* on its own so
        # it lands in the tree wrapped in the requested subclass.
        self.endData()
        self.handle_data(text)
        self.endData(subclass)
1331 def handle_pi(self, text):
1332 """Handle a processing instruction as a ProcessingInstruction
1333 object, possibly one with a %SOUP-ENCODING% slot into which an
1334 encoding will be plugged later."""
1335 if text[:3] == "xml":
1336 text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1337 self._toStringSubclass(text, ProcessingInstruction)
    def handle_comment(self, text):
        "Handle comments as Comment objects."
        # The comment body is stored verbatim in a Comment node.
        self._toStringSubclass(text, Comment)
1343 def handle_charref(self, ref):
1344 "Handle character references as data."
1345 if self.convertEntities:
1346 data = unichr(int(ref))
1347 else:
1348 data = '&#%s;' % ref
1349 self.handle_data(data)
    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                # Known HTML entity -> Unicode character.
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
            # Known XML entity (&lt;, &gt;, &amp;, ...) -> character.
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)
    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        # The declaration text is stored verbatim in a Declaration node.
        self._toStringSubclass(data, Declaration)
1398 def parse_declaration(self, i):
1399 """Treat a bogus SGML declaration as raw data. Treat a CDATA
1400 declaration as a CData object."""
1401 j = None
1402 if self.rawdata[i:i+9] == '<![CDATA[':
1403 k = self.rawdata.find(']]>', i)
1404 if k == -1:
1405 k = len(self.rawdata)
1406 data = self.rawdata[i+9:k]
1407 j = k+3
1408 self._toStringSubclass(data, CData)
1409 else:
1410 try:
1411 j = SGMLParser.parse_declaration(self, i)
1412 except SGMLParseError:
1413 toHandle = self.rawdata[i:]
1414 self.handle_data(toHandle)
1415 j = i + len(toHandle)
1416 return j
1418 class BeautifulSoup(BeautifulStoneSoup):
1420 """This parser knows the following facts about HTML:
1422 * Some tags have no closing tag and should be interpreted as being
1423 closed as soon as they are encountered.
1425 * The text inside some tags (ie. 'script') may contain tags which
1426 are not really part of the document and which should be parsed
1427 as text, not tags. If you want to parse the text as tags, you can
1428 always fetch it and parse it explicitly.
1430 * Tag nesting rules:
1432 Most tags can't be nested at all. For instance, the occurance of
1433 a <p> tag should implicitly close the previous <p> tag.
1435 <p>Para1<p>Para2
1436 should be transformed into:
1437 <p>Para1</p><p>Para2
1439 Some tags can be nested arbitrarily. For instance, the occurance
1440 of a <blockquote> tag should _not_ implicitly close the previous
1441 <blockquote> tag.
1443 Alice said: <blockquote>Bob said: <blockquote>Blah
1444 should NOT be transformed into:
1445 Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1447 Some tags can be nested, but the nesting is reset by the
1448 interposition of other tags. For instance, a <tr> tag should
1449 implicitly close the previous <tr> tag within the same <table>,
1450 but not close a <tr> tag in another table.
1452 <table><tr>Blah<tr>Blah
1453 should be transformed into:
1454 <table><tr>Blah</tr><tr>Blah
1455 but,
1456 <tr>Blah<table><tr>Blah
1457 should NOT be transformed into
1458 <tr>Blah<table></tr><tr>Blah
1460 Differing assumptions about tag nesting rules are a major source
1461 of problems with the BeautifulSoup class. If BeautifulSoup is not
1462 treating as nestable a tag your page author treats as nestable,
1463 try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1464 BeautifulStoneSoup before writing your own subclass."""
1466 def __init__(self, *args, **kwargs):
1467 if not kwargs.has_key('smartQuotesTo'):
1468 kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1469 kwargs['isHTML'] = True
1470 BeautifulStoneSoup.__init__(self, *args, **kwargs)
1472 SELF_CLOSING_TAGS = buildTagMap(None,
1473 ['br' , 'hr', 'input', 'img', 'meta',
1474 'spacer', 'link', 'frame', 'base'])
1476 PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
1478 QUOTE_TAGS = {'script' : None, 'textarea' : None}
1480 #According to the HTML standard, each of these inline tags can
1481 #contain another tag of the same type. Furthermore, it's common
1482 #to actually use these tags this way.
1483 NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1484 'center']
1486 #According to the HTML standard, these block tags can contain
1487 #another tag of the same type. Furthermore, it's common
1488 #to actually use these tags this way.
1489 NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1491 #Lists can contain other lists, but there are restrictions.
1492 NESTABLE_LIST_TAGS = { 'ol' : [],
1493 'ul' : [],
1494 'li' : ['ul', 'ol'],
1495 'dl' : [],
1496 'dd' : ['dl'],
1497 'dt' : ['dl'] }
1499 #Tables can contain other tables, but there are restrictions.
1500 NESTABLE_TABLE_TAGS = {'table' : [],
1501 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1502 'td' : ['tr'],
1503 'th' : ['tr'],
1504 'thead' : ['table'],
1505 'tbody' : ['table'],
1506 'tfoot' : ['table'],
1509 NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1511 #If one of these tags is encountered, all tags up to the next tag of
1512 #this type are popped.
1513 RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1514 NON_NESTABLE_BLOCK_TAGS,
1515 NESTABLE_LIST_TAGS,
1516 NESTABLE_TABLE_TAGS)
1518 NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1519 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1521 # Used to detect the charset in a META tag; see start_meta
1522 CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull out the http-equiv and content attributes, remembering
        # where 'content' sits so it can be rewritten in place.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        # Abort this pass; __init__ swallows StopParsing.
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            # Mark the tag so output substitutes the real encoding.
            tag.containsSubstitutions = True
class StopParsing(Exception):
    # Raised to abort a parsing pass early -- e.g. start_meta raises it
    # after re-feeding the document with a newly discovered encoding;
    # BeautifulStoneSoup.__init__ catches and swallows it.
    pass
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # 'strong' and 'big' each appeared twice in the original list;
    # the duplicates are removed here.  Parse behavior is unchanged,
    # since buildTagMap keys these into a dict anyway.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap's first parameter is the *default
    # value*, so buildTagMap('noscript') receives no tag portions and
    # evaluates to {} -- 'noscript' never becomes a key.  Possibly
    # buildTagMap(None, 'noscript') was intended; confirm before
    # changing, as fixing it would alter parse behavior.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the normal pop, promote a single-string child tag into
        # an attribute on its parent -- unless the parent already has
        # an attribute by that name.
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1654 #Enterprise class names! It has come to our attention that some people
1655 #think the names of the Beautiful Soup parser classes are too silly
1656 #and "unprofessional" for use in enterprise screen-scraping. We feel
1657 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1658 #All-Night Kosher Bakery recommends renaming this file to
1659 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1660 #"RobustParserBeanInterface.class") and using the following
1661 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    # Enterprise-friendly alias for BeautifulStoneSoup.
    pass
class RobustHTMLParser(BeautifulSoup):
    # Enterprise-friendly alias for BeautifulSoup.
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    # Enterprise-friendly alias for ICantBelieveItsBeautifulSoup.
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    # Enterprise-friendly alias for MinimalSoup.
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    # Enterprise-friendly alias for BeautifulSOAP.
    pass
1673 ######################################################
1675 # Bonus library: Unicode, Dammit
1677 # This class forces XML data into a standard format (usually to UTF-8
1678 # or Unicode). It is heavily based on code from Mark Pilgrim's
1679 # Universal Feed Parser. It does not rewrite the XML or HTML to
1680 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1681 # (XML) and BeautifulSoup.start_meta (HTML).
1683 # Autodetects character encodings.
1684 # Download from http://chardet.feedparser.org/
1685 try:
1686 import chardet
1687 # import chardet.constants
1688 # chardet.constants._debug = 1
1689 except ImportError:
1690 chardet = None
1692 # cjkcodecs and iconv_codec make Python know about more character encodings.
1693 # Both are available from http://cjkpython.i18n.org/
1694 # They're built in if you use Python 2.4.
1695 try:
1696 import cjkcodecs.aliases
1697 except ImportError:
1698 pass
1699 try:
1700 import iconv_codec
1701 except ImportError:
1702 pass
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }
1717 def __init__(self, markup, overrideEncodings=[],
1718 smartQuotesTo='xml', isHTML=False):
1719 self.declaredHTMLEncoding = None
1720 self.markup, documentEncoding, sniffedEncoding = \
1721 self._detectEncoding(markup, isHTML)
1722 self.smartQuotesTo = smartQuotesTo
1723 self.triedEncodings = []
1724 if markup == '' or isinstance(markup, unicode):
1725 self.originalEncoding = None
1726 self.unicode = unicode(markup)
1727 return
1729 u = None
1730 for proposedEncoding in overrideEncodings:
1731 u = self._convertFrom(proposedEncoding)
1732 if u: break
1733 if not u:
1734 for proposedEncoding in (documentEncoding, sniffedEncoding):
1735 u = self._convertFrom(proposedEncoding)
1736 if u: break
1738 # If no luck and we have auto-detection library, try that:
1739 if not u and chardet and not isinstance(self.markup, unicode):
1740 u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1742 # As a last resort, try utf-8 and windows-1252:
1743 if not u:
1744 for proposed_encoding in ("utf-8", "windows-1252"):
1745 u = self._convertFrom(proposed_encoding)
1746 if u: break
1748 self.unicode = u
1749 if not u: self.originalEncoding = None
1751 def _subMSChar(self, orig):
1752 """Changes a MS smart quote character to an XML or HTML
1753 entity."""
1754 sub = self.MS_CHARS.get(orig)
1755 if type(sub) == types.TupleType:
1756 if self.smartQuotesTo == 'xml':
1757 sub = '&#x%s;' % sub[1]
1758 else:
1759 sub = '&%s;' % sub[0]
1760 return sub
    def _convertFrom(self, proposed):
        """Try to decode self.markup using the *proposed* encoding.
        Returns the decoded markup on success, None on failure; each
        codec is attempted at most once (tracked in triedEncodings)."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            # Success: remember both the decoded markup and the codec.
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup
1790 def _toUnicode(self, data, encoding):
1791 '''Given a string and its encoding, decodes the string into Unicode.
1792 %encoding is a string recognized by encodings.aliases'''
1794 # strip Byte Order Mark (if present)
1795 if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1796 and (data[2:4] != '\x00\x00'):
1797 encoding = 'utf-16be'
1798 data = data[2:]
1799 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1800 and (data[2:4] != '\x00\x00'):
1801 encoding = 'utf-16le'
1802 data = data[2:]
1803 elif data[:3] == '\xef\xbb\xbf':
1804 encoding = 'utf-8'
1805 data = data[3:]
1806 elif data[:4] == '\x00\x00\xfe\xff':
1807 encoding = 'utf-32be'
1808 data = data[4:]
1809 elif data[:4] == '\xff\xfe\x00\x00':
1810 encoding = 'utf-32le'
1811 data = data[4:]
1812 newdata = unicode(data, encoding)
1813 return newdata
    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns (xml_data, xml_encoding, sniffed_xml_encoding): the
        (possibly re-encoded) data, the encoding declared inside the
        document (if any), and the encoding sniffed from a BOM or byte
        pattern (if any)."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            # NOTE(review): bare except -- any failure during sniffing
            # falls through to the declaration regexes below.  The
            # assignment here is immediately overwritten.
            xml_encoding_match = None
        # Look for an explicit encoding in the XML declaration.
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            # Fall back to an HTML <meta ... charset=...> declaration.
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                # The BOM/byte-pattern sniff is more specific than a
                # generic UTF-16/UTF-32 declaration; prefer it.
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding
1883 def find_codec(self, charset):
1884 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1885 or (charset and self._codec(charset.replace("-", ""))) \
1886 or (charset and self._codec(charset.replace("-", "_"))) \
1887 or charset
1889 def _codec(self, charset):
1890 if not charset: return charset
1891 codec = None
1892 try:
1893 codecs.lookup(charset)
1894 codec = charset
1895 except (LookupError, ValueError):
1896 pass
1897 return codec
    # Lazily-built 256-byte translation table, shared by all instances;
    # filled in on the first call to _ebcdic_to_ascii.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the EBCDIC byte string *s* into its ASCII equivalent.

        Builds (once, cached on the class) a 256-entry translation table
        mapping each EBCDIC byte value to the corresponding ASCII/latin-1
        byte value, then applies it with str.translate.
        """
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            # emap[i] is the ASCII/latin-1 code point for EBCDIC byte i.
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            # NOTE(review): string.maketrans is Python 2 only (removed in
            # Python 3, where bytes.maketrans replaces it).
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)
    # Windows-1252 "smart" characters in the 0x80-0x9F byte range, mapped
    # to either an (entity-name, hex-codepoint) tuple or a literal
    # replacement string ('?' or ' ') where no named entity applies.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
#######################################################################


# By default, act as an HTML pretty-printer: parse whatever arrives on
# standard input and write the indented parse tree to standard output.
if __name__ == '__main__':
    import sys
    print(BeautifulSoup(sys.stdin).prettify())