2 """Universal feed parser
4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
6 Visit http://feedparser.org/ for the latest version
7 Visit http://feedparser.org/docs/ for the latest documentation
9 Required: Python 2.1 or later
10 Recommended: Python 2.3 or later
11 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
__version__ = "4.2-pre-" + "$Revision$"[11:14] + "-svn"
__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>",
                    "Sam Ruby <http://intertwingly.net/>",
                    "Ade Oshineye <http://blog.oshineye.com/>",
                    "Martin Pool <http://sourcefrog.net/>"]
_debug = 0
48 # HTTP "User-Agent" header to send to servers when downloading feeds.
49 # If you are embedding feedparser in a larger application, you should
50 # change this to your application name and URL.
51 USER_AGENT
= "UniversalFeedParser/%s +http://feedparser.org/" % __version__
53 # HTTP "Accept" header to send to servers when downloading feeds. If you don't
54 # want to send an Accept header, set this to None.
55 ACCEPT_HEADER
= "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
# List of preferred XML parsers, by SAX driver name.  These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1

# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
# ---------- Python 3 modules (make it work if possible) ----------
try:
    import rfc822
except ImportError:
    from email import _parseaddr as rfc822
try:
    # Python 3.1 introduces bytes.maketrans and simultaneously
    # deprecates string.maketrans; use bytes.maketrans if possible
    _maketrans = bytes.maketrans
except (NameError, AttributeError):
    import string
    _maketrans = string.maketrans
# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
    # Python 3.1 deprecates decodestring in favor of decodebytes
    _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
except:
    base64 = binascii = None
def _s2bytes(s):
    # Convert a UTF-8 str to bytes if the interpreter is Python 3
    try:
        return bytes(s, 'utf8')
    except (NameError, TypeError):
        # In Python 2.5 and below, bytes doesn't exist (NameError)
        # In Python 2.6 and above, bytes and str are the same (TypeError)
        return s
def _l2bytes(l):
    # Convert a list of ints to bytes if the interpreter is Python 3
    try:
        if bytes is not str:
            # In Python 2.6 and above, this call won't raise an exception
            # but it will return bytes([65]) as '[65]' instead of 'A'
            return bytes(l)
        raise NameError
    except NameError:
        return ''.join(map(chr, l))
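
# Illustrative behavior of the two helpers above under Python 2, where str is
# already a byte string:
#
#   >>> _s2bytes('hello')
#   'hello'
#   >>> _l2bytes([72, 101, 108, 108, 111])
#   'Hello'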
# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
    'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
    'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'msnim', 'skype', 'ssh', 'smb', 'ymsg',
)
#ACCEPTABLE_URI_SCHEMES = ()
# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, types, cgi, urllib, urllib2, datetime
try:
    from io import BytesIO as _StringIO
except ImportError:
    try:
        from cStringIO import StringIO as _StringIO
    except:
        from StringIO import StringIO as _StringIO
# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None
# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        for char, entity in entities:
            data = data.replace(char, entity)
        return data
# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass
# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None
# reversible htmlentitydefs mappings for Python 2.2
try:
    from htmlentitydefs import name2codepoint, codepoint2name
except ImportError:
    import htmlentitydefs
    name2codepoint = {}
    codepoint2name = {}
    for (name, codepoint) in htmlentitydefs.entitydefs.iteritems():
        if codepoint.startswith('&#'): codepoint = unichr(int(codepoint[2:-1]))
        name2codepoint[name] = ord(codepoint)
        codepoint2name[ord(codepoint)] = name
# BeautifulSoup parser used for parsing microformats from embedded HTML content
# http://www.crummy.com/software/BeautifulSoup/
# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
# older 2.x series.  If it doesn't, and you can figure out why, I'll accept a
# patch and modify the compatibility statement accordingly.
try:
    import BeautifulSoup
except:
    BeautifulSoup = None
# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
if sgmllib.endbracket.search(' <').start(0):
    class EndBracketRegEx:
        def __init__(self):
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self, string, index=0):
            match = self.endbracket.match(string, index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
            return None
    class EndBracketMatch:
        def __init__(self, match):
            self.match = match
        def start(self, n):
            return self.match.end(n)
    sgmllib.endbracket = EndBracketRegEx()
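
# Illustrative: with the override above, a tag whose attribute value contains
# an angle bracket, e.g. <img alt="size < 100" src="x.gif">, no longer gets
# truncated at the embedded '<' the way the stock sgmllib regex would do.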
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }
try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc
class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'enclosures':
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
        if key == 'license':
            for link in UserDict.__getitem__(self, 'links'):
                if link['rel']=='license' and link.has_key('href'):
                    return link['href']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.__contains__(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.__contains__(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)
    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)
    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]
    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.__contains__(self, key)
        except AttributeError:
            return False
    # This alias prevents the 2to3 tool from changing the semantics of the
    # __contains__ function below and exhausting the maximum recursion depth
    __has_key = has_key
    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key
    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.__has_key(key)
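
# Illustrative use of the keymap aliasing above (hypothetical values):
#
#   >>> d = FeedParserDict()
#   >>> d['updated'] = u'Thu, 01 Jan 2004 19:48:21 GMT'
#   >>> d['modified'] == d['updated']   # the old key is aliased to the new one
#   True
#   >>> d.updated                       # keys are also readable as attributes
#   u'Thu, 01 Jan 2004 19:48:21 GMT'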
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc
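
# Illustrative: a Zope-embedded application would call
# feedparser.zopeCompatibilityHack() once at startup, after which parse
# results are built from plain dictionaries instead of FeedParserDict
# (losing the attribute-style access shown above).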
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        _ebcdic_to_ascii_map = _maketrans( \
            _l2bytes(range(256)), _l2bytes(emap))
    return s.translate(_ebcdic_to_ascii_map)
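
# Illustrative round trip (the byte values are 'Hello' in EBCDIC cp037):
#
#   >>> _ebcdic_to_ascii(_l2bytes([200, 133, 147, 147, 150]))
#   'Hello'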
_cp1252 = {
  unichr(128): unichr(8364), # euro sign
  unichr(130): unichr(8218), # single low-9 quotation mark
  unichr(131): unichr( 402), # latin small letter f with hook
  unichr(132): unichr(8222), # double low-9 quotation mark
  unichr(133): unichr(8230), # horizontal ellipsis
  unichr(134): unichr(8224), # dagger
  unichr(135): unichr(8225), # double dagger
  unichr(136): unichr( 710), # modifier letter circumflex accent
  unichr(137): unichr(8240), # per mille sign
  unichr(138): unichr( 352), # latin capital letter s with caron
  unichr(139): unichr(8249), # single left-pointing angle quotation mark
  unichr(140): unichr( 338), # latin capital ligature oe
  unichr(142): unichr( 381), # latin capital letter z with caron
  unichr(145): unichr(8216), # left single quotation mark
  unichr(146): unichr(8217), # right single quotation mark
  unichr(147): unichr(8220), # left double quotation mark
  unichr(148): unichr(8221), # right double quotation mark
  unichr(149): unichr(8226), # bullet
  unichr(150): unichr(8211), # en dash
  unichr(151): unichr(8212), # em dash
  unichr(152): unichr( 732), # small tilde
  unichr(153): unichr(8482), # trade mark sign
  unichr(154): unichr( 353), # latin small letter s with caron
  unichr(155): unichr(8250), # single right-pointing angle quotation mark
  unichr(156): unichr( 339), # latin small ligature oe
  unichr(158): unichr( 382), # latin small letter z with caron
  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
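
# Illustrative: pop() below runs output through this table, so stray cp1252
# code points that survive decoding, e.g. u'\x93' and u'\x94', come out as the
# real curly quotes u'\u201c' and u'\u201d'.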
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    try:
        return urlparse.urljoin(base, uri)
    except:
        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
        return urlparse.urljoin(base, uri)
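
# Illustrative behavior, including the _urifixer cleanup of extra slashes
# after the scheme:
#
#   >>> _urljoin('http://example.org/a/b', 'c')
#   'http://example.org/a/c'
#   >>> _urljoin('http://example.org/', 'http:////example.org/feed.xml')
#   'http://example.org/feed.xml'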
class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/': 'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
                  'http://media.tangent.org/rss/1.0/': 'audio',
                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
                  'http://web.resource.org/cc/': 'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company': 'co',
                  'http://purl.org/rss/1.0/modules/content/': 'content',
                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
                  'http://purl.org/dc/elements/1.1/': 'dc',
                  'http://purl.org/dc/terms/': 'dcterms',
                  'http://purl.org/rss/1.0/modules/email/': 'email',
                  'http://purl.org/rss/1.0/modules/event/': 'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
                  'http://freshmeat.net/rss/fm/': 'fm',
                  'http://xmlns.com/foaf/0.1/': 'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
                  'http://postneo.com/icbm/': 'icbm',
                  'http://purl.org/rss/1.0/modules/image/': 'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://purl.org/rss/1.0/modules/link/': 'l',
                  'http://search.yahoo.com/mrss': 'media',
                  # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
                  'http://search.yahoo.com/mrss/': 'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
                  'http://purl.org/rss/1.0/modules/search/': 'search',
                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
                  'http://wellformedweb.org/commentAPI/': 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
                  'http://www.w3.org/1999/xhtml': 'xhtml',
                  'http://www.w3.org/1999/xlink': 'xlink',
                  'http://www.w3.org/XML/1998/namespace': 'xml'
                  }
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']
    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        self.svgOK = 0
        self.hasTitle = 0
        if baselang:
            self.feeddata['language'] = baselang.replace('_','-')
    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        # the sgml parser doesn't handle entities in attributes, but
        # strict xml parsers do -- account for this difference
        if isinstance(self, _LooseFeedParser):
            attrs = [(k, v.replace('&amp;', '&')) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if type(baseuri) != type(u''):
            try:
                baseuri = unicode(baseuri, self.encoding)
            except:
                baseuri = unicode(baseuri, 'iso-8859-1')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javascript:`)
        if self.baseuri:
            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
        else:
            self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang.replace('_','-')
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            if tag.find(':') <> -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns',namespace))
                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
                    attrs.append(('xmlns',namespace))
            if tag == 'svg': self.svgOK += 1
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
            if len(attrsD) == 0:
                # No attributes so merge it into the enclosing dictionary
                return self.push(unknown_tag, 1)
            else:
                # Has attributes so create it in its own dictionary
                context = self._getContext()
                context[unknown_tag] = attrsD
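
    # Illustrative dispatch: a start tag like <itunes:email> arrives here as
    # tag 'itunes:email'; the prefix is canonicalized through namespacemap and
    # the call is routed to self._start_itunes_email (defined further down).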
    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK: self.svgOK -= 1

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            if self.svgOK: raise AttributeError()
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities.keys():
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try: name2codepoint[ref]
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass
    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # CDATA block began but didn't finish
                k = len(self.rawdata)
                return k
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            if k >= 0:
                return k+1
            else:
                # We have an incomplete CDATA block.
                return k
    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType
    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri
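
    # Illustrative: a feed declaring xmlns:foo="http://purl.org/dc/elements/1.1/"
    # gets namespacemap['foo'] = 'dc', so its <foo:creator> elements are handled
    # exactly like <dc:creator>.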
    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def strattrs(self, attrs):
        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])
    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            #    <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
                del pieces[-1]
            while pieces and len(pieces)>1 and not pieces[0].strip():
                del pieces[0]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                depth = 0
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                        depth -= 1
                        if depth == 0: break
                    elif piece.startswith('<') and not piece.endswith('/>'):
                        depth += 1
                else:
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, basestring):
                pieces[i] = v.decode('utf-8')

        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = _base64decode(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass
            except TypeError:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        if self.lookslikehtml(output):
            self.contentparams['type'] = 'text/html'

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))

        # parse microformats
        # (must do this before sanitizing because some microformats
        # rely on elements that we sanitize)
        if is_htmlish and element in ['content', 'description', 'summary']:
            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
            if mfresults:
                for tag in mfresults.get('tags', []):
                    self._addTag(tag['term'], tag['scheme'], tag['label'])
                for enclosure in mfresults.get('enclosures', []):
                    self._start_enclosure(enclosure)
                for xfn in mfresults.get('xfn', []):
                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
                vcard = mfresults.get('vcard')
                if vcard:
                    self._getContext()['vcard'] = vcard

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and type(output) == type(u''):
            try:
                output = unicode(output.encode('iso-8859-1'), 'utf-8')
            except:
                pass

        # map win-1252 extensions to the proper code points
        if type(output) == type(u''):
            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        if element == 'title' and self.hasTitle:
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                if not self.inimage:
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&amp;b;=2` as if they're
                    # unhandled character references. fix this special case.
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                    if output:
                        self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output
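
    # Illustrative effect of the <div> stripping above on Atom 1.0 xhtml content:
    #   <div>Hello <b>world</b></div>    ->  u'Hello <b>world</b>'
    #   <div>foo</div><div>bar</div>     ->  kept intact (two top-level divs)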
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        if self.lang: self.lang=self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)
    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value
    # a number of elements in a number of RSS variants are nominally plain
    # text, but this is routinely ignored.  This is an attempt to detect
    # the most common cases.  As false positives often result in silent
    # data loss, this function errs on the conservative side.
    def lookslikehtml(self, s):
        if self.version.startswith('atom'): return
        if self.contentparams.get('type','text/html') != 'text/plain': return

        # must have a close tag or an entity reference to qualify
        if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)): return

        # all tags must be in a restricted subset of valid HTML tags
        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
            re.findall(r'</?(\w+)',s)): return

        # all entities must have been defined as valid HTML entities
        from htmlentitydefs import entitydefs
        if filter(lambda e: e not in entitydefs.keys(),
            re.findall(r'&(\w+);',s)): return

        return 1
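
    # Illustrative: '<p>Hello</p>' qualifies (a closing tag from the sanitizer's
    # whitelist), while '2 < 3 and AT&T' does not, so the latter stays text/plain.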
    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))
    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1
    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                # remove old keys before overwriting new keys
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value, overwrite=False):
        context = self._getContext()
        if overwrite:
            context[key] = value
        else:
            context.setdefault(key, value)
    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        #If we're here then this is an RSS feed.
        #If we don't have a version or have a version that starts with something
        #other than RSS then there's been a mistake. Correct it.
        if not self.version or not self.version.startswith('rss'):
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'
    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()
    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel
    def _start_image(self, attrsD):
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        self.hasTitle = 0
        self.push('image', 0)

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        self.hasTitle = 0
        self.push('textinput', 0)
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput
    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')
    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0
    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['name'] = value
    _end_itunes_name = _end_name
    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['height'] = value
    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
    _end_homepage = _end_url
    _end_uri = _end_url
    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email
    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inimage and self.feeddata.has_key('image'):
            context = self.feeddata['image']
        elif self.intextinput:
            context = self.feeddata['textinput']
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context
    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()
        context.setdefault('authors', [FeedParserDict()])
        context['authors'][-1][key] = value

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value
    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author, email = context.get(key), None
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
            if emailmatch:
                email = emailmatch.group(0)
                # probably a better way to do the following, but it passes all the tests
                author = author.replace(email, '')
                author = author.replace('()', '')
                author = author.replace('<>', '')
                author = author.replace('&lt;&gt;', '')
                author = author.strip()
                if author and (author[0] == '('):
                    author = author[1:]
                if author and (author[-1] == ')'):
                    author = author[:-1]
                author = author.strip()
            if author or email:
                context.setdefault('%s_detail' % key, FeedParserDict())
            if author:
                context['%s_detail' % key]['name'] = author
            if email:
                context['%s_detail' % key]['email'] = email
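
    # Illustrative: given context['author'] = u'Joe Bloggs (joe@example.org)',
    # the fallback branch above splits it into author_detail name u'Joe Bloggs'
    # and email u'joe@example.org'.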
    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights
    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        self.hasTitle = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item
    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher
    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated
    _start_lastbuilddate = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value, overwrite=True)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated
    _end_lastbuilddate = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
    def _start_cc_license(self, attrsD):
        context = self._getContext()
        value = self._getAttribute(attrsD, 'rdf:resource')
        attrsD = FeedParserDict()
        attrsD['rel']='license'
        if value: attrsD['href']=value
        context.setdefault('links', []).append(attrsD)

    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)
    _start_creativeCommons_license = _start_creativecommons_license

    def _end_creativecommons_license(self):
        value = self.pop('license')
        context = self._getContext()
        attrsD = FeedParserDict()
        attrsD['rel']='license'
        if value: attrsD['href']=value
        context.setdefault('links', []).append(attrsD)
        del context['license']
    _end_creativeCommons_license = _end_creativecommons_license
    def _addXFN(self, relationships, href, name):
        context = self._getContext()
        xfn = context.setdefault('xfn', [])
        value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
        if value not in xfn:
            xfn.append(value)

    def _addTag(self, term, scheme, label):
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label): return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(value)
    def _start_category(self, attrsD):
        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _start_media_category(self, attrsD):
        attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema')
        self._start_category(attrsD)

    def _end_itunes_keywords(self):
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, 'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
        self.push('category', 1)
    def _end_category(self):
        value = self.pop('category')
        if not value: return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category
    _end_media_category = _end_category

    def _start_cloud(self, attrsD):
        self._getContext()['cloud'] = FeedParserDict(attrsD)
    def _start_link(self, attrsD):
        attrsD.setdefault('rel', 'alternate')
        if attrsD['rel'] == 'self':
            attrsD.setdefault('type', 'application/atom+xml')
        else:
            attrsD.setdefault('type', 'text/html')
        context = self._getContext()
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context.setdefault('links', [])
        if not (self.inentry and self.inimage):
            context['links'].append(FeedParserDict(attrsD))
        if attrsD.has_key('href'):
            expectingText = 0
            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        context = self._getContext()
    _end_producturl = _end_link
    def _start_guid(self, attrsD):
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)
    def _start_title(self, attrsD):
        if self.svgOK: return self.unknown_starttag('title', attrsD.items())
        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        if self.svgOK: return
        value = self.popContent('title')
        if not value: return
        context = self._getContext()
        self.hasTitle = 1
    _end_dc_title = _end_title

    def _end_media_title(self):
        hasTitle = self.hasTitle
        self._end_title()
        self.hasTitle = hasTitle
    def _start_description(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
    _start_dc_description = _start_description

    def _start_abstract(self, attrsD):
        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
        self._summaryKey = None
    _end_abstract = _end_description
    _end_dc_description = _end_description
    def _start_info(self, attrsD):
        self.pushContent('info', attrsD, 'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info
    def _start_generator(self, attrsD):
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if attrsD.has_key('href'):
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        value = self.pop('generator')
        context = self._getContext()
        if context.has_key('generator_detail'):
            context['generator_detail']['name'] = value
    def _start_admin_generatoragent(self, attrsD):
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')
    def _start_summary(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary
    def _start_enclosure(self, attrsD):
        attrsD = self._itsAnHrefDamnIt(attrsD)
        context = self._getContext()
        attrsD['rel']='enclosure'
        context.setdefault('links', []).append(FeedParserDict(attrsD))
    def _start_source(self, attrsD):
        if attrsD.has_key('url'):
            # This means that we're processing a source element from an RSS 2.0 feed
            self.sourcedata['href'] = attrsD[u'url']
        self.push('source', 1)
        self.insource = 1
        self.hasTitle = 0

    def _end_source(self):
        self.insource = 0
        value = self.pop('source')
        if value:
            self.sourcedata['title'] = value
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()
    def _start_content(self, attrsD):
        self.pushContent('content', attrsD, 'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)

    def _start_prodlink(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)

    def _start_body(self, attrsD):
        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)
    _start_fullitem = _start_content_encoded
    def _end_content(self):
        copyToSummary = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToSummary:
            self._save('summary', value)

    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
    _end_prodlink = _end_content
    def _start_itunes_image(self, attrsD):
        self.push('itunes_image', 0)
        if attrsD.get('href'):
            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        value = self.pop('itunes_explicit', 0)
        # Convert 'yes' -> True, 'clean' to False, and any other value to None
        # False and None both evaluate as False, so the difference can be ignored
        # by applications that only need to know if the content is explicit.
        self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
    def _start_media_content(self, attrsD):
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    def _start_media_thumbnail(self, attrsD):
        context = self._getContext()
        context.setdefault('media_thumbnail', [])
        self.push('url', 1) # new
        context['media_thumbnail'].append(attrsD)

    def _end_media_thumbnail(self):
        url = self.pop('url')
        context = self._getContext()
        if url != None and len(url.strip()) != 0:
            if not context['media_thumbnail'][-1].has_key('url'):
                context['media_thumbnail'][-1]['url'] = url
    def _start_media_player(self, attrsD):
        self.push('media_player', 0)
        self._getContext()['media_player'] = FeedParserDict(attrsD)

    def _end_media_player(self):
        value = self.pop('media_player')
        context = self._getContext()
        context['media_player']['content'] = value
    def _start_newlocation(self, attrsD):
        self.push('newlocation', 1)

    def _end_newlocation(self):
        url = self.pop('newlocation')
        context = self._getContext()
        # don't set newlocation if the context isn't right
        if context is not self.feeddata:
            return
        context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        self.bozo = 0
        self.exc = None
        self.decls = {}

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)
        if uri == 'http://www.w3.org/1999/xlink':
            self.decls['xmlns:'+prefix] = uri
    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = None
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
        localname = str(localname).lower()

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD, self.decls = self.decls, {}
        if localname == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML':
            attrsD['xmlns'] = namespace
        if localname == 'svg' and namespace == 'http://www.w3.org/2000/svg':
            attrsD['xmlns'] = namespace

        if prefix:
            localname = prefix.lower() + ':' + localname
        elif namespace and not qname: # Expat
            for name, value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
                    break
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            if prefix:
                attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())
    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = ''
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        elif namespace and not qname: # Expat
            for name, value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
                    break
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    special = re.compile('''[<>'"]''')
    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    elements_no_end_tag = [
        'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
        'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
        'source', 'track', 'wbr'
    ]

    def __init__(self, encoding, _type):
        self.encoding = encoding
        self._type = _type
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'
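    # Illustrative note (not part of the original source): _shorttag_replace
    # is used as a re.sub callback to normalize XML-style empty tags, e.g.
    #     '<br/>'  -> '<br />'        (void element, keeps self-closing form)
    #     '<div/>' -> '<div></div>'   (non-void element, expanded to a pair)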
    def parse_starttag(self, i):
        j = sgmllib.SGMLParser.parse_starttag(self, i)
        if self._type == 'application/xhtml+xml':
            if j > 2 and self.rawdata[j-2:j] == '/>':
                self.unknown_endtag(self.lasttag)
        return j

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        try:
            bytes
            if bytes is str:
                # bytes == str in Python 2.2 and 2.3, so we're not
                # actually running under Python 3
                raise NameError
            self.encoding = self.encoding + '_INVALID_PYTHON_3'
        except NameError:
            if self.encoding and type(data) == type(u''):
                data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)
    def normalize_attrs(self, attrs):
        if not attrs: return attrs
        # utility method to be called by descendants
        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        attrs.sort()
        return attrs
    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        strattrs = ''
        if attrs:
            for key, value in attrs:
                value = value.replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
                value = self.bare_ampersand.sub("&amp;", value)
                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
                if type(value) != type(u''):
                    try:
                        value = unicode(value, self.encoding)
                    except:
                        value = unicode(value, 'iso-8859-1')
                try:
                    # Currently, in Python 3 the key is already a str, and cannot be decoded again
                    uattrs.append((unicode(key, self.encoding), value))
                except TypeError:
                    uattrs.append((key, value))
            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
            if self.encoding:
                try:
                    strattrs = strattrs.encode(self.encoding)
                except:
                    pass
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        if ref.startswith('x'):
            value = unichr(int(ref[1:], 16))
        else:
            value = unichr(int(ref))

        if value in _cp1252.keys():
            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
        else:
            self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        if name2codepoint.has_key(ref):
            self.pieces.append('&%(ref)s;' % locals())
        else:
            self.pieces.append('&amp;%(ref)s' % locals())
    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
            # self.updatepos(declstartpos, i)
            return None, -1

    def convert_charref(self, name):
        return '&#%s;' % name

    def convert_entityref(self, name):
        return '&%s;' % name

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        self.entities = entities
    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
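    # Illustrative note (not part of the original source): decodeEntities()
    # first canonicalizes numeric character references for XML's special
    # characters into their named forms (e.g. '&#60;' and '&#x3c;' both become
    # '&lt;'), then, for non-XML content types only, resolves those named
    # entities to literal characters, e.g.
    #     '&#60;b&#62;' -> '&lt;b&gt;' -> '<b>'   (when type is text/plain)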
    def strattrs(self, attrs):
        return ''.join([' %s="%s"' % (n, v.replace('"', '&quot;')) for n, v in attrs])
class _MicroformatsParser:
    STRING = 1
    DATE = 2
    URI = 3
    NODE = 4
    EMAIL = 5

    known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
    known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']

    def __init__(self, data, baseuri, encoding):
        self.document = BeautifulSoup.BeautifulSoup(data)
        self.baseuri = baseuri
        self.encoding = encoding
        if type(data) == type(u''):
            data = data.encode(encoding)
        self.tags = []
        self.enclosures = []
        self.xfn = []
        self.vcard = None
    def vcardEscape(self, s):
        if type(s) in (type(''), type(u'')):
            s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
        return s

    def vcardFold(self, s):
        s = re.sub(';+$', '', s)
        sFolded = ''
        iMax = 75
        sPrefix = ''
        while len(s) > iMax:
            sFolded += sPrefix + s[:iMax] + '\n'
            s = s[iMax:]
            sPrefix = ' '
            iMax = 74
        sFolded += sPrefix + s
        return sFolded

    def normalize(self, s):
        return re.sub(r'\s+', ' ', s).strip()
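    # Illustrative note (not part of the original source): vcardFold()
    # implements RFC 2426 line folding -- physical lines are limited to 75
    # octets and each continuation line starts with a single space (which is
    # why iMax drops to 74 after the first slice), e.g.
    #     'NOTE:' + 'x'*80  ->  'NOTE:xxx...xxx\n xxxxxxxxxx'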
    def unique(self, aList):
        results = []
        for element in aList:
            if element not in results:
                results.append(element)
        return results

    def toISO8601(self, dt):
        return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
    def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
        all = lambda x: 1
        sProperty = sProperty.lower()
        bFound = 0
        bNormalize = 1
        propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
        if bAllowMultiple and (iPropertyType != self.NODE):
            snapResults = []
            containers = elmRoot(['ul', 'ol'], propertyMatch)
            for container in containers:
                snapResults.extend(container('li'))
            bFound = (len(snapResults) != 0)
        if not bFound:
            snapResults = elmRoot(all, propertyMatch)
            bFound = (len(snapResults) != 0)
        if (not bFound) and (sProperty == 'value'):
            snapResults = elmRoot('pre')
            bFound = (len(snapResults) != 0)
            bNormalize = not bFound
            if not bFound:
                snapResults = [elmRoot]
                bFound = (len(snapResults) != 0)
        arFilter = []
        if sProperty == 'vcard':
            snapFilter = elmRoot(all, propertyMatch)
            for node in snapFilter:
                if node.findParent(all, propertyMatch):
                    arFilter.append(node)
        arResults = []
        for node in snapResults:
            if node not in arFilter:
                arResults.append(node)
        bFound = (len(arResults) != 0)
        if not bFound:
            if bAllowMultiple: return []
            elif iPropertyType == self.STRING: return ''
            elif iPropertyType == self.DATE: return None
            elif iPropertyType == self.URI: return ''
            elif iPropertyType == self.NODE: return None
            else: return None
        arValues = []
        for elmResult in arResults:
            sValue = None
            if iPropertyType == self.NODE:
                if bAllowMultiple:
                    arValues.append(elmResult)
                    continue
                else:
                    return elmResult
            sNodeName = elmResult.name.lower()
            if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
                sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'abbr'):
                sValue = elmResult.get('title')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (iPropertyType == self.URI):
                if sNodeName == 'a': sValue = elmResult.get('href')
                elif sNodeName == 'img': sValue = elmResult.get('src')
                elif sNodeName == 'object': sValue = elmResult.get('data')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'img'):
                sValue = elmResult.get('alt')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue:
                sValue = elmResult.renderContents()
                sValue = re.sub(r'<\S[^>]*>', '', sValue)
                sValue = sValue.replace('\r\n', '\n')
                sValue = sValue.replace('\r', '\n')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue: continue
            if iPropertyType == self.DATE:
                sValue = _parse_date_iso8601(sValue)
            if bAllowMultiple:
                arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
            else:
                return bAutoEscape and self.vcardEscape(sValue) or sValue
        return arValues
    def findVCards(self, elmRoot, bAgentParsing=0):
        sVCards = ''

        if not bAgentParsing:
            arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
        else:
            arCards = [elmRoot]

        for elmCard in arCards:
            arLines = []

            def processSingleString(sProperty):
                sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
                if sValue:
                    arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
                return sValue or u''

            def processSingleURI(sProperty):
                sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
                if sValue:
                    sContentType = ''
                    sEncoding = ''
                    sValueKey = ''
                    if sValue.startswith('data:'):
                        sEncoding = ';ENCODING=b'
                        sContentType = sValue.split(';')[0].split('/').pop()
                        sValue = sValue.split(',', 1).pop()
                    else:
                        elmValue = self.getPropertyValue(elmCard, sProperty)
                        if elmValue:
                            if sProperty != 'url':
                                sValueKey = ';VALUE=uri'
                            sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
                    sContentType = sContentType.upper()
                    if sContentType == 'OCTET-STREAM':
                        sContentType = ''
                    if sContentType:
                        sContentType = ';TYPE=' + sContentType.upper()
                    arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))

            def processTypeValue(sProperty, arDefaultType, arForceType=None):
                arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
                for elmResult in arResults:
                    arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
                    if arForceType:
                        arType = self.unique(arForceType + arType)
                    if not arType:
                        arType = arDefaultType
                    sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
                    if sValue:
                        arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))

            # AGENT
            # must do this before all other properties because it is destructive
            # (removes nested class="vcard" nodes so they don't interfere with
            # this vcard's other properties)
            arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
            for elmAgent in arAgent:
                if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
                    sAgentValue = self.findVCards(elmAgent, 1) + '\n'
                    sAgentValue = sAgentValue.replace('\n', '\\n')
                    sAgentValue = sAgentValue.replace(';', '\\;')
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT:' + sAgentValue))
                    # Completely remove the agent element from the parse tree
                    elmAgent.extract()
                else:
                    sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1)
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
= processSingleString('fn')
2216 elmName
= self
.getPropertyValue(elmCard
, 'n')
2218 sFamilyName
= self
.getPropertyValue(elmName
, 'family-name', self
.STRING
, bAutoEscape
=1)
2219 sGivenName
= self
.getPropertyValue(elmName
, 'given-name', self
.STRING
, bAutoEscape
=1)
2220 arAdditionalNames
= self
.getPropertyValue(elmName
, 'additional-name', self
.STRING
, 1, 1) + self
.getPropertyValue(elmName
, 'additional-names', self
.STRING
, 1, 1)
2221 arHonorificPrefixes
= self
.getPropertyValue(elmName
, 'honorific-prefix', self
.STRING
, 1, 1) + self
.getPropertyValue(elmName
, 'honorific-prefixes', self
.STRING
, 1, 1)
2222 arHonorificSuffixes
= self
.getPropertyValue(elmName
, 'honorific-suffix', self
.STRING
, 1, 1) + self
.getPropertyValue(elmName
, 'honorific-suffixes', self
.STRING
, 1, 1)
2223 arLines
.append(self
.vcardFold('N:' + sFamilyName
+ ';' +
2225 ','.join(arAdditionalNames
) + ';' +
2226 ','.join(arHonorificPrefixes
) + ';' +
2227 ','.join(arHonorificSuffixes
)))
2229 # implied "N" optimization
2230 # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
2231 arNames
= self
.normalize(sFN
).split()
2232 if len(arNames
) == 2:
2233 bFamilyNameFirst
= (arNames
[0].endswith(',') or
2234 len(arNames
[1]) == 1 or
2235 ((len(arNames
[1]) == 2) and (arNames
[1].endswith('.'))))
2236 if bFamilyNameFirst
:
2237 arLines
.append(self
.vcardFold('N:' + arNames
[0] + ';' + arNames
[1]))
2239 arLines
.append(self
.vcardFold('N:' + arNames
[1] + ';' + arNames
[0]))
            # SORT-STRING
            sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
            if sSortString:
                arLines.append(self.vcardFold('SORT-STRING:' + sSortString))

            # NICKNAME
            arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
            if arNickname:
                arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))

            # PHOTO
            processSingleURI('photo')

            # BDAY
            dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
            if dtBday:
                arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))

            # ADR (address)
            arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
            for elmAdr in arAdr:
                arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
                if not arType:
                    arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
                sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
                sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
                sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
                sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
                sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
                sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
                sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
                arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
                                              sPostOfficeBox + ';' +
                                              sExtendedAddress + ';' +
                                              sStreetAddress + ';' +
                                              sLocality + ';' +
                                              sRegion + ';' +
                                              sPostalCode + ';' +
                                              sCountryName))

            # LABEL
            processTypeValue('label', ['intl','postal','parcel','work'])

            # TEL (phone number)
            processTypeValue('tel', ['voice'])

            # EMAIL
            processTypeValue('email', ['internet'], ['internet'])

            # MAILER
            processSingleString('mailer')

            # TZ (timezone)
            processSingleString('tz')

            # GEO (geographical information)
            elmGeo = self.getPropertyValue(elmCard, 'geo')
            if elmGeo:
                sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
                sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
                arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))

            # TITLE
            processSingleString('title')

            # ROLE
            processSingleString('role')

            # LOGO
            processSingleURI('logo')

            # ORG (organization)
            elmOrg = self.getPropertyValue(elmCard, 'org')
            if elmOrg:
                sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
                if not sOrganizationName:
                    # implied "organization-name" optimization
                    # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
                    sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
                    if sOrganizationName:
                        arLines.append(self.vcardFold('ORG:' + sOrganizationName))
                else:
                    arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
                    arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))

            # CATEGORY
            arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
            if arCategory:
                arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))

            # NOTE
            processSingleString('note')

            # REV
            processSingleString('rev')

            # SOUND
            processSingleURI('sound')

            # UID
            processSingleString('uid')

            # URL
            processSingleURI('url')

            # CLASS
            processSingleString('class')

            # KEY
            processSingleURI('key')

            if arLines:
                arLines = [u'BEGIN:vCard', u'VERSION:3.0'] + arLines + [u'END:vCard']
                sVCards += u'\n'.join(arLines) + u'\n'

        return sVCards.strip()
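    # Illustrative example (not part of the original source): given markup like
    #     <div class="vcard"><span class="fn">Jane Doe</span></div>
    # findVCards() would emit roughly
    #     BEGIN:vCard
    #     VERSION:3.0
    #     FN:Jane Doe
    #     N:Doe;Jane
    #     END:vCard
    # (N is derived from FN via the implied-N optimization shown above).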
    def isProbablyDownloadable(self, elm):
        attrsD = elm.attrMap
        if not attrsD.has_key('href'): return 0
        linktype = attrsD.get('type', '').strip()
        if linktype.startswith('audio/') or \
           linktype.startswith('video/') or \
           (linktype.startswith('application/') and not linktype.endswith('xml')):
            return 1
        path = urlparse.urlparse(attrsD['href'])[2]
        if path.find('.') == -1: return 0
        fileext = path.split('.').pop().lower()
        return fileext in self.known_binary_extensions
    def findTags(self):
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
            href = elm.get('href')
            if not href: continue
            urlscheme, domain, path, params, query, fragment = \
                urlparse.urlparse(_urljoin(self.baseuri, href))
            segments = path.split('/')
            tag = segments.pop()
            if not tag:
                tag = segments.pop()
            tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
            if not tagscheme.endswith('/'):
                tagscheme += '/'
            self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
    def findEnclosures(self):
        all = lambda x: 1
        enclosure_match = re.compile(r'\benclosure\b')
        for elm in self.document(all, {'href': re.compile(r'.+')}):
            if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue
            if elm.attrMap not in self.enclosures:
                self.enclosures.append(elm.attrMap)
                if elm.string and not elm.get('title'):
                    self.enclosures[-1]['title'] = elm.string

    def findXFN(self):
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
            rels = elm.get('rel', '').split()
            xfn_rels = []
            for rel in rels:
                if rel in self.known_xfn_relationships:
                    xfn_rels.append(rel)
            if xfn_rels:
                self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
def _parseMicroformats(htmlSource, baseURI, encoding):
    if not BeautifulSoup: return
    if _debug: sys.stderr.write('entering _parseMicroformats\n')
    try:
        p = _MicroformatsParser(htmlSource, baseURI, encoding)
    except UnicodeEncodeError:
        # sgmllib throws this exception when performing lookups of tags
        # with non-ASCII characters in them.
        return
    p.vcard = p.findVCards(p.document)
    p.findTags()
    p.findEnclosures()
    p.findXFN()
    return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))

    def unknown_starttag(self, tag, attrs):
        if _debug:
            sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    if _debug:
        sys.stderr.write('entering _resolveRelativeURIs\n')

    p = _RelativeURIResolver(baseURI, encoding, _type)
    p.feed(htmlSource)
    return p.output()

def _makeSafeAbsoluteURI(base, rel=None):
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if not rel:
        if base.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
            return u''
        return base
    uri = _urljoin(base, rel)
    if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return uri
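# Illustrative example (not part of the original source), assuming the default
# ACCEPTABLE_URI_SCHEMES includes 'http' but not 'javascript':
#     _makeSafeAbsoluteURI('http://example.org/feed', '/img.png')
#         -> u'http://example.org/img.png'
#     _makeSafeAbsoluteURI('http://example.org/feed', 'javascript:alert(1)')
#         -> u''   (scheme not in the whitelist, so the URI is dropped)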
class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article',
      'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
      'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command',
      'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir',
      'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer',
      'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
      'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
      'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
      'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
      'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub',
      'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead',
      'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
      'background', 'balance', 'bgcolor', 'bgproperties', 'border',
      'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
      'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
      'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
      'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
      'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
      'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
      'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
      'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
      'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
      'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
      'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
      'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
      'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
      'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
      'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
      'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
      'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
      'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
      'xml:lang']

    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']

    acceptable_css_properties = ['azimuth', 'background-color',
      'border-bottom-color', 'border-collapse', 'border-color',
      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
      'white-space', 'width']

    # survey of common keywords found in feeds
    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
      'transparent', 'underline', 'white', 'yellow']

    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
      '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
      'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
      'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
      'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
      'munderover', 'none', 'semantics']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
      'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
      'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
      'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
      'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
      'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
      'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
      'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
      'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
      'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
      'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
      'svg', 'switch', 'text', 'title', 'tspan', 'use']

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
      'arabic-form', 'ascent', 'attributeName', 'attributeType',
      'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
      'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
      'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
      'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
      'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
      'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
      'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
      'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
      'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
      'min', 'name', 'offset', 'opacity', 'orient', 'origin',
      'overline-position', 'overline-thickness', 'panose-1', 'path',
      'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
      'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
      'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
      'stop-color', 'stop-opacity', 'strikethrough-position',
      'strikethrough-thickness', 'stroke', 'stroke-dasharray',
      'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
      'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
      'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
      'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
      'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
      'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
      'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
      'y2', 'zoomAndPan']

    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
      'stroke-opacity']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0
    def unknown_starttag(self, tag, attrs):
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if not tag in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith('html'):
                if not dict(attrs).get('xmlns'):
                    if tag == 'svg':
                        attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
                    if tag == 'math':
                        attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag == 'math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag == 'svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # for most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case
                if not self.svg_attr_map:
                    lower = [attr.lower() for attr in self.svg_attributes]
                    mix = [a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = dict([(a.lower(), a) for a in mix])

                    lower = [attr.lower() for attr in self.svg_elements]
                    mix = [a for a in self.svg_elements if a not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = dict([(a.lower(), a) for a in mix])
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif not tag in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if filter(lambda (n,v): n.startswith('xlink:'), attrs):
                if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
                    attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key in acceptable_attributes:
                key = keymap.get(key, key)
                clean_attrs.append((key, value))
            elif key == 'style':
                clean_value = self.sanitize_style(value)
                if clean_value: clean_attrs.append((key, clean_value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == 'svg' and self.svgOK: self.svgOK -= 1
            else:
                return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
    def sanitize_style(self, style):
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''

        clean = []
        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
            if not value: continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                       not self.valid_css_values.match(keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
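    # Illustrative example (not part of the original source):
    #     sanitize_style('color: red; background: url(evil.js); position: fixed')
    # would return 'color: red;' -- the url() value is stripped by the first
    # regexp (leaving background with an empty value, which is skipped) and
    # 'position' is not in acceptable_css_properties.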
def _sanitizeHTML(htmlSource, encoding, _type):
    p = _HTMLSanitizer(encoding, _type)
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
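# Illustrative example (not part of the original source):
#     _sanitizeHTML('<p onclick="evil()">hi</p><script>evil()</script>',
#                   'utf-8', 'text/html')
# would return '<p>hi</p>' -- the onclick attribute is not in
# acceptable_attributes, and everything inside <script> is suppressed
# because script is in unacceptable_elements_with_end_tag.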
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302
, req
, fp
, code
, msg
, headers
):
2796 # - server requires digest auth, AND
2797 # - we tried (unsuccessfully) with basic auth, AND
2798 # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
2799 # If all conditions hold, parse authentication information
2800 # out of the Authorization header we sent the first time
2801 # (for the username and password) and the WWW-Authenticate
2802 # header the server sent back (for the realm) and retry
2803 # the request with the appropriate digest auth headers instead.
2804 # This evil genius hack has been brought to you by Aaron Swartz.
2805 host
= urlparse
.urlparse(req
.get_full_url())[1]
2807 assert sys
.version
.split()[0] >= '2.3.3'
2808 assert base64
!= None
2809 user
, passw
= _base64decode(req
.headers
['Authorization'].split(' ')[1]).split(':')
2810 realm
= re
.findall('realm="([^"]*)"', headers
['WWW-Authenticate'])[0]
2811 self
.add_password(realm
, host
, user
, passw
)
2812 retry
= self
.http_error_auth_reqed('www-authenticate', host
, req
, headers
)
2813 self
.reset_retry_count()
2816 return self
.http_error_default(req
, fp
, code
, msg
, headers
)
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it can be a tuple of 9 integers
    (as returned by gmtime() in the standard Python time module) or a date
    string in any format supported by feedparser. Regardless, it MUST
    be in GMT (Greenwich Mean Time). It will be reformatted into an
    RFC 1123-compliant date and used as the value of an If-Modified-Since
    request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.

    if request_headers is supplied it is a dictionary of HTTP request headers
    that will override the values generated by FeedParser.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
        # Deal with the feed URI scheme
        if url_file_stream_or_string.startswith('feed:http'):
            url_file_stream_or_string = url_file_stream_or_string[5:]
        elif url_file_stream_or_string.startswith('feed:'):
            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.standard_b64encode(user_passwd).strip()

        # iri support
        try:
            if isinstance(url_file_stream_or_string, unicode):
                url_file_stream_or_string = url_file_stream_or_string.encode('idna').decode('utf-8')
            else:
                url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna').decode('utf-8')
        except:
            pass

        # try to open with urllib2 (to use optional headers)
        request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
        opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()]))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string, 'rb')
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    if etag:
        request.add_header('If-None-Match', etag)
    if type(modified) == type(''):
        modified = _parse_date(modified)
    elif isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    if modified:
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
    if referrer:
        request.add_header('Referer', referrer)
    if gzip and zlib:
        request.add_header('Accept-encoding', 'gzip, deflate')
    elif gzip:
        request.add_header('Accept-encoding', 'gzip')
    elif zlib:
        request.add_header('Accept-encoding', 'deflate')
    else:
        request.add_header('Accept-encoding', '')
    if auth:
        request.add_header('Authorization', 'Basic %s' % auth)
    if ACCEPT_HEADER:
        request.add_header('Accept', ACCEPT_HEADER)
    # use this for whatever -- cookies, special headers, etc
    # [('Cookie','Something'),('x-special-header','Another Value')]
    for header_name, header_value in request_headers.items():
        request.add_header(header_name, header_value)
    request.add_header('A-IM', 'feed') # RFC 3229 support
    return request
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    _date_handlers.insert(0, func)
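# Illustrative example (not part of the original source): an application can
# register its own date handler, which will be tried before the built-in ones
# because registerDateHandler() inserts at the front of the list, e.g.
#
#     def _parse_date_epoch(dateString):
#         '''Parse a seconds-since-epoch string like "1136073600"'''
#         if not dateString.isdigit(): return None
#         return time.gmtime(int(dateString))
#     registerDateHandler(_parse_date_epoch)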
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
try:
    del tmpl
except NameError:
    pass
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
try:
    del regex
except NameError:
    pass
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                 params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(float(params.get('second', 0)))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    daylight_savings_flag = -1
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
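# Illustrative examples (not part of the original source) of strings the
# templates above are meant to cover:
#     '2004-01-05', '20040105', '2004-005' (ordinal date), '04-01-05',
#     '2004-01-05T12:30:45Z', '2004-01-05T12:30:45+09:00'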
# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)

def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        return
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Fred Drake and licensed under the Python license.  Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    def __extract_date(m):
        year = int(m.group('year'))
        if year < 100:
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group('month')
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group('day')
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        if tzd[0] == '+':
            return -offset
        return offset

    __date_re = ('(?P<year>\d\d\d\d)'
                 '(?:(?P<dsep>-|)'
                 '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
                 '|(?P<julian>\d\d\d)))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
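# Illustrative example (comment only, not executed; the exact tuple assumes
# a well-behaved local mktime):
#   >>> _parse_date_w3dtf('2003-12-31T10:14:55Z')
#   (2003, 12, 31, 10, 14, 55, 2, 365, 0)
# i.e. a 9-tuple in GMT, as produced by time.gmtime().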
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    # Account for the Etc/GMT timezone by stripping 'Etc/'
    elif len(data) == 5 and data[4].lower().startswith('etc/'):
        data[4] = data[4][4:]
        dateString = " ".join(data)
    if len(data) < 5:
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
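# Illustrative example (comment only, not executed):
#   >>> _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')
#   (2004, 1, 1, 19, 48, 21, 3, 1, 0)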
def _parse_date_perforce(aDateString):
    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
    # Fri, 2006/09/15 08:19:53 EDT
    _my_date_pattern = re.compile( \
        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
    dow, year, month, day, hour, minute, second, tz = \
        _my_date_pattern.search(aDateString).groups()
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
registerDateHandler(_parse_date_perforce)
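# Illustrative example (comment only, not executed): the Perforce date
#   'Fri, 2006/09/15 08:19:53 EDT'
# is rewritten as 'Fri, 15 Sep 2006 08:19:53 EDT' and parsed via rfc822.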
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
            if not date9tuple: continue
            if len(date9tuple) != 9:
                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
                raise ValueError
            map(int, date9tuple)
            return date9tuple
        except Exception, e:
            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
            pass
    return None
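# Illustrative example (comment only, not executed): _parse_date tries each
# registered handler in turn until one returns a valid 9-tuple, e.g.
#   >>> _parse_date('Thu, 01 Jan 2004 19:48:21 GMT')
#   (2004, 1, 1, 19, 48, 21, 3, 1, 0)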
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    # some feeds claim to be gb2312 but are actually gb18030.
    # apparently MSIE and Firefox both do the following switch:
    if true_encoding.lower() == 'gb2312':
        true_encoding = 'gb18030'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
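# Illustrative example (comment only, not executed), assuming a charset in
# the HTTP headers and no encoding attribute in the XML declaration:
#   >>> _getCharacterEncoding({'content-type': 'application/xml; charset=utf-8'},
#   ...                       '<?xml version="1.0"?><feed/>')
#   ('utf-8', 'utf-8', '', '', 1)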
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
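# Illustrative example (comment only, not executed): given a document that is
# really iso-8859-1,
#   _toUTF8('<?xml version="1.0" encoding="iso-8859-1"?><feed/>', 'iso-8859-1')
# returns the same document as UTF-8 bytes with its declaration rewritten to
#   <?xml version='1.0' encoding='utf-8'?>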
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
    start = re.search(_s2bytes('<\w'), data)
    start = start and start.start() or -1
    head,data = data[:start+1], data[start+1:]

    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
    entity_results=entity_pattern.findall(head)
    head = entity_pattern.sub(_s2bytes(''), head)
    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
    doctype_results = doctype_pattern.findall(head)
    doctype = doctype_results and doctype_results[0] or _s2bytes('')
    if doctype.lower().count(_s2bytes('netscape')):
        version = 'rss091n'
    else:
        version = None

    # only allow in 'safe' inline entity definitions
    replacement=_s2bytes('')
    if len(doctype_results)==1 and entity_results:
        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
        if safe_entities:
            replacement=_s2bytes('<!DOCTYPE feed [\n  <!ENTITY') + _s2bytes('>\n  <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
    data = doctype_pattern.sub(replacement, head) + data

    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
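# Illustrative example (comment only, not executed): a Netscape RSS 0.91
# document beginning
#   <!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN" ...>
# yields ('rss091n', data_without_doctype, {}).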
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], request_headers={}, response_headers={}):
    '''Parse a feed from a URL, file, stream, or string.

    request_headers, if given, is a dict from http header name to value to add
    to the request; this overrides internally generated values.
    '''
    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    if not isinstance(handlers, list):
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = None
        f = None

    if hasattr(f, 'headers'):
        result['headers'] = dict(f.headers)
    # overwrite existing headers using response_headers
    if 'headers' in result:
        result['headers'].update(response_headers)
    elif response_headers:
        result['headers'] = copy.deepcopy(response_headers)

    # if feed is gzip-compressed, decompress it
    if f and data and 'headers' in result:
        if gzip and result['headers'].get('content-encoding') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception, e:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage.  Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't have the data persisted for that.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None
        elif zlib and result['headers'].get('content-encoding') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None

    # save HTTP headers
    if 'headers' in result:
        if 'etag' in result['headers'] or 'ETag' in result['headers']:
            etag = result['headers'].get('etag', result['headers'].get('ETag'))
            if etag:
                result['etag'] = etag
        if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
            modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
            if modified:
                result['modified'] = _parse_date(modified)
    if hasattr(f, 'url'):
        result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type') or http_headers.has_key('Content-type'):
            bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type'))
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    if data is not None:
        result['version'], data, entities = _stripDoctype(data)

    # ensure that baseuri is an absolute uri using an acceptable URI scheme
    contentloc = http_headers.get('content-location', http_headers.get('Content-Location', ''))
    href = result.get('href', '')
    baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href

    baselang = http_headers.get('content-language', http_headers.get('Content-Language', None))

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = ''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data.  This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if data is None:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding: continue
        if proposed_encoding in tried_encodings: continue
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
            break
        except:
            pass
    # if no luck and we have auto-detection library, try that
    if (not known_encoding) and chardet:
        try:
            proposed_encoding = chardet.detect(data)['encoding']
            if proposed_encoding and (proposed_encoding not in tried_encodings):
                tried_encodings.append(proposed_encoding)
                data = _toUTF8(data, proposed_encoding)
                known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and ('utf-8' not in tried_encodings):
        try:
            proposed_encoding = 'utf-8'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and ('windows-1252' not in tried_encodings):
        try:
            proposed_encoding = 'windows-1252'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried iso-8859-2 yet, try that.
    if (not known_encoding) and ('iso-8859-2' not in tried_encodings):
        try:
            proposed_encoding = 'iso-8859-2'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = ''
    elif proposed_encoding != result['encoding']:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except Exception, e:
            if _debug:
                import traceback
                traceback.print_stack()
                traceback.print_exc()
                sys.stderr.write('xml parsing failed\n')
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser:
        feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.feed(data.decode('utf-8', 'replace'))
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
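# Illustrative usage (comment only, not executed):
#   >>> import feedparser
#   >>> d = feedparser.parse('http://feedparser.org/docs/examples/atom10.xml')
#   >>> d['feed']['title']    # the feed-level title, if present
# d['bozo'] is set to 1 whenever the feed is not well-formed; the offending
# exception is stored in d['bozo_exception'].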
class Serializer:
    def __init__(self, results):
        self.results = results

class TextSerializer(Serializer):
    def write(self, stream=sys.stdout):
        self._writer(stream, self.results, '')

    def _writer(self, stream, node, prefix):
        if not node: return
        if hasattr(node, 'keys'):
            keys = node.keys()
            keys.sort()
            for k in keys:
                if k in ('description', 'link'): continue
                if node.has_key(k + '_detail'): continue
                if node.has_key(k + '_parsed'): continue
                self._writer(stream, node[k], prefix + k + '.')
        elif type(node) == types.ListType:
            index = 0
            for n in node:
                self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].')
                index += 1
        else:
            try:
                s = str(node).encode('utf-8')
                s = s.replace('\\', '\\\\')
                s = s.replace('\r', '')
                s = s.replace('\n', r'\n')
                stream.write(prefix[:-1])
                stream.write('=')
                stream.write(s)
                stream.write('\n')
            except:
                pass

class PprintSerializer(Serializer):
    def write(self, stream=sys.stdout):
        if self.results.has_key('href'):
            stream.write(self.results['href'] + '\n\n')
        from pprint import pprint
        pprint(self.results, stream)
        stream.write('\n')
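# Illustrative example (comment only, not executed): TextSerializer flattens
# the result into 'key=value' lines, e.g.
#   feed.title=Some Feed Title
#   entries[0].link=http://example.org/
# while PprintSerializer simply pprint()s the whole result dict.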
if __name__ == '__main__':
    try:
        from optparse import OptionParser
    except:
        OptionParser = None

    if OptionParser:
        optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-")
        optionParser.set_defaults(format="pprint")
        optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs")
        optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs")
        optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs")
        optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)")
        optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)")
        optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr")
        (options, urls) = optionParser.parse_args()
        if options.verbose:
            _debug = 1
        if not urls:
            optionParser.print_help()
            sys.exit(0)
    else:
        if not sys.argv[1:]:
            print __doc__
            sys.exit(0)
        class _Options:
            etag = modified = agent = referrer = None
            format = 'pprint'
        options = _Options()
        urls = sys.argv[1:]

    zopeCompatibilityHack()

    serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer)
    for url in urls:
        results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer)
        serializer(results).write(sys.stdout)
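# Illustrative command-line usage (assuming this file is saved as
# feedparser.py somewhere on your path):
#   $ python feedparser.py --format=text http://feedparser.org/docs/examples/atom10.xml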