2 """Universal feed parser
4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
6 Visit http://feedparser.org/ for the latest version
7 Visit http://feedparser.org/docs/ for the latest documentation
9 Required: Python 2.1 or later
10 Recommended: Python 2.3 or later
11 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
__version__ = "4.2-pre-" + "$Revision$"[11:14] + "-svn"
__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>",
                    "Sam Ruby <http://intertwingly.net/>",
                    "Ade Oshineye <http://blog.oshineye.com/>",
                    "Martin Pool <http://sourcefrog.net/>"]
_debug = 0
48 # HTTP "User-Agent" header to send to servers when downloading feeds.
49 # If you are embedding feedparser in a larger application, you should
50 # change this to your application name and URL.
51 USER_AGENT
= "UniversalFeedParser/%s +http://feedparser.org/" % __version__
53 # HTTP "Accept" header to send to servers when downloading feeds. If you don't
54 # want to send an Accept header, set this to None.
55 ACCEPT_HEADER
= "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
# List of preferred XML parsers, by SAX driver name.  These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1

# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
# ---------- Python 3 modules (make it work if possible) ----------
try:
    import rfc822
except ImportError:
    from email import _parseaddr as rfc822
try:
    # Python 3.1 introduces bytes.maketrans and simultaneously
    # deprecates string.maketrans; use bytes.maketrans if possible
    _maketrans = bytes.maketrans
except (NameError, AttributeError):
    import string
    _maketrans = string.maketrans
# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
    # Python 3.1 deprecates decodestring in favor of decodebytes
    _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
except:
    base64 = binascii = None
def _s2bytes(s):
    # Convert a UTF-8 str to bytes if the interpreter is Python 3
    try:
        return bytes(s, 'utf8')
    except (NameError, TypeError):
        # In Python 2.5 and below, bytes doesn't exist (NameError)
        # In Python 2.6 and above, bytes and str are the same (TypeError)
        return s
def _l2bytes(l):
    # Convert a list of ints to bytes if the interpreter is Python 3
    try:
        if bytes is not str:
            # In Python 2.6 and above, this call won't raise an exception
            # but it will return bytes([65]) as '[65]' instead of 'A'
            return bytes(l)
        raise NameError
    except NameError:
        return ''.join(map(chr, l))
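
# Illustrative behavior of the two helpers above under Python 2, where str is
# already a byte string:
#
#   >>> _s2bytes('hello')
#   'hello'
#   >>> _l2bytes([72, 101, 108, 108, 111])
#   'Hello'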
# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
#   http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
#   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
    'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
    'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'msnim', 'skype', 'ssh', 'smb', 'ymsg',
)
#ACCEPTABLE_URI_SCHEMES = ()
# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, types, cgi, urllib, urllib2, datetime
try:
    from io import BytesIO as _StringIO
except ImportError:
    try:
        from cStringIO import StringIO as _StringIO
    except:
        from StringIO import StringIO as _StringIO
# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None
# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        for char, entity in entities:
            data = data.replace(char, entity)
        return data
# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass
# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None
# reversible htmlentitydefs mappings for Python 2.2
try:
    from htmlentitydefs import name2codepoint, codepoint2name
except ImportError:
    import htmlentitydefs
    name2codepoint = {}
    codepoint2name = {}
    for (name, codepoint) in htmlentitydefs.entitydefs.iteritems():
        if codepoint.startswith('&#'): codepoint = unichr(int(codepoint[2:-1]))
        name2codepoint[name] = ord(codepoint)
        codepoint2name[ord(codepoint)] = name
# BeautifulSoup parser used for parsing microformats from embedded HTML content
# http://www.crummy.com/software/BeautifulSoup/
# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
# older 2.x series.  If it doesn't, and you can figure out why, I'll accept a
# patch and modify the compatibility statement accordingly.
try:
    import BeautifulSoup
except:
    BeautifulSoup = None
# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
if sgmllib.endbracket.search(' <').start(0):
    class EndBracketRegEx:
        def __init__(self):
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self, string, index=0):
            match = self.endbracket.match(string, index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
            return None
    class EndBracketMatch:
        def __init__(self, match):
            self.match = match
        def start(self, n):
            return self.match.end(n)
    sgmllib.endbracket = EndBracketRegEx()
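
# Illustrative: with the override above, a tag whose attribute value contains
# an angle bracket, e.g. <img alt="size < 100" src="x.gif">, no longer gets
# truncated at the embedded '<' the way the stock sgmllib regex would do.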
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }
try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc
class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'enclosures':
            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
        if key == 'license':
            for link in UserDict.__getitem__(self, 'links'):
                if link['rel']=='license' and link.has_key('href'):
                    return link['href']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.__contains__(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.__contains__(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)
    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)
    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]
    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.__contains__(self, key)
        except AttributeError:
            return False
    # This alias prevents the 2to3 tool from changing the semantics of the
    # __contains__ function below and exhausting the maximum recursion depth
    __has_key = has_key
    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key
    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.__has_key(key)
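
# Illustrative use of the keymap aliasing above (hypothetical values):
#
#   >>> d = FeedParserDict()
#   >>> d['updated'] = u'Thu, 01 Jan 2004 19:48:21 GMT'
#   >>> d['modified'] == d['updated']   # the old key is aliased to the new one
#   True
#   >>> d.updated                       # keys are also readable as attributes
#   u'Thu, 01 Jan 2004 19:48:21 GMT'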
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc
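
# Illustrative: a Zope-embedded application would call
# feedparser.zopeCompatibilityHack() once at startup, after which parse
# results are built from plain dictionaries instead of FeedParserDict
# (losing the attribute-style access shown above).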
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        _ebcdic_to_ascii_map = _maketrans( \
            _l2bytes(range(256)), _l2bytes(emap))
    return s.translate(_ebcdic_to_ascii_map)
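
# Illustrative round trip (the byte values are 'Hello' in EBCDIC cp037):
#
#   >>> _ebcdic_to_ascii(_l2bytes([200, 133, 147, 147, 150]))
#   'Hello'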
_cp1252 = {
  unichr(128): unichr(8364), # euro sign
  unichr(130): unichr(8218), # single low-9 quotation mark
  unichr(131): unichr( 402), # latin small letter f with hook
  unichr(132): unichr(8222), # double low-9 quotation mark
  unichr(133): unichr(8230), # horizontal ellipsis
  unichr(134): unichr(8224), # dagger
  unichr(135): unichr(8225), # double dagger
  unichr(136): unichr( 710), # modifier letter circumflex accent
  unichr(137): unichr(8240), # per mille sign
  unichr(138): unichr( 352), # latin capital letter s with caron
  unichr(139): unichr(8249), # single left-pointing angle quotation mark
  unichr(140): unichr( 338), # latin capital ligature oe
  unichr(142): unichr( 381), # latin capital letter z with caron
  unichr(145): unichr(8216), # left single quotation mark
  unichr(146): unichr(8217), # right single quotation mark
  unichr(147): unichr(8220), # left double quotation mark
  unichr(148): unichr(8221), # right double quotation mark
  unichr(149): unichr(8226), # bullet
  unichr(150): unichr(8211), # en dash
  unichr(151): unichr(8212), # em dash
  unichr(152): unichr( 732), # small tilde
  unichr(153): unichr(8482), # trade mark sign
  unichr(154): unichr( 353), # latin small letter s with caron
  unichr(155): unichr(8250), # single right-pointing angle quotation mark
  unichr(156): unichr( 339), # latin small ligature oe
  unichr(158): unichr( 382), # latin small letter z with caron
  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
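
# Illustrative: pop() below runs output through this table, so stray cp1252
# code points that survive decoding, e.g. u'\x93' and u'\x94', come out as the
# real curly quotes u'\u201c' and u'\u201d'.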
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    try:
        return urlparse.urljoin(base, uri)
    except:
        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
        return urlparse.urljoin(base, uri)
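
# Illustrative behavior, including the _urifixer cleanup of extra slashes
# after the scheme:
#
#   >>> _urljoin('http://example.org/a/b', 'c')
#   'http://example.org/a/c'
#   >>> _urljoin('http://example.org/', 'http:////example.org/feed.xml')
#   'http://example.org/feed.xml'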
class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/': 'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
                  'http://media.tangent.org/rss/1.0/': 'audio',
                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
                  'http://web.resource.org/cc/': 'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company': 'co',
                  'http://purl.org/rss/1.0/modules/content/': 'content',
                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
                  'http://purl.org/dc/elements/1.1/': 'dc',
                  'http://purl.org/dc/terms/': 'dcterms',
                  'http://purl.org/rss/1.0/modules/email/': 'email',
                  'http://purl.org/rss/1.0/modules/event/': 'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
                  'http://freshmeat.net/rss/fm/': 'fm',
                  'http://xmlns.com/foaf/0.1/': 'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
                  'http://postneo.com/icbm/': 'icbm',
                  'http://purl.org/rss/1.0/modules/image/': 'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://purl.org/rss/1.0/modules/link/': 'l',
                  'http://search.yahoo.com/mrss': 'media',
                  # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
                  'http://search.yahoo.com/mrss/': 'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
                  'http://purl.org/rss/1.0/modules/search/': 'search',
                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
                  'http://wellformedweb.org/commentAPI/': 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
                  'http://www.w3.org/1999/xhtml': 'xhtml',
                  'http://www.w3.org/1999/xlink': 'xlink',
                  'http://www.w3.org/XML/1998/namespace': 'xml'
                  }
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']
    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        self.svgOK = 0
        self.hasTitle = 0
        if baselang:
            self.feeddata['language'] = baselang.replace('_','-')
    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        # the sgml parser doesn't handle entities in attributes, but
        # strict xml parsers do -- account for this difference
        if isinstance(self, _LooseFeedParser):
            attrs = [(k, v.replace('&amp;', '&')) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if type(baseuri) != type(u''):
            try:
                baseuri = unicode(baseuri, self.encoding)
            except:
                baseuri = unicode(baseuri, 'iso-8859-1')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javascript:`)
        if self.baseuri:
            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
        else:
            self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang.replace('_','-')
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            if tag.find(':') <> -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns',namespace))
                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
                    attrs.append(('xmlns',namespace))
            if tag == 'svg': self.svgOK += 1
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
            if len(attrsD) == 0:
                # No attributes so merge it into the enclosing dictionary
                return self.push(unknown_tag, 1)
            else:
                # Has attributes so create it in its own dictionary
                context = self._getContext()
                context[unknown_tag] = attrsD
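
    # Illustrative dispatch: a start tag like <itunes:email> arrives here as
    # tag 'itunes:email'; the prefix is canonicalized through namespacemap and
    # the call is routed to self._start_itunes_email (defined further down).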
    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK: self.svgOK -= 1

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            if self.svgOK: raise AttributeError()
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities.keys():
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try: name2codepoint[ref]
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass
    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # CDATA block began but didn't finish
                k = len(self.rawdata)
                return k
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            if k >= 0:
                return k+1
            else:
                # We have an incomplete CDATA block.
                return k
    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType
    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri
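
    # Illustrative: a feed declaring xmlns:foo="http://purl.org/dc/elements/1.1/"
    # gets namespacemap['foo'] = 'dc', so its <foo:creator> elements are handled
    # exactly like <dc:creator>.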
    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def strattrs(self, attrs):
        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])
    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            #    <div>foo</div><div>bar</div>
            while pieces and len(pieces)>1 and not pieces[-1].strip():
                del pieces[-1]
            while pieces and len(pieces)>1 and not pieces[0].strip():
                del pieces[0]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
                depth = 0
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                        depth -= 1
                        if depth == 0: break
                    elif piece.startswith('<') and not piece.endswith('/>'):
                        depth += 1
                else:
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, basestring):
                pieces[i] = v.decode('utf-8')

        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = _base64decode(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass
            except TypeError:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        if self.lookslikehtml(output):
            self.contentparams['type'] = 'text/html'

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))

        # parse microformats
        # (must do this before sanitizing because some microformats
        # rely on elements that we sanitize)
        if is_htmlish and element in ['content', 'description', 'summary']:
            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
            if mfresults:
                for tag in mfresults.get('tags', []):
                    self._addTag(tag['term'], tag['scheme'], tag['label'])
                for enclosure in mfresults.get('enclosures', []):
                    self._start_enclosure(enclosure)
                for xfn in mfresults.get('xfn', []):
                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
                vcard = mfresults.get('vcard')
                if vcard:
                    self._getContext()['vcard'] = vcard

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and type(output) == type(u''):
            try:
                output = unicode(output.encode('iso-8859-1'), 'utf-8')
            except:
                pass

        # map win-1252 extensions to the proper code points
        if type(output) == type(u''):
            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        if element == 'title' and self.hasTitle:
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                if not self.inimage:
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&amp;b;=2` as if they're
                    # unhandled character references. fix this special case.
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                    if output:
                        self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output
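
    # Illustrative effect of the <div> stripping above on Atom 1.0 xhtml content:
    #   <div>Hello <b>world</b></div>    ->  u'Hello <b>world</b>'
    #   <div>foo</div><div>bar</div>     ->  kept intact (two top-level divs)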
    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        if self.lang: self.lang=self.lang.replace('_','-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)
    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value
    # a number of elements in a number of RSS variants are nominally plain
    # text, but this is routinely ignored.  This is an attempt to detect
    # the most common cases.  As false positives often result in silent
    # data loss, this function errs on the conservative side.
    def lookslikehtml(self, s):
        if self.version.startswith('atom'): return
        if self.contentparams.get('type','text/html') != 'text/plain': return

        # must have a close tag or an entity reference to qualify
        if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)): return

        # all tags must be in a restricted subset of valid HTML tags
        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
            re.findall(r'</?(\w+)',s)): return

        # all entities must have been defined as valid HTML entities
        from htmlentitydefs import entitydefs
        if filter(lambda e: e not in entitydefs.keys(),
            re.findall(r'&(\w+);',s)): return

        return 1
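
    # Illustrative: '<p>Hello</p>' qualifies (a closing tag from the sanitizer's
    # whitelist), while '2 < 3 and AT&T' does not, so the latter stays text/plain.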
    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))
    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1
    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                # remove old keys before overwriting new keys
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value, overwrite=False):
        context = self._getContext()
        if overwrite:
            context[key] = value
        else:
            context.setdefault(key, value)
    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        #If we're here then this is an RSS feed.
        #If we don't have a version or have a version that starts with something
        #other than RSS then there's been a mistake. Correct it.
        if not self.version or not self.version.startswith('rss'):
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'
    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()
    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel
    def _start_image(self, attrsD):
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        self.hasTitle = 0
        self.push('image', 0)

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        self.hasTitle = 0
        self.push('textinput', 0)
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput
    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')
    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0
    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['name'] = value
    _end_itunes_name = _end_name
    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['height'] = value
    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
    _end_homepage = _end_url
    _end_uri = _end_url
    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email
    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inimage and self.feeddata.has_key('image'):
            context = self.feeddata['image']
        elif self.intextinput:
            context = self.feeddata['textinput']
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context
    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()
        context.setdefault('authors', [FeedParserDict()])
        context['authors'][-1][key] = value

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value
    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author, email = context.get(key), None
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
            if emailmatch:
                email = emailmatch.group(0)
                # probably a better way to do the following, but it passes all the tests
                author = author.replace(email, '')
                author = author.replace('()', '')
                author = author.replace('<>', '')
                author = author.replace('&lt;&gt;', '')
                author = author.strip()
                if author and (author[0] == '('):
                    author = author[1:]
                if author and (author[-1] == ')'):
                    author = author[:-1]
                author = author.strip()
            if author or email:
                context.setdefault('%s_detail' % key, FeedParserDict())
            if author:
                context['%s_detail' % key]['name'] = author
            if email:
                context['%s_detail' % key]['email'] = email
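
    # Illustrative: given context['author'] = u'Joe Bloggs (joe@example.org)',
    # the fallback branch above splits it into author_detail name u'Joe Bloggs'
    # and email u'joe@example.org'.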
    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights
    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        self.hasTitle = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item
    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher
    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated
    _start_lastbuilddate = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value, overwrite=True)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated
    _end_lastbuilddate = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value), overwrite=True)
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
    def _start_cc_license(self, attrsD):
        context = self._getContext()
        value = self._getAttribute(attrsD, 'rdf:resource')
        attrsD = FeedParserDict()
        attrsD['rel']='license'
        if value: attrsD['href']=value
        context.setdefault('links', []).append(attrsD)

    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)
    _start_creativeCommons_license = _start_creativecommons_license

    def _end_creativecommons_license(self):
        value = self.pop('license')
        context = self._getContext()
        attrsD = FeedParserDict()
        attrsD['rel']='license'
        if value: attrsD['href']=value
        context.setdefault('links', []).append(attrsD)
        del context['license']
    _end_creativeCommons_license = _end_creativecommons_license
    def _addXFN(self, relationships, href, name):
        context = self._getContext()
        xfn = context.setdefault('xfn', [])
        value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
        if value not in xfn:
            xfn.append(value)

    def _addTag(self, term, scheme, label):
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label): return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(value)
    def _start_category(self, attrsD):
        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _start_media_category(self, attrsD):
        attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema')
        self._start_category(attrsD)

    def _end_itunes_keywords(self):
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, 'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
        self.push('category', 1)
    def _end_category(self):
        value = self.pop('category')
        if not value: return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category
    _end_media_category = _end_category

    def _start_cloud(self, attrsD):
        self._getContext()['cloud'] = FeedParserDict(attrsD)
    def _start_link(self, attrsD):
        attrsD.setdefault('rel', 'alternate')
        if attrsD['rel'] == 'self':
            attrsD.setdefault('type', 'application/atom+xml')
        else:
            attrsD.setdefault('type', 'text/html')
        context = self._getContext()
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context.setdefault('links', [])
        if not (self.inentry and self.inimage):
            context['links'].append(FeedParserDict(attrsD))
        if attrsD.has_key('href'):
            expectingText = 0
            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        context = self._getContext()
    _end_producturl = _end_link
    def _start_guid(self, attrsD):
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)
    def _start_title(self, attrsD):
        if self.svgOK: return self.unknown_starttag('title', attrsD.items())
        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        if self.svgOK: return
        value = self.popContent('title')
        if not value: return
        context = self._getContext()
        self.hasTitle = 1
    _end_dc_title = _end_title

    def _end_media_title(self):
        hasTitle = self.hasTitle
        self._end_title()
        self.hasTitle = hasTitle
    def _start_description(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
    _start_dc_description = _start_description

    def _start_abstract(self, attrsD):
        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
        self._summaryKey = None
    _end_abstract = _end_description
    _end_dc_description = _end_description
    def _start_info(self, attrsD):
        self.pushContent('info', attrsD, 'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info
    def _start_generator(self, attrsD):
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if attrsD.has_key('href'):
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        value = self.pop('generator')
        context = self._getContext()
        if context.has_key('generator_detail'):
            context['generator_detail']['name'] = value
    def _start_admin_generatoragent(self, attrsD):
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')
    def _start_summary(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary
    def _start_enclosure(self, attrsD):
        attrsD = self._itsAnHrefDamnIt(attrsD)
        context = self._getContext()
        attrsD['rel']='enclosure'
        context.setdefault('links', []).append(FeedParserDict(attrsD))
    def _start_source(self, attrsD):
        if attrsD.has_key('url'):
            # This means that we're processing a source element from an RSS 2.0 feed
            self.sourcedata['href'] = attrsD[u'url']
        self.push('source', 1)
        self.insource = 1
        self.hasTitle = 0

    def _end_source(self):
        self.insource = 0
        value = self.pop('source')
        if value:
            self.sourcedata['title'] = value
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()
    def _start_content(self, attrsD):
        self.pushContent('content', attrsD, 'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)

    def _start_prodlink(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)

    def _start_body(self, attrsD):
        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)
    _start_fullitem = _start_content_encoded
    def _end_content(self):
        copyToSummary = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToSummary:
            self._save('summary', value)

    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
    _end_prodlink = _end_content
    def _start_itunes_image(self, attrsD):
        self.push('itunes_image', 0)
        if attrsD.get('href'):
            self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        value = self.pop('itunes_explicit', 0)
        # Convert 'yes' -> True, 'clean' to False, and any other value to None
        # False and None both evaluate as False, so the difference can be ignored
        # by applications that only need to know if the content is explicit.
        self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
    def _start_media_content(self, attrsD):
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    def _start_media_thumbnail(self, attrsD):
        context = self._getContext()
        context.setdefault('media_thumbnail', [])
        self.push('url', 1) # new
        context['media_thumbnail'].append(attrsD)

    def _end_media_thumbnail(self):
        url = self.pop('url')
        context = self._getContext()
        if url != None and len(url.strip()) != 0:
            if not context['media_thumbnail'][-1].has_key('url'):
                context['media_thumbnail'][-1]['url'] = url
    def _start_media_player(self, attrsD):
        self.push('media_player', 0)
        self._getContext()['media_player'] = FeedParserDict(attrsD)

    def _end_media_player(self):
        value = self.pop('media_player')
        context = self._getContext()
        context['media_player']['content'] = value
    def _start_newlocation(self, attrsD):
        self.push('newlocation', 1)

    def _end_newlocation(self):
        url = self.pop('newlocation')
        context = self._getContext()
        # don't set newlocation if the context isn't right
        if context is not self.feeddata:
            return
        context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        self.bozo = 0
        self.exc = None
        self.decls = {}

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)
        if uri == 'http://www.w3.org/1999/xlink':
            self.decls['xmlns:'+prefix] = uri
    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = None
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
        localname = str(localname).lower()

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD, self.decls = self.decls, {}
        if localname == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML':
            attrsD['xmlns'] = namespace
        if localname == 'svg' and namespace == 'http://www.w3.org/2000/svg':
            attrsD['xmlns'] = namespace

        if prefix:
            localname = prefix.lower() + ':' + localname
        elif namespace and not qname: # Expat
            for name, value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
                    break
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            if prefix:
                attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())
    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = ''
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        elif namespace and not qname: # Expat
            for name, value in self.namespacesInUse.items():
                if name and value == namespace:
                    localname = name + ':' + localname
                    break
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    special = re.compile('''[<>'"]''')
    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    elements_no_end_tag = [
        'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
        'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
        'source', 'track', 'wbr'
    ]

    def __init__(self, encoding, _type):
        self.encoding = encoding
        self._type = _type
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'
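    # Illustrative note (not part of the original source): _shorttag_replace
    # is used as a re.sub callback to normalize XML-style empty tags, e.g.
    #     '<br/>'  -> '<br />'        (void element, keeps self-closing form)
    #     '<div/>' -> '<div></div>'   (non-void element, expanded to a pair)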
    def parse_starttag(self, i):
        j = sgmllib.SGMLParser.parse_starttag(self, i)
        if self._type == 'application/xhtml+xml':
            if j > 2 and self.rawdata[j-2:j] == '/>':
                self.unknown_endtag(self.lasttag)
        return j

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        try:
            bytes
            if bytes is str:
                # bytes == str in Python 2.2 and 2.3, so we're not
                # actually running under Python 3
                raise NameError
            self.encoding = self.encoding + '_INVALID_PYTHON_3'
        except NameError:
            if self.encoding and type(data) == type(u''):
                data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)
    def normalize_attrs(self, attrs):
        if not attrs: return attrs
        # utility method to be called by descendants
        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        attrs.sort()
        return attrs
    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        strattrs = ''
        if attrs:
            for key, value in attrs:
                value = value.replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
                value = self.bare_ampersand.sub("&amp;", value)
                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
                if type(value) != type(u''):
                    try:
                        value = unicode(value, self.encoding)
                    except:
                        value = unicode(value, 'iso-8859-1')
                try:
                    # Currently, in Python 3 the key is already a str, and cannot be decoded again
                    uattrs.append((unicode(key, self.encoding), value))
                except TypeError:
                    uattrs.append((key, value))
            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
            if self.encoding:
                try:
                    strattrs = strattrs.encode(self.encoding)
                except:
                    pass
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        if ref.startswith('x'):
            value = unichr(int(ref[1:], 16))
        else:
            value = unichr(int(ref))

        if value in _cp1252.keys():
            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
        else:
            self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        if name2codepoint.has_key(ref):
            self.pieces.append('&%(ref)s;' % locals())
        else:
            self.pieces.append('&amp;%(ref)s' % locals())
    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
            # self.updatepos(declstartpos, i)
            return None, -1

    def convert_charref(self, name):
        return '&#%s;' % name

    def convert_entityref(self, name):
        return '&%s;' % name

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding, entities):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        self.entities = entities
    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
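    # Illustrative note (not part of the original source): decodeEntities()
    # first canonicalizes numeric character references for XML's special
    # characters into their named forms (e.g. '&#60;' and '&#x3c;' both become
    # '&lt;'), then, for non-XML content types only, resolves those named
    # entities to literal characters, e.g.
    #     '&#60;b&#62;' -> '&lt;b&gt;' -> '<b>'   (when type is text/plain)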
    def strattrs(self, attrs):
        return ''.join([' %s="%s"' % (n, v.replace('"', '&quot;')) for n, v in attrs])
class _MicroformatsParser:
    STRING = 1
    DATE = 2
    URI = 3
    NODE = 4
    EMAIL = 5

    known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
    known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']

    def __init__(self, data, baseuri, encoding):
        self.document = BeautifulSoup.BeautifulSoup(data)
        self.baseuri = baseuri
        self.encoding = encoding
        if type(data) == type(u''):
            data = data.encode(encoding)
        self.tags = []
        self.enclosures = []
        self.xfn = []
        self.vcard = None
    def vcardEscape(self, s):
        if type(s) in (type(''), type(u'')):
            s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
        return s

    def vcardFold(self, s):
        s = re.sub(';+$', '', s)
        sFolded = ''
        iMax = 75
        sPrefix = ''
        while len(s) > iMax:
            sFolded += sPrefix + s[:iMax] + '\n'
            s = s[iMax:]
            sPrefix = ' '
            iMax = 74
        sFolded += sPrefix + s
        return sFolded

    def normalize(self, s):
        return re.sub(r'\s+', ' ', s).strip()
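    # Illustrative note (not part of the original source): vcardFold()
    # implements RFC 2426 line folding -- physical lines are limited to 75
    # octets and each continuation line starts with a single space (which is
    # why iMax drops to 74 after the first slice), e.g.
    #     'NOTE:' + 'x'*80  ->  'NOTE:xxx...xxx\n xxxxxxxxxx'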
    def unique(self, aList):
        results = []
        for element in aList:
            if element not in results:
                results.append(element)
        return results

    def toISO8601(self, dt):
        return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
    def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
        all = lambda x: 1
        sProperty = sProperty.lower()
        bFound = 0
        bNormalize = 1
        propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
        if bAllowMultiple and (iPropertyType != self.NODE):
            snapResults = []
            containers = elmRoot(['ul', 'ol'], propertyMatch)
            for container in containers:
                snapResults.extend(container('li'))
            bFound = (len(snapResults) != 0)
        if not bFound:
            snapResults = elmRoot(all, propertyMatch)
            bFound = (len(snapResults) != 0)
        if (not bFound) and (sProperty == 'value'):
            snapResults = elmRoot('pre')
            bFound = (len(snapResults) != 0)
            bNormalize = not bFound
            if not bFound:
                snapResults = [elmRoot]
                bFound = (len(snapResults) != 0)
        arFilter = []
        if sProperty == 'vcard':
            snapFilter = elmRoot(all, propertyMatch)
            for node in snapFilter:
                if node.findParent(all, propertyMatch):
                    arFilter.append(node)
        arResults = []
        for node in snapResults:
            if node not in arFilter:
                arResults.append(node)
        bFound = (len(arResults) != 0)
        if not bFound:
            if bAllowMultiple: return []
            elif iPropertyType == self.STRING: return ''
            elif iPropertyType == self.DATE: return None
            elif iPropertyType == self.URI: return ''
            elif iPropertyType == self.NODE: return None
            else: return None
        arValues = []
        for elmResult in arResults:
            sValue = None
            if iPropertyType == self.NODE:
                if bAllowMultiple:
                    arValues.append(elmResult)
                    continue
                else:
                    return elmResult
            sNodeName = elmResult.name.lower()
            if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
                sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'abbr'):
                sValue = elmResult.get('title')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (iPropertyType == self.URI):
                if sNodeName == 'a': sValue = elmResult.get('href')
                elif sNodeName == 'img': sValue = elmResult.get('src')
                elif sNodeName == 'object': sValue = elmResult.get('data')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if (not sValue) and (sNodeName == 'img'):
                sValue = elmResult.get('alt')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue:
                sValue = elmResult.renderContents()
                sValue = re.sub(r'<\S[^>]*>', '', sValue)
                sValue = sValue.replace('\r\n', '\n')
                sValue = sValue.replace('\r', '\n')
            if sValue:
                sValue = bNormalize and self.normalize(sValue) or sValue.strip()
            if not sValue: continue
            if iPropertyType == self.DATE:
                sValue = _parse_date_iso8601(sValue)
            if bAllowMultiple:
                arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
            else:
                return bAutoEscape and self.vcardEscape(sValue) or sValue
        return arValues
    def findVCards(self, elmRoot, bAgentParsing=0):
        sVCards = ''

        if not bAgentParsing:
            arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
        else:
            arCards = [elmRoot]

        for elmCard in arCards:
            arLines = []

            def processSingleString(sProperty):
                sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
                if sValue:
                    arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
                return sValue or u''

            def processSingleURI(sProperty):
                sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
                if sValue:
                    sContentType = ''
                    sEncoding = ''
                    sValueKey = ''
                    if sValue.startswith('data:'):
                        sEncoding = ';ENCODING=b'
                        sContentType = sValue.split(';')[0].split('/').pop()
                        sValue = sValue.split(',', 1).pop()
                    else:
                        elmValue = self.getPropertyValue(elmCard, sProperty)
                        if elmValue:
                            if sProperty != 'url':
                                sValueKey = ';VALUE=uri'
                            sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
                    sContentType = sContentType.upper()
                    if sContentType == 'OCTET-STREAM':
                        sContentType = ''
                    if sContentType:
                        sContentType = ';TYPE=' + sContentType.upper()
                    arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))

            def processTypeValue(sProperty, arDefaultType, arForceType=None):
                arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
                for elmResult in arResults:
                    arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
                    if arForceType:
                        arType = self.unique(arForceType + arType)
                    if not arType:
                        arType = arDefaultType
                    sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
                    if sValue:
                        arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))

            # AGENT
            # must do this before all other properties because it is destructive
            # (removes nested class="vcard" nodes so they don't interfere with
            # this vcard's other properties)
            arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
            for elmAgent in arAgent:
                if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
                    sAgentValue = self.findVCards(elmAgent, 1) + '\n'
                    sAgentValue = sAgentValue.replace('\n', '\\n')
                    sAgentValue = sAgentValue.replace(';', '\\;')
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT:' + sAgentValue))
                    # Completely remove the agent element from the parse tree
                    elmAgent.extract()
                else:
                    sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1)
                    if sAgentValue:
                        arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
= processSingleString('fn')
2216 elmName
= self
.getPropertyValue(elmCard
, 'n')
2218 sFamilyName
= self
.getPropertyValue(elmName
, 'family-name', self
.STRING
, bAutoEscape
=1)
2219 sGivenName
= self
.getPropertyValue(elmName
, 'given-name', self
.STRING
, bAutoEscape
=1)
2220 arAdditionalNames
= self
.getPropertyValue(elmName
, 'additional-name', self
.STRING
, 1, 1) + self
.getPropertyValue(elmName
, 'additional-names', self
.STRING
, 1, 1)
2221 arHonorificPrefixes
= self
.getPropertyValue(elmName
, 'honorific-prefix', self
.STRING
, 1, 1) + self
.getPropertyValue(elmName
, 'honorific-prefixes', self
.STRING
, 1, 1)
2222 arHonorificSuffixes
= self
.getPropertyValue(elmName
, 'honorific-suffix', self
.STRING
, 1, 1) + self
.getPropertyValue(elmName
, 'honorific-suffixes', self
.STRING
, 1, 1)
2223 arLines
.append(self
.vcardFold('N:' + sFamilyName
+ ';' +
2225 ','.join(arAdditionalNames
) + ';' +
2226 ','.join(arHonorificPrefixes
) + ';' +
2227 ','.join(arHonorificSuffixes
)))
2229 # implied "N" optimization
2230 # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
2231 arNames
= self
.normalize(sFN
).split()
2232 if len(arNames
) == 2:
2233 bFamilyNameFirst
= (arNames
[0].endswith(',') or
2234 len(arNames
[1]) == 1 or
2235 ((len(arNames
[1]) == 2) and (arNames
[1].endswith('.'))))
2236 if bFamilyNameFirst
:
2237 arLines
.append(self
.vcardFold('N:' + arNames
[0] + ';' + arNames
[1]))
2239 arLines
.append(self
.vcardFold('N:' + arNames
[1] + ';' + arNames
[0]))
            # SORT-STRING
            sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
            if sSortString:
                arLines.append(self.vcardFold('SORT-STRING:' + sSortString))

            # NICKNAME
            arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
            if arNickname:
                arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))

            # PHOTO
            processSingleURI('photo')

            # BDAY
            dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
            if dtBday:
                arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))

            # ADR (address)
            arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
            for elmAdr in arAdr:
                arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
                if not arType:
                    arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
                sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
                sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
                sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
                sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
                sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
                sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
                sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
                arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
                                              sPostOfficeBox + ';' +
                                              sExtendedAddress + ';' +
                                              sStreetAddress + ';' +
                                              sLocality + ';' +
                                              sRegion + ';' +
                                              sPostalCode + ';' +
                                              sCountryName))

            # LABEL
            processTypeValue('label', ['intl','postal','parcel','work'])

            # TEL (phone number)
            processTypeValue('tel', ['voice'])

            # EMAIL
            processTypeValue('email', ['internet'], ['internet'])

            # MAILER
            processSingleString('mailer')

            # TZ (timezone)
            processSingleString('tz')

            # GEO (geographical information)
            elmGeo = self.getPropertyValue(elmCard, 'geo')
            if elmGeo:
                sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
                sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
                arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))

            # TITLE
            processSingleString('title')

            # ROLE
            processSingleString('role')

            # LOGO
            processSingleURI('logo')

            # ORG (organization)
            elmOrg = self.getPropertyValue(elmCard, 'org')
            if elmOrg:
                sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
                if not sOrganizationName:
                    # implied "organization-name" optimization
                    # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
                    sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
                    if sOrganizationName:
                        arLines.append(self.vcardFold('ORG:' + sOrganizationName))
                else:
                    arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
                    arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))

            # CATEGORY
            arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
            if arCategory:
                arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))

            # NOTE
            processSingleString('note')

            # REV
            processSingleString('rev')

            # SOUND
            processSingleURI('sound')

            # UID
            processSingleString('uid')

            # URL
            processSingleURI('url')

            # CLASS
            processSingleString('class')

            # KEY
            processSingleURI('key')

            if arLines:
                arLines = [u'BEGIN:vCard', u'VERSION:3.0'] + arLines + [u'END:vCard']
                sVCards += u'\n'.join(arLines) + u'\n'

        return sVCards.strip()
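    # Illustrative example (not part of the original source): given markup like
    #     <div class="vcard"><span class="fn">Jane Doe</span></div>
    # findVCards() would emit roughly
    #     BEGIN:vCard
    #     VERSION:3.0
    #     FN:Jane Doe
    #     N:Doe;Jane
    #     END:vCard
    # (N is derived from FN via the implied-N optimization shown above).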
    def isProbablyDownloadable(self, elm):
        attrsD = elm.attrMap
        if not attrsD.has_key('href'): return 0
        linktype = attrsD.get('type', '').strip()
        if linktype.startswith('audio/') or \
           linktype.startswith('video/') or \
           (linktype.startswith('application/') and not linktype.endswith('xml')):
            return 1
        path = urlparse.urlparse(attrsD['href'])[2]
        if path.find('.') == -1: return 0
        fileext = path.split('.').pop().lower()
        return fileext in self.known_binary_extensions
    def findTags(self):
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
            href = elm.get('href')
            if not href: continue
            urlscheme, domain, path, params, query, fragment = \
                urlparse.urlparse(_urljoin(self.baseuri, href))
            segments = path.split('/')
            tag = segments.pop()
            if not tag:
                tag = segments.pop()
            tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
            if not tagscheme.endswith('/'):
                tagscheme += '/'
            self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
    def findEnclosures(self):
        all = lambda x: 1
        enclosure_match = re.compile(r'\benclosure\b')
        for elm in self.document(all, {'href': re.compile(r'.+')}):
            if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue
            if elm.attrMap not in self.enclosures:
                self.enclosures.append(elm.attrMap)
                if elm.string and not elm.get('title'):
                    self.enclosures[-1]['title'] = elm.string

    def findXFN(self):
        all = lambda x: 1
        for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
            rels = elm.get('rel', '').split()
            xfn_rels = []
            for rel in rels:
                if rel in self.known_xfn_relationships:
                    xfn_rels.append(rel)
            if xfn_rels:
                self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
def _parseMicroformats(htmlSource, baseURI, encoding):
    if not BeautifulSoup: return
    if _debug: sys.stderr.write('entering _parseMicroformats\n')
    try:
        p = _MicroformatsParser(htmlSource, baseURI, encoding)
    except UnicodeEncodeError:
        # sgmllib throws this exception when performing lookups of tags
        # with non-ASCII characters in them.
        return
    p.vcard = p.findVCards(p.document)
    p.findTags()
    p.findEnclosures()
    p.findXFN()
    return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding, _type):
        _BaseHTMLProcessor.__init__(self, encoding, _type)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))

    def unknown_starttag(self, tag, attrs):
        if _debug:
            sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
    if _debug:
        sys.stderr.write('entering _resolveRelativeURIs\n')

    p = _RelativeURIResolver(baseURI, encoding, _type)
    p.feed(htmlSource)
    return p.output()

def _makeSafeAbsoluteURI(base, rel=None):
    # bail if ACCEPTABLE_URI_SCHEMES is empty
    if not ACCEPTABLE_URI_SCHEMES:
        return _urljoin(base, rel or u'')
    if not base:
        return rel or u''
    if not rel:
        if base.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
            return u''
        return base
    uri = _urljoin(base, rel)
    if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
        return u''
    return uri
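# Illustrative example (not part of the original source), assuming the default
# ACCEPTABLE_URI_SCHEMES includes 'http' but not 'javascript':
#     _makeSafeAbsoluteURI('http://example.org/feed', '/img.png')
#         -> u'http://example.org/img.png'
#     _makeSafeAbsoluteURI('http://example.org/feed', 'javascript:alert(1)')
#         -> u''   (scheme not in the whitelist, so the URI is dropped)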
class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article',
      'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
      'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command',
      'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir',
      'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer',
      'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
      'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
      'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
      'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
      'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub',
      'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead',
      'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
      'background', 'balance', 'bgcolor', 'bgproperties', 'border',
      'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
      'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
      'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
      'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
      'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
      'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
      'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
      'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
      'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
      'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
      'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
      'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
      'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
      'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
      'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
      'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
      'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
      'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
      'xml:lang']

    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']

    acceptable_css_properties = ['azimuth', 'background-color',
      'border-bottom-color', 'border-collapse', 'border-color',
      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
      'white-space', 'width']

    # survey of common keywords found in feeds
    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
      'transparent', 'underline', 'white', 'yellow']

    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
      '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
      'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
      'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
      'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
      'munderover', 'none', 'semantics']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
      'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
      'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
      'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
      'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
      'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
      'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
      'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
      'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
      'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
      'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
      'svg', 'switch', 'text', 'title', 'tspan', 'use']

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
      'arabic-form', 'ascent', 'attributeName', 'attributeType',
      'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
      'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
      'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
      'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
      'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
      'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
      'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
      'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
      'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
      'min', 'name', 'offset', 'opacity', 'orient', 'origin',
      'overline-position', 'overline-thickness', 'panose-1', 'path',
      'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
      'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
      'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
      'stop-color', 'stop-opacity', 'strikethrough-position',
      'strikethrough-thickness', 'stroke', 'stroke-dasharray',
      'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
      'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
      'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
      'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
      'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
      'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
      'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
      'y2', 'zoomAndPan']

    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
      'stroke-opacity']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0
    def unknown_starttag(self, tag, attrs):
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if not tag in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith('html'):
                if not dict(attrs).get('xmlns'):
                    if tag == 'svg':
                        attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
                    if tag == 'math':
                        attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag == 'math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag == 'svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # for most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case
                if not self.svg_attr_map:
                    lower = [attr.lower() for attr in self.svg_attributes]
                    mix = [a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = dict([(a.lower(), a) for a in mix])

                    lower = [attr.lower() for attr in self.svg_elements]
                    mix = [a for a in self.svg_elements if a not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = dict([(a.lower(), a) for a in mix])
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif not tag in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if filter(lambda (n,v): n.startswith('xlink:'), attrs):
                if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
                    attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key in acceptable_attributes:
                key = keymap.get(key, key)
                clean_attrs.append((key, value))
            elif key == 'style':
                clean_value = self.sanitize_style(value)
                if clean_value: clean_attrs.append((key, clean_value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == 'svg' and self.svgOK: self.svgOK -= 1
            else:
                return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
    def sanitize_style(self, style):
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
        # This replaced a regexp that used re.match and was prone to pathological back-tracking.
        if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''

        clean = []
        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
            if not value: continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                       not self.valid_css_values.match(keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
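    # Illustrative example (not part of the original source):
    #     sanitize_style('color: red; background: url(evil.js); position: fixed')
    # would return 'color: red;' -- the url() value is stripped by the first
    # regexp (leaving background with an empty value, which is skipped) and
    # 'position' is not in acceptable_css_properties.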
def _sanitizeHTML(htmlSource, encoding, _type):
    p = _HTMLSanitizer(encoding, _type)
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
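# Illustrative example (not part of the original source):
#     _sanitizeHTML('<p onclick="evil()">hi</p><script>evil()</script>',
#                   'utf-8', 'text/html')
# would return '<p>hi</p>' -- the onclick attribute is not in
# acceptable_attributes, and everything inside <script> is suppressed
# because script is in unacceptable_elements_with_end_tag.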
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302
, req
, fp
, code
, msg
, headers
):
2796 # - server requires digest auth, AND
2797 # - we tried (unsuccessfully) with basic auth, AND
2798 # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
2799 # If all conditions hold, parse authentication information
2800 # out of the Authorization header we sent the first time
2801 # (for the username and password) and the WWW-Authenticate
2802 # header the server sent back (for the realm) and retry
2803 # the request with the appropriate digest auth headers instead.
2804 # This evil genius hack has been brought to you by Aaron Swartz.
2805 host
= urlparse
.urlparse(req
.get_full_url())[1]
2807 assert sys
.version
.split()[0] >= '2.3.3'
2808 assert base64
!= None
2809 user
, passw
= _base64decode(req
.headers
['Authorization'].split(' ')[1]).split(':')
2810 realm
= re
.findall('realm="([^"]*)"', headers
['WWW-Authenticate'])[0]
2811 self
.add_password(realm
, host
, user
, passw
)
2812 retry
= self
.http_error_auth_reqed('www-authenticate', host
, req
, headers
)
2813 self
.reset_retry_count()
2816 return self
.http_error_default(req
, fp
, code
, msg
, headers
)
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it can be a tuple of 9 integers
    (as returned by gmtime() in the standard Python time module) or a date
    string in any format supported by feedparser. Regardless, it MUST
    be in GMT (Greenwich Mean Time). It will be reformatted into an
    RFC 1123-compliant date and used as the value of an If-Modified-Since
    request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.

    if request_headers is supplied it is a dictionary of HTTP request headers
    that will override the values generated by FeedParser.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
        # Deal with the feed URI scheme
        if url_file_stream_or_string.startswith('feed:http'):
            url_file_stream_or_string = url_file_stream_or_string[5:]
        elif url_file_stream_or_string.startswith('feed:'):
            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.standard_b64encode(user_passwd).strip()

        # iri support
        try:
            if isinstance(url_file_stream_or_string, unicode):
                url_file_stream_or_string = url_file_stream_or_string.encode('idna').decode('utf-8')
            else:
                url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna').decode('utf-8')
        except:
            pass

        # try to open with urllib2 (to use optional headers)
        request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
        opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()]))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string, 'rb')
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    if etag:
        request.add_header('If-None-Match', etag)
    if type(modified) == type(''):
        modified = _parse_date(modified)
    elif isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    if modified:
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
    if referrer:
        request.add_header('Referer', referrer)
    if gzip and zlib:
        request.add_header('Accept-encoding', 'gzip, deflate')
    elif gzip:
        request.add_header('Accept-encoding', 'gzip')
    elif zlib:
        request.add_header('Accept-encoding', 'deflate')
    else:
        request.add_header('Accept-encoding', '')
    if auth:
        request.add_header('Authorization', 'Basic %s' % auth)
    if ACCEPT_HEADER:
        request.add_header('Accept', ACCEPT_HEADER)
    # use this for whatever -- cookies, special headers, etc
    # [('Cookie','Something'),('x-special-header','Another Value')]
    for header_name, header_value in request_headers.items():
        request.add_header(header_name, header_value)
    request.add_header('A-IM', 'feed') # RFC 3229 support
    return request
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    _date_handlers.insert(0, func)
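# Illustrative example (not part of the original source): an application can
# register its own date handler, which will be tried before the built-in ones
# because registerDateHandler() inserts at the front of the list, e.g.
#
#     def _parse_date_epoch(dateString):
#         '''Parse a seconds-since-epoch string like "1136073600"'''
#         if not dateString.isdigit(): return None
#         return time.gmtime(int(dateString))
#     registerDateHandler(_parse_date_epoch)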
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(\.(?P<fracsecond>\d+))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
try:
    del tmpl
except NameError:
    pass
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
try:
    del regex
except NameError:
    pass
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                 params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(float(params.get('second', 0)))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    daylight_savings_flag = -1
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tuple(tm)))
registerDateHandler(_parse_date_iso8601)
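# Illustrative examples (not part of the original source) of strings the
# templates above are meant to cover:
#     '2004-01-05', '20040105', '2004-005' (ordinal date), '04-01-05',
#     '2004-01-05T12:30:45Z', '2004-01-05T12:30:45+09:00'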
# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)

def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        return
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Fred Drake and licensed under the Python license.  Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    def __extract_date(m):
        year = int(m.group('year'))
        if year < 100:
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group('month')
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group('day')
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        if tzd[0] == '+':
            return -offset
        return offset

    __date_re = ('(?P<year>\d\d\d\d)'
                 '(?:(?P<dsep>-|)'
                 '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
                 '|(?P<julian>\d\d\d)))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
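# Illustrative example (comment only, not executed; the exact tuple assumes
# a well-behaved local mktime):
#   >>> _parse_date_w3dtf('2003-12-31T10:14:55Z')
#   (2003, 12, 31, 10, 14, 55, 2, 365, 0)
# i.e. a 9-tuple in GMT, as produced by time.gmtime().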
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    # Account for the Etc/GMT timezone by stripping 'Etc/'
    elif len(data) == 5 and data[4].lower().startswith('etc/'):
        data[4] = data[4][4:]
        dateString = " ".join(data)
    if len(data) < 5:
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
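# Illustrative example (comment only, not executed):
#   >>> _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')
#   (2004, 1, 1, 19, 48, 21, 3, 1, 0)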
def _parse_date_perforce(aDateString):
    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
    # Fri, 2006/09/15 08:19:53 EDT
    _my_date_pattern = re.compile( \
        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
    dow, year, month, day, hour, minute, second, tz = \
        _my_date_pattern.search(aDateString).groups()
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
registerDateHandler(_parse_date_perforce)
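# Illustrative example (comment only, not executed): the Perforce date
#   'Fri, 2006/09/15 08:19:53 EDT'
# is rewritten as 'Fri, 15 Sep 2006 08:19:53 EDT' and parsed via rfc822.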
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
            if not date9tuple: continue
            if len(date9tuple) != 9:
                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
                raise ValueError
            map(int, date9tuple)
            return date9tuple
        except Exception, e:
            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
            pass
    return None
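# Illustrative example (comment only, not executed): _parse_date tries each
# registered handler in turn until one returns a valid 9-tuple, e.g.
#   >>> _parse_date('Thu, 01 Jan 2004 19:48:21 GMT')
#   (2004, 1, 1, 19, 48, 21, 3, 1, 0)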
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    # some feeds claim to be gb2312 but are actually gb18030.
    # apparently MSIE and Firefox both do the following switch:
    if true_encoding.lower() == 'gb2312':
        true_encoding = 'gb18030'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
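# Illustrative example (comment only, not executed), assuming a charset in
# the HTTP headers and no encoding attribute in the XML declaration:
#   >>> _getCharacterEncoding({'content-type': 'application/xml; charset=utf-8'},
#   ...                       '<?xml version="1.0"?><feed/>')
#   ('utf-8', 'utf-8', '', '', 1)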
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
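# Illustrative example (comment only, not executed): given a document that is
# really iso-8859-1,
#   _toUTF8('<?xml version="1.0" encoding="iso-8859-1"?><feed/>', 'iso-8859-1')
# returns the same document as UTF-8 bytes with its declaration rewritten to
#   <?xml version='1.0' encoding='utf-8'?>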
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
    start = re.search(_s2bytes('<\w'), data)
    start = start and start.start() or -1
    head,data = data[:start+1], data[start+1:]

    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
    entity_results=entity_pattern.findall(head)
    head = entity_pattern.sub(_s2bytes(''), head)
    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
    doctype_results = doctype_pattern.findall(head)
    doctype = doctype_results and doctype_results[0] or _s2bytes('')
    if doctype.lower().count(_s2bytes('netscape')):
        version = 'rss091n'
    else:
        version = None

    # only allow in 'safe' inline entity definitions
    replacement=_s2bytes('')
    if len(doctype_results)==1 and entity_results:
        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
        if safe_entities:
            replacement=_s2bytes('<!DOCTYPE feed [\n  <!ENTITY') + _s2bytes('>\n  <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
    data = doctype_pattern.sub(replacement, head) + data

    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
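# Illustrative example (comment only, not executed): a Netscape RSS 0.91
# document beginning
#   <!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN" ...>
# yields ('rss091n', data_without_doctype, {}).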
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], request_headers={}, response_headers={}):
    '''Parse a feed from a URL, file, stream, or string.

    request_headers, if given, is a dict from http header name to value to add
    to the request; this overrides internally generated values.
    '''
    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    if not isinstance(handlers, list):
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = None
        f = None

    if hasattr(f, 'headers'):
        result['headers'] = dict(f.headers)
    # overwrite existing headers using response_headers
    if 'headers' in result:
        result['headers'].update(response_headers)
    elif response_headers:
        result['headers'] = copy.deepcopy(response_headers)

    # if feed is gzip-compressed, decompress it
    if f and data and 'headers' in result:
        if gzip and result['headers'].get('content-encoding') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception, e:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage.  Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't have the data persisted for that.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None
        elif zlib and result['headers'].get('content-encoding') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = None

    # save HTTP headers
    if 'headers' in result:
        if 'etag' in result['headers'] or 'ETag' in result['headers']:
            etag = result['headers'].get('etag', result['headers'].get('ETag'))
            if etag:
                result['etag'] = etag
        if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
            modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
            if modified:
                result['modified'] = _parse_date(modified)
    if hasattr(f, 'url'):
        result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type') or http_headers.has_key('Content-type'):
            bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type'))
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    if data is not None:
        result['version'], data, entities = _stripDoctype(data)

    # ensure that baseuri is an absolute uri using an acceptable URI scheme
    contentloc = http_headers.get('content-location', http_headers.get('Content-Location', ''))
    href = result.get('href', '')
    baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href

    baselang = http_headers.get('content-language', http_headers.get('Content-Language', None))

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = ''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data.  This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if data is None:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding: continue
        if proposed_encoding in tried_encodings: continue
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
            break
        except:
            pass
    # if no luck and we have auto-detection library, try that
    if (not known_encoding) and chardet:
        try:
            proposed_encoding = chardet.detect(data)['encoding']
            if proposed_encoding and (proposed_encoding not in tried_encodings):
                tried_encodings.append(proposed_encoding)
                data = _toUTF8(data, proposed_encoding)
                known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and ('utf-8' not in tried_encodings):
        try:
            proposed_encoding = 'utf-8'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and ('windows-1252' not in tried_encodings):
        try:
            proposed_encoding = 'windows-1252'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried iso-8859-2 yet, try that.
    if (not known_encoding) and ('iso-8859-2' not in tried_encodings):
        try:
            proposed_encoding = 'iso-8859-2'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = ''
    elif proposed_encoding != result['encoding']:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except Exception, e:
            if _debug:
                import traceback
                traceback.print_stack()
                traceback.print_exc()
                sys.stderr.write('xml parsing failed\n')
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser:
        feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
        feedparser.feed(data.decode('utf-8', 'replace'))
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
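# Illustrative usage (comment only, not executed):
#   >>> import feedparser
#   >>> d = feedparser.parse('http://feedparser.org/docs/examples/atom10.xml')
#   >>> d['feed']['title']    # the feed-level title, if present
# d['bozo'] is set to 1 whenever the feed is not well-formed; the offending
# exception is stored in d['bozo_exception'].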
class Serializer:
    def __init__(self, results):
        self.results = results

class TextSerializer(Serializer):
    def write(self, stream=sys.stdout):
        self._writer(stream, self.results, '')

    def _writer(self, stream, node, prefix):
        if not node: return
        if hasattr(node, 'keys'):
            keys = node.keys()
            keys.sort()
            for k in keys:
                if k in ('description', 'link'): continue
                if node.has_key(k + '_detail'): continue
                if node.has_key(k + '_parsed'): continue
                self._writer(stream, node[k], prefix + k + '.')
        elif type(node) == types.ListType:
            index = 0
            for n in node:
                self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].')
                index += 1
        else:
            try:
                s = str(node).encode('utf-8')
                s = s.replace('\\', '\\\\')
                s = s.replace('\r', '')
                s = s.replace('\n', r'\n')
                stream.write(prefix[:-1])
                stream.write('=')
                stream.write(s)
                stream.write('\n')
            except:
                pass

class PprintSerializer(Serializer):
    def write(self, stream=sys.stdout):
        if self.results.has_key('href'):
            stream.write(self.results['href'] + '\n\n')
        from pprint import pprint
        pprint(self.results, stream)
        stream.write('\n')
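# Illustrative example (comment only, not executed): TextSerializer flattens
# the result into 'key=value' lines, e.g.
#   feed.title=Some Feed Title
#   entries[0].link=http://example.org/
# while PprintSerializer simply pprint()s the whole result dict.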
if __name__ == '__main__':
    try:
        from optparse import OptionParser
    except:
        OptionParser = None

    if OptionParser:
        optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-")
        optionParser.set_defaults(format="pprint")
        optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs")
        optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs")
        optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs")
        optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)")
        optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)")
        optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr")
        (options, urls) = optionParser.parse_args()
        if options.verbose:
            _debug = 1
        if not urls:
            optionParser.print_help()
            sys.exit(0)
    else:
        if not sys.argv[1:]:
            print __doc__
            sys.exit(0)
        class _Options:
            etag = modified = agent = referrer = None
            format = 'pprint'
        options = _Options()
        urls = sys.argv[1:]

    zopeCompatibilityHack()

    serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer)
    for url in urls:
        results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer)
        serializer(results).write(sys.stdout)
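# Illustrative command-line usage (assuming this file is saved as
# feedparser.py somewhere on your path):
#   $ python feedparser.py --format=text http://feedparser.org/docs/examples/atom10.xml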