#!/usr/bin/env python
"""Universal feed parser

Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds

Visit http://feedparser.org/ for the latest version
Visit http://feedparser.org/docs/ for the latest documentation

Required: Python 2.1 or later
Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""

__version__ = "4.2-pre-" + "$Revision$"[11:14] + "-svn"
__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>",
                    "Sam Ruby <http://intertwingly.net/>",
                    "Ade Oshineye <http://blog.oshineye.com/>",
                    "Martin Pool <http://sourcefrog.net/>"]
_debug = 0

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference. Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

# If you want feedparser to automatically resolve all relative URIs, set this
# to 1.
RESOLVE_RELATIVE_URIS = 1

# If you want feedparser to automatically sanitize all potentially unsafe
# HTML content, set this to 1.
SANITIZE_HTML = 1
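
# For illustration only: an embedding application would typically override
# these module-level settings once, right after import and before parsing.
# The application name and URL below are hypothetical:
#
#   import feedparser
#   feedparser.USER_AGENT = 'MyAggregator/1.0 +http://example.com/'
#   feedparser.RESOLVE_RELATIVE_URIS = 0   # keep relative URIs as-is
#   feedparser.SANITIZE_HTML = 0           # trust the feed's markup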
# ---------- Python 3 modules (make it work if possible) ----------
try:
    import rfc822
except ImportError:
    from email import _parseaddr as rfc822

try:
    # Python 3.1 introduces bytes.maketrans and simultaneously
    # deprecates string.maketrans; use bytes.maketrans if possible
    _maketrans = bytes.maketrans
except (NameError, AttributeError):
    import string
    _maketrans = string.maketrans

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
    # Python 3.1 deprecates decodestring in favor of decodebytes
    _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
except:
    base64 = binascii = None

def _s2bytes(s):
    # Convert a UTF-8 str to bytes if the interpreter is Python 3
    try:
        return bytes(s, 'utf8')
    except (NameError, TypeError):
        # In Python 2.5 and below, bytes doesn't exist (NameError)
        # In Python 2.6 and above, bytes and str are the same (TypeError)
        return s

def _l2bytes(l):
    # Convert a list of ints to bytes if the interpreter is Python 3
    try:
        if bytes is not str:
            # In Python 2.6 and above, this call won't raise an exception
            # but it will return bytes([65]) as '[65]' instead of 'A'
            return bytes(l)
        raise NameError
    except NameError:
        return ''.join(map(chr, l))
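
# Illustrative behavior of the two helpers above:
#   _s2bytes('ABC')     => b'ABC' under Python 3, 'ABC' under Python 2
#   _l2bytes([65, 66])  => b'AB'  under Python 3, 'AB'  under Python 2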
# If you want feedparser to allow all URL schemes, set this to ()
# List culled from Python's urlparse documentation at:
# http://docs.python.org/library/urlparse.html
# as well as from "URI scheme" at Wikipedia:
# https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
# Many more will likely need to be added!
ACCEPTABLE_URI_SCHEMES = (
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto',
    'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp',
    'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
    # Additional common-but-unofficial schemes
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
)
#ACCEPTABLE_URI_SCHEMES = ()
# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, types, cgi, urllib, urllib2, datetime
try:
    from io import BytesIO as _StringIO
except ImportError:
    try:
        from cStringIO import StringIO as _StringIO
    except:
        from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it. feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data, entities={}):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        # iterate over the mapping's items, not its keys
        for char, entity in entities.items():
            data = data.replace(char, entity)
        return data
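
# Illustrative behavior of the string-replace fallback above:
#   _xmlescape('AT&T <b>')                   => 'AT&amp;T &lt;b&gt;'
#   _xmlescape('say "hi"', {'"': '&quot;'})  => 'say &quot;hi&quot;'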
# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None

# reversible htmlentitydefs mappings for Python 2.2
try:
    from htmlentitydefs import name2codepoint, codepoint2name
except:
    import htmlentitydefs
    name2codepoint = {}
    codepoint2name = {}
    for (name, codepoint) in htmlentitydefs.entitydefs.iteritems():
        if codepoint.startswith('&#'): codepoint = unichr(int(codepoint[2:-1]))
        name2codepoint[name] = ord(codepoint)
        codepoint2name[ord(codepoint)] = name

# BeautifulSoup parser used for parsing microformats from embedded HTML content
# http://www.crummy.com/software/BeautifulSoup/
# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
# older 2.x series. If it doesn't, and you can figure out why, I'll accept a
# patch and modify the compatibility statement accordingly.
try:
    import BeautifulSoup
except:
    BeautifulSoup = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')

if sgmllib.endbracket.search(' <').start(0):
    class EndBracketRegEx:
        def __init__(self):
            # Overriding the built-in sgmllib.endbracket regex allows the
            # parser to find angle brackets embedded in element attributes.
            self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
        def search(self, string, index=0):
            match = self.endbracket.match(string, index)
            if match is not None:
                # Returning a new object in the calling thread's context
                # resolves a thread-safety issue.
                return EndBracketMatch(match)
            return None
    class EndBracketMatch:
        def __init__(self, match):
            self.match = match
        def start(self, n):
            return self.match.end(n)
    sgmllib.endbracket = EndBracketRegEx()
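
# With the override above in place, sgmllib can locate the true end of a tag
# even when an attribute value contains an angle bracket, e.g.:
#   <img src="http://example.com/x.png" alt="a < b">
# The stock sgmllib.endbracket regex would stop at the '<' inside alt.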
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }

try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc
class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['summary', 'subtitle'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}

    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'enclosures':
            norel = lambda link: FeedParserDict([(name, value) for (name, value) in link.items() if name != 'rel'])
            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel'] == 'enclosure']
        if key == 'license':
            for link in UserDict.__getitem__(self, 'links'):
                if link['rel'] == 'license' and link.has_key('href'):
                    return link['href']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.__contains__(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.__contains__(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.__contains__(self, key)
        except AttributeError:
            return False
    # This alias prevents the 2to3 tool from changing the semantics of the
    # __contains__ function below and exhausting the maximum recursion depth
    __has_key = has_key

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.__has_key(key)
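
# Illustrative FeedParserDict behavior (all values hypothetical):
#   d = FeedParserDict()
#   d['feed'] = FeedParserDict({'title': 'Example'})
#   d['channel'] is d['feed']   => True, via the keymap aliases above
#   d.feed.title                => 'Example', via __getattr__
#   d['tags'] = [{'scheme': None, 'term': 'news', 'label': None}]
#   d['category']               => 'news', the first tag's term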
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc
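
# Illustrative usage: environments that cannot cope with FeedParserDict's
# attribute magic (historically, Zope) call this once after import, which
# replaces the class with a plain-dict factory:
#   import feedparser
#   feedparser.zopeCompatibilityHack()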
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        _ebcdic_to_ascii_map = _maketrans( \
            _l2bytes(range(256)), _l2bytes(emap))
    return s.translate(_ebcdic_to_ascii_map)
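
# Illustrative translation using the table above (0xC1/0x81 are the EBCDIC
# code points for 'A'/'a'):
#   _ebcdic_to_ascii(_l2bytes([0xC1, 0x81]))  => 'Aa' (b'Aa' under Python 3)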
_cp1252 = {
    unichr(128): unichr(8364), # euro sign
    unichr(130): unichr(8218), # single low-9 quotation mark
    unichr(131): unichr( 402), # latin small letter f with hook
    unichr(132): unichr(8222), # double low-9 quotation mark
    unichr(133): unichr(8230), # horizontal ellipsis
    unichr(134): unichr(8224), # dagger
    unichr(135): unichr(8225), # double dagger
    unichr(136): unichr( 710), # modifier letter circumflex accent
    unichr(137): unichr(8240), # per mille sign
    unichr(138): unichr( 352), # latin capital letter s with caron
    unichr(139): unichr(8249), # single left-pointing angle quotation mark
    unichr(140): unichr( 338), # latin capital ligature oe
    unichr(142): unichr( 381), # latin capital letter z with caron
    unichr(145): unichr(8216), # left single quotation mark
    unichr(146): unichr(8217), # right single quotation mark
    unichr(147): unichr(8220), # left double quotation mark
    unichr(148): unichr(8221), # right double quotation mark
    unichr(149): unichr(8226), # bullet
    unichr(150): unichr(8211), # en dash
    unichr(151): unichr(8212), # em dash
    unichr(152): unichr( 732), # small tilde
    unichr(153): unichr(8482), # trade mark sign
    unichr(154): unichr( 353), # latin small letter s with caron
    unichr(155): unichr(8250), # single right-pointing angle quotation mark
    unichr(156): unichr( 339), # latin small ligature oe
    unichr(158): unichr( 382), # latin small letter z with caron
    unichr(159): unichr( 376)} # latin capital letter y with diaeresis

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    try:
        return urlparse.urljoin(base, uri)
    except:
        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
        return urlparse.urljoin(base, uri)
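
# Illustrative results:
#   _urljoin('http://example.com/a/', 'b')       => 'http://example.com/a/b'
#   _urljoin('http://example.com/', 'http:///x') => 'http://x'
# (_urifixer strips the spurious extra slashes after the scheme first)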
class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/': 'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
                  'http://media.tangent.org/rss/1.0/': 'audio',
                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
                  'http://web.resource.org/cc/': 'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company': 'co',
                  'http://purl.org/rss/1.0/modules/content/': 'content',
                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
                  'http://purl.org/dc/elements/1.1/': 'dc',
                  'http://purl.org/dc/terms/': 'dcterms',
                  'http://purl.org/rss/1.0/modules/email/': 'email',
                  'http://purl.org/rss/1.0/modules/event/': 'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
                  'http://freshmeat.net/rss/fm/': 'fm',
                  'http://xmlns.com/foaf/0.1/': 'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
                  'http://postneo.com/icbm/': 'icbm',
                  'http://purl.org/rss/1.0/modules/image/': 'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://purl.org/rss/1.0/modules/link/': 'l',
                  'http://search.yahoo.com/mrss': 'media',
                  # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
                  'http://search.yahoo.com/mrss/': 'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
                  'http://purl.org/rss/1.0/modules/search/': 'search',
                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
                  'http://wellformedweb.org/commentAPI/': 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
                  'http://www.w3.org/1999/xhtml': 'xhtml',
                  'http://www.w3.org/1999/xlink': 'xlink',
                  'http://www.w3.org/XML/1998/namespace': 'xml'
                  }
    _matchnamespaces = {}
    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        self.svgOK = 0
        self.hasTitle = 0
        if baselang:
            self.feeddata['language'] = baselang.replace('_', '-')
    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        # the sgml parser doesn't handle entities in attributes, but
        # strict xml parsers do -- account for this difference
        if isinstance(self, _LooseFeedParser):
            attrs = [(k, v.replace('&amp;', '&')) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        if type(baseuri) != type(u''):
            try:
                baseuri = unicode(baseuri, self.encoding)
            except:
                baseuri = unicode(baseuri, 'iso-8859-1')
        # ensure that self.baseuri is always an absolute URI that
        # uses a whitelisted URI scheme (e.g. not `javascript:`)
        if self.baseuri:
            self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
        else:
            self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang.replace('_', '-')
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            if tag.find(':') != -1:
                prefix, tag = tag.split(':', 1)
                namespace = self.namespacesInUse.get(prefix, '')
                if tag == 'math' and namespace == 'http://www.w3.org/1998/Math/MathML':
                    attrs.append(('xmlns', namespace))
                if tag == 'svg' and namespace == 'http://www.w3.org/2000/svg':
                    attrs.append(('xmlns', namespace))
            if tag == 'svg': self.svgOK += 1
            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
            unknown_tag = prefix + suffix
            if len(attrsD) == 0:
                # No attributes so merge it into the enclosing dictionary
                return self.push(unknown_tag, 1)
            else:
                # Has attributes so create it in its own dictionary
                context = self._getContext()
                context[unknown_tag] = attrsD
    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'
        if suffix == 'svg' and self.svgOK: self.svgOK -= 1

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            if self.svgOK: raise AttributeError()
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]
    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities.keys():
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try: name2codepoint[ref]
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)
    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # CDATA block began but didn't finish
                k = len(self.rawdata)
                return k
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            if k >= 0:
                return k+1
            else:
                # We have an incomplete declaration; wait for more data.
                return k
    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def strattrs(self, attrs):
        return ''.join([' %s="%s"' % (t[0], _xmlescape(t[1], {'"': '&quot;'})) for t in attrs])
    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()

        if self.version == 'atom10' and self.contentparams.get('type', 'text') == 'application/xhtml+xml':
            # remove enclosing child element, but only if it is a <div> and
            # only if all the remaining content is nested underneath it.
            # This means that the divs would be retained in the following:
            #    <div>foo</div><div>bar</div>
            while pieces and len(pieces) > 1 and not pieces[-1].strip():
                del pieces[-1]
            while pieces and len(pieces) > 1 and not pieces[0].strip():
                del pieces[0]
            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1] == '</div>':
                depth = 0
                for piece in pieces[:-1]:
                    if piece.startswith('</'):
                        depth -= 1
                        if depth == 0: break
                    elif piece.startswith('<') and not piece.endswith('/>'):
                        depth += 1
                else:
                    pieces = pieces[1:-1]

        # Ensure each piece is a str for Python 3
        for (i, v) in enumerate(pieces):
            if not isinstance(v, basestring):
                pieces[i] = v.decode('utf-8')

        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = _base64decode(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass
            except TypeError:
                # In Python 3, base64 takes and outputs bytes, not str
                # This may not be the most correct way to accomplish this
                output = _base64decode(output.encode('utf-8')).decode('utf-8')

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        if self.lookslikehtml(output):
            self.contentparams['type'] = 'text/html'

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
        # resolve relative URIs within embedded markup
        if is_htmlish and RESOLVE_RELATIVE_URIS:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))

        # parse microformats
        # (must do this before sanitizing because some microformats
        # rely on elements that we sanitize)
        if is_htmlish and element in ['content', 'description', 'summary']:
            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
            if mfresults:
                for tag in mfresults.get('tags', []):
                    self._addTag(tag['term'], tag['scheme'], tag['label'])
                for enclosure in mfresults.get('enclosures', []):
                    self._start_enclosure(enclosure)
                for xfn in mfresults.get('xfn', []):
                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
                vcard = mfresults.get('vcard')
                if vcard:
                    self._getContext()['vcard'] = vcard

        # sanitize embedded markup
        if is_htmlish and SANITIZE_HTML:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # address common error where people take data that is already
        # utf-8, presume that it is iso-8859-1, and re-encode it.
        if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and type(output) == type(u''):
            try:
                output = unicode(output.encode('iso-8859-1'), 'utf-8')
            except:
                pass

        # map win-1252 extensions to the proper code points
        if type(output) == type(u''):
            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        if element == 'title' and self.hasTitle:
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                if not self.inimage:
                    # query variables in urls in link elements are improperly
                    # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                    # unhandled character references. fix this special case.
                    output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                    self.entries[-1][element] = output
                    if output:
                        self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                # fix query variables; see above for the explanation
                output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
                context[element] = output
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        if self.lang: self.lang = self.lang.replace('_', '-')
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value
    # a number of elements in a number of RSS variants are nominally plain
    # text, but this is routinely ignored. This is an attempt to detect
    # the most common cases. As false positives often result in silent
    # data loss, this function errs on the conservative side.
    def lookslikehtml(self, s):
        if self.version.startswith('atom'): return
        if self.contentparams.get('type', 'text/html') != 'text/plain': return

        # must have a close tag or an entity reference to qualify
        if not (re.search(r'</(\w+)>', s) or re.search("&#?\w+;", s)): return

        # all tags must be in a restricted subset of valid HTML tags
        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
                  re.findall(r'</?(\w+)', s)): return

        # all entities must have been defined as valid HTML entities
        from htmlentitydefs import entitydefs
        if filter(lambda e: e not in entitydefs.keys(),
                  re.findall(r'&(\w+);', s)): return

        return 1
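
    # Illustrative results (assuming a non-Atom feed and default
    # contentparams, so the checks above actually run):
    #   self.lookslikehtml('a <b>bold</b> move')  => 1    (close tag, whitelisted element)
    #   self.lookslikehtml('2 &lt; 3')            => 1    ('lt' is a known HTML entity)
    #   self.lookslikehtml('if (a < b) return x') => None (no close tag or entity)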
    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos != -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1
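
    # Illustrative decisions made by _isBase64:
    #   attrsD has mode='base64'                        => 1 (explicit flag)
    #   contentparams type 'text/plain' (any 'text/*')  => 0
    #   contentparams type 'application/xhtml+xml'      => 0 ('+xml')
    #   contentparams type 'application/octet-stream'   => 1 (assumed binary)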
    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD
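
    # Illustrative normalization: any of url/uri/href collapses to 'href':
    #   self._itsAnHrefDamnIt({'url': 'http://example.com/x'})
    #     => {'href': 'http://example.com/x'}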
    def _save(self, key, value, overwrite=False):
        context = self._getContext()
        if overwrite:
            context[key] = value
        else:
            context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        # If we're here then this is an RSS feed.
        # If we don't have a version or have a version that starts with something
        # other than RSS then there's been a mistake. Correct it.
        if not self.version or not self.version.startswith('rss'):
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        context = self._getContext()
        if not self.inentry:
            context.setdefault('image', FeedParserDict())
        self.inimage = 1
        self.hasTitle = 0
        self.push('image', 0)

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
        self.intextinput = 1
        self.hasTitle = 0
        self.push('textinput', 0)
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
        # Append a new FeedParserDict when expecting an author
        context = self._getContext()
        context.setdefault('authors', [])
        context['authors'].append(FeedParserDict())
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inimage and self.feeddata.has_key('image'):
            context = self.feeddata['image']
        elif self.intextinput:
            context = self.feeddata['textinput']
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()
        context.setdefault('authors', [FeedParserDict()])
        context['authors'][-1][key] = value

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author, email = context.get(key), None
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
            if emailmatch:
                email = emailmatch.group(0)
                # probably a better way to do the following, but it passes all the tests
                author = author.replace(email, '')
                author = author.replace('()', '')
                author = author.replace('<>', '')
                author = author.replace('&lt;&gt;', '')
                author = author.strip()
                if author and (author[0] == '('):
                    author = author[1:]
                if author and (author[-1] == ')'):
                    author = author[:-1]
                author = author.strip()
            if author or email:
                context.setdefault('%s_detail' % key, FeedParserDict())
            if author:
                context['%s_detail' % key]['name'] = author
            if email:
                context['%s_detail' % key]['email'] = email
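
    # Illustrative round trips through _sync_author_detail (values hypothetical):
    #   context['author_detail'] = {'name': 'Jane', 'email': 'jane@example.com'}
    #     => context['author'] becomes 'Jane (jane@example.com)'
    #   context['author'] = 'Jane Doe (jane@example.com)', no author_detail yet
    #     => author_detail becomes {'name': 'Jane Doe', 'email': 'jane@example.com'}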
1295 def _start_subtitle(self, attrsD):
1296 self.pushContent('subtitle', attrsD, 'text/plain', 1)
1297 _start_tagline = _start_subtitle
1298 _start_itunes_subtitle = _start_subtitle
1300 def _end_subtitle(self):
1301 self.popContent('subtitle')
1302 _end_tagline = _end_subtitle
1303 _end_itunes_subtitle = _end_subtitle
1305 def _start_rights(self, attrsD):
1306 self.pushContent('rights', attrsD, 'text/plain', 1)
1307 _start_dc_rights = _start_rights
1308 _start_copyright = _start_rights
1310 def _end_rights(self):
1311 self.popContent('rights')
1312 _end_dc_rights = _end_rights
1313 _end_copyright = _end_rights
1315 def _start_item(self, attrsD):
1316 self.entries.append(FeedParserDict())
1317 self.push('item', 0)
1318 self.inentry = 1
1319 self.guidislink = 0
1320 self.hasTitle = 0
1321 id = self._getAttribute(attrsD, 'rdf:about')
1322 if id:
1323 context = self._getContext()
1324 context['id'] = id
1325 self._cdf_common(attrsD)
1326 _start_entry = _start_item
1327 _start_product = _start_item
1329 def _end_item(self):
1330 self.pop('item')
1331 self.inentry = 0
1332 _end_entry = _end_item
1334 def _start_dc_language(self, attrsD):
1335 self.push('language', 1)
1336 _start_language = _start_dc_language
1338 def _end_dc_language(self):
1339 self.lang = self.pop('language')
1340 _end_language = _end_dc_language
1342 def _start_dc_publisher(self, attrsD):
1343 self.push('publisher', 1)
1344 _start_webmaster = _start_dc_publisher
1346 def _end_dc_publisher(self):
1347 self.pop('publisher')
1348 self._sync_author_detail('publisher')
1349 _end_webmaster = _end_dc_publisher
1351 def _start_published(self, attrsD):
1352 self.push('published', 1)
1353 _start_dcterms_issued = _start_published
1354 _start_issued = _start_published
1356 def _end_published(self):
1357 value = self.pop('published')
1358 self._save('published_parsed', _parse_date(value), overwrite=True)
1359 _end_dcterms_issued = _end_published
1360 _end_issued = _end_published
1362 def _start_updated(self, attrsD):
1363 self.push('updated', 1)
1364 _start_modified = _start_updated
1365 _start_dcterms_modified = _start_updated
1366 _start_pubdate = _start_updated
1367 _start_dc_date = _start_updated
1368 _start_lastbuilddate = _start_updated
1370 def _end_updated(self):
1371 value = self.pop('updated')
1372 parsed_value = _parse_date(value)
1373 self._save('updated_parsed', parsed_value, overwrite=True)
1374 _end_modified = _end_updated
1375 _end_dcterms_modified = _end_updated
1376 _end_pubdate = _end_updated
1377 _end_dc_date = _end_updated
1378 _end_lastbuilddate = _end_updated
1380 def _start_created(self, attrsD):
1381 self.push('created', 1)
1382 _start_dcterms_created = _start_created
1384 def _end_created(self):
1385 value = self.pop('created')
1386 self._save('created_parsed', _parse_date(value), overwrite=True)
1387 _end_dcterms_created = _end_created
1389 def _start_expirationdate(self, attrsD):
1390 self.push('expired', 1)
1392 def _end_expirationdate(self):
1393 self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
1395 def _start_cc_license(self, attrsD):
1396 context = self._getContext()
1397 value = self._getAttribute(attrsD, 'rdf:resource')
1398 attrsD = FeedParserDict()
1399 attrsD['rel']='license'
1400 if value: attrsD['href']=value
1401 context.setdefault('links', []).append(attrsD)
1403 def _start_creativecommons_license(self, attrsD):
1404 self.push('license', 1)
1405 _start_creativeCommons_license = _start_creativecommons_license
1407 def _end_creativecommons_license(self):
1408 value = self.pop('license')
1409 context = self._getContext()
1410 attrsD = FeedParserDict()
1411 attrsD['rel']='license'
1412 if value: attrsD['href']=value
1413 context.setdefault('links', []).append(attrsD)
1414 del context['license']
1415 _end_creativeCommons_license = _end_creativecommons_license
1417 def _addXFN(self, relationships, href, name):
1418 context = self._getContext()
1419 xfn = context.setdefault('xfn', [])
1420 value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name})
1421 if value not in xfn:
1422 xfn.append(value)
1424 def _addTag(self, term, scheme, label):
1425 context = self._getContext()
1426 tags = context.setdefault('tags', [])
1427 if (not term) and (not scheme) and (not label): return
1428 value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1429 if value not in tags:
1430 tags.append(value)
1432 def _start_category(self, attrsD):
1433 if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
1434 term = attrsD.get('term')
1435 scheme = attrsD.get('scheme', attrsD.get('domain'))
1436 label = attrsD.get('label')
1437 self._addTag(term, scheme, label)
1438 self.push('category', 1)
1439 _start_dc_subject = _start_category
1440 _start_keywords = _start_category
1442 def _start_media_category(self, attrsD):
1443 attrsD.setdefault('scheme', 'http://search.yahoo.com/mrss/category_schema')
1444 self._start_category(attrsD)
1446 def _end_itunes_keywords(self):
1447 for term in self.pop('itunes_keywords').split():
1448 self._addTag(term, 'http://www.itunes.com/', None)
1450 def _start_itunes_category(self, attrsD):
1451 self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1452 self.push('category', 1)
1454 def _end_category(self):
1455 value = self.pop('category')
1456 if not value: return
1457 context = self._getContext()
1458 tags = context['tags']
1459 if value and len(tags) and not tags[-1]['term']:
1460 tags[-1]['term'] = value
1461 else:
1462 self._addTag(value, None, None)
1463 _end_dc_subject = _end_category
1464 _end_keywords = _end_category
1465 _end_itunes_category = _end_category
1466 _end_media_category = _end_category
1468 def _start_cloud(self, attrsD):
1469 self._getContext()['cloud'] = FeedParserDict(attrsD)
1471 def _start_link(self, attrsD):
1472 attrsD.setdefault('rel', 'alternate')
1473 if attrsD['rel'] == 'self':
1474 attrsD.setdefault('type', 'application/atom+xml')
1475 else:
1476 attrsD.setdefault('type', 'text/html')
1477 context = self._getContext()
1478 attrsD = self._itsAnHrefDamnIt(attrsD)
1479 if attrsD.has_key('href'):
1480 attrsD['href'] = self.resolveURI(attrsD['href'])
1481 expectingText = self.infeed or self.inentry or self.insource
1482 context.setdefault('links', [])
1483 if not (self.inentry and self.inimage):
1484 context['links'].append(FeedParserDict(attrsD))
1485 if attrsD.has_key('href'):
1486 expectingText = 0
1487 if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1488 context['link'] = attrsD['href']
1489 else:
1490 self.push('link', expectingText)
1491 _start_producturl = _start_link
1493 def _end_link(self):
1494 value = self.pop('link')
1495 context = self._getContext()
1496 _end_producturl = _end_link
1498 def _start_guid(self, attrsD):
1499 self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1500 self.push('id', 1)
1502 def _end_guid(self):
1503 value = self.pop('id')
1504 self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1505 if self.guidislink:
1506 # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1507 # and only if the item doesn't already have a link element
1508 self._save('link', value)
1510 def _start_title(self, attrsD):
1511 if self.svgOK: return self.unknown_starttag('title', attrsD.items())
1512 self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1513 _start_dc_title = _start_title
1514 _start_media_title = _start_title
1516 def _end_title(self):
1517 if self.svgOK: return
1518 value = self.popContent('title')
1519 if not value: return
1520 context = self._getContext()
1521 self.hasTitle = 1
1522 _end_dc_title = _end_title
1524 def _end_media_title(self):
1525 hasTitle = self.hasTitle
1526 self._end_title()
1527 self.hasTitle = hasTitle
1529 def _start_description(self, attrsD):
1530 context = self._getContext()
1531 if context.has_key('summary'):
1532 self._summaryKey = 'content'
1533 self._start_content(attrsD)
1534 else:
1535 self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1536 _start_dc_description = _start_description
1538 def _start_abstract(self, attrsD):
1539 self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1541 def _end_description(self):
1542 if self._summaryKey == 'content':
1543 self._end_content()
1544 else:
1545 value = self.popContent('description')
1546 self._summaryKey = None
1547 _end_abstract = _end_description
1548 _end_dc_description = _end_description
1550 def _start_info(self, attrsD):
1551 self.pushContent('info', attrsD, 'text/plain', 1)
1552 _start_feedburner_browserfriendly = _start_info
1554 def _end_info(self):
1555 self.popContent('info')
1556 _end_feedburner_browserfriendly = _end_info
1558 def _start_generator(self, attrsD):
1559 if attrsD:
1560 attrsD = self._itsAnHrefDamnIt(attrsD)
1561 if attrsD.has_key('href'):
1562 attrsD['href'] = self.resolveURI(attrsD['href'])
1563 self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1564 self.push('generator', 1)
1566 def _end_generator(self):
1567 value = self.pop('generator')
1568 context = self._getContext()
1569 if context.has_key('generator_detail'):
1570 context['generator_detail']['name'] = value
1572 def _start_admin_generatoragent(self, attrsD):
1573 self.push('generator', 1)
1574 value = self._getAttribute(attrsD, 'rdf:resource')
1575 if value:
1576 self.elementstack[-1][2].append(value)
1577 self.pop('generator')
1578 self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1580 def _start_admin_errorreportsto(self, attrsD):
1581 self.push('errorreportsto', 1)
1582 value = self._getAttribute(attrsD, 'rdf:resource')
1583 if value:
1584 self.elementstack[-1][2].append(value)
1585 self.pop('errorreportsto')
1587 def _start_summary(self, attrsD):
1588 context = self._getContext()
1589 if context.has_key('summary'):
1590 self._summaryKey = 'content'
1591 self._start_content(attrsD)
1592 else:
1593 self._summaryKey = 'summary'
1594 self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
1595 _start_itunes_summary = _start_summary
1597 def _end_summary(self):
1598 if self._summaryKey == 'content':
1599 self._end_content()
1600 else:
1601 self.popContent(self._summaryKey or 'summary')
1602 self._summaryKey = None
1603 _end_itunes_summary = _end_summary
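# Illustrative sketch of the switching above: in an item that carries both
#   <description>short blurb</description>
#   <itunes:summary>much longer text</itunes:summary>
# the first element fills the entry's summary; by the time the second one is
# seen the context already has a summary, so it is routed through
# _start_content/_end_content instead of overwriting the first value.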
1605 def _start_enclosure(self, attrsD):
1606 attrsD = self._itsAnHrefDamnIt(attrsD)
1607 context = self._getContext()
1608 attrsD['rel']='enclosure'
1609 context.setdefault('links', []).append(FeedParserDict(attrsD))
1611 def _start_source(self, attrsD):
1612 if 'url' in attrsD:
1613 # This means that we're processing a source element from an RSS 2.0 feed
1614 self.sourcedata['href'] = attrsD[u'url']
1615 self.push('source', 1)
1616 self.insource = 1
1617 self.hasTitle = 0
1619 def _end_source(self):
1620 self.insource = 0
1621 value = self.pop('source')
1622 if value:
1623 self.sourcedata['title'] = value
1624 self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1625 self.sourcedata.clear()
1627 def _start_content(self, attrsD):
1628 self.pushContent('content', attrsD, 'text/plain', 1)
1629 src = attrsD.get('src')
1630 if src:
1631 self.contentparams['src'] = src
1632 self.push('content', 1)
1634 def _start_prodlink(self, attrsD):
1635 self.pushContent('content', attrsD, 'text/html', 1)
1637 def _start_body(self, attrsD):
1638 self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
1639 _start_xhtml_body = _start_body
1641 def _start_content_encoded(self, attrsD):
1642 self.pushContent('content', attrsD, 'text/html', 1)
1643 _start_fullitem = _start_content_encoded
1645 def _end_content(self):
1646 copyToSummary = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
1647 value = self.popContent('content')
1648 if copyToSummary:
1649 self._save('summary', value)
1651 _end_body = _end_content
1652 _end_xhtml_body = _end_content
1653 _end_content_encoded = _end_content
1654 _end_fullitem = _end_content
1655 _end_prodlink = _end_content
1657 def _start_itunes_image(self, attrsD):
1658 self.push('itunes_image', 0)
1659 if attrsD.get('href'):
1660 self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1661 _start_itunes_link = _start_itunes_image
1663 def _end_itunes_block(self):
1664 value = self.pop('itunes_block', 0)
1665 self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1667 def _end_itunes_explicit(self):
1668 value = self.pop('itunes_explicit', 0)
1669 # Convert 'yes' -> True, 'clean' -> False, and any other value to None.
1670 # False and None both evaluate as False, so the difference can be ignored
1671 # by applications that only need to know if the content is explicit.
1672 self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
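# A minimal sketch of the index arithmetic above (illustrative): the tuple
# (None, False, True) is indexed by an expression that evaluates to
#   2 when value == 'yes'   -> True
#   1 when value == 'clean' -> False  (the True from the comparison counts as 1)
#   0 otherwise             -> None
# e.g. pop('itunes_explicit') == 'clean' yields (None, False, True)[1] == False.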
1674 def _start_media_content(self, attrsD):
1675 context = self._getContext()
1676 context.setdefault('media_content', [])
1677 context['media_content'].append(attrsD)
1679 def _start_media_thumbnail(self, attrsD):
1680 context = self._getContext()
1681 context.setdefault('media_thumbnail', [])
1682 self.push('url', 1) # popped again in _end_media_thumbnail below
1683 context['media_thumbnail'].append(attrsD)
1685 def _end_media_thumbnail(self):
1686 url = self.pop('url')
1687 context = self._getContext()
1688 if url is not None and url.strip():
1689 if not context['media_thumbnail'][-1].has_key('url'):
1690 context['media_thumbnail'][-1]['url'] = url
1692 def _start_media_player(self, attrsD):
1693 self.push('media_player', 0)
1694 self._getContext()['media_player'] = FeedParserDict(attrsD)
1696 def _end_media_player(self):
1697 value = self.pop('media_player')
1698 context = self._getContext()
1699 context['media_player']['content'] = value
1701 def _start_newlocation(self, attrsD):
1702 self.push('newlocation', 1)
1704 def _end_newlocation(self):
1705 url = self.pop('newlocation')
1706 context = self._getContext()
1707 # don't set newlocation if the context isn't right
1708 if context is not self.feeddata:
1709 return
1710 context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
1712 if _XML_AVAILABLE:
1713 class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
1714 def __init__(self, baseuri, baselang, encoding):
1715 if _debug: sys.stderr.write('trying StrictFeedParser\n')
1716 xml.sax.handler.ContentHandler.__init__(self)
1717 _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1718 self.bozo = 0
1719 self.exc = None
1720 self.decls = {}
1722 def startPrefixMapping(self, prefix, uri):
1723 self.trackNamespace(prefix, uri)
1724 if uri == 'http://www.w3.org/1999/xlink':
1725 self.decls['xmlns:'+prefix] = uri
1727 def startElementNS(self, name, qname, attrs):
1728 namespace, localname = name
1729 lowernamespace = str(namespace or '').lower()
1730 if lowernamespace.find('backend.userland.com/rss') != -1:
1731 # match any backend.userland.com namespace
1732 namespace = 'http://backend.userland.com/rss'
1733 lowernamespace = namespace
1734 if qname and qname.find(':') > 0:
1735 givenprefix = qname.split(':')[0]
1736 else:
1737 givenprefix = None
1738 prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1739 if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
1740 raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
1741 localname = str(localname).lower()
1743 # qname implementation is horribly broken in Python 2.1 (it
1744 # doesn't report any), and slightly broken in Python 2.2 (it
1745 # doesn't report the xml: namespace). So we match up namespaces
1746 # with a known list first, and then possibly override them with
1747 # the qnames the SAX parser gives us (if indeed it gives us any
1748 # at all). Thanks to MatejC for helping me test this and
1749 # tirelessly telling me that it didn't work yet.
1750 attrsD, self.decls = self.decls, {}
1751 if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
1752 attrsD['xmlns']=namespace
1753 if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
1754 attrsD['xmlns']=namespace
1756 if prefix:
1757 localname = prefix.lower() + ':' + localname
1758 elif namespace and not qname: #Expat
1759 for name,value in self.namespacesInUse.items():
1760 if name and value == namespace:
1761 localname = name + ':' + localname
1762 break
1763 if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
1765 for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
1766 lowernamespace = (namespace or '').lower()
1767 prefix = self._matchnamespaces.get(lowernamespace, '')
1768 if prefix:
1769 attrlocalname = prefix + ':' + attrlocalname
1770 attrsD[str(attrlocalname).lower()] = attrvalue
1771 for qname in attrs.getQNames():
1772 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
1773 self.unknown_starttag(localname, attrsD.items())
1775 def characters(self, text):
1776 self.handle_data(text)
1778 def endElementNS(self, name, qname):
1779 namespace, localname = name
1780 lowernamespace = str(namespace or '').lower()
1781 if qname and qname.find(':') > 0:
1782 givenprefix = qname.split(':')[0]
1783 else:
1784 givenprefix = ''
1785 prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1786 if prefix:
1787 localname = prefix + ':' + localname
1788 elif namespace and not qname: #Expat
1789 for name,value in self.namespacesInUse.items():
1790 if name and value == namespace:
1791 localname = name + ':' + localname
1792 break
1793 localname = str(localname).lower()
1794 self.unknown_endtag(localname)
1796 def error(self, exc):
1797 self.bozo = 1
1798 self.exc = exc
1800 def fatalError(self, exc):
1801 self.error(exc)
1802 raise exc
1804 class _BaseHTMLProcessor(sgmllib.SGMLParser):
1805 special = re.compile('''[<>'"]''')
1806 bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
1807 elements_no_end_tag = [
1808 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
1809 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
1810 'source', 'track', 'wbr'
1811 ]
1813 def __init__(self, encoding, _type):
1814 self.encoding = encoding
1815 self._type = _type
1816 if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
1817 sgmllib.SGMLParser.__init__(self)
1819 def reset(self):
1820 self.pieces = []
1821 sgmllib.SGMLParser.reset(self)
1823 def _shorttag_replace(self, match):
1824 tag = match.group(1)
1825 if tag in self.elements_no_end_tag:
1826 return '<' + tag + ' />'
1827 else:
1828 return '<' + tag + '></' + tag + '>'
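# Illustrative behavior of the substitution above (applied by feed() below
# via re.sub): '<br/>' becomes '<br />' because br is a void element, while
# '<div/>' expands to '<div></div>'.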
1830 def parse_starttag(self,i):
1831 j=sgmllib.SGMLParser.parse_starttag(self, i)
1832 if self._type == 'application/xhtml+xml':
1833 if j>2 and self.rawdata[j-2:j]=='/>':
1834 self.unknown_endtag(self.lasttag)
1835 return j
1837 def feed(self, data):
1838 data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
1839 #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
1840 data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
1841 data = data.replace('&#39;', "'")
1842 data = data.replace('&#34;', '"')
1843 try:
1844 bytes # raises NameError on Python < 2.6, where the 'bytes' builtin does not exist
1845 if bytes is str:
1846 raise NameError # Python 2.6/2.7: bytes is just an alias for str, carry on
1847 self.encoding = self.encoding + '_INVALID_PYTHON_3' # Python 3: this sgmllib path is unsupported
1848 except NameError:
1849 if self.encoding and type(data) == type(u''):
1850 data = data.encode(self.encoding)
1851 sgmllib.SGMLParser.feed(self, data)
1852 sgmllib.SGMLParser.close(self)
1854 def normalize_attrs(self, attrs):
1855 if not attrs: return attrs
1856 # utility method to be called by descendants
1857 attrs = dict([(k.lower(), v) for k, v in attrs]).items()
1858 attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1859 attrs.sort()
1860 return attrs
1862 def unknown_starttag(self, tag, attrs):
1863 # called for each start tag
1864 # attrs is a list of (attr, value) tuples
1865 # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
1866 if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
1867 uattrs = []
1868 strattrs=''
1869 if attrs:
1870 for key, value in attrs:
1871 value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
1872 value = self.bare_ampersand.sub("&amp;", value)
1873 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
1874 if type(value) != type(u''):
1875 try:
1876 value = unicode(value, self.encoding)
1877 except:
1878 value = unicode(value, 'iso-8859-1')
1879 try:
1880 # Currently, in Python 3 the key is already a str, and cannot be decoded again
1881 uattrs.append((unicode(key, self.encoding), value))
1882 except TypeError:
1883 uattrs.append((key, value))
1884 strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
1885 if self.encoding:
1886 try:
1887 strattrs=strattrs.encode(self.encoding)
1888 except:
1889 pass
1890 if tag in self.elements_no_end_tag:
1891 self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
1892 else:
1893 self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
1895 def unknown_endtag(self, tag):
1896 # called for each end tag, e.g. for </pre>, tag will be 'pre'
1897 # Reconstruct the original end tag.
1898 if tag not in self.elements_no_end_tag:
1899 self.pieces.append("</%(tag)s>" % locals())
1901 def handle_charref(self, ref):
1902 # called for each character reference, e.g. for '&#160;', ref will be '160'
1903 # Reconstruct the original character reference.
1904 if ref.startswith('x'):
1905 value = unichr(int(ref[1:],16))
1906 else:
1907 value = unichr(int(ref))
1909 if value in _cp1252:
1910 self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
1911 else:
1912 self.pieces.append('&#%(ref)s;' % locals())
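# Illustrative example: handle_charref('146') maps the Windows-1252
# right-single-quote codepoint through _cp1252 and appends '&#x2019;',
# while a plain reference like '160' passes through unchanged as '&#160;'.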
1914 def handle_entityref(self, ref):
1915 # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
1916 # Reconstruct the original entity reference.
1917 if name2codepoint.has_key(ref):
1918 self.pieces.append('&%(ref)s;' % locals())
1919 else:
1920 self.pieces.append('&amp;%(ref)s' % locals())
1922 def handle_data(self, text):
1923 # called for each block of plain text, i.e. outside of any tag and
1924 # not containing any character or entity references
1925 # Store the original text verbatim.
1926 if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
1927 self.pieces.append(text)
1929 def handle_comment(self, text):
1930 # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1931 # Reconstruct the original comment.
1932 self.pieces.append('<!--%(text)s-->' % locals())
1934 def handle_pi(self, text):
1935 # called for each processing instruction, e.g. <?instruction>
1936 # Reconstruct original processing instruction.
1937 self.pieces.append('<?%(text)s>' % locals())
1939 def handle_decl(self, text):
1940 # called for the DOCTYPE, if present, e.g.
1941 # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1942 # "http://www.w3.org/TR/html4/loose.dtd">
1943 # Reconstruct original DOCTYPE
1944 self.pieces.append('<!%(text)s>' % locals())
1946 _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
1947 def _scan_name(self, i, declstartpos):
1948 rawdata = self.rawdata
1949 n = len(rawdata)
1950 if i == n:
1951 return None, -1
1952 m = self._new_declname_match(rawdata, i)
1953 if m:
1954 s = m.group()
1955 name = s.strip()
1956 if (i + len(s)) == n:
1957 return None, -1 # end of buffer
1958 return name.lower(), m.end()
1959 else:
1960 self.handle_data(rawdata)
1961 # self.updatepos(declstartpos, i)
1962 return None, -1
1964 def convert_charref(self, name):
1965 return '&#%s;' % name
1967 def convert_entityref(self, name):
1968 return '&%s;' % name
1970 def output(self):
1971 '''Return processed HTML as a single string'''
1972 return ''.join([str(p) for p in self.pieces])
1974 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
1975 def __init__(self, baseuri, baselang, encoding, entities):
1976 sgmllib.SGMLParser.__init__(self)
1977 _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1978 _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
1979 self.entities=entities
1981 def decodeEntities(self, element, data):
1982 data = data.replace('&#60;', '&lt;')
1983 data = data.replace('&#x3c;', '&lt;')
1984 data = data.replace('&#x3C;', '&lt;')
1985 data = data.replace('&#62;', '&gt;')
1986 data = data.replace('&#x3e;', '&gt;')
1987 data = data.replace('&#x3E;', '&gt;')
1988 data = data.replace('&#38;', '&amp;')
1989 data = data.replace('&#x26;', '&amp;')
1990 data = data.replace('&#34;', '&quot;')
1991 data = data.replace('&#x22;', '&quot;')
1992 data = data.replace('&#39;', '&apos;')
1993 data = data.replace('&#x27;', '&apos;')
1994 if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
1995 data = data.replace('&lt;', '<')
1996 data = data.replace('&gt;', '>')
1997 data = data.replace('&amp;', '&')
1998 data = data.replace('&quot;', '"')
1999 data = data.replace('&apos;', "'")
2000 return data
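# Illustrative example of the two-step decoding above: with
# contentparams['type'] == 'text/html', the input '&#60;b&#62;bold&#60;/b&#62;'
# is first normalized to '&lt;b&gt;bold&lt;/b&gt;' and then, because the type
# is not XML, unescaped to '<b>bold</b>'; for an XML content type the
# normalized form is returned unchanged.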
2002 def strattrs(self, attrs):
2003 return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
2005 class _MicroformatsParser:
2006 STRING = 1
2007 DATE = 2
2008 URI = 3
2009 NODE = 4
2010 EMAIL = 5
2012 known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me']
2013 known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv']
2015 def __init__(self, data, baseuri, encoding):
2016 self.document = BeautifulSoup.BeautifulSoup(data)
2017 self.baseuri = baseuri
2018 self.encoding = encoding
2019 if type(data) == type(u''):
2020 data = data.encode(encoding)
2021 self.tags = []
2022 self.enclosures = []
2023 self.xfn = []
2024 self.vcard = None
2026 def vcardEscape(self, s):
2027 if type(s) in (type(''), type(u'')):
2028 s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
2029 return s
2031 def vcardFold(self, s):
2032 s = re.sub(';+$', '', s)
2033 sFolded = ''
2034 iMax = 75
2035 sPrefix = ''
2036 while len(s) > iMax:
2037 sFolded += sPrefix + s[:iMax] + '\n'
2038 s = s[iMax:]
2039 sPrefix = ' '
2040 iMax = 74
2041 sFolded += sPrefix + s
2042 return sFolded
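# Illustrative example: vcardFold implements RFC 2426 line folding -- a
# 200-character 'NOTE:...' line comes back as its first 75 characters, a
# newline, then continuation lines of at most 74 characters, each prefixed
# with a single space.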
2044 def normalize(self, s):
2045 return re.sub(r'\s+', ' ', s).strip()
2047 def unique(self, aList):
2048 results = []
2049 for element in aList:
2050 if element not in results:
2051 results.append(element)
2052 return results
2054 def toISO8601(self, dt):
2055 return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt)
2057 def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0):
2058 all = lambda x: 1
2059 sProperty = sProperty.lower()
2060 bFound = 0
2061 bNormalize = 1
2062 propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)}
2063 if bAllowMultiple and (iPropertyType != self.NODE):
2064 snapResults = []
2065 containers = elmRoot(['ul', 'ol'], propertyMatch)
2066 for container in containers:
2067 snapResults.extend(container('li'))
2068 bFound = (len(snapResults) != 0)
2069 if not bFound:
2070 snapResults = elmRoot(all, propertyMatch)
2071 bFound = (len(snapResults) != 0)
2072 if (not bFound) and (sProperty == 'value'):
2073 snapResults = elmRoot('pre')
2074 bFound = (len(snapResults) != 0)
2075 bNormalize = not bFound
2076 if not bFound:
2077 snapResults = [elmRoot]
2078 bFound = (len(snapResults) != 0)
2079 arFilter = []
2080 if sProperty == 'vcard':
2081 snapFilter = elmRoot(all, propertyMatch)
2082 for node in snapFilter:
2083 if node.findParent(all, propertyMatch):
2084 arFilter.append(node)
2085 arResults = []
2086 for node in snapResults:
2087 if node not in arFilter:
2088 arResults.append(node)
2089 bFound = (len(arResults) != 0)
2090 if not bFound:
2091 if bAllowMultiple: return []
2092 elif iPropertyType == self.STRING: return ''
2093 elif iPropertyType == self.DATE: return None
2094 elif iPropertyType == self.URI: return ''
2095 elif iPropertyType == self.NODE: return None
2096 else: return None
2097 arValues = []
2098 for elmResult in arResults:
2099 sValue = None
2100 if iPropertyType == self.NODE:
2101 if bAllowMultiple:
2102 arValues.append(elmResult)
2103 continue
2104 else:
2105 return elmResult
2106 sNodeName = elmResult.name.lower()
2107 if (iPropertyType == self.EMAIL) and (sNodeName == 'a'):
2108 sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0]
2109 if sValue:
2110 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2111 if (not sValue) and (sNodeName == 'abbr'):
2112 sValue = elmResult.get('title')
2113 if sValue:
2114 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2115 if (not sValue) and (iPropertyType == self.URI):
2116 if sNodeName == 'a': sValue = elmResult.get('href')
2117 elif sNodeName == 'img': sValue = elmResult.get('src')
2118 elif sNodeName == 'object': sValue = elmResult.get('data')
2119 if sValue:
2120 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2121 if (not sValue) and (sNodeName == 'img'):
2122 sValue = elmResult.get('alt')
2123 if sValue:
2124 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2125 if not sValue:
2126 sValue = elmResult.renderContents()
2127 sValue = re.sub(r'<\S[^>]*>', '', sValue)
2128 sValue = sValue.replace('\r\n', '\n')
2129 sValue = sValue.replace('\r', '\n')
2130 if sValue:
2131 sValue = bNormalize and self.normalize(sValue) or sValue.strip()
2132 if not sValue: continue
2133 if iPropertyType == self.DATE:
2134 sValue = _parse_date_iso8601(sValue)
2135 if bAllowMultiple:
2136 arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue)
2137 else:
2138 return bAutoEscape and self.vcardEscape(sValue) or sValue
2139 return arValues
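# Usage sketch (illustrative, hypothetical markup): given soup for
#   <div class="vcard"><span class="fn">Jane Doe</span></div>
# getPropertyValue(elmCard, 'fn', self.STRING, bAutoEscape=1) returns
# 'Jane Doe' -- the matching node's text, whitespace-normalized and
# vCard-escaped.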
2141 def findVCards(self, elmRoot, bAgentParsing=0):
2142 sVCards = ''
2144 if not bAgentParsing:
2145 arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1)
2146 else:
2147 arCards = [elmRoot]
2149 for elmCard in arCards:
2150 arLines = []
2152 def processSingleString(sProperty):
2153 sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding)
2154 if sValue:
2155 arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue))
2156 return sValue or u''
2158 def processSingleURI(sProperty):
2159 sValue = self.getPropertyValue(elmCard, sProperty, self.URI)
2160 if sValue:
2161 sContentType = ''
2162 sEncoding = ''
2163 sValueKey = ''
2164 if sValue.startswith('data:'):
2165 sEncoding = ';ENCODING=b'
2166 sContentType = sValue.split(';')[0].split('/').pop()
2167 sValue = sValue.split(',', 1).pop()
2168 else:
2169 elmValue = self.getPropertyValue(elmCard, sProperty)
2170 if elmValue:
2171 if sProperty != 'url':
2172 sValueKey = ';VALUE=uri'
2173 sContentType = elmValue.get('type', '').strip().split('/').pop().strip()
2174 sContentType = sContentType.upper()
2175 if sContentType == 'OCTET-STREAM':
2176 sContentType = ''
2177 if sContentType:
2178 sContentType = ';TYPE=' + sContentType.upper()
2179 arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue))
2181 def processTypeValue(sProperty, arDefaultType, arForceType=None):
2182 arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1)
2183 for elmResult in arResults:
2184 arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1)
2185 if arForceType:
2186 arType = self.unique(arForceType + arType)
2187 if not arType:
2188 arType = arDefaultType
2189 sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0)
2190 if sValue:
2191 arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue))
2193 # AGENT
2194 # must do this before all other properties because it is destructive
2195 # (removes nested class="vcard" nodes so they don't interfere with
2196 # this vcard's other properties)
2197 arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1)
2198 for elmAgent in arAgent:
2199 if re.compile(r'\bvcard\b').search(elmAgent.get('class')):
2200 sAgentValue = self.findVCards(elmAgent, 1) + '\n'
2201 sAgentValue = sAgentValue.replace('\n', '\\n')
2202 sAgentValue = sAgentValue.replace(';', '\\;')
2203 if sAgentValue:
2204 arLines.append(self.vcardFold('AGENT:' + sAgentValue))
2205 # Completely remove the agent element from the parse tree
2206 elmAgent.extract()
2207 else:
2208 sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1)
2209 if sAgentValue:
2210 arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue))
2212 # FN (full name)
2213 sFN = processSingleString('fn')
2215 # N (name)
2216 elmName = self.getPropertyValue(elmCard, 'n')
2217 if elmName:
2218 sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1)
2219 sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1)
2220 arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1)
2221 arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1)
2222 arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1)
2223 arLines.append(self.vcardFold('N:' + sFamilyName + ';' +
2224 sGivenName + ';' +
2225 ','.join(arAdditionalNames) + ';' +
2226 ','.join(arHonorificPrefixes) + ';' +
2227 ','.join(arHonorificSuffixes)))
2228 elif sFN:
2229 # implied "N" optimization
2230 # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization
2231 arNames = self.normalize(sFN).split()
2232 if len(arNames) == 2:
2233 bFamilyNameFirst = (arNames[0].endswith(',') or
2234 len(arNames[1]) == 1 or
2235 ((len(arNames[1]) == 2) and (arNames[1].endswith('.'))))
2236 if bFamilyNameFirst:
2237 arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1]))
2238 else:
2239 arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0]))
2241 # SORT-STRING
2242 sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1)
2243 if sSortString:
2244 arLines.append(self.vcardFold('SORT-STRING:' + sSortString))
2246 # NICKNAME
2247 arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1)
2248 if arNickname:
2249 arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname)))
2251 # PHOTO
2252 processSingleURI('photo')
2254 # BDAY
2255 dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE)
2256 if dtBday:
2257 arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday)))
2259 # ADR (address)
2260 arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1)
2261 for elmAdr in arAdr:
2262 arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1)
2263 if not arType:
2264 arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1
2265 sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1)
2266 sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1)
2267 sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1)
2268 sLocality = self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1)
2269 sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1)
2270 sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1)
2271 sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1)
2272 arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' +
2273 sPostOfficeBox + ';' +
2274 sExtendedAddress + ';' +
2275 sStreetAddress + ';' +
2276 sLocality + ';' +
2277 sRegion + ';' +
2278 sPostalCode + ';' +
2279 sCountryName))
2281 # LABEL
2282 processTypeValue('label', ['intl','postal','parcel','work'])
2284 # TEL (phone number)
2285 processTypeValue('tel', ['voice'])
2287 # EMAIL
2288 processTypeValue('email', ['internet'], ['internet'])
2290 # MAILER
2291 processSingleString('mailer')
2293 # TZ (timezone)
2294 processSingleString('tz')
2296 # GEO (geographical information)
2297 elmGeo = self.getPropertyValue(elmCard, 'geo')
2298 if elmGeo:
2299 sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1)
2300 sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1)
2301 arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude))
2303 # TITLE
2304 processSingleString('title')
2306 # ROLE
2307 processSingleString('role')
2309 # LOGO
2310 processSingleURI('logo')
2312 # ORG (organization)
2313 elmOrg = self.getPropertyValue(elmCard, 'org')
2314 if elmOrg:
2315 sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1)
2316 if not sOrganizationName:
2317 # implied "organization-name" optimization
2318 # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization
2319 sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1)
2320 if sOrganizationName:
2321 arLines.append(self.vcardFold('ORG:' + sOrganizationName))
2322 else:
2323 arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1)
2324 arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit)))
2326 # CATEGORY
2327 arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1)
2328 if arCategory:
2329 arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory)))
2331 # NOTE
2332 processSingleString('note')
2334 # REV
2335 processSingleString('rev')
2337 # SOUND
2338 processSingleURI('sound')
2340 # UID
2341 processSingleString('uid')
2343 # URL
2344 processSingleURI('url')
2346 # CLASS
2347 processSingleString('class')
2349 # KEY
2350 processSingleURI('key')
2352 if arLines:
2353 arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard']
2354 sVCards += u'\n'.join(arLines) + u'\n'
2356 return sVCards.strip()
2358 def isProbablyDownloadable(self, elm):
2359 attrsD = elm.attrMap
2360 if not attrsD.has_key('href'): return 0
2361 linktype = attrsD.get('type', '').strip()
2362 if linktype.startswith('audio/') or \
2363 linktype.startswith('video/') or \
2364 (linktype.startswith('application/') and not linktype.endswith('xml')):
2365 return 1
2366 path = urlparse.urlparse(attrsD['href'])[2]
2367 if path.find('.') == -1: return 0
2368 fileext = path.split('.').pop().lower()
2369 return fileext in self.known_binary_extensions
2371 def findTags(self):
2372 all = lambda x: 1
2373 for elm in self.document(all, {'rel': re.compile(r'\btag\b')}):
2374 href = elm.get('href')
2375 if not href: continue
2376 urlscheme, domain, path, params, query, fragment = \
2377 urlparse.urlparse(_urljoin(self.baseuri, href))
2378 segments = path.split('/')
2379 tag = segments.pop()
2380 if not tag:
2381 tag = segments.pop()
2382 tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', '', ''))
2383 if not tagscheme.endswith('/'):
2384 tagscheme += '/'
2385 self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''}))
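# Illustrative example: for a rel-tag link such as
#   <a rel="tag" href="http://example.com/tags/python">Python</a>
# this records term='python', scheme='http://example.com/tags/', and
# label='Python'.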
2387 def findEnclosures(self):
2388 all = lambda x: 1
2389 enclosure_match = re.compile(r'\benclosure\b')
2390 for elm in self.document(all, {'href': re.compile(r'.+')}):
2391 if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue
2392 if elm.attrMap not in self.enclosures:
2393 self.enclosures.append(elm.attrMap)
2394 if elm.string and not elm.get('title'):
2395 self.enclosures[-1]['title'] = elm.string
2397 def findXFN(self):
2398 all = lambda x: 1
2399 for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}):
2400 rels = elm.get('rel', '').split()
2401 xfn_rels = []
2402 for rel in rels:
2403 if rel in self.known_xfn_relationships:
2404 xfn_rels.append(rel)
2405 if xfn_rels:
2406 self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string})
2408 def _parseMicroformats(htmlSource, baseURI, encoding):
2409 if not BeautifulSoup: return
2410 if _debug: sys.stderr.write('entering _parseMicroformats\n')
2411 try:
2412 p = _MicroformatsParser(htmlSource, baseURI, encoding)
2413 except UnicodeEncodeError:
2414 # sgmllib throws this exception when performing lookups of tags
2415 # with non-ASCII characters in them.
2416 return
2417 p.vcard = p.findVCards(p.document)
2418 p.findTags()
2419 p.findEnclosures()
2420 p.findXFN()
2421 return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard}
2423 class _RelativeURIResolver(_BaseHTMLProcessor):
2424 relative_uris = [('a', 'href'),
2425 ('applet', 'codebase'),
2426 ('area', 'href'),
2427 ('blockquote', 'cite'),
2428 ('body', 'background'),
2429 ('del', 'cite'),
2430 ('form', 'action'),
2431 ('frame', 'longdesc'),
2432 ('frame', 'src'),
2433 ('iframe', 'longdesc'),
2434 ('iframe', 'src'),
2435 ('head', 'profile'),
2436 ('img', 'longdesc'),
2437 ('img', 'src'),
2438 ('img', 'usemap'),
2439 ('input', 'src'),
2440 ('input', 'usemap'),
2441 ('ins', 'cite'),
2442 ('link', 'href'),
2443 ('object', 'classid'),
2444 ('object', 'codebase'),
2445 ('object', 'data'),
2446 ('object', 'usemap'),
2447 ('q', 'cite'),
2448 ('script', 'src')]
2450 def __init__(self, baseuri, encoding, _type):
2451 _BaseHTMLProcessor.__init__(self, encoding, _type)
2452 self.baseuri = baseuri
2454 def resolveURI(self, uri):
2455 return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip()))
2457 def unknown_starttag(self, tag, attrs):
2458 if _debug:
2459 sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs)))
2460 attrs = self.normalize_attrs(attrs)
2461 attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
2462 _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
2464 def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
2465 if _debug:
2466 sys.stderr.write('entering _resolveRelativeURIs\n')
2468 p = _RelativeURIResolver(baseURI, encoding, _type)
2469 p.feed(htmlSource)
2470 return p.output()
2472 def _makeSafeAbsoluteURI(base, rel=None):
2473 # bail if ACCEPTABLE_URI_SCHEMES is empty
2474 if not ACCEPTABLE_URI_SCHEMES:
2475 return _urljoin(base, rel or u'')
2476 if not base:
2477 return rel or u''
2478 if not rel:
2479 if base.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
2480 return u''
2481 return base
2482 uri = _urljoin(base, rel)
2483 if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
2484 return u''
2485 return uri
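# Illustrative examples (assuming the default ACCEPTABLE_URI_SCHEMES, which
# does not include 'javascript'):
#   _makeSafeAbsoluteURI('http://example.com/', '/feed') == 'http://example.com/feed'
#   _makeSafeAbsoluteURI('http://example.com/', 'javascript:alert(1)') == u''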
2487 class _HTMLSanitizer(_BaseHTMLProcessor):
2488 acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article',
2489 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
2490 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command',
2491 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir',
2492 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer',
2493 'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
2494 'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
2495 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
2496 'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
2497 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub',
2498 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead',
2499 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']
2501 acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
2502 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
2503 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
2504 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
2505 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
2506 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
2507 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
2508 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
2509 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
2510 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
2511 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
2512 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
2513 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
2514 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
2515 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
2516 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
2517 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
2518 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
2519 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
2520 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
2521 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
2522 'xml:lang']
2524 unacceptable_elements_with_end_tag = ['script', 'applet', 'style']
2526 acceptable_css_properties = ['azimuth', 'background-color',
2527 'border-bottom-color', 'border-collapse', 'border-color',
2528 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
2529 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
2530 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
2531 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
2532 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
2533 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
2534 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
2535 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
2536 'white-space', 'width']
2538 # survey of common keywords found in feeds
2539 acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
2540 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
2541 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
2542 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
2543 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
2544 'transparent', 'underline', 'white', 'yellow']
2546 valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
2547 '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
2549 mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
2550 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
2551 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
2552 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
2553 'munderover', 'none', 'semantics']
2555 mathml_attributes = ['actiontype', 'align', 'columnalign', 'close',
2556 'columnlines', 'columnspacing', 'columnspan', 'depth', 'display',
2557 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', 'fence',
2558 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
2559 'mathbackground', 'mathcolor', 'mathvariant', 'maxsize', 'minsize',
2560 'open', 'other', 'rowalign', 'rowlines', 'rowspacing', 'rowspan',
2561 'rspace', 'scriptlevel', 'selection', 'separator', 'separators',
2562 'stretchy', 'width', 'xlink:href', 'xlink:show', 'xlink:type', 'xmlns',
2563 'xmlns:xlink']
2565 # svgtiny - foreignObject + linearGradient + radialGradient + stop
2566 svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
2567 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
2568 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
2569 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
2570 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
2571 'svg', 'switch', 'text', 'title', 'tspan', 'use']
2573 # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
2574 svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
2575 'arabic-form', 'ascent', 'attributeName', 'attributeType',
2576 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
2577 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
2578 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
2579 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
2580 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
2581 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
2582 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
2583 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
2584 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
2585 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
2586 'overline-position', 'overline-thickness', 'panose-1', 'path',
2587 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
2588 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
2589 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
2590 'stop-color', 'stop-opacity', 'strikethrough-position',
2591 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
2592 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
2593 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
2594 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
2595 'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
2596 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
2597 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
2598 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
2599 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
2600 'y2', 'zoomAndPan']
2602 svg_attr_map = None
2603 svg_elem_map = None
2605 acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
2606 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
2607 'stroke-opacity']
2609 def reset(self):
2610 _BaseHTMLProcessor.reset(self)
2611 self.unacceptablestack = 0
2612 self.mathmlOK = 0
2613 self.svgOK = 0
2615 def unknown_starttag(self, tag, attrs):
2616 acceptable_attributes = self.acceptable_attributes
2617 keymap = {}
2618 if not tag in self.acceptable_elements or self.svgOK:
2619 if tag in self.unacceptable_elements_with_end_tag:
2620 self.unacceptablestack += 1
2622 # add implicit namespaces to html5 inline svg/mathml
2623 if self._type.endswith('html'):
2624 if not dict(attrs).get('xmlns'):
2625 if tag=='svg':
2626 attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
2627 if tag=='math':
2628 attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
2630 # not otherwise acceptable, perhaps it is MathML or SVG?
2631 if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
2632 self.mathmlOK += 1
2633 if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
2634 self.svgOK += 1
2636 # choose acceptable attributes based on tag class, else bail
2637 if self.mathmlOK and tag in self.mathml_elements:
2638 acceptable_attributes = self.mathml_attributes
2639 elif self.svgOK and tag in self.svg_elements:
2640 # for most vocabularies, lowercasing is a good idea; many SVG
2641 # elements and attributes, however, are camelCase
2642 if not self.svg_attr_map:
2643 lower=[attr.lower() for attr in self.svg_attributes]
2644 mix=[a for a in self.svg_attributes if a not in lower]
2645 self.svg_attributes = lower
2646 self.svg_attr_map = dict([(a.lower(),a) for a in mix])
2648 lower=[attr.lower() for attr in self.svg_elements]
2649 mix=[a for a in self.svg_elements if a not in lower]
2650 self.svg_elements = lower
2651 self.svg_elem_map = dict([(a.lower(),a) for a in mix])
2652 acceptable_attributes = self.svg_attributes
2653 tag = self.svg_elem_map.get(tag,tag)
2654 keymap = self.svg_attr_map
2655 elif not tag in self.acceptable_elements:
2656 return
2658 # declare xlink namespace, if needed
2659 if self.mathmlOK or self.svgOK:
2660 if filter(lambda (n,v): n.startswith('xlink:'),attrs):
2661 if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
2662 attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
2664 clean_attrs = []
2665 for key, value in self.normalize_attrs(attrs):
2666 if key in acceptable_attributes:
2667 key=keymap.get(key,key)
2668 clean_attrs.append((key,value))
2669 elif key=='style':
2670 clean_value = self.sanitize_style(value)
2671 if clean_value: clean_attrs.append((key,clean_value))
2672 _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
2674 def unknown_endtag(self, tag):
2675 if not tag in self.acceptable_elements:
2676 if tag in self.unacceptable_elements_with_end_tag:
2677 self.unacceptablestack -= 1
2678 if self.mathmlOK and tag in self.mathml_elements:
2679 if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1
2680 elif self.svgOK and tag in self.svg_elements:
2681 tag = self.svg_elem_map.get(tag,tag)
2682 if tag == 'svg' and self.svgOK: self.svgOK -= 1
2683 else:
2684 return
2685 _BaseHTMLProcessor.unknown_endtag(self, tag)
2687 def handle_pi(self, text):
2688 pass
2690 def handle_decl(self, text):
2691 pass
2693 def handle_data(self, text):
2694 if not self.unacceptablestack:
2695 _BaseHTMLProcessor.handle_data(self, text)
2697 def sanitize_style(self, style):
2698 # disallow urls
2699 style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
2701 # gauntlet
2702 if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
2703 # This replaced a regexp that used re.match and was prone to pathological back-tracking.
2704 if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return ''
2706 clean = []
2707 for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
2708 if not value: continue
2709 if prop.lower() in self.acceptable_css_properties:
2710 clean.append(prop + ': ' + value + ';')
2711 elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
2712 for keyword in value.split():
2713 if not keyword in self.acceptable_css_keywords and \
2714 not self.valid_css_values.match(keyword):
2715 break
2716 else:
2717 clean.append(prop + ': ' + value + ';')
2718 elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
2719 clean.append(prop + ': ' + value + ';')
2721 return ' '.join(clean)
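# Illustrative example:
#   sanitize_style('color: red; background: url(http://evil/x.png); foo: bar')
# returns 'color: red;' -- the url() is stripped first, 'foo' is not an
# acceptable CSS property, and the emptied background declaration is dropped.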
2724 def _sanitizeHTML(htmlSource, encoding, _type):
2725 p = _HTMLSanitizer(encoding, _type)
2726 htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
2727 p.feed(htmlSource)
2728 data = p.output()
2729 if TIDY_MARKUP:
2730 # loop through list of preferred Tidy interfaces looking for one that's installed,
2731 # then set up a common _tidy function to wrap the interface-specific API.
2732 _tidy = None
2733 for tidy_interface in PREFERRED_TIDY_INTERFACES:
2734 try:
2735 if tidy_interface == "uTidy":
2736 from tidy import parseString as _utidy
2737 def _tidy(data, **kwargs):
2738 return str(_utidy(data, **kwargs))
2739 break
2740 elif tidy_interface == "mxTidy":
2741 from mx.Tidy import Tidy as _mxtidy
2742 def _tidy(data, **kwargs):
2743 nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
2744 return data
2745 break
2746 except:
2747 pass
2748 if _tidy:
2749 utf8 = type(data) == type(u'') # 'utf8' really tracks whether data was unicode and must round-trip through UTF-8
2750 if utf8:
2751 data = data.encode('utf-8')
2752 data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
2753 if utf8:
2754 data = unicode(data, 'utf-8')
2755 if data.count('<body'):
2756 data = data.split('<body', 1)[1]
2757 if data.count('>'):
2758 data = data.split('>', 1)[1]
2759 if data.count('</body'):
2760 data = data.split('</body', 1)[0]
2761 data = data.strip().replace('\r\n', '\n')
2762 return data
2764 class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
2765 def http_error_default(self, req, fp, code, msg, headers):
2766 if ((code / 100) == 3) and (code != 304):
2767 return self.http_error_302(req, fp, code, msg, headers)
2768 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
2769 infourl.status = code
2770 return infourl
2772 def http_error_302(self, req, fp, code, msg, headers):
2773 if headers.dict.has_key('location'):
2774 infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
2775 else:
2776 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
2777 if not hasattr(infourl, 'status'):
2778 infourl.status = code
2779 return infourl
2781 def http_error_301(self, req, fp, code, msg, headers):
2782 if headers.dict.has_key('location'):
2783 infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
2784 else:
2785 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
2786 if not hasattr(infourl, 'status'):
2787 infourl.status = code
2788 return infourl
2790 http_error_300 = http_error_302
2791 http_error_303 = http_error_302
2792 http_error_307 = http_error_302
2794 def http_error_401(self, req, fp, code, msg, headers):
2795 # Check if
2796 # - server requires digest auth, AND
2797 # - we tried (unsuccessfully) with basic auth, AND
2798 # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
2799 # If all conditions hold, parse authentication information
2800 # out of the Authorization header we sent the first time
2801 # (for the username and password) and the WWW-Authenticate
2802 # header the server sent back (for the realm) and retry
2803 # the request with the appropriate digest auth headers instead.
2804 # This evil genius hack has been brought to you by Aaron Swartz.
2805 host = urlparse.urlparse(req.get_full_url())[1]
2806 try:
2807 assert sys.version.split()[0] >= '2.3.3'
2808 assert base64 != None
2809 user, passw = _base64decode(req.headers['Authorization'].split(' ')[1]).split(':')
2810 realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
2811 self.add_password(realm, host, user, passw)
2812 retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
2813 self.reset_retry_count()
2814 return retry
2815 except:
2816 return self.http_error_default(req, fp, code, msg, headers)
2818 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
2819 """URL, filename, or string --> stream
2821 This function lets you define parsers that take any input source
2822 (URL, pathname to local or network file, or actual data as a string)
2823 and deal with it in a uniform manner. Returned object is guaranteed
2824 to have all the basic stdio read methods (read, readline, readlines).
2825 Just .close() the object when you're done with it.
2827 If the etag argument is supplied, it will be used as the value of an
2828 If-None-Match request header.
2830 If the modified argument is supplied, it can be a tuple of 9 integers
2831 (as returned by gmtime() in the standard Python time module) or a date
2832 string in any format supported by feedparser. Regardless, it MUST
2833 be in GMT (Greenwich Mean Time). It will be reformatted into an
2834 RFC 1123-compliant date and used as the value of an If-Modified-Since
2835 request header.
2837 If the agent argument is supplied, it will be used as the value of a
2838 User-Agent request header.
2840 If the referrer argument is supplied, it will be used as the value of a
2841 Referer[sic] request header.
2843 If handlers is supplied, it is a list of handlers used to build a
2844 urllib2 opener.
2846 If request_headers is supplied, it is a dictionary of HTTP request headers
2847 that will override the values generated by FeedParser.
2848 """
2850 if hasattr(url_file_stream_or_string, 'read'):
2851 return url_file_stream_or_string
2853 if url_file_stream_or_string == '-':
2854 return sys.stdin
2856 if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
2857 # Deal with the feed URI scheme
2858 if url_file_stream_or_string.startswith('feed:http'):
2859 url_file_stream_or_string = url_file_stream_or_string[5:]
2860 elif url_file_stream_or_string.startswith('feed:'):
2861 url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
2862 if not agent:
2863 agent = USER_AGENT
2864 # test for inline user:password for basic auth
2865 auth = None
2866 if base64:
2867 urltype, rest = urllib.splittype(url_file_stream_or_string)
2868 realhost, rest = urllib.splithost(rest)
2869 if realhost:
2870 user_passwd, realhost = urllib.splituser(realhost)
2871 if user_passwd:
2872 url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
2873 auth = base64.standard_b64encode(user_passwd).strip()
2875 # iri support
2876 try:
2877 if isinstance(url_file_stream_or_string,unicode):
2878 url_file_stream_or_string = url_file_stream_or_string.encode('idna').decode('utf-8')
2879 else:
2880 url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna').decode('utf-8')
2881 except:
2882 pass
2884 # try to open with urllib2 (to use optional headers)
2885 request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
2886 opener = urllib2.build_opener(*(handlers + [_FeedURLHandler()]))
2887 opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
2888 try:
2889 return opener.open(request)
2890 finally:
2891 opener.close() # JohnD
2893 # try to open with native open function (if url_file_stream_or_string is a filename)
2894 try:
2895 return open(url_file_stream_or_string, 'rb')
2896 except:
2897 pass
2899 # treat url_file_stream_or_string as string
2900 return _StringIO(str(url_file_stream_or_string))
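# Usage sketch (illustrative): each of these returns a readable stream --
#   _open_resource('http://example.com/feed.xml', None, None, USER_AGENT, None, [], {})
#   _open_resource('/tmp/feed.xml', None, None, None, None, [], {})
#   _open_resource('<rss version="2.0">...</rss>', None, None, None, None, [], {})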
2902 def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
2903 request = urllib2.Request(url)
2904 request.add_header('User-Agent', agent)
2905 if etag:
2906 request.add_header('If-None-Match', etag)
2907 if type(modified) == type(''):
2908 modified = _parse_date(modified)
2909 elif isinstance(modified, datetime.datetime):
2910 modified = modified.utctimetuple()
2911 if modified:
2912 # format into an RFC 1123-compliant timestamp. We can't use
2913 # time.strftime() since the %a and %b directives can be affected
2914 # by the current locale, but RFC 2616 states that dates must be
2915 # in English.
2916 short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
2917 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
2918 request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
2919 if referrer:
2920 request.add_header('Referer', referrer)
2921 if gzip and zlib:
2922 request.add_header('Accept-encoding', 'gzip, deflate')
2923 elif gzip:
2924 request.add_header('Accept-encoding', 'gzip')
2925 elif zlib:
2926 request.add_header('Accept-encoding', 'deflate')
2927 else:
2928 request.add_header('Accept-encoding', '')
2929 if auth:
2930 request.add_header('Authorization', 'Basic %s' % auth)
2931 if ACCEPT_HEADER:
2932 request.add_header('Accept', ACCEPT_HEADER)
2933 # use this for whatever -- cookies, special headers, etc
2934 # [('Cookie','Something'),('x-special-header','Another Value')]
2935 for header_name, header_value in request_headers.items():
2936 request.add_header(header_name, header_value)
2937 request.add_header('A-IM', 'feed') # RFC 3229 support
2938 return request
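# Illustrative example of the locale-independent date formatting above:
#   modified = (2003, 1, 5, 10, 14, 55, 6, 5, 0)   # time.struct_time-style tuple
# yields the header 'If-Modified-Since: Sun, 05 Jan 2003 10:14:55 GMT'
# (short_weekdays is indexed by tm_wday, where Monday == 0).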
2940 _date_handlers = []
2941 def registerDateHandler(func):
2942 '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
2943 _date_handlers.insert(0, func)
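# Usage sketch (illustrative, hypothetical handler): add support for a
# 'YYYY/MM/DD' format --
#   def _parse_date_slashes(dateString):
#       m = re.match(r'(\d{4})/(\d{2})/(\d{2})$', dateString)
#       if not m: return None
#       return time.strptime('%s-%s-%s' % m.groups(), '%Y-%m-%d')
#   registerDateHandler(_parse_date_slashes)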
2945 # ISO-8601 date parsing routines written by Fazal Majid.
2946 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2947 # parser is beyond the scope of feedparser and would be a worthwhile addition
2948 # to the Python library.
2949 # A single regular expression cannot parse ISO 8601 date formats into groups
2950 # as the standard is highly irregular (for instance, is 030104 the date
2951 # 2003-01-04 or 0301-04-01?), so we use templates instead.
2952 # Please note the order in templates is significant because we need a
2953 # greedy match.
2954 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
2955 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
2956 '-YY-?MM', '-OOO', '-YY',
2957 '--MM-?DD', '--MM',
2958 '---DD',
2959 'CC', '']
2960 _iso8601_re = [
2961 tmpl.replace(
2962 'YYYY', r'(?P<year>\d{4})').replace(
2963 'YY', r'(?P<year>\d\d)').replace(
2964 'MM', r'(?P<month>[01]\d)').replace(
2965 'DD', r'(?P<day>[0123]\d)').replace(
2966 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
2967 'CC', r'(?P<century>\d\d$)')
2968 + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
2969 + r'(:(?P<second>\d{2}))?'
2970 + r'(\.(?P<fracsecond>\d+))?'
2971 + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
2972 for tmpl in _iso8601_tmpl]
2973 try:
2974 del tmpl
2975 except NameError:
2976 pass
2977 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
2978 try:
2979 del regex
2980 except NameError:
2981 pass
2982 def _parse_date_iso8601(dateString):
2983 '''Parse a variety of ISO-8601-compatible formats like 20040105'''
2984 m = None
2985 for _iso8601_match in _iso8601_matches:
2986 m = _iso8601_match(dateString)
2987 if m: break
2988 if not m: return
2989 if m.span() == (0, 0): return
2990 params = m.groupdict()
2991 ordinal = params.get('ordinal', 0)
2992 if ordinal:
2993 ordinal = int(ordinal)
2994 else:
2995 ordinal = 0
2996 year = params.get('year', '--')
2997 if not year or year == '--':
2998 year = time.gmtime()[0]
2999 elif len(year) == 2:
3000 # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
3001 year = 100 * int(time.gmtime()[0] / 100) + int(year)
3002 else:
3003 year = int(year)
3004 month = params.get('month', '-')
3005 if not month or month == '-':
3006 # ordinals are NOT normalized by mktime, we simulate them
3007 # by setting month=1, day=ordinal
3008 if ordinal:
3009 month = 1
3010 else:
3011 month = time.gmtime()[1]
3012 month = int(month)
3013 day = params.get('day', 0)
3014 if not day:
3015 # see above
3016 if ordinal:
3017 day = ordinal
3018 elif params.get('century', 0) or \
3019 params.get('year', 0) or params.get('month', 0):
3020 day = 1
3021 else:
3022 day = time.gmtime()[2]
3023 else:
3024 day = int(day)
3025 # special case of the century - is the first year of the 21st century
3026 # 2000 or 2001? The debate goes on...
3027 if 'century' in params.keys():
3028 year = (int(params['century']) - 1) * 100 + 1
3029 # in ISO 8601 most fields are optional
3030 for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
3031 if not params.get(field, None):
3032 params[field] = 0
3033 hour = int(params.get('hour', 0))
3034 minute = int(params.get('minute', 0))
3035 second = int(float(params.get('second', 0)))
3036 # weekday is normalized by mktime(), we can ignore it
3037 weekday = 0
3038 daylight_savings_flag = -1
3039 tm = [year, month, day, hour, minute, second, weekday,
3040 ordinal, daylight_savings_flag]
3041 # ISO 8601 time zone adjustments
3042 tz = params.get('tz')
3043 if tz and tz != 'Z':
3044 if tz[0] == '-':
3045 tm[3] += int(params.get('tzhour', 0))
3046 tm[4] += int(params.get('tzmin', 0))
3047 elif tz[0] == '+':
3048 tm[3] -= int(params.get('tzhour', 0))
3049 tm[4] -= int(params.get('tzmin', 0))
3050 else:
3051 return None
3052 # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
3053 # which is guaranteed to normalize d/m/y/h/m/s.
3054 # Many implementations have bugs, but we'll pretend they don't.
3055 return time.localtime(time.mktime(tuple(tm)))
3056 registerDateHandler(_parse_date_iso8601)
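# For instance, '20040105', '2004-01-05' and '2003-12-31T10:14:55Z' each
# match one of the templates above.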
3058 # 8-bit date handling routines written by ytrewq1.
3059 _korean_year = u'\ub144' # b3e2 in euc-kr
3060 _korean_month = u'\uc6d4' # bff9 in euc-kr
3061 _korean_day = u'\uc77c' # c0cf in euc-kr
3062 _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
3063 _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
3065 _korean_onblog_date_re = \
3066 re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
3067 (_korean_year, _korean_month, _korean_day))
3068 _korean_nate_date_re = \
3069 re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
3070 (_korean_am, _korean_pm))
3071 def _parse_date_onblog(dateString):
3072 '''Parse a string according to the OnBlog 8-bit date format'''
3073 m = _korean_onblog_date_re.match(dateString)
3074 if not m: return
3075 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3076 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3077 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
3078 'zonediff': '+09:00'}
3079 if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
3080 return _parse_date_w3dtf(w3dtfdate)
3081 registerDateHandler(_parse_date_onblog)
3083 def _parse_date_nate(dateString):
3084 '''Parse a string according to the Nate 8-bit date format'''
3085 m = _korean_nate_date_re.match(dateString)
3086 if not m: return
3087 hour = int(m.group(5))
3088 ampm = m.group(4)
3089 if (ampm == _korean_pm):
3090 hour += 12
3091 hour = str(hour)
3092 if len(hour) == 1:
3093 hour = '0' + hour
3094 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3095 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3096 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
3097 'zonediff': '+09:00'}
3098 if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
3099 return _parse_date_w3dtf(w3dtfdate)
3100 registerDateHandler(_parse_date_nate)
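# For instance, strings shaped like u'2004\ub144 05\uc6d4 28\uc77c 01:31:15'
# (OnBlog) or u'2004-05-25 \uc624\ud6c4 11:23:17' (Nate) match the patterns
# above; both handlers rewrite the date as W3DTF with a fixed +09:00 (KST)
# offset and delegate to _parse_date_w3dtf.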
3102 _mssql_date_re = \
3103 re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
3104 def _parse_date_mssql(dateString):
3105 '''Parse a string according to the MS SQL date format'''
3106 m = _mssql_date_re.match(dateString)
3107 if not m: return
3108 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3109 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3110 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
3111 'zonediff': '+09:00'}
3112 if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
3113 return _parse_date_w3dtf(w3dtfdate)
3114 registerDateHandler(_parse_date_mssql)
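# For instance, '2004-07-08 23:56:58.7' matches the pattern above. Note that,
# like the Korean handlers, this handler assumes a fixed +09:00 offset.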
3116 # Unicode strings for Greek date strings
3117 _greek_months = \
3118 {
3119 u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
3120 u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
3121 u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
3122 u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
3123 u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
3124 u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
3125 u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
3126 u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
3127 u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
3128 u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
3129 u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
3130 u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
3131 u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
3132 u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
3133 u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
3134 u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
3135 u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
3136 u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
3137 u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
3138 }
3140 _greek_wdays = \
3141 {
3142 u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
3143 u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
3144 u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
3145 u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
3146 u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
3147 u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
3148 u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
3149 }
3151 _greek_date_format_re = \
3152 re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
3154 def _parse_date_greek(dateString):
3155 '''Parse a string according to a Greek 8-bit date format.'''
3156 m = _greek_date_format_re.match(dateString)
3157 if not m: return
3158 try:
3159 wday = _greek_wdays[m.group(1)]
3160 month = _greek_months[m.group(3)]
3161 except:
3162 return
3163 rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
3164 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
3165 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
3166 'zonediff': m.group(8)}
3167 if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
3168 return _parse_date_rfc822(rfc822date)
3169 registerDateHandler(_parse_date_greek)
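# For instance, u'\u039a\u03c5\u03c1, 11 \u0399\u03b1\u03bd 2004 12:01:00 EET'
# is rewritten as 'Sun, 11 Jan 2004 12:01:00 EET' and handed to the
# RFC 822 parser.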
3171 # Unicode strings for Hungarian date strings
3172 _hungarian_months = \
3173 {
3174 u'janu\u00e1r': u'01', # e1 in iso-8859-2
3175 u'febru\u00e1r': u'02', # e1 in iso-8859-2
3176 u'm\u00e1rcius': u'03', # e1 in iso-8859-2
3177 u'\u00e1prilis': u'04', # e1 in iso-8859-2
3178 u'm\u00e1jus': u'05', # e1 in iso-8859-2
3179 u'j\u00fanius': u'06', # fa in iso-8859-2
3180 u'j\u00falius': u'07', # fa in iso-8859-2
3181 u'augusztus': u'08',
3182 u'szeptember': u'09',
3183 u'okt\u00f3ber': u'10', # f3 in iso-8859-2
3184 u'november': u'11',
3185 u'december': u'12',
3186 }
3188 _hungarian_date_format_re = \
3189 re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
3191 def _parse_date_hungarian(dateString):
3192 '''Parse a string according to a Hungarian 8-bit date format.'''
3193 m = _hungarian_date_format_re.match(dateString)
3194 if not m: return
3195 try:
3196 month = _hungarian_months[m.group(2)]
3197 day = m.group(3)
3198 if len(day) == 1:
3199 day = '0' + day
3200 hour = m.group(4)
3201 if len(hour) == 1:
3202 hour = '0' + hour
3203 except:
3204 return
3205 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
3206 {'year': m.group(1), 'month': month, 'day': day,\
3207 'hour': hour, 'minute': m.group(5),\
3208 'zonediff': m.group(6)}
3209 if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
3210 return _parse_date_w3dtf(w3dtfdate)
3211 registerDateHandler(_parse_date_hungarian)
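# For instance, u'2004-december-03T10:15+01:00' matches the pattern above
# and is rewritten as the W3DTF date '2004-12-03T10:15+01:00'.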
3213 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
3214 # Drake and licensed under the Python license. Removed all range checking
3215 # for month, day, hour, minute, and second, since mktime will normalize
3216 # these later
3217 def _parse_date_w3dtf(dateString):
3218 def __extract_date(m):
3219 year = int(m.group('year'))
3220 if year < 100:
3221 year = 100 * int(time.gmtime()[0] / 100) + int(year)
3222 if year < 1000:
3223 return 0, 0, 0
3224 julian = m.group('julian')
3225 if julian:
3226 julian = int(julian)
3227 month = julian / 30 + 1
3228 day = julian % 30 + 1
3229 jday = None
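# successive approximation: guess a month/day for the ordinal date,
# ask gmtime() which day-of-year that guess lands on, and nudge the
# guess until the two agree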
3230 while jday != julian:
3231 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
3232 jday = time.gmtime(t)[-2]
3233 diff = abs(jday - julian)
3234 if jday > julian:
3235 if diff < day:
3236 day = day - diff
3237 else:
3238 month = month - 1
3239 day = 31
3240 elif jday < julian:
3241 if day + diff < 28:
3242 day = day + diff
3243 else:
3244 month = month + 1
3245 return year, month, day
3246 month = m.group('month')
3247 day = 1
3248 if month is None:
3249 month = 1
3250 else:
3251 month = int(month)
3252 day = m.group('day')
3253 if day:
3254 day = int(day)
3255 else:
3256 day = 1
3257 return year, month, day
3259 def __extract_time(m):
3260 if not m:
3261 return 0, 0, 0
3262 hours = m.group('hours')
3263 if not hours:
3264 return 0, 0, 0
3265 hours = int(hours)
3266 minutes = int(m.group('minutes'))
3267 seconds = m.group('seconds')
3268 if seconds:
3269 seconds = int(seconds)
3270 else:
3271 seconds = 0
3272 return hours, minutes, seconds
3274 def __extract_tzd(m):
3275 '''Return the Time Zone Designator as an offset in seconds from UTC.'''
3276 if not m:
3277 return 0
3278 tzd = m.group('tzd')
3279 if not tzd:
3280 return 0
3281 if tzd == 'Z':
3282 return 0
3283 hours = int(m.group('tzdhours'))
3284 minutes = m.group('tzdminutes')
3285 if minutes:
3286 minutes = int(minutes)
3287 else:
3288 minutes = 0
3289 offset = (hours*60 + minutes) * 60
3290 if tzd[0] == '+':
3291 return -offset
3292 return offset
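# Sign convention: the value returned is the number of seconds to ADD to
# the parsed clock time to recover UTC, so '-05:00' yields +18000 and
# 'Z' yields 0 (see the mktime() arithmetic at the bottom of this function).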
3294 __date_re = ('(?P<year>\d\d\d\d)'
3295 '(?:(?P<dsep>-|)'
3296 '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
3297 '|(?P<julian>\d\d\d)))?')
3298 __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
3299 __tzd_rx = re.compile(__tzd_re)
3300 __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
3301 '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
3302 + __tzd_re)
3303 __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
3304 __datetime_rx = re.compile(__datetime_re)
3305 m = __datetime_rx.match(dateString)
3306 if (m is None) or (m.group() != dateString): return
3307 gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
3308 if gmt[0] == 0: return
3309 return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
3310 registerDateHandler(_parse_date_w3dtf)
3312 def _parse_date_rfc822(dateString):
3313 '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
3314 data = dateString.split()
3315 if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
3316 del data[0]
3317 if len(data) == 4:
3318 s = data[3]
3319 i = s.find('+')
3320 if i > 0:
3321 data[3:] = [s[:i], s[i+1:]]
3322 else:
3323 data.append('')
3324 dateString = " ".join(data)
3325 # Account for the Etc/GMT timezone by stripping 'Etc/'
3326 elif len(data) == 5 and data[4].lower().startswith('etc/'):
3327 data[4] = data[4][4:]
3328 dateString = " ".join(data)
3329 if len(data) < 5:
3330 dateString += ' 00:00:00 GMT'
3331 tm = rfc822.parsedate_tz(dateString)
3332 if tm:
3333 return time.gmtime(rfc822.mktime_tz(tm))
3334 # rfc822.py defines several time zones, but we define some extra ones.
3335 # 'ET' is equivalent to 'EST', etc.
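# The offsets use parsedate_tz's hour/minute convention rather than
# seconds: -500 means UTC-05:00.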
3336 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
3337 rfc822._timezones.update(_additional_timezones)
3338 registerDateHandler(_parse_date_rfc822)
3340 def _parse_date_perforce(aDateString):
3341 """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
3342 # Fri, 2006/09/15 08:19:53 EDT
3343 _my_date_pattern = re.compile( \
3344 r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
3345 m = _my_date_pattern.search(aDateString)
3346 if not m: return
3347 dow, year, month, day, hour, minute, second, tz = m.groups()
3348 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
3349 dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
3350 tm = rfc822.parsedate_tz(dateString)
3351 if tm:
3352 return time.gmtime(rfc822.mktime_tz(tm))
3353 registerDateHandler(_parse_date_perforce)
3355 def _parse_date(dateString):
3356 '''Parses a variety of date formats into a 9-tuple in GMT'''
3357 for handler in _date_handlers:
3358 try:
3359 date9tuple = handler(dateString)
3360 if not date9tuple: continue
3361 if len(date9tuple) != 9:
3362 if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
3363 raise ValueError
3364 map(int, date9tuple)
3365 return date9tuple
3366 except Exception, e:
3367 if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
3368 pass
3369 return None
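# For instance, _parse_date('Thu, 01 Jan 2004 19:48:21 GMT') and
# _parse_date('2004-01-01T19:48:21Z') should both come back as the 9-tuple
# (2004, 1, 1, 19, 48, 21, 3, 1, 0).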
3371 def _getCharacterEncoding(http_headers, xml_data):
3372 '''Get the character encoding of the XML document
3374 http_headers is a dictionary
3375 xml_data is a raw string (not Unicode)
3377 This is so much trickier than it sounds, it's not even funny.
3378 According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
3379 is application/xml, application/*+xml,
3380 application/xml-external-parsed-entity, or application/xml-dtd,
3381 the encoding given in the charset parameter of the HTTP Content-Type
3382 takes precedence over the encoding given in the XML prefix within the
3383 document, and defaults to 'utf-8' if neither are specified. But, if
3384 the HTTP Content-Type is text/xml, text/*+xml, or
3385 text/xml-external-parsed-entity, the encoding given in the XML prefix
3386 within the document is ALWAYS IGNORED and only the encoding given in
3387 the charset parameter of the HTTP Content-Type header should be
3388 respected, and it defaults to 'us-ascii' if not specified.
3390 Furthermore, discussion on the atom-syntax mailing list with the
3391 author of RFC 3023 leads me to the conclusion that any document
3392 served with a Content-Type of text/* and no charset parameter
3393 must be treated as us-ascii. (We now do this.) And also that it
3394 must always be flagged as non-well-formed. (We now do this too.)
3396 If Content-Type is unspecified (input was local file or non-HTTP source)
3397 or unrecognized (server just got it totally wrong), then go by the
3398 encoding given in the XML prefix of the document and default to
3399 'iso-8859-1' as per the HTTP specification (RFC 2616).
3401 Then, assuming we didn't find a character encoding in the HTTP headers
3402 (and the HTTP Content-type allowed us to look in the body), we need
3403 to sniff the first few bytes of the XML data and try to determine
3404 whether the encoding is ASCII-compatible. Section F of the XML
3405 specification shows the way here:
3406 http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3408 If the sniffed encoding is not ASCII-compatible, we need to make it
3409 ASCII compatible so that we can sniff further into the XML declaration
3410 to find the encoding attribute, which will tell us the true encoding.
3412 Of course, none of this guarantees that we will be able to parse the
3413 feed in the declared character encoding (assuming it was declared
3414 correctly, which many are not). CJKCodecs and iconv_codec help a lot;
3415 you should definitely install them if you can.
3416 http://cjkpython.i18n.org/
3417 '''
3419 def _parseHTTPContentType(content_type):
3420 '''takes HTTP Content-Type header and returns (content type, charset)
3422 If no charset is specified, returns (content type, '')
3423 If no content type is specified, returns ('', '')
3424 Both return parameters are guaranteed to be lowercase strings
3425 '''
3426 content_type = content_type or ''
3427 content_type, params = cgi.parse_header(content_type)
3428 return content_type, params.get('charset', '').replace("'", '')
3430 sniffed_xml_encoding = ''
3431 xml_encoding = ''
3432 true_encoding = ''
3433 http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type')))
3434 # Must sniff for non-ASCII-compatible character encodings before
3435 # searching for XML declaration. This heuristic is defined in
3436 # section F of the XML specification:
3437 # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3438 try:
3439 if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
3440 # EBCDIC
3441 xml_data = _ebcdic_to_ascii(xml_data)
3442 elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
3443 # UTF-16BE
3444 sniffed_xml_encoding = 'utf-16be'
3445 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
3446 elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
3447 # UTF-16BE with BOM
3448 sniffed_xml_encoding = 'utf-16be'
3449 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
3450 elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
3451 # UTF-16LE
3452 sniffed_xml_encoding = 'utf-16le'
3453 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
3454 elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
3455 # UTF-16LE with BOM
3456 sniffed_xml_encoding = 'utf-16le'
3457 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
3458 elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
3459 # UTF-32BE
3460 sniffed_xml_encoding = 'utf-32be'
3461 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
3462 elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
3463 # UTF-32LE
3464 sniffed_xml_encoding = 'utf-32le'
3465 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
3466 elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
3467 # UTF-32BE with BOM
3468 sniffed_xml_encoding = 'utf-32be'
3469 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
3470 elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
3471 # UTF-32LE with BOM
3472 sniffed_xml_encoding = 'utf-32le'
3473 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
3474 elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
3475 # UTF-8 with BOM
3476 sniffed_xml_encoding = 'utf-8'
3477 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
3478 else:
3479 # ASCII-compatible
3480 pass
3481 xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
3482 except:
3483 xml_encoding_match = None
3484 if xml_encoding_match:
3485 xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
3486 if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
3487 xml_encoding = sniffed_xml_encoding
3488 acceptable_content_type = 0
3489 application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
3490 text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
3491 if (http_content_type in application_content_types) or \
3492 (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
3493 acceptable_content_type = 1
3494 true_encoding = http_encoding or xml_encoding or 'utf-8'
3495 elif (http_content_type in text_content_types) or \
3496 (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
3497 acceptable_content_type = 1
3498 true_encoding = http_encoding or 'us-ascii'
3499 elif http_content_type.startswith('text/'):
3500 true_encoding = http_encoding or 'us-ascii'
3501 elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))):
3502 true_encoding = xml_encoding or 'iso-8859-1'
3503 else:
3504 true_encoding = xml_encoding or 'utf-8'
3505 # some feeds claim to be gb2312 but are actually gb18030.
3506 # apparently MSIE and Firefox both do the following switch:
3507 if true_encoding.lower() == 'gb2312':
3508 true_encoding = 'gb18030'
3509 return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
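# A sketch of how the precedence rules above play out:
#     Content-Type: application/atom+xml; charset=iso-8859-1 -> 'iso-8859-1'
#     Content-Type: text/xml (no charset parameter)          -> 'us-ascii'
#     no HTTP headers, <?xml ... encoding='koi8-r'?>         -> 'koi8-r'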
3511 def _toUTF8(data, encoding):
3512 '''Changes an XML data stream on the fly to specify a new encoding
3514 data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
3515 encoding is a string recognized by encodings.aliases
3516 '''
3517 if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
3518 # strip Byte Order Mark (if present)
3519 if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
3520 if _debug:
3521 sys.stderr.write('stripping BOM\n')
3522 if encoding != 'utf-16be':
3523 sys.stderr.write('trying utf-16be instead\n')
3524 encoding = 'utf-16be'
3525 data = data[2:]
3526 elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
3527 if _debug:
3528 sys.stderr.write('stripping BOM\n')
3529 if encoding != 'utf-16le':
3530 sys.stderr.write('trying utf-16le instead\n')
3531 encoding = 'utf-16le'
3532 data = data[2:]
3533 elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
3534 if _debug:
3535 sys.stderr.write('stripping BOM\n')
3536 if encoding != 'utf-8':
3537 sys.stderr.write('trying utf-8 instead\n')
3538 encoding = 'utf-8'
3539 data = data[3:]
3540 elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
3541 if _debug:
3542 sys.stderr.write('stripping BOM\n')
3543 if encoding != 'utf-32be':
3544 sys.stderr.write('trying utf-32be instead\n')
3545 encoding = 'utf-32be'
3546 data = data[4:]
3547 elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
3548 if _debug:
3549 sys.stderr.write('stripping BOM\n')
3550 if encoding != 'utf-32le':
3551 sys.stderr.write('trying utf-32le instead\n')
3552 encoding = 'utf-32le'
3553 data = data[4:]
3554 newdata = unicode(data, encoding)
3555 if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
3556 declmatch = re.compile('^<\?xml[^>]*?>')
3557 newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
3558 if declmatch.search(newdata):
3559 newdata = declmatch.sub(newdecl, newdata)
3560 else:
3561 newdata = newdecl + u'\n' + newdata
3562 return newdata.encode('utf-8')
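# A sketch: given bytes that really are iso-8859-1, _toUTF8(data, 'iso-8859-1')
# returns the document re-encoded as UTF-8 with its <?xml ...?> declaration
# rewritten (or prepended) to say encoding='utf-8'.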
3564 def _stripDoctype(data):
3565 '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
3567 rss_version may be 'rss091n' or None
3568 stripped_data is the same XML document, minus the DOCTYPE
3569 '''
3570 start = re.search(_s2bytes('<\w'), data)
3571 start = start and start.start() or -1
3572 head,data = data[:start+1], data[start+1:]
3574 entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
3575 entity_results=entity_pattern.findall(head)
3576 head = entity_pattern.sub(_s2bytes(''), head)
3577 doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
3578 doctype_results = doctype_pattern.findall(head)
3579 doctype = doctype_results and doctype_results[0] or _s2bytes('')
3580 if doctype.lower().count(_s2bytes('netscape')):
3581 version = 'rss091n'
3582 else:
3583 version = None
3585 # only allow 'safe' inline entity definitions
3586 replacement=_s2bytes('')
3587 if len(doctype_results)==1 and entity_results:
3588 safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
3589 safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
3590 if safe_entities:
3591 replacement=_s2bytes('<!DOCTYPE feed [\n <!ENTITY') + _s2bytes('>\n <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
3592 data = doctype_pattern.sub(replacement, head) + data
3594 return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
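# A sketch of the entity handling above: given a document that starts with
#     <!DOCTYPE feed [
#     <!ENTITY copyright "(C) 2008">
#     ]>
# the DOCTYPE is replaced by a minimal one that re-declares only the 'safe'
# entity, and the returned dict maps u'copyright' to u'(C) 2008'.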
3596 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], request_headers={}, response_headers={}):
3597 '''Parse a feed from a URL, file, stream, or string.
3599 request_headers, if given, is a dict from http header name to value to add
3600 to the request; this overrides internally generated values.
3601 '''
3602 result = FeedParserDict()
3603 result['feed'] = FeedParserDict()
3604 result['entries'] = []
3605 if _XML_AVAILABLE:
3606 result['bozo'] = 0
3607 if not isinstance(handlers, list):
3608 handlers = [handlers]
3609 try:
3610 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
3611 data = f.read()
3612 except Exception, e:
3613 result['bozo'] = 1
3614 result['bozo_exception'] = e
3615 data = None
3616 f = None
3618 if hasattr(f, 'headers'):
3619 result['headers'] = dict(f.headers)
3620 # overwrite existing headers using response_headers
3621 if 'headers' in result:
3622 result['headers'].update(response_headers)
3623 elif response_headers:
3624 result['headers'] = copy.deepcopy(response_headers)
3626 # if feed is gzip-compressed, decompress it
3627 if f and data and 'headers' in result:
3628 if gzip and result['headers'].get('content-encoding') == 'gzip':
3629 try:
3630 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
3631 except Exception, e:
3632 # Some feeds claim to be gzipped but they're not, so
3633 # we get garbage. Ideally, we should re-request the
3634 # feed without the 'Accept-encoding: gzip' header,
3635 # but we don't.
3636 result['bozo'] = 1
3637 result['bozo_exception'] = e
3638 data = ''
3639 elif zlib and result['headers'].get('content-encoding') == 'deflate':
3640 try:
3641 data = zlib.decompress(data, -zlib.MAX_WBITS)
3642 except Exception, e:
3643 result['bozo'] = 1
3644 result['bozo_exception'] = e
3645 data = ''
3647 # save HTTP headers
3648 if 'headers' in result:
3649 if 'etag' in result['headers'] or 'ETag' in result['headers']:
3650 etag = result['headers'].get('etag', result['headers'].get('ETag'))
3651 if etag:
3652 result['etag'] = etag
3653 if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
3654 modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
3655 if modified:
3656 result['modified'] = _parse_date(modified)
3657 if hasattr(f, 'url'):
3658 result['href'] = f.url
3659 result['status'] = 200
3660 if hasattr(f, 'status'):
3661 result['status'] = f.status
3662 if hasattr(f, 'close'):
3663 f.close()
3665 # there are four encodings to keep track of:
3666 # - http_encoding is the encoding declared in the Content-Type HTTP header
3667 # - xml_encoding is the encoding declared in the <?xml declaration
3668 # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
3669 # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
3670 http_headers = result.get('headers', {})
3671 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
3672 _getCharacterEncoding(http_headers, data)
3673 if http_headers and (not acceptable_content_type):
3674 if http_headers.has_key('content-type') or http_headers.has_key('Content-type'):
3675 bozo_message = '%s is not an XML media type' % http_headers.get('content-type', http_headers.get('Content-type'))
3676 else:
3677 bozo_message = 'no Content-type specified'
3678 result['bozo'] = 1
3679 result['bozo_exception'] = NonXMLContentType(bozo_message)
3681 if data is not None:
3682 result['version'], data, entities = _stripDoctype(data)
3684 # ensure that baseuri is an absolute uri using an acceptable URI scheme
3685 contentloc = http_headers.get('content-location', http_headers.get('Content-Location', ''))
3686 href = result.get('href', '')
3687 baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
3689 baselang = http_headers.get('content-language', http_headers.get('Content-Language', None))
3691 # if server sent 304, we're done
3692 if result.get('status', 0) == 304:
3693 result['version'] = ''
3694 result['debug_message'] = 'The feed has not changed since you last checked, ' + \
3695 'so the server sent no data. This is a feature, not a bug!'
3696 return result
3698 # if there was a problem downloading, we're done
3699 if data is None:
3700 return result
3702 # determine character encoding
3703 use_strict_parser = 0
3704 known_encoding = 0
3705 tried_encodings = []
3706 # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
3707 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
3708 if not proposed_encoding: continue
3709 if proposed_encoding in tried_encodings: continue
3710 tried_encodings.append(proposed_encoding)
3711 try:
3712 data = _toUTF8(data, proposed_encoding)
3713 known_encoding = use_strict_parser = 1
3714 break
3715 except:
3716 pass
3717 # if no luck and we have auto-detection library, try that
3718 if (not known_encoding) and chardet:
3719 try:
3720 proposed_encoding = chardet.detect(data)['encoding']
3721 if proposed_encoding and (proposed_encoding not in tried_encodings):
3722 tried_encodings.append(proposed_encoding)
3723 data = _toUTF8(data, proposed_encoding)
3724 known_encoding = use_strict_parser = 1
3725 except:
3726 pass
3727 # if still no luck and we haven't tried utf-8 yet, try that
3728 if (not known_encoding) and ('utf-8' not in tried_encodings):
3729 try:
3730 proposed_encoding = 'utf-8'
3731 tried_encodings.append(proposed_encoding)
3732 data = _toUTF8(data, proposed_encoding)
3733 known_encoding = use_strict_parser = 1
3734 except:
3735 pass
3736 # if still no luck and we haven't tried windows-1252 yet, try that
3737 if (not known_encoding) and ('windows-1252' not in tried_encodings):
3738 try:
3739 proposed_encoding = 'windows-1252'
3740 tried_encodings.append(proposed_encoding)
3741 data = _toUTF8(data, proposed_encoding)
3742 known_encoding = use_strict_parser = 1
3743 except:
3744 pass
3745 # if still no luck and we haven't tried iso-8859-2 yet, try that.
3746 if (not known_encoding) and ('iso-8859-2' not in tried_encodings):
3747 try:
3748 proposed_encoding = 'iso-8859-2'
3749 tried_encodings.append(proposed_encoding)
3750 data = _toUTF8(data, proposed_encoding)
3751 known_encoding = use_strict_parser = 1
3752 except:
3753 pass
3754 # if still no luck, give up
3755 if not known_encoding:
3756 result['bozo'] = 1
3757 result['bozo_exception'] = CharacterEncodingUnknown( \
3758 'document encoding unknown, I tried ' + \
3759 '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
3760 (result['encoding'], xml_encoding))
3761 result['encoding'] = ''
3762 elif proposed_encoding != result['encoding']:
3763 result['bozo'] = 1
3764 result['bozo_exception'] = CharacterEncodingOverride( \
3765 'document declared as %s, but parsed as %s' % \
3766 (result['encoding'], proposed_encoding))
3767 result['encoding'] = proposed_encoding
3769 if not _XML_AVAILABLE:
3770 use_strict_parser = 0
3771 if use_strict_parser:
3772 # initialize the SAX parser
3773 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
3774 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
3775 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
3776 saxparser.setContentHandler(feedparser)
3777 saxparser.setErrorHandler(feedparser)
3778 source = xml.sax.xmlreader.InputSource()
3779 source.setByteStream(_StringIO(data))
3780 if hasattr(saxparser, '_ns_stack'):
3781 # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
3782 # PyXML doesn't have this problem, and it doesn't have _ns_stack either
3783 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
3784 try:
3785 saxparser.parse(source)
3786 except Exception, e:
3787 if _debug:
3788 import traceback
3789 traceback.print_stack()
3790 traceback.print_exc()
3791 sys.stderr.write('xml parsing failed\n')
3792 result['bozo'] = 1
3793 result['bozo_exception'] = feedparser.exc or e
3794 use_strict_parser = 0
3795 if not use_strict_parser:
3796 feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
3797 feedparser.feed(data.decode('utf-8', 'replace'))
3798 result['feed'] = feedparser.feeddata
3799 result['entries'] = feedparser.entries
3800 result['version'] = result['version'] or feedparser.version
3801 result['namespaces'] = feedparser.namespacesInUse
3802 return result
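# Typical usage (a sketch; attribute access works because the results are
# FeedParserDict objects):
#     d = parse('http://feedparser.org/docs/examples/atom10.xml')
#     d.feed.title        # feed-level metadata
#     d.entries[0].title  # first entry
#     d.bozo              # 1 if the feed was not well-formed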
3804 class Serializer:
3805 def __init__(self, results):
3806 self.results = results
3808 class TextSerializer(Serializer):
3809 def write(self, stream=sys.stdout):
3810 self._writer(stream, self.results, '')
3812 def _writer(self, stream, node, prefix):
3813 if not node: return
3814 if hasattr(node, 'keys'):
3815 keys = node.keys()
3816 keys.sort()
3817 for k in keys:
3818 if k in ('description', 'link'): continue
3819 if node.has_key(k + '_detail'): continue
3820 if node.has_key(k + '_parsed'): continue
3821 self._writer(stream, node[k], prefix + k + '.')
3822 elif type(node) == types.ListType:
3823 index = 0
3824 for n in node:
3825 self._writer(stream, n, prefix[:-1] + '[' + str(index) + '].')
3826 index += 1
3827 else:
3828 try:
3829 s = str(node).encode('utf-8')
3830 s = s.replace('\\', '\\\\')
3831 s = s.replace('\r', '')
3832 s = s.replace('\n', r'\n')
3833 stream.write(prefix[:-1])
3834 stream.write('=')
3835 stream.write(s)
3836 stream.write('\n')
3837 except:
3838 pass
3840 class PprintSerializer(Serializer):
3841 def write(self, stream=sys.stdout):
3842 if self.results.has_key('href'):
3843 stream.write(self.results['href'] + '\n\n')
3844 from pprint import pprint
3845 pprint(self.results, stream)
3846 stream.write('\n')
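# For a minimal feed, TextSerializer emits one dotted key per line, e.g.
#     feed.title=Sample Feed
#     entries[0].title=First entry title
# while PprintSerializer pretty-prints the entire result dictionary.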
3848 if __name__ == '__main__':
3849 try:
3850 from optparse import OptionParser
3851 except:
3852 OptionParser = None
3854 if OptionParser:
3855 optionParser = OptionParser(version=__version__, usage="%prog [options] url_or_filename_or_-")
3856 optionParser.set_defaults(format="pprint")
3857 optionParser.add_option("-A", "--user-agent", dest="agent", metavar="AGENT", help="User-Agent for HTTP URLs")
3858 optionParser.add_option("-e", "--referer", "--referrer", dest="referrer", metavar="URL", help="Referrer for HTTP URLs")
3859 optionParser.add_option("-t", "--etag", dest="etag", metavar="TAG", help="ETag/If-None-Match for HTTP URLs")
3860 optionParser.add_option("-m", "--last-modified", dest="modified", metavar="DATE", help="Last-modified/If-Modified-Since for HTTP URLs (any supported date format)")
3861 optionParser.add_option("-f", "--format", dest="format", metavar="FORMAT", help="output results in FORMAT (text, pprint)")
3862 optionParser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="write debugging information to stderr")
3863 (options, urls) = optionParser.parse_args()
3864 if options.verbose:
3865 _debug = 1
3866 if not urls:
3867 optionParser.print_help()
3868 sys.exit(0)
3869 else:
3870 if not sys.argv[1:]:
3871 print __doc__
3872 sys.exit(0)
3873 class _Options:
3874 etag = modified = agent = referrer = None
3875 format = 'pprint'
3876 options = _Options()
3877 urls = sys.argv[1:]
3879 zopeCompatibilityHack()
3881 serializer = globals().get(options.format.capitalize() + 'Serializer', Serializer)
3882 for url in urls:
3883 results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer)
3884 serializer(results).write(sys.stdout)
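# Command-line usage, per the options defined above (URL is illustrative):
#     python feedparser.py --format=text http://feedparser.org/docs/examples/atom10.xml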