# -*- coding: utf-8 -*-

import re, urllib, urllib2, time, cgi, base64, logging
import simplejson as json
import feedparser

from google.appengine.ext import webapp

import urlstore, httputils, youtube, utils
from mimetype import get_mimetype, check_mimetype, get_podcast_types

class Parser(webapp.RequestHandler):
    """ Parser Endpoint """

    def get(self):
        urls = map(urllib.unquote, self.request.get_all('url'))

        inline_logo = self.request.get_range('inline_logo', 0, 1, default=0)
        # pixel size for logo scaling (0 = no scaling)
        scale_to    = self.request.get_range('scale_logo', 0, default=0)
        logo_format = self.request.get('logo_format')
        strip_html  = self.request.get_range('strip_html', 0, 1, default=0)
        use_cache   = self.request.get_range('use_cache', 0, 1, default=1)
        modified    = self.request.headers.get('If-Modified-Since', None)
        accept      = self.request.headers.get('Accept', 'application/json')

        if urls:
            podcasts, last_modified = parse_feeds(urls, inline_logo, scale_to,
                    logo_format, strip_html, modified, use_cache)
            self.send_response(podcasts, last_modified, accept)

        else:
            self.response.set_status(400)
            self.response.out.write('parameter url missing')
    def send_response(self, podcasts, last_modified, formats):
        self.response.headers.add_header('Vary', 'Accept, User-Agent, Accept-Encoding')

        fmt = httputils.select_matching_option(['text/html', 'application/json'], formats)

        if fmt in (None, 'application/json'): # serve JSON as default
            content_type = 'application/json'
            content = json.dumps(podcasts, sort_keys=True, indent=None, separators=(',', ':'))

            # imported locally because email.utils would otherwise shadow
            # the module-level utils import
            from email import utils
            if last_modified:
                self.response.headers.add_header('Last-Modified',
                        utils.formatdate(time.mktime(last_modified.timetuple())))

        else:
            content_type = 'text/html'
            pretty_json = json.dumps(podcasts, sort_keys=True, indent=4)
            pretty_json = cgi.escape(pretty_json)
            content = """<html><head>
<link href="static/screen.css" type="text/css" rel="stylesheet" />
<link href="static/prettify.css" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="static/prettify.js"></script>
</head><body onload="prettyPrint()"><h1>HTML Response</h1><p>This response is HTML formatted. To get just the JSON data for processing in your client, <a href="/#accept">send the HTTP Header <em>Accept: application/json</em></a>. <a href="/">Back to the Documentation</a></p><pre class="prettyprint">%s</pre></body></html>""" % pretty_json

        self.response.headers['Content-Type'] = content_type
        self.response.out.write(content)
def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls, and returns their JSON
    objects and the latest of their modification dates. RSS redirects are
    followed automatically by including both feeds in the result.
    """

    visited_urls = set()
    result = []
    last_modified = None

    for url in feed_urls:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS redirects automatically
        if new and new not in (list(visited_urls) + feed_urls):
            feed_urls.append(new)

        if not last_modified or (last_mod and last_mod > last_modified):
            last_modified = last_mod

        result.append(res)

    return result, last_modified
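
# Sketch of the redirect handling above (hypothetical URLs): if
# http://example.com/old.xml declares a new_location of
# http://example.com/new.xml, the new URL is appended to feed_urls while
# iterating, so the result contains the JSON objects of both feeds.
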
def parse_feed(feed_url, inline_logo, scale_to, logo_format, strip_html, modified, use_cache):
    """
    Parses a feed and returns its JSON object, a list of urls that refer to
    this feed, an outgoing redirect and the timestamp of the last modification
    """

    from httputils import get_redirects

    podcast = {}

    try:
        feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)

    except Exception, e:
        msg = 'could not fetch feed %(feed_url)s: %(msg)s' % \
            dict(feed_url=feed_url, msg=str(e))
        add_error(podcast, 'fetch-feed', msg)
        logging.info(msg)
        podcast['urls'] = [feed_url]
        return podcast, [feed_url], None, None

    # return early if the feed has not changed since the client's last request
    if last_modified and modified and last_modified <= modified:
        return None, None, None, None

    feed = feedparser.parse(feed_content)

    PROPERTIES = (
        ('title',         True,  lambda: feed.feed.get('title', None)),
        ('link',          False, lambda: feed.feed.get('link', None)),
        ('description',   True,  lambda: feed.feed.get('subtitle', None)),
        ('author',        True,  lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
        ('language',      False, lambda: feed.feed.get('language', None)),
        ('urls',          False, lambda: get_redirects(feed_url)),
        ('new_location',  False, lambda: feed.feed.get('newlocation', None)),
        ('logo',          False, lambda: get_podcast_logo(feed)),
        ('logo_data',     False, lambda: get_podcast_logo_inline(podcast, inline_logo, modified, size=scale_to, img_format=logo_format)),
        ('tags',          False, lambda: get_feed_tags(feed.feed)),
        ('hub',           False, lambda: get_hub_url(feed.feed)),
        ('episodes',      False, lambda: get_episodes(feed, strip_html)),
        ('content_types', False, lambda: get_podcast_types(podcast)),
    )

    for name, is_text, func in PROPERTIES:
        set_val(podcast, name, func, strip_html and is_text)

    subscribe_at_hub(podcast)

    return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
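
# The returned podcast dict carries the PROPERTIES keys that resolved to
# non-None values, e.g. (illustrative values only):
#
#   {'title': 'Example Podcast', 'urls': ['http://example.com/feed.xml'],
#    'tags': ['technology'], 'episodes': [...], 'content_types': ['audio']}
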
def set_val(obj, name, func, remove_tags=False):
    from utils import remove_html_tags

    val = func()
    if remove_tags:
        val = remove_html_tags(val)
    if val is not None:
        obj[name] = val
def add_error(feed, key, msg):
    """ Adds an error entry to the feed """

    if 'errors' not in feed:
        feed['errors'] = {}

    feed['errors'][key] = msg
def get_podcast_logo(feed):
    cover_art = None
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    # prefer a cover derived from a YouTube link, if one exists
    cover_art = youtube.get_real_cover(feed.feed.get('link', None)) or cover_art

    return cover_art
def get_podcast_logo_inline(podcast, inline_logo, modified, **transform_args):
    """ Fetches the feed's logo and returns its data URI """

    if not inline_logo:
        return None

    logo_url = podcast.get('logo', None)

    if not logo_url:
        return None

    try:
        return get_data_uri(logo_url, modified, **transform_args)

    except Exception, e:
        msg = 'could not fetch feed logo %(logo_url)s: %(msg)s' % \
            dict(logo_url=logo_url, msg=str(e))
        add_error(podcast, 'fetch-logo', msg)
        return None
def get_data_uri(url, modified_since, **transform_args):
    """
    Fetches the logo, applies the specified transformations and
    returns the Data URI for the resulting image
    """

    url, content, last_modified = urlstore.get_url(url)

    if last_modified and modified_since and last_modified <= modified_since:
        return None

    mimetype = get_mimetype(None, url)

    if any(transform_args.values()):
        content, mimetype = transform_image(content, mimetype, **transform_args)

    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
def transform_image(content, mimetype, size, img_format):
    """
    Transforms (resizes, converts) the image and returns
    the resulting bytes and mimetype
    """

    from google.appengine.api import images

    img_formats = dict(png=images.PNG, jpeg=images.JPEG)

    img = images.Image(content)

    if img_format:
        # convert to the requested format
        mimetype = 'image/%s' % img_format
    else:
        # keep the original format
        img_format = mimetype[mimetype.find('/')+1:]

    if size:
        img.resize(min(size, img.width), min(size, img.height))

    content = img.execute_transforms(output_encoding=img_formats[img_format])
    return content, mimetype
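
# Example use (hypothetical image bytes): resize a PNG logo to at most
# 100x100 pixels and re-encode it as JPEG:
#
#   content, mimetype = transform_image(raw_bytes, 'image/png', 100, 'jpeg')
#   # mimetype is now 'image/jpeg'
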
def get_feed_tags(feed):
    tags = []

    for tag in feed.get('tags', []):
        if tag.get('term', None):
            tags.extend(filter(None, tag['term'].split(',')))

        if tag.get('label', None):
            tags.append(tag['label'])

    return list(set(tags))
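
# Example (illustrative feedparser tags): a tag with term 'Technology,Linux'
# and one with label 'Open Source' yield
# ['Technology', 'Linux', 'Open Source'] (order may vary due to the set).
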
def get_hub_url(feed):
    """
    Returns the Hub URL as specified by
    http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.3.html#discovery
    """

    for l in feed.get('links', []):
        if l.rel == 'hub' and l.get('href', None):
            return l.href

    return None
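
# A feed advertises its hub with a link element such as (example hub URL):
#
#   <atom:link rel="hub" href="http://pubsubhubbub.appspot.com/"/>
#
# which feedparser exposes in feed.links; this function returns its href.
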
def get_episodes(feed, strip_html):
    get_episode = lambda e: get_episode_metadata(e, strip_html)
    episodes = filter(None, map(get_episode, feed.entries))

    # we take all non-empty titles
    titles = filter(None, [e.get('title', None) for e in episodes])

    # get the longest common substring
    common_title = utils.longest_substr(titles)

    # but consider only the part up to the first number; otherwise we risk
    # removing part of the number (eg if a feed contains episodes 100 - 199)
    common_title = re.search(r'^\D*', common_title).group(0)

    for e in episodes:
        e.update(get_additional_episode_data(e, common_title))

    return episodes
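
# Worked example of the common-title heuristic (hypothetical titles): for
# 'Episode 100: Alpha' and 'Episode 101: Beta' the longest common substring
# is 'Episode 10'; cutting it at the first digit leaves 'Episode ', so the
# episode numbers survive intact.
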
def get_episode_metadata(entry, strip_html):

    files = get_episode_files(entry)
    if not files:
        return None

    PROPERTIES = (
        ('guid',        None,  lambda: entry.get('id', None)),
        ('title',       True,  lambda: entry.get('title', None)),
        ('description', True,  lambda: get_episode_summary(entry)),
        ('link',        False, lambda: entry.get('link', None)),
        ('author',      True,  lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration',    False, lambda: get_duration(entry)),
        ('language',    False, lambda: entry.get('language', None)),
        ('files',       False, lambda: get_files(files)),
        ('released',    False, lambda: get_timestamp(entry)),
    )

    episode = {}
    for name, is_text, func in PROPERTIES:
        set_val(episode, name, func, strip_html and is_text)

    return episode
def get_episode_files(entry):
    """Get the download / episode URLs of a feedparser entry"""

    urls = {}

    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                except (KeyError, ValueError):
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

        # XXX: Implement link detection as in gPodder

    return urls
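
# The returned mapping has the form url -> (mimetype, filesize), e.g.
# (illustrative values):
#
#   {'http://example.com/ep1.mp3': ('audio/mpeg', 23500000)}
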
def get_episode_summary(entry):
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return None
def get_duration(entry):
    from utils import parse_time

    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        return None
def get_files(files):
    f = []
    for url, (mimetype, filesize) in files.items():
        item = dict(url=url)
        if mimetype:
            item['mimetype'] = mimetype
        if filesize:
            item['filesize'] = filesize
        f.append(item)
    return f
def get_timestamp(entry):
    from datetime import datetime
    try:
        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except (AttributeError, TypeError):
        # entries without a parsed publication date yield None
        return None
def get_additional_episode_data(episode, common_title):
    """
    Returns additional data about an episode that is calculated after
    the first pass over all episodes
    """

    PROPERTIES = (
        ('number',      lambda: get_episode_number(episode.get('title', None), common_title)),
        ('short_title', lambda: get_short_title(episode.get('title', None), common_title)),
    )

    data = {}
    for name, func in PROPERTIES:
        set_val(data, name, func)

    return data
def get_episode_number(title, common_title):
    """
    Returns the first number in the non-repeating part of the episode's title
    """

    if title is None:
        return None

    title = title.replace(common_title, '').strip()
    match = re.search(r'^\W*(\d+)', title)
    if not match:
        return None

    return int(match.group(1))
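
# Illustrative behaviour (hypothetical titles):
#
#   >>> get_episode_number('Episode 42: The Answer', 'Episode ')
#   42
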
def get_short_title(title, common_title):
    """
    Returns the non-repeating part of the episode's title.
    If an episode number is found, it is removed
    """

    if title is None:
        return None

    title = title.replace(common_title, '').strip()
    title = re.sub(r'^[\W\d]+', '', title)

    return title
def subscribe_at_hub(feed):
    """ Tries to subscribe to the feed if it contains a hub URL """

    if not feed.get('hub', False):
        return

    import pubsubhubbub

    # use the last URL in the redirect chain
    feed_url = feed['urls'][-1]
    hub_url = feed.get('hub')

    try:
        pubsubhubbub.Subscriber.subscribe(feed_url, hub_url)
    except pubsubhubbub.SubscriptionError, e:
        add_error(feed, 'hub-subscription', repr(e))