Subscribe to Feeds that contain Pubsubhubbub-Hubs
[mygpo-feedservice.git] / feedservice / feeddownloader.py
blobd0d38394eb6a8443408f855753a6b4cd630d24c6
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
5 import re, urllib
6 import simplejson as json
8 from google.appengine.ext import webapp
10 import urlstore, httputils, youtube, utils
11 from mimetype import get_mimetype, check_mimetype, get_podcast_types
class Parser(webapp.RequestHandler):
    """ Parser Endpoint

    Parses the feeds given as ``url`` parameters and returns their
    metadata either as JSON or as an HTML-wrapped, pretty-printed JSON
    page, depending on the request's Accept header.
    """

    def post(self):
        # POST is handled identically to GET
        return self.get()

    def get(self):
        """ Parses the requested feeds and sends back the result """
        urls = map(urllib.unquote, self.request.get_all('url'))

        # option flags (0/1) controlling logo inlining and HTML stripping
        inline_logo = self.request.get_range('inline_logo', 0, 1, default=0)
        scale_to = self.request.get_range('scale_logo', 0, 1, default=0)
        logo_format = self.request.get('logo_format')
        strip_html = self.request.get_range('strip_html', 0, 1, default=0)
        use_cache = self.request.get_range('use_cache', 0, 1, default=1)
        modified = self.request.headers.get('If-Modified-Since', None)
        accept = self.request.headers.get('Accept', 'application/json')

        if not urls:
            self.response.set_status(400)
            self.response.out.write('parameter url missing')
            return

        podcasts, last_modified = parse_feeds(urls, inline_logo, scale_to,
                logo_format, strip_html, modified, use_cache)
        self.send_response(podcasts, last_modified, accept)

    def send_response(self, podcasts, last_modified, formats):
        """ Writes the podcast data in the best-matching format

        formats is the value of the request's Accept header; JSON is
        served when no supported format matches.
        """
        self.response.headers.add_header('Vary', 'Accept, User-Agent, Accept-Encoding')

        # renamed from 'format' to avoid shadowing the builtin
        fmt = httputils.select_matching_option(['text/html', 'application/json'], formats)

        if fmt in (None, 'application/json'): #serve json as default
            content_type = 'application/json'
            content = json.dumps(podcasts, sort_keys=True, indent=None, separators=(',', ':'))
            # aliased import so it does not shadow the module-level 'utils'
            from email import utils as email_utils
            import time
            self.response.headers.add_header('Last-Modified', email_utils.formatdate(time.mktime(last_modified.timetuple())))

        else:
            import cgi
            content_type = 'text/html'
            pretty_json = json.dumps(podcasts, sort_keys=True, indent=4)
            pretty_json = cgi.escape(pretty_json)
            content = """<html><head>
<link href="static/screen.css" type="text/css" rel="stylesheet" />
<link href="static/prettify.css" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="static/prettify.js"></script>
</head><body onload="prettyPrint()"><h1>HTML Response</h1><p>This response is HTML formatted. To get just the JSON data for processing in your client, <a href="/#accept">send the HTTP Header <em>Accept: application/json</em></a>. <a href="/">Back to the Documentation</a></p><pre class="prettyprint">%s</pre></body></html>""" % pretty_json

        self.response.headers['Content-Type'] = content_type
        self.response.out.write(content)
def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls and returns their JSON
    objects and the latest of their modification dates. RSS-Redirects are
    followed automatically by including both feeds in the result.
    """
    visited_urls = set()
    result = []
    last_modified = None

    # work on a copy so the caller's list is not mutated when
    # RSS-redirect targets are appended below
    queue = list(feed_urls)

    for url in queue:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically
        if new and new not in visited_urls and new not in queue:
            queue.append(new)

        # keep the newest modification date across all parsed feeds
        if not last_modified or (last_mod and last_mod > last_modified):
            last_modified = last_mod

        result.append(res)

    return result, last_modified
def parse_feed(feed_url, inline_logo, scale_to, logo_format, strip_html, modified, use_cache):
    """
    Parses a feed and returns its JSON object, a list of urls that refer to
    this feed, an outgoing redirect and the timestamp of the last modification
    of the feed
    """
    import feedparser
    from httputils import get_redirects

    feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)

    # the client already has the current version - nothing to return
    if last_modified and modified and last_modified <= modified:
        return None, None, None, None

    feed = feedparser.parse(feed_content)

    podcast = dict()

    # (name, is-text, getter) triples; text properties are HTML-stripped
    # when requested. Order matters: 'logo_data' reads podcast['logo'] and
    # 'content_types' reads the previously filled podcast dict.
    PROPERTIES = (
        ('title', True, lambda: feed.feed.get('title', None)),
        ('link', False, lambda: feed.feed.get('link', None)),
        ('description', True, lambda: feed.feed.get('subtitle', None)),
        ('author', True, lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
        ('language', False, lambda: feed.feed.get('language', None)),
        ('urls', False, lambda: get_redirects(feed_url)),
        ('new_location', False, lambda: feed.feed.get('newlocation', None)),
        ('logo', False, lambda: get_podcast_logo(feed)),
        ('logo_data', False, lambda: get_data_uri(inline_logo, podcast.get('logo', None), modified, size=scale_to, img_format=logo_format)),
        ('tags', False, lambda: get_feed_tags(feed.feed)),
        ('hub', False, lambda: get_hub_url(feed.feed)),
        ('episodes', False, lambda: get_episodes(feed, strip_html)),
        ('content_types', False, lambda: get_podcast_types(podcast)),
    )

    for name, is_text, func in PROPERTIES:
        set_val(podcast, name, func, strip_html and is_text)

    subscribe_at_hub(podcast)

    return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
def set_val(obj, name, func, remove_tags=False):
    """
    Evaluates func and stores its result in obj[name], unless the result
    is None. HTML tags are removed from the value when remove_tags is set.
    """
    val = func()
    if remove_tags:
        # imported lazily - only needed when HTML-stripping was requested
        from utils import remove_html_tags
        val = remove_html_tags(val)
    if val is not None:
        obj[name] = val
def add_error(feed, key, msg):
    """ Adds an error entry to the feed """
    # create the error container on first use
    errors = feed.setdefault('errors', {})
    errors[key] = msg
def get_podcast_logo(feed):
    """ Returns the cover-art URL of the parsed feed, if any

    A YouTube-specific cover (when the feed link points to YouTube)
    takes precedence over the feed's own image.
    """
    cover_art = None

    image = feed.feed.get('image', None)
    if image is not None:
        # feedparser may expose the image URL under either attribute
        for attr in ('href', 'url'):
            candidate = getattr(image, attr, None)
            if candidate:
                cover_art = candidate
                break

    return youtube.get_real_cover(feed.feed.get('link', None)) or cover_art
def get_data_uri(inline_logo, url, modified_since, **transform_args):
    """
    Fetches the logo, applies the specified transformations and
    returns the Data URI for the resulting image

    Returns None when no inline logo was requested, no URL is given,
    or the image has not changed since modified_since.
    """
    import base64

    if not inline_logo or not url:
        return None

    url, content, last_modified = urlstore.get_url(url)

    # bugfix: this previously compared against the undefined name
    # 'modified' instead of the 'modified_since' parameter (NameError)
    if last_modified and modified_since and last_modified <= modified_since:
        return None

    mimetype = get_mimetype(None, url)

    # only transform when at least one transformation arg is set
    if any(transform_args.values()):
        content, mimetype = transform_image(content, mimetype, **transform_args)

    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
def transform_image(content, mimetype, size, img_format):
    """
    Transforms (resizes, converts) the image and returns
    the resulting bytes and mimetype
    """
    from google.appengine.api import images

    encodings = dict(png=images.PNG, jpeg=images.JPEG)

    img = images.Image(content)

    if img_format:
        # an explicit output format was requested - adjust the mimetype
        mimetype = 'image/%s' % img_format
    else:
        # keep the current format, derived from the mimetype's subtype
        img_format = mimetype[mimetype.find('/') + 1:]

    if size:
        # scale down only - never enlarge beyond the original dimensions
        img.resize(min(size, img.width), min(size, img.height))

    content = img.execute_transforms(output_encoding=encodings[img_format])
    return content, mimetype
def get_feed_tags(feed):
    """ Returns the deduplicated tags of the feed

    Tags come from the comma-separated 'term' values and the 'label'
    values of the feed's tag entries.
    """
    tags = set()

    for tag in feed.get('tags', []):
        # use .get() - 'term'/'label' may be absent from a tag entry
        term = tag.get('term')
        if term:
            # a term may contain several comma-separated tags
            tags.update(filter(None, term.split(',')))

        label = tag.get('label')
        if label:
            tags.add(label)

    return list(tags)
def get_hub_url(feed):
    """
    Returns the Hub URL as specified by
    http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.3.html#discovery
    """
    for link in feed.get('links', []):
        # only links with rel="hub" and a non-empty href qualify
        if link.rel != 'hub':
            continue
        if link.get('href', None):
            return link.href
    return None
def get_episodes(feed, strip_html):
    """ Returns the parsed episodes of the feed

    After the first pass, data that depends on all episodes (episode
    number, short title) is added to each episode.
    """
    # materialize as a list: the episodes are iterated several times
    # below (filter/map laziness would break this under Python 3)
    episodes = [e for e in
                (get_episode_metadata(entry, strip_html) for entry in feed.entries)
                if e]

    # We take all non-empty titles
    titles = [t for t in (e.get('title', None) for e in episodes) if t]

    # get the longest common substring
    common_title = utils.longest_substr(titles)

    # but consider only the part up to the first number. Otherwise we risk
    # removing part of the number (eg if a feed contains episodes 100 - 199)
    common_title = re.search(r'^\D*', common_title).group(0)

    for episode in episodes:
        episode.update(get_additional_episode_data(episode, common_title))

    return episodes
def get_episode_metadata(entry, strip_html):
    """ Parses a feedparser entry into an episode dict

    Returns None when the entry contains no usable files.
    Text properties are HTML-stripped when strip_html is set.
    """
    files = get_episode_files(entry)
    if not files:
        return None

    # (name, is-text, getter) triples, analogous to parse_feed
    PROPERTIES = (
        ('guid', None, lambda: entry.get('id', None)),
        ('title', True, lambda: entry.get('title', None)),
        ('description', True, lambda: get_episode_summary(entry)),
        ('link', False, lambda: entry.get('link', None)),
        ('author', True, lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration', False, lambda: get_duration(entry)),
        ('language', False, lambda: entry.get('language', None)),
        ('files', False, lambda: get_files(files)),
        ('released', False, lambda: get_timestamp(entry)),
    )

    episode = {}
    for name, is_text, func in PROPERTIES:
        set_val(episode, name, func, strip_html and is_text)

    return episode
def get_episode_files(entry):
    """Get the download / episode URL of a feedparser entry

    Returns a dict mapping each URL to a (mimetype, filesize) tuple;
    filesize is None when unknown.
    """
    urls = {}

    # regular RSS enclosures
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                # bugfix: 'length' may be missing (KeyError) or None
                # (TypeError), not only non-numeric (ValueError)
                except (KeyError, TypeError, ValueError):
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    # Media-RSS content
    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    # plain links that point to YouTube videos
    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
def get_episode_summary(entry):
    """ Returns the first non-empty summary-like field of the entry

    Checks 'summary', 'subtitle' and 'link', in that order.
    """
    candidates = (entry.get(key, None) for key in ('summary', 'subtitle', 'link'))
    return next((value for value in candidates if value), None)
def get_duration(entry):
    """ Returns the entry's duration in seconds, or None if unparseable """
    from utils import parse_time

    # renamed from 'str' to avoid shadowing the builtin
    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        return None
def get_files(files):
    """ Converts a {url: (mimetype, filesize)} dict into a list of dicts

    mimetype and filesize are omitted from an entry when falsy.
    """
    file_list = []
    for url, (mimetype, filesize) in files.items():
        # renamed from 'file' to avoid shadowing the builtin
        entry = dict(url=url)
        if mimetype:
            entry['mimetype'] = mimetype
        if filesize:
            entry['filesize'] = filesize
        file_list.append(entry)
    return file_list
def get_timestamp(entry):
    """ Returns the entry's update time as ISO-8601 string, or None """
    from datetime import datetime
    try:
        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    # narrowed from a bare except; the timestamp is best-effort, so any
    # missing/malformed 'updated_parsed' yields None
    except Exception:
        return None
def get_additional_episode_data(episode, common_title):
    """
    Returns additional data about an episode that is calculated after
    the first pass over all episodes
    """
    PROPERTIES = (
        ('number', lambda: get_episode_number(episode.get('title', None), common_title)),
        ('short_title', lambda: get_short_title(episode.get('title', None), common_title)),
    )

    data = {}
    for name, func in PROPERTIES:
        set_val(data, name, func)

    return data
def get_episode_number(title, common_title):
    """
    Returns the first number in the non-repeating part of the episode's title
    """
    if title is None:
        return None

    # drop the part shared by all episode titles, then look for a
    # leading number (possibly preceded by punctuation/whitespace)
    unique_part = title.replace(common_title, '').strip()
    match = re.search(r'^\W*(\d+)', unique_part)
    return int(match.group(1)) if match else None
def get_short_title(title, common_title):
    """
    Returns the non-repeating part of the episode's title
    If an episode number is found, it is removed
    """
    if title is None:
        return None

    # strip the shared title part, then any leading
    # punctuation/digits (the episode number)
    unique_part = title.replace(common_title, '').strip()
    return re.sub(r'^[\W\d]+', '', unique_part)
def subscribe_at_hub(feed):
    """ Tries to subscribe to the feed if it contains a hub URL

    Subscription failures are recorded as an error on the feed rather
    than raised, since subscribing is best-effort.
    """
    if not feed.get('hub', False):
        return

    import pubsubhubbub

    # use the last URL in the redirect chain
    feed_url = feed['urls'][-1]

    hub_url = feed.get('hub')

    try:
        pubsubhubbub.Subscriber.subscribe(feed_url, hub_url)
    # fixed the Python-2-only 'except E, e' syntax (valid from Py2.6 on)
    except pubsubhubbub.SubscriptionError as e:
        add_error(feed, 'hub-subscription', repr(e))