add error codes to documentation
[mygpo-feedservice.git] / feedservice / feeddownloader.py
blob18520fcd6027d2b6436e36b724e8fe29aa35c1ef
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
5 import re, urllib, urllib2, logging
6 import simplejson as json
8 from google.appengine.ext import webapp
10 import urlstore, httputils, youtube, utils
11 from mimetype import get_mimetype, check_mimetype, get_podcast_types
class Parser(webapp.RequestHandler):
    """ Parser Endpoint

    Parses the feed(s) given in the ``url`` request parameter(s) and
    returns the result as JSON, or HTML-formatted JSON if the client
    asks for text/html via the Accept header.
    """

    def post(self):
        # POST is handled exactly like GET
        return self.get()

    def get(self):
        urls = map(urllib.unquote, self.request.get_all('url'))

        # parsing options taken from the query string
        inline_logo = self.request.get_range('inline_logo', 0, 1, default=0)
        scale_to = self.request.get_range('scale_logo', 0, 1, default=0)
        logo_format = self.request.get('logo_format')
        strip_html = self.request.get_range('strip_html', 0, 1, default=0)
        use_cache = self.request.get_range('use_cache', 0, 1, default=1)

        # conditional-request and content-negotiation headers
        modified = self.request.headers.get('If-Modified-Since', None)
        accept = self.request.headers.get('Accept', 'application/json')

        if not urls:
            self.response.set_status(400)
            self.response.out.write('parameter url missing')
            return

        podcasts, last_modified = parse_feeds(urls, inline_logo, scale_to,
                logo_format, strip_html, modified, use_cache)
        self.send_response(podcasts, last_modified, accept)

    def send_response(self, podcasts, last_modified, formats):
        """ Writes the podcast data in the best-matching response format """

        self.response.headers.add_header('Vary',
                'Accept, User-Agent, Accept-Encoding')

        format = httputils.select_matching_option(
                ['text/html', 'application/json'], formats)

        if format in (None, 'application/json'): #serve json as default
            content_type = 'application/json'
            content = json.dumps(podcasts, sort_keys=True, indent=None,
                    separators=(',', ':'))
            from email import utils
            import time
            self.response.headers.add_header('Last-Modified',
                    utils.formatdate(time.mktime(last_modified.timetuple())))
        else:
            import cgi
            content_type = 'text/html'
            pretty_json = cgi.escape(
                    json.dumps(podcasts, sort_keys=True, indent=4))
            content = """<html><head>
<link href="static/screen.css" type="text/css" rel="stylesheet" />
<link href="static/prettify.css" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="static/prettify.js"></script>
</head><body onload="prettyPrint()"><h1>HTML Response</h1><p>This response is HTML formatted. To get just the JSON data for processing in your client, <a href="/#accept">send the HTTP Header <em>Accept: application/json</em></a>. <a href="/">Back to the Documentation</a></p><pre class="prettyprint">%s</pre></body></html>""" % pretty_json

        self.response.headers['Content-Type'] = content_type
        self.response.out.write(content)
def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls and returns their JSON
    objects and the latest of their modification dates. RSS-Redirects are
    followed automatically by including both feeds in the result.

    The passed-in feed_urls sequence is left unmodified.
    """

    # work on a copy: the original code appended redirect targets directly
    # to feed_urls, mutating the caller's list as a side effect
    queue = list(feed_urls)

    visited_urls = set()
    result = []
    last_modified = None

    # note: appending to `queue` while iterating it is intentional — it is
    # the worklist for RSS-redirect targets discovered along the way
    for url in queue:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically
        if new and new not in visited_urls and new not in queue:
            queue.append(new)

        # keep the newest modification date over all parsed feeds
        if not last_modified or (last_mod and last_mod > last_modified):
            last_modified = last_mod

        result.append(res)

    return result, last_modified
99 def parse_feed(feed_url, inline_logo, scale_to, logo_format, strip_html, modified, use_cache):
101 Parses a feed and returns its JSON object, a list of urls that refer to
102 this feed, an outgoing redirect and the timestamp of the last modification
103 of the feed
106 import feedparser
107 from httputils import get_redirects
109 podcast = dict()
111 try:
112 feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)
114 except Exception, e:
115 msg = 'could not fetch feed %(feed_url)s: %(msg)s' % \
116 dict(feed_url=feed_url, msg=str(e))
117 add_error(podcast, 'fetch-feed', msg)
118 logging.info(msg)
119 podcast['urls'] = [feed_url]
120 return podcast, [feed_url], None, None
123 if last_modified and modified and last_modified <= modified:
124 return None, None, None, None
126 feed = feedparser.parse(feed_content)
128 PROPERTIES = (
129 ('title', True, lambda: feed.feed.get('title', None)),
130 ('link', False, lambda: feed.feed.get('link', None)),
131 ('description', True, lambda: feed.feed.get('subtitle', None)),
132 ('author', True, lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
133 ('language', False, lambda: feed.feed.get('language', None)),
134 ('urls', False, lambda: get_redirects(feed_url)),
135 ('new_location', False, lambda: feed.feed.get('newlocation', None)),
136 ('logo', False, lambda: get_podcast_logo(feed)),
137 ('logo_data', False, lambda: get_podcast_logo_inline(podcast, inline_logo, modified, size=scale_to, img_format=logo_format)),
138 ('tags', False, lambda: get_feed_tags(feed.feed)),
139 ('hub', False, lambda: get_hub_url(feed.feed)),
140 ('episodes', False, lambda: get_episodes(feed, strip_html)),
141 ('content_types', False, lambda: get_podcast_types(podcast)),
144 for name, is_text, func in PROPERTIES:
145 set_val(podcast, name, func, strip_html and is_text)
147 subscribe_at_hub(podcast)
149 return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
def set_val(obj, name, func, remove_tags=False):
    """ Evaluates func and stores any non-None result under obj[name].

    When remove_tags is set, HTML tags are stripped from the value first.
    """
    from utils import remove_html_tags

    value = func()
    if remove_tags:
        value = remove_html_tags(value)
    if value is not None:
        obj[name] = value
def add_error(feed, key, msg):
    """ Adds an error entry to the feed """

    # create the error container on first use, then record the message
    feed.setdefault('errors', {})[key] = msg
def get_podcast_logo(feed):
    """ Returns the cover-art URL of the feed.

    A YouTube cover (derived from the feed's link) wins over the feed's
    own image element; the image's href is preferred over its url.
    """
    cover_art = None

    image = feed.feed.get('image', None)
    if image is not None:
        for attr in ('href', 'url'):
            candidate = getattr(image, attr, None)
            if candidate:
                cover_art = candidate
                break

    return youtube.get_real_cover(feed.feed.get('link', None)) or cover_art
184 def get_podcast_logo_inline(podcast, inline_logo, modified, **transform_args):
185 """ Fetches the feed's logo and returns its data URI """
187 if not inline_logo:
188 return None
190 logo_url = podcast.get('logo', None)
192 if not logo_url:
193 return None
195 try:
196 return get_data_uri(logo_url, modified, **transform_args)
198 except Exception, e:
199 msg = 'could not fetch feed logo %(logo_url)s: %(msg)s' % \
200 dict(logo_url=logo_url, msg=str(e))
201 add_error(podcast, 'fetch-logo', msg)
202 logging.info(msg)
203 return None
def get_data_uri(url, modified_since, **transform_args):
    """
    Fetches the logo, applies the specified transformations and
    returns the Data URI for the resulting image.

    Returns None if the logo has not changed since modified_since.
    """
    import base64

    url, content, last_modified = urlstore.get_url(url)

    # bug fix: the original compared against the undefined name
    # ``modified`` (NameError); the parameter is ``modified_since``
    if last_modified and modified_since and last_modified <= modified_since:
        return None

    mimetype = get_mimetype(None, url)

    # only run the (expensive) image transformation when any option is set
    if any(transform_args.values()):
        content, mimetype = transform_image(content, mimetype, **transform_args)

    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
def transform_image(content, mimetype, size, img_format):
    """
    Transforms (resizes, converts) the image and returns
    the resulting bytes and mimetype
    """
    from google.appengine.api import images

    encodings = dict(png=images.PNG, jpeg=images.JPEG)

    img = images.Image(content)

    if img_format:
        # convert: the requested format determines the new mimetype
        mimetype = 'image/%s' % img_format
    else:
        # keep the current format; derive it from the mimetype's subtype
        img_format = mimetype[mimetype.find('/')+1:]

    if size:
        # scale down, never up, preserving each dimension's upper bound
        img.resize(min(size, img.width), min(size, img.height))

    content = img.execute_transforms(output_encoding=encodings[img_format])
    return content, mimetype
def get_feed_tags(feed):
    """ Returns the distinct tag names of the feed """
    tags = set()

    for tag in feed.get('tags', []):
        # a term may hold several comma-separated tags
        term = tag['term']
        if term:
            tags.update(part for part in term.split(',') if part)

        if tag['label']:
            tags.add(tag['label'])

    return list(tags)
def get_hub_url(feed):
    """
    Returns the Hub URL as specified by
    http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.3.html#discovery
    """
    for link in feed.get('links', []):
        # the hub link must carry rel="hub" and a non-empty href
        if link.rel == 'hub' and link.get('href', None):
            return link.href

    return None
def get_episodes(feed, strip_html):
    """ Returns the episode objects of the parsed feed.

    Entries without downloadable files are skipped; per-episode number and
    short title are derived from the longest common title prefix.
    """
    episodes = filter(None, [get_episode_metadata(entry, strip_html)
                             for entry in feed.entries])

    # We take all non-empty titles
    titles = filter(None, [episode.get('title', None) for episode in episodes])

    # get the longest common substring
    common_title = utils.longest_substr(titles)

    # but consider only the part up to the first number. Otherwise we risk
    # removing part of the number (eg if a feed contains episodes 100 - 199)
    common_title = re.search(r'^\D*', common_title).group(0)

    for episode in episodes:
        episode.update(get_additional_episode_data(episode, common_title))

    return episodes
def get_episode_metadata(entry, strip_html):
    """ Returns the metadata dict for one feed entry.

    Returns None when the entry carries no downloadable files.
    """
    files = get_episode_files(entry)
    if not files:
        return None

    # (name, is-text, getter) triples; text values are stripped of HTML
    # when strip_html is requested
    PROPERTIES = (
        ('guid',        None,  lambda: entry.get('id', None)),
        ('title',       True,  lambda: entry.get('title', None)),
        ('description', True,  lambda: get_episode_summary(entry)),
        ('link',        False, lambda: entry.get('link', None)),
        ('author',      True,  lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration',    False, lambda: get_duration(entry)),
        ('language',    False, lambda: entry.get('language', None)),
        ('files',       False, lambda: get_files(files)),
        ('released',    False, lambda: get_timestamp(entry)),
    )

    episode = {}
    for name, is_text, func in PROPERTIES:
        set_val(episode, name, func, strip_html and is_text)

    return episode
def get_episode_files(entry):
    """Get the download / episode URL of a feedparser entry

    Returns a dict mapping each URL to a (mimetype, filesize) tuple;
    filesize is None when unknown.
    """

    urls = {}

    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' not in enclosure:
            continue
        mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
        if check_mimetype(mimetype):
            try:
                filesize = int(enclosure['length'])
            # bug fix: the original only caught ValueError; a missing
            # 'length' key (KeyError) or a None value (TypeError) crashed
            except (KeyError, TypeError, ValueError):
                filesize = None
            urls[enclosure['href']] = (mimetype, filesize)

    for media in getattr(entry, 'media_content', []):
        if 'url' not in media:
            continue
        mimetype = get_mimetype(media.get('type', ''), media['url'])
        if check_mimetype(mimetype):
            urls[media['url']] = (mimetype, None)

    for link in getattr(entry, 'links', []):
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
def get_episode_summary(entry):
    """ Returns the first non-empty description-like field of the entry.

    Checks summary, then subtitle, then link; returns None if all are
    missing or empty.
    """
    return (entry.get('summary', None) or
            entry.get('subtitle', None) or
            entry.get('link', None) or
            None)
def get_duration(entry):
    """ Returns the episode duration parsed from itunes_duration, or None """
    from utils import parse_time

    # renamed from ``str`` — the original shadowed the builtin
    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        # unparseable duration string
        return None
def get_files(files):
    """ Converts a url -> (mimetype, filesize) mapping into a list of
    file dicts; mimetype and filesize are omitted when falsy. """
    result = []
    for url, (mimetype, filesize) in files.items():
        entry = dict(url=url)
        if mimetype:
            entry['mimetype'] = mimetype
        if filesize:
            entry['filesize'] = filesize
        result.append(entry)
    return result
def get_timestamp(entry):
    """ Returns the entry's update time as 'YYYY-MM-DDTHH:MM:SS', or None """
    from datetime import datetime
    try:
        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    # bug fix: the original used a bare ``except:`` which hid every error;
    # only missing/malformed date attributes should yield None
    except (AttributeError, TypeError, ValueError):
        return None
def get_additional_episode_data(episode, common_title):
    """
    Returns additional data about an episode that is calculated after
    the first pass over all episodes
    """
    title = episode.get('title', None)

    # derived values based on the common title prefix of all episodes
    PROPERTIES = (
        ('number',      lambda: get_episode_number(title, common_title)),
        ('short_title', lambda: get_short_title(title, common_title)),
    )

    data = {}
    for name, func in PROPERTIES:
        set_val(data, name, func)

    return data
def get_episode_number(title, common_title):
    """
    Returns the first number in the non-repeating part of the episode's title
    """
    if title is None:
        return None

    # drop the common part shared by all titles, then look for a leading
    # number (ignoring any punctuation/whitespace before it)
    remainder = title.replace(common_title, '').strip()
    match = re.search(r'^\W*(\d+)', remainder)

    return int(match.group(1)) if match else None
def get_short_title(title, common_title):
    """
    Returns the non-repeating part of the episode's title
    If an episode number is found, it is removed
    """
    if title is None:
        return None

    # strip the shared prefix, then any leading number/punctuation run
    short = title.replace(common_title, '').strip()
    return re.sub(r'^[\W\d]+', '', short)
445 def subscribe_at_hub(feed):
446 """ Tries to subscribe to the feed if it contains a hub URL """
448 if not feed.get('hub', False):
449 return
451 import pubsubhubbub
453 # use the last URL in the redirect chain
454 feed_url = feed['urls'][-1]
456 hub_url = feed.get('hub')
458 try:
459 pubsubhubbub.Subscriber.subscribe(feed_url, hub_url)
460 except pubsubhubbub.SubscriptionError, e:
461 add_error(feed, 'hub-subscription', repr(e))