add error codes to documentation
[mygpo-feedservice.git] / feedservice / feeddownloader.py
blob18520fcd6027d2b6436e36b724e8fe29aa35c1ef
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
5 import re, urllib, urllib2, logging
6 import simplejson as json
8 from google.appengine.ext import webapp
10 import urlstore, httputils, youtube, utils
11 from mimetype import get_mimetype, check_mimetype, get_podcast_types
class Parser(webapp.RequestHandler):
    """ Parser Endpoint

    Parses the feed(s) given in the ``url`` request parameter(s) and
    returns the result as JSON, or HTML-formatted JSON if the client
    asks for text/html via the Accept header.
    """

    def post(self):
        # POST is handled exactly like GET
        return self.get()

    def get(self):
        urls = map(urllib.unquote, self.request.get_all('url'))

        # parsing options taken from the query string
        inline_logo = self.request.get_range('inline_logo', 0, 1, default=0)
        scale_to = self.request.get_range('scale_logo', 0, 1, default=0)
        logo_format = self.request.get('logo_format')
        strip_html = self.request.get_range('strip_html', 0, 1, default=0)
        use_cache = self.request.get_range('use_cache', 0, 1, default=1)

        # conditional-request and content-negotiation headers
        modified = self.request.headers.get('If-Modified-Since', None)
        accept = self.request.headers.get('Accept', 'application/json')

        if not urls:
            self.response.set_status(400)
            self.response.out.write('parameter url missing')
            return

        podcasts, last_modified = parse_feeds(urls, inline_logo, scale_to,
                logo_format, strip_html, modified, use_cache)
        self.send_response(podcasts, last_modified, accept)

    def send_response(self, podcasts, last_modified, formats):
        """ Writes the podcast data in the best-matching response format """

        self.response.headers.add_header('Vary',
                'Accept, User-Agent, Accept-Encoding')

        format = httputils.select_matching_option(
                ['text/html', 'application/json'], formats)

        if format in (None, 'application/json'): #serve json as default
            content_type = 'application/json'
            content = json.dumps(podcasts, sort_keys=True, indent=None,
                    separators=(',', ':'))
            from email import utils
            import time
            self.response.headers.add_header('Last-Modified',
                    utils.formatdate(time.mktime(last_modified.timetuple())))
        else:
            import cgi
            content_type = 'text/html'
            pretty_json = cgi.escape(
                    json.dumps(podcasts, sort_keys=True, indent=4))
            content = """<html><head>
<link href="static/screen.css" type="text/css" rel="stylesheet" />
<link href="static/prettify.css" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="static/prettify.js"></script>
</head><body onload="prettyPrint()"><h1>HTML Response</h1><p>This response is HTML formatted. To get just the JSON data for processing in your client, <a href="/#accept">send the HTTP Header <em>Accept: application/json</em></a>. <a href="/">Back to the Documentation</a></p><pre class="prettyprint">%s</pre></body></html>""" % pretty_json

        self.response.headers['Content-Type'] = content_type
        self.response.out.write(content)
def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls and returns their JSON
    objects and the latest of their modification dates. RSS-Redirects are
    followed automatically by including both feeds in the result.

    The passed-in feed_urls sequence is left unmodified.
    """

    # work on a copy: the original code appended redirect targets directly
    # to feed_urls, mutating the caller's list as a side effect
    queue = list(feed_urls)

    visited_urls = set()
    result = []
    last_modified = None

    # note: appending to `queue` while iterating it is intentional — it is
    # the worklist for RSS-redirect targets discovered along the way
    for url in queue:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically
        if new and new not in visited_urls and new not in queue:
            queue.append(new)

        # keep the newest modification date over all parsed feeds
        if not last_modified or (last_mod and last_mod > last_modified):
            last_modified = last_mod

        result.append(res)

    return result, last_modified
99 def parse_feed(feed_url, inline_logo, scale_to, logo_format, strip_html, modified, use_cache):
101 Parses a feed and returns its JSON object, a list of urls that refer to
102 this feed, an outgoing redirect and the timestamp of the last modification
103 of the feed
106 import feedparser
107 from httputils import get_redirects
109 podcast = dict()
111 try:
112 feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)
114 except Exception, e:
115 msg = 'could not fetch feed %(feed_url)s: %(msg)s' % \
116 dict(feed_url=feed_url, msg=str(e))
117 add_error(podcast, 'fetch-feed', msg)
118 logging.info(msg)
119 podcast['urls'] = [feed_url]
120 return podcast, [feed_url], None, None
123 if last_modified and modified and last_modified <= modified:
124 return None, None, None, None
126 feed = feedparser.parse(feed_content)
128 PROPERTIES = (
129 ('title', True, lambda: feed.feed.get('title', None)),
130 ('link', False, lambda: feed.feed.get('link', None)),
131 ('description', True, lambda: feed.feed.get('subtitle', None)),
132 ('author', True, lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
133 ('language', False, lambda: feed.feed.get('language', None)),
134 ('urls', False, lambda: get_redirects(feed_url)),
135 ('new_location', False, lambda: feed.feed.get('newlocation', None)),
136 ('logo', False, lambda: get_podcast_logo(feed)),
137 ('logo_data', False, lambda: get_podcast_logo_inline(podcast, inline_logo, modified, size=scale_to, img_format=logo_format)),
138 ('tags', False, lambda: get_feed_tags(feed.feed)),
139 ('hub', False, lambda: get_hub_url(feed.feed)),
140 ('episodes', False, lambda: get_episodes(feed, strip_html)),
141 ('content_types', False, lambda: get_podcast_types(podcast)),
144 for name, is_text, func in PROPERTIES:
145 set_val(podcast, name, func, strip_html and is_text)
147 subscribe_at_hub(podcast)
149 return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
def set_val(obj, name, func, remove_tags=False):
    """ Evaluates func and stores any non-None result under obj[name].

    When remove_tags is set, HTML tags are stripped from the value first.
    """
    from utils import remove_html_tags

    value = func()
    if remove_tags:
        value = remove_html_tags(value)
    if value is not None:
        obj[name] = value
def add_error(feed, key, msg):
    """ Adds an error entry to the feed """

    # create the error container on first use, then record the message
    feed.setdefault('errors', {})[key] = msg
def get_podcast_logo(feed):
    """ Returns the cover-art URL of the feed.

    A YouTube cover (derived from the feed's link) wins over the feed's
    own image element; the image's href is preferred over its url.
    """
    cover_art = None

    image = feed.feed.get('image', None)
    if image is not None:
        for attr in ('href', 'url'):
            candidate = getattr(image, attr, None)
            if candidate:
                cover_art = candidate
                break

    return youtube.get_real_cover(feed.feed.get('link', None)) or cover_art
184 def get_podcast_logo_inline(podcast, inline_logo, modified, **transform_args):
185 """ Fetches the feed's logo and returns its data URI """
187 if not inline_logo:
188 return None
190 logo_url = podcast.get('logo', None)
192 if not logo_url:
193 return None
195 try:
196 return get_data_uri(logo_url, modified, **transform_args)
198 except Exception, e:
199 msg = 'could not fetch feed logo %(logo_url)s: %(msg)s' % \
200 dict(logo_url=logo_url, msg=str(e))
201 add_error(podcast, 'fetch-logo', msg)
202 logging.info(msg)
203 return None
def get_data_uri(url, modified_since, **transform_args):
    """
    Fetches the logo, applies the specified transformations and
    returns the Data URI for the resulting image.

    Returns None if the logo has not changed since modified_since.
    """
    import base64

    url, content, last_modified = urlstore.get_url(url)

    # bug fix: the original compared against the undefined name
    # ``modified`` (NameError); the parameter is ``modified_since``
    if last_modified and modified_since and last_modified <= modified_since:
        return None

    mimetype = get_mimetype(None, url)

    # only run the (expensive) image transformation when any option is set
    if any(transform_args.values()):
        content, mimetype = transform_image(content, mimetype, **transform_args)

    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
def transform_image(content, mimetype, size, img_format):
    """
    Transforms (resizes, converts) the image and returns
    the resulting bytes and mimetype
    """
    from google.appengine.api import images

    encodings = dict(png=images.PNG, jpeg=images.JPEG)

    img = images.Image(content)

    if img_format:
        # convert: the requested format determines the new mimetype
        mimetype = 'image/%s' % img_format
    else:
        # keep the current format; derive it from the mimetype's subtype
        img_format = mimetype[mimetype.find('/')+1:]

    if size:
        # scale down, never up, preserving each dimension's upper bound
        img.resize(min(size, img.width), min(size, img.height))

    content = img.execute_transforms(output_encoding=encodings[img_format])
    return content, mimetype
def get_feed_tags(feed):
    """ Returns the distinct tag names of the feed """
    tags = set()

    for tag in feed.get('tags', []):
        # a term may hold several comma-separated tags
        term = tag['term']
        if term:
            tags.update(part for part in term.split(',') if part)

        if tag['label']:
            tags.add(tag['label'])

    return list(tags)
def get_hub_url(feed):
    """
    Returns the Hub URL as specified by
    http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.3.html#discovery
    """
    for link in feed.get('links', []):
        # the hub link must carry rel="hub" and a non-empty href
        if link.rel == 'hub' and link.get('href', None):
            return link.href

    return None
def get_episodes(feed, strip_html):
    """ Returns the episode objects of the parsed feed.

    Entries without downloadable files are skipped; per-episode number and
    short title are derived from the longest common title prefix.
    """
    episodes = filter(None, [get_episode_metadata(entry, strip_html)
                             for entry in feed.entries])

    # We take all non-empty titles
    titles = filter(None, [episode.get('title', None) for episode in episodes])

    # get the longest common substring
    common_title = utils.longest_substr(titles)

    # but consider only the part up to the first number. Otherwise we risk
    # removing part of the number (eg if a feed contains episodes 100 - 199)
    common_title = re.search(r'^\D*', common_title).group(0)

    for episode in episodes:
        episode.update(get_additional_episode_data(episode, common_title))

    return episodes
def get_episode_metadata(entry, strip_html):
    """ Returns the metadata dict for one feed entry.

    Returns None when the entry carries no downloadable files.
    """
    files = get_episode_files(entry)
    if not files:
        return None

    # (name, is-text, getter) triples; text values are stripped of HTML
    # when strip_html is requested
    PROPERTIES = (
        ('guid',        None,  lambda: entry.get('id', None)),
        ('title',       True,  lambda: entry.get('title', None)),
        ('description', True,  lambda: get_episode_summary(entry)),
        ('link',        False, lambda: entry.get('link', None)),
        ('author',      True,  lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration',    False, lambda: get_duration(entry)),
        ('language',    False, lambda: entry.get('language', None)),
        ('files',       False, lambda: get_files(files)),
        ('released',    False, lambda: get_timestamp(entry)),
    )

    episode = {}
    for name, is_text, func in PROPERTIES:
        set_val(episode, name, func, strip_html and is_text)

    return episode
def get_episode_files(entry):
    """Get the download / episode URL of a feedparser entry

    Returns a dict mapping each URL to a (mimetype, filesize) tuple;
    filesize is None when unknown.
    """

    urls = {}

    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' not in enclosure:
            continue
        mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
        if check_mimetype(mimetype):
            try:
                filesize = int(enclosure['length'])
            # bug fix: the original only caught ValueError; a missing
            # 'length' key (KeyError) or a None value (TypeError) crashed
            except (KeyError, TypeError, ValueError):
                filesize = None
            urls[enclosure['href']] = (mimetype, filesize)

    for media in getattr(entry, 'media_content', []):
        if 'url' not in media:
            continue
        mimetype = get_mimetype(media.get('type', ''), media['url'])
        if check_mimetype(mimetype):
            urls[media['url']] = (mimetype, None)

    for link in getattr(entry, 'links', []):
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
def get_episode_summary(entry):
    """ Returns the first non-empty description-like field of the entry.

    Checks summary, then subtitle, then link; returns None if all are
    missing or empty.
    """
    return (entry.get('summary', None) or
            entry.get('subtitle', None) or
            entry.get('link', None) or
            None)
def get_duration(entry):
    """ Returns the episode duration parsed from itunes_duration, or None """
    from utils import parse_time

    # renamed from ``str`` — the original shadowed the builtin
    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        # unparseable duration string
        return None
def get_files(files):
    """ Converts a url -> (mimetype, filesize) mapping into a list of
    file dicts; mimetype and filesize are omitted when falsy. """
    result = []
    for url, (mimetype, filesize) in files.items():
        entry = dict(url=url)
        if mimetype:
            entry['mimetype'] = mimetype
        if filesize:
            entry['filesize'] = filesize
        result.append(entry)
    return result
def get_timestamp(entry):
    """ Returns the entry's update time as 'YYYY-MM-DDTHH:MM:SS', or None """
    from datetime import datetime
    try:
        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    # bug fix: the original used a bare ``except:`` which hid every error;
    # only missing/malformed date attributes should yield None
    except (AttributeError, TypeError, ValueError):
        return None
def get_additional_episode_data(episode, common_title):
    """
    Returns additional data about an episode that is calculated after
    the first pass over all episodes
    """
    title = episode.get('title', None)

    # derived values based on the common title prefix of all episodes
    PROPERTIES = (
        ('number',      lambda: get_episode_number(title, common_title)),
        ('short_title', lambda: get_short_title(title, common_title)),
    )

    data = {}
    for name, func in PROPERTIES:
        set_val(data, name, func)

    return data
def get_episode_number(title, common_title):
    """
    Returns the first number in the non-repeating part of the episode's title
    """
    if title is None:
        return None

    # drop the common part shared by all titles, then look for a leading
    # number (ignoring any punctuation/whitespace before it)
    remainder = title.replace(common_title, '').strip()
    match = re.search(r'^\W*(\d+)', remainder)

    return int(match.group(1)) if match else None
def get_short_title(title, common_title):
    """
    Returns the non-repeating part of the episode's title
    If an episode number is found, it is removed
    """
    if title is None:
        return None

    # strip the shared prefix, then any leading number/punctuation run
    short = title.replace(common_title, '').strip()
    return re.sub(r'^[\W\d]+', '', short)
445 def subscribe_at_hub(feed):
446 """ Tries to subscribe to the feed if it contains a hub URL """
448 if not feed.get('hub', False):
449 return
451 import pubsubhubbub
453 # use the last URL in the redirect chain
454 feed_url = feed['urls'][-1]
456 hub_url = feed.get('hub')
458 try:
459 pubsubhubbub.Subscriber.subscribe(feed_url, hub_url)
460 except pubsubhubbub.SubscriptionError, e:
461 add_error(feed, 'hub-subscription', repr(e))