Subscribe to Feeds that contain Pubsubhubbub-Hubs
[mygpo-feedservice.git] / feedservice / feeddownloader.py
blobd0d38394eb6a8443408f855753a6b4cd630d24c6
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
5 import re, urllib
6 import simplejson as json
8 from google.appengine.ext import webapp
10 import urlstore, httputils, youtube, utils
11 from mimetype import get_mimetype, check_mimetype, get_podcast_types
class Parser(webapp.RequestHandler):
    """ Parser Endpoint

    Parses the feeds given as ``url`` parameters and returns their
    metadata either as JSON or as an HTML-wrapped, pretty-printed JSON
    page, depending on the request's Accept header.
    """

    def post(self):
        # POST is handled identically to GET
        return self.get()

    def get(self):
        """ Parses the requested feeds and sends back the result """
        urls = map(urllib.unquote, self.request.get_all('url'))

        # option flags (0/1) controlling logo inlining and HTML stripping
        inline_logo = self.request.get_range('inline_logo', 0, 1, default=0)
        scale_to = self.request.get_range('scale_logo', 0, 1, default=0)
        logo_format = self.request.get('logo_format')
        strip_html = self.request.get_range('strip_html', 0, 1, default=0)
        use_cache = self.request.get_range('use_cache', 0, 1, default=1)
        modified = self.request.headers.get('If-Modified-Since', None)
        accept = self.request.headers.get('Accept', 'application/json')

        if not urls:
            self.response.set_status(400)
            self.response.out.write('parameter url missing')
            return

        podcasts, last_modified = parse_feeds(urls, inline_logo, scale_to,
                logo_format, strip_html, modified, use_cache)
        self.send_response(podcasts, last_modified, accept)

    def send_response(self, podcasts, last_modified, formats):
        """ Writes the podcast data in the best-matching format

        formats is the value of the request's Accept header; JSON is
        served when no supported format matches.
        """
        self.response.headers.add_header('Vary', 'Accept, User-Agent, Accept-Encoding')

        # renamed from 'format' to avoid shadowing the builtin
        fmt = httputils.select_matching_option(['text/html', 'application/json'], formats)

        if fmt in (None, 'application/json'): #serve json as default
            content_type = 'application/json'
            content = json.dumps(podcasts, sort_keys=True, indent=None, separators=(',', ':'))
            # aliased import so it does not shadow the module-level 'utils'
            from email import utils as email_utils
            import time
            self.response.headers.add_header('Last-Modified', email_utils.formatdate(time.mktime(last_modified.timetuple())))

        else:
            import cgi
            content_type = 'text/html'
            pretty_json = json.dumps(podcasts, sort_keys=True, indent=4)
            pretty_json = cgi.escape(pretty_json)
            content = """<html><head>
<link href="static/screen.css" type="text/css" rel="stylesheet" />
<link href="static/prettify.css" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="static/prettify.js"></script>
</head><body onload="prettyPrint()"><h1>HTML Response</h1><p>This response is HTML formatted. To get just the JSON data for processing in your client, <a href="/#accept">send the HTTP Header <em>Accept: application/json</em></a>. <a href="/">Back to the Documentation</a></p><pre class="prettyprint">%s</pre></body></html>""" % pretty_json

        self.response.headers['Content-Type'] = content_type
        self.response.out.write(content)
def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls and returns their JSON
    objects and the latest of their modification dates. RSS-Redirects are
    followed automatically by including both feeds in the result.
    """
    visited_urls = set()
    result = []
    last_modified = None

    # work on a copy so the caller's list is not mutated when
    # RSS-redirect targets are appended below
    queue = list(feed_urls)

    for url in queue:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically
        if new and new not in visited_urls and new not in queue:
            queue.append(new)

        # keep the newest modification date across all parsed feeds
        if not last_modified or (last_mod and last_mod > last_modified):
            last_modified = last_mod

        result.append(res)

    return result, last_modified
def parse_feed(feed_url, inline_logo, scale_to, logo_format, strip_html, modified, use_cache):
    """
    Parses a feed and returns its JSON object, a list of urls that refer to
    this feed, an outgoing redirect and the timestamp of the last modification
    of the feed
    """
    import feedparser
    from httputils import get_redirects

    feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)

    # the client already has the current version - nothing to return
    if last_modified and modified and last_modified <= modified:
        return None, None, None, None

    feed = feedparser.parse(feed_content)

    podcast = dict()

    # (name, is-text, getter) triples; text properties are HTML-stripped
    # when requested. Order matters: 'logo_data' reads podcast['logo'] and
    # 'content_types' reads the previously filled podcast dict.
    PROPERTIES = (
        ('title', True, lambda: feed.feed.get('title', None)),
        ('link', False, lambda: feed.feed.get('link', None)),
        ('description', True, lambda: feed.feed.get('subtitle', None)),
        ('author', True, lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
        ('language', False, lambda: feed.feed.get('language', None)),
        ('urls', False, lambda: get_redirects(feed_url)),
        ('new_location', False, lambda: feed.feed.get('newlocation', None)),
        ('logo', False, lambda: get_podcast_logo(feed)),
        ('logo_data', False, lambda: get_data_uri(inline_logo, podcast.get('logo', None), modified, size=scale_to, img_format=logo_format)),
        ('tags', False, lambda: get_feed_tags(feed.feed)),
        ('hub', False, lambda: get_hub_url(feed.feed)),
        ('episodes', False, lambda: get_episodes(feed, strip_html)),
        ('content_types', False, lambda: get_podcast_types(podcast)),
    )

    for name, is_text, func in PROPERTIES:
        set_val(podcast, name, func, strip_html and is_text)

    subscribe_at_hub(podcast)

    return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
def set_val(obj, name, func, remove_tags=False):
    """
    Evaluates func and stores its result in obj[name], unless the result
    is None. HTML tags are removed from the value when remove_tags is set.
    """
    val = func()
    if remove_tags:
        # imported lazily - only needed when HTML-stripping was requested
        from utils import remove_html_tags
        val = remove_html_tags(val)
    if val is not None:
        obj[name] = val
def add_error(feed, key, msg):
    """ Adds an error entry to the feed """
    # create the error container on first use
    errors = feed.setdefault('errors', {})
    errors[key] = msg
def get_podcast_logo(feed):
    """ Returns the cover-art URL of the parsed feed, if any

    A YouTube-specific cover (when the feed link points to YouTube)
    takes precedence over the feed's own image.
    """
    cover_art = None

    image = feed.feed.get('image', None)
    if image is not None:
        # feedparser may expose the image URL under either attribute
        for attr in ('href', 'url'):
            candidate = getattr(image, attr, None)
            if candidate:
                cover_art = candidate
                break

    return youtube.get_real_cover(feed.feed.get('link', None)) or cover_art
def get_data_uri(inline_logo, url, modified_since, **transform_args):
    """
    Fetches the logo, applies the specified transformations and
    returns the Data URI for the resulting image

    Returns None when no inline logo was requested, no URL is given,
    or the image has not changed since modified_since.
    """
    import base64

    if not inline_logo or not url:
        return None

    url, content, last_modified = urlstore.get_url(url)

    # bugfix: this previously compared against the undefined name
    # 'modified' instead of the 'modified_since' parameter (NameError)
    if last_modified and modified_since and last_modified <= modified_since:
        return None

    mimetype = get_mimetype(None, url)

    # only transform when at least one transformation arg is set
    if any(transform_args.values()):
        content, mimetype = transform_image(content, mimetype, **transform_args)

    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
def transform_image(content, mimetype, size, img_format):
    """
    Transforms (resizes, converts) the image and returns
    the resulting bytes and mimetype
    """
    from google.appengine.api import images

    encodings = dict(png=images.PNG, jpeg=images.JPEG)

    img = images.Image(content)

    if img_format:
        # an explicit output format was requested - adjust the mimetype
        mimetype = 'image/%s' % img_format
    else:
        # keep the current format, derived from the mimetype's subtype
        img_format = mimetype[mimetype.find('/') + 1:]

    if size:
        # scale down only - never enlarge beyond the original dimensions
        img.resize(min(size, img.width), min(size, img.height))

    content = img.execute_transforms(output_encoding=encodings[img_format])
    return content, mimetype
def get_feed_tags(feed):
    """ Returns the deduplicated tags of the feed

    Tags come from the comma-separated 'term' values and the 'label'
    values of the feed's tag entries.
    """
    tags = set()

    for tag in feed.get('tags', []):
        # use .get() - 'term'/'label' may be absent from a tag entry
        term = tag.get('term')
        if term:
            # a term may contain several comma-separated tags
            tags.update(filter(None, term.split(',')))

        label = tag.get('label')
        if label:
            tags.add(label)

    return list(tags)
def get_hub_url(feed):
    """
    Returns the Hub URL as specified by
    http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.3.html#discovery
    """
    for link in feed.get('links', []):
        # only links with rel="hub" and a non-empty href qualify
        if link.rel != 'hub':
            continue
        if link.get('href', None):
            return link.href
    return None
def get_episodes(feed, strip_html):
    """ Returns the parsed episodes of the feed

    After the first pass, data that depends on all episodes (episode
    number, short title) is added to each episode.
    """
    # materialize as a list: the episodes are iterated several times
    # below (filter/map laziness would break this under Python 3)
    episodes = [e for e in
                (get_episode_metadata(entry, strip_html) for entry in feed.entries)
                if e]

    # We take all non-empty titles
    titles = [t for t in (e.get('title', None) for e in episodes) if t]

    # get the longest common substring
    common_title = utils.longest_substr(titles)

    # but consider only the part up to the first number. Otherwise we risk
    # removing part of the number (eg if a feed contains episodes 100 - 199)
    common_title = re.search(r'^\D*', common_title).group(0)

    for episode in episodes:
        episode.update(get_additional_episode_data(episode, common_title))

    return episodes
def get_episode_metadata(entry, strip_html):
    """ Parses a feedparser entry into an episode dict

    Returns None when the entry contains no usable files.
    Text properties are HTML-stripped when strip_html is set.
    """
    files = get_episode_files(entry)
    if not files:
        return None

    # (name, is-text, getter) triples, analogous to parse_feed
    PROPERTIES = (
        ('guid', None, lambda: entry.get('id', None)),
        ('title', True, lambda: entry.get('title', None)),
        ('description', True, lambda: get_episode_summary(entry)),
        ('link', False, lambda: entry.get('link', None)),
        ('author', True, lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration', False, lambda: get_duration(entry)),
        ('language', False, lambda: entry.get('language', None)),
        ('files', False, lambda: get_files(files)),
        ('released', False, lambda: get_timestamp(entry)),
    )

    episode = {}
    for name, is_text, func in PROPERTIES:
        set_val(episode, name, func, strip_html and is_text)

    return episode
def get_episode_files(entry):
    """Get the download / episode URL of a feedparser entry

    Returns a dict mapping each URL to a (mimetype, filesize) tuple;
    filesize is None when unknown.
    """
    urls = {}

    # regular RSS enclosures
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                # bugfix: 'length' may be missing (KeyError) or None
                # (TypeError), not only non-numeric (ValueError)
                except (KeyError, TypeError, ValueError):
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    # Media-RSS content
    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    # plain links that point to YouTube videos
    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
def get_episode_summary(entry):
    """ Returns the first non-empty summary-like field of the entry

    Checks 'summary', 'subtitle' and 'link', in that order.
    """
    candidates = (entry.get(key, None) for key in ('summary', 'subtitle', 'link'))
    return next((value for value in candidates if value), None)
def get_duration(entry):
    """ Returns the entry's duration in seconds, or None if unparseable """
    from utils import parse_time

    # renamed from 'str' to avoid shadowing the builtin
    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        return None
def get_files(files):
    """ Converts a {url: (mimetype, filesize)} dict into a list of dicts

    mimetype and filesize are omitted from an entry when falsy.
    """
    file_list = []
    for url, (mimetype, filesize) in files.items():
        # renamed from 'file' to avoid shadowing the builtin
        entry = dict(url=url)
        if mimetype:
            entry['mimetype'] = mimetype
        if filesize:
            entry['filesize'] = filesize
        file_list.append(entry)
    return file_list
def get_timestamp(entry):
    """ Returns the entry's update time as ISO-8601 string, or None """
    from datetime import datetime
    try:
        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    # narrowed from a bare except; the timestamp is best-effort, so any
    # missing/malformed 'updated_parsed' yields None
    except Exception:
        return None
def get_additional_episode_data(episode, common_title):
    """
    Returns additional data about an episode that is calculated after
    the first pass over all episodes
    """
    PROPERTIES = (
        ('number', lambda: get_episode_number(episode.get('title', None), common_title)),
        ('short_title', lambda: get_short_title(episode.get('title', None), common_title)),
    )

    data = {}
    for name, func in PROPERTIES:
        set_val(data, name, func)

    return data
def get_episode_number(title, common_title):
    """
    Returns the first number in the non-repeating part of the episode's title
    """
    if title is None:
        return None

    # drop the part shared by all episode titles, then look for a
    # leading number (possibly preceded by punctuation/whitespace)
    unique_part = title.replace(common_title, '').strip()
    match = re.search(r'^\W*(\d+)', unique_part)
    return int(match.group(1)) if match else None
def get_short_title(title, common_title):
    """
    Returns the non-repeating part of the episode's title
    If an episode number is found, it is removed
    """
    if title is None:
        return None

    # strip the shared title part, then any leading
    # punctuation/digits (the episode number)
    unique_part = title.replace(common_title, '').strip()
    return re.sub(r'^[\W\d]+', '', unique_part)
def subscribe_at_hub(feed):
    """ Tries to subscribe to the feed if it contains a hub URL

    Subscription failures are recorded as an error on the feed rather
    than raised, since subscribing is best-effort.
    """
    if not feed.get('hub', False):
        return

    import pubsubhubbub

    # use the last URL in the redirect chain
    feed_url = feed['urls'][-1]

    hub_url = feed.get('hub')

    try:
        pubsubhubbub.Subscriber.subscribe(feed_url, hub_url)
    # fixed the Python-2-only 'except E, e' syntax (valid from Py2.6 on)
    except pubsubhubbub.SubscriptionError as e:
        add_error(feed, 'hub-subscription', repr(e))