improve extraction of episode's short_title,number
[mygpo-feedservice.git] / feedservice / feeddownloader.py
blob1e54f3f1d06dd8c612201383aeffefbb4d9e599b
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
5 import re
7 import urlstore
8 import youtube
9 from mimetype import get_mimetype, check_mimetype, get_podcast_types
10 import utils
def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls and returns their JSON
    objects and the latest of their modification dates. RSS-Redirects are
    followed automatically by including both feeds in the result.
    """

    visited_urls = set()
    parsed_feeds = []
    latest_mod = None

    # NOTE: feed_urls may grow while we iterate over it -- RSS-redirect
    # targets are appended below and picked up by this same loop.
    for url in feed_urls:
        feed, visited, redirect, mod = parse_feed(url, *args, **kwargs)

        if not feed:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically, unless the target was
        # already visited or requested
        if redirect and redirect not in (list(visited_urls) + feed_urls):
            feed_urls.append(redirect)

        # keep the most recent modification date of all parsed feeds
        if mod and (not latest_mod or mod > latest_mod):
            latest_mod = mod

        parsed_feeds.append(feed)

    return parsed_feeds, latest_mod
def parse_feed(feed_url, inline_logo, scale_to, logo_format, strip_html, modified, use_cache):
    """
    Parses a feed and returns its JSON object, a list of urls that refer to
    this feed, an outgoing redirect and the timestamp of the last modification
    of the feed
    """
    import feedparser
    from httputils import get_redirects

    feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)

    # nothing changed since the client's last fetch
    if last_modified and modified and last_modified <= modified:
        return None, None, None, None

    feed = feedparser.parse(feed_content)

    podcast = {}

    # (name, is-text-field, getter) triples; the order matters: 'logo'
    # must be computed before 'logo_data' and 'episodes' before
    # 'content_types', because the later getters read earlier values
    # from `podcast` via their closures.
    PROPERTIES = (
        ('title', True, lambda: feed.feed.get('title', None)),
        ('link', False, lambda: feed.feed.get('link', None)),
        ('description', True, lambda: feed.feed.get('subtitle', None)),
        ('author', True, lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
        ('language', False, lambda: feed.feed.get('language', None)),
        ('urls', False, lambda: get_redirects(feed_url)),
        ('new_location', False, lambda: feed.feed.get('newlocation', None)),
        ('logo', False, lambda: get_podcast_logo(feed)),
        ('logo_data', False, lambda: get_data_uri(inline_logo, podcast.get('logo', None), modified, size=scale_to, img_format=logo_format)),
        ('tags', False, lambda: get_feed_tags(feed.feed)),
        ('episodes', False, lambda: get_episodes(feed, strip_html)),
        ('content_types', False, lambda: get_podcast_types(podcast)),
    )

    for name, is_text, getter in PROPERTIES:
        set_val(podcast, name, getter, strip_html and is_text)

    return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
def set_val(obj, name, func, remove_tags=False):
    """
    Evaluates func() and stores the result in obj[name].

    None results are skipped (the key is not set). If remove_tags is
    True, HTML tags are stripped from the value first.
    """
    val = func()

    if remove_tags:
        # imported lazily so the project import is only paid when
        # tag-stripping is actually requested (it was previously
        # executed on every call, even with remove_tags=False)
        from utils import remove_html_tags
        val = remove_html_tags(val)

    if val is not None:
        obj[name] = val
def get_podcast_logo(feed):
    """Returns the cover-art URL of a feedparser feed, or None."""
    logo = None

    image = feed.feed.get('image', None)
    if image is not None:
        # feedparser exposes the image URL under either 'href' or 'url'
        for attr in ('href', 'url'):
            logo = getattr(image, attr, None)
            if logo:
                break

    # a YouTube channel cover (when the feed links to one) wins over
    # the feed's own image
    return youtube.get_real_cover(feed.feed.get('link', None)) or logo
def get_data_uri(inline_logo, url, modified_since, **transform_args):
    """
    Fetches the logo, applies the specified transformations and
    returns the Data URI for the resulting image.

    Returns None if inlining was not requested, no URL is given, or the
    image has not been modified since `modified_since`.
    """
    import base64

    if None in (inline_logo, url):
        return None

    url, content, last_modified = urlstore.get_url(url)

    # BUG FIX: this previously compared against the undefined name
    # `modified` (NameError whenever both timestamps were set)
    if last_modified and modified_since and last_modified <= modified_since:
        return None

    mimetype = get_mimetype(None, url)

    if any(transform_args.values()):
        content, mimetype = transform_image(content, mimetype, **transform_args)

    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
def transform_image(content, mimetype, size, img_format):
    """
    Transforms (resizes, converts) the image and returns
    the resulting bytes and mimetype
    """
    from google.appengine.api import images

    encodings = dict(png=images.PNG, jpeg=images.JPEG)

    img = images.Image(content)

    if img_format:
        # converting: the new mimetype follows the requested format
        mimetype = 'image/%s' % img_format
    else:
        # keeping the format: derive the encoder name from the mimetype
        img_format = mimetype[mimetype.find('/') + 1:]

    if size:
        # scale down to fit, never enlarging either dimension
        img.resize(min(size, img.width), min(size, img.height))

    content = img.execute_transforms(output_encoding=encodings[img_format])
    return content, mimetype
def get_feed_tags(feed):
    """
    Returns the deduplicated tags of a feedparser feed.

    Comma-separated 'term' values are split into individual tags.
    Missing 'term' / 'label' keys are tolerated (they previously raised
    KeyError, which real-world feeds trigger regularly).
    """
    tags = []

    for tag in feed.get('tags', []):
        term = tag.get('term', None)
        if term:
            tags.extend(filter(None, term.split(',')))

        label = tag.get('label', None)
        if label:
            tags.append(label)

    return list(set(tags))
def get_episodes(feed, strip_html):
    """Parses all entries of a feedparser feed into episode dicts."""
    parsed = (get_episode_metadata(entry, strip_html) for entry in feed.entries)
    episodes = [e for e in parsed if e]

    # We take all non-empty titles
    titles = [e['title'] for e in episodes if e.get('title', None)]

    # get the longest common substring
    common_title = utils.longest_substr(titles)

    # but consider only the part up to the first number. Otherwise we risk
    # removing part of the number (eg if a feed contains episodes 100 - 199)
    common_title = re.search(r'^\D*', common_title).group(0)

    # second pass: data that depends on all episodes (number, short title)
    for episode in episodes:
        episode.update(get_additional_episode_data(episode, common_title))

    return episodes
def get_episode_metadata(entry, strip_html):
    """
    Parses a feedparser entry into an episode dict.

    Returns None when the entry carries no downloadable files.
    """
    files = get_episode_files(entry)
    if not files:
        return None

    # (name, is-text-field, getter) triples; text fields may have their
    # HTML stripped depending on strip_html
    PROPERTIES = (
        ('guid', None, lambda: entry.get('id', None)),
        ('title', True, lambda: entry.get('title', None)),
        ('description', True, lambda: get_episode_summary(entry)),
        ('link', False, lambda: entry.get('link', None)),
        ('author', True, lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration', False, lambda: get_duration(entry)),
        ('language', False, lambda: entry.get('language', None)),
        ('files', False, lambda: get_files(files)),
        ('released', False, lambda: get_timestamp(entry)),
    )

    episode = {}
    for name, is_text, getter in PROPERTIES:
        set_val(episode, name, getter, strip_html and is_text)

    return episode
def get_episode_files(entry):
    """Get the download / episode URLs of a feedparser entry.

    Returns a dict mapping each URL to a (mimetype, filesize) tuple;
    filesize is None when unknown.
    """
    urls = {}

    # enclosures are the primary source of episode files
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                except (KeyError, TypeError, ValueError):
                    # 'length' missing, None, or non-numeric -- previously
                    # only ValueError was caught, so a missing attribute
                    # crashed the parse
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    # Media-RSS content as a secondary source
    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    # plain links that point to YouTube videos
    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
def get_episode_summary(entry):
    """Returns the first non-empty summary-like field of an entry, or None."""
    # preference order: summary, then subtitle, then the entry's link;
    # the trailing `or None` maps an all-falsy result back to None
    return (entry.get('summary', None) or
            entry.get('subtitle', None) or
            entry.get('link', None) or
            None)
def get_duration(entry):
    """Returns the entry's iTunes duration in seconds, or None if unparseable."""
    from utils import parse_time

    # renamed from `str`, which shadowed the builtin
    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        return None
def get_files(files):
    """
    Converts a {url: (mimetype, filesize)} dict into a list of file dicts.

    Each dict carries 'url' plus 'mimetype'/'filesize' when those values
    are truthy. (The loop variable was renamed from `file`, which
    shadowed a builtin.)
    """
    file_list = []
    for url, (mimetype, filesize) in files.items():
        f = dict(url=url)
        if mimetype:
            f['mimetype'] = mimetype
        if filesize:
            f['filesize'] = filesize
        file_list.append(f)
    return file_list
def get_timestamp(entry):
    """Returns the entry's updated timestamp as 'YYYY-MM-DDTHH:MM:SS', or None."""
    from datetime import datetime
    try:
        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except (AttributeError, TypeError, ValueError):
        # updated_parsed missing, None, or malformed -- previously a bare
        # except also swallowed KeyboardInterrupt/SystemExit
        return None
def get_additional_episode_data(episode, common_title):
    """
    Returns additional data about an episode that is calculated after
    the first pass over all episodes
    """
    title = episode.get('title', None)

    data = {}
    # set_val skips None results, so missing values leave no key behind
    set_val(data, 'number', lambda: get_episode_number(title, common_title))
    set_val(data, 'short_title', lambda: get_short_title(title, common_title))
    return data
def get_episode_number(title, common_title):
    """
    Returns the first number in the non-repeating part of the episode's
    title, or None if there is none.
    """
    if title is None:
        return None

    # drop the part shared by all episode titles, then look for a number
    # right at the start (possibly preceded by punctuation/whitespace)
    remainder = title.replace(common_title, '').strip()
    match = re.match(r'\W*(\d+)', remainder)
    return int(match.group(1)) if match else None
def get_short_title(title, common_title):
    """
    Returns the non-repeating part of the episode's title.
    If an episode number is found, it is removed.
    """
    if title is None:
        return None

    # drop the shared prefix, then strip any leading digits/punctuation
    # (i.e. the episode number and its separators)
    remainder = title.replace(common_title, '').strip()
    return re.sub(r'^[\W\d]+', '', remainder)