option strip_html to remove HTML tags from text fields
[mygpo-feedservice.git] / feedservice / feeddownloader.py
blob b616f5a1b1543fb4c19236f87b859005998df4a2
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'

import os
import sys
import datetime
import hashlib
import urllib2
import base64
#import socket

from google.appengine.api import images

import feedcore
from utils import parse_time, remove_html_tags
import youtube
from mimetype import get_mimetype, check_mimetype, get_podcast_types
from urls import get_redirects

#socket.setdefaulttimeout(10)
fetcher = feedcore.Fetcher(USER_AGENT)

def get_episode_files(entry):
    """Get the download / episode URL of a feedparser entry"""

    urls = {}
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                except (KeyError, ValueError):
                    # the enclosure length is optional and not always numeric
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls

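# Illustrative only: for an entry with a single MP3 enclosure,
# get_episode_files() returns a dict mapping each file URL to a
# (mimetype, filesize) tuple -- the URL and size below are made up:
#
#   {'http://example.com/episode1.mp3': ('audio/mpeg', 1234567)}
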
def get_episode_summary(entry):
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''

def get_duration(entry):
    duration = entry.get('itunes_duration', '')

    try:
        return parse_time(duration)
    except ValueError:
        return 0

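# Illustrative only: assuming utils.parse_time() converts an
# <itunes:duration> value such as '1:02:30' into seconds, an entry with
# that duration would yield 3750; entries without a parseable duration
# yield 0.
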
def get_feed_tags(feed):
    tags = []

    for tag in feed.get('tags', []):
        if tag['term']:
            tags.extend([t for t in tag['term'].split(',') if t])

        if tag['label']:
            tags.append(tag['label'])

    return set(tags)

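# Illustrative only: a feed whose tag list contains the term 'news,tech'
# and the label 'Technology' (made-up values) would yield
# set(['news', 'tech', 'Technology']).
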
def update_feed_tags(podcast, tags):
    src = 'feed'

    #delete all tags not found in the feed anymore
    #PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete()

    #create new found tags
    #for tag in tags:
    #    if not PodcastTag.objects.filter(podcast=podcast, source=src, tag=tag).exists():
    #        PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag)

def get_episode_metadata(entry, files, strip_html):
    d = {
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'language': entry.get('language', ''),
        'files': [dict(url=k, mimetype=v[0], filesize=v[1]) for (k, v) in files.items()],
    }
    try:
        d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except Exception:
        # updated_parsed may be missing or unparsed
        d['timestamp'] = None

    if strip_html:
        for x in ('title', 'description', 'author'):
            d[x] = remove_html_tags(d[x])

    return d

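# Illustrative only: all values below are made up; a typical entry yields
# a dict of the following shape:
#
#   {'title': 'Episode 1',
#    'description': 'Short summary',
#    'link': 'http://example.com/episode1',
#    'author': 'Jane Doe',
#    'duration': 3750,
#    'language': 'en',
#    'files': [{'url': 'http://example.com/episode1.mp3',
#               'mimetype': 'audio/mpeg',
#               'filesize': 1234567}],
#    'timestamp': '2010-01-01T00:00:00'}
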
def parse_feeds(feed_urls, *args, **kwargs):
    """Parse each feed URL; return the parsed feeds and the latest
    modification date"""
    visited_urls = set()
    result = []
    last_modified = None

    for url in feed_urls:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically; appending to feed_urls
        # while iterating makes the loop pick up the new URL later
        if new and new not in (list(visited_urls) + feed_urls):
            feed_urls.append(new)

        # in Python 2, None compares smaller than any datetime
        if last_mod > last_modified:
            last_modified = last_mod

        result.append(res)

    return result, last_modified

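# Illustrative only: a hypothetical caller (the feed URL and option values
# are made up) would fetch one or more feeds like this:
#
#   podcasts, last_modified = parse_feeds(
#       ['http://example.com/feed.xml'],
#       inline_logo=False, scale_to=None, strip_html=True, modified=None)
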
def parse_feed(feed_url, inline_logo, scale_to, strip_html, modified):
    try:
        # pass the cached modification date so the fetcher can raise
        # NotModified (assumes feedcore.Fetcher.fetch accepts a 'modified'
        # keyword, as gPodder's feedcore does)
        fetcher.fetch(feed_url, modified=modified)
    except feedcore.NotModified:
        return None, None, None, None

    except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin, feedcore.AuthenticationRequired):
        return None, None, None, None
    except feedcore.NewLocation, location:
        # follow the permanent redirect with all original options
        return parse_feed(location.data, inline_logo, scale_to, strip_html, modified)
    except feedcore.UpdatedFeed, updated:
        feed = updated.data
        podcast = dict()
        podcast['title'] = feed.feed.get('title', '')
        podcast['link'] = feed.feed.get('link', '')
        podcast['description'] = feed.feed.get('subtitle', '')
        podcast['author'] = feed.feed.get('author', feed.feed.get('itunes_author', ''))
        podcast['language'] = feed.feed.get('language', '')

        if strip_html:
            for x in ('title', 'description', 'author'):
                podcast[x] = remove_html_tags(podcast[x])

        urls = get_redirects(feed_url)
        podcast['urls'] = urls

        if 'newlocation' in feed.feed:
            new_location = feed.feed.newlocation
            podcast['new_location'] = new_location
        else:
            new_location = ''

        logo_url = get_podcast_logo(feed)
        podcast['logo'] = logo_url
        if inline_logo and logo_url:
            podcast['logo_data'] = get_data_uri(logo_url, scale_to)

        #update_feed_tags(podcast, get_feed_tags(feed.feed))

        podcast['episodes'] = []
        for entry in feed.entries:
            # episode files; 'urls' above holds the feed's redirect chain
            files = get_episode_files(entry)
            if not files:
                continue

            e = get_episode_metadata(entry, files, strip_html)
            podcast['episodes'].append(e)

        podcast['content_types'] = get_podcast_types(podcast)
    except Exception, e:
        print >>sys.stderr, 'Exception:', e
        return None, None, None, None

    return podcast, urls, new_location, feed.modified

def get_podcast_logo(feed):
    cover_art = None
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    # for YouTube feeds, the real channel cover overrides the feed image
    yturl = youtube.get_real_cover(feed.feed.link)
    if yturl:
        cover_art = yturl

    return cover_art

def get_data_uri(url, size=None):
    content = urllib2.urlopen(url).read()

    if size:
        # fit the image into a size x size box, never enlarging it
        img = images.Image(content)
        content = images.resize(content, min(size, img.width), min(size, img.height))

    mimetype = get_mimetype(None, url)
    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)

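# Illustrative only: for a hypothetical PNG logo at
# http://example.com/logo.png, get_data_uri() returns a string such as
#
#   data:image/png;base64,iVBORw0KGgo...
#
# (base64 payload abbreviated).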