#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'


import os
import sys
import datetime
import hashlib
import urllib2
import base64
#import socket

from google.appengine.api import images

import feedcore
from utils import parse_time
import youtube
from mimetype import get_mimetype, check_mimetype, get_podcast_types
from urls import get_redirects

#socket.setdefaulttimeout(10)
fetcher = feedcore.Fetcher(USER_AGENT)


def get_episode_files(entry):
    """Get the download / episode URLs of a feedparser entry"""

    urls = {}
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                except (KeyError, ValueError):
                    # enclosures may omit the length attribute or carry a
                    # non-numeric value
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
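# Illustrative example (values made up, not taken from any real feed): for an
# entry with a single MP3 enclosure, get_episode_files() returns a mapping of
# file URL to (mimetype, filesize), e.g.
#   {'http://example.com/episode1.mp3': ('audio/mpeg', 12345678)}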


def get_episode_summary(entry):
    """Return the first non-empty summary-like field of the entry"""
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''


def get_duration(entry):
    """Parse the duration (itunes:duration) of the entry"""
    duration = entry.get('itunes_duration', '')

    try:
        return parse_time(duration)
    except ValueError:
        return 0
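# Example, assuming utils.parse_time() converts "HH:MM:SS"-style strings (as
# found in <itunes:duration>) into seconds: an entry with
# itunes_duration == '1:02:30' yields 3750; an unparsable value falls back to 0.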


def get_feed_tags(feed):
    """Collect the tags of a parsed feed as a set of strings"""
    tags = []

    for tag in feed.get('tags', []):
        if tag['term']:
            tags.extend([t for t in tag['term'].split(',') if t])

        if tag['label']:
            tags.append(tag['label'])

    return set(tags)
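# Example: a feedparser tag list such as
#   [{'term': 'Technology,Podcasting', 'scheme': None, 'label': 'Tech'}]
# yields set(['Technology', 'Podcasting', 'Tech']): comma-separated terms are
# split, and the optional label is kept as an additional tag.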


def update_feed_tags(podcast, tags):
    # Note: tag storage is disabled here; the PodcastTag code below is
    # commented out, so this function is currently a no-op.
    src = 'feed'

    #delete all tags not found in the feed anymore
    #PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete()

    #create new found tags
    #for tag in tags:
    #    if not PodcastTag.objects.filter(podcast=podcast, source=src, tag=tag).exists():
    #        PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag)


def get_episode_metadata(entry, files):
    """Build the metadata dict for a single feed entry and its files"""
    d = {
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'timestamp': None,
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'language': entry.get('language', ''),
        'files': [dict(url=k, mimetype=v[0], filesize=v[1]) for (k, v) in files.items()],
        'url': files.keys()[0],
        'filesize': files.values()[0][1],
        'mimetype': files.values()[0][0],
    }
    try:
        d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except:
        d['timestamp'] = None

    return d


def parse_feeds(feed_urls, *args, **kwargs):
    """Parse several feeds and follow RSS new-location redirects

    A new location announced by a feed is appended to the list of feed URLs,
    so redirected feeds are fetched and parsed within the same call.
    """
    visited_urls = set()
    result = []

    for url in feed_urls:
        res, visited, new = parse_feed(url, *args, **kwargs)

        visited_urls.update(visited)

        # we follow RSS-redirects automatically
        if new and new not in (list(visited_urls) + feed_urls):
            feed_urls.append(new)

        result.append(res)

    return result
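# Usage sketch (the feed URL is a placeholder; the real caller passes the
# parameters it received from the web request):
#   podcasts = parse_feeds(['http://example.com/feed.xml'],
#                          inline_logo=False, scale_to=None)
# Each element of `podcasts` is the dict built by parse_feed() below; a feed
# that announces a new location causes that location to be queued and parsed
# within the same call.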


def parse_feed(feed_url, inline_logo, scale_to):
    # defaults, returned if fetching fails without an UpdatedFeed result
    podcast = None
    urls = [feed_url]
    new_location = ''

    try:
        # feedcore reports its results by raising exceptions
        fetcher.fetch(feed_url)

    except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin, feedcore.AuthenticationRequired):
        pass

    except feedcore.NewLocation, location:
        return parse_feed(location.data, inline_logo, scale_to)

    except feedcore.UpdatedFeed, updated:
        feed = updated.data
        podcast = dict()
        podcast['title'] = feed.feed.get('title', '')
        podcast['link'] = feed.feed.get('link', '')
        podcast['description'] = feed.feed.get('subtitle', '')
        podcast['author'] = feed.feed.get('author', feed.feed.get('itunes_author', ''))
        podcast['language'] = feed.feed.get('language', '')

        urls = get_redirects(feed_url)
        podcast['urls'] = urls

        if 'newlocation' in feed.feed:
            new_location = feed.feed.newlocation
            podcast['new_location'] = new_location
        else:
            new_location = ''

        logo_url = get_podcast_logo(feed)
        podcast['logo'] = logo_url
        if inline_logo and logo_url:
            podcast['logo_data'] = get_data_uri(logo_url, scale_to)

        #update_feed_tags(podcast, get_feed_tags(feed.feed))

        podcast['episodes'] = []
        for entry in feed.entries:
            files = get_episode_files(entry)
            if not files:
                continue

            e = get_episode_metadata(entry, files)
            podcast['episodes'].append(e)

        podcast['content_types'] = get_podcast_types(podcast)

    except Exception, e:
        print >>sys.stderr, 'Exception:', e

    return podcast, urls, new_location
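# Result shape: on success parse_feed() returns (podcast_dict, urls, new_location),
# where podcast_dict carries 'title', 'link', 'description', 'author', 'language',
# 'urls', 'logo' (optionally 'logo_data' and 'new_location'), 'episodes' and
# 'content_types'; urls lists the redirects visited for feed_url, and
# new_location is '' unless the feed announced a permanent new address.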


def get_podcast_logo(feed):
    """Return the cover art URL of a parsed feed, if any"""
    cover_art = None
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    link = feed.feed.get('link', None)
    if link:
        yturl = youtube.get_real_cover(link)
        if yturl:
            cover_art = yturl

    return cover_art
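# Example: for an ordinary feed the image's href/url attribute is used; for a
# YouTube channel feed, youtube.get_real_cover() is expected to return the
# channel's cover URL, which then takes precedence over the feed image.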


def get_data_uri(url, size=None):
    """Download the image at url and return it as a base64 data: URI"""
    content = urllib2.urlopen(url).read()

    if size:
        img = images.Image(content)
        content = images.resize(content, min(size, img.width), min(size, img.height))

    mimetype = get_mimetype(None, url)
    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
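

if __name__ == '__main__':
    # Minimal usage sketch, not part of the service itself: the feed URL is a
    # placeholder, and running this requires network access as well as the
    # App Engine SDK (because of the images import above).
    results = parse_feeds(['http://example.com/feed.xml'],
                          inline_logo=False, scale_to=None)
    for podcast in results:
        if podcast:
            print podcast['title'], len(podcast['episodes'])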