feedservice/feeddownloader.py

   1 #!/usr/bin/python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # This file is part of my.gpodder.org.
   5 #
   6 # my.gpodder.org is free software: you can redistribute it and/or modify it
   7 # under the terms of the GNU Affero General Public License as published by
   8 # the Free Software Foundation, either version 3 of the License, or (at your
   9 # option) any later version.
  10 #
  11 # my.gpodder.org is distributed in the hope that it will be useful, but
  12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
  14 # License for more details.
  15 #
  16 # You should have received a copy of the GNU Affero General Public License
  17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
  18 #
  19
  20 USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'
  21
  22
  23 import os
  24 import sys
  25 import datetime
  26 import hashlib
  27 import urllib2
  28 import base64
  29 #import socket
  30
  31 import feedcore
  32 from utils import parse_time
  33 import youtube
  34 from mimetype import get_mimetype, check_mimetype, get_podcast_types
  35
  36 #socket.setdefaulttimeout(10)
  37 fetcher = feedcore.Fetcher(USER_AGENT)
  38
  39
  40 def get_episode_files(entry):
  41     """Get the download / episode URL of a feedparser entry"""
  42
  43     urls = {}
  44     enclosures = getattr(entry, 'enclosures', [])
  45     for enclosure in enclosures:
  46         if 'href' in enclosure:
  47             mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
  48             if check_mimetype(mimetype):
  49                 try:
  50                     filesize = int(enclosure['length'])
  51                 except ValueError:
  52                     filesize = None
  53                 urls[enclosure['href']] = (mimetype, filesize)
  54
  55     media_content = getattr(entry, 'media_content', [])
  56     for media in media_content:
  57         if 'url' in media:
  58             mimetype = get_mimetype(media.get('type', ''), media['url'])
  59             if check_mimetype(mimetype):
  60                 urls[media['url']] = (mimetype, None)
  61
  62     links = getattr(entry, 'links', [])
  63     for link in links:
  64         if not hasattr(link, 'href'):
  65             continue
  66
  67         if youtube.is_video_link(link['href']):
  68             urls[link['href']] = ('application/x-youtube', None)
  69
  70         # XXX: Implement link detection as in gPodder
  71
  72     return urls
  73
  74 def get_episode_summary(entry):
  75     for key in ('summary', 'subtitle', 'link'):
  76         value = entry.get(key, None)
  77         if value:
  78             return value
  79
  80     return ''
  81
  82 def get_duration(entry):
  83     str = entry.get('itunes_duration', '')
  84
  85     try:
  86         return parse_time(str)
  87     except ValueError:
  88         return 0
  89
  90 def get_feed_tags(feed):
  91     tags = []
  92
  93     for tag in feed.get('tags', []):
  94         if tag['term']:
  95             tags.extend([t for t in tag['term'].split(',') if t])
  96
  97         if tag['label']:
  98             tags.append(tag['label'])
  99
 100     return set(tags)
 101
 102
 103 def update_feed_tags(podcast, tags):
 104     src = 'feed'
 105
 106     #delete all tags not found in the feed anymore
 107     #PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete()
 108
 109     #create new found tags
 110     #for tag in tags:
 111     #    if not PodcastTag.objects.filter(podcast=podcast, source=src, tag=tag).exists():
 112     #        PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag)
 113
 114
 115 def get_episode_metadata(entry, files):
 116     d = {
 117         'title': entry.get('title', entry.get('link', '')),
 118         'description': get_episode_summary(entry),
 119         'link': entry.get('link', ''),
 120         'timestamp': None,
 121         'author': entry.get('author', entry.get('itunes_author', '')),
 122         'duration': get_duration(entry),
 123         'language': entry.get('language', ''),
 124         'files': [ dict(url=k, mimetype=v[0], filesize=v[1]) for (k, v) in files.items()],
 125         'url': files.keys()[0],
 126         'filesize': files.values()[0][1],
 127         'mimetype': files.values()[0][0],
 128     }
 129     try:
 130         d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
 131     except:
 132         d['timestamp'] = None
 133
 134     return d
 135
 136
 137 def parse_feed(feed_url, inline_logo):
 138     try:
 139         fetcher.fetch(feed_url)
 140
 141     except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin, feedcore.AuthenticationRequired):
 142         pass
 143
 144     except feedcore.NewLocation, location:
 145         return parse_feed(location.data)
 146
 147     except feedcore.UpdatedFeed, updated:
 148         feed = updated.data
 149         podcast = dict()
 150         podcast['title'] = feed.feed.get('title', '')
 151         podcast['link']  = feed.feed.get('link', '')
 152         podcast['description'] = feed.feed.get('subtitle', '')
 153         podcast['author'] = feed.feed.get('author', feed.feed.get('itunes_author', ''))
 154         podcast['language'] = feed.feed.get('language', '')
 155
 156         logo_url = get_podcast_logo(feed)
 157         podcast['logo'] = logo_url
 158         if inline_logo and logo_url:
 159             podcast['logo_data'] = get_data_uri(logo_url)
 160
 161         #update_feed_tags(podcast, get_feed_tags(feed.feed))
 162
 163         podcast['episodes'] = []
 164         for entry in feed.entries:
 165             urls = get_episode_files(entry)
 166             if not urls:
 167                 continue
 168
 169             e = get_episode_metadata(entry, urls)
 170             podcast['episodes'].append(e)
 171
 172         podcast['content_types'] = get_podcast_types(podcast)
 173
 174     except Exception, e:
 175         print >>sys.stderr, 'Exception:', e
 176
 177     return podcast
 178
 179
 180 def get_podcast_logo(feed):
 181     cover_art = None
 182     image = feed.feed.get('image', None)
 183     if image is not None:
 184         for key in ('href', 'url'):
 185             cover_art = getattr(image, key, None)
 186             if cover_art:
 187                 break
 188
 189     yturl = youtube.get_real_cover(feed.feed.link)
 190     if yturl:
 191         cover_art = yturl
 192
 193     return cover_art
 194
 195 def get_data_uri(url):
 196     content = urllib2.urlopen(url).read()
 197     mimetype = get_mimetype(None, url)
 198     encoded = base64.b64encode(content)
 199     return 'data:%s;base64,%s' % (mimetype, encoded)