#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'


import os
import sys
import datetime
import hashlib
import urllib2
import base64
#import socket

from google.appengine.api import images

import feedcore
from utils import parse_time
import youtube
from mimetype import get_mimetype, check_mimetype, get_podcast_types
from urls import get_redirects

#socket.setdefaulttimeout(10)
fetcher = feedcore.Fetcher(USER_AGENT)


def get_episode_files(entry):
    """Get the download / episode URLs of a feedparser entry"""

    urls = {}
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                except (KeyError, ValueError):
                    # enclosures may omit the length attribute or carry a
                    # non-numeric value
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
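# Illustrative example (values made up, not taken from any real feed): for an
# entry with a single MP3 enclosure, get_episode_files() returns a mapping of
# file URL to (mimetype, filesize), e.g.
#   {'http://example.com/episode1.mp3': ('audio/mpeg', 12345678)}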


def get_episode_summary(entry):
    """Return the first non-empty summary-like field of the entry"""
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''


def get_duration(entry):
    """Parse the duration (itunes:duration) of the entry"""
    duration = entry.get('itunes_duration', '')

    try:
        return parse_time(duration)
    except ValueError:
        return 0
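# Example, assuming utils.parse_time() converts "HH:MM:SS"-style strings (as
# found in <itunes:duration>) into seconds: an entry with
# itunes_duration == '1:02:30' yields 3750; an unparsable value falls back to 0.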


def get_feed_tags(feed):
    """Collect the tags of a parsed feed as a set of strings"""
    tags = []

    for tag in feed.get('tags', []):
        if tag['term']:
            tags.extend([t for t in tag['term'].split(',') if t])

        if tag['label']:
            tags.append(tag['label'])

    return set(tags)
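# Example: a feedparser tag list such as
#   [{'term': 'Technology,Podcasting', 'scheme': None, 'label': 'Tech'}]
# yields set(['Technology', 'Podcasting', 'Tech']): comma-separated terms are
# split, and the optional label is kept as an additional tag.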


def update_feed_tags(podcast, tags):
    # Note: tag storage is disabled here; the PodcastTag code below is
    # commented out, so this function is currently a no-op.
    src = 'feed'

    #delete all tags not found in the feed anymore
    #PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete()

    #create new found tags
    #for tag in tags:
    #    if not PodcastTag.objects.filter(podcast=podcast, source=src, tag=tag).exists():
    #        PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag)


def get_episode_metadata(entry, files):
    """Build the metadata dict for a single feed entry and its files"""
    d = {
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'timestamp': None,
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'language': entry.get('language', ''),
        'files': [dict(url=k, mimetype=v[0], filesize=v[1]) for (k, v) in files.items()],
        'url': files.keys()[0],
        'filesize': files.values()[0][1],
        'mimetype': files.values()[0][0],
    }
    try:
        d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except:
        d['timestamp'] = None

    return d


def parse_feeds(feed_urls, *args, **kwargs):
    """Parse several feeds and follow RSS new-location redirects

    A new location announced by a feed is appended to the list of feed URLs,
    so redirected feeds are fetched and parsed within the same call.
    """
    visited_urls = set()
    result = []

    for url in feed_urls:
        res, visited, new = parse_feed(url, *args, **kwargs)

        visited_urls.update(visited)

        # we follow RSS-redirects automatically
        if new and new not in (list(visited_urls) + feed_urls):
            feed_urls.append(new)

        result.append(res)

    return result
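# Usage sketch (the feed URL is a placeholder; the real caller passes the
# parameters it received from the web request):
#   podcasts = parse_feeds(['http://example.com/feed.xml'],
#                          inline_logo=False, scale_to=None)
# Each element of `podcasts` is the dict built by parse_feed() below; a feed
# that announces a new location causes that location to be queued and parsed
# within the same call.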


def parse_feed(feed_url, inline_logo, scale_to):
    # defaults, returned if fetching fails without an UpdatedFeed result
    podcast = None
    urls = [feed_url]
    new_location = ''

    try:
        # feedcore reports its results by raising exceptions
        fetcher.fetch(feed_url)

    except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin, feedcore.AuthenticationRequired):
        pass

    except feedcore.NewLocation, location:
        return parse_feed(location.data, inline_logo, scale_to)

    except feedcore.UpdatedFeed, updated:
        feed = updated.data
        podcast = dict()
        podcast['title'] = feed.feed.get('title', '')
        podcast['link'] = feed.feed.get('link', '')
        podcast['description'] = feed.feed.get('subtitle', '')
        podcast['author'] = feed.feed.get('author', feed.feed.get('itunes_author', ''))
        podcast['language'] = feed.feed.get('language', '')

        urls = get_redirects(feed_url)
        podcast['urls'] = urls

        if 'newlocation' in feed.feed:
            new_location = feed.feed.newlocation
            podcast['new_location'] = new_location
        else:
            new_location = ''

        logo_url = get_podcast_logo(feed)
        podcast['logo'] = logo_url
        if inline_logo and logo_url:
            podcast['logo_data'] = get_data_uri(logo_url, scale_to)

        #update_feed_tags(podcast, get_feed_tags(feed.feed))

        podcast['episodes'] = []
        for entry in feed.entries:
            files = get_episode_files(entry)
            if not files:
                continue

            e = get_episode_metadata(entry, files)
            podcast['episodes'].append(e)

        podcast['content_types'] = get_podcast_types(podcast)

    except Exception, e:
        print >>sys.stderr, 'Exception:', e

    return podcast, urls, new_location
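# Result shape: on success parse_feed() returns (podcast_dict, urls, new_location),
# where podcast_dict carries 'title', 'link', 'description', 'author', 'language',
# 'urls', 'logo' (optionally 'logo_data' and 'new_location'), 'episodes' and
# 'content_types'; urls lists the redirects visited for feed_url, and
# new_location is '' unless the feed announced a permanent new address.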


def get_podcast_logo(feed):
    """Return the cover art URL of a parsed feed, if any"""
    cover_art = None
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    link = feed.feed.get('link', None)
    if link:
        yturl = youtube.get_real_cover(link)
        if yturl:
            cover_art = yturl

    return cover_art
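# Example: for an ordinary feed the image's href/url attribute is used; for a
# YouTube channel feed, youtube.get_real_cover() is expected to return the
# channel's cover URL, which then takes precedence over the feed image.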


def get_data_uri(url, size=None):
    """Download the image at url and return it as a base64 data: URI"""
    content = urllib2.urlopen(url).read()

    if size:
        img = images.Image(content)
        content = images.resize(content, min(size, img.width), min(size, img.height))

    mimetype = get_mimetype(None, url)
    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
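

if __name__ == '__main__':
    # Minimal usage sketch, not part of the service itself: the feed URL is a
    # placeholder, and running this requires network access as well as the
    # App Engine SDK (because of the images import above).
    results = parse_feeds(['http://example.com/feed.xml'],
                          inline_logo=False, scale_to=None)
    for podcast in results:
        if podcast:
            print podcast['title'], len(podcast['episodes'])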