option strip_html to remove HTML tags from text fields
[mygpo-feedservice.git] / feedservice / feeddownloader.py
blob b616f5a1b1543fb4c19236f87b859005998df4a2
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of my.gpodder.org.
#
# my.gpodder.org is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# my.gpodder.org is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
# License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
#

USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'

import os
import sys
import datetime
import hashlib
import urllib2
import base64
#import socket

from google.appengine.api import images

import feedcore
from utils import parse_time, remove_html_tags
import youtube
from mimetype import get_mimetype, check_mimetype, get_podcast_types
from urls import get_redirects

#socket.setdefaulttimeout(10)
fetcher = feedcore.Fetcher(USER_AGENT)

def get_episode_files(entry):
    """Get the download / episode URL of a feedparser entry"""

    urls = {}
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                except (KeyError, ValueError):
                    # the enclosure length is optional and not always numeric
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls

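# Illustrative only: for an entry with a single MP3 enclosure,
# get_episode_files() returns a dict mapping each file URL to a
# (mimetype, filesize) tuple -- the URL and size below are made up:
#
#   {'http://example.com/episode1.mp3': ('audio/mpeg', 1234567)}
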
def get_episode_summary(entry):
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''

def get_duration(entry):
    duration = entry.get('itunes_duration', '')

    try:
        return parse_time(duration)
    except ValueError:
        return 0

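# Illustrative only: assuming utils.parse_time() converts an
# <itunes:duration> value such as '1:02:30' into seconds, an entry with
# that duration would yield 3750; entries without a parseable duration
# yield 0.
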
def get_feed_tags(feed):
    tags = []

    for tag in feed.get('tags', []):
        if tag['term']:
            tags.extend([t for t in tag['term'].split(',') if t])

        if tag['label']:
            tags.append(tag['label'])

    return set(tags)

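# Illustrative only: a feed whose tag list contains the term 'news,tech'
# and the label 'Technology' (made-up values) would yield
# set(['news', 'tech', 'Technology']).
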
def update_feed_tags(podcast, tags):
    src = 'feed'

    #delete all tags not found in the feed anymore
    #PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete()

    #create new found tags
    #for tag in tags:
    #    if not PodcastTag.objects.filter(podcast=podcast, source=src, tag=tag).exists():
    #        PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag)

def get_episode_metadata(entry, files, strip_html):
    d = {
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'language': entry.get('language', ''),
        'files': [dict(url=k, mimetype=v[0], filesize=v[1]) for (k, v) in files.items()],
    }
    try:
        d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except Exception:
        # updated_parsed may be missing or unparsed
        d['timestamp'] = None

    if strip_html:
        for x in ('title', 'description', 'author'):
            d[x] = remove_html_tags(d[x])

    return d

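# Illustrative only: all values below are made up; a typical entry yields
# a dict of the following shape:
#
#   {'title': 'Episode 1',
#    'description': 'Short summary',
#    'link': 'http://example.com/episode1',
#    'author': 'Jane Doe',
#    'duration': 3750,
#    'language': 'en',
#    'files': [{'url': 'http://example.com/episode1.mp3',
#               'mimetype': 'audio/mpeg',
#               'filesize': 1234567}],
#    'timestamp': '2010-01-01T00:00:00'}
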
def parse_feeds(feed_urls, *args, **kwargs):
    """Parse each feed URL; return the parsed feeds and the latest
    modification date"""
    visited_urls = set()
    result = []
    last_modified = None

    for url in feed_urls:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically; appending to feed_urls
        # while iterating makes the loop pick up the new URL later
        if new and new not in (list(visited_urls) + feed_urls):
            feed_urls.append(new)

        # in Python 2, None compares smaller than any datetime
        if last_mod > last_modified:
            last_modified = last_mod

        result.append(res)

    return result, last_modified

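# Illustrative only: a hypothetical caller (the feed URL and option values
# are made up) would fetch one or more feeds like this:
#
#   podcasts, last_modified = parse_feeds(
#       ['http://example.com/feed.xml'],
#       inline_logo=False, scale_to=None, strip_html=True, modified=None)
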
def parse_feed(feed_url, inline_logo, scale_to, strip_html, modified):
    try:
        # pass the cached modification date so the fetcher can raise
        # NotModified (assumes feedcore.Fetcher.fetch accepts a 'modified'
        # keyword, as gPodder's feedcore does)
        fetcher.fetch(feed_url, modified=modified)
    except feedcore.NotModified:
        return None, None, None, None

    except (feedcore.Offline, feedcore.InvalidFeed, feedcore.WifiLogin, feedcore.AuthenticationRequired):
        return None, None, None, None
    except feedcore.NewLocation, location:
        # follow the permanent redirect with all original options
        return parse_feed(location.data, inline_logo, scale_to, strip_html, modified)
    except feedcore.UpdatedFeed, updated:
        feed = updated.data
        podcast = dict()
        podcast['title'] = feed.feed.get('title', '')
        podcast['link'] = feed.feed.get('link', '')
        podcast['description'] = feed.feed.get('subtitle', '')
        podcast['author'] = feed.feed.get('author', feed.feed.get('itunes_author', ''))
        podcast['language'] = feed.feed.get('language', '')

        if strip_html:
            for x in ('title', 'description', 'author'):
                podcast[x] = remove_html_tags(podcast[x])

        urls = get_redirects(feed_url)
        podcast['urls'] = urls

        if 'newlocation' in feed.feed:
            new_location = feed.feed.newlocation
            podcast['new_location'] = new_location
        else:
            new_location = ''

        logo_url = get_podcast_logo(feed)
        podcast['logo'] = logo_url
        if inline_logo and logo_url:
            podcast['logo_data'] = get_data_uri(logo_url, scale_to)

        #update_feed_tags(podcast, get_feed_tags(feed.feed))

        podcast['episodes'] = []
        for entry in feed.entries:
            # episode files; 'urls' above holds the feed's redirect chain
            files = get_episode_files(entry)
            if not files:
                continue

            e = get_episode_metadata(entry, files, strip_html)
            podcast['episodes'].append(e)

        podcast['content_types'] = get_podcast_types(podcast)
    except Exception, e:
        print >>sys.stderr, 'Exception:', e
        return None, None, None, None

    return podcast, urls, new_location, feed.modified

def get_podcast_logo(feed):
    cover_art = None
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    # for YouTube feeds, the real channel cover overrides the feed image
    yturl = youtube.get_real_cover(feed.feed.link)
    if yturl:
        cover_art = yturl

    return cover_art

def get_data_uri(url, size=None):
    content = urllib2.urlopen(url).read()

    if size:
        # fit the image into a size x size box, never enlarging it
        img = images.Image(content)
        content = images.resize(content, min(size, img.width), min(size, img.height))

    mimetype = get_mimetype(None, url)
    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)

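# Illustrative only: for a hypothetical PNG logo at
# http://example.com/logo.png, get_data_uri() returns a string such as
#
#   data:image/png;base64,iVBORw0KGgo...
#
# (base64 payload abbreviated).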