add option use_cache to bypass feed-cache
[mygpo-feedservice.git] / feedservice / feeddownloader.py
blob da8bb2804d7793a8ebf9d17d2d0716b137b1efc4
#!/usr/bin/python
# -*- coding: utf-8 -*-

import urlstore
import youtube
from mimetype import get_mimetype, check_mimetype, get_podcast_types

def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls, and returns their JSON
    objects and the latest of their modification dates. RSS redirects are
    followed automatically by including both feeds in the result.
    """

    visited_urls = set()
    result = []
    last_modified = None

    for url in feed_urls:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS redirects automatically
        if new and new not in (list(visited_urls) + feed_urls):
            feed_urls.append(new)

        if not last_modified or (last_mod and last_mod > last_modified):
            last_modified = last_mod

        result.append(res)

    return result, last_modified
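
# A minimal usage sketch (the feed URL and argument values are hypothetical).
# The positional arguments are passed through to parse_feed, including the
# new use_cache flag; use_cache=False bypasses the feed-cache:
#
#     podcasts, last_modified = parse_feeds(
#         ['http://example.com/podcast.xml'],
#         False,   # inline_logo
#         None,    # scale_to
#         True,    # strip_html
#         None,    # modified
#         False)   # use_cache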

def parse_feed(feed_url, inline_logo, scale_to, strip_html, modified, use_cache):
    """
    Parses a feed and returns its JSON object, a list of URLs that refer to
    this feed, an outgoing redirect and the timestamp of the last
    modification of the feed.
    """
    import feedparser
    from urls import get_redirects

    feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)

    if last_modified and modified and last_modified <= modified:
        return None, None, None, None

    feed = feedparser.parse(feed_content)

    podcast = dict()

    # properties are evaluated (and stored) in order, so later entries such
    # as 'logo_data' and 'content_types' can rely on earlier ones
    PROPERTIES = (
        ('title',         True,  lambda: feed.feed.get('title', None)),
        ('link',          False, lambda: feed.feed.get('link', None)),
        ('description',   True,  lambda: feed.feed.get('subtitle', None)),
        ('author',        True,  lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
        ('language',      False, lambda: feed.feed.get('language', None)),
        ('urls',          False, lambda: get_redirects(feed_url)),
        ('new_location',  False, lambda: get_newlocation(feed)),
        ('logo',          False, lambda: get_podcast_logo(feed)),
        ('logo_data',     False, lambda: get_data_uri(inline_logo, podcast.get('logo', None), scale_to, modified)),
        ('tags',          False, lambda: get_feed_tags(feed.feed)),
        ('episodes',      False, lambda: get_episodes(feed, strip_html)),
        ('content_types', False, lambda: get_podcast_types(podcast)),
    )

    for name, is_text, func in PROPERTIES:
        set_val(podcast, name, func, strip_html and is_text)

    return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
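
# The return value is the 4-tuple (podcast, urls, new_location,
# last_modified); an illustrative, abbreviated podcast dict:
#
#     {'title': 'Example Podcast',
#      'urls': ['http://example.com/podcast.xml'],
#      'logo': 'http://example.com/cover.png',
#      'episodes': [...],
#      'content_types': ['audio']}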

def set_val(obj, name, func, remove_tags):
    from utils import remove_html_tags

    val = func()
    if remove_tags:
        val = remove_html_tags(val)
    if val is not None:
        obj[name] = val

def get_newlocation(feed):
    if 'newlocation' in feed.feed:
        return feed.feed.newlocation
    else:
        return None

def get_podcast_logo(feed):
    cover_art = None
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    # a cover fetched for a YouTube channel link takes precedence
    yturl = youtube.get_real_cover(feed.feed.get('link', None))
    if yturl:
        cover_art = yturl

    return cover_art

def get_data_uri(inline_logo, url, size=None, modified_since=None):
    import base64
    from google.appengine.api import images

    if not inline_logo or not url:
        return None

    url, content, last_modified = urlstore.get_url(url)

    if last_modified and modified_since and last_modified <= modified_since:
        return None

    if size:
        img = images.Image(content)
        content = images.resize(content, min(size, img.width), min(size, img.height))

    mimetype = get_mimetype(None, url)
    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
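
# The resulting string embeds the (possibly resized) logo directly in the
# JSON output, e.g. (truncated, illustrative):
#
#     'data:image/png;base64,iVBORw0KGgoAAAANSUhEUg...'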

def get_feed_tags(feed):
    tags = []

    for tag in feed.get('tags', []):
        if tag.get('term'):
            tags.extend([t for t in tag['term'].split(',') if t])

        if tag.get('label'):
            tags.append(tag['label'])

    return list(set(tags))
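
# Example: a feed-level tag entry such as
#
#     {'term': 'technology,linux', 'label': 'Tech'}
#
# yields the (deduplicated, unordered) tags 'technology', 'linux' and 'Tech'.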

def get_episodes(feed, strip_html):
    episodes = []
    for entry in feed.entries:
        urls = get_episode_files(entry)
        if not urls:
            continue

        e = get_episode_metadata(entry, urls, strip_html)
        episodes.append(e)
    return episodes

def get_episode_files(entry):
    """Get the download / episode URL of a feedparser entry"""

    urls = {}
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                except (KeyError, ValueError):
                    # enclosure without a (numeric) length attribute
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
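
# For a typical entry the mapping might look like (illustrative values):
#
#     {'http://example.com/ep1.mp3': ('audio/mpeg', 23500000)}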

def get_episode_metadata(entry, files, strip_html):

    PROPERTIES = (
        ('title',       True,  lambda: entry.get('title', entry.get('link', None))),
        ('description', True,  lambda: get_episode_summary(entry)),
        ('link',        False, lambda: entry.get('link', None)),
        ('author',      True,  lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration',    False, lambda: get_duration(entry)),
        ('language',    False, lambda: entry.get('language', None)),
        ('files',       False, lambda: get_files(files)),
        ('timestamp',   False, lambda: get_timestamp(entry)),
    )

    episode = {}
    for name, is_text, func in PROPERTIES:
        set_val(episode, name, func, strip_html and is_text)

    return episode

def get_episode_summary(entry):
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return None

def get_duration(entry):
    from utils import parse_time

    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        return None

def get_files(files):
    f = []
    for url, (mimetype, filesize) in files.items():
        item = dict(url=url)
        if mimetype:
            item['mimetype'] = mimetype
        if filesize:
            item['filesize'] = filesize
        f.append(item)
    return f
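
# Converts the url -> (mimetype, filesize) mapping built by
# get_episode_files into the list-of-dicts form used in the JSON output:
#
#     [{'url': 'http://example.com/ep1.mp3',
#       'mimetype': 'audio/mpeg',
#       'filesize': 23500000}]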

def get_timestamp(entry):
    from datetime import datetime
    try:
        return datetime(*entry.updated_parsed[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except (AttributeError, TypeError):
        # updated_parsed is missing or None
        return None
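
# updated_parsed is a time.struct_time, so an entry updated on
# 2010-06-01 12:00:00 UTC serializes as '2010-06-01T12:00:00'.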