add parameter logo_format to convert inlined logos (bug 1276)
[mygpo-feedservice.git] / feedservice / feeddownloader.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

import urlstore
import youtube
from mimetype import get_mimetype, check_mimetype, get_podcast_types


def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls and returns their JSON
    objects and the latest of their modification dates. RSS-Redirects are
    followed automatically by including both feeds in the result.
    """

    visited_urls = set()
    result = []
    last_modified = None

    for url in feed_urls:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically
        if new and new not in (list(visited_urls) + feed_urls):
            feed_urls.append(new)

        if not last_modified or (last_mod and last_mod > last_modified):
            last_modified = last_mod

        result.append(res)

    return result, last_modified
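
# Illustrative usage sketch (not part of the original module): fetch two
# feeds, inline their logos scaled to 100px and converted to PNG. The
# keyword arguments mirror the parse_feed() signature below; the URLs are
# made-up placeholders.
#
#   podcasts, last_modified = parse_feeds(
#       ['http://example.org/a.rss', 'http://example.org/b.rss'],
#       inline_logo=1, scale_to=100, logo_format='png',
#       strip_html=True, modified=None, use_cache=True)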


def parse_feed(feed_url, inline_logo, scale_to, logo_format, strip_html, modified, use_cache):
    """
    Parses a feed and returns its JSON object, a list of urls that refer to
    this feed, an outgoing redirect and the timestamp of the last modification
    of the feed
    """

    import feedparser
    from httputils import get_redirects

    feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)

    if last_modified and modified and last_modified <= modified:
        return None, None, None, None

    feed = feedparser.parse(feed_content)

    podcast = dict()

    PROPERTIES = (
        ('title', True, lambda: feed.feed.get('title', None)),
        ('link', False, lambda: feed.feed.get('link', None)),
        ('description', True, lambda: feed.feed.get('subtitle', None)),
        ('author', True, lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
        ('language', False, lambda: feed.feed.get('language', None)),
        ('urls', False, lambda: get_redirects(feed_url)),
        ('new_location', False, lambda: feed.feed.get('newlocation', None)),
        ('logo', False, lambda: get_podcast_logo(feed)),
        ('logo_data', False, lambda: get_data_uri(inline_logo, podcast.get('logo', None), modified, size=scale_to, img_format=logo_format)),
        ('tags', False, lambda: get_feed_tags(feed.feed)),
        ('episodes', False, lambda: get_episodes(feed, strip_html, podcast.get('title', None))),
        ('content_types', False, lambda: get_podcast_types(podcast)),
    )

    for name, is_text, func in PROPERTIES:
        set_val(podcast, name, func, strip_html and is_text)

    return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
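
# For reference (illustrative, not exhaustive): a successful parse_feed()
# call returns a tuple roughly of the form
#
#   ({'title': 'Some Podcast', 'urls': [...], 'episodes': [...], ...},
#    ['http://example.org/feed.xml'],   # urls that refer to this feed
#    None,                              # outgoing RSS redirect, if any
#    last_modified)                     # timestamp of the last change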


def set_val(obj, name, func, remove_tags):
    from utils import remove_html_tags

    val = func()
    if remove_tags:
        val = remove_html_tags(val)
    if val is not None:
        obj[name] = val


def get_podcast_logo(feed):
    cover_art = None
    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    cover_art = youtube.get_real_cover(feed.feed.get('link', None)) or cover_art

    return cover_art


def get_data_uri(inline_logo, url, modified_since, **transform_args):
    """
    Fetches the logo, applies the specified transformations and
    returns the Data URI for the resulting image
    """

    import base64

    if None in (inline_logo, url):
        return None

    url, content, last_modified = urlstore.get_url(url)

    if last_modified and modified_since and last_modified <= modified_since:
        return None

    mimetype = get_mimetype(None, url)

    if any(transform_args.values()):
        content, mimetype = transform_image(content, mimetype, **transform_args)

    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
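
# Example of the produced value (shortened, purely illustrative): a PNG logo
# becomes a Data URI such as
#
#   'data:image/png;base64,iVBORw0KGgoAAAANSUhEUg...'
#
# which clients can embed directly without a second HTTP request.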


def transform_image(content, mimetype, size, img_format):
    """
    Transforms (resizes, converts) the image and returns
    the resulting bytes and mimetype
    """

    from google.appengine.api import images

    img_formats = dict(png=images.PNG, jpeg=images.JPEG)

    img = images.Image(content)

    if img_format:
        mimetype = 'image/%s' % img_format
    else:
        img_format = mimetype[mimetype.find('/')+1:]

    if size:
        img.resize(min(size, img.width), min(size, img.height))

    content = img.execute_transforms(output_encoding=img_formats[img_format])
    return content, mimetype
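
# Illustrative call (assumes running on App Engine, where the images API is
# available): convert a JPEG logo to a 100px-bounded PNG. 'logo_bytes' is a
# placeholder for the raw image data fetched above.
#
#   content, mimetype = transform_image(logo_bytes, 'image/jpeg',
#                                       size=100, img_format='png')
#   # mimetype == 'image/png'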


def get_feed_tags(feed):
    tags = []

    for tag in feed.get('tags', []):
        if tag['term']:
            tags.extend(filter(None, tag['term'].split(',')))

        if tag['label']:
            tags.append(tag['label'])

    return list(set(tags))
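
# Example (illustrative): an Atom category such as
#   <category term="technology,news" label="Tech News"/>
# arrives from feedparser as {'term': 'technology,news', 'label': 'Tech News'}
# and contributes ['technology', 'news', 'Tech News'] (order not guaranteed,
# since duplicates are removed via a set).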


def get_episodes(feed, strip_html, podcast_title):
    get_episode = lambda e: get_episode_metadata(e, strip_html, podcast_title)
    return filter(None, map(get_episode, feed.entries))


def get_episode_metadata(entry, strip_html, podcast_title=None):

    files = get_episode_files(entry)
    if not files:
        return None

    PROPERTIES = (
        ('guid', None, lambda: entry.get('id', None)),
        ('title', True, lambda: entry.get('title', None)),
        ('number', False, lambda: get_episode_number(entry.get('title', None), podcast_title)),
        ('short_title', True, lambda: get_short_title(entry.get('title', None), podcast_title)),
        ('description', True, lambda: get_episode_summary(entry)),
        ('link', False, lambda: entry.get('link', None)),
        ('author', True, lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration', False, lambda: get_duration(entry)),
        ('language', False, lambda: entry.get('language', None)),
        ('files', False, lambda: get_files(files)),
        ('released', False, lambda: get_timestamp(entry)),
    )

    episode = {}
    for name, is_text, func in PROPERTIES:
        set_val(episode, name, func, strip_html and is_text)

    return episode


def get_episode_files(entry):
    """Get the download / episode URLs of a feedparser entry"""

    urls = {}

    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                except (KeyError, ValueError):
                    # the length attribute is optional and often malformed
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
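
# Illustrative shape of the returned mapping (values are made up):
#
#   {'http://example.org/ep1.mp3': ('audio/mpeg', 23500000),
#    'http://www.youtube.com/watch?v=abc123': ('application/x-youtube', None)}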


def get_episode_number(title, podcast_title):
    import re

    if title is None:
        return None

    if podcast_title:
        title = title.replace(podcast_title, '')
    title = title.strip()
    match = re.search(r'^\W*(\d+)', title)
    if not match:
        return None

    return int(match.group(1))


def get_short_title(title, podcast_title):
    import re

    if title is None:
        return None

    if podcast_title:
        title = title.replace(podcast_title, '')
    title = title.strip()
    title = re.sub(r'^[\W\d]+', '', title)
    title = re.sub(r'\W+$', '', title)
    return title
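
# Worked example (illustrative): with podcast_title = 'Linux Outlaws', an
# episode titled 'Linux Outlaws 042 - The Answer' yields
#
#   get_episode_number(...) == 42
#   get_short_title(...)    == 'The Answer'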


def get_episode_summary(entry):
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return None


def get_duration(entry):
    from utils import parse_time

    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        return None


def get_files(files):
    f = []
    for url, (mimetype, filesize) in files.items():
        file_info = dict(url=url)
        if mimetype:
            file_info['mimetype'] = mimetype
        if filesize:
            file_info['filesize'] = filesize
        f.append(file_info)
    return f
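
# Illustrative output (values made up):
#
#   [{'url': 'http://example.org/ep1.mp3',
#     'mimetype': 'audio/mpeg', 'filesize': 23500000},
#    {'url': 'http://www.youtube.com/watch?v=abc123',
#     'mimetype': 'application/x-youtube'}]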


def get_timestamp(entry):
    from datetime import datetime
    try:
        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except (AttributeError, TypeError, ValueError):
        # missing or unparseable publication date
        return None
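
# Example (illustrative): an entry published on 1 May 2011 at 12:30 UTC is
# rendered as the ISO-8601-style string '2011-05-01T12:30:00'.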