improve extraction of episode's short_title,number
[mygpo-feedservice.git] / feedservice / feeddownloader.py
blob1e54f3f1d06dd8c612201383aeffefbb4d9e599b
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
5 import re
7 import urlstore
8 import youtube
9 from mimetype import get_mimetype, check_mimetype, get_podcast_types
10 import utils
def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls and returns their JSON
    objects and the latest of their modification dates. RSS-Redirects are
    followed automatically by including both feeds in the result.
    """

    visited_urls = set()
    parsed_feeds = []
    latest_mod = None

    # NOTE: feed_urls may grow while we iterate over it -- RSS-redirect
    # targets are appended below and picked up by this same loop.
    for url in feed_urls:
        feed, visited, redirect, mod = parse_feed(url, *args, **kwargs)

        if not feed:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically, unless the target was
        # already visited or requested
        if redirect and redirect not in (list(visited_urls) + feed_urls):
            feed_urls.append(redirect)

        # keep the most recent modification date of all parsed feeds
        if mod and (not latest_mod or mod > latest_mod):
            latest_mod = mod

        parsed_feeds.append(feed)

    return parsed_feeds, latest_mod
def parse_feed(feed_url, inline_logo, scale_to, logo_format, strip_html, modified, use_cache):
    """
    Parses a feed and returns its JSON object, a list of urls that refer to
    this feed, an outgoing redirect and the timestamp of the last modification
    of the feed
    """
    import feedparser
    from httputils import get_redirects

    feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)

    # nothing changed since the client's last fetch
    if last_modified and modified and last_modified <= modified:
        return None, None, None, None

    feed = feedparser.parse(feed_content)

    podcast = {}

    # (name, is-text-field, getter) triples; the order matters: 'logo'
    # must be computed before 'logo_data' and 'episodes' before
    # 'content_types', because the later getters read earlier values
    # from `podcast` via their closures.
    PROPERTIES = (
        ('title', True, lambda: feed.feed.get('title', None)),
        ('link', False, lambda: feed.feed.get('link', None)),
        ('description', True, lambda: feed.feed.get('subtitle', None)),
        ('author', True, lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
        ('language', False, lambda: feed.feed.get('language', None)),
        ('urls', False, lambda: get_redirects(feed_url)),
        ('new_location', False, lambda: feed.feed.get('newlocation', None)),
        ('logo', False, lambda: get_podcast_logo(feed)),
        ('logo_data', False, lambda: get_data_uri(inline_logo, podcast.get('logo', None), modified, size=scale_to, img_format=logo_format)),
        ('tags', False, lambda: get_feed_tags(feed.feed)),
        ('episodes', False, lambda: get_episodes(feed, strip_html)),
        ('content_types', False, lambda: get_podcast_types(podcast)),
    )

    for name, is_text, getter in PROPERTIES:
        set_val(podcast, name, getter, strip_html and is_text)

    return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
def set_val(obj, name, func, remove_tags=False):
    """
    Evaluates func() and stores the result in obj[name].

    None results are skipped (the key is not set). If remove_tags is
    True, HTML tags are stripped from the value first.
    """
    val = func()

    if remove_tags:
        # imported lazily so the project import is only paid when
        # tag-stripping is actually requested (it was previously
        # executed on every call, even with remove_tags=False)
        from utils import remove_html_tags
        val = remove_html_tags(val)

    if val is not None:
        obj[name] = val
def get_podcast_logo(feed):
    """Returns the cover-art URL of a feedparser feed, or None."""
    logo = None

    image = feed.feed.get('image', None)
    if image is not None:
        # feedparser exposes the image URL under either 'href' or 'url'
        for attr in ('href', 'url'):
            logo = getattr(image, attr, None)
            if logo:
                break

    # a YouTube channel cover (when the feed links to one) wins over
    # the feed's own image
    return youtube.get_real_cover(feed.feed.get('link', None)) or logo
def get_data_uri(inline_logo, url, modified_since, **transform_args):
    """
    Fetches the logo, applies the specified transformations and
    returns the Data URI for the resulting image.

    Returns None if inlining was not requested, no URL is given, or the
    image has not been modified since `modified_since`.
    """
    import base64

    if None in (inline_logo, url):
        return None

    url, content, last_modified = urlstore.get_url(url)

    # BUG FIX: this previously compared against the undefined name
    # `modified` (NameError whenever both timestamps were set)
    if last_modified and modified_since and last_modified <= modified_since:
        return None

    mimetype = get_mimetype(None, url)

    if any(transform_args.values()):
        content, mimetype = transform_image(content, mimetype, **transform_args)

    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
def transform_image(content, mimetype, size, img_format):
    """
    Transforms (resizes, converts) the image and returns
    the resulting bytes and mimetype
    """
    from google.appengine.api import images

    encodings = dict(png=images.PNG, jpeg=images.JPEG)

    img = images.Image(content)

    if img_format:
        # converting: the new mimetype follows the requested format
        mimetype = 'image/%s' % img_format
    else:
        # keeping the format: derive the encoder name from the mimetype
        img_format = mimetype[mimetype.find('/') + 1:]

    if size:
        # scale down to fit, never enlarging either dimension
        img.resize(min(size, img.width), min(size, img.height))

    content = img.execute_transforms(output_encoding=encodings[img_format])
    return content, mimetype
def get_feed_tags(feed):
    """
    Returns the deduplicated tags of a feedparser feed.

    Comma-separated 'term' values are split into individual tags.
    Missing 'term' / 'label' keys are tolerated (they previously raised
    KeyError, which real-world feeds trigger regularly).
    """
    tags = []

    for tag in feed.get('tags', []):
        term = tag.get('term', None)
        if term:
            tags.extend(filter(None, term.split(',')))

        label = tag.get('label', None)
        if label:
            tags.append(label)

    return list(set(tags))
def get_episodes(feed, strip_html):
    """Parses all entries of a feedparser feed into episode dicts."""
    parsed = (get_episode_metadata(entry, strip_html) for entry in feed.entries)
    episodes = [e for e in parsed if e]

    # We take all non-empty titles
    titles = [e['title'] for e in episodes if e.get('title', None)]

    # get the longest common substring
    common_title = utils.longest_substr(titles)

    # but consider only the part up to the first number. Otherwise we risk
    # removing part of the number (eg if a feed contains episodes 100 - 199)
    common_title = re.search(r'^\D*', common_title).group(0)

    # second pass: data that depends on all episodes (number, short title)
    for episode in episodes:
        episode.update(get_additional_episode_data(episode, common_title))

    return episodes
def get_episode_metadata(entry, strip_html):
    """
    Parses a feedparser entry into an episode dict.

    Returns None when the entry carries no downloadable files.
    """
    files = get_episode_files(entry)
    if not files:
        return None

    # (name, is-text-field, getter) triples; text fields may have their
    # HTML stripped depending on strip_html
    PROPERTIES = (
        ('guid', None, lambda: entry.get('id', None)),
        ('title', True, lambda: entry.get('title', None)),
        ('description', True, lambda: get_episode_summary(entry)),
        ('link', False, lambda: entry.get('link', None)),
        ('author', True, lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration', False, lambda: get_duration(entry)),
        ('language', False, lambda: entry.get('language', None)),
        ('files', False, lambda: get_files(files)),
        ('released', False, lambda: get_timestamp(entry)),
    )

    episode = {}
    for name, is_text, getter in PROPERTIES:
        set_val(episode, name, getter, strip_html and is_text)

    return episode
def get_episode_files(entry):
    """Get the download / episode URLs of a feedparser entry.

    Returns a dict mapping each URL to a (mimetype, filesize) tuple;
    filesize is None when unknown.
    """
    urls = {}

    # enclosures are the primary source of episode files
    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    filesize = int(enclosure['length'])
                except (KeyError, TypeError, ValueError):
                    # 'length' missing, None, or non-numeric -- previously
                    # only ValueError was caught, so a missing attribute
                    # crashed the parse
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    # Media-RSS content as a secondary source
    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    # plain links that point to YouTube videos
    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    # XXX: Implement link detection as in gPodder

    return urls
def get_episode_summary(entry):
    """Returns the first non-empty summary-like field of an entry, or None."""
    # preference order: summary, then subtitle, then the entry's link;
    # the trailing `or None` maps an all-falsy result back to None
    return (entry.get('summary', None) or
            entry.get('subtitle', None) or
            entry.get('link', None) or
            None)
def get_duration(entry):
    """Returns the entry's iTunes duration in seconds, or None if unparseable."""
    from utils import parse_time

    # renamed from `str`, which shadowed the builtin
    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        return None
def get_files(files):
    """
    Converts a {url: (mimetype, filesize)} dict into a list of file dicts.

    Each dict carries 'url' plus 'mimetype'/'filesize' when those values
    are truthy. (The loop variable was renamed from `file`, which
    shadowed a builtin.)
    """
    file_list = []
    for url, (mimetype, filesize) in files.items():
        f = dict(url=url)
        if mimetype:
            f['mimetype'] = mimetype
        if filesize:
            f['filesize'] = filesize
        file_list.append(f)
    return file_list
def get_timestamp(entry):
    """Returns the entry's updated timestamp as 'YYYY-MM-DDTHH:MM:SS', or None."""
    from datetime import datetime
    try:
        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except (AttributeError, TypeError, ValueError):
        # updated_parsed missing, None, or malformed -- previously a bare
        # except also swallowed KeyboardInterrupt/SystemExit
        return None
def get_additional_episode_data(episode, common_title):
    """
    Returns additional data about an episode that is calculated after
    the first pass over all episodes
    """
    title = episode.get('title', None)

    data = {}
    # set_val skips None results, so missing values leave no key behind
    set_val(data, 'number', lambda: get_episode_number(title, common_title))
    set_val(data, 'short_title', lambda: get_short_title(title, common_title))
    return data
def get_episode_number(title, common_title):
    """
    Returns the first number in the non-repeating part of the episode's
    title, or None if there is none.
    """
    if title is None:
        return None

    # drop the part shared by all episode titles, then look for a number
    # right at the start (possibly preceded by punctuation/whitespace)
    remainder = title.replace(common_title, '').strip()
    match = re.match(r'\W*(\d+)', remainder)
    return int(match.group(1)) if match else None
def get_short_title(title, common_title):
    """
    Returns the non-repeating part of the episode's title.
    If an episode number is found, it is removed.
    """
    if title is None:
        return None

    # drop the shared prefix, then strip any leading digits/punctuation
    # (i.e. the episode number and its separators)
    remainder = title.replace(common_title, '').strip()
    return re.sub(r'^[\W\d]+', '', remainder)