2 # -*- coding: utf-8 -*-
9 from mimetype
import get_mimetype
, check_mimetype
, get_podcast_types
def parse_feeds(feed_urls, *args, **kwargs):
    """
    Parses several feeds, specified by feed_urls and returns their JSON
    objects and the latest of their modification dates. RSS-Redirects are
    followed automatically by including both feeds in the result.

    Extra positional / keyword arguments are passed through to parse_feed.
    Returns (list-of-feed-dicts, latest-modification-date-or-None).
    """

    visited_urls = set()
    result = []
    last_modified = None

    # Fix: work on an internal queue instead of appending RSS-redirect
    # targets to the caller-supplied feed_urls list (mutating an argument).
    queue = list(feed_urls)

    # Appending to `queue` while iterating it is intentional: redirect
    # targets appended below are picked up by this same loop.
    for url in queue:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        visited_urls.update(visited)

        # we follow RSS-redirects automatically, but never fetch a feed twice
        if new and new not in (list(visited_urls) + queue):
            queue.append(new)

        # keep the newest modification date over all parsed feeds
        if not last_modified or (last_mod and last_mod > last_modified):
            last_modified = last_mod

        result.append(res)

    return result, last_modified
def parse_feed(feed_url, inline_logo, scale_to, logo_format, strip_html, modified, use_cache):
    """
    Parses a feed and returns its JSON object, a list of urls that refer to
    this feed, an outgoing redirect and the timestamp of the last modification

    Returns the 4-tuple (podcast, urls, new_location, last_modified);
    all None when the feed was not modified since `modified`.
    """

    from httputils import get_redirects

    feed_url, feed_content, last_modified = urlstore.get_url(feed_url, use_cache)

    # nothing changed since the client's last fetch -- skip parsing entirely
    if last_modified and modified and last_modified <= modified:
        return None, None, None, None

    feed = feedparser.parse(feed_content)

    podcast = {}

    # (name, is-text, getter) triples; "text" values may have their
    # HTML stripped by set_val, depending on strip_html
    PROPERTIES = (
        ('title',         True,  lambda: feed.feed.get('title', None)),
        ('link',          False, lambda: feed.feed.get('link', None)),
        ('description',   True,  lambda: feed.feed.get('subtitle', None)),
        ('author',        True,  lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))),
        ('language',      False, lambda: feed.feed.get('language', None)),
        ('urls',          False, lambda: get_redirects(feed_url)),
        ('new_location',  False, lambda: feed.feed.get('newlocation', None)),
        ('logo',          False, lambda: get_podcast_logo(feed)),
        # NOTE: relies on 'logo' having been set in a previous iteration
        ('logo_data',     False, lambda: get_data_uri(inline_logo, podcast.get('logo', None), modified, size=scale_to, img_format=logo_format)),
        ('tags',          False, lambda: get_feed_tags(feed.feed)),
        ('episodes',      False, lambda: get_episodes(feed, strip_html)),
        # relies on 'episodes' having been set already
        ('content_types', False, lambda: get_podcast_types(podcast)),
    )

    for name, is_text, getter in PROPERTIES:
        set_val(podcast, name, getter, strip_html and is_text)

    return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified
def set_val(obj, name, func, remove_tags=False):
    """Evaluate func() and store the result under obj[name].

    HTML tags are stripped from the value when remove_tags is True;
    None results are not stored at all.
    """
    from utils import remove_html_tags

    value = func()

    if remove_tags:
        value = remove_html_tags(value)

    if value is not None:
        obj[name] = value
def get_podcast_logo(feed):
    """Return the podcast's cover-art URL from the feed's image element.

    A cover resolved via youtube (from the feed link) takes precedence
    over the feed-provided image.
    """
    cover_art = None

    image = feed.feed.get('image', None)
    if image is not None:
        # feedparser exposes the image location under either attribute
        for attr in ('href', 'url'):
            cover_art = getattr(image, attr, None)
            if cover_art:
                break

    cover_art = youtube.get_real_cover(feed.feed.get('link', None)) or cover_art

    return cover_art
def get_data_uri(inline_logo, url, modified_since, **transform_args):
    """
    Fetches the logo, applies the specified transformations and
    returns the Data URI for the resulting image

    Returns None when no inline logo was requested, no URL is given,
    or the image has not changed since `modified_since`.
    """

    if None in (inline_logo, url):
        return None

    url, content, last_modified = urlstore.get_url(url)

    # Bugfix: compare against the modified_since parameter; the original
    # referenced an undefined name `modified` here (NameError at runtime)
    if last_modified and modified_since and last_modified <= modified_since:
        return None

    mimetype = get_mimetype(None, url)

    # only transform when at least one transformation was requested
    if any(transform_args.values()):
        content, mimetype = transform_image(content, mimetype, **transform_args)

    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)
def transform_image(content, mimetype, size, img_format):
    """
    Transforms (resizes, converts) the image and returns
    the resulting bytes and mimetype
    """
    from google.appengine.api import images

    # output encodings supported by the App Engine images API
    encodings = dict(png=images.PNG, jpeg=images.JPEG)

    img = images.Image(content)

    # convert to the requested format, or keep the current one
    if img_format:
        mimetype = 'image/%s' % img_format
    else:
        img_format = mimetype[mimetype.find('/') + 1:]

    # shrink to the requested bounding box; never scale up
    if size:
        img.resize(min(size, img.width), min(size, img.height))

    content = img.execute_transforms(output_encoding=encodings[img_format])
    return content, mimetype
def get_feed_tags(feed):
    """Return the deduplicated list of tags of a feedparser feed object."""
    tags = []

    for tag in feed.get('tags', []):
        # 'term' may hold several comma-separated tags
        term = tag['term']
        if term:
            tags.extend(t for t in term.split(',') if t)

        label = tag['label']
        if label:
            tags.append(label)

    return list(set(tags))
def get_episodes(feed, strip_html):
    """Parse all entries of the feed into episode dicts and add
    data that is derived across episodes (number, short title)."""

    episodes = []
    for entry in feed.entries:
        metadata = get_episode_metadata(entry, strip_html)
        if metadata:
            episodes.append(metadata)

    # We take all non-empty titles
    titles = [e.get('title', None) for e in episodes]
    titles = [t for t in titles if t]

    # get the longest common substring
    common_title = utils.longest_substr(titles)

    # but consider only the part up to the first number. Otherwise we risk
    # removing part of the number (eg if a feed contains episodes 100 - 199)
    common_title = re.search(r'^\D*', common_title).group(0)

    for episode in episodes:
        episode.update(get_additional_episode_data(episode, common_title))

    return episodes
def get_episode_metadata(entry, strip_html):
    """Return the episode dict for a feedparser entry, or None when
    the entry carries no downloadable files."""

    files = get_episode_files(entry)
    if not files:
        return None

    episode = {}

    # (name, is-text, getter) triples; "text" values may have their
    # HTML stripped by set_val, depending on strip_html
    PROPERTIES = (
        ('guid',        None,  lambda: entry.get('id', None)),
        ('title',       True,  lambda: entry.get('title', None)),
        ('description', True,  lambda: get_episode_summary(entry)),
        ('link',        False, lambda: entry.get('link', None)),
        ('author',      True,  lambda: entry.get('author', entry.get('itunes_author', None))),
        ('duration',    False, lambda: get_duration(entry)),
        ('language',    False, lambda: entry.get('language', None)),
        ('files',       False, lambda: get_files(files)),
        ('released',    False, lambda: get_timestamp(entry)),
    )

    for name, is_text, getter in PROPERTIES:
        set_val(episode, name, getter, strip_html and is_text)

    return episode
def get_episode_files(entry):
    """Get the download / episode URL of a feedparser entry"""

    urls = {}

    # enclosures are the primary way of attaching files to an entry
    for enclosure in getattr(entry, 'enclosures', []):
        if 'href' not in enclosure:
            continue

        mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
        if not check_mimetype(mimetype):
            continue

        try:
            filesize = int(enclosure['length'])
        except ValueError:
            filesize = None

        urls[enclosure['href']] = (mimetype, filesize)

    # media:content elements (Media RSS); no filesize information here
    for media in getattr(entry, 'media_content', []):
        if 'url' not in media:
            continue

        mimetype = get_mimetype(media.get('type', ''), media['url'])
        if check_mimetype(mimetype):
            urls[media['url']] = (mimetype, None)

    for link in getattr(entry, 'links', []):
        if not hasattr(link, 'href'):
            continue

        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

        # XXX: Implement link detection as in gPodder

    return urls
def get_episode_summary(entry):
    """Return the episode's summary text: the first non-empty value
    among the entry's summary, subtitle and link, or None."""
    candidates = (entry.get(key, None) for key in ('summary', 'subtitle', 'link'))
    return next((c for c in candidates if c), None)
def get_duration(entry):
    """Return the episode duration parsed from itunes_duration,
    or None when the value cannot be parsed."""
    from utils import parse_time

    # renamed from `str`, which shadowed the builtin
    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except ValueError:
        # unparseable duration string -- treat as unknown
        return None
def get_files(files):
    """Convert a {url: (mimetype, filesize)} mapping into a list of
    per-file dicts; falsy mimetype/filesize entries are omitted."""
    file_list = []
    for k, v in files.items():
        # renamed from `file`, which shadowed the (Python 2) builtin
        entry = dict(url=k)
        if v[0]:
            entry['mimetype'] = v[0]
        if v[1]:
            entry['filesize'] = v[1]
        file_list.append(entry)
    return file_list
def get_timestamp(entry):
    """Return the entry's release time as 'YYYY-MM-DDTHH:MM:SS',
    or None when updated_parsed is missing or malformed."""
    from datetime import datetime
    try:
        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except (AttributeError, TypeError, ValueError):
        # best-effort: entries without a usable date get no timestamp;
        # catch only the expected failures instead of a bare except
        return None
def get_additional_episode_data(episode, common_title):
    """
    Returns additional data about an episode that is calculated after
    the first pass over all episodes
    """

    title = episode.get('title', None)

    # (name, getter) pairs for the derived values
    PROPERTIES = (
        ('number',      lambda: get_episode_number(title, common_title)),
        ('short_title', lambda: get_short_title(title, common_title)),
    )

    data = {}
    for name, getter in PROPERTIES:
        set_val(data, name, getter)

    return data
def get_episode_number(title, common_title):
    """
    Returns the first number in the non-repeating part of the episode's title
    """

    if title is None:
        return None

    # drop the part shared by all episode titles, then look for a
    # leading number (possibly preceded by punctuation/whitespace)
    remainder = title.replace(common_title, '').strip()
    match = re.search(r'^\W*(\d+)', remainder)
    return int(match.group(1)) if match else None
323 def get_short_title(title
, common_title
):
325 Returns the non-repeating part of the episode's title
326 If an episode number is found, it is removed
332 title
= title
.replace(common_title
, '').strip()
333 title
= re
.sub(r
'^[\W\d]+', '', title
)