2 # -*- coding: utf-8 -*-
4 # This file is part of my.gpodder.org.
6 # my.gpodder.org is free software: you can redistribute it and/or modify it
7 # under the terms of the GNU Affero General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or (at your
9 # option) any later version.
11 # my.gpodder.org is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
14 # License for more details.
16 # You should have received a copy of the GNU Affero General Public License
17 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
# User-Agent string sent with every feed / logo request made by this crawler.
USER_AGENT = 'mygpo crawler (+http://my.gpodder.org)'
31 from google
.appengine
.api
import images
35 from utils
import parse_time
, remove_html_tags
37 from mimetype
import get_mimetype
, check_mimetype
, get_podcast_types
38 from urls
import get_redirects
#socket.setdefaulttimeout(10)
# Module-level fetcher shared by all parse_feed() calls; identifies this
# crawler to remote servers via USER_AGENT.
fetcher = feedcore.Fetcher(USER_AGENT)
def get_episode_files(entry):
    """Get the download / episode URLs of a feedparser entry.

    Returns a dict mapping each episode URL to a (mimetype, filesize)
    tuple; filesize is None when the feed provides no usable length.
    """
    urls = {}

    enclosures = getattr(entry, 'enclosures', [])
    for enclosure in enclosures:
        if 'href' in enclosure:
            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
            if check_mimetype(mimetype):
                try:
                    # feeds frequently carry missing or non-numeric lengths
                    filesize = int(enclosure['length'])
                except (KeyError, ValueError):
                    filesize = None
                urls[enclosure['href']] = (mimetype, filesize)

    media_content = getattr(entry, 'media_content', [])
    for media in media_content:
        if 'url' in media:
            mimetype = get_mimetype(media.get('type', ''), media['url'])
            if check_mimetype(mimetype):
                urls[media['url']] = (mimetype, None)

    links = getattr(entry, 'links', [])
    for link in links:
        if not hasattr(link, 'href'):
            continue

        # XXX: Implement link detection as in gPodder
        if youtube.is_video_link(link['href']):
            urls[link['href']] = ('application/x-youtube', None)

    return urls
def get_episode_summary(entry):
    """Return the first non-empty summary-like field of the entry.

    Falls back from 'summary' over 'subtitle' to 'link'; returns the
    empty string when none of them is set.
    """
    for key in ('summary', 'subtitle', 'link'):
        value = entry.get(key, None)
        if value:
            return value

    return ''
def get_duration(entry):
    """Return the duration of an episode entry in seconds (0 if unknown)."""
    # NOTE: renamed from 'str' -- the original shadowed the builtin
    duration = entry.get('itunes_duration', '')
    try:
        return parse_time(duration)
    except (ValueError, TypeError):
        # malformed or missing itunes:duration value
        return 0
def get_feed_tags(feed):
    """Collect the tags of a (feedparser) feed as a set of strings."""
    tags = []

    for tag in feed.get('tags', []):
        if tag.get('term'):
            # a term can be a comma-separated list of tags
            tags.extend([t for t in tag['term'].split(',') if t])

        if tag.get('label'):
            tags.append(tag['label'])

    return set(tags)
def update_feed_tags(podcast, tags):
    """Update the stored tags of the given podcast from its feed.

    Currently disabled: the tag-storage code below is commented out, so
    this is a no-op kept for interface compatibility.
    """
    # delete all tags not found in the feed anymore
    #PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete()

    # create new found tags
    # if not PodcastTag.objects.filter(podcast=podcast, source=src, tag=tag).exists():
    #     PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag)
    pass
def get_episode_metadata(entry, files, strip_html):
    """Build the JSON-serializable metadata dict for one episode.

    files maps episode URLs to (mimetype, filesize) tuples as returned
    by get_episode_files(); strip_html removes markup from the textual
    fields.
    """
    d = {
        'title': entry.get('title', entry.get('link', '')),
        'description': get_episode_summary(entry),
        'link': entry.get('link', ''),
        'author': entry.get('author', entry.get('itunes_author', '')),
        'duration': get_duration(entry),
        'language': entry.get('language', ''),
        'files': [dict(url=k, mimetype=v[0], filesize=v[1])
                  for (k, v) in files.items()],
    }

    try:
        d['timestamp'] = datetime.datetime(
            *(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
    except (AttributeError, TypeError):
        # entry has no (or a broken) updated_parsed value
        d['timestamp'] = None

    if strip_html:
        for x in ('title', 'description', 'author'):
            d[x] = remove_html_tags(d[x])

    return d
def parse_feeds(feed_urls, *args, **kwargs):
    """Parse several feeds and return (results, last_modified).

    RSS redirects are followed automatically by appending the new
    location to feed_urls, so both the old and the new feed appear in
    the result. last_modified is the newest modification date seen.
    """
    visited_urls = set()
    result = []
    last_modified = None

    for url in feed_urls:
        res, visited, new, last_mod = parse_feed(url, *args, **kwargs)

        if not res:
            continue

        result.append(res)
        visited_urls.update(visited)

        # we follow RSS-redirects automatically
        if new and new not in (list(visited_urls) + feed_urls):
            feed_urls.append(new)

        if last_mod and (last_modified is None or last_mod > last_modified):
            last_modified = last_mod

    return result, last_modified
def parse_feed(feed_url, inline_logo, scale_to, strip_html, modified):
    """Fetch and parse a single podcast feed.

    Returns a 4-tuple (podcast, visited_urls, new_location,
    last_modified); all four are None when the feed is unchanged or
    could not be fetched.
    """
    # NOTE(review): 'modified' is currently unused -- presumably meant to
    # be passed to fetcher.fetch() for conditional requests; confirm.
    try:
        fetcher.fetch(feed_url)

    except feedcore.NotModified:
        return None, None, None, None

    except (feedcore.Offline, feedcore.InvalidFeed,
            feedcore.WifiLogin, feedcore.AuthenticationRequired):
        return None, None, None, None

    except feedcore.NewLocation as location:
        # permanent redirect: re-parse the new location with the same
        # options (the original call dropped the remaining arguments)
        return parse_feed(location.data, inline_logo, scale_to,
                          strip_html, modified)

    except feedcore.UpdatedFeed as updated:
        feed = updated.data

        podcast = {}
        visited_urls = []
        new_location = ''

        try:
            podcast['title'] = feed.feed.get('title', '')
            podcast['link'] = feed.feed.get('link', '')
            podcast['description'] = feed.feed.get('subtitle', '')
            podcast['author'] = feed.feed.get('author',
                    feed.feed.get('itunes_author', ''))
            podcast['language'] = feed.feed.get('language', '')

            if strip_html:
                for x in ('title', 'description', 'author'):
                    podcast[x] = remove_html_tags(podcast[x])

            # all URLs that redirect to this feed count as visited;
            # kept in its own name so the per-episode file dict below
            # can no longer shadow it (the original returned the wrong
            # value because of that shadowing)
            visited_urls = get_redirects(feed_url)
            podcast['urls'] = visited_urls

            if 'newlocation' in feed.feed:
                new_location = feed.feed.newlocation
                podcast['new_location'] = new_location

            logo_url = get_podcast_logo(feed)
            podcast['logo'] = logo_url
            if inline_logo and logo_url:
                podcast['logo_data'] = get_data_uri(logo_url, scale_to)

            #update_feed_tags(podcast, get_feed_tags(feed.feed))

            podcast['episodes'] = []
            for entry in feed.entries:
                episode_files = get_episode_files(entry)
                e = get_episode_metadata(entry, episode_files, strip_html)
                podcast['episodes'].append(e)

            podcast['content_types'] = get_podcast_types(podcast)

        except Exception as e:
            # best-effort: report and return whatever was parsed so far
            sys.stderr.write('Exception: %s\n' % e)

        return podcast, visited_urls, new_location, feed.modified

    # fetch() raised none of the feedcore outcomes -- nothing to report
    return None, None, None, None
def get_podcast_logo(feed):
    """Return the cover-art URL of the feed, or None if there is none.

    Prefers the feed's own image; a YouTube cover (when available)
    overrides it.
    """
    cover_art = None

    image = feed.feed.get('image', None)
    if image is not None:
        for key in ('href', 'url'):
            cover_art = getattr(image, key, None)
            if cover_art:
                break

    # NOTE(review): presumably get_real_cover returns a URL only for
    # YouTube-based feeds and None otherwise -- confirm
    yturl = youtube.get_real_cover(feed.feed.link)
    if yturl:
        cover_art = yturl

    return cover_art
def get_data_uri(url, size=None):
    """Download the image at url and return it as a data: URI string.

    If size is given, the image is scaled down so that neither side
    exceeds size pixels (it is never scaled up).
    """
    content = urllib2.urlopen(url).read()

    if size:
        # guard is required: with size=None, min(size, ...) would fail
        img = images.Image(content)
        content = images.resize(content,
                                min(size, img.width),
                                min(size, img.height))

    mimetype = get_mimetype(None, url)
    encoded = base64.b64encode(content)
    return 'data:%s;base64,%s' % (mimetype, encoded)