From c5886adecb5ab989decf380a1fb729b7c76fa3eb Mon Sep 17 00:00:00 2001 From: =?utf8?q?Stefan=20K=C3=B6gl?= Date: Wed, 26 Jan 2011 08:28:10 +0200 Subject: [PATCH] add tags to feed, code-cleanup/refactoring --- feedservice/feeddownloader.py | 329 ++++++++++++++++++++++-------------------- feedservice/mimetype.py | 4 +- 2 files changed, 172 insertions(+), 161 deletions(-) diff --git a/feedservice/feeddownloader.py b/feedservice/feeddownloader.py index a4c30dc..a61b31e 100644 --- a/feedservice/feeddownloader.py +++ b/feedservice/feeddownloader.py @@ -1,139 +1,20 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # -# This file is part of my.gpodder.org. -# -# my.gpodder.org is free software: you can redistribute it and/or modify it -# under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# my.gpodder.org is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public -# License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with my.gpodder.org. If not, see . 
-# - -import os -import sys -import datetime -import hashlib -import base64 - -import feedparser - -from google.appengine.api import images import urlstore -from utils import parse_time, remove_html_tags import youtube from mimetype import get_mimetype, check_mimetype, get_podcast_types -from urls import get_redirects - - -def get_episode_files(entry): - """Get the download / episode URL of a feedparser entry""" - - urls = {} - enclosures = getattr(entry, 'enclosures', []) - for enclosure in enclosures: - if 'href' in enclosure: - mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href']) - if check_mimetype(mimetype): - try: - filesize = int(enclosure['length']) - except ValueError: - filesize = None - urls[enclosure['href']] = (mimetype, filesize) - - media_content = getattr(entry, 'media_content', []) - for media in media_content: - if 'url' in media: - mimetype = get_mimetype(media.get('type', ''), media['url']) - if check_mimetype(mimetype): - urls[media['url']] = (mimetype, None) - - links = getattr(entry, 'links', []) - for link in links: - if not hasattr(link, 'href'): - continue - - if youtube.is_video_link(link['href']): - urls[link['href']] = ('application/x-youtube', None) - - # XXX: Implement link detection as in gPodder - - return urls - -def get_episode_summary(entry): - for key in ('summary', 'subtitle', 'link'): - value = entry.get(key, None) - if value: - return value - - return '' - -def get_duration(entry): - str = entry.get('itunes_duration', '') - - try: - return parse_time(str) - except ValueError: - return 0 - -def get_feed_tags(feed): - tags = [] - - for tag in feed.get('tags', []): - if tag['term']: - tags.extend([t for t in tag['term'].split(',') if t]) - - if tag['label']: - tags.append(tag['label']) - - return set(tags) - - -def update_feed_tags(podcast, tags): - src = 'feed' - - #delete all tags not found in the feed anymore - #PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete() - - #create 
new found tags - #for tag in tags: - # if not PodcastTag.objects.filter(podcast=podcast, source=src, tag=tag).exists(): - # PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag) - - -def get_episode_metadata(entry, files, strip_html): - d = { - 'title': entry.get('title', entry.get('link', '')), - 'description': get_episode_summary(entry), - 'link': entry.get('link', ''), - 'author': entry.get('author', entry.get('itunes_author', '')), - 'duration': get_duration(entry), - 'language': entry.get('language', ''), - 'files': [ dict(url=k, mimetype=v[0], filesize=v[1]) for (k, v) in files.items()], - } - try: - d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S') - except: - d['timestamp'] = None - - - if strip_html: - for x in ('title', 'description', 'author'): - d[x] = remove_html_tags(d[x]) - - return d def parse_feeds(feed_urls, *args, **kwargs): + """ + Parses several feeds, specified by feed_urls and returns their JSON + objects and the latest of their modification dates. RSS-Redirects are + followed automatically by including both feeds in the result. 
+ """ + visited_urls = set() result = [] last_modified = None @@ -159,6 +40,15 @@ def parse_feeds(feed_urls, *args, **kwargs): def parse_feed(feed_url, inline_logo, scale_to, strip_html, modified): + """ + Parses a feed and returns its JSON object, a list of urls that refer to + this feed, an outgoing redirect and the timestamp of the last modification + of the feed + """ + + import feedparser + from urls import get_redirects + feed_url, feed_content, last_modified = urlstore.get_url(feed_url) if last_modified and modified and last_modified <= modified: @@ -168,46 +58,42 @@ def parse_feed(feed_url, inline_logo, scale_to, strip_html, modified): feed.feed.link = feed_url podcast = dict() - podcast['title'] = feed.feed.get('title', '') - podcast['link'] = feed.feed.get('link', '') - podcast['description'] = feed.feed.get('subtitle', '') - podcast['author'] = feed.feed.get('author', feed.feed.get('itunes_author', '')) - podcast['language'] = feed.feed.get('language', '') - if strip_html: - for x in ('title', 'description', 'author'): - podcast[x] = remove_html_tags(podcast[x]) + PROPERTIES = ( + ('title', True, lambda: feed.feed.get('title', None)), + ('link', False, lambda: feed.feed.get('link', None)), + ('description', True, lambda: feed.feed.get('subtitle', None)), + ('author', True, lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))), + ('language', False, lambda: feed.feed.get('language', None)), + ('urls', False, lambda: get_redirects(feed_url)), + ('new_location', False, lambda: get_newlocation(feed)), + ('logo', False, lambda: get_podcast_logo(feed)), + ('logo_data', False, lambda: get_data_uri(inline_logo, podcast['logo'], scale_to, modified)), + ('tags', False, lambda: get_feed_tags(feed.feed)), + ('episodes', False, lambda: get_episodes(feed, strip_html)), + ('content_types', False, lambda: get_podcast_types(podcast)), + ) - urls = get_redirects(feed_url) - podcast['urls'] = urls + for name, is_text, func in PROPERTIES: + set_val(podcast, 
name, func, strip_html and is_text) - if 'newlocation' in feed.feed: - new_location = feed.feed.newlocation - podcast['new_location'] = new_location - else: - new_location = '' + return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified - logo_url = get_podcast_logo(feed) - podcast['logo'] = logo_url - if inline_logo and logo_url: - data_uri = get_data_uri(logo_url, scale_to, modified) - if data_uri: - podcast['logo_data'] = data_uri - #update_feed_tags(podcast, get_feed_tags(feed.feed)) +def set_val(obj, name, func, remove_tags): + from utils import remove_html_tags - podcast['episodes'] = [] - for entry in feed.entries: - urls = get_episode_files(entry) - if not urls: - continue - - e = get_episode_metadata(entry, urls, strip_html) - podcast['episodes'].append(e) + val = func() + if remove_tags: val = remove_html_tags(val) + if val is not None: + obj[name] = val - podcast['content_types'] = get_podcast_types(podcast) - return podcast, urls, new_location, last_modified +def get_newlocation(feed): + if 'newlocation' in feed.feed: + return feed.feed.newlocation + else: + return None def get_podcast_logo(feed): @@ -226,7 +112,13 @@ def get_podcast_logo(feed): return cover_art -def get_data_uri(url, size=None, modified_since=None): +def get_data_uri(inline_logo, url, size=None, modified_since=None): + import base64 + from google.appengine.api import images + + if not inline_logo: + return None + url, content, last_modified = urlstore.get_url(url) if last_modified and modified_since and last_modified <= modified: @@ -239,3 +131,124 @@ def get_data_uri(url, size=None, modified_since=None): mimetype = get_mimetype(None, url) encoded = base64.b64encode(content) return 'data:%s;base64,%s' % (mimetype, encoded) + + +def get_feed_tags(feed): + tags = [] + + for tag in feed.get('tags', []): + if tag['term']: + tags.extend([t for t in tag['term'].split(',') if t]) + + if tag['label']: + tags.append(tag['label']) + + return list(set(tags)) + + 
+def get_episodes(feed, strip_html):
+    episodes = []
+    for entry in feed.entries:
+        urls = get_episode_files(entry)
+        if not urls:
+            continue
+
+        e = get_episode_metadata(entry, urls, strip_html)
+        episodes.append(e)
+    return episodes
+
+
+
+def get_episode_files(entry):
+    """Get the download / episode URL of a feedparser entry"""
+
+    urls = {}
+    enclosures = getattr(entry, 'enclosures', [])
+    for enclosure in enclosures:
+        if 'href' in enclosure:
+            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
+            if check_mimetype(mimetype):
+                try:
+                    filesize = int(enclosure['length'])
+                except ValueError:
+                    filesize = None
+                urls[enclosure['href']] = (mimetype, filesize)
+
+    media_content = getattr(entry, 'media_content', [])
+    for media in media_content:
+        if 'url' in media:
+            mimetype = get_mimetype(media.get('type', ''), media['url'])
+            if check_mimetype(mimetype):
+                urls[media['url']] = (mimetype, None)
+
+    links = getattr(entry, 'links', [])
+    for link in links:
+        if not hasattr(link, 'href'):
+            continue
+
+        if youtube.is_video_link(link['href']):
+            urls[link['href']] = ('application/x-youtube', None)
+
+    # XXX: Implement link detection as in gPodder
+
+    return urls
+
+
+def get_episode_metadata(entry, files, strip_html):
+
+    PROPERTIES = (
+        ('title', True, lambda: entry.get('title', entry.get('link', None))),
+        ('description', True, lambda: get_episode_summary(entry)),
+        ('link', False, lambda: entry.get('link', None)),
+        ('author', True, lambda: entry.get('author', entry.get('itunes_author', None))),
+        ('duration', False, lambda: get_duration(entry)),
+        ('language', False, lambda: entry.get('language', None)),
+        ('files', False, lambda: get_files(files)),
+        ('timestamp', False, lambda: get_timestamp(entry)),
+    )
+
+    episode = {}
+    for name, is_text, func in PROPERTIES:
+        set_val(episode, name, func, strip_html and is_text)
+
+    return episode
+
+
+
+def get_episode_summary(entry):
+    for key in ('summary', 'subtitle', 'link'):
+        value = entry.get(key, None)
+        if value:
+            return value
+
+    return None
+
+
+def get_duration(entry):
+    from utils import parse_time
+
+    duration = entry.get('itunes_duration', '')
+    try:
+        return parse_time(duration)
+    except ValueError:
+        return None
+
+
+def get_files(files):
+    f = []
+    for k, v in files.items():
+        entry_file = dict(url=k)
+        if v[0]:
+            entry_file['mimetype'] = v[0]
+        if v[1]:
+            entry_file['filesize'] = v[1]
+        f.append(entry_file)
+    return f
+
+
+def get_timestamp(entry):
+    from datetime import datetime
+    try:
+        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
+    except (AttributeError, TypeError, ValueError):
+        return None
diff --git a/feedservice/mimetype.py b/feedservice/mimetype.py
index 457c08a..5e480a5 100644
--- a/feedservice/mimetype.py
+++ b/feedservice/mimetype.py
@@ -6,9 +6,7 @@ import mimetypes
 
 TYPE_THRESHOLD=.2
 
-_ = lambda s: s
-
-CONTENT_TYPES = (_('image'), _('audio'), _('video'))
+CONTENT_TYPES = ('image', 'audio', 'video')
 
 def get_podcast_types(podcast):
     """Returns the types of a podcast
-- 
2.11.4.GIT