From c5886adecb5ab989decf380a1fb729b7c76fa3eb Mon Sep 17 00:00:00 2001 From: =?utf8?q?Stefan=20K=C3=B6gl?= Date: Wed, 26 Jan 2011 08:28:10 +0200 Subject: [PATCH] add tags to feed, code-cleanup/refactoring --- feedservice/feeddownloader.py | 329 ++++++++++++++++++++++-------------------- feedservice/mimetype.py | 4 +- 2 files changed, 172 insertions(+), 161 deletions(-) diff --git a/feedservice/feeddownloader.py b/feedservice/feeddownloader.py index a4c30dc..a61b31e 100644 --- a/feedservice/feeddownloader.py +++ b/feedservice/feeddownloader.py @@ -1,139 +1,20 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # -# This file is part of my.gpodder.org. -# -# my.gpodder.org is free software: you can redistribute it and/or modify it -# under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# my.gpodder.org is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public -# License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with my.gpodder.org. If not, see . 
-# - -import os -import sys -import datetime -import hashlib -import base64 - -import feedparser - -from google.appengine.api import images import urlstore -from utils import parse_time, remove_html_tags import youtube from mimetype import get_mimetype, check_mimetype, get_podcast_types -from urls import get_redirects - - -def get_episode_files(entry): - """Get the download / episode URL of a feedparser entry""" - - urls = {} - enclosures = getattr(entry, 'enclosures', []) - for enclosure in enclosures: - if 'href' in enclosure: - mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href']) - if check_mimetype(mimetype): - try: - filesize = int(enclosure['length']) - except ValueError: - filesize = None - urls[enclosure['href']] = (mimetype, filesize) - - media_content = getattr(entry, 'media_content', []) - for media in media_content: - if 'url' in media: - mimetype = get_mimetype(media.get('type', ''), media['url']) - if check_mimetype(mimetype): - urls[media['url']] = (mimetype, None) - - links = getattr(entry, 'links', []) - for link in links: - if not hasattr(link, 'href'): - continue - - if youtube.is_video_link(link['href']): - urls[link['href']] = ('application/x-youtube', None) - - # XXX: Implement link detection as in gPodder - - return urls - -def get_episode_summary(entry): - for key in ('summary', 'subtitle', 'link'): - value = entry.get(key, None) - if value: - return value - - return '' - -def get_duration(entry): - str = entry.get('itunes_duration', '') - - try: - return parse_time(str) - except ValueError: - return 0 - -def get_feed_tags(feed): - tags = [] - - for tag in feed.get('tags', []): - if tag['term']: - tags.extend([t for t in tag['term'].split(',') if t]) - - if tag['label']: - tags.append(tag['label']) - - return set(tags) - - -def update_feed_tags(podcast, tags): - src = 'feed' - - #delete all tags not found in the feed anymore - #PodcastTag.objects.filter(podcast=podcast, source=src).exclude(tag__in=tags).delete() - - #create 
new found tags - #for tag in tags: - # if not PodcastTag.objects.filter(podcast=podcast, source=src, tag=tag).exists(): - # PodcastTag.objects.get_or_create(podcast=podcast, source=src, tag=tag) - - -def get_episode_metadata(entry, files, strip_html): - d = { - 'title': entry.get('title', entry.get('link', '')), - 'description': get_episode_summary(entry), - 'link': entry.get('link', ''), - 'author': entry.get('author', entry.get('itunes_author', '')), - 'duration': get_duration(entry), - 'language': entry.get('language', ''), - 'files': [ dict(url=k, mimetype=v[0], filesize=v[1]) for (k, v) in files.items()], - } - try: - d['timestamp'] = datetime.datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S') - except: - d['timestamp'] = None - - - if strip_html: - for x in ('title', 'description', 'author'): - d[x] = remove_html_tags(d[x]) - - return d def parse_feeds(feed_urls, *args, **kwargs): + """ + Parses several feeds, specified by feed_urls and returns their JSON + objects and the latest of their modification dates. RSS-Redirects are + followed automatically by including both feeds in the result. 
+ """ + visited_urls = set() result = [] last_modified = None @@ -159,6 +40,15 @@ def parse_feeds(feed_urls, *args, **kwargs): def parse_feed(feed_url, inline_logo, scale_to, strip_html, modified): + """ + Parses a feed and returns its JSON object, a list of urls that refer to + this feed, an outgoing redirect and the timestamp of the last modification + of the feed + """ + + import feedparser + from urls import get_redirects + feed_url, feed_content, last_modified = urlstore.get_url(feed_url) if last_modified and modified and last_modified <= modified: @@ -168,46 +58,42 @@ def parse_feed(feed_url, inline_logo, scale_to, strip_html, modified): feed.feed.link = feed_url podcast = dict() - podcast['title'] = feed.feed.get('title', '') - podcast['link'] = feed.feed.get('link', '') - podcast['description'] = feed.feed.get('subtitle', '') - podcast['author'] = feed.feed.get('author', feed.feed.get('itunes_author', '')) - podcast['language'] = feed.feed.get('language', '') - if strip_html: - for x in ('title', 'description', 'author'): - podcast[x] = remove_html_tags(podcast[x]) + PROPERTIES = ( + ('title', True, lambda: feed.feed.get('title', None)), + ('link', False, lambda: feed.feed.get('link', None)), + ('description', True, lambda: feed.feed.get('subtitle', None)), + ('author', True, lambda: feed.feed.get('author', feed.feed.get('itunes_author', None))), + ('language', False, lambda: feed.feed.get('language', None)), + ('urls', False, lambda: get_redirects(feed_url)), + ('new_location', False, lambda: get_newlocation(feed)), + ('logo', False, lambda: get_podcast_logo(feed)), + ('logo_data', False, lambda: get_data_uri(inline_logo, podcast['logo'], scale_to, modified)), + ('tags', False, lambda: get_feed_tags(feed.feed)), + ('episodes', False, lambda: get_episodes(feed, strip_html)), + ('content_types', False, lambda: get_podcast_types(podcast)), + ) - urls = get_redirects(feed_url) - podcast['urls'] = urls + for name, is_text, func in PROPERTIES: + set_val(podcast, 
name, func, strip_html and is_text) - if 'newlocation' in feed.feed: - new_location = feed.feed.newlocation - podcast['new_location'] = new_location - else: - new_location = '' + return podcast, podcast.get('urls', None), podcast.get('new_location', None), last_modified - logo_url = get_podcast_logo(feed) - podcast['logo'] = logo_url - if inline_logo and logo_url: - data_uri = get_data_uri(logo_url, scale_to, modified) - if data_uri: - podcast['logo_data'] = data_uri - #update_feed_tags(podcast, get_feed_tags(feed.feed)) +def set_val(obj, name, func, remove_tags): + from utils import remove_html_tags - podcast['episodes'] = [] - for entry in feed.entries: - urls = get_episode_files(entry) - if not urls: - continue - - e = get_episode_metadata(entry, urls, strip_html) - podcast['episodes'].append(e) + val = func() + if remove_tags: val = remove_html_tags(val) + if val is not None: + obj[name] = val - podcast['content_types'] = get_podcast_types(podcast) - return podcast, urls, new_location, last_modified +def get_newlocation(feed): + if 'newlocation' in feed.feed: + return feed.feed.newlocation + else: + return None def get_podcast_logo(feed): @@ -226,7 +112,13 @@ def get_podcast_logo(feed): return cover_art -def get_data_uri(url, size=None, modified_since=None): +def get_data_uri(inline_logo, url, size=None, modified_since=None): + import base64 + from google.appengine.api import images + + if not inline_logo: + return None + url, content, last_modified = urlstore.get_url(url) if last_modified and modified_since and last_modified <= modified: @@ -239,3 +131,124 @@ def get_data_uri(url, size=None, modified_since=None): mimetype = get_mimetype(None, url) encoded = base64.b64encode(content) return 'data:%s;base64,%s' % (mimetype, encoded) + + +def get_feed_tags(feed): + tags = [] + + for tag in feed.get('tags', []): + if tag['term']: + tags.extend([t for t in tag['term'].split(',') if t]) + + if tag['label']: + tags.append(tag['label']) + + return list(set(tags)) + + 
+def get_episodes(feed, strip_html):
+    episodes = []
+    for entry in feed.entries:
+        urls = get_episode_files(entry)
+        if not urls:
+            continue
+
+        e = get_episode_metadata(entry, urls, strip_html)
+        episodes.append(e)
+    return episodes
+
+
+
+def get_episode_files(entry):
+    """Get the download / episode URL of a feedparser entry"""
+
+    urls = {}
+    enclosures = getattr(entry, 'enclosures', [])
+    for enclosure in enclosures:
+        if 'href' in enclosure:
+            mimetype = get_mimetype(enclosure.get('type', ''), enclosure['href'])
+            if check_mimetype(mimetype):
+                try:
+                    filesize = int(enclosure['length'])
+                except ValueError:
+                    filesize = None
+                urls[enclosure['href']] = (mimetype, filesize)
+
+    media_content = getattr(entry, 'media_content', [])
+    for media in media_content:
+        if 'url' in media:
+            mimetype = get_mimetype(media.get('type', ''), media['url'])
+            if check_mimetype(mimetype):
+                urls[media['url']] = (mimetype, None)
+
+    links = getattr(entry, 'links', [])
+    for link in links:
+        if not hasattr(link, 'href'):
+            continue
+
+        if youtube.is_video_link(link['href']):
+            urls[link['href']] = ('application/x-youtube', None)
+
+    # XXX: Implement link detection as in gPodder
+
+    return urls
+
+
+def get_episode_metadata(entry, files, strip_html):
+
+    PROPERTIES = (
+        ('title', True, lambda: entry.get('title', entry.get('link', None))),
+        ('description', True, lambda: get_episode_summary(entry)),
+        ('link', False, lambda: entry.get('link', None)),
+        ('author', True, lambda: entry.get('author', entry.get('itunes_author', None))),
+        ('duration', False, lambda: get_duration(entry)),
+        ('language', False, lambda: entry.get('language', None)),
+        ('files', False, lambda: get_files(files)),
+        ('timestamp', False, lambda: get_timestamp(entry)),
+    )
+
+    episode = {}
+    for name, is_text, func in PROPERTIES:
+        set_val(episode, name, func, strip_html and is_text)
+
+    return episode
+
+
+
+def get_episode_summary(entry):
+    for key in ('summary', 'subtitle', 'link'):
+        value = entry.get(key, None)
+        if value:
+            return value
+
+    return None
+
+
+def get_duration(entry):
+    from utils import parse_time
+
+    duration = entry.get('itunes_duration', '')
+    try:
+        return parse_time(duration)
+    except ValueError:
+        return None
+
+
+def get_files(files):
+    f = []
+    for k, v in files.items():
+        entry_file = dict(url=k)
+        if v[0]:
+            entry_file['mimetype'] = v[0]
+        if v[1]:
+            entry_file['filesize'] = v[1]
+        f.append(entry_file)
+    return f
+
+
+def get_timestamp(entry):
+    from datetime import datetime
+    try:
+        return datetime(*(entry.updated_parsed)[:6]).strftime('%Y-%m-%dT%H:%M:%S')
+    except (AttributeError, TypeError, ValueError):
+        return None
diff --git a/feedservice/mimetype.py b/feedservice/mimetype.py
index 457c08a..5e480a5 100644
--- a/feedservice/mimetype.py
+++ b/feedservice/mimetype.py
@@ -6,9 +6,7 @@ import mimetypes
 
 TYPE_THRESHOLD=.2
 
-_ = lambda s: s
-
-CONTENT_TYPES = (_('image'), _('audio'), _('video'))
+CONTENT_TYPES = ('image', 'audio', 'video')
 
 def get_podcast_types(podcast):
     """Returns the types of a podcast
-- 
2.11.4.GIT