Subscribe to Feeds that contain Pubsubhubbub-Hubs
[mygpo-feedservice.git] / feedservice / urlstore.py
blobc952ea90c4172f752a2373589a81a27bd34ebc3e
1 from datetime import datetime, timedelta
2 import time
3 import urllib2
4 from email import utils
6 from google.appengine.ext import db
7 from google.appengine.api import memcache
# Value of the User-Agent header that fetch_url() adds to outgoing requests
10 USER_AGENT = 'mygpo-feedservice +http://mygpo-feedservice.appspot.com/'
class URLObject(db.Model):
    """
    Cached copy of a fetched URL together with its HTTP caching metadata
    (ETag, expiry time and modification time)
    """

    url = db.StringProperty(required=True)
    # NOTE(review): db.Blob is a value type, not a Property -- presumably
    # db.BlobProperty() was meant if content should be persisted; confirm
    content = db.Blob()
    etag = db.StringProperty(required=False)
    expires = db.DateTimeProperty(required=False)
    modified = db.DateTimeProperty(required=False)

    def expired(self):
        """
        Tells whether an expiry time is set and has already passed

        If no expiry time is set, the falsy value itself is returned.
        """
        if not self.expires:
            return self.expires
        return self.expires <= datetime.utcnow()

    def valid(self):
        """
        Tells whether the object carries any content at all
        """
        size = len(self.content)
        return size > 0

    def __repr__(self):
        fields = (self.url, self.etag, self.expires, self.modified)
        return '%s (%s, %s, %s)' % fields
def get_url(url, use_cache=True):
    """
    Returns (url, content, modified) for the given URL

    The cached object is used if caching is enabled and the object is
    present, not expired and valid; otherwise the URL is fetched (passing
    along whatever cached object was found, if any).
    """

    entry = None
    if use_cache:
        entry = from_cache(url)

    # same checks, in the same order, as a single positive condition
    usable = entry and not entry.expired() and entry.valid()
    if usable:
        return entry.url, entry.content, entry.modified

    return fetch_url(url, entry)
def from_cache(url):
    """
    Looks up the object stored for the given URL

    Only Memcache is consulted here; the result of memcache.get() is
    returned unchanged.
    """
    entry = memcache.get(url)
    return entry
53 def fetch_url(url, cached=None, add_expires=timedelta()):
54 """
55 Fetches the given URL and stores the resulting object in the Cache
56 """
58 request = urllib2.Request(url)
59 request.add_header('User-Agent', USER_AGENT)
60 opener = urllib2.build_opener()
62 if getattr(cached, 'modified', False):
63 lm_str = utils.formatdate(time.mktime(cached.modified.timetuple()))
64 request.add_header('If-Modified-Since', lm_str)
66 if getattr(cached, 'etag', False):
67 request.add_header('If-None-Match', cached.etag)
69 try:
70 r = opener.open(request)
71 obj = cached or URLObject(url=url)
72 obj.content = r.read()
73 obj.expires = parse_header_date(r.headers.dict.get('expires', None)) + add_expires
74 obj.modified = parse_header_date(r.headers.dict.get('last-modified', None))
75 obj.etag = r.headers.dict.get('etag', None)
77 memcache.set(url, obj)
79 except urllib2.HTTPError, e:
80 if e.code == 304:
81 obj = cached
82 pass
83 else:
84 raise
86 return obj.url, obj.content, obj.modified
def parse_header_date(date_str):
    """
    Parses dates in RFC2822 format to datetime objects

    Returns a naive datetime expressed in UTC, or None if date_str is
    empty or cannot be parsed (e.g. the commonly sent "Expires: 0").
    """
    if not date_str:
        return None

    # parsedate_tz() yields None for malformed input instead of raising
    parsed = utils.parsedate_tz(date_str)
    if parsed is None:
        return None

    try:
        # mktime_tz() honours the zone given in the header, whereas
        # time.mktime() would interpret the time as local time
        ts = utils.mktime_tz(parsed)
        return datetime.utcfromtimestamp(ts)
    except (ValueError, OverflowError):
        # date lies outside the range the platform can represent
        return None