add option use_cache to bypass feed-cache
[mygpo-feedservice.git] / feedservice / urlstore.py
bloba01646c0321845e1a7f5f3d7f67896e35251cef6
1 from datetime import datetime
2 import time
3 import urllib2
4 from email import utils
6 from google.appengine.ext import db
7 from google.appengine.api import memcache
10 USER_AGENT = 'mygpo-feedservice +http://mygpo-feedservice.appspot.com/'
class URLObject(db.Model):
    """
    Datastore model holding the fetched content of a URL together with
    the HTTP caching metadata (ETag, Expires, Last-Modified) of the response
    """

    url = db.StringProperty(required=True)
    # Must be a Property so that the content is actually persisted;
    # db.Blob is only the value type and would be a plain class attribute.
    content = db.BlobProperty()
    etag = db.StringProperty(required=False)
    expires = db.DateTimeProperty(required=False)
    modified = db.DateTimeProperty(required=False)

    def expired(self):
        """
        Returns True if an expiry date is set and lies in the past (UTC).
        Objects without an expiry date are never considered expired.
        """
        return bool(self.expires and self.expires <= datetime.utcnow())

    def valid(self):
        """
        Returns True if the object holds non-empty content
        """
        # content is None for entities stored without content, so rely on
        # truthiness instead of len()
        return bool(self.content)

    def __repr__(self):
        return '%s (%s, %s, %s)' % (self.url, self.etag, self.expires, self.modified)
def get_url(url, use_cache=True):
    """
    Returns a (url, content, modified) tuple for the given URL.

    With use_cache enabled the cache (see from_cache) is consulted first;
    the URL is fetched when there is no cached object, or when the cached
    object is expired or has no content. With use_cache disabled the URL
    is always fetched.
    """

    entry = None
    if use_cache:
        entry = from_cache(url)

    # a cache entry is served only if it exists, is still fresh and
    # actually carries content
    if entry and not entry.expired() and entry.valid():
        return entry.url, entry.content, entry.modified

    return fetch_url(url, entry)
def from_cache(url):
    """
    Looks up the stored object for the given URL, first in Memcache and,
    if nothing (truthy) is found there, in the Datastore.

    Returns the object, or None if neither store has an entry.
    """
    hit = memcache.get(url)
    if hit:
        return hit

    query = URLObject.all().filter('url =', url)
    return query.get()
54 def fetch_url(url, cached=None):
55 """
56 Fetches the given URL and stores the resulting object in the Cache
57 """
59 request = urllib2.Request(url)
60 request.add_header('User-Agent', USER_AGENT)
61 opener = urllib2.build_opener()
63 if getattr(cached, 'modified', False):
64 lm_str = utils.formatdate(time.mktime(cached.modified.timetuple()))
65 request.add_header('If-Modified-Since', lm_str)
67 if getattr(cached, 'etag', False):
68 request.add_header('If-None-Match', cached.etag)
70 try:
71 r = opener.open(request)
72 obj = cached or URLObject(url=url)
73 obj.content = r.read()
74 obj.expires = parse_header_date(r.headers.dict.get('expires', None))
75 obj.modified = parse_header_date(r.headers.dict.get('last-modified', None))
76 obj.etag = r.headers.dict.get('etag', None)
78 obj.put()
79 memcache.set(url, obj)
81 except urllib2.HTTPError, e:
82 if e.code == 304:
83 obj = cached
84 pass
85 else:
86 raise
88 return obj.url, obj.content, obj.modified
def parse_header_date(date_str):
    """
    Parses dates in RFC2822 format to datetime objects

    Returns a naive datetime expressed in UTC, or None if date_str is
    empty or cannot be parsed as a date (e.g. the common "Expires: 0").
    """
    if not date_str:
        return None

    # parsedate_tz keeps the UTC offset that parsedate would silently drop,
    # and returns None for values that are not dates
    parsed = utils.parsedate_tz(date_str)
    if parsed is None:
        return None

    try:
        # mktime_tz applies the offset and yields a UTC timestamp, whereas
        # time.mktime would interpret the fields as local time
        ts = utils.mktime_tz(parsed)
        return datetime.utcfromtimestamp(ts)
    except (ValueError, OverflowError):
        # well-formed but out-of-range date
        return None