include requested URL "as-is" in response
[mygpo-feedservice.git] / feedservice / urls.py
blob69e67b26bffb2a373212ac6094a122f746c876e6
1 import urllib2
2 import urlparse
class RedirectCollector(urllib2.HTTPRedirectHandler):
    """Redirect handler that remembers every redirect target it is asked to follow.

    The targets are stored, in the order they were seen, in ``self.urls``.
    """

    def __init__(self, *args, **kwargs):
        # Constructor arguments are accepted but not used.
        self.urls = list()

    def redirect_request(self, req, fp, code, msg, hdrs, newurl):
        """Record ``newurl``, then let the stock handler build the follow-up request."""
        # Record first, so the target is kept even if the parent refuses it.
        self.urls += [newurl]
        parent = urllib2.HTTPRedirectHandler
        return parent.redirect_request(self, req, fp, code, msg, hdrs, newurl)
def get_redirects(url):
    """Return the complete redirect chain, starting from ``url``.

    The URL is opened with a RedirectCollector installed, and every URL
    collected (the requested one plus each redirect target) is passed
    through basic_sanitizing. If sanitizing changed the requested URL,
    the un-sanitized ``url`` is additionally placed first so that the
    result can be matched to the request.

    Errors raised while opening the URL are not caught here.
    """
    collector = RedirectCollector()
    collector.urls.append(url)
    opener = urllib2.build_opener(collector)

    # Only the redirects are of interest, so release the connection
    # right away instead of leaving the response open.
    response = opener.open(url)
    response.close()

    # A list comprehension (rather than map) guarantees an indexable list.
    urls = [basic_sanitizing(u) for u in collector.urls]

    # include the un-sanitized URL for easy matching of
    # response to request URLs
    if urls[0] != url:
        urls.insert(0, url)

    return urls
def basic_sanitizing(url):
    """
    does basic sanitizing through urlparse and additionally converts the
    host part of the netloc to lowercase

    Any "user:password@" prefix of the netloc is kept as-is, because
    credentials are case-sensitive. An empty path is replaced by '/'.
    """
    r = urlparse.urlsplit(url)

    # Split at the last '@' so that only host[:port] gets lowercased;
    # without an '@' the first two parts are empty strings.
    userinfo, sep, hostport = r.netloc.rpartition('@')
    netloc = userinfo + sep + hostport.lower()

    r2 = urlparse.SplitResult(r.scheme, netloc, r.path or '/', r.query, r.fragment)
    return r2.geturl()