reddit.sh: Install plugins before attempting first startup
[reddit.git] / scripts / tracker.py
blob8435f13941160415a6d05386fb895ee9e017762d
1 #!/usr/bin/python
2 # The contents of this file are subject to the Common Public Attribution
3 # License Version 1.0. (the "License"); you may not use this file except in
4 # compliance with the License. You may obtain a copy of the License at
5 # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
6 # License Version 1.1, but Sections 14 and 15 have been added to cover use of
7 # software over a computer network and provide for limited attribution for the
8 # Original Developer. In addition, Exhibit A has been modified to be consistent
9 # with Exhibit B.
11 # Software distributed under the License is distributed on an "AS IS" basis,
12 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
13 # the specific language governing rights and limitations under the License.
15 # The Original Code is reddit.
17 # The Original Developer is the Initial Developer. The Initial Developer of
18 # the Original Code is reddit Inc.
20 # All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
21 # Inc. All Rights Reserved.
22 ###############################################################################
23 """
24 This is a tiny Flask app used for a couple of self-serve ad tracking
25 mechanisms. The URLs it provides are:
27 /click
29 Promoted links have their URL replaced with a /click URL by the JS
30 (after a call to /fetch-trackers). Redirect to the actual URL after logging
31 the click. This must be run in a place whose logs are stored for traffic
32 analysis.
34 For convenience, the script can compile itself into a Zip archive suitable for
35 use on Amazon Elastic Beanstalk (and possibly other systems).
37 """
40 import cStringIO
41 import os
42 import hashlib
43 import hmac
44 import time
45 import urllib
46 from urlparse import parse_qsl, urlparse, urlunparse
48 from ConfigParser import RawConfigParser
49 from wsgiref.handlers import format_date_time
51 from flask import Flask, request, json, make_response, abort, redirect
# The Flask application object; the route decorators below register their
# handlers on it.
application = Flask(__name__)
# Packages listed in the requirements.txt that is written into the
# deployment zip when this script is run directly.
# NOTE(review): the closing bracket was missing from the extracted source and
# has been restored here -- confirm no further entries were dropped with it.
REQUIRED_PACKAGES = [
    "flask",
]
class ApplicationConfig(object):
    """Config reader that keeps track of every setting it was asked for.

    Settings are looked up in the file named by the CONFIG environment
    variable (falling back to "production.ini").  Each value that is read
    gets mirrored into a second parser, so a minimal config file holding only
    those settings can be emitted when building the Elastic Beanstalk zipfile.

    """
    def __init__(self):
        path = os.environ.get("CONFIG", "production.ini")
        source = RawConfigParser()
        with open(path) as config_file:
            source.readfp(config_file)

        self.input = source
        self.output = RawConfigParser()

    def get(self, section, key):
        """Return the value of `key` in `section` and record that it was used."""
        value = self.input.get(section, key)

        # mirror the lookup into the output parser; the DEFAULT section is
        # never added explicitly
        is_default = section.upper() == "DEFAULT"
        if not is_default and not self.output.has_section(section):
            self.output.add_section(section)
        self.output.set(section, key, value)

        return value

    def to_config(self):
        """Return the recorded settings serialized in config-file format."""
        buf = cStringIO.StringIO()
        self.output.write(buf)
        return buf.getvalue()
# Load the configuration once at import time.  Every value read through
# config.get() is also recorded, so config.to_config() can later emit a
# config file containing just these settings.
config = ApplicationConfig()
# key for the HMAC-SHA1 that authenticates /click URLs
tracking_secret = config.get('DEFAULT', 'tracking_secret')
# joined as "<domain_prefix>.<domain>" to form the host that /event_redirect
# sends users to
reddit_domain = config.get('DEFAULT', 'domain')
reddit_domain_prefix = config.get('DEFAULT', 'domain_prefix')
@application.route("/")
def healthcheck():
    """Answer requests for the root URL with a fixed status message."""
    status_message = "I am healthy."
    return status_message
@application.route('/click')
def click_redirect():
    """Verify a signed click URL and redirect to its destination.

    Reads the `url`, `id` and `hash` query arguments.  `hash` must equal the
    hex HMAC-SHA1 of `url` followed by `id`, keyed with `tracking_secret`;
    otherwise the request is aborted with a 403.  On success the response is
    a non-cacheable redirect to `url`.

    """
    destination = request.args['url'].encode('utf-8')
    fullname = request.args['id'].encode('utf-8')
    observed_mac = request.args['hash']

    # the MAC covers the destination URL immediately followed by the id
    expected_hashable = ''.join((destination, fullname))
    expected_mac = hmac.new(
        tracking_secret, expected_hashable, hashlib.sha1).hexdigest()

    # compare without short-circuiting on the first differing character
    if not constant_time_compare(expected_mac, observed_mac):
        abort(403)

    # fix encoding in the query string of the destination
    # NOTE(review): the extracted source lost its indentation; this assumes
    # the destination is only rewritten when a query string is present --
    # confirm against the original file.
    u = urlparse(destination)
    if u.query:
        u = _fix_query_encoding(u)
        destination = u.geturl()

    return _redirect_nocache(destination)
@application.route('/event_redirect')
def event_redirect():
    """Redirect to the requested URL, pinned to the configured reddit host.

    Only the path, params, query and fragment of the `url` argument are
    honoured: the scheme is forced to https and the host is replaced with
    "<domain_prefix>.<domain>" from the config.

    """
    requested = request.args['url'].encode('utf-8')

    # Parse and avoid open redirects
    reddit_host = "%s.%s" % (reddit_domain_prefix, reddit_domain)
    target = urlparse(requested)._replace(netloc=reddit_host, scheme="https")

    if target.query:
        target = _fix_query_encoding(target)

    return _redirect_nocache(target.geturl())
@application.route('/event_click')
def event_click():
    """Take in an evented request, append session data to payload, and redirect.

    This is only useful for situations in which we're navigating from a request
    that does not have session information - i.e. served from redditmedia.com.
    If we want to track a click and the user that did so from these pages,
    we need to identify the user before sending the payload.

    The user id is taken from the first comma-separated field of the
    `reddit_session` cookie.  When it is present and the `data` argument is a
    JSON object, `user_id` is added to that object.  In every other case the
    arguments are passed through unchanged, so a missing or malformed payload
    never blocks the redirect.

    Note: If we add hmac validation, this will need to verify and re-sign
    before redirecting. We can also probably drop a redirect here once we're
    not relying on log files for event tracking and have a proper events
    endpoint.

    """
    try:
        session_str = urllib.unquote(request.cookies.get('reddit_session', ''))
        user_id = int(session_str.split(',')[0])
    except ValueError:
        # no cookie, or a cookie that does not start with a numeric id
        user_id = None

    args = request.args.to_dict()
    payload = args.get('data')
    # a request without a `data` argument has nothing to annotate
    if user_id and payload is not None:
        try:
            payload_json = json.loads(payload.encode('utf-8'))
        except ValueError:
            # if we fail to load the JSON, continue on to the redirect to not
            # block the user - ETL can deal with/report the malformed data.
            pass
        else:
            # only a JSON object can carry the extra key; any other JSON
            # value is forwarded untouched like malformed data
            if isinstance(payload_json, dict):
                payload_json['user_id'] = user_id
                args['data'] = json.dumps(payload_json)

    # urllib.urlencode calls str() on each key and value, which raises
    # UnicodeEncodeError for non-ASCII unicode; hand it UTF-8 bytes instead.
    query = [
        (key.encode('utf-8') if isinstance(key, unicode) else key,
         value.encode('utf-8') if isinstance(value, unicode) else value)
        for key, value in args.items()
    ]

    return _redirect_nocache('/event_redirect?%s' % urllib.urlencode(query))
def _fix_query_encoding(parse_result):
    """Return `parse_result` with its query string re-encoded.

    The query is split into key/value pairs (blank values are kept) and then
    serialized again, which effectively applies urllib.quote_plus to every
    key and value.

    """
    pairs = parse_qsl(parse_result.query, keep_blank_values=True)
    reencoded_query = urllib.urlencode(pairs)
    return parse_result._replace(query=reencoded_query)
def _redirect_nocache(destination):
    """Return a redirect to `destination` that is marked as non-cacheable.

    Caching is disabled through the Cache-control and Pragma headers, and the
    Date and Expires headers are both set to the current time.

    """
    response = redirect(destination)
    timestamp = format_date_time(time.time())
    no_cache_headers = (
        ('Cache-control', 'no-cache'),
        ('Pragma', 'no-cache'),
        ('Date', timestamp),
        ('Expires', timestamp),
    )
    for header, value in no_cache_headers:
        response.headers[header] = value
    return response
192 # copied from r2.lib.utils
def constant_time_compare(actual, expected):
    """
    Returns True if the two strings are equal, False otherwise

    The time taken is dependent on the number of characters provided
    instead of the number of characters that match.
    """
    actual_len = len(actual)
    expected_len = len(expected)
    # a length mismatch makes the result non-zero from the start, while the
    # loop below still visits every character of `actual`
    result = actual_len ^ expected_len
    if expected_len > 0:
        for i, char in enumerate(actual):
            # wrap around `expected` so indexing never runs past its end
            result |= ord(char) ^ ord(expected[i % expected_len])
    return result == 0
if __name__ == "__main__":
    # package up for elastic beanstalk
    import zipfile

    # the archive holds this script (as application.py), a config file with
    # only the settings that were read, and the pip requirements
    requirements = "\n".join(REQUIRED_PACKAGES) + "\n"
    with zipfile.ZipFile(
            "/tmp/tracker.zip", "w", zipfile.ZIP_DEFLATED) as archive:
        archive.write(__file__, "application.py")
        archive.writestr("production.ini", config.to_config())
        archive.writestr("requirements.txt", requirements)