Update README for archival
[reddit.git] / scripts / inject_test_data.py
blob db5f96e57884a49ae42d9a67c7f911d1fff55919
1 # The contents of this file are subject to the Common Public Attribution
2 # License Version 1.0. (the "License"); you may not use this file except in
3 # compliance with the License. You may obtain a copy of the License at
4 # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
5 # License Version 1.1, but Sections 14 and 15 have been added to cover use of
6 # software over a computer network and provide for limited attribution for the
7 # Original Developer. In addition, Exhibit A has been modified to be consistent
8 # with Exhibit B.
10 # Software distributed under the License is distributed on an "AS IS" basis,
11 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
12 # the specific language governing rights and limitations under the License.
14 # The Original Code is reddit.
16 # The Original Developer is the Initial Developer. The Initial Developer of
17 # the Original Code is reddit Inc.
19 # All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
20 # Inc. All Rights Reserved.
21 ###############################################################################
23 from __future__ import division
25 import collections
26 import HTMLParser
27 import itertools
28 import random
29 import string
30 import time
32 import requests
34 from pylons import app_globals as g
36 from r2.lib.db import queries
37 from r2.lib import amqp
38 from r2.lib.utils import weighted_lottery, get_requests_resp_json
39 from r2.lib.voting import cast_vote
40 from r2.models import (
41 Account,
42 Comment,
43 Link,
44 LocalizedDefaultSubreddits,
45 LocalizedFeaturedSubreddits,
46 NotFound,
47 register,
48 Subreddit,
49 Vote,
53 unescape_htmlentities = HTMLParser.HTMLParser().unescape
class TextGenerator(object):
    """A Markov Chain based text mimicker.

    Samples are fed in with `add_sample`; `generate` then produces a new
    string whose character transitions follow the frequencies observed in
    those samples.  Models of every order from 1 up to `order` are kept so
    generation can back off to a shorter prefix when a longer one was never
    seen.
    """

    def __init__(self, order=8):
        self.order = order
        # how many samples began with each `order`-character opening
        self.starts = collections.Counter()
        # opening -> Counter of the full lengths of samples with that opening
        self.start_lengths = collections.defaultdict(collections.Counter)
        # models[k - 1] maps a k-character prefix to a Counter of the
        # characters that followed it
        self.models = [
            collections.defaultdict(collections.Counter)
            for _ in range(self.order)]

    @staticmethod
    def _in_groups(input_iterable, n):
        """Yield every run of `n` consecutive items as a tuple.

        For "abcd" and n=2 this yields ("a", "b"), ("b", "c"), ("c", "d").
        Nothing is yielded when the input holds fewer than `n` items.
        """
        if n < 1:
            return

        # a bounded deque drops the oldest item on its own once it is full,
        # so it always holds the most recent `n` items
        window = collections.deque(maxlen=n)
        for item in input_iterable:
            window.append(item)
            if len(window) == n:
                yield tuple(window)

    def add_sample(self, sample):
        """Add a sample to the model of text for this generator.

        Samples no longer than `order` characters are ignored: they cannot
        supply both a full-length opening and a following character.
        """

        if len(sample) <= self.order:
            return

        start = sample[:self.order]
        self.starts[start] += 1
        self.start_lengths[start][len(sample)] += 1
        for order, model in enumerate(self.models, 1):
            for chars in self._in_groups(sample, order+1):
                prefix = "".join(chars[:-1])
                next_char = chars[-1]
                model[prefix][next_char] += 1

    def generate(self):
        """Generate a string similar to samples previously fed in.

        An opening and a target length are drawn with `weighted_lottery`
        from what `add_sample` recorded, then characters are appended one at
        a time until the target length is reached.
        """

        start = weighted_lottery(self.starts)
        desired_length = weighted_lottery(self.start_lengths[start])
        desired_length = max(desired_length, self.order)

        generated = list(start)
        while len(generated) < desired_length:
            # try each model, from highest order down, til we find
            # something
            for order in range(self.order, 0, -1):
                model = self.models[order - 1]
                current_prefix = "".join(generated[-order:])
                # .get() rather than indexing: the models are defaultdicts,
                # and indexing would insert an empty Counter for every
                # unseen prefix, growing the model during generation
                frequencies = model.get(current_prefix)
                if frequencies:
                    generated.append(weighted_lottery(frequencies))
                    break
            else:
                # no model knows this prefix at all; fall back to a random
                # letter.  ascii_lowercase is used because string.lowercase
                # varies with the process locale.
                generated.append(random.choice(string.ascii_lowercase))

        return "".join(generated)
def fetch_listing(path, limit=1000, batch_size=100):
    """Fetch a reddit listing from reddit.com.

    This is a generator.  It requests `path` from api.reddit.com in pages of
    `batch_size`, following the listing's "after" cursor, and yields the
    "data" dict of each child.  It stops after exactly `limit` items have
    been yielded or when the listing reports no further page, whichever
    comes first.

    An HTTP error status raises via `response.raise_for_status()`.
    """

    base_url = "https://api.reddit.com" + path

    # the context manager closes the session when the generator finishes,
    # fails or is discarded
    with requests.Session() as session:
        session.headers.update({
            "User-Agent": "reddit-test-data-generator/1.0",
        })

        after = None
        count = 0
        while count < limit:
            params = {"limit": batch_size, "count": count}
            if after:
                params["after"] = after

            print("> {}-{}".format(count, count+batch_size))
            response = session.get(base_url, params=params)
            response.raise_for_status()

            listing = get_requests_resp_json(response)["data"]
            for child in listing["children"]:
                yield child["data"]
                count += 1
                if count >= limit:
                    # stop mid-page so callers never receive more than
                    # `limit` items, and skip the pointless final sleep
                    return

            after = listing["after"]
            if not after:
                break

            # obey reddit.com's ratelimits
            # see: https://github.com/reddit/reddit/wiki/API#rules
            time.sleep(2)
class Modeler(object):
    """Builds text models of subreddits from listings fetched off reddit.com.

    Every author name seen while modeling any subreddit feeds one shared
    username generator.
    """

    def __init__(self):
        self.usernames = TextGenerator(order=2)

    def model_subreddit(self, subreddit_name):
        """Return a model of links and comments in a given subreddit."""

        subreddit_path = "/r/{}".format(subreddit_name)
        print(">>> {}".format(subreddit_path))

        print(">> Links")
        title_model = TextGenerator(order=5)
        selfpost_model = TextGenerator(order=8)
        seen_urls = set()
        total_links = 0
        self_links = 0
        for post in fetch_listing(subreddit_path, limit=500):
            total_links += 1
            self.usernames.add_sample(post["author"])
            title_model.add_sample(unescape_htmlentities(post["title"]))

            if not post["is_self"]:
                seen_urls.add(post["url"])
                continue

            self_links += 1
            selfpost_model.add_sample(
                unescape_htmlentities(post["selftext"]))

        # true division: this module imports `division` from __future__
        self_frequency = self_links / total_links

        print(">> Comments")
        comment_model = TextGenerator(order=8)
        for reply in fetch_listing(subreddit_path + "/comments"):
            self.usernames.add_sample(reply["author"])
            comment_model.add_sample(unescape_htmlentities(reply["body"]))

        return SubredditModel(
            subreddit_name,
            title_model,
            selfpost_model,
            seen_urls,
            comment_model,
            self_frequency,
        )

    def generate_username(self):
        """Generate and return a username like those seen on reddit.com."""
        username = self.usernames.generate()
        return username
class SubredditModel(object):
    """A snapshot of a subreddit's links and comments."""

    def __init__(
            self, name, titles, selfposts, urls, comments, self_frequency):
        self.name = name

        # text generators for each kind of content
        self.titles = titles
        self.selfposts = selfposts
        self.comments = comments

        # copied into a list so random.choice can index into it
        self.urls = list(urls)
        self.selfpost_frequency = self_frequency

    def generate_link_title(self):
        """Generate and return a title like those seen in the subreddit."""
        title = self.titles.generate()
        return title

    def generate_link_url(self):
        """Generate and return a URL from one seen in the subreddit.

        The URL returned may be "self" indicating a self post. This should
        happen with the same frequency it is seen in the modeled subreddit.

        """
        wants_selfpost = random.random() < self.selfpost_frequency
        return "self" if wants_selfpost else random.choice(self.urls)

    def generate_selfpost_body(self):
        """Generate and return a self-post body like seen in the subreddit."""
        body = self.selfposts.generate()
        return body

    def generate_comment_body(self):
        """Generate and return a comment body like seen in the subreddit."""
        body = self.comments.generate()
        return body
def fuzz_number(number):
    """Return a randomized integer derived from `number`.

    A Beta(2, 8) draw is multiplied by 5 and by `number`, then truncated
    with `int`.
    """
    draw = random.betavariate(2, 8)
    return int(draw * 5 * number)
def ensure_account(name):
    """Look up or register an account and return it."""
    try:
        existing = Account._by_name(name)
    except NotFound:
        # no such account yet: register one from localhost with a fixed
        # password
        print(">> registering /u/{}".format(name))
        return register(name, "password", "127.0.0.1")

    print(">> found /u/{}".format(name))
    return existing
def ensure_subreddit(name, author):
    """Look up or create a subreddit and return it.

    A newly created subreddit is titled "/r/<name>", is attributed to
    `author`, and is committed before being returned.
    """
    try:
        existing = Subreddit._by_name(name)
    except NotFound:
        pass
    else:
        print(">> found /r/{}".format(name))
        return existing

    print(">> creating /r/{}".format(name))
    created = Subreddit._new(
        name=name,
        title="/r/{}".format(name),
        author_id=author._id,
        lang="en",
        ip="127.0.0.1",
    )
    created._commit()
    return created
def inject_test_data(num_links=25, num_comments=25, num_votes=5):
    """Flood your reddit install with test data based on reddit.com.

    `num_links` links are submitted to each modeled subreddit.
    `num_comments` and `num_votes` are passed through `fuzz_number`, so the
    number of comments per link and votes per thing varies randomly.
    """

    print(">>>> Ensuring configured objects exist")
    system_user = ensure_account(g.system_user)
    ensure_account(g.automoderator_account)
    ensure_subreddit(g.default_sr, system_user)
    ensure_subreddit(g.takedown_sr, system_user)
    ensure_subreddit(g.beta_sr, system_user)
    ensure_subreddit(g.promo_sr_name, system_user)

    print("")
    print("")

    print(">>>> Fetching real data from reddit.com")
    modeler = Modeler()
    subreddits = [
        modeler.model_subreddit(sr_name)
        for sr_name in ("pics", "videos", "askhistorians")
    ]
    # per-subreddit attributes applied on top of the defaults
    extra_settings = {
        "pics": {
            "show_media": True,
        },
        "videos": {
            "show_media": True,
        },
    }

    print("")
    print("")

    print(">>>> Generating test data")
    print(">>> Accounts")
    account_query = Account._query(sort="_date", limit=500, data=True)
    accounts = []
    for candidate in account_query:
        if candidate.name != g.system_user:
            accounts.append(candidate)

    # top the pool up to 50 accounts using generated usernames
    shortfall = 50 - len(accounts)
    for _ in range(shortfall):
        accounts.append(ensure_account(modeler.generate_username()))

    print(">>> Content")
    things = []
    for sr_model in subreddits:
        sr_author = random.choice(accounts)
        sr = ensure_subreddit(sr_model.name, sr_author)

        # make the system user subscribed for easier testing
        newly_subscribed = sr.add_subscriber(system_user)
        if newly_subscribed:
            sr._incr("_ups", 1)

        # apply any custom config we need for this sr
        overrides = extra_settings.get(sr.name, {})
        for setting, value in overrides.items():
            setattr(sr, setting, value)
        sr._commit()

        for _ in range(num_links):
            link_author = random.choice(accounts)
            url = sr_model.generate_link_url()
            is_self = (url == "self")
            if is_self:
                content = sr_model.generate_selfpost_body()
            else:
                content = url

            link = Link._submit(
                is_self=is_self,
                title=sr_model.generate_link_title(),
                content=content,
                author=link_author,
                sr=sr,
                ip="127.0.0.1",
            )
            queries.new_link(link)
            things.append(link)

            # None stands for "reply to the link itself"; each new comment
            # then becomes a possible parent for later ones
            possible_parents = [None]
            for _ in range(fuzz_number(num_comments)):
                comment_author = random.choice(accounts)
                comment, inbox_rel = Comment._new(
                    comment_author,
                    link,
                    parent=random.choice(possible_parents),
                    body=sr_model.generate_comment_body(),
                    ip="127.0.0.1",
                )
                queries.new_comment(comment, inbox_rel)
                possible_parents.append(comment)
                things.append(comment)

    for thing in things:
        for _ in range(fuzz_number(num_votes)):
            direction = random.choice([
                Vote.DIRECTIONS.up,
                Vote.DIRECTIONS.unvote,
                Vote.DIRECTIONS.down,
            ])
            voter = random.choice(accounts)

            cast_vote(voter, thing, direction)

    # NOTE(review): presumably blocks until queued AMQP work has been
    # flushed -- confirm against r2.lib.amqp
    amqp.worker.join()

    default_srs = [
        Subreddit._by_name(sr_name)
        for sr_name in ("pics", "videos", "askhistorians")
    ]
    LocalizedDefaultSubreddits.set_global_srs(default_srs)
    featured_srs = [Subreddit._by_name("pics")]
    LocalizedFeaturedSubreddits.set_global_srs(featured_srs)