SubredditRules: Style subreddit report form.
[reddit.git] / scripts / inject_test_data.py
blob14d651842015d64701e07e66604a66c94eb2c141
1 # The contents of this file are subject to the Common Public Attribution
2 # License Version 1.0. (the "License"); you may not use this file except in
3 # compliance with the License. You may obtain a copy of the License at
4 # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
5 # License Version 1.1, but Sections 14 and 15 have been added to cover use of
6 # software over a computer network and provide for limited attribution for the
7 # Original Developer. In addition, Exhibit A has been modified to be consistent
8 # with Exhibit B.
10 # Software distributed under the License is distributed on an "AS IS" basis,
11 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
12 # the specific language governing rights and limitations under the License.
14 # The Original Code is reddit.
16 # The Original Developer is the Initial Developer. The Initial Developer of
17 # the Original Code is reddit Inc.
19 # All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
20 # Inc. All Rights Reserved.
21 ###############################################################################
23 from __future__ import division
25 import collections
26 import HTMLParser
27 import itertools
28 import random
29 import string
30 import time
32 import requests
34 from pylons import app_globals as g
36 from r2.lib.db import queries
37 from r2.lib import amqp
38 from r2.lib.utils import weighted_lottery, get_requests_resp_json
39 from r2.lib.voting import cast_vote
40 from r2.models import (
41 Account,
42 Comment,
43 Link,
44 LocalizedDefaultSubreddits,
45 NotFound,
46 register,
47 Subreddit,
48 Vote,
# Shared helper that turns HTML character references (e.g. "&amp;") in a
# string back into the characters they represent.  Built once at import time
# so every caller reuses the same parser instance.
unescape_htmlentities = HTMLParser.HTMLParser().unescape
class TextGenerator(object):
    """A Markov Chain based text mimicker.

    Feed example strings in with add_sample(); generate() then produces a
    new string whose character-to-character transitions follow the
    frequencies observed in those examples.

    Attributes:
        order: longest prefix length (in characters) used when predicting
            the next character.
        starts: Counter of the first `order` characters of every sample.
        start_lengths: for each start, a Counter of the total lengths of
            the samples that began with it.
        models: one table per prefix length 1..order; each maps a prefix to
            a Counter of the characters that followed it.

    """

    def __init__(self, order=8):
        self.order = order
        self.starts = collections.Counter()
        self.start_lengths = collections.defaultdict(collections.Counter)
        self.models = [
            collections.defaultdict(collections.Counter)
            for _ in range(self.order)]

    @staticmethod
    def _in_groups(input_iterable, n):
        """Yield each run of n consecutive items of the input as a tuple.

        Nothing is yielded when the input has fewer than n items.

        """
        # a bounded deque drops the oldest item automatically, giving a
        # sliding window without copying the input
        window = collections.deque(maxlen=n)
        for item in input_iterable:
            window.append(item)
            if len(window) == n:
                yield tuple(window)

    def add_sample(self, sample):
        """Add a sample to the model of text for this generator.

        Samples no longer than `order` characters are ignored since they
        cannot supply both a start and a following character.

        """

        if len(sample) <= self.order:
            return

        start = sample[:self.order]
        self.starts[start] += 1
        self.start_lengths[start][len(sample)] += 1
        for order, model in enumerate(self.models, 1):
            # each group is `order` prefix characters plus the one after
            for chars in self._in_groups(sample, order + 1):
                prefix = "".join(chars[:-1])
                next_char = chars[-1]
                model[prefix][next_char] += 1

    def generate(self):
        """Generate a string similar to samples previously fed in.

        A start and a target length are drawn with weighted_lottery
        (presumably a count-weighted random pick -- confirm against
        r2.lib.utils), then characters are appended one at a time until the
        target length is reached.

        """

        start = weighted_lottery(self.starts)
        desired_length = weighted_lottery(self.start_lengths[start])
        desired_length = max(desired_length, self.order)

        # the model list never changes while generating, so build the
        # longest-prefix-first ordering once instead of per character
        models_by_order = list(reversed(list(enumerate(self.models, 1))))

        generated = list(start)
        while len(generated) < desired_length:
            # try each model, from highest order down, til we find
            # something
            for order, model in models_by_order:
                current_prefix = "".join(generated[-order:])
                # .get() rather than indexing: the models are defaultdicts,
                # and indexing would insert an empty Counter for every
                # unseen prefix, mutating the trained model on each lookup
                frequencies = model.get(current_prefix)
                if frequencies:
                    generated.append(weighted_lottery(frequencies))
                    break
            else:
                # no model knows this prefix; fall back to a random letter.
                # ascii_lowercase is locale-independent, unlike lowercase.
                generated.append(random.choice(string.ascii_lowercase))

        return "".join(generated)
113 def fetch_listing(path, limit=1000, batch_size=100):
114 """Fetch a reddit listing from reddit.com."""
116 session = requests.Session()
117 session.headers.update({
118 "User-Agent": "reddit-test-data-generator/1.0",
121 base_url = "https://api.reddit.com" + path
123 after = None
124 count = 0
125 while count < limit:
126 params = {"limit": batch_size, "count": count}
127 if after:
128 params["after"] = after
130 print "> {}-{}".format(count, count+batch_size)
131 response = session.get(base_url, params=params)
132 response.raise_for_status()
134 listing = get_requests_resp_json(response)["data"]
135 for child in listing["children"]:
136 yield child["data"]
137 count += 1
139 after = listing["after"]
140 if not after:
141 break
143 # obey reddit.com's ratelimits
144 # see: https://github.com/reddit/reddit/wiki/API#rules
145 time.sleep(2)
148 class Modeler(object):
149 def __init__(self):
150 self.usernames = TextGenerator(order=2)
152 def model_subreddit(self, subreddit_name):
153 """Return a model of links and comments in a given subreddit."""
155 subreddit_path = "/r/{}".format(subreddit_name)
156 print ">>>", subreddit_path
158 print ">> Links"
159 titles = TextGenerator(order=5)
160 selfposts = TextGenerator(order=8)
161 link_count = self_count = 0
162 urls = set()
163 for link in fetch_listing(subreddit_path, limit=500):
164 self.usernames.add_sample(link["author"])
165 titles.add_sample(unescape_htmlentities(link["title"]))
166 if link["is_self"]:
167 self_count += 1
168 selfposts.add_sample(unescape_htmlentities(link["selftext"]))
169 else:
170 urls.add(link["url"])
171 link_count += 1
172 self_frequency = self_count / link_count
174 print ">> Comments"
175 comments = TextGenerator(order=8)
176 for comment in fetch_listing(subreddit_path + "/comments"):
177 self.usernames.add_sample(comment["author"])
178 comments.add_sample(unescape_htmlentities(comment["body"]))
180 return SubredditModel(
181 subreddit_name, titles, selfposts, urls, comments, self_frequency)
183 def generate_username(self):
184 """Generate and return a username like those seen on reddit.com."""
185 return self.usernames.generate()
class SubredditModel(object):
    """A snapshot of a subreddit's links and comments.

    Holds one text generator each for titles, self-post bodies and comment
    bodies, the list of link URLs that were seen, and the fraction of links
    that were self posts.

    """

    def __init__(self, name, titles, selfposts, urls, comments, self_frequency):
        self.name = name
        self.selfpost_frequency = self_frequency
        # copied into a list so random.choice can index into it
        self.urls = list(urls)
        self.titles = titles
        self.selfposts = selfposts
        self.comments = comments

    def generate_link_title(self):
        """Generate and return a title like those seen in the subreddit."""
        title = self.titles.generate()
        return title

    def generate_link_url(self):
        """Generate and return a URL from one seen in the subreddit.

        The URL returned may be "self" indicating a self post. This should
        happen with the same frequency it is seen in the modeled subreddit.

        """
        wants_selfpost = random.random() < self.selfpost_frequency
        if not wants_selfpost:
            return random.choice(self.urls)
        return "self"

    def generate_selfpost_body(self):
        """Generate and return a self-post body like seen in the subreddit."""
        body = self.selfposts.generate()
        return body

    def generate_comment_body(self):
        """Generate and return a comment body like seen in the subreddit."""
        body = self.comments.generate()
        return body
def fuzz_number(number):
    """Return a randomized integer scaled from `number`.

    A Beta(2, 8) draw (which lies in [0, 1]) is multiplied by 5 and by
    `number`, and the product is truncated to an int, so for a non-negative
    `number` the result falls between 0 and 5 * number.

    """
    scale = random.betavariate(2, 8) * 5
    return int(scale * number)
228 def ensure_account(name):
229 """Look up or register an account and return it."""
230 try:
231 account = Account._by_name(name)
232 print ">> found /u/{}".format(name)
233 return account
234 except NotFound:
235 print ">> registering /u/{}".format(name)
236 return register(name, "password", "127.0.0.1")
239 def ensure_subreddit(name, author):
240 """Look up or create a subreddit and return it."""
241 try:
242 sr = Subreddit._by_name(name)
243 print ">> found /r/{}".format(name)
244 return sr
245 except NotFound:
246 print ">> creating /r/{}".format(name)
247 sr = Subreddit._new(
248 name=name,
249 title="/r/{}".format(name),
250 author_id=author._id,
251 lang="en",
252 ip="127.0.0.1",
254 sr._commit()
255 return sr
258 def inject_test_data(num_links=25, num_comments=25, num_votes=5):
259 """Flood your reddit install with test data based on reddit.com."""
261 print ">>>> Ensuring configured objects exist"
262 system_user = ensure_account(g.system_user)
263 ensure_account(g.automoderator_account)
264 ensure_subreddit(g.default_sr, system_user)
265 ensure_subreddit(g.takedown_sr, system_user)
266 ensure_subreddit(g.beta_sr, system_user)
267 ensure_subreddit(g.promo_sr_name, system_user)
269 print
270 print
272 print ">>>> Fetching real data from reddit.com"
273 modeler = Modeler()
274 subreddits = [
275 modeler.model_subreddit("pics"),
276 modeler.model_subreddit("videos"),
277 modeler.model_subreddit("askhistorians"),
279 extra_settings = {
280 "pics": {
281 "show_media": True,
283 "videos": {
284 "show_media": True,
288 print
289 print
291 print ">>>> Generating test data"
292 print ">>> Accounts"
293 account_query = Account._query(sort="_date", limit=500, data=True)
294 accounts = [a for a in account_query if a.name != g.system_user]
295 accounts.extend(
296 ensure_account(modeler.generate_username())
297 for i in xrange(50 - len(accounts)))
299 print ">>> Content"
300 things = []
301 for sr_model in subreddits:
302 sr_author = random.choice(accounts)
303 sr = ensure_subreddit(sr_model.name, sr_author)
305 # make the system user subscribed for easier testing
306 if sr.add_subscriber(system_user):
307 sr._incr("_ups", 1)
309 # apply any custom config we need for this sr
310 for setting, value in extra_settings.get(sr.name, {}).iteritems():
311 setattr(sr, setting, value)
312 sr._commit()
314 for i in xrange(num_links):
315 link_author = random.choice(accounts)
316 url = sr_model.generate_link_url()
317 is_self = (url == "self")
318 content = sr_model.generate_selfpost_body() if is_self else url
319 link = Link._submit(
320 is_self=is_self,
321 title=sr_model.generate_link_title(),
322 content=content,
323 author=link_author,
324 sr=sr,
325 ip="127.0.0.1",
327 queries.new_link(link)
328 things.append(link)
330 comments = [None]
331 for i in xrange(fuzz_number(num_comments)):
332 comment_author = random.choice(accounts)
333 comment, inbox_rel = Comment._new(
334 comment_author,
335 link,
336 parent=random.choice(comments),
337 body=sr_model.generate_comment_body(),
338 ip="127.0.0.1",
340 queries.new_comment(comment, inbox_rel)
341 comments.append(comment)
342 things.append(comment)
344 for thing in things:
345 for i in xrange(fuzz_number(num_votes)):
346 direction = random.choice([
347 Vote.DIRECTIONS.up,
348 Vote.DIRECTIONS.unvote,
349 Vote.DIRECTIONS.down,
351 voter = random.choice(accounts)
353 cast_vote(voter, thing, direction)
355 amqp.worker.join()
357 srs = [Subreddit._by_name(n) for n in ("pics", "videos", "askhistorians")]
358 LocalizedDefaultSubreddits.set_global_srs(srs)