# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
# Inc. All Rights Reserved.
###############################################################################

from __future__ import division

import collections
import HTMLParser
import itertools
import random
import string
import time

import requests

from pylons import app_globals as g

from r2.lib.db import queries
from r2.lib import amqp
from r2.lib.utils import weighted_lottery, get_requests_resp_json
from r2.lib.voting import cast_vote
from r2.models import (
    Account,
    Comment,
    Link,
    LocalizedDefaultSubreddits,
    NotFound,
    register,
    Subreddit,
    Vote,
)


unescape_htmlentities = HTMLParser.HTMLParser().unescape


class TextGenerator(object):
    """A Markov Chain based text mimicker."""

    def __init__(self, order=8):
        self.order = order
        self.starts = collections.Counter()
        self.start_lengths = collections.defaultdict(collections.Counter)
        # one model per order: models[i] maps an (i+1)-character prefix to a
        # Counter of characters observed to follow that prefix
        self.models = [
            collections.defaultdict(collections.Counter)
            for i in xrange(self.order)]

    @staticmethod
    def _in_groups(input_iterable, n):
        # produce sliding windows of n consecutive items by teeing the
        # iterable n times and advancing each copy by its offset
        iterables = itertools.tee(input_iterable, n)
        for offset, iterable in enumerate(iterables):
            for _ in xrange(offset):
                next(iterable)
        return itertools.izip(*iterables)

    def add_sample(self, sample):
        """Add a sample to the model of text for this generator."""

        # samples no longer than the order can't even provide a start sequence
        if len(sample) <= self.order:
            return

        start = sample[:self.order]
        self.starts[start] += 1
        self.start_lengths[start][len(sample)] += 1
        for order, model in enumerate(self.models, 1):
            for chars in self._in_groups(sample, order+1):
                prefix = "".join(chars[:-1])
                next_char = chars[-1]
                model[prefix][next_char] += 1

    def generate(self):
        """Generate a string similar to samples previously fed in."""

        start = weighted_lottery(self.starts)
        desired_length = weighted_lottery(self.start_lengths[start])
        desired_length = max(desired_length, self.order)

        generated = []
        generated.extend(start)
        while len(generated) < desired_length:
            # try each model, from highest order down, til we find a prefix
            # we've seen before
            for order, model in reversed(list(enumerate(self.models, 1))):
                current_prefix = "".join(generated[-order:])
                frequencies = model[current_prefix]
                if frequencies:
                    generated.append(weighted_lottery(frequencies))
                    break
            else:
                # no model recognized the prefix; fall back to a random letter
                generated.append(random.choice(string.lowercase))

        return "".join(generated)
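

# Illustrative usage sketch (assumes weighted_lottery picks a key from a
# mapping with probability proportional to its count, as it is used above):
#
#   gen = TextGenerator(order=2)
#   for word in ("banana", "bandana", "cabana"):
#       gen.add_sample(word)
#   print gen.generate()  # e.g. "banana", "cabana", or a similar blend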


def fetch_listing(path, limit=1000, batch_size=100):
    """Fetch a reddit listing from reddit.com."""

    session = requests.Session()
    session.headers.update({
        "User-Agent": "reddit-test-data-generator/1.0",
    })

    base_url = "https://api.reddit.com" + path

    after = None
    count = 0
    while count < limit:
        params = {"limit": batch_size, "count": count}
        if after:
            params["after"] = after

        print "> {}-{}".format(count, count+batch_size)
        response = session.get(base_url, params=params)
        response.raise_for_status()

        listing = get_requests_resp_json(response)["data"]
        for child in listing["children"]:
            yield child["data"]
            count += 1
        after = listing["after"]
        if not after:
            break

        # obey reddit.com's ratelimits
        # see: https://github.com/reddit/reddit/wiki/API#rules
        # (pause length assumed from the 30-requests-per-minute guideline)
        time.sleep(2)
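

# Illustrative usage sketch (fetch_listing is a generator that talks to the
# live reddit.com API, so iterating it makes network requests):
#
#   for link in fetch_listing("/r/pics", limit=200):
#       print link["title"]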


class Modeler(object):
    def __init__(self):
        self.usernames = TextGenerator(order=2)

    def model_subreddit(self, subreddit_name):
        """Return a model of links and comments in a given subreddit."""

        subreddit_path = "/r/{}".format(subreddit_name)
        print ">>>", subreddit_path

        urls = set()
        titles = TextGenerator(order=5)
        selfposts = TextGenerator(order=8)
        link_count = self_count = 0
        for link in fetch_listing(subreddit_path, limit=500):
            self.usernames.add_sample(link["author"])
            titles.add_sample(unescape_htmlentities(link["title"]))
            link_count += 1
            if link["is_self"]:
                self_count += 1
                selfposts.add_sample(unescape_htmlentities(link["selftext"]))
            else:
                urls.add(link["url"])

        self_frequency = self_count / link_count

        comments = TextGenerator(order=8)
        for comment in fetch_listing(subreddit_path + "/comments"):
            self.usernames.add_sample(comment["author"])
            comments.add_sample(unescape_htmlentities(comment["body"]))

        return SubredditModel(
            subreddit_name, titles, selfposts, urls, comments, self_frequency)

    def generate_username(self):
        """Generate and return a username like those seen on reddit.com."""
        return self.usernames.generate()


class SubredditModel(object):
    """A snapshot of a subreddit's links and comments."""

    def __init__(self, name, titles, selfposts, urls, comments,
                 self_frequency):
        self.name = name
        self.titles = titles
        self.selfposts = selfposts
        self.urls = list(urls)
        self.comments = comments
        self.selfpost_frequency = self_frequency

    def generate_link_title(self):
        """Generate and return a title like those seen in the subreddit."""
        return self.titles.generate()

    def generate_link_url(self):
        """Generate and return a URL from one seen in the subreddit.

        The URL returned may be "self" indicating a self post. This should
        happen with the same frequency it is seen in the modeled subreddit.

        """
        if random.random() < self.selfpost_frequency:
            return "self"
        else:
            return random.choice(self.urls)

    def generate_selfpost_body(self):
        """Generate and return a self-post body like seen in the subreddit."""
        return self.selfposts.generate()

    def generate_comment_body(self):
        """Generate and return a comment body like seen in the subreddit."""
        return self.comments.generate()
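

# Illustrative usage sketch (model_subreddit downloads live data from
# reddit.com, so this is slow and network-dependent):
#
#   pics_model = Modeler().model_subreddit("pics")
#   print pics_model.generate_link_title()
#   print pics_model.generate_link_url()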


def fuzz_number(number):
    return int(random.betavariate(2, 8) * 5 * number)
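
# Note: random.betavariate(2, 8) has mean 2 / (2 + 8) = 0.2, so fuzz_number
# averages out to roughly `number` itself while skewing toward smaller values,
# with rare results approaching 5 * number.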


def ensure_account(name):
    """Look up or register an account and return it."""
    try:
        account = Account._by_name(name)
        print ">> found /u/{}".format(name)
        return account
    except NotFound:
        print ">> registering /u/{}".format(name)
        return register(name, "password", "127.0.0.1")


def ensure_subreddit(name, author):
    """Look up or create a subreddit and return it."""
    try:
        sr = Subreddit._by_name(name)
        print ">> found /r/{}".format(name)
        return sr
    except NotFound:
        print ">> creating /r/{}".format(name)
        # creation arguments besides title and author_id are assumed; adjust
        # to match the local Subreddit._new signature
        sr = Subreddit._new(
            name=name,
            title="/r/{}".format(name),
            author_id=author._id,
            ip="127.0.0.1",
        )
        sr._commit()
        return sr


def inject_test_data(num_links=25, num_comments=25, num_votes=5):
    """Flood your reddit install with test data based on reddit.com."""

    print ">>>> Ensuring configured objects exist"
    system_user = ensure_account(g.system_user)
    ensure_account(g.automoderator_account)
    ensure_subreddit(g.default_sr, system_user)
    ensure_subreddit(g.takedown_sr, system_user)
    ensure_subreddit(g.beta_sr, system_user)
    ensure_subreddit(g.promo_sr_name, system_user)

    print ">>>> Fetching real data from reddit.com"
    modeler = Modeler()
    subreddits = [
        modeler.model_subreddit("pics"),
        modeler.model_subreddit("videos"),
        modeler.model_subreddit("askhistorians"),
    ]

    # per-subreddit overrides applied after creation (the exact settings are
    # not recoverable here; show_media is an assumed example)
    extra_settings = {
        "pics": {
            "show_media": True,
        },
        "videos": {
            "show_media": True,
        },
    }

    print ">>>> Generating test data"
    account_query = Account._query(sort="_date", limit=500, data=True)
    accounts = [a for a in account_query if a.name != g.system_user]
    # top the pool up to 50 accounts with generated usernames
    accounts.extend(
        ensure_account(modeler.generate_username())
        for i in xrange(50 - len(accounts)))

    things = []
    for sr_model in subreddits:
        sr_author = random.choice(accounts)
        sr = ensure_subreddit(sr_model.name, sr_author)

        # make the system user subscribed for easier testing
        if sr.add_subscriber(system_user):
            sr._incr("_ups", 1)

        # apply any custom config we need for this sr
        for setting, value in extra_settings.get(sr.name, {}).iteritems():
            setattr(sr, setting, value)
        sr._commit()

        for i in xrange(num_links):
            link_author = random.choice(accounts)
            url = sr_model.generate_link_url()
            is_self = (url == "self")
            content = sr_model.generate_selfpost_body() if is_self else url

            # submit the link (keyword arguments other than title are assumed
            # from Link._submit's usual signature)
            link = Link._submit(
                is_self=is_self,
                title=sr_model.generate_link_title(),
                content=content,
                account=link_author,
                sr=sr,
                ip="127.0.0.1",
            )
            queries.new_link(link)
            things.append(link)

            # seed with None so some comments end up as top-level replies
            comments = [None]
            for i in xrange(fuzz_number(num_comments)):
                comment_author = random.choice(accounts)
                comment, inbox_rel = Comment._new(
                    comment_author,
                    link,
                    parent=random.choice(comments),
                    body=sr_model.generate_comment_body(),
                    ip="127.0.0.1",
                )
                queries.new_comment(comment, inbox_rel)
                comments.append(comment)
                things.append(comment)

    for thing in things:
        for i in xrange(fuzz_number(num_votes)):
            direction = random.choice([
                Vote.DIRECTIONS.up,
                Vote.DIRECTIONS.unvote,
                Vote.DIRECTIONS.down,
            ])
            voter = random.choice(accounts)

            cast_vote(voter, thing, direction)

    # let the in-process amqp worker drain queued jobs before the final setup
    amqp.worker.join()

    srs = [Subreddit._by_name(n) for n in ("pics", "videos", "askhistorians")]
    LocalizedDefaultSubreddits.set_global_srs(srs)