1 # The contents of this file are subject to the Common Public Attribution
2 # License Version 1.0. (the "License"); you may not use this file except in
3 # compliance with the License. You may obtain a copy of the License at
4 # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
5 # License Version 1.1, but Sections 14 and 15 have been added to cover use of
6 # software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
10 # Software distributed under the License is distributed on an "AS IS" basis,
11 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
12 # the specific language governing rights and limitations under the License.
14 # The Original Code is reddit.
16 # The Original Developer is the Initial Developer. The Initial Developer of
17 # the Original Code is reddit Inc.
19 # All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
20 # Inc. All Rights Reserved.
21 ###############################################################################
from __future__ import division

import collections
import HTMLParser
import itertools
import random
import string
import time

import requests

from pylons import app_globals as g

from r2.lib import amqp
from r2.lib.db import queries
from r2.lib.utils import weighted_lottery, get_requests_resp_json
from r2.lib.voting import cast_vote
from r2.models import (
    Account,
    Comment,
    Link,
    LocalizedDefaultSubreddits,
    LocalizedFeaturedSubreddits,
    NotFound,
    register,
    Subreddit,
    Vote,
)
# reddit.com's API returns entity-escaped text (e.g. "&amp;amp;"); this bound
# method turns the entities back into plain characters before modeling.
unescape_htmlentities = HTMLParser.HTMLParser().unescape
56 class TextGenerator(object):
57 """A Markov Chain based text mimicker."""
59 def __init__(self
, order
=8):
61 self
.starts
= collections
.Counter()
62 self
.start_lengths
= collections
.defaultdict(collections
.Counter
)
64 collections
.defaultdict(collections
.Counter
)
65 for i
in xrange(self
.order
)]
68 def _in_groups(input_iterable
, n
):
69 iterables
= itertools
.tee(input_iterable
, n
)
70 for offset
, iterable
in enumerate(iterables
):
71 for _
in xrange(offset
):
73 return itertools
.izip(*iterables
)
75 def add_sample(self
, sample
):
76 """Add a sample to the model of text for this generator."""
78 if len(sample
) <= self
.order
:
81 start
= sample
[:self
.order
]
82 self
.starts
[start
] += 1
83 self
.start_lengths
[start
][len(sample
)] += 1
84 for order
, model
in enumerate(self
.models
, 1):
85 for chars
in self
._in
_groups
(sample
, order
+1):
86 prefix
= "".join(chars
[:-1])
88 model
[prefix
][next_char
] += 1
91 """Generate a string similar to samples previously fed in."""
93 start
= weighted_lottery(self
.starts
)
94 desired_length
= weighted_lottery(self
.start_lengths
[start
])
95 desired_length
= max(desired_length
, self
.order
)
98 generated
.extend(start
)
99 while len(generated
) < desired_length
:
100 # try each model, from highest order down, til we find
102 for order
, model
in reversed(list(enumerate(self
.models
, 1))):
103 current_prefix
= "".join(generated
[-order
:])
104 frequencies
= model
[current_prefix
]
106 generated
.append(weighted_lottery(frequencies
))
109 generated
.append(random
.choice(string
.lowercase
))
111 return "".join(generated
)
114 def fetch_listing(path
, limit
=1000, batch_size
=100):
115 """Fetch a reddit listing from reddit.com."""
117 session
= requests
.Session()
118 session
.headers
.update({
119 "User-Agent": "reddit-test-data-generator/1.0",
122 base_url
= "https://api.reddit.com" + path
127 params
= {"limit": batch_size
, "count": count
}
129 params
["after"] = after
131 print "> {}-{}".format(count
, count
+batch_size
)
132 response
= session
.get(base_url
, params
=params
)
133 response
.raise_for_status()
135 listing
= get_requests_resp_json(response
)["data"]
136 for child
in listing
["children"]:
140 after
= listing
["after"]
144 # obey reddit.com's ratelimits
145 # see: https://github.com/reddit/reddit/wiki/API#rules
149 class Modeler(object):
151 self
.usernames
= TextGenerator(order
=2)
153 def model_subreddit(self
, subreddit_name
):
154 """Return a model of links and comments in a given subreddit."""
156 subreddit_path
= "/r/{}".format(subreddit_name
)
157 print ">>>", subreddit_path
160 titles
= TextGenerator(order
=5)
161 selfposts
= TextGenerator(order
=8)
162 link_count
= self_count
= 0
164 for link
in fetch_listing(subreddit_path
, limit
=500):
165 self
.usernames
.add_sample(link
["author"])
166 titles
.add_sample(unescape_htmlentities(link
["title"]))
169 selfposts
.add_sample(unescape_htmlentities(link
["selftext"]))
171 urls
.add(link
["url"])
173 self_frequency
= self_count
/ link_count
176 comments
= TextGenerator(order
=8)
177 for comment
in fetch_listing(subreddit_path
+ "/comments"):
178 self
.usernames
.add_sample(comment
["author"])
179 comments
.add_sample(unescape_htmlentities(comment
["body"]))
181 return SubredditModel(
182 subreddit_name
, titles
, selfposts
, urls
, comments
, self_frequency
)
184 def generate_username(self
):
185 """Generate and return a username like those seen on reddit.com."""
186 return self
.usernames
.generate()
class SubredditModel(object):
    """A snapshot of a subreddit's links and comments."""

    def __init__(self, name, titles, selfposts, urls, comments,
                 self_frequency):
        self.name = name
        self.titles = titles
        self.selfposts = selfposts
        # materialize so random.choice can index into it
        self.urls = list(urls)
        self.comments = comments
        self.selfpost_frequency = self_frequency

    def generate_link_title(self):
        """Generate and return a title like those seen in the subreddit."""
        return self.titles.generate()

    def generate_link_url(self):
        """Generate and return a URL from one seen in the subreddit.

        The URL returned may be "self" indicating a self post. This should
        happen with the same frequency it is seen in the modeled subreddit.

        """
        if random.random() < self.selfpost_frequency:
            return "self"
        return random.choice(self.urls)

    def generate_selfpost_body(self):
        """Generate and return a self-post body like seen in the subreddit."""
        return self.selfposts.generate()

    def generate_comment_body(self):
        """Generate and return a comment body like seen in the subreddit."""
        return self.comments.generate()
def fuzz_number(number):
    """Return a randomized scaling of *number*.

    Draws from a Beta(2, 8) distribution (skewed toward small values) and
    scales by 5, so results usually land below *number* but can reach up
    to almost 5x it.
    """
    scale = random.betavariate(2, 8) * 5
    return int(scale * number)
229 def ensure_account(name
):
230 """Look up or register an account and return it."""
232 account
= Account
._by
_name
(name
)
233 print ">> found /u/{}".format(name
)
236 print ">> registering /u/{}".format(name
)
237 return register(name
, "password", "127.0.0.1")
240 def ensure_subreddit(name
, author
):
241 """Look up or create a subreddit and return it."""
243 sr
= Subreddit
._by
_name
(name
)
244 print ">> found /r/{}".format(name
)
247 print ">> creating /r/{}".format(name
)
250 title
="/r/{}".format(name
),
251 author_id
=author
._id
,
259 def inject_test_data(num_links
=25, num_comments
=25, num_votes
=5):
260 """Flood your reddit install with test data based on reddit.com."""
262 print ">>>> Ensuring configured objects exist"
263 system_user
= ensure_account(g
.system_user
)
264 ensure_account(g
.automoderator_account
)
265 ensure_subreddit(g
.default_sr
, system_user
)
266 ensure_subreddit(g
.takedown_sr
, system_user
)
267 ensure_subreddit(g
.beta_sr
, system_user
)
268 ensure_subreddit(g
.promo_sr_name
, system_user
)
273 print ">>>> Fetching real data from reddit.com"
276 modeler
.model_subreddit("pics"),
277 modeler
.model_subreddit("videos"),
278 modeler
.model_subreddit("askhistorians"),
292 print ">>>> Generating test data"
294 account_query
= Account
._query
(sort
="_date", limit
=500, data
=True)
295 accounts
= [a
for a
in account_query
if a
.name
!= g
.system_user
]
297 ensure_account(modeler
.generate_username())
298 for i
in xrange(50 - len(accounts
)))
302 for sr_model
in subreddits
:
303 sr_author
= random
.choice(accounts
)
304 sr
= ensure_subreddit(sr_model
.name
, sr_author
)
306 # make the system user subscribed for easier testing
307 if sr
.add_subscriber(system_user
):
310 # apply any custom config we need for this sr
311 for setting
, value
in extra_settings
.get(sr
.name
, {}).iteritems():
312 setattr(sr
, setting
, value
)
315 for i
in xrange(num_links
):
316 link_author
= random
.choice(accounts
)
317 url
= sr_model
.generate_link_url()
318 is_self
= (url
== "self")
319 content
= sr_model
.generate_selfpost_body() if is_self
else url
322 title
=sr_model
.generate_link_title(),
328 queries
.new_link(link
)
332 for i
in xrange(fuzz_number(num_comments
)):
333 comment_author
= random
.choice(accounts
)
334 comment
, inbox_rel
= Comment
._new
(
337 parent
=random
.choice(comments
),
338 body
=sr_model
.generate_comment_body(),
341 queries
.new_comment(comment
, inbox_rel
)
342 comments
.append(comment
)
343 things
.append(comment
)
346 for i
in xrange(fuzz_number(num_votes
)):
347 direction
= random
.choice([
349 Vote
.DIRECTIONS
.unvote
,
350 Vote
.DIRECTIONS
.down
,
352 voter
= random
.choice(accounts
)
354 cast_vote(voter
, thing
, direction
)
358 srs
= [Subreddit
._by
_name
(n
) for n
in ("pics", "videos", "askhistorians")]
359 LocalizedDefaultSubreddits
.set_global_srs(srs
)
360 LocalizedFeaturedSubreddits
.set_global_srs([Subreddit
._by
_name
('pics')])