scripts/inject_test_data.py

   1 # The contents of this file are subject to the Common Public Attribution
   2 # License Version 1.0. (the "License"); you may not use this file except in
   3 # compliance with the License. You may obtain a copy of the License at
   4 # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
   5 # License Version 1.1, but Sections 14 and 15 have been added to cover use of
   6 # software over a computer network and provide for limited attribution for the
   7 # Original Developer. In addition, Exhibit A has been modified to be consistent
   8 # with Exhibit B.
   9 #
  10 # Software distributed under the License is distributed on an "AS IS" basis,
  11 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
  12 # the specific language governing rights and limitations under the License.
  13 #
  14 # The Original Code is reddit.
  15 #
  16 # The Original Developer is the Initial Developer.  The Initial Developer of
  17 # the Original Code is reddit Inc.
  18 #
  19 # All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
  20 # Inc. All Rights Reserved.
  21 ###############################################################################
  22
  23 from __future__ import division
  24
  25 import collections
  26 import HTMLParser
  27 import itertools
  28 import random
  29 import string
  30 import time
  31
  32 import requests
  33
  34 from pylons import app_globals as g
  35
  36 from r2.lib.db import queries
  37 from r2.lib import amqp
  38 from r2.lib.utils import weighted_lottery, get_requests_resp_json
  39 from r2.lib.voting import cast_vote
  40 from r2.models import (
  41     Account,
  42     Comment,
  43     Link,
  44     LocalizedDefaultSubreddits,
  45     LocalizedFeaturedSubreddits,
  46     NotFound,
  47     register,
  48     Subreddit,
  49     Vote,
  50 )
  51
  52
  53 unescape_htmlentities = HTMLParser.HTMLParser().unescape
  54
  55
  56 class TextGenerator(object):
  57     """A Markov Chain based text mimicker."""
  58
  59     def __init__(self, order=8):
  60         self.order = order
  61         self.starts = collections.Counter()
  62         self.start_lengths = collections.defaultdict(collections.Counter)
  63         self.models = [
  64             collections.defaultdict(collections.Counter)
  65             for i in xrange(self.order)]
  66
  67     @staticmethod
  68     def _in_groups(input_iterable, n):
  69         iterables = itertools.tee(input_iterable, n)
  70         for offset, iterable in enumerate(iterables):
  71             for _ in xrange(offset):
  72                 next(iterable, None)
  73         return itertools.izip(*iterables)
  74
  75     def add_sample(self, sample):
  76         """Add a sample to the model of text for this generator."""
  77
  78         if len(sample) <= self.order:
  79             return
  80
  81         start = sample[:self.order]
  82         self.starts[start] += 1
  83         self.start_lengths[start][len(sample)] += 1
  84         for order, model in enumerate(self.models, 1):
  85             for chars in self._in_groups(sample, order+1):
  86                 prefix = "".join(chars[:-1])
  87                 next_char = chars[-1]
  88                 model[prefix][next_char] += 1
  89
  90     def generate(self):
  91         """Generate a string similar to samples previously fed in."""
  92
  93         start = weighted_lottery(self.starts)
  94         desired_length = weighted_lottery(self.start_lengths[start])
  95         desired_length = max(desired_length, self.order)
  96
  97         generated = []
  98         generated.extend(start)
  99         while len(generated) < desired_length:
 100             # try each model, from highest order down, til we find
 101             # something
 102             for order, model in reversed(list(enumerate(self.models, 1))):
 103                 current_prefix = "".join(generated[-order:])
 104                 frequencies = model[current_prefix]
 105                 if frequencies:
 106                     generated.append(weighted_lottery(frequencies))
 107                     break
 108             else:
 109                 generated.append(random.choice(string.lowercase))
 110
 111         return "".join(generated)
 112
 113
 114 def fetch_listing(path, limit=1000, batch_size=100):
 115     """Fetch a reddit listing from reddit.com."""
 116
 117     session = requests.Session()
 118     session.headers.update({
 119         "User-Agent": "reddit-test-data-generator/1.0",
 120     })
 121
 122     base_url = "https://api.reddit.com" + path
 123
 124     after = None
 125     count = 0
 126     while count < limit:
 127         params = {"limit": batch_size, "count": count}
 128         if after:
 129             params["after"] = after
 130
 131         print "> {}-{}".format(count, count+batch_size)
 132         response = session.get(base_url, params=params)
 133         response.raise_for_status()
 134
 135         listing = get_requests_resp_json(response)["data"]
 136         for child in listing["children"]:
 137             yield child["data"]
 138             count += 1
 139
 140         after = listing["after"]
 141         if not after:
 142             break
 143
 144         # obey reddit.com's ratelimits
 145         # see: https://github.com/reddit/reddit/wiki/API#rules
 146         time.sleep(2)
 147
 148
 149 class Modeler(object):
 150     def __init__(self):
 151         self.usernames = TextGenerator(order=2)
 152
 153     def model_subreddit(self, subreddit_name):
 154         """Return a model of links and comments in a given subreddit."""
 155
 156         subreddit_path = "/r/{}".format(subreddit_name)
 157         print ">>>", subreddit_path
 158
 159         print ">> Links"
 160         titles = TextGenerator(order=5)
 161         selfposts = TextGenerator(order=8)
 162         link_count = self_count = 0
 163         urls = set()
 164         for link in fetch_listing(subreddit_path, limit=500):
 165             self.usernames.add_sample(link["author"])
 166             titles.add_sample(unescape_htmlentities(link["title"]))
 167             if link["is_self"]:
 168                 self_count += 1
 169                 selfposts.add_sample(unescape_htmlentities(link["selftext"]))
 170             else:
 171                 urls.add(link["url"])
 172             link_count += 1
 173         self_frequency = self_count / link_count
 174
 175         print ">> Comments"
 176         comments = TextGenerator(order=8)
 177         for comment in fetch_listing(subreddit_path + "/comments"):
 178             self.usernames.add_sample(comment["author"])
 179             comments.add_sample(unescape_htmlentities(comment["body"]))
 180
 181         return SubredditModel(
 182             subreddit_name, titles, selfposts, urls, comments, self_frequency)
 183
 184     def generate_username(self):
 185         """Generate and return a username like those seen on reddit.com."""
 186         return self.usernames.generate()
 187
 188
 189 class SubredditModel(object):
 190     """A snapshot of a subreddit's links and comments."""
 191
 192     def __init__(self, name, titles, selfposts, urls, comments, self_frequency):
 193         self.name = name
 194         self.titles = titles
 195         self.selfposts = selfposts
 196         self.urls = list(urls)
 197         self.comments = comments
 198         self.selfpost_frequency = self_frequency
 199
 200     def generate_link_title(self):
 201         """Generate and return a title like those seen in the subreddit."""
 202         return self.titles.generate()
 203
 204     def generate_link_url(self):
 205         """Generate and return a URL from one seen in the subreddit.
 206
 207         The URL returned may be "self" indicating a self post. This should
 208         happen with the same frequency it is seen in the modeled subreddit.
 209
 210         """
 211         if random.random() < self.selfpost_frequency:
 212             return "self"
 213         else:
 214             return random.choice(self.urls)
 215
 216     def generate_selfpost_body(self):
 217         """Generate and return a self-post body like seen in the subreddit."""
 218         return self.selfposts.generate()
 219
 220     def generate_comment_body(self):
 221         """Generate and return a comment body like seen in the subreddit."""
 222         return self.comments.generate()
 223
 224
 225 def fuzz_number(number):
 226     return int(random.betavariate(2, 8) * 5 * number)
 227
 228
 229 def ensure_account(name):
 230     """Look up or register an account and return it."""
 231     try:
 232         account = Account._by_name(name)
 233         print ">> found /u/{}".format(name)
 234         return account
 235     except NotFound:
 236         print ">> registering /u/{}".format(name)
 237         return register(name, "password", "127.0.0.1")
 238
 239
 240 def ensure_subreddit(name, author):
 241     """Look up or create a subreddit and return it."""
 242     try:
 243         sr = Subreddit._by_name(name)
 244         print ">> found /r/{}".format(name)
 245         return sr
 246     except NotFound:
 247         print ">> creating /r/{}".format(name)
 248         sr = Subreddit._new(
 249             name=name,
 250             title="/r/{}".format(name),
 251             author_id=author._id,
 252             lang="en",
 253             ip="127.0.0.1",
 254         )
 255         sr._commit()
 256         return sr
 257
 258
 259 def inject_test_data(num_links=25, num_comments=25, num_votes=5):
 260     """Flood your reddit install with test data based on reddit.com."""
 261
 262     print ">>>> Ensuring configured objects exist"
 263     system_user = ensure_account(g.system_user)
 264     ensure_account(g.automoderator_account)
 265     ensure_subreddit(g.default_sr, system_user)
 266     ensure_subreddit(g.takedown_sr, system_user)
 267     ensure_subreddit(g.beta_sr, system_user)
 268     ensure_subreddit(g.promo_sr_name, system_user)
 269
 270     print
 271     print
 272
 273     print ">>>> Fetching real data from reddit.com"
 274     modeler = Modeler()
 275     subreddits = [
 276         modeler.model_subreddit("pics"),
 277         modeler.model_subreddit("videos"),
 278         modeler.model_subreddit("askhistorians"),
 279     ]
 280     extra_settings = {
 281         "pics": {
 282             "show_media": True,
 283         },
 284         "videos": {
 285             "show_media": True,
 286         },
 287     }
 288
 289     print
 290     print
 291
 292     print ">>>> Generating test data"
 293     print ">>> Accounts"
 294     account_query = Account._query(sort="_date", limit=500, data=True)
 295     accounts = [a for a in account_query if a.name != g.system_user]
 296     accounts.extend(
 297         ensure_account(modeler.generate_username())
 298         for i in xrange(50 - len(accounts)))
 299
 300     print ">>> Content"
 301     things = []
 302     for sr_model in subreddits:
 303         sr_author = random.choice(accounts)
 304         sr = ensure_subreddit(sr_model.name, sr_author)
 305
 306         # make the system user subscribed for easier testing
 307         if sr.add_subscriber(system_user):
 308             sr._incr("_ups", 1)
 309
 310         # apply any custom config we need for this sr
 311         for setting, value in extra_settings.get(sr.name, {}).iteritems():
 312             setattr(sr, setting, value)
 313         sr._commit()
 314
 315         for i in xrange(num_links):
 316             link_author = random.choice(accounts)
 317             url = sr_model.generate_link_url()
 318             is_self = (url == "self")
 319             content = sr_model.generate_selfpost_body() if is_self else url
 320             link = Link._submit(
 321                 is_self=is_self,
 322                 title=sr_model.generate_link_title(),
 323                 content=content,
 324                 author=link_author,
 325                 sr=sr,
 326                 ip="127.0.0.1",
 327             )
 328             queries.new_link(link)
 329             things.append(link)
 330
 331             comments = [None]
 332             for i in xrange(fuzz_number(num_comments)):
 333                 comment_author = random.choice(accounts)
 334                 comment, inbox_rel = Comment._new(
 335                     comment_author,
 336                     link,
 337                     parent=random.choice(comments),
 338                     body=sr_model.generate_comment_body(),
 339                     ip="127.0.0.1",
 340                 )
 341                 queries.new_comment(comment, inbox_rel)
 342                 comments.append(comment)
 343                 things.append(comment)
 344
 345     for thing in things:
 346         for i in xrange(fuzz_number(num_votes)):
 347             direction = random.choice([
 348                 Vote.DIRECTIONS.up,
 349                 Vote.DIRECTIONS.unvote,
 350                 Vote.DIRECTIONS.down,
 351             ])
 352             voter = random.choice(accounts)
 353
 354             cast_vote(voter, thing, direction)
 355
 356     amqp.worker.join()
 357
 358     srs = [Subreddit._by_name(n) for n in ("pics", "videos", "askhistorians")]
 359     LocalizedDefaultSubreddits.set_global_srs(srs)
 360     LocalizedFeaturedSubreddits.set_global_srs([Subreddit._by_name('pics')])