scripts/inject_test_data.py

   1 # The contents of this file are subject to the Common Public Attribution
   2 # License Version 1.0. (the "License"); you may not use this file except in
   3 # compliance with the License. You may obtain a copy of the License at
   4 # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
   5 # License Version 1.1, but Sections 14 and 15 have been added to cover use of
   6 # software over a computer network and provide for limited attribution for the
   7 # Original Developer. In addition, Exhibit A has been modified to be consistent
   8 # with Exhibit B.
   9 #
  10 # Software distributed under the License is distributed on an "AS IS" basis,
  11 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
  12 # the specific language governing rights and limitations under the License.
  13 #
  14 # The Original Code is reddit.
  15 #
  16 # The Original Developer is the Initial Developer.  The Initial Developer of
  17 # the Original Code is reddit Inc.
  18 #
  19 # All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
  20 # Inc. All Rights Reserved.
  21 ###############################################################################
  22
  23 from __future__ import division
  24
  25 import collections
  26 import HTMLParser
  27 import itertools
  28 import random
  29 import string
  30 import time
  31
  32 import requests
  33
  34 from pylons import app_globals as g
  35
  36 from r2.lib.db import queries
  37 from r2.lib import amqp
  38 from r2.lib.utils import weighted_lottery, get_requests_resp_json
  39 from r2.lib.voting import cast_vote
  40 from r2.models import (
  41     Account,
  42     Comment,
  43     Link,
  44     LocalizedDefaultSubreddits,
  45     NotFound,
  46     register,
  47     Subreddit,
  48     Vote,
  49 )
  50
  51
# Bound ``unescape`` method of one shared HTMLParser instance; used below to
# turn HTML entities in fetched titles and bodies back into plain characters.
unescape_htmlentities = HTMLParser.HTMLParser().unescape
  53
  54
  55 class TextGenerator(object):
  56     """A Markov Chain based text mimicker."""
  57
  58     def __init__(self, order=8):
  59         self.order = order
  60         self.starts = collections.Counter()
  61         self.start_lengths = collections.defaultdict(collections.Counter)
  62         self.models = [
  63             collections.defaultdict(collections.Counter)
  64             for i in xrange(self.order)]
  65
  66     @staticmethod
  67     def _in_groups(input_iterable, n):
  68         iterables = itertools.tee(input_iterable, n)
  69         for offset, iterable in enumerate(iterables):
  70             for _ in xrange(offset):
  71                 next(iterable, None)
  72         return itertools.izip(*iterables)
  73
  74     def add_sample(self, sample):
  75         """Add a sample to the model of text for this generator."""
  76
  77         if len(sample) <= self.order:
  78             return
  79
  80         start = sample[:self.order]
  81         self.starts[start] += 1
  82         self.start_lengths[start][len(sample)] += 1
  83         for order, model in enumerate(self.models, 1):
  84             for chars in self._in_groups(sample, order+1):
  85                 prefix = "".join(chars[:-1])
  86                 next_char = chars[-1]
  87                 model[prefix][next_char] += 1
  88
  89     def generate(self):
  90         """Generate a string similar to samples previously fed in."""
  91
  92         start = weighted_lottery(self.starts)
  93         desired_length = weighted_lottery(self.start_lengths[start])
  94         desired_length = max(desired_length, self.order)
  95
  96         generated = []
  97         generated.extend(start)
  98         while len(generated) < desired_length:
  99             # try each model, from highest order down, til we find
 100             # something
 101             for order, model in reversed(list(enumerate(self.models, 1))):
 102                 current_prefix = "".join(generated[-order:])
 103                 frequencies = model[current_prefix]
 104                 if frequencies:
 105                     generated.append(weighted_lottery(frequencies))
 106                     break
 107             else:
 108                 generated.append(random.choice(string.lowercase))
 109
 110         return "".join(generated)
 111
 112
 113 def fetch_listing(path, limit=1000, batch_size=100):
 114     """Fetch a reddit listing from reddit.com."""
 115
 116     session = requests.Session()
 117     session.headers.update({
 118         "User-Agent": "reddit-test-data-generator/1.0",
 119     })
 120
 121     base_url = "https://api.reddit.com" + path
 122
 123     after = None
 124     count = 0
 125     while count < limit:
 126         params = {"limit": batch_size, "count": count}
 127         if after:
 128             params["after"] = after
 129
 130         print "> {}-{}".format(count, count+batch_size)
 131         response = session.get(base_url, params=params)
 132         response.raise_for_status()
 133
 134         listing = get_requests_resp_json(response)["data"]
 135         for child in listing["children"]:
 136             yield child["data"]
 137             count += 1
 138
 139         after = listing["after"]
 140         if not after:
 141             break
 142
 143         # obey reddit.com's ratelimits
 144         # see: https://github.com/reddit/reddit/wiki/API#rules
 145         time.sleep(2)
 146
 147
 148 class Modeler(object):
 149     def __init__(self):
 150         self.usernames = TextGenerator(order=2)
 151
 152     def model_subreddit(self, subreddit_name):
 153         """Return a model of links and comments in a given subreddit."""
 154
 155         subreddit_path = "/r/{}".format(subreddit_name)
 156         print ">>>", subreddit_path
 157
 158         print ">> Links"
 159         titles = TextGenerator(order=5)
 160         selfposts = TextGenerator(order=8)
 161         link_count = self_count = 0
 162         urls = set()
 163         for link in fetch_listing(subreddit_path, limit=500):
 164             self.usernames.add_sample(link["author"])
 165             titles.add_sample(unescape_htmlentities(link["title"]))
 166             if link["is_self"]:
 167                 self_count += 1
 168                 selfposts.add_sample(unescape_htmlentities(link["selftext"]))
 169             else:
 170                 urls.add(link["url"])
 171             link_count += 1
 172         self_frequency = self_count / link_count
 173
 174         print ">> Comments"
 175         comments = TextGenerator(order=8)
 176         for comment in fetch_listing(subreddit_path + "/comments"):
 177             self.usernames.add_sample(comment["author"])
 178             comments.add_sample(unescape_htmlentities(comment["body"]))
 179
 180         return SubredditModel(
 181             subreddit_name, titles, selfposts, urls, comments, self_frequency)
 182
 183     def generate_username(self):
 184         """Generate and return a username like those seen on reddit.com."""
 185         return self.usernames.generate()
 186
 187
 188 class SubredditModel(object):
 189     """A snapshot of a subreddit's links and comments."""
 190
 191     def __init__(self, name, titles, selfposts, urls, comments, self_frequency):
 192         self.name = name
 193         self.titles = titles
 194         self.selfposts = selfposts
 195         self.urls = list(urls)
 196         self.comments = comments
 197         self.selfpost_frequency = self_frequency
 198
 199     def generate_link_title(self):
 200         """Generate and return a title like those seen in the subreddit."""
 201         return self.titles.generate()
 202
 203     def generate_link_url(self):
 204         """Generate and return a URL from one seen in the subreddit.
 205
 206         The URL returned may be "self" indicating a self post. This should
 207         happen with the same frequency it is seen in the modeled subreddit.
 208
 209         """
 210         if random.random() < self.selfpost_frequency:
 211             return "self"
 212         else:
 213             return random.choice(self.urls)
 214
 215     def generate_selfpost_body(self):
 216         """Generate and return a self-post body like seen in the subreddit."""
 217         return self.selfposts.generate()
 218
 219     def generate_comment_body(self):
 220         """Generate and return a comment body like seen in the subreddit."""
 221         return self.comments.generate()
 222
 223
 224 def fuzz_number(number):
 225     return int(random.betavariate(2, 8) * 5 * number)
 226
 227
 228 def ensure_account(name):
 229     """Look up or register an account and return it."""
 230     try:
 231         account = Account._by_name(name)
 232         print ">> found /u/{}".format(name)
 233         return account
 234     except NotFound:
 235         print ">> registering /u/{}".format(name)
 236         return register(name, "password", "127.0.0.1")
 237
 238
 239 def ensure_subreddit(name, author):
 240     """Look up or create a subreddit and return it."""
 241     try:
 242         sr = Subreddit._by_name(name)
 243         print ">> found /r/{}".format(name)
 244         return sr
 245     except NotFound:
 246         print ">> creating /r/{}".format(name)
 247         sr = Subreddit._new(
 248             name=name,
 249             title="/r/{}".format(name),
 250             author_id=author._id,
 251             lang="en",
 252             ip="127.0.0.1",
 253         )
 254         sr._commit()
 255         return sr
 256
 257
 258 def inject_test_data(num_links=25, num_comments=25, num_votes=5):
 259     """Flood your reddit install with test data based on reddit.com."""
 260
 261     print ">>>> Ensuring configured objects exist"
 262     system_user = ensure_account(g.system_user)
 263     ensure_account(g.automoderator_account)
 264     ensure_subreddit(g.default_sr, system_user)
 265     ensure_subreddit(g.takedown_sr, system_user)
 266     ensure_subreddit(g.beta_sr, system_user)
 267     ensure_subreddit(g.promo_sr_name, system_user)
 268
 269     print
 270     print
 271
 272     print ">>>> Fetching real data from reddit.com"
 273     modeler = Modeler()
 274     subreddits = [
 275         modeler.model_subreddit("pics"),
 276         modeler.model_subreddit("videos"),
 277         modeler.model_subreddit("askhistorians"),
 278     ]
 279     extra_settings = {
 280         "pics": {
 281             "show_media": True,
 282         },
 283         "videos": {
 284             "show_media": True,
 285         },
 286     }
 287
 288     print
 289     print
 290
 291     print ">>>> Generating test data"
 292     print ">>> Accounts"
 293     account_query = Account._query(sort="_date", limit=500, data=True)
 294     accounts = [a for a in account_query if a.name != g.system_user]
 295     accounts.extend(
 296         ensure_account(modeler.generate_username())
 297         for i in xrange(50 - len(accounts)))
 298
 299     print ">>> Content"
 300     things = []
 301     for sr_model in subreddits:
 302         sr_author = random.choice(accounts)
 303         sr = ensure_subreddit(sr_model.name, sr_author)
 304
 305         # make the system user subscribed for easier testing
 306         if sr.add_subscriber(system_user):
 307             sr._incr("_ups", 1)
 308
 309         # apply any custom config we need for this sr
 310         for setting, value in extra_settings.get(sr.name, {}).iteritems():
 311             setattr(sr, setting, value)
 312         sr._commit()
 313
 314         for i in xrange(num_links):
 315             link_author = random.choice(accounts)
 316             url = sr_model.generate_link_url()
 317             is_self = (url == "self")
 318             content = sr_model.generate_selfpost_body() if is_self else url
 319             link = Link._submit(
 320                 is_self=is_self,
 321                 title=sr_model.generate_link_title(),
 322                 content=content,
 323                 author=link_author,
 324                 sr=sr,
 325                 ip="127.0.0.1",
 326             )
 327             queries.new_link(link)
 328             things.append(link)
 329
 330             comments = [None]
 331             for i in xrange(fuzz_number(num_comments)):
 332                 comment_author = random.choice(accounts)
 333                 comment, inbox_rel = Comment._new(
 334                     comment_author,
 335                     link,
 336                     parent=random.choice(comments),
 337                     body=sr_model.generate_comment_body(),
 338                     ip="127.0.0.1",
 339                 )
 340                 queries.new_comment(comment, inbox_rel)
 341                 comments.append(comment)
 342                 things.append(comment)
 343
 344     for thing in things:
 345         for i in xrange(fuzz_number(num_votes)):
 346             direction = random.choice([
 347                 Vote.DIRECTIONS.up,
 348                 Vote.DIRECTIONS.unvote,
 349                 Vote.DIRECTIONS.down,
 350             ])
 351             voter = random.choice(accounts)
 352
 353             cast_vote(voter, thing, direction)
 354
 355     amqp.worker.join()
 356
 357     srs = [Subreddit._by_name(n) for n in ("pics", "videos", "askhistorians")]
 358     LocalizedDefaultSubreddits.set_global_srs(srs)