src/applications/repository/graphcache/PhabricatorRepositoryGraphCache.php

   1 <?php
   2
   3 /**
   4  * Given a commit and a path, efficiently determine the most recent ancestor
   5  * commit where the path was touched.
   6  *
   7  * In Git and Mercurial, log operations with a path are relatively slow. For
   8  * example:
   9  *
  10  *    git log -n1 <commit> -- <path>
  11  *
  12  * ...routinely takes several hundred milliseconds, and equivalent requests
  13  * often take longer in Mercurial.
  14  *
  15  * Unfortunately, this operation is fundamental to rendering a repository for
  16  * the web, and essentially everything else that's slow can be reduced to this
  17  * plus some trivial work afterward. Making this fast is desirable and powerful,
  18  * and allows us to make other things fast by expressing them in terms of this
  19  * query.
  20  *
  21  * Because the query is fundamentally a graph query, it isn't easy to express
  22  * in a reasonable way in MySQL, and we can't do round trips to the server to
  23  * walk the graph without incurring huge performance penalties.
  24  *
  25  * However, the total amount of data in the graph is relatively small. By
  26  * caching it in chunks and keeping it in APC, we can reasonably load and walk
  27  * the graph in PHP quickly.
  28  *
  29  * For more context, see T2683.
  30  *
  31  * Structure of the Cache
  32  * ======================
  33  *
  34  * The cache divides commits into buckets (see @{method:getBucketSize}). To
  35  * walk the graph, we pull a commit's bucket. The bucket is a map from commit
  36  * IDs to a list of parents and changed paths, separated by `null`. For
  37  * example, a bucket might look like this:
  38  *
  39  *   array(
  40  *     1 => array(0, null, 17, 18),
  41  *     2 => array(1, null, 4),
  42  *     // ...
  43  *   )
  44  *
  45  * This means that commit ID 1 has parent commit 0 (a special value meaning
  46  * no parents) and affected path IDs 17 and 18. Commit ID 2 has parent commit 1,
  47  * and affected path 4.
  48  *
  49  * This data structure attempts to balance compactness, ease of construction,
  50  * simplicity of cache semantics, and lookup performance. In the average case,
  51  * it appears to do a reasonable job at this.
  52  *
  53  * @task query Querying the Graph Cache
  54  * @task cache Cache Internals
  55  */
  56 final class PhabricatorRepositoryGraphCache extends Phobject {
  57
  58   private $rebuiltKeys = array();
  59
  60
  61 /* -(  Querying the Graph Cache  )------------------------------------------- */
  62
  63
  64   /**
  65    * Search the graph cache for the most modification to a path.
  66    *
  67    * @param int     The commit ID to search ancestors of.
  68    * @param int     The path ID to search for changes to.
  69    * @param float   Maximum number of seconds to spend trying to satisfy this
  70    *                query using the graph cache. By default, `0.5` (500ms).
  71    * @return mixed  Commit ID, or `null` if no ancestors exist, or `false` if
  72    *                the graph cache was unable to determine the answer.
  73    * @task query
  74    */
  75   public function loadLastModifiedCommitID($commit_id, $path_id, $time = 0.5) {
  76     $commit_id = (int)$commit_id;
  77     $path_id = (int)$path_id;
  78
  79     $bucket_data = null;
  80     $data_key = null;
  81     $seen = array();
  82
  83     $t_start = microtime(true);
  84     $iterations = 0;
  85     while (true) {
  86       $bucket_key = $this->getBucketKey($commit_id);
  87
  88       if (($data_key != $bucket_key) || $bucket_data === null) {
  89         $bucket_data = $this->getBucketData($bucket_key);
  90         $data_key = $bucket_key;
  91       }
  92
  93       if (empty($bucket_data[$commit_id])) {
  94         // Rebuild the cache bucket, since the commit might be a very recent
  95         // one that we'll pick up by rebuilding.
  96
  97         $bucket_data = $this->getBucketData($bucket_key, $bucket_data);
  98         if (empty($bucket_data[$commit_id])) {
  99           // A rebuild didn't help. This can occur legitimately if the commit
 100           // is new and hasn't parsed yet.
 101           return false;
 102         }
 103
 104         // Otherwise, the rebuild gave us the data, so we can keep going.
 105
 106         $did_fill = true;
 107       } else {
 108         $did_fill = false;
 109       }
 110
 111       // Sanity check so we can survive and recover from bad data.
 112       if (isset($seen[$commit_id])) {
 113         phlog(pht('Unexpected infinite loop in %s!', __CLASS__));
 114         return false;
 115       } else {
 116         $seen[$commit_id] = true;
 117       }
 118
 119       // `$data` is a list: the commit's parent IDs, followed by `null`,
 120       // followed by the modified paths in ascending order. We figure out the
 121       // first parent first, then check if the path was touched. If the path
 122       // was touched, this is the commit we're after. If not, walk backward
 123       // in the tree.
 124
 125       $items = $bucket_data[$commit_id];
 126       $size = count($items);
 127
 128       // Walk past the parent information.
 129       $parent_id = null;
 130       for ($ii = 0;; ++$ii) {
 131         if ($items[$ii] === null) {
 132           break;
 133         }
 134         if ($parent_id === null) {
 135           $parent_id = $items[$ii];
 136         }
 137       }
 138
 139       // Look for a modification to the path.
 140       for (; $ii < $size; ++$ii) {
 141         $item = $items[$ii];
 142         if ($item > $path_id) {
 143           break;
 144         }
 145         if ($item === $path_id) {
 146           return $commit_id;
 147         }
 148       }
 149
 150       if ($parent_id) {
 151         $commit_id = $parent_id;
 152
 153         // Periodically check if we've spent too long looking for a result
 154         // in the cache, and return so we can fall back to a VCS operation.
 155         // This keeps us from having a degenerate worst case if, e.g., the
 156         // cache is cold and we need to inspect a very large number of blocks
 157         // to satisfy the query.
 158
 159         ++$iterations;
 160
 161         // If we performed a cache fill in this cycle, always check the time
 162         // limit, since cache fills may take a significant amount of time.
 163
 164         if ($did_fill || ($iterations % 64 === 0)) {
 165           $t_end = microtime(true);
 166           if (($t_end - $t_start) > $time) {
 167             return false;
 168           }
 169         }
 170         continue;
 171       }
 172
 173       // If we have an explicit 0, that means this commit really has no parents.
 174       // Usually, it is the first commit in the repository.
 175       if ($parent_id === 0) {
 176         return null;
 177       }
 178
 179       // If we didn't find a parent, the parent data isn't available. We fail
 180       // to find an answer in the cache and fall back to querying the VCS.
 181       return false;
 182     }
 183   }
 184
 185
 186 /* -(  Cache Internals  )---------------------------------------------------- */
 187
 188
 189   /**
 190    * Get the bucket key for a given commit ID.
 191    *
 192    * @param   int   Commit ID.
 193    * @return  int   Bucket key.
 194    * @task cache
 195    */
 196   private function getBucketKey($commit_id) {
 197     return (int)floor($commit_id / $this->getBucketSize());
 198   }
 199
 200
 201   /**
 202    * Get the cache key for a given bucket key (from @{method:getBucketKey}).
 203    *
 204    * @param   int     Bucket key.
 205    * @return  string  Cache key.
 206    * @task cache
 207    */
 208   private function getBucketCacheKey($bucket_key) {
 209     static $prefix;
 210
 211     if ($prefix === null) {
 212       $self = get_class($this);
 213       $size = $this->getBucketSize();
 214       $prefix = "{$self}:{$size}:2:";
 215     }
 216
 217     return $prefix.$bucket_key;
 218   }
 219
 220
 221   /**
 222    * Get the number of items per bucket.
 223    *
 224    * @return  int Number of items to store per bucket.
 225    * @task cache
 226    */
 227   private function getBucketSize() {
 228     return 4096;
 229   }
 230
 231
 232   /**
 233    * Retrieve or build a graph cache bucket from the cache.
 234    *
 235    * Normally, this operates as a readthrough cache call. It can also be used
 236    * to force a cache update by passing the existing data to `$rebuild_data`.
 237    *
 238    * @param   int     Bucket key, from @{method:getBucketKey}.
 239    * @param   mixed   Current data, to force a cache rebuild of this bucket.
 240    * @return  array   Data from the cache.
 241    * @task cache
 242    */
 243   private function getBucketData($bucket_key, $rebuild_data = null) {
 244     $cache_key = $this->getBucketCacheKey($bucket_key);
 245
 246     // TODO: This cache stuff could be handled more gracefully, but the
 247     // database cache currently requires values to be strings and needs
 248     // some tweaking to support this as part of a stack. Our cache semantics
 249     // here are also unusual (not purely readthrough) because this cache is
 250     // appendable.
 251
 252     $cache_level1 = PhabricatorCaches::getRepositoryGraphL1Cache();
 253     $cache_level2 = PhabricatorCaches::getRepositoryGraphL2Cache();
 254     if ($rebuild_data === null) {
 255       $bucket_data = $cache_level1->getKey($cache_key);
 256       if ($bucket_data) {
 257         return $bucket_data;
 258       }
 259
 260       $bucket_data = $cache_level2->getKey($cache_key);
 261       if ($bucket_data) {
 262         $unserialized = @unserialize($bucket_data);
 263         if ($unserialized) {
 264           // Fill APC if we got a database hit but missed in APC.
 265           $cache_level1->setKey($cache_key, $unserialized);
 266           return $unserialized;
 267         }
 268       }
 269     }
 270
 271     if (!is_array($rebuild_data)) {
 272       $rebuild_data = array();
 273     }
 274
 275     $bucket_data = $this->rebuildBucket($bucket_key, $rebuild_data);
 276
 277     // Don't bother writing the data if we didn't update anything.
 278     if ($bucket_data !== $rebuild_data) {
 279       $cache_level2->setKey($cache_key, serialize($bucket_data));
 280       $cache_level1->setKey($cache_key, $bucket_data);
 281     }
 282
 283     return $bucket_data;
 284   }
 285
 286
 287   /**
 288    * Rebuild a cache bucket, amending existing data if available.
 289    *
 290    * @param   int     Bucket key, from @{method:getBucketKey}.
 291    * @param   array   Existing bucket data.
 292    * @return  array   Rebuilt bucket data.
 293    * @task cache
 294    */
 295   private function rebuildBucket($bucket_key, array $current_data) {
 296
 297     // First, check if we've already rebuilt this bucket. In some cases (like
 298     // browsing a repository at some commit) it's common to issue many lookups
 299     // against one commit. If that commit has been discovered but not yet
 300     // fully imported, we'll repeatedly attempt to rebuild the bucket. If the
 301     // first rebuild did not work, subsequent rebuilds are very unlikely to
 302     // have any effect. We can just skip the rebuild in these cases.
 303
 304     if (isset($this->rebuiltKeys[$bucket_key])) {
 305       return $current_data;
 306     } else {
 307       $this->rebuiltKeys[$bucket_key] = true;
 308     }
 309
 310     $bucket_min = ($bucket_key * $this->getBucketSize());
 311     $bucket_max = ($bucket_min + $this->getBucketSize()) - 1;
 312
 313     // We need to reload all of the commits in the bucket because there is
 314     // no guarantee that they'll get parsed in order, so we can fill large
 315     // commit IDs before small ones. Later on, we'll ignore the commits we
 316     // already know about.
 317
 318     $table_commit = new PhabricatorRepositoryCommit();
 319     $table_repository = new PhabricatorRepository();
 320     $conn_r = $table_commit->establishConnection('r');
 321
 322     // Find all the Git and Mercurial commits in the block which have completed
 323     // change import. We can't fill the cache accurately for commits which have
 324     // not completed change import, so just pretend we don't know about them.
 325     // In these cases, we will ultimately fall back to VCS queries.
 326
 327     $commit_rows = queryfx_all(
 328       $conn_r,
 329       'SELECT c.id FROM %T c
 330         JOIN %T r ON c.repositoryID = r.id AND r.versionControlSystem IN (%Ls)
 331         WHERE c.id BETWEEN %d AND %d
 332           AND (c.importStatus & %d) = %d',
 333       $table_commit->getTableName(),
 334       $table_repository->getTableName(),
 335       array(
 336         PhabricatorRepositoryType::REPOSITORY_TYPE_GIT,
 337         PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL,
 338       ),
 339       $bucket_min,
 340       $bucket_max,
 341       PhabricatorRepositoryCommit::IMPORTED_CHANGE,
 342       PhabricatorRepositoryCommit::IMPORTED_CHANGE);
 343
 344     // If we don't have any data, just return the existing data.
 345     if (!$commit_rows) {
 346       return $current_data;
 347     }
 348
 349     // Remove the commits we already have data for. We don't need to rebuild
 350     // these. If there's nothing left, return the existing data.
 351
 352     $commit_ids = ipull($commit_rows, 'id', 'id');
 353     $commit_ids = array_diff_key($commit_ids, $current_data);
 354
 355     if (!$commit_ids) {
 356       return $current_data;
 357     }
 358
 359     // Find all the path changes for the new commits.
 360     $path_changes = queryfx_all(
 361       $conn_r,
 362       'SELECT commitID, pathID FROM %T
 363         WHERE commitID IN (%Ld)
 364         AND (isDirect = 1 OR changeType = %d)',
 365       PhabricatorRepository::TABLE_PATHCHANGE,
 366       $commit_ids,
 367       DifferentialChangeType::TYPE_CHILD);
 368     $path_changes = igroup($path_changes, 'commitID');
 369
 370     // Find all the parents for the new commits.
 371     $parents = queryfx_all(
 372       $conn_r,
 373       'SELECT childCommitID, parentCommitID FROM %T
 374         WHERE childCommitID IN (%Ld)
 375         ORDER BY id ASC',
 376       PhabricatorRepository::TABLE_PARENTS,
 377       $commit_ids);
 378     $parents = igroup($parents, 'childCommitID');
 379
 380     // Build the actual data for the cache.
 381     foreach ($commit_ids as $commit_id) {
 382       $parent_ids = array();
 383       if (!empty($parents[$commit_id])) {
 384         foreach ($parents[$commit_id] as $row) {
 385           $parent_ids[] = (int)$row['parentCommitID'];
 386         }
 387       } else {
 388         // We expect all rows to have parents (commits with no parents get
 389         // an explicit "0" placeholder). If we're in an older repository, the
 390         // parent information might not have been populated yet. Decline to fill
 391         // the cache if we don't have the parent information, since the fill
 392         // will be incorrect.
 393         continue;
 394       }
 395
 396       if (isset($path_changes[$commit_id])) {
 397         $path_ids = $path_changes[$commit_id];
 398         foreach ($path_ids as $key => $path_id) {
 399           $path_ids[$key] = (int)$path_id['pathID'];
 400         }
 401         sort($path_ids);
 402       } else {
 403         $path_ids = array();
 404       }
 405
 406       $value = $parent_ids;
 407       $value[] = null;
 408       foreach ($path_ids as $path_id) {
 409         $value[] = $path_id;
 410       }
 411
 412       $current_data[$commit_id] = $value;
 413     }
 414
 415     return $current_data;
 416   }
 417
 418 }