Merge "docs: Fix typo"
[mediawiki.git] / includes / cache / BacklinkCache.php
blob9714b01906f3f203be79c2c81f6ab681e9d8bd63
1 <?php
2 /**
3 * Class for fetching backlink lists, approximate backlink counts and
4 * partitions.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
21 * @file
22 * @author Tim Starling
23 * @copyright © 2009, Tim Starling, Domas Mituzas
24 * @copyright © 2010, Max Sem
25 * @copyright © 2011, Antoine Musso
28 namespace MediaWiki\Cache;
30 use Iterator;
31 use LogicException;
32 use MediaWiki\Config\ServiceOptions;
33 use MediaWiki\HookContainer\HookContainer;
34 use MediaWiki\HookContainer\HookRunner;
35 use MediaWiki\Linker\LinksMigration;
36 use MediaWiki\MainConfigNames;
37 use MediaWiki\Page\PageIdentity;
38 use MediaWiki\Page\PageIdentityValue;
39 use MediaWiki\Page\PageReference;
40 use MediaWiki\Title\Title;
41 use MediaWiki\Title\TitleValue;
42 use Psr\Log\LoggerInterface;
43 use RuntimeException;
44 use stdClass;
45 use Wikimedia\ObjectCache\WANObjectCache;
46 use Wikimedia\Rdbms\Database;
47 use Wikimedia\Rdbms\IConnectionProvider;
48 use Wikimedia\Rdbms\IReadableDatabase;
49 use Wikimedia\Rdbms\IResultWrapper;
50 use Wikimedia\Rdbms\SelectQueryBuilder;
52 /**
53 * Class for fetching backlink lists, approximate backlink counts and
54 * partitions. This is a shared cache.
56 * Instances of this class should typically be fetched with the method
57 * ::getBacklinkCache() from the BacklinkCacheFactory service.
59 * Ideally you should only get your backlinks from here when you think
60 * there is some advantage in caching them. Otherwise, it's just a waste
61 * of memory.
63 class BacklinkCache {
/**
 * Option keys that the ServiceOptions passed to the constructor must provide.
 *
 * @internal Used by ServiceWiring.php
 */
public const CONSTRUCTOR_OPTIONS = [
	MainConfigNames::UpdateRowsPerJob,
];

/** TTL handed to WANObjectCache::getWithSetCallback() for counts and partitions */
private const CACHE_EXPIRY = 3600;

/**
 * Multi-dimensional array representing batches. Keys are:
 *  > (string) links table name
 *  > (int) batch size
 *  > 'numRows' : Number of rows for this link table
 *  > 'batches' : [ [ $start, $end ] ]
 *
 * @see BacklinkCache::partitionResult()
 * @var array[]
 */
private array $partitionCache = [];

/**
 * Complete, unpartitioned link result sets, keyed by links table name.
 * This is the raw data that gets split into batches for $partitionCache.
 *
 * Filled by BacklinkCache::queryLinks()
 *
 * @var IResultWrapper[]
 */
private array $fullResultCache = [];

/**
 * Cache for hasLinks(), keyed by links table name
 *
 * @var bool[]
 */
private array $hasLinksCache = [];

/** Cache used for link counts and partitions */
private WANObjectCache $wanCache;

/** Runner for the BacklinkCacheGetPrefix / BacklinkCacheGetConditions hooks */
private HookRunner $hookRunner;

/** Local copy of the PageReference whose backlinks are being looked up */
private PageReference $page;

private IConnectionProvider $dbProvider;
private ServiceOptions $options;
private LinksMigration $linksMigration;
private LoggerInterface $logger;
/**
 * Build a backlink cache bound to a single page.
 *
 * @param ServiceOptions $options Must provide every key in self::CONSTRUCTOR_OPTIONS
 * @param LinksMigration $linksMigration Supplies query conditions for migrated links tables
 * @param WANObjectCache $wanCache Cache used for link counts and partitions
 * @param HookContainer $hookContainer Wrapped in a HookRunner for the BacklinkCache hooks
 * @param IConnectionProvider $dbProvider Source of the replica database connection
 * @param LoggerInterface $logger Receives debug messages about cache hits and misses
 * @param PageReference $page Page to create a backlink cache for
 */
public function __construct(
	ServiceOptions $options,
	LinksMigration $linksMigration,
	WANObjectCache $wanCache,
	HookContainer $hookContainer,
	IConnectionProvider $dbProvider,
	LoggerInterface $logger,
	PageReference $page
) {
	// Validate the options before storing anything on the instance.
	$options->assertRequiredOptions( self::CONSTRUCTOR_OPTIONS );

	$this->page = $page;
	$this->options = $options;
	$this->logger = $logger;
	$this->dbProvider = $dbProvider;
	$this->wanCache = $wanCache;
	$this->linksMigration = $linksMigration;
	$this->hookRunner = new HookRunner( $hookContainer );
}
/**
 * Return the page this cache was constructed for.
 *
 * @since 1.37
 * @return PageReference
 */
public function getPage(): PageReference {
	return $this->page;
}
/**
 * Fetch a replica database connection from the connection provider.
 *
 * @return IReadableDatabase
 */
private function getDB() {
	$replica = $this->dbProvider->getReplicaDatabase();

	return $replica;
}
/**
 * Iterate over the pages that link to this page through the given table.
 * Results are cached in process memory only.
 *
 * @param string $table Links table name
 * @param int|bool $startId Lowest source page ID to include, or false for no lower bound
 * @param int|bool $endId Highest source page ID to include, or false for no upper bound
 * @param int|float $max Integer, or INF for no max
 * @return Iterator<PageIdentity>
 * @since 1.37
 */
public function getLinkPages(
	string $table, $startId = false, $endId = false, $max = INF
): Iterator {
	$bounded = is_finite( $max );
	$yielded = 0;

	foreach ( $this->queryLinks( $table, $startId, $endId, $max ) as $link ) {
		yield PageIdentityValue::localIdentity(
			$link->page_id, $link->page_namespace, $link->page_title );

		// queryLinks() may hand back a cached result set that is larger than $max,
		// so the cap has to be enforced here as well.
		if ( $bounded && ++$yielded >= $max ) {
			return;
		}
	}
}
/**
 * Fetch the backlink rows for a given table, using the in-process cache
 * of full result sets when the whole ID range is requested.
 *
 * @param string $table Links table name
 * @param int|bool $startId Lowest source page ID to include, or false for no lower bound
 * @param int|bool $endId Highest source page ID to include, or false for no upper bound
 * @param int|float $max A hint for the maximum number of rows to return.
 *  May return more rows if there is a previously cached result set.
 * @param string $select 'all' or 'ids'
 * @return IResultWrapper
 */
private function queryLinks( $table, $startId, $endId, $max, $select = 'all' ) {
	$wholeRange = !$startId && !$endId;

	if ( $wholeRange && isset( $this->fullResultCache[$table] ) ) {
		$this->logger->debug( __METHOD__ . ': got results from cache' );

		return $this->fullResultCache[$table];
	}

	$this->logger->debug( __METHOD__ . ': got results from DB' );
	$builder = $this->initQueryBuilderForTable( $table, $select );
	$fromField = $this->getPrefix( $table ) . '_from';

	// Filter on the links table's own "from" column instead of the joined
	// page_id: the database does not necessarily propagate the index
	// through the join.
	if ( $startId ) {
		$lowerBound = $this->getDB()->expr( $fromField, '>=', $startId );
		$builder->where( $lowerBound );
	}
	if ( $endId ) {
		$upperBound = $this->getDB()->expr( $fromField, '<=', $endId );
		$builder->where( $upperBound );
	}
	$builder->orderBy( $fromField );
	if ( is_finite( $max ) && $max > 0 ) {
		$builder->limit( $max );
	}

	$rows = $builder->caller( __METHOD__ )->fetchResultSet();

	// Only a full-column, whole-range result that came in under the limit is
	// known to be complete, so only that one may be reused later.
	$cacheable = $select === 'all' && $wholeRange && $rows->numRows() < $max;
	if ( !$cacheable ) {
		$this->logger->debug( __METHOD__ . ": results from DB were uncacheable" );

		return $rows;
	}

	$this->fullResultCache[$table] = $rows;

	return $rows;
}
/**
 * Look up the column-name prefix used by a links table.
 *
 * Tables not listed here are passed to the BacklinkCacheGetPrefix hook.
 *
 * @param string $table Links table name
 * @return null|string
 * @throws LogicException If neither the built-in list nor the hook knows the table
 */
private function getPrefix( $table ) {
	static $prefixes = [
		'pagelinks' => 'pl',
		'imagelinks' => 'il',
		'categorylinks' => 'cl',
		'templatelinks' => 'tl',
		'redirect' => 'rd',
	];

	$builtIn = $prefixes[$table] ?? null;
	if ( $builtIn !== null ) {
		return $builtIn;
	}

	// Not a built-in table: give the hook a chance to fill in the prefix.
	$prefix = null;
	$this->hookRunner->onBacklinkCacheGetPrefix( $table, $prefix );
	if ( !$prefix ) {
		throw new LogicException( "Invalid table \"$table\" in " . __CLASS__ );
	}

	return $prefix;
}
/**
 * Create a SelectQueryBuilder that selects the backlinks of this page from
 * the given links table, joining the page table unless only IDs are wanted.
 *
 * @param string $table Links table name
 * @param string $select 'ids' to select only the source page ID (no page join);
 *  any other value selects page_namespace, page_title and page_id
 * @return SelectQueryBuilder
 * @throws LogicException If the table is unknown and the hook supplies no conditions
 */
private function initQueryBuilderForTable( string $table, string $select ): SelectQueryBuilder {
	$prefix = $this->getPrefix( $table );
	$builder = $this->getDB()->newSelectQueryBuilder();
	$idsOnly = $select === 'ids';

	$builder->select(
		$idsOnly
			? [ 'page_id' => $prefix . '_from' ]
			: [ 'page_namespace', 'page_title', 'page_id' ]
	);
	$builder->from( $table );

	/*
	 * Tables known to this method get a proper join() on page_id={$prefix}_from
	 * further down. A table that is only supported through the hook yields a
	 * single $conds array, so for those the page table is merely added to the
	 * table list (ANSI-89 style) and the join condition lives in the WHERE part.
	 */
	$builtInTable = true;

	if ( $table === 'pagelinks' || $table === 'templatelinks' ) {
		$builder->where(
			$this->linksMigration->getLinksConditions( $table, TitleValue::newFromPage( $this->page ) )
		);
	} elseif ( $table === 'redirect' ) {
		$builder->where( [
			"{$prefix}_namespace" => $this->page->getNamespace(),
			"{$prefix}_title" => $this->page->getDBkey(),
			"{$prefix}_interwiki" => [ '', null ],
		] );
	} elseif ( $table === 'imagelinks' || $table === 'categorylinks' ) {
		$builder->where( [
			"{$prefix}_to" => $this->page->getDBkey(),
		] );
	} else {
		$builtInTable = false;
		$conds = null;
		$this->hookRunner->onBacklinkCacheGetConditions(
			$table,
			Title::newFromPageReference( $this->page ),
			$conds
		);
		if ( !$conds ) {
			throw new LogicException( "Invalid table \"$table\" in " . __CLASS__ );
		}
		if ( $idsOnly ) {
			// The page table is not part of this query, so drop any clause
			// that mentions page_id (string matching; admittedly janky).
			$conds = array_filter(
				(array)$conds,
				static fn ( $clause ) => !preg_match( '/(\b|=)page_id(\b|=)/', (string)$clause )
			);
		} else {
			$builder->table( 'page' ); // join condition in $conds
		}
		$builder->where( $conds );
	}

	if ( !$idsOnly ) {
		if ( $builtInTable ) {
			$builder->join( 'page', null, "page_id={$prefix}_from" );
		}
		$builder->straightJoinOption();
	}

	return $builder;
}
/**
 * Tell whether this page has at least one backlink in the given table.
 * Only process-level caches are consulted, since the WAN cache is
 * potentially stale (T368006).
 *
 * @param string $table Links table name
 * @return bool
 */
public function hasLinks( $table ) {
	$known = $this->hasLinksCache[$table] ?? null;
	if ( $known !== null ) {
		return $known;
	}

	// Reuse a row count that is already held in memory, if there is one.
	if ( isset( $this->partitionCache[$table] ) ) {
		$firstEntry = reset( $this->partitionCache[$table] );

		return (bool)$firstEntry['numRows'];
	}
	if ( isset( $this->fullResultCache[$table] ) ) {
		return (bool)$this->fullResultCache[$table]->numRows();
	}

	// Otherwise ask for a single row and remember the answer.
	$probe = $this->queryLinks( $table, false, false, 1 );
	$found = (bool)$probe->numRows();
	$this->hasLinksCache[$table] = $found;

	return $found;
}
/**
 * Return the approximate number of backlinks in the given table.
 *
 * In-process data is used when available; otherwise the count comes from
 * the WAN cache, which regenerates it through partition() on a miss.
 *
 * @param string $table Links table name
 * @return int
 */
public function getNumLinks( $table ) {
	if ( isset( $this->partitionCache[$table] ) ) {
		$firstEntry = reset( $this->partitionCache[$table] );

		return $firstEntry['numRows'];
	}

	if ( isset( $this->fullResultCache[$table] ) ) {
		return $this->fullResultCache[$table]->numRows();
	}

	$cacheKey = $this->wanCache->makeKey(
		'numbacklinks',
		CacheKeyHelper::getKeyForPage( $this->page ),
		$table
	);

	$regenerate = function ( $oldValue, &$ttl, array &$setOpts ) use ( $table ) {
		$setOpts += Database::getCacheSetOptions( $this->getDB() );

		// partition() batches the query and skips the JOIN. The batch size is
		// $wgUpdateRowsPerJob purely so that jobs can reuse the cached partitions.
		$rowsPerJob = $this->options->get( MainConfigNames::UpdateRowsPerJob );
		$this->partition( $table, $rowsPerJob );

		return $this->partitionCache[$table][$rowsPerJob]['numRows'];
	};

	return $this->wanCache->getWithSetCallback( $cacheKey, self::CACHE_EXPIRY, $regenerate );
}
/**
 * Partition the backlinks into batches.
 * Returns an array giving the start and end of each range. The first
 * batch has a start of false, and the last batch has an end of false.
 *
 * Lookup order: the in-process partition cache, then a partition computed
 * from an in-process full result set, then the WAN cache (which queries the
 * database in chunks on a miss).
 *
 * The in-process cache entry is written only once it has been fully built.
 * No placeholder is stored up front, so an exception thrown while computing
 * the partitions cannot leave behind a non-array entry that isset() would
 * later treat as a valid cache hit.
 *
 * @param string $table The links table name
 * @param int $batchSize
 * @return array
 */
public function partition( $table, $batchSize ) {
	if ( isset( $this->partitionCache[$table][$batchSize] ) ) {
		$this->logger->debug( __METHOD__ . ": got from partition cache" );

		return $this->partitionCache[$table][$batchSize]['batches'];
	}

	if ( isset( $this->fullResultCache[$table] ) ) {
		$res = $this->fullResultCache[$table];
		$numRows = $res->numRows();
		$batches = $this->partitionResult( $res, $numRows, $batchSize );
		$this->openBatchEnds( $batches );
		$this->partitionCache[$table][$batchSize] = [ 'numRows' => $numRows, 'batches' => $batches ];
		$this->logger->debug( __METHOD__ . ": got from full result cache" );

		return $batches;
	}

	$cacheEntry = $this->wanCache->getWithSetCallback(
		$this->wanCache->makeKey(
			'backlinks',
			CacheKeyHelper::getKeyForPage( $this->page ),
			$table,
			$batchSize
		),
		self::CACHE_EXPIRY,
		function ( $oldValue, &$ttl, array &$setOpts ) use ( $table, $batchSize ) {
			$setOpts += Database::getCacheSetOptions( $this->getDB() );

			$value = [ 'numRows' => 0, 'batches' => [] ];

			// Do the selects in batches to avoid client-side OOMs (T45452).
			// Use a LIMIT that plays well with $batchSize to keep equal sized partitions.
			$selectSize = max( $batchSize, 200_000 - ( 200_000 % $batchSize ) );
			$start = false;
			do {
				$res = $this->queryLinks( $table, $start, false, $selectSize, 'ids' );
				$numRows = $res->numRows();
				$batches = $this->partitionResult( $res, $numRows, $batchSize );
				// Merge the link count and range partitions for this chunk
				$value['numRows'] += $numRows;
				$value['batches'] = array_merge( $value['batches'], $batches );
				if ( count( $batches ) ) {
					// pick up after this inclusive range
					$start = end( $batches )[1] + 1;
				}
				// A short chunk means the table has been exhausted
			} while ( $numRows >= $selectSize );
			// Make sure the first range has start=false and the last one has end=false
			$this->openBatchEnds( $value['batches'] );

			return $value;
		}
	);

	// Publish to the in-process cache only now that the entry is complete.
	$this->partitionCache[$table][$batchSize] = $cacheEntry;

	return $cacheEntry['batches'];
}
/**
 * Open up both ends of a list of batches, in place: the start of the first
 * batch and the end of the last batch are set to false, so that together
 * the batches cover the entire ID range from 0 to infinity. An empty list
 * becomes a single fully open batch.
 *
 * @param array[] &$batches List of [ $start, $end ] pairs, modified by reference
 */
private function openBatchEnds( array &$batches ) {
	if ( $batches === [] ) {
		$batches = [ [ false, false ] ];

		return;
	}

	$batches[0][0] = false;
	$lastKey = array_key_last( $batches );
	$batches[$lastKey][1] = false;
}
/**
 * Split a backlink result set into consecutive batches and report the
 * page_id found at the first and last row of each batch.
 *
 * @param IResultWrapper $res Database result; rows are read via seek() and
 *  fetchObject() and must expose a page_id field
 * @param int $numRows The number of rows to use from the result set
 * @param int $batchSize
 * @return int[][] List of [ $start, $end ] page ID pairs, both inclusive
 * @throws RuntimeException If a batch starts at a higher page ID than it ends
 */
private function partitionResult( $res, $numRows, $batchSize ) {
	$ranges = [];
	$batchCount = ceil( $numRows / $batchSize );

	for ( $batch = 0; $batch < $batchCount; $batch++ ) {
		// First row of this batch
		$res->seek( $batch * $batchSize );
		$firstRow = $res->fetchObject();
		$start = (int)$firstRow->page_id;

		// Last row of this batch; the final batch may be shorter than $batchSize
		$res->seek( min( $numRows - 1, ( $batch + 1 ) * $batchSize - 1 ) );
		$lastRow = $res->fetchObject();
		$end = (int)$lastRow->page_id;

		// Check order
		if ( $start && $end && $start > $end ) {
			throw new RuntimeException( __METHOD__ . ': Internal error: query result out of order' );
		}

		$ranges[] = [ $start, $end ];
	}

	return $ranges;
}
/**
 * Iterate over the cascade-protected pages that use this page as a
 * template or file, as PageIdentity objects.
 *
 * @return Iterator<PageIdentity>
 * @since 1.37
 */
public function getCascadeProtectedLinkPages(): Iterator {
	$rows = $this->getCascadeProtectedLinksInternal();

	foreach ( $rows as $link ) {
		yield PageIdentityValue::localIdentity(
			$link->page_id, $link->page_namespace, $link->page_title );
	}
}
/**
 * Collect the rows of all pages that have a cascading restriction
 * (pr_cascade = 1) and link to this page through templatelinks, plus
 * through imagelinks when this page is in the file namespace.
 *
 * @return stdClass[] One row per page, with page_namespace, page_title and page_id
 */
private function getCascadeProtectedLinksInternal(): array {
	$dbr = $this->getDB();
	$fields = [ 'page_namespace', 'page_title', 'page_id' ];

	// @todo: use UNION without breaking tests that use temp tables
	$templateConds = $this->linksMigration->getLinksConditions(
		'templatelinks', TitleValue::newFromPage( $this->page )
	);
	$resultSets = [
		$dbr->newSelectQueryBuilder()
			->select( $fields )
			->from( 'templatelinks' )
			->join( 'page_restrictions', null, 'tl_from = pr_page' )
			->join( 'page', null, 'page_id = tl_from' )
			->where( $templateConds )
			->andWhere( [ 'pr_cascade' => 1 ] )
			->distinct()
			->caller( __METHOD__ )->fetchResultSet(),
	];

	// The imagelinks query is only run for pages in the file namespace
	if ( $this->page->getNamespace() === NS_FILE ) {
		$resultSets[] = $dbr->newSelectQueryBuilder()
			->select( $fields )
			->from( 'imagelinks' )
			->join( 'page_restrictions', null, 'il_from = pr_page' )
			->join( 'page', null, 'page_id = il_from' )
			->where( [
				'il_to' => $this->page->getDBkey(),
				'pr_cascade' => 1,
			] )
			->distinct()
			->caller( __METHOD__ )->fetchResultSet();
	}

	// Keying by page_id collapses a page that shows up in both result sets
	$rowsByPageId = [];
	foreach ( $resultSets as $resultSet ) {
		foreach ( $resultSet as $link ) {
			$rowsByPageId[$link->page_id] = $link;
		}
	}

	// The keys were only needed for de-duplication
	return array_values( $rowsByPageId );
}
/**
 * Keep the un-namespaced class name working for existing callers.
 *
 * @deprecated class alias since 1.42
 */
class_alias( BacklinkCache::class, 'BacklinkCache' );