3 * Class for fetching backlink lists, approximate backlink counts and partitions.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
22 * @author Tim Starling
23 * @copyright © 2009, Tim Starling, Domas Mituzas
24 * @copyright © 2010, Max Sem
25 * @copyright © 2011, Antoine Musso
28 namespace MediaWiki\Cache
;
use Iterator;
use LogicException;
use MediaWiki\Config\ServiceOptions;
use MediaWiki\HookContainer\HookContainer;
use MediaWiki\HookContainer\HookRunner;
use MediaWiki\Linker\LinksMigration;
use MediaWiki\MainConfigNames;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Page\PageIdentityValue;
use MediaWiki\Page\PageReference;
use MediaWiki\Title\Title;
use MediaWiki\Title\TitleValue;
use Psr\Log\LoggerInterface;
use RuntimeException;
use Wikimedia\ObjectCache\WANObjectCache;
use Wikimedia\Rdbms\Database;
use Wikimedia\Rdbms\IConnectionProvider;
use Wikimedia\Rdbms\IReadableDatabase;
use Wikimedia\Rdbms\IResultWrapper;
use Wikimedia\Rdbms\SelectQueryBuilder;
53 * Class for fetching backlink lists, approximate backlink counts and
54 * partitions. This is a shared cache.
56 * Instances of this class should typically be fetched with the method
57 * ::getBacklinkCache() from the BacklinkCacheFactory service.
59 * Ideally you should only get your backlinks from here when you think
60 * there is some advantage in caching them. Otherwise, it's just a waste
/**
 * Names of the configuration options that the ServiceOptions object
 * handed to the constructor is required to contain.
 *
 * @internal Used by ServiceWiring.php
 */
public const CONSTRUCTOR_OPTIONS = [
    MainConfigNames::UpdateRowsPerJob,
];
/**
 * Multi-dimensional array representing batches. Keys are:
 *  > (string) links table name
 *   > (int) batch size
 *    > 'numRows' : Number of rows for this link table
 *    > 'batches' : [ [ $start, $end ] ]
 *
 * @see BacklinkCache::partitionResult()
 * @var array[]
 */
private $partitionCache = [];

/**
 * Contains the whole links from a database result.
 * This is raw data that will be partitioned in $partitionCache
 *
 * Initialized with BacklinkCache::queryLinks()
 *
 * @var IResultWrapper[]
 */
private $fullResultCache = [];

/**
 * Cache for hasLinks(), keyed by links table name
 *
 * @var bool[]
 */
private $hasLinksCache = [];

// The three properties below are assigned/read by the methods of this class and
// must be declared explicitly: relying on dynamic properties is deprecated as of PHP 8.2.

/** @var WANObjectCache */
private $wanCache;

/** @var HookRunner */
private $hookRunner;

/**
 * Local copy of a PageReference object
 *
 * @var PageReference
 */
private $page;

// NOTE(review): presumably the TTL (in seconds) used for the WAN cache entries — confirm at the call sites.
private const CACHE_EXPIRY = 3600;
private IConnectionProvider $dbProvider;
private ServiceOptions $options;
private LinksMigration $linksMigration;
private LoggerInterface $logger;
/**
 * Create a new BacklinkCache
 *
 * @param ServiceOptions $options Must contain every option listed in self::CONSTRUCTOR_OPTIONS
 * @param LinksMigration $linksMigration
 * @param WANObjectCache $wanCache
 * @param HookContainer $hookContainer Wrapped in a HookRunner for the BacklinkCache* hooks
 * @param IConnectionProvider $dbProvider
 * @param LoggerInterface $logger
 * @param PageReference $page Page to create a backlink cache for
 */
public function __construct(
    ServiceOptions $options,
    LinksMigration $linksMigration,
    WANObjectCache $wanCache,
    HookContainer $hookContainer,
    IConnectionProvider $dbProvider,
    LoggerInterface $logger,
    PageReference $page
) {
    // Fail early if the caller forgot a required configuration option.
    $options->assertRequiredOptions( self::CONSTRUCTOR_OPTIONS );

    $this->options = $options;
    $this->linksMigration = $linksMigration;
    $this->wanCache = $wanCache;
    $this->hookRunner = new HookRunner( $hookContainer );
    $this->dbProvider = $dbProvider;
    $this->logger = $logger;
    // Every query built by this class is scoped to this page.
    $this->page = $page;
}
/**
 * Get the page whose backlinks this cache describes.
 *
 * @return PageReference
 */
public function getPage(): PageReference {
    return $this->page;
}
/**
 * Obtain a replica database connection from the connection provider.
 *
 * @return IReadableDatabase
 */
private function getDB() {
    $replica = $this->dbProvider->getReplicaDatabase();

    return $replica;
}
/**
 * Get the backlinks for a given table. Cached in process memory only.
 *
 * @param string $table
 * @param int|bool $startId
 * @param int|bool $endId
 * @param int|float $max Integer, or INF for no max
 * @return Iterator<PageIdentity>
 */
public function getLinkPages(
    string $table, $startId = false, $endId = false, $max = INF
): Iterator {
    $capped = is_finite( $max );
    $yielded = 0;

    foreach ( $this->queryLinks( $table, $startId, $endId, $max ) as $row ) {
        yield PageIdentityValue::localIdentity(
            $row->page_id, $row->page_namespace, $row->page_title );

        // queryLinks() may return too many rows (it can hand back a previously
        // cached full result), so enforce the cap here as well.
        if ( $capped && ++$yielded >= $max ) {
            break;
        }
    }
}
/**
 * Get the backlinks for a given table. Cached in process memory only.
 *
 * @param string $table
 * @param int|bool $startId Lower bound (inclusive) on the "from" page ID, or false for none
 * @param int|bool $endId Upper bound (inclusive) on the "from" page ID, or false for none
 * @param int|float $max A hint for the maximum number of rows to return.
 *   May return more rows if there is a previously cached result set.
 * @param string $select 'all' or 'ids'
 * @return IResultWrapper
 */
private function queryLinks( $table, $startId, $endId, $max, $select = 'all' ) {
    $unbounded = !$startId && !$endId;

    if ( $unbounded && isset( $this->fullResultCache[$table] ) ) {
        $this->logger->debug( __METHOD__ . ': got results from cache' );

        return $this->fullResultCache[$table];
    }

    $this->logger->debug( __METHOD__ . ': got results from DB' );
    $queryBuilder = $this->initQueryBuilderForTable( $table, $select );
    $fromField = $this->getPrefix( $table ) . '_from';

    // Use the from field in the condition rather than the joined page_id,
    // because databases are stupid and don't necessarily propagate indexes.
    // Only add a bound that was actually supplied: `false` means "open ended".
    if ( $startId ) {
        $queryBuilder->where(
            $this->getDB()->expr( $fromField, '>=', $startId )
        );
    }
    if ( $endId ) {
        $queryBuilder->where(
            $this->getDB()->expr( $fromField, '<=', $endId )
        );
    }
    $queryBuilder->orderBy( $fromField );
    if ( is_finite( $max ) && $max > 0 ) {
        $queryBuilder->limit( $max );
    }

    $res = $queryBuilder->caller( __METHOD__ )->fetchResultSet();

    if ( $select === 'all' && $unbounded && $res->numRows() < $max ) {
        // The full results fit within the limit, so cache them
        $this->fullResultCache[$table] = $res;
    } else {
        $this->logger->debug( __METHOD__ . ": results from DB were uncacheable" );
    }

    return $res;
}
239 * Get the field name prefix for a given table
240 * @param string $table
241 * @return null|string
// Resolve the column-name prefix used by a links table (for example 'imagelinks' maps to 'il').
// A table found in $prefixes is answered directly; otherwise the BacklinkCacheGetPrefix hook is
// invoked with $table and $prefix, and a LogicException is raised for an unrecognised table.
// NOTE(review): the declaration of $prefixes, the initialisation of $prefix and the branch that
// returns a hook-supplied prefix are not visible in this listing, and the table list may be
// incomplete — confirm against the full file before changing anything here.
243 private function getPrefix( $table ) {
246 'imagelinks' => 'il',
247 'categorylinks' => 'cl',
248 'templatelinks' => 'tl',
// Fast path: prefix known without consulting any hook.
252 if ( isset( $prefixes[$table] ) ) {
253 return $prefixes[$table];
// Let extensions supply a prefix for tables this class does not know about.
256 $this->hookRunner
->onBacklinkCacheGetPrefix( $table, $prefix );
// Nobody recognised the table: this is a programming error, not a runtime condition.
260 throw new LogicException( "Invalid table \"$table\" in " . __CLASS__
);
266 * Initialize a new SelectQueryBuilder for selecting backlinks,
267 * with a join on the page table if needed.
269 * @param string $table
270 * @param string $select
271 * @return SelectQueryBuilder
// Build the SelectQueryBuilder used to read backlinks of $this->page from $table.
// With $select === 'ids' only the "{prefix}_from" column is selected (aliased as page_id) and the
// page table is not joined; otherwise page_namespace, page_title and page_id are selected and
// the page table is joined.
// NOTE(review): the `switch ( $table )` header, several `case` labels, the `break`/`default`
// lines and the initialisation of $knownTable/$conds are not visible in this listing — confirm
// against the full file before relying on the branch descriptions below.
273 private function initQueryBuilderForTable( string $table, string $select ): SelectQueryBuilder
{
274 $prefix = $this->getPrefix( $table );
275 $queryBuilder = $this->getDB()->newSelectQueryBuilder();
// The 'ids' mode can be answered from the links table alone.
276 $joinPageTable = $select !== 'ids';
278 if ( $select === 'ids' ) {
279 $queryBuilder->select( [ 'page_id' => $prefix . '_from' ] );
281 $queryBuilder->select( [ 'page_namespace', 'page_title', 'page_id' ] );
283 $queryBuilder->from( $table );
286 * If the table is one of the tables known to this method,
287 * we can use a nice join() method later, always joining on page_id={$prefix}_from.
288 * If the table is unknown here, and only supported via a hook,
289 * the hook only produces a single $conds array,
290 * so we have to use a traditional / ANSI-89 JOIN,
291 * with the page table just added to the list of tables and the join conds in the WHERE part.
// Target conditions for this table come from LinksMigration::getLinksConditions().
297 case 'templatelinks':
298 $queryBuilder->where(
299 $this->linksMigration
->getLinksConditions( $table, TitleValue
::newFromPage( $this->page
) )
// Target identified by namespace + title, restricted to an empty or NULL interwiki value.
303 $queryBuilder->where( [
304 "{$prefix}_namespace" => $this->page
->getNamespace(),
305 "{$prefix}_title" => $this->page
->getDBkey(),
306 "{$prefix}_interwiki" => [ '', null ],
// Target identified by DB key only ("{prefix}_to").
310 case 'categorylinks':
311 $queryBuilder->where( [
312 "{$prefix}_to" => $this->page
->getDBkey(),
// Table not handled above: ask the BacklinkCacheGetConditions hook for the conditions.
318 $this->hookRunner
->onBacklinkCacheGetConditions( $table,
319 Title
::newFromPageReference( $this->page
),
323 throw new LogicException( "Invalid table \"$table\" in " . __CLASS__
);
325 if ( $joinPageTable ) {
326 $queryBuilder->table( 'page' ); // join condition in $conds
328 // remove any page_id condition from $conds
329 $conds = array_filter( (array)$conds, static function ( $clause ) { // kind of janky
330 return !preg_match( '/(\b|=)page_id(\b|=)/', (string)$clause );
333 $queryBuilder->where( $conds );
// Known tables get an explicit join on page_id = {prefix}_from.
337 if ( $knownTable && $joinPageTable ) {
338 $queryBuilder->join( 'page', null, "page_id={$prefix}_from" );
340 if ( $joinPageTable ) {
341 $queryBuilder->straightJoinOption();
344 return $queryBuilder;
/**
 * Check if there are any backlinks. Only use the process cache, since the
 * WAN cache is potentially stale (T368006).
 *
 * @param string $table
 * @return bool
 */
public function hasLinks( $table ) {
    if ( isset( $this->hasLinksCache[$table] ) ) {
        return $this->hasLinksCache[$table];
    }

    if ( isset( $this->partitionCache[$table] ) ) {
        // partition() parks a `false` placeholder in this cache before it fills the
        // entry in; if it was interrupted the placeholder is still there. Only trust
        // real entries instead of indexing into `false`.
        foreach ( $this->partitionCache[$table] as $entry ) {
            if ( is_array( $entry ) ) {
                return (bool)$entry['numRows'];
            }
        }
    }

    if ( isset( $this->fullResultCache[$table] ) ) {
        return (bool)$this->fullResultCache[$table]->numRows();
    }

    // One row is enough to answer the question.
    $hasLinks = (bool)$this->queryLinks( $table, false, false, 1 )->numRows();
    $this->hasLinksCache[$table] = $hasLinks;

    return $hasLinks;
}
371 * Get the approximate number of backlinks
372 * @param string $table
// Return the (approximate) number of backlinks in $table, preferring process-local data:
// first the 'numRows' of the first partition-cache entry, then the row count of the cached full
// result, and only then the WAN cache, whose regeneration callback runs partition().
// NOTE(review): parts of the makeKey() argument list and the TTL argument of
// getWithSetCallback() are not visible in this listing — confirm against the full file.
// NOTE(review): reset() yields `false` if partition() left its placeholder behind (see
// partition()); indexing it with ['numRows'] would then give null — worth guarding.
375 public function getNumLinks( $table ) {
376 if ( isset( $this->partitionCache
[$table] ) ) {
377 $entry = reset( $this->partitionCache
[$table] );
378 return $entry['numRows'];
381 if ( isset( $this->fullResultCache
[$table] ) ) {
382 return $this->fullResultCache
[$table]->numRows();
385 return $this->wanCache
->getWithSetCallback(
386 $this->wanCache
->makeKey(
388 CacheKeyHelper
::getKeyForPage( $this->page
),
// Regeneration callback: invoked by getWithSetCallback() to compute the value.
392 function ( $oldValue, &$ttl, array &$setOpts ) use ( $table ) {
393 $setOpts +
= Database
::getCacheSetOptions( $this->getDB() );
395 // Use partition() since it will batch the query and skip the JOIN.
396 // Use $wgUpdateRowsPerJob just to encourage cache reuse for jobs.
397 $batchSize = $this->options
->get( MainConfigNames
::UpdateRowsPerJob
);
// partition() populates $this->partitionCache[$table][$batchSize] as a side effect.
398 $this->partition( $table, $batchSize );
399 return $this->partitionCache
[$table][$batchSize]['numRows'];
405 * Partition the backlinks into batches.
406 * Returns an array giving the start and end of each range. The first
407 * batch has a start of false, and the last batch has an end of false.
409 * @param string $table The links table name
410 * @param int $batchSize
// Split the backlinks of $table into ranges of at most $batchSize rows and return the list of
// [ start, end ] pairs. Results are looked up in this order: the process-local partition cache,
// the process-local full result cache, then the WAN cache (whose callback queries the database
// in chunks).
// NOTE(review): parts of the makeKey() argument list, the TTL argument, the `$start`
// initialisation / `do {` header and the callback's return statement are not visible in this
// listing — confirm against the full file.
// NOTE(review): the `false` placeholder written below stays in $partitionCache if anything after
// it throws; isset() treats `false` as set, so a later call would take the first branch and
// index into `false` — worth confirming and guarding.
413 public function partition( $table, $batchSize ) {
414 if ( isset( $this->partitionCache
[$table][$batchSize] ) ) {
415 $this->logger
->debug( __METHOD__
. ": got from partition cache" );
417 return $this->partitionCache
[$table][$batchSize]['batches'];
// Reserve the slot, then work on it through a reference so assignments land in the cache.
420 $this->partitionCache
[$table][$batchSize] = false;
421 $cacheEntry =& $this->partitionCache
[$table][$batchSize];
// A full result set is already in memory: partition it without touching the database.
423 if ( isset( $this->fullResultCache
[$table] ) ) {
424 $res = $this->fullResultCache
[$table];
425 $numRows = $res->numRows();
426 $batches = $this->partitionResult( $res, $numRows, $batchSize );
427 $this->openBatchEnds( $batches );
428 $cacheEntry = [ 'numRows' => $numRows, 'batches' => $batches ];
429 $this->logger
->debug( __METHOD__
. ": got from full result cache" );
431 return $cacheEntry['batches'];
434 $cacheEntry = $this->wanCache
->getWithSetCallback(
435 $this->wanCache
->makeKey(
437 CacheKeyHelper
::getKeyForPage( $this->page
),
// Regeneration callback: invoked by getWithSetCallback() to compute the value.
442 function ( $oldValue, &$ttl, array &$setOpts ) use ( $table, $batchSize ) {
443 $setOpts +
= Database
::getCacheSetOptions( $this->getDB() );
445 $value = [ 'numRows' => 0, 'batches' => [] ];
447 // Do the selects in batches to avoid client-side OOMs (T45452).
448 // Use a LIMIT that plays well with $batchSize to keep equal sized partitions.
// i.e. the largest multiple of $batchSize not above 200000, but never less than $batchSize.
449 $selectSize = max( $batchSize, 200_000
- ( 200_000 %
$batchSize ) );
// 'ids' mode: only the "from" page IDs are needed to compute range boundaries.
452 $res = $this->queryLinks( $table, $start, false, $selectSize, 'ids' );
453 $numRows = $res->numRows();
454 $batches = $this->partitionResult( $res, $numRows, $batchSize );
455 // Merge the link count and range partitions for this chunk
456 $value['numRows'] +
= $numRows;
457 $value['batches'] = array_merge( $value['batches'], $batches );
458 if ( count( $batches ) ) {
459 // pick up after this inclusive range
460 $start = end( $batches )[1] +
1;
// A short chunk (fewer rows than requested) means the table has been exhausted.
462 } while ( $numRows >= $selectSize );
463 // Make sure the first range has start=false and the last one has end=false
464 $this->openBatchEnds( $value['batches'] );
470 return $cacheEntry['batches'];
/**
 * Open up both ends of a list of batches in place: the first batch loses its
 * start bound and the last batch loses its end bound (both become false), so
 * that together the batches cover the entire ID range from 0 to infinity.
 * An empty list becomes a single fully open batch.
 *
 * @param array &$batches List of [ start, end ] pairs, modified in place
 */
private function openBatchEnds( array &$batches ) {
    if ( count( $batches ) === 0 ) {
        $batches = [ [ false, false ] ];

        return;
    }

    $batches[0][0] = false;
    $finalKey = array_key_last( $batches );
    $batches[$finalKey][1] = false;
}
/**
 * Partition a DB result with backlinks in it into batches
 *
 * @param IResultWrapper $res Database result; each row must expose page_id
 * @param int $numRows The number of rows to use from the result set
 * @param int $batchSize
 * @return array[] List of [ start, end ] page ID pairs, both ends inclusive;
 *   empty when $numRows is 0
 * @throws RuntimeException If a batch starts at a higher page ID than it ends
 */
private function partitionResult( $res, $numRows, $batchSize ) {
    // Initialise up front so that an empty result yields [] rather than an undefined variable.
    $batches = [];
    $numBatches = ceil( $numRows / $batchSize );

    for ( $i = 0; $i < $numBatches; $i++ ) {
        // First row of this batch
        $firstRowNum = $i * $batchSize;
        $res->seek( $firstRowNum );
        $firstRow = $res->fetchObject();
        $start = (int)$firstRow->page_id;

        // Last row of this batch; the final batch may be shorter than $batchSize
        $lastRowNum = min( $numRows - 1, ( $i + 1 ) * $batchSize - 1 );
        $res->seek( $lastRowNum );
        $lastRow = $res->fetchObject();
        $end = (int)$lastRow->page_id;

        // Safety check: the boundaries are only meaningful for an ordered result
        if ( $start && $end && $start > $end ) {
            throw new RuntimeException( __METHOD__ . ': Internal error: query result out of order' );
        }

        $batches[] = [ $start, $end ];
    }

    return $batches;
}
/**
 * Iterate over the pages that use this page as a template/file and are
 * cascade-protected, as PageIdentity objects.
 *
 * @return Iterator<PageIdentity>
 */
public function getCascadeProtectedLinkPages(): Iterator {
    $rows = $this->getCascadeProtectedLinksInternal();

    foreach ( $rows as $linkRow ) {
        $identity = PageIdentityValue::localIdentity(
            $linkRow->page_id,
            $linkRow->page_namespace,
            $linkRow->page_title
        );

        yield $identity;
    }
}
/**
 * Get an array of cascade-protected template/file use backlinks
 *
 * Queries templatelinks always, and imagelinks additionally when this page is
 * in the file namespace, then merges both result sets keyed by page ID.
 *
 * @return array List of row objects with page_namespace, page_title and page_id,
 *   one per distinct page
 */
private function getCascadeProtectedLinksInternal(): array {
    $dbr = $this->getDB();

    // @todo: use UNION without breaking tests that use temp tables
    $resSets = [];

    $linkConds = $this->linksMigration->getLinksConditions(
        'templatelinks', TitleValue::newFromPage( $this->page )
    );
    $resSets[] = $dbr->newSelectQueryBuilder()
        ->select( [ 'page_namespace', 'page_title', 'page_id' ] )
        ->from( 'templatelinks' )
        ->join( 'page_restrictions', null, 'tl_from = pr_page' )
        ->join( 'page', null, 'page_id = tl_from' )
        ->where( $linkConds )
        ->andWhere( [ 'pr_cascade' => 1 ] )
        // A page may match several page_restrictions rows; don't transfer duplicates.
        ->distinct()
        ->caller( __METHOD__ )->fetchResultSet();

    if ( $this->page->getNamespace() === NS_FILE ) {
        $resSets[] = $dbr->newSelectQueryBuilder()
            ->select( [ 'page_namespace', 'page_title', 'page_id' ] )
            ->from( 'imagelinks' )
            ->join( 'page_restrictions', null, 'il_from = pr_page' )
            ->join( 'page', null, 'page_id = il_from' )
            ->where( [
                'il_to' => $this->page->getDBkey(),
                'pr_cascade' => 1,
            ] )
            ->distinct()
            ->caller( __METHOD__ )->fetchResultSet();
    }

    // Combine and de-duplicate the results. Initialised up front so that
    // "no rows at all" yields an empty array instead of an undefined variable.
    $mergedRes = [];
    foreach ( $resSets as $res ) {
        foreach ( $res as $row ) {
            // Index by page_id to remove duplicates
            $mergedRes[$row->page_id] = $row;
        }
    }

    // Now that we've de-duplicated, throw away the keys
    return array_values( $mergedRes );
}
/** @deprecated class alias since 1.42 */
class_alias(
    BacklinkCache::class,
    'BacklinkCache'
);