Merge "doc: SpanInterface: more dev-friendly comments"
[mediawiki.git] / maintenance / refreshLinks.php
blob8697848009b9de87d5263fca5b8a980a6ecc2ae4
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
18 * @file
21 use MediaWiki\Deferred\DeferredUpdates;
22 use MediaWiki\Linker\LinkTarget;
23 use MediaWiki\Maintenance\Maintenance;
24 use MediaWiki\MediaWikiServices;
25 use MediaWiki\Revision\RevisionRecord;
26 use MediaWiki\Title\Title;
27 use Wikimedia\Rdbms\IExpression;
28 use Wikimedia\Rdbms\IReadableDatabase;
29 use Wikimedia\Rdbms\SelectQueryBuilder;
31 // @codeCoverageIgnoreStart
32 require_once __DIR__ . '/Maintenance.php';
33 // @codeCoverageIgnoreEnd
35 /**
36 * Refresh link tables.
38 * @ingroup Maintenance
40 class RefreshLinks extends Maintenance {
41 private const REPORTING_INTERVAL = 100;
/**
 * Register the script description, its command-line options and its
 * single positional argument, and set the default batch size to 100.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Refresh link tables' );
	$this->addOption( 'verbose', 'Output information about link refresh progress', false, false, 'v' );

	// Options registered with a name and description only.
	$switches = [
		'dfn-only' => 'Delete links from nonexistent articles only',
		'new-only' => 'Only affect articles with just a single edit',
		'redirects-only' => 'Only fix redirects, not all links',
		'touched-only' => 'Only fix pages that have been touched after last update',
	];
	foreach ( $switches as $name => $description ) {
		$this->addOption( $name, $description );
	}

	// Options registered as not required (third argument) and with the
	// fourth argument set, i.e. they carry a value on the command line.
	$valueOptions = [
		'e' => 'Last page id to refresh',
		'dfn-chunk-size' => 'Maximum number of existent IDs to check per query, default 100,000',
		'namespace' => 'Only fix pages in this namespace',
		'category' => 'Only fix pages in this category',
		'tracking-category' => 'Only fix pages in this tracking category',
		'before-timestamp' => 'Only fix pages that were last updated before this timestamp',
	];
	foreach ( $valueOptions as $name => $description ) {
		$this->addOption( $name, $description, false, true );
	}

	$this->addArg( 'start', 'Page_id to start from, default 1', false );
	$this->setBatchSize( 100 );
}
/**
 * Script entry point.
 *
 * With --dfn-only, only rows in the link tables that belong to nonexistent
 * pages are deleted. Otherwise a page query is built from the start/end
 * range and the namespace / before-timestamp filters, and then either one
 * category, one tracking category, or the general page set (optionally
 * narrowed by --new-only or --touched-only) is refreshed.
 */
public function execute() {
	// Note that there is a difference between not specifying the start
	// and end IDs and using the minimum and maximum values from the page
	// table. In the latter case, deleteLinksFromNonexistent() will not
	// delete entries for nonexistent IDs that fall outside the range.
	$start = (int)$this->getArg( 0 ) ?: null;
	$end = (int)$this->getOption( 'e' ) ?: null;
	$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100_000 );

	if ( $this->hasOption( 'dfn-only' ) ) {
		$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
		return;
	}

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	$builder = $dbr->newSelectQueryBuilder()
		->from( 'page' )
		->where( self::intervalCond( $dbr, 'page_id', $start, $end ) )
		->limit( $this->getBatchSize() );

	if ( $this->hasOption( 'namespace' ) ) {
		$builder->andWhere( [ 'page_namespace' => (int)$this->getOption( 'namespace' ) ] );
	}

	if ( $this->hasOption( 'before-timestamp' ) ) {
		$builder->andWhere(
			$dbr->expr( 'page_links_updated', '<', $this->getOption( 'before-timestamp' ) )
				->or( 'page_links_updated', '=', null )
		);
	}

	if ( $this->hasOption( 'category' ) ) {
		$category = $this->getOption( 'category' );
		$title = Title::makeTitleSafe( NS_CATEGORY, $category );
		if ( !$title ) {
			$this->fatalError( "'$category' is an invalid category name!\n" );
		}
		$this->refreshCategory( $builder, $title );
	} elseif ( $this->hasOption( 'tracking-category' ) ) {
		// See TrackingCategories::CORE_TRACKING_CATEGORIES for tracking category keys defined by core
		$this->refreshTrackingCategory( $builder, $this->getOption( 'tracking-category' ) );
	} else {
		$new = $this->hasOption( 'new-only' );
		$redir = $this->hasOption( 'redirects-only' );
		$touched = $this->hasOption( 'touched-only' );
		$what = $redir ? 'redirects' : 'links';
		if ( $new ) {
			$builder->andWhere( [ 'page_is_new' => 1 ] );
			$this->output( "Refreshing $what from new pages...\n" );
		} else {
			if ( $touched ) {
				// The right-hand side must be the page_links_updated *column*.
				// A plain string passed to expr() is quoted as a literal, which
				// would compare page_touched against the text
				// 'page_links_updated' instead of the column's value.
				$linksUpdatedColumn = new \Wikimedia\Rdbms\RawSQLValue( 'page_links_updated' );
				$builder->andWhere( [
					$dbr->expr( 'page_touched', '>', $linksUpdatedColumn )
						->or( 'page_links_updated', '=', null ),
				] );
			}
			$this->output( "Refreshing $what from pages...\n" );
		}
		$this->doRefreshLinks( $builder, $redir );
		if ( !$this->hasOption( 'namespace' ) ) {
			$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
		}
	}
}
/**
 * Do the actual link refreshing.
 *
 * Rows are fetched in batches ordered by $indexFields. After each batch the
 * values of those fields in the last row become the lower bound of the next
 * batch. For every row the redirect entry is fixed and, unless
 * $redirectsOnly is set, the page's links are refreshed as well.
 *
 * @param SelectQueryBuilder $builder Query selecting the pages to process;
 *  it is cloned for each batch and never modified here
 * @param bool $redirectsOnly Only fix redirects
 * @param array $indexFields Fields used both for ordering and for the
 *  "greater than last seen" batch condition
 */
private function doRefreshLinks(
	SelectQueryBuilder $builder,
	bool $redirectsOnly = false,
	array $indexFields = [ 'page_id' ]
) {
	// Give extensions a chance to optimize settings
	$this->getHookRunner()->onMaintenanceRefreshLinksInit( $this );

	$estimateCount = $builder->caller( __METHOD__ )->estimateRowCount();
	$this->output( "Estimated page count: $estimateCount\n" );

	$i = 0;
	$lastIndexes = array_fill_keys( $indexFields, 0 );
	// page_id is always needed for the per-row work below.
	$selectFields = in_array( 'page_id', $indexFields, true )
		? $indexFields : [ 'page_id', ...$indexFields ];
	$verbose = $this->hasOption( 'verbose' );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	do {
		$batchCond = $dbr->buildComparison( '>', $lastIndexes );
		$res = ( clone $builder )->select( $selectFields )
			->andWhere( [ $batchCond ] )
			->orderBy( $indexFields )
			->caller( __METHOD__ )->fetchResultSet();
		$numRows = $res->numRows();

		if ( $verbose ) {
			$this->output( "Refreshing links for $numRows pages\n" );
		}

		foreach ( $res as $row ) {
			if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
				$this->output( "$i\n" );
				$this->waitForReplication();
			}
			if ( $verbose ) {
				$this->output( "Refreshing links for page ID {$row->page_id}\n" );
			}
			self::fixRedirect( $this, $row->page_id );
			if ( !$redirectsOnly ) {
				self::fixLinksFromArticle( $row->page_id );
			}
		}
		if ( $numRows ) {
			// Remember the index values of the last row as the next lower bound.
			$res->seek( $numRows - 1 );
			foreach ( $indexFields as $field ) {
				$lastIndexes[$field] = $res->current()->$field;
			}
		}

		// An empty batch cannot advance $lastIndexes, so it must always end
		// the loop. Without the explicit check a batch size of 0 would
		// compare 0 == 0 and repeat the same query forever.
	} while ( $numRows > 0 && $numRows == $this->getBatchSize() );
}
/**
 * Update the redirect entry for a given page.
 *
 * This method bypasses the "redirect" table to get the redirect target,
 * and parses the page's content to fetch it. This makes sure that the
 * redirect target is up to date and valid.
 * This is particularly useful when modifying namespaces, to be sure the
 * entry in the "redirect" table points to the correct page and not to an
 * invalid one.
 *
 * @internal
 * @param Maintenance $maint
 * @param int $id The page ID to check
 */
public static function fixRedirect( Maintenance $maint, $id ) {
	$wikiPage = $maint->getServiceContainer()->getWikiPageFactory()->newFromID( $id );
	if ( $wikiPage === null ) {
		// In case the page just got deleted.
		return;
	}

	$rawContent = $wikiPage->getContent( RevisionRecord::RAW );
	$target = $rawContent !== null ? $rawContent->getRedirectTarget() : null;
	$isRedirect = $target !== null;

	$dbw = $maint->getDB( DB_PRIMARY );
	if ( $isRedirect ) {
		$wikiPage->insertRedirectEntry( $target );
	} else {
		// The page is not a redirect: drop any redirect table entry for it.
		$dbw->newDeleteQueryBuilder()
			->deleteFrom( 'redirect' )
			->where( [ 'rd_from' => $id ] )
			->caller( __METHOD__ )
			->execute();
	}

	// Update the page table to be sure it is in a consistent state
	$dbw->newUpdateQueryBuilder()
		->update( 'page' )
		->set( [ 'page_is_redirect' => $isRedirect ? 1 : 0 ] )
		->where( [ 'page_id' => $id ] )
		->caller( __METHOD__ )
		->execute();
}
/**
 * Run LinksUpdate for all links on a given page_id
 *
 * Does nothing when no page with that ID can be loaded.
 *
 * @param int $id The page_id
 */
public static function fixLinksFromArticle( $id ) {
	$wikiPage = MediaWikiServices::getInstance()
		->getWikiPageFactory()
		->newFromID( $id );

	// A null page means it just got deleted; nothing to refresh then.
	if ( $wikiPage !== null ) {
		// Defer updates to post-send but then immediately execute deferred updates;
		// this is the simplest way to run all updates immediately (including updates
		// scheduled by other updates).
		$updateOptions = [
			'defer' => DeferredUpdates::POSTSEND,
			'causeAction' => 'refresh-links-maintenance',
			'recursive' => false,
		];
		$wikiPage->doSecondaryDataUpdates( $updateOptions );
		DeferredUpdates::doUpdates();
	}
}
/**
 * Removes rows that refer to nonexistent pages from the link tables
 * handled by dfnCheckInterval().
 *
 * The page_id range is walked in chunks holding at most $chunkSize existent
 * page IDs each, and every chunk is passed to dfnCheckInterval().
 *
 * @param int|null $start Page_id to start from
 * @param int|null $end Page_id to stop at
 * @param int $batchSize The size of deletion batches
 * @param int $chunkSize Maximum number of existent IDs to check per query;
 *  must be at least 1
 *
 * @author Merlijn van Deen <valhallasw@arctus.nl>
 */
private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
	$chunkSize = 100_000
) {
	// With a chunk size of 0 the OFFSET query below returns the first page_id
	// of the current interval again, so $start would never advance and the
	// loop would not terminate. Reject such values up front.
	if ( $chunkSize < 1 ) {
		$this->fatalError( "The chunk size must be a positive integer, got '$chunkSize'\n" );
	}

	$this->waitForReplication();
	$this->output( "Deleting illegal entries from the links tables...\n" );
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
	do {
		// Find the start of the next chunk. This is based only
		// on existent page_ids.
		$nextStart = $dbr->newSelectQueryBuilder()
			->select( 'page_id' )
			->from( 'page' )
			->where( [ self::intervalCond( $dbr, 'page_id', $start, $end ) ] )
			->orderBy( 'page_id' )
			->offset( $chunkSize )
			->caller( __METHOD__ )->fetchField();

		if ( $nextStart !== false ) {
			// To find the end of the current chunk, subtract one.
			// This will serve to limit the number of rows scanned in
			// dfnCheckInterval(), per query, to at most the sum of
			// the chunk size and deletion batch size.
			$chunkEnd = $nextStart - 1;
		} else {
			// This is the last chunk. Check all page_ids up to $end.
			$chunkEnd = $end;
		}

		$fmtStart = $start !== null ? "[$start" : '(-INF';
		$fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
		$this->output( "  Checking interval $fmtStart, $fmtChunkEnd\n" );
		$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

		$start = $nextStart;

	} while ( $nextStart !== false );
}
/**
 * Delete, for one page_id interval, the rows of each link table whose
 * "from" page no longer exists in the page table.
 *
 * Orphaned IDs are looked up on a replica through a LEFT JOIN against
 * page and deleted on the primary, at most $batchSize IDs at a time.
 *
 * @see RefreshLinks::deleteLinksFromNonexistent()
 * @param int|null $start Page_id to start from
 * @param int|null $end Page_id to stop at
 * @param int $batchSize The size of deletion batches
 */
private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
	$dbw = $this->getPrimaryDB();
	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	$linksTables = [
		// table name => page_id field
		'pagelinks' => 'pl_from',
		'imagelinks' => 'il_from',
		'categorylinks' => 'cl_from',
		'templatelinks' => 'tl_from',
		'externallinks' => 'el_from',
		'iwlinks' => 'iwl_from',
		'langlinks' => 'll_from',
		'redirect' => 'rd_from',
		'page_props' => 'pp_page',
	];

	foreach ( $linksTables as $table => $field ) {
		$this->output( "    $table: 0" );
		$tableStart = $start;
		$counter = 0;
		do {
			$ids = $dbr->newSelectQueryBuilder()
				->select( $field )
				->distinct()
				->from( $table )
				->leftJoin( 'page', null, "$field = page_id" )
				->where( self::intervalCond( $dbr, $field, $tableStart, $end ) )
				->andWhere( [ 'page_id' => null ] )
				->orderBy( $field )
				->limit( $batchSize )
				->caller( __METHOD__ )->fetchFieldValues();

			$numIds = count( $ids );
			if ( $numIds ) {
				$counter += $numIds;
				$dbw->newDeleteQueryBuilder()
					->deleteFrom( $table )
					->where( [ $field => $ids ] )
					->caller( __METHOD__ )->execute();
				$this->output( ", $counter" );
				// Continue right after the highest ID deleted in this batch.
				$tableStart = $ids[$numIds - 1] + 1;
				$this->waitForReplication();
			}

			// An empty batch never moves $tableStart, so it must end the loop.
			// The explicit $numIds > 0 check keeps a batch size of 0 from
			// satisfying $numIds >= $batchSize forever.
		} while ( $numIds > 0 && $numIds >= $batchSize && ( $end === null || $tableStart <= $end ) );

		$this->output( " deleted.\n" );
	}
}
/**
 * Build a SQL expression for a closed interval.
 *
 * A null $start or $end leaves that side unbounded. When both are null
 * the expression only requires the field to be non-null.
 *
 * @param IReadableDatabase $db
 * @param string $var Field name
 * @param mixed $start First value to include or null
 * @param mixed $end Last value to include or null
 * @return IExpression
 */
private static function intervalCond( IReadableDatabase $db, $var, $start, $end ) {
	$hasLowerBound = $start !== null;
	$hasUpperBound = $end !== null;

	if ( $hasLowerBound && $hasUpperBound ) {
		return $db->expr( $var, '>=', $start )->and( $var, '<=', $end );
	}
	if ( $hasLowerBound ) {
		return $db->expr( $var, '>=', $start );
	}
	if ( $hasUpperBound ) {
		return $db->expr( $var, '<=', $end );
	}

	// No bounds at all.
	return $db->expr( $var, '!=', null );
}
/**
 * Refreshes links for pages in a tracking category
 *
 * Each category returned for the key is refreshed with its own copy of
 * the query builder.
 *
 * @param SelectQueryBuilder $builder
 * @param string $category Category key
 */
private function refreshTrackingCategory( SelectQueryBuilder $builder, $category ) {
	$targets = $this->getPossibleCategories( $category );

	if ( !$targets ) {
		// Output to stderr but don't bail out.
		$this->error( "Tracking category '$category' is disabled\n" );
	}

	foreach ( $targets as $target ) {
		$perCategoryBuilder = clone $builder;
		$this->refreshCategory( $perCategoryBuilder, $target );
	}
}
/**
 * Refreshes links for the pages that are members of a category
 *
 * The passed builder is modified: it is joined against categorylinks and
 * restricted to the given category before being handed to doRefreshLinks().
 *
 * @param SelectQueryBuilder $builder
 * @param LinkTarget $category
 */
private function refreshCategory( SelectQueryBuilder $builder, LinkTarget $category ) {
	$categoryText = $category->getText();
	$this->output( "Refreshing pages in category '$categoryText'...\n" );

	$builder
		->join( 'categorylinks', null, 'page_id=cl_from' )
		->andWhere( [ 'cl_to' => $category->getDBkey() ] );

	// Batches are ordered and paginated on these categorylinks fields.
	$batchIndex = [ 'cl_timestamp', 'cl_from' ];
	$this->doRefreshLinks( $builder, false, $batchIndex );
}
/**
 * Returns a list of possible categories for a given tracking category key
 *
 * Aborts the script via fatalError() when the key is not known.
 *
 * @param string $categoryKey
 * @return LinkTarget[]
 */
private function getPossibleCategories( $categoryKey ) {
	$known = $this->getServiceContainer()->getTrackingCategories()->getTrackingCategories();

	if ( !isset( $known[$categoryKey] ) ) {
		$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
	}

	return $known[$categoryKey]['cats'];
}
// @codeCoverageIgnoreStart
// Name this script's class in $maintClass, then include the file that
// RUN_MAINTENANCE_IF_MAIN points at (presumably the runner that executes
// the class when this file is invoked directly; defined outside this file).
$maintClass = RefreshLinks::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd