Merge ".mailmap: Correct two contributor names"
[mediawiki.git] / maintenance / findBadBlobs.php
blob2aac2714aef4aab2bdc71b515ba1e4accf3bf947
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
18 * @file
19 * @ingroup Maintenance
22 use MediaWiki\Maintenance\Maintenance;
23 use MediaWiki\Revision\RevisionArchiveRecord;
24 use MediaWiki\Revision\RevisionRecord;
25 use MediaWiki\Revision\RevisionStore;
26 use MediaWiki\Revision\RevisionStoreRecord;
27 use MediaWiki\Revision\SlotRecord;
28 use MediaWiki\Storage\BlobStore;
30 // @codeCoverageIgnoreStart
31 require_once __DIR__ . '/Maintenance.php';
32 // @codeCoverageIgnoreEnd
/**
 * Maintenance script for finding and marking bad content blobs.
 *
 * A blob is considered "bad" when BlobStore::getBlob() throws for the address
 * stored on a revision slot. The script runs in one of two modes:
 *
 *  - --scan-from: page through the revision table by timestamp, then through the
 *    archive table for the matching range of revision IDs, and report bad blobs.
 *  - --revisions: check an explicit list of revision IDs (live or archived) and,
 *    if --mark is also given, rewrite the address of each bad blob to a "bad:"
 *    address that records the original address, the reason and the error.
 *
 * @ingroup Maintenance
 */
class FindBadBlobs extends Maintenance {

	/** Used to build revision/archive queries and to turn rows into revision records. */
	private RevisionStore $revisionStore;

	/** Used to try loading each slot's blob. */
	private BlobStore $blobStore;

	/**
	 * Declares the command line options and the default batch size.
	 */
	public function __construct() {
		parent::__construct();

		$this->setBatchSize( 1000 );
		$this->addDescription( 'Find and mark bad content blobs. Marked blobs will be read as empty. '
			. 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.' );
		$this->addOption( 'scan-from', 'Start scanning revisions at the given date. '
			. 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
			false, true );
		$this->addOption( 'revisions', 'A list of revision IDs to process, separated by comma or '
			. 'colon or whitespace. Revisions belonging to deleted pages will work. '
			. 'If set to "-" IDs are read from stdin, one per line.', false, true );
		$this->addOption( 'limit', 'Maximum number of revisions for --scan-from to scan. '
			. 'Default: 1000', false, true );
		$this->addOption( 'mark', 'Mark the blob as "known bad", to avoid errors when '
			. 'attempting to read it. The value given is the reason for marking the blob as bad, '
			. 'typically a ticket ID. Requires --revisions to also be set.', false, true );
	}

	/**
	 * Reads and validates the --scan-from option.
	 *
	 * Aborts with a fatal error if the value is shorter than 14 characters
	 * (i.e. not precise down to the second) or cannot be parsed by wfTimestamp().
	 *
	 * @return string The timestamp in TS_MW format
	 */
	private function getStartTimestamp() {
		$tsOpt = $this->getOption( 'scan-from' );
		if ( strlen( $tsOpt ) < 14 ) {
			$this->fatalError( 'Bad timestamp: ' . $tsOpt
				. ', please provide time and date down to the second.' );
		}

		$ts = wfTimestamp( TS_MW, $tsOpt );
		if ( !$ts ) {
			$this->fatalError( 'Bad timestamp: ' . $tsOpt );
		}

		return $ts;
	}

	/**
	 * Reads the list of revision IDs from the --revisions option, or from
	 * stdin if the option is set to "-".
	 *
	 * @return int[] Empty if "-" was given and stdin yielded nothing
	 */
	private function getRevisionIds() {
		$opt = $this->getOption( 'revisions' );

		if ( $opt === '-' ) {
			$opt = stream_get_contents( STDIN );

			// stream_get_contents() yields false on failure and '' on empty input.
			if ( !$opt ) {
				return [];
			}
		}

		return $this->parseIntList( $opt );
	}

	/**
	 * Entry point: validates the option combination, runs the requested scan
	 * and prints a summary.
	 *
	 * @inheritDoc
	 */
	public function execute() {
		$services = $this->getServiceContainer();
		$this->revisionStore = $services->getRevisionStore();
		$this->blobStore = $services->getBlobStore();

		if ( $this->hasOption( 'revisions' ) ) {
			if ( $this->hasOption( 'scan-from' ) ) {
				$this->fatalError( 'Cannot use --revisions together with --scan-from' );
			}

			$ids = $this->getRevisionIds();

			$count = $this->scanRevisionsById( $ids );
		} elseif ( $this->hasOption( 'scan-from' ) ) {
			if ( $this->hasOption( 'mark' ) ) {
				$this->fatalError( 'Cannot use --mark with --scan-from, '
					. 'use --revisions to specify revisions to mark.' );
			}

			$fromTimestamp = $this->getStartTimestamp();
			// Option values arrive as strings; the scan does arithmetic on the limit,
			// so normalise it to an integer here.
			$total = (int)$this->getOption( 'limit', 1000 );

			$count = $this->scanRevisionsByTimestamp( $fromTimestamp, $total );

			$this->output( "The range of archive rows scanned is based on the range of revision IDs "
				. "scanned in the revision table.\n" );
		} else {
			if ( $this->hasOption( 'mark' ) ) {
				$this->fatalError( 'The --mark must be used together with --revisions' );
			} else {
				$this->fatalError( 'Must specify one of --revisions or --scan-from' );
			}
		}

		if ( $this->hasOption( 'mark' ) ) {
			$this->output( "Marked $count bad revisions.\n" );
		} else {
			$this->output( "Found $count bad revisions.\n" );

			if ( $count > 0 ) {
				$this->output( "On a unix/linux environment, you can use grep and cut to list of IDs\n" );
				$this->output( "that can then be used with the --revisions option. E.g.\n" );
				$this->output( " grep '! Found bad blob' | cut -s -f 3\n" );
			}
		}
	}

	/**
	 * Scans up to $total rows of the revision table in timestamp order, starting
	 * at $fromTimestamp, then scans the archive table for the range of revision
	 * IDs that was covered.
	 *
	 * @param string $fromTimestamp Timestamp to start scanning at
	 * @param int $total Maximum number of rows to scan in each of the two tables
	 *
	 * @return int The number of bad blobs found
	 */
	private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
		$count = 0;
		// Lowest and highest revision ID seen; they bound the archive scan below.
		$lastRevId = 0;
		$firstRevId = 0;
		// Paging cursor: timestamp and ID of the last row of the previous batch.
		// The ID must belong to that very row (not the highest ID seen so far),
		// otherwise rows sharing its timestamp but having a lower ID are skipped.
		$lastTimestamp = $fromTimestamp;
		$cursorRevId = 0;
		$revisionRowsScanned = 0;
		$archiveRowsScanned = 0;

		$this->output( "Scanning revisions table, "
			. "$total rows starting at rev_timestamp $fromTimestamp\n" );

		while ( $revisionRowsScanned < $total ) {
			$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
			$revisions = $this->loadRevisionsByTimestamp( $cursorRevId, $lastTimestamp, $batchSize );
			if ( !$revisions ) {
				break;
			}

			foreach ( $revisions as $rev ) {
				// we are sorting by timestamp, so we may encounter revision IDs out of sequence
				$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
				$lastRevId = max( $lastRevId, $rev->getId() );

				$count += $this->checkRevision( $rev );
			}

			// $rev is now the last row of the batch, which is where the next batch resumes.
			$lastTimestamp = $rev->getTimestamp();
			$cursorRevId = (int)$rev->getId();
			$batchSize = count( $revisions );
			$revisionRowsScanned += $batchSize;
			$this->output(
				"\t- Scanned a batch of $batchSize revisions, "
				. "up to revision $lastRevId ($lastTimestamp)\n"
			);

			$this->waitForReplication();
		}

		// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
		// revision ID just before the first revision ID we found above as the starting point
		// of the scan, and scan up to one revision after the last revision ID we found above.
		// If $firstRevId is 0, the loop body above didn't execute,
		// so we should skip the one below as well.
		$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
		$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
		$maxArchived = $maxArchived ?: PHP_INT_MAX;

		$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
		while (
			$firstRevId > 0
			&& $fromArchived < $maxArchived
			// Stop once the limit is reached instead of issuing a query with LIMIT 0.
			&& $archiveRowsScanned < $total
		) {
			$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
			$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
			if ( !$revisions ) {
				break;
			}

			/** @var RevisionRecord $rev */
			foreach ( $revisions as $rev ) {
				$count += $this->checkRevision( $rev );
			}

			// $rev is the last archived row of the batch; resume after its ID.
			$fromArchived = $rev->getId();
			$archivedTimestamp = $rev->getTimestamp();
			$batchSize = count( $revisions );
			$archiveRowsScanned += $batchSize;
			$this->output(
				"\t- Scanned a batch of $batchSize archived revisions, "
				. "up to revision $fromArchived ($archivedTimestamp)\n"
			);

			$this->waitForReplication();
		}

		return $count;
	}

	/**
	 * Loads one batch of revisions from the revision table, ordered by
	 * (rev_timestamp, rev_id), starting strictly after ($fromTimestamp, $afterId).
	 *
	 * @param int $afterId Revision ID part of the paging cursor
	 * @param string $fromTimestamp Timestamp part of the paging cursor
	 * @param int $batchSize Maximum number of rows to load
	 *
	 * @return RevisionStoreRecord[] Entries that could not be constructed are dropped
	 */
	private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
		$db = $this->getReplicaDB();
		$queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db );
		$rows = $queryBuilder->joinComment()
			->where( $db->buildComparison( '>', [
				'rev_timestamp' => $fromTimestamp,
				'rev_id' => $afterId,
			] ) )
			->useIndex( [ 'revision' => 'rev_timestamp' ] )
			->orderBy( [ 'rev_timestamp', 'rev_id' ] )
			->limit( $batchSize )
			->caller( __METHOD__ )->fetchResultSet();
		$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );
		$this->handleStatus( $result );

		// Drop empty entries so callers only see usable records.
		$records = array_filter( $result->value );

		'@phan-var RevisionStoreRecord[] $records';
		return $records;
	}

	/**
	 * Loads one batch of archived revisions with $afterId < ar_rev_id <= $uptoId,
	 * ordered by ar_rev_id.
	 *
	 * @param int $afterId Exclusive lower bound
	 * @param int $uptoId Inclusive upper bound
	 * @param int $batchSize Maximum number of rows to load
	 *
	 * @return RevisionArchiveRecord[] Entries that could not be constructed are dropped
	 */
	private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
		$db = $this->getReplicaDB();
		$rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
			->joinComment()
			->where( [ $db->expr( 'ar_rev_id', '>', $afterId ), $db->expr( 'ar_rev_id', '<=', $uptoId ) ] )
			->orderBy( 'ar_rev_id' )
			->limit( $batchSize )
			->caller( __METHOD__ )->fetchResultSet();
		$result = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'archive' => true, 'slots' => true ]
		);
		$this->handleStatus( $result );

		// Drop empty entries so callers only see usable records.
		$records = array_filter( $result->value );

		'@phan-var RevisionArchiveRecord[] $records';
		return $records;
	}

	/**
	 * Returns the revision ID next to $revId, according to $comp and $dir
	 *
	 * @param int $revId
	 * @param string $comp the comparator, either '<' or '>', to go with $dir
	 * @param string $dir the sort direction to go with $comp, either 'ASC' or 'DESC'
	 *
	 * @return int The neighbouring revision ID, or 0 if there is none
	 */
	private function getNextRevision( int $revId, string $comp, string $dir ) {
		$db = $this->getReplicaDB();
		$next = $db->newSelectQueryBuilder()
			->select( 'rev_id' )
			->from( 'revision' )
			// Build the condition through the expression API, like the archive query
			// does, rather than interpolating raw SQL.
			->where( $db->expr( 'rev_id', $comp, $revId ) )
			->orderBy( [ "rev_id" ], $dir )
			->caller( __METHOD__ )
			->fetchField();
		// fetchField() yields false when no row matches; that becomes 0 here.
		return (int)$next;
	}

	/**
	 * Checks the given revisions, in chunks of the configured batch size.
	 *
	 * @param array $ids Revision IDs, from the revision or the archive table
	 *
	 * @return int The number of bad blobs found
	 */
	private function scanRevisionsById( array $ids ) {
		$count = 0;
		$total = count( $ids );

		$this->output( "Scanning $total ids\n" );

		foreach ( array_chunk( $ids, $this->getBatchSize() ) as $batch ) {
			$revisions = $this->loadRevisionsById( $batch );

			if ( !$revisions ) {
				continue;
			}

			/** @var RevisionRecord $rev */
			foreach ( $revisions as $rev ) {
				$count += $this->checkRevision( $rev );
			}

			$batchSize = count( $revisions );
			$this->output( "\t- Scanned a batch of $batchSize revisions\n" );
		}

		return $count;
	}

	/**
	 * Loads the revisions with the given IDs from the revision table, falling
	 * back to the archive table for any ID not found there.
	 *
	 * @param int[] $ids
	 *
	 * @return RevisionRecord[] Keyed by revision ID
	 */
	private function loadRevisionsById( array $ids ) {
		if ( !$ids ) {
			// An empty ID list cannot be turned into a valid IN() condition.
			return [];
		}

		$db = $this->getReplicaDB();
		$queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db );

		$rows = $queryBuilder
			->joinComment()
			->where( [ 'rev_id' => $ids ] )
			->caller( __METHOD__ )->fetchResultSet();

		$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

		$this->handleStatus( $result );

		$revisions = array_filter( $result->value );
		'@phan-var RevisionRecord[] $revisions';

		// if not all revisions were found, check the archive table.
		// Decide this from the actual set of missing IDs rather than by comparing
		// counts: duplicate IDs in the input would make the counts differ even when
		// nothing is missing, producing an empty (and thus invalid) IN() list.
		$missingIds = array_diff( $ids, array_keys( $revisions ) );
		if ( $missingIds ) {
			$rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
				->joinComment()
				->where( [ 'ar_rev_id' => $missingIds ] )
				->caller( __METHOD__ )->fetchResultSet();

			$archiveResult = $this->revisionStore->newRevisionsFromBatch(
				$rows,
				[ 'slots' => true, 'archive' => true ]
			);

			$this->handleStatus( $archiveResult );

			// don't use array_merge, since it will re-index
			$revisions += array_filter( $archiveResult->value );
		}

		return $revisions;
	}

	/**
	 * Checks every slot of the given revision.
	 *
	 * @param RevisionRecord $rev
	 *
	 * @return int The number of slots of this revision that have a bad blob
	 */
	private function checkRevision( RevisionRecord $rev ) {
		$count = 0;
		foreach ( $rev->getSlots()->getSlots() as $slot ) {
			$count += $this->checkSlot( $rev, $slot );
		}

		// In --mark mode, tell the operator when a listed revision turned out to be fine.
		if ( $count === 0 && $this->hasOption( 'mark' ) ) {
			$this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
		}

		return $count;
	}

	/**
	 * Tries to load the blob of the given slot. If loading throws, the problem
	 * is reported and, in --mark mode, the blob is marked as bad.
	 *
	 * @param RevisionRecord $rev The revision the slot belongs to (used for reporting)
	 * @param SlotRecord $slot
	 *
	 * @return int 1 if the blob is bad, 0 if it could be loaded
	 */
	private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
		$address = $slot->getAddress();

		try {
			$this->blobStore->getBlob( $address );
			// nothing to do
			return 0;
		} catch ( Exception $ex ) {
			$error = $ex->getMessage();
			$type = get_class( $ex );
		}

		// NOTE: output the revision ID again at the end in a separate column for easy processing
		// via the "cut" shell command.
		$this->output( "\t! Found bad blob on revision {$rev->getId()} "
			. "from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
			. "content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, "
			. "error='$error', type='$type'. ID:\t{$rev->getId()}\n" );

		if ( $this->hasOption( 'mark' ) ) {
			$newAddress = $this->markBlob( $slot, $error );
			$this->output( "\tChanged address to <$newAddress>\n" );
		}

		return 1;
	}

	/**
	 * Replaces the slot's content address in the content table with a "bad:"
	 * address of the form bad:<urlencoded old address>?reason=...&error=...,
	 * truncated to 255 bytes.
	 *
	 * @param SlotRecord $slot
	 * @param string|null $error The error message seen when loading the blob, if any
	 *
	 * @return string The new address written to the database
	 */
	private function markBlob( SlotRecord $slot, ?string $error = null ) {
		$args = [];

		if ( $this->hasOption( 'mark' ) ) {
			$args['reason'] = $this->getOption( 'mark' );
		}

		if ( $error ) {
			$args['error'] = $error;
		}

		$address = $slot->getAddress() ?: 'empty';
		$badAddress = 'bad:' . urlencode( $address );

		if ( $args ) {
			$badAddress .= '?' . wfArrayToCgi( $args );
		}

		// NOTE(review): 255 presumably matches the size of the content_address
		// column; confirm against the schema.
		$badAddress = substr( $badAddress, 0, 255 );

		$dbw = $this->getPrimaryDB();
		$dbw->newUpdateQueryBuilder()
			->update( 'content' )
			->set( [ 'content_address' => $badAddress ] )
			->where( [ 'content_id' => $slot->getContentId() ] )
			->caller( __METHOD__ )->execute();

		return $badAddress;
	}

	/**
	 * Reports problems recorded in a status object: aborts with a fatal error
	 * if the status is not OK, and prints an error if it is OK but not good.
	 *
	 * @param StatusValue $status
	 */
	private function handleStatus( StatusValue $status ) {
		if ( !$status->isOK() ) {
			$this->fatalError( $status );
		}
		if ( !$status->isGood() ) {
			$this->error( $status );
		}
	}

}
471 // @codeCoverageIgnoreStart
472 $maintClass = FindBadBlobs::class;
473 require_once RUN_MAINTENANCE_IF_MAIN;
474 // @codeCoverageIgnoreEnd