Localisation updates from https://translatewiki.net.
[mediawiki.git] / maintenance / storage / trackBlobs.php
blob8fe041b36d24f71bf70a8b895a7a0f66347dfecd
1 <?php
2 /**
3 * Adds blobs from a given external storage cluster to the blob_tracking table.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
20 * @file
21 * @ingroup Maintenance
24 use MediaWiki\Revision\SlotRecord;
25 use Wikimedia\Rdbms\DBConnectionError;
26 use Wikimedia\Rdbms\IExpression;
27 use Wikimedia\Rdbms\LikeValue;
29 // @codeCoverageIgnoreStart
30 require_once __DIR__ . '/../Maintenance.php';
31 // @codeCoverageIgnoreEnd
33 class TrackBlobs extends Maintenance {
34 /** @var string[] */
35 public $clusters;
36 /** @var IExpression|null */
37 public $textClause;
38 /** @var bool */
39 public $doBlobOrphans;
40 /** @var array */
41 public $trackedBlobs = [];
43 /** @var int */
44 public $batchSize = 1000;
45 /** @var int */
46 public $reportingInterval = 10;
/**
 * Declare the script's command-line interface: the "cluster" argument
 * (read back in execute() as the list of clusters to scan) and the
 * description text for the script.
 */
public function __construct() {
	parent::__construct();

	$description = 'Adds blobs from a given ES cluster to the blob_tracking table. ' .
		'Automatically deletes the tracking table and starts from the start again when restarted.';
	$this->addDescription( $description );

	$this->addArg( 'cluster', 'cluster(s) to scan', true, true );
}
/**
 * Script entry point.
 *
 * Reads the cluster names from the command-line arguments, prepares one
 * GMP bitfield per cluster when the gmp extension is available, and then
 * runs the integrity check followed by each tracking pass in order. The
 * orphan-blob pass only runs when the bitfields were set up.
 */
public function execute() {
	$this->clusters = $this->parameters->getArgs();

	if ( !extension_loaded( 'gmp' ) ) {
		echo "Warning: the gmp extension is needed to find orphan blobs\n";
	} else {
		$this->doBlobOrphans = true;
		// Each cluster gets its own bitfield object; gmp_setbit() mutates in
		// place, so the instances must not be shared between clusters.
		foreach ( $this->clusters as $clusterName ) {
			$this->trackedBlobs[$clusterName] = gmp_init( 0 );
		}
	}

	$this->checkIntegrity();
	$this->initTrackingTable();
	$this->trackRevisions();
	$this->trackOrphanText();

	if ( $this->doBlobOrphans ) {
		$this->findOrphanBlobs();
	}

	$this->output( "All done.\n" );
}
/**
 * Abort the script if the text table still holds serialized
 * HistoryBlobStub objects that are not flagged as external (T22757).
 *
 * Prints a diagnostic and terminates the process with exit status 1 when
 * such a row is found; otherwise prints a confirmation and returns.
 */
private function checkIntegrity() {
	echo "Doing integrity check...\n";
	$replica = $this->getReplicaDB();

	// Scan for HistoryBlobStub objects in the text table (T22757)
	$stubCondition =
		'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
		'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'';

	$foundStub = (bool)$replica->newSelectQueryBuilder()
		->select( '1' )
		->from( 'text' )
		->where( $stubCondition )
		->caller( __METHOD__ )
		->fetchField();

	if ( !$foundStub ) {
		echo "Integrity check OK\n";

		return;
	}

	echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
		"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
		"to fix this.\n";
	exit( 1 );
}
/**
 * Drop any tracking tables left over from a previous run and recreate
 * them from blob_tracking.sql.
 *
 * Each table is checked and dropped on its own. Previously blob_orphans
 * was dropped only (and always) when blob_tracking existed, so a
 * half-initialised state from an interrupted run made either the DROP or
 * the subsequent schema load fail.
 */
private function initTrackingTable() {
	$dbw = $this->getDB( DB_PRIMARY );

	// NOTE(review): blob_tracking.sql is presumed to create both tables,
	// since nothing else in this script creates blob_orphans -- confirm.
	foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
		if ( $dbw->tableExists( $table, __METHOD__ ) ) {
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
		}
	}

	$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
}
/**
 * Build (once) and return the condition selecting text rows whose
 * old_text starts with "DB://<cluster>/" for any of the requested
 * clusters. The result is cached in $this->textClause.
 *
 * @return IExpression
 */
private function getTextClause() {
	if ( $this->textClause ) {
		return $this->textClause;
	}

	$dbr = $this->getReplicaDB();
	$perCluster = array_map(
		static fn ( $cluster ) => $dbr->expr(
			'old_text',
			IExpression::LIKE,
			new LikeValue( "DB://$cluster/", $dbr->anyString() )
		),
		$this->clusters
	);
	$this->textClause = $dbr->orExpr( $perCluster );

	return $this->textClause;
}
/**
 * Split an external storage pointer of the form
 * "DB://<cluster>/<id>" or "DB://<cluster>/<id>/<hex hash>" into its parts.
 *
 * @param string $text Pointer as stored in old_text
 * @return array|false Array with keys 'cluster' (string), 'id' (int) and
 *  'hash' (string, or null when the pointer has no hash part); false when
 *  $text does not match the expected format
 */
private function interpretPointer( $text ) {
	$parts = [];
	$matched = preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $parts );
	if ( !$matched ) {
		return false;
	}

	$cluster = $parts[1];
	$blobId = intval( $parts[2] );
	// The hash group is absent from $parts when the pointer has no third segment.
	$hash = $parts[3] ?? null;

	return [
		'cluster' => $cluster,
		'id' => $blobId,
		'hash' => $hash
	];
}
/**
 * Scan the revision table for rows stored in the specified clusters
 *
 * Walks main-slot revisions in rev_id order, batchSize rows at a time,
 * joining through slots and content to the text table. Every row whose
 * old_text is a valid pointer into one of the requested clusters is
 * recorded in blob_tracking and, when orphan detection is enabled, marked
 * in that cluster's tracked-blob bitfield.
 */
private function trackRevisions() {
	$dbw = $this->getPrimaryDB();
	$dbr = $this->getReplicaDB();

	$textClause = $this->getTextClause();
	$startId = 0;
	// Only used for progress output.
	$endId = (int)$dbr->newSelectQueryBuilder()
		->select( 'MAX(rev_id)' )
		->from( 'revision' )
		->caller( __METHOD__ )->fetchField();
	$batchesDone = 0;
	$rowsInserted = 0;

	echo "Finding revisions...\n";

	$conds = [
		$textClause,
		$dbr->expr(
			'old_flags',
			IExpression::LIKE,
			new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
		)
	];
	$slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();

	// Restrict to main-slot content whose address has the "tt:" prefix;
	// the join below reads the remainder of the address as old_id.
	$conds = array_merge( [
		'slot_role_id' => $slotRoleStore->getId( SlotRecord::MAIN ),
		'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
	], $conds );

	while ( true ) {
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ] )
			->from( 'revision' )
			->join( 'slots', null, 'rev_id=slot_revision_id' )
			->join( 'content', null, 'content_id=slot_content_id' )
			->join( 'text', null, 'SUBSTRING(content_address, 4)=old_id' )
			->where( $dbr->expr( 'rev_id', '>', $startId ) )
			->andWhere( $conds )
			->orderBy( 'rev_id' )
			->limit( $this->batchSize )
			->caller( __METHOD__ )->fetchResultSet();
		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor before validating so that skipped rows
			// are not fetched again by the next batch.
			$startId = (int)$row->rev_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
				continue;
			}
			$insertBatch[] = [
				'bt_page' => $row->rev_page,
				'bt_rev_id' => $row->rev_id,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}

		// A batch may consist entirely of skipped rows; do not hand an
		// empty row set to the insert builder in that case.
		if ( $insertBatch ) {
			$dbw->newInsertQueryBuilder()
				->insertInto( 'blob_tracking' )
				->rows( $insertBatch )
				->caller( __METHOD__ )->execute();
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			$this->waitForReplication();
		}
	}
	echo "Found $rowsInserted revisions\n";
}
/**
 * Scan the text table for orphan text
 *
 * Orphan text here does not imply DB corruption -- deleted text tracked by the
 * archive table counts as orphan for our purposes.
 *
 * Walks external text rows for the requested clusters in old_id order,
 * batchSize rows at a time, keeping only rows with no matching
 * blob_tracking entry. Each valid one is recorded in blob_tracking with
 * bt_page and bt_rev_id set to 0 and, when orphan detection is enabled,
 * marked in that cluster's tracked-blob bitfield.
 */
private function trackOrphanText() {
	# Wait until the blob_tracking table is available in the replica DB
	$dbw = $this->getPrimaryDB();
	$dbr = $this->getReplicaDB();
	$this->getServiceContainer()->getDBLoadBalancerFactory()->waitForReplication( [ 'timeout' => 100_000 ] );

	$textClause = $this->getTextClause();
	$startId = 0;
	// Only used for progress output.
	$endId = (int)$dbr->newSelectQueryBuilder()
		->select( 'MAX(old_id)' )
		->from( 'text' )
		->caller( __METHOD__ )->fetchField();
	$rowsInserted = 0;
	$batchesDone = 0;

	echo "Finding orphan text...\n";

	# Scan the text table for orphan text
	while ( true ) {
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->leftJoin( 'blob_tracking', null, 'bt_text_id=old_id' )
			->where( [
				$dbr->expr( 'old_id', '>', $startId ),
				$textClause,
				$dbr->expr(
					'old_flags',
					IExpression::LIKE,
					new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
				),
				// No matching blob_tracking row for this text row
				'bt_text_id' => null,
			] )
			->orderBy( 'old_id' )
			->limit( $this->batchSize )
			->caller( __METHOD__ )->fetchResultSet();

		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor before validating so that skipped rows
			// are not fetched again by the next batch.
			$startId = (int)$row->old_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in old_id {$row->old_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query\n";
				continue;
			}

			$insertBatch[] = [
				'bt_page' => 0,
				'bt_rev_id' => 0,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}

		// A batch may consist entirely of skipped rows; do not hand an
		// empty row set to the insert builder in that case.
		if ( $insertBatch ) {
			$dbw->newInsertQueryBuilder()
				->insertInto( 'blob_tracking' )
				->rows( $insertBatch )
				->caller( __METHOD__ )->execute();
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			$this->waitForReplication();
		}
	}
	echo "Found $rowsInserted orphan text rows\n";
}
/**
 * Scan the blobs table for rows not registered in blob_tracking (and thus not
 * registered in the text table).
 *
 * Orphan blobs are indicative of DB corruption. They are inaccessible and
 * should probably be deleted.
 *
 * For each cluster, a bitfield of the blob IDs actually present is built
 * and compared against the bitfield of IDs marked by the earlier tracking
 * passes; every ID present but not tracked is written to blob_orphans.
 * Clusters that cannot be connected to, or have no blobs table, are
 * reported and skipped.
 */
private function findOrphanBlobs() {
	if ( !extension_loaded( 'gmp' ) ) {
		echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

		return;
	}

	$dbw = $this->getPrimaryDB();
	$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();
	$dbStore = $this->getServiceContainer()->getExternalStoreFactory()->getStore( 'DB' );
	'@phan-var ExternalStoreDB $dbStore'; /** @var ExternalStoreDB $dbStore */

	foreach ( $this->clusters as $cluster ) {
		echo "Searching for orphan blobs in $cluster...\n";
		$lb = $lbFactory->getExternalLB( $cluster );
		try {
			$extDB = $lb->getMaintenanceConnectionRef( DB_REPLICA );
		} catch ( DBConnectionError $e ) {
			if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
				echo "No database on $cluster\n";
			} else {
				echo "Error on $cluster: " . $e->getMessage() . "\n";
			}
			continue;
		}
		$table = $dbStore->getTable( $cluster );
		if ( !$extDB->tableExists( $table, __METHOD__ ) ) {
			echo "No blobs table on cluster $cluster\n";
			continue;
		}
		$startId = 0;
		$batchesDone = 0;
		$actualBlobs = gmp_init( 0 );
		// Only used for progress output.
		$endId = (int)$extDB->newSelectQueryBuilder()
			->select( 'MAX(blob_id)' )
			->from( $table )
			->caller( __METHOD__ )->fetchField();

		// Build a bitmap of actual blob rows
		while ( true ) {
			$res = $extDB->newSelectQueryBuilder()
				->select( [ 'blob_id' ] )
				->from( $table )
				->where( $extDB->expr( 'blob_id', '>', $startId ) )
				->orderBy( 'blob_id' )
				->limit( $this->batchSize )
				->caller( __METHOD__ )->fetchResultSet();

			if ( !$res->numRows() ) {
				break;
			}

			foreach ( $res as $row ) {
				$blobId = (int)$row->blob_id;
				gmp_setbit( $actualBlobs, $blobId );
				$startId = $blobId;
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
			}
		}

		// Find actual blobs that weren't tracked by the previous passes
		// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
		$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

		// Traverse the orphan list
		$insertBatch = [];
		$id = 0;
		$numOrphans = 0;
		while ( true ) {
			// Next set bit at or after $id; -1 when there are none left
			$id = gmp_scan1( $orphans, $id );
			if ( $id === -1 ) {
				break;
			}
			$insertBatch[] = [
				'bo_cluster' => $cluster,
				'bo_blob_id' => $id
			];
			// Flush once the batch holds batchSize rows (the previous ">"
			// comparison let it grow to batchSize + 1 before inserting).
			if ( count( $insertBatch ) >= $this->batchSize ) {
				$dbw->newInsertQueryBuilder()
					->insertInto( 'blob_orphans' )
					->rows( $insertBatch )
					->caller( __METHOD__ )->execute();
				$insertBatch = [];
			}

			++$id;
			++$numOrphans;
		}
		// Insert whatever is left over from the last partial batch
		if ( $insertBatch ) {
			$dbw->newInsertQueryBuilder()
				->insertInto( 'blob_orphans' )
				->rows( $insertBatch )
				->caller( __METHOD__ )->execute();
		}
		echo "Found $numOrphans orphan(s) in $cluster\n";
	}
}
428 // @codeCoverageIgnoreStart
429 $maintClass = TrackBlobs::class;
430 require_once RUN_MAINTENANCE_IF_MAIN;
431 // @codeCoverageIgnoreEnd