require( dirname( __FILE__ ) . '/../commandLine.inc' );

// At least one external storage cluster name must be given.
// NOTE(review): $args is presumably populated by commandLine.inc - confirm.
if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";
	// Stop here: without a cluster list there is nothing to track, and
	// falling through would drop and rebuild the tracking tables for nothing.
	exit( 1 );
}

$tracker = new TrackBlobs( $args );
$tracker->trackBlobs();
/** Cluster names passed to the constructor. */
var $clusters;

/** Cached SQL condition built lazily by getTextClause(). */
var $textClause;

/**
 * Per-cluster bitfields keyed by cluster name. The constructor fills this
 * with gmp_init( 0 ) values when GMP is loaded; the tracking passes set the
 * bit for every blob id they record.
 */
var $trackedBlobs = array();

/** Row limit applied to each batched SELECT. */
var $batchSize = 1000;

/** Number of batches processed between progress lines. */
var $reportingInterval = 10;
/**
 * Store the cluster list and, when the GMP extension is loaded, prepare one
 * empty bitfield per cluster for the orphan-blob pass.
 *
 * @param array $clusters Cluster names; used as keys of $trackedBlobs and in
 *   the "DB://<cluster>/" prefixes built by getTextClause()
 */
function __construct( $clusters ) {
	$this->clusters = $clusters;

	// Always give the flag a definite boolean value, so the other methods
	// never read an unset property when GMP is unavailable.
	$this->doBlobOrphans = extension_loaded( 'gmp' );

	if ( $this->doBlobOrphans ) {
		foreach ( $clusters as $cluster ) {
			$this->trackedBlobs[$cluster] = gmp_init( 0 );
		}
	} else {
		echo "Warning: the gmp extension is needed to find orphan blobs\n";
	}
}
/**
 * Run the whole tracking job: rebuild the tracking tables, record blobs
 * referenced from revisions, record blobs referenced only from text rows,
 * and finally, if the orphan-blob pass is enabled, look for orphan blobs.
 */
function trackBlobs() {
	$this->initTrackingTable();
	$this->trackRevisions();
	$this->trackOrphanText();

	// The orphan-blob pass is only enabled by the constructor when GMP is loaded.
	if ( !$this->doBlobOrphans ) {
		return;
	}
	$this->findOrphanBlobs();
}
/**
 * Drop any existing tracking tables on the master and recreate them by
 * sourcing blob_tracking.sql from this script's directory.
 *
 * NOTE(review): blob_tracking.sql presumably creates both blob_tracking and
 * blob_orphans - confirm against the SQL file.
 */
function initTrackingTable() {
	$dbw = wfGetDB( DB_MASTER );

	// Check each table on its own: after an interrupted or partial setup only
	// one of the two may exist. Dropping a missing table would fail, and a
	// leftover one would presumably make the recreate step fail.
	foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
		if ( $dbw->tableExists( $table ) ) {
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
		}
	}

	$dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
}
/**
 * Build (once) and return the SQL condition matching text rows whose
 * old_text starts with "DB://<cluster>/" for any of the configured clusters.
 * The per-cluster LIKE terms are joined with " OR ".
 *
 * @return string SQL fragment; an empty string when no clusters are configured
 */
function getTextClause() {
	// Reuse the cached clause when one has already been built.
	if ( $this->textClause ) {
		return $this->textClause;
	}

	$dbr = wfGetDB( DB_SLAVE );
	$likes = array();
	foreach ( $this->clusters as $cluster ) {
		$pattern = $dbr->escapeLike( "DB://$cluster/" ) . '%';
		$likes[] = 'old_text LIKE ' . $dbr->addQuotes( $pattern );
	}
	$this->textClause = implode( ' OR ', $likes );

	return $this->textClause;
}
/**
 * Parse an external storage pointer of the form
 * "DB://<cluster>/<id>" or "DB://<cluster>/<id>/<hex hash>".
 *
 * @param string $text Pointer text, as stored in text.old_text
 * @return array|bool False if $text is not a valid pointer, otherwise an
 *   array with keys 'cluster' (string), 'id' (int) and 'hash' (hex string,
 *   or null when the pointer has no hash component)
 */
function interpretPointer( $text ) {
	if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
		return false;
	}
	return array(
		'cluster' => $m[1],
		'id' => intval( $m[2] ),
		// Group 3 is the optional hash. It is absent from $m when the pointer
		// has no hash part. Group 2 is the blob id and must not be reused here.
		'hash' => isset( $m[3] ) ? $m[3] : null
	);
}
/**
 * Scan the revision table for rows stored in the specified clusters
 *
 * Reads revision joined to text from the slave in rev_id order, batchSize
 * rows at a time, and writes one blob_tracking row per valid pointer to the
 * master. When the orphan-blob pass is enabled, the blob id is also marked
 * in the per-cluster bitfield.
 */
function trackRevisions() {
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_SLAVE );

	$textClause = $this->getTextClause();
	$startId = 0;
	$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
	$batchesDone = 0;
	$rowsInserted = 0;

	echo "Finding revisions...\n";

	while ( true ) {
		$res = $dbr->select(
			array( 'revision', 'text' ),
			array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
			array(
				'rev_id > ' . $dbr->addQuotes( $startId ),
				// Join each revision to its text row.
				'rev_text_id=old_id',
				$textClause,
				"old_flags LIKE '%external%'",
			),
			__METHOD__,
			array(
				'ORDER BY' => 'rev_id',
				'LIMIT' => $this->batchSize
			)
		);
		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = array();
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that are skipped below, so the
			// next batch never re-reads them.
			$startId = $row->rev_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters ) ) {
				echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
				continue;
			}
			$insertBatch[] = array(
				'bt_page' => $row->rev_page,
				'bt_rev_id' => $row->rev_id,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			);
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}

		// Every row of a batch may have been rejected; skip the write then.
		if ( $insertBatch ) {
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			wfWaitForSlaves( 5 );
		}
	}
	echo "Found $rowsInserted revisions\n";
}
/**
 * Scan the text table for orphan text
 * Orphan text here does not imply DB corruption -- deleted text tracked by the
 * archive table counts as orphan for our purposes.
 *
 * Text rows pointing into the tracked clusters that have no blob_tracking
 * row yet are written to blob_tracking with bt_page and bt_rev_id set to 0.
 */
function trackOrphanText() {
	# Wait until the blob_tracking table is available in the slave
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_SLAVE );
	$pos = $dbw->getMasterPos();
	$dbr->masterPosWait( $pos, 100000 );

	$textClause = $this->getTextClause();
	$startId = 0;
	$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
	$rowsInserted = 0;
	$batchesDone = 0;

	echo "Finding orphan text...\n";

	# Scan the text table for orphan text
	while ( true ) {
		$res = $dbr->select(
			array( 'text', 'blob_tracking' ),
			array( 'old_id', 'old_flags', 'old_text' ),
			array(
				'old_id>' . $dbr->addQuotes( $startId ),
				$textClause,
				"old_flags LIKE '%external%'",
				// With the LEFT JOIN below, this keeps only text rows that
				// have no tracking row.
				'bt_text_id IS NULL'
			),
			__METHOD__,
			array(
				'ORDER BY' => 'old_id',
				'LIMIT' => $this->batchSize
			),
			array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
		);

		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = array();
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that are skipped below.
			$startId = $row->old_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in old_id {$row->old_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters ) ) {
				echo "Invalid cluster returned in SQL query\n";
				continue;
			}

			$insertBatch[] = array(
				// No owning revision is known for orphan text.
				'bt_page' => 0,
				'bt_rev_id' => 0,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			);
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}

		// Every row of a batch may have been rejected; skip the write then.
		if ( $insertBatch ) {
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			wfWaitForSlaves( 5 );
		}
	}
	echo "Found $rowsInserted orphan text rows\n";
}
/**
 * Scan the blobs table for rows not registered in blob_tracking (and thus not
 * registered in the text table).
 *
 * Orphan blobs are indicative of DB corruption. They are inaccessible and
 * should probably be deleted.
 *
 * For each cluster, a bitfield of the blob ids actually present is built and
 * compared against the bitfield filled by the tracking passes. Ids present
 * but never tracked are written to blob_orphans on the master.
 */
function findOrphanBlobs() {
	if ( !extension_loaded( 'gmp' ) ) {
		echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
		return;
	}

	$dbw = wfGetDB( DB_MASTER );

	foreach ( $this->clusters as $cluster ) {
		echo "Searching for orphan blobs in $cluster...\n";
		$lb = wfGetLBFactory()->getExternalLB( $cluster );
		$extDB = $lb->getConnection( DB_SLAVE );
		$startId = 0;
		$batchesDone = 0;
		$actualBlobs = gmp_init( 0 );
		$endId = $extDB->selectField( 'blobs', 'MAX(blob_id)', false, __METHOD__ );

		// Build a bitmap of actual blob rows
		while ( true ) {
			$res = $extDB->select(
				'blobs',
				array( 'blob_id' ),
				array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
				__METHOD__,
				array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
			);

			if ( !$res->numRows() ) {
				break;
			}

			foreach ( $res as $row ) {
				gmp_setbit( $actualBlobs, (int)$row->blob_id );
				// Track the cursor inside the loop instead of reading the
				// loop variable after the loop has finished.
				$startId = $row->blob_id;
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
			}
		}

		// Find actual blobs that weren't tracked by the previous passes
		// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
		$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

		// Traverse the orphan list
		$insertBatch = array();
		$id = 0;
		while ( true ) {
			// gmp_scan1() returns the index of the next set bit at or after
			// $id, or -1 when there is none.
			$id = gmp_scan1( $orphans, $id );
			if ( $id == -1 ) {
				break;
			}
			$insertBatch[] = array(
				'bo_cluster' => $cluster,
				'bo_blob_id' => $id
			);
			++$id;
		}

		// Insert the orphans in bounded chunks so that no single INSERT
		// statement grows with the total number of orphans.
		echo "Found " . count( $insertBatch ) . " orphan(s) in $cluster\n";
		$chunkSize = max( 1, (int)$this->batchSize );
		foreach ( array_chunk( $insertBatch, $chunkSize ) as $chunk ) {
			$dbw->insert( 'blob_orphans', $chunk, __METHOD__ );
		}
	}
}