Merge "Update README & COPYING"
[mediawiki.git] / maintenance / storage / fixBug20757.php
blob101aa068bad3cd5c386d7d838f3b5420dd7c2476
1 <?php
/**
 * Script to fix bug 20757.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance ExternalStorage
 */
24 require_once __DIR__ . '/../Maintenance.php';
/**
 * Maintenance script to fix bug 20757.
 *
 * Scans the text table for rows whose old_text is a serialized
 * HistoryBlobStub, looks up the row each stub refers to in blob_tracking,
 * and, when the referenced text can still be fetched from external storage
 * and matches the stub's hash, rewrites the text row to point directly at
 * that external storage URL.
 *
 * @ingroup Maintenance ExternalStorage
 */
class FixBug20757 extends Maintenance {
	/** Maximum number of text rows fetched per batch query */
	public $batchSize = 10000;

	/** Cache of rev_text_id => rev_id maps, keyed by page ID */
	public $mapCache = array();

	/** Total number of map entries currently held in $mapCache */
	public $mapCacheSize = 0;

	/** Entry count above which old maps are evicted from $mapCache */
	public $maxMapCacheSize = 1000000;

	/**
	 * Set the script description and register the command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Walk the text table in batches ordered by old_id, classify every
	 * HistoryBlobStub row as good, fixable or unrecoverable, and (unless
	 * --dry-run is given) repair the fixable ones. Progress, one line per
	 * problem or fix, and a final summary are printed to standard output.
	 */
	public function execute() {
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Only used for the progress display
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		if ( $dbr->getType() == 'mysql'
			&& version_compare( $dbr->getServerVersion(), '4.1.0', '>=' )
		) {
			// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
			$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
		} else {
			// No CONVERT() in MySQL 4.0
			$lowerLeft = 'LOWER(LEFT(old_text,22))';
		}

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
					"$lowerLeft = 'o:15:\"historyblobstub\"'",
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Remember how far we got so the next batch resumes after this row
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Process flags
				$flags = explode( ',', $row->old_flags );
				if ( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) ) {
					$legacyEncoding = false;
				} else {
					$legacyEncoding = true;
				}

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'legacyEncoding' => $legacyEncoding,
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Every row in this batch was rejected above; move on to the next batch
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin( __METHOD__ );
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => $newFlags,
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit( __METHOD__ );
					$this->waitForSlaves();
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Throttle writes: call wfWaitForSlaves() once every 50 invocations.
	 *
	 * The counter is a function-level static, so it persists across calls.
	 * It is incremented exactly once per call and reset after each wait.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration >= 50 ) {
			wfWaitForSlaves();
			$iteration = 0;
		}
	}

	/**
	 * Find the revision of a page that uses the given text row.
	 *
	 * @param $pageId int Page ID to search in
	 * @param $textId int Text row ID (rev_text_id) to look for
	 * @return int|null The matching rev_id, or null if the page has no
	 *   revision with that text ID
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		if ( !isset( $ids[$textId] ) ) {
			return null;
		} else {
			return $ids[$textId];
		}
	}

	/**
	 * Get the rev_text_id => rev_id map for all revisions of a page.
	 *
	 * Results are cached in $this->mapCache. Before a new map is loaded,
	 * cached maps are evicted while the total number of cached entries
	 * exceeds $this->maxMapCacheSize.
	 *
	 * @param $pageId int
	 * @return array Map of rev_text_id => rev_id
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size
			while ( $this->mapCacheSize > $this->maxMapCacheSize ) {
				$key = key( $this->mapCache );
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

	/**
	 * This is based on part of HistoryBlobStub::getText().
	 * Determine if the text can be retrieved from the row in the normal way.
	 *
	 * @param $stub array Stub description; only the 'hash' key is read here
	 * @param $secondaryRow object Text row with old_flags and old_text fields
	 * @return bool True if the row unserializes to an object whose getItem()
	 *   returns something other than false for the stub's hash
	 */
	public function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		$text = $secondaryRow->old_text;
		if ( in_array( 'external', $flags ) ) {
			$url = $text;
			// A usable URL has a non-empty part after the "://" separator
			$parts = explode( '://', $url, 2 );
			if ( !isset( $parts[1] ) || $parts[1] === '' ) {
				return false;
			}
			$text = ExternalStore::fetchFromURL( $url );
		}
		if ( !in_array( 'object', $flags ) ) {
			return false;
		}

		if ( !is_string( $text ) ) {
			// Nothing to unserialize, e.g. the external fetch failed
			return false;
		}

		if ( in_array( 'gzip', $flags ) ) {
			$inflated = gzinflate( $text );
			$obj = ( $inflated === false ) ? false : unserialize( $inflated );
		} else {
			$obj = unserialize( $text );
		}

		if ( !is_object( $obj ) ) {
			// Correct for old double-serialization bug.
			$obj = is_string( $obj ) ? unserialize( $obj ) : false;
		}

		if ( !is_object( $obj ) ) {
			return false;
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );
		return $text !== false;
	}
}
// Name the maintenance class defined in this file, then include the runner
// referenced by RUN_MAINTENANCE_IF_MAIN.
$maintClass = 'FixBug20757';
require_once RUN_MAINTENANCE_IF_MAIN;