<?php
/**
 * Find all rows in the categorylinks table whose collation is out-of-date
 * (cl_collation != $wgCategoryCollation) and repopulate cl_sortkey
 * using the page title and cl_sortkey_prefix.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 * @author Aryeh Gregor (Simetrical)
 */

// Load the maintenance framework that provides the Maintenance base class.
require_once __DIR__ . '/Maintenance.php';

use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use Wikimedia\Rdbms\IDatabase;
use Wikimedia\Rdbms\IMaintainableDatabase;
use Wikimedia\Rdbms\IResultWrapper;
use Wikimedia\Rdbms\LBFactory;

/**
 * Maintenance script that will find all rows in the categorylinks table
 * whose collation is out-of-date.
 *
 * Rows are processed in batches ordered by (cl_collation,) cl_to, cl_type,
 * cl_from. Each batch is either updated in place or, when --target-table is
 * given, copied into that table with freshly computed sort keys.
 *
 * @ingroup Maintenance
 */
class UpdateCollation extends Maintenance {
	/** @var int[] Number of sort keys seen, indexed by sort key length in bytes */
	public $sizeHistogram = [];

	/** @var int Running total of rows updated/copied (or that would be, in a dry run) */
	private $numRowsProcessed = 0;

	/** @var mixed Value of the --force option; only ever used as a truthy flag */
	private $force;

	/** @var mixed Value of the --dry-run option; only ever used as a truthy flag */
	private $dryRun;

	/** @var mixed Value of the --verbose-stats option; only ever used as a truthy flag */
	private $verboseStats;

	/** @var Collation Collation object used to compute the new sort keys */
	private $collation;

	/** @var string|null Collation name written to cl_collation */
	private $collationName;

	/** @var string|null Table to copy rows into, or null to update categorylinks in place */
	private $targetTable;

	/** @var IDatabase Replica connection, used for counting rows */
	private $dbr;

	/** @var IMaintainableDatabase Primary connection, used for batch reads and all writes */
	private $dbw;

	/** @var LBFactory Set in init(); not otherwise used by this class */
	private $lbFactory;

	/** @var NamespaceInfo */
	private $namespaceInfo;

	/**
	 * Register the script description, default batch size and command line options.
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT
		);

		$this->setBatchSize( 100 );
		$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.', false, false, 'f' );
		$this->addOption( 'previous-collation', 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.', false, true );
		$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
			false, true );
		$this->addOption( 'target-table', 'Copy rows from categorylinks into the ' .
			'specified table instead of updating them in place.', false, true );
		$this->addOption( 'remote', 'Use Shellbox to calculate the new sort keys ' .
			'remotely.' );
		$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
			'compile statistics.' );
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}

	/**
	 * Get services and initialise member variables
	 */
	private function init() {
		$services = MediaWikiServices::getInstance();
		$this->namespaceInfo = $services->getNamespaceInfo();
		$this->lbFactory = $services->getDBLoadBalancerFactory();

		// --target-collation overrides the configured category collation.
		if ( $this->hasOption( 'target-collation' ) ) {
			$this->collationName = $this->getOption( 'target-collation' );
		} else {
			$this->collationName = $this->getConfig()->get( MainConfigNames::CategoryCollation );
		}
		// With --remote the sort keys are computed by the "remote-" variant of
		// the collation, but rows are still tagged with the plain collation name.
		if ( $this->hasOption( 'remote' ) ) {
			$realCollationName = 'remote-' . $this->collationName;
		} else {
			$realCollationName = $this->collationName;
		}
		$this->collation = $services->getCollationFactory()->makeCollation( $realCollationName );

		// Collation check: in some cases the constructor will work,
		// but this will raise an exception, breaking all category pages
		$this->collation->getSortKey( 'MediaWiki' );

		$this->force = $this->getOption( 'force' );
		$this->dryRun = $this->getOption( 'dry-run' );
		$this->verboseStats = $this->getOption( 'verbose-stats' );
		$this->dbw = $this->getDB( DB_PRIMARY );
		$this->dbr = $this->getDB( DB_REPLICA );
		$this->targetTable = $this->getOption( 'target-table' );
	}

	/**
	 * Entry point: count the affected rows, then update or copy them batch by batch.
	 */
	public function execute() {
		$this->init();
		$batchSize = $this->getBatchSize();

		if ( $this->targetTable && !$this->dbw->tableExists( $this->targetTable, __METHOD__ ) ) {
			if ( $this->dryRun ) {
				// A dry run must not modify the database. copyBatch() never
				// touches the target table in dry-run mode, so it is not needed.
				$this->output( "Would create table {$this->targetTable}\n" );
			} else {
				$this->output( "Creating table {$this->targetTable}\n" );
				$this->dbw->query(
					'CREATE TABLE ' . $this->dbw->tableName( $this->targetTable ) .
					' LIKE ' . $this->dbw->tableName( 'categorylinks' ),
					__METHOD__
				);
			}
		}

		// Locally at least, (my local is a rather old version of mysql)
		// mysql seems to filesort if there is both an equality
		// (but not for an inequality) condition on cl_collation in the
		// WHERE and it is also the first item in the ORDER BY.
		if ( $this->hasOption( 'previous-collation' ) ) {
			$orderBy = 'cl_to, cl_type, cl_from';
		} else {
			$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
		}
		$options = [
			'LIMIT' => $batchSize,
			'ORDER BY' => $orderBy,
			'STRAIGHT_JOIN' // per T58041
		];

		// Restrict to out-of-date rows unless every row is wanted (--force or
		// copying into a target table).
		$collationConds = [];
		if ( !$this->force && !$this->targetTable ) {
			if ( $this->hasOption( 'previous-collation' ) ) {
				$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
			} else {
				$collationConds = [
					0 => 'cl_collation != ' . $this->dbr->addQuotes( $this->collationName )
				];
			}
		}

		$count = $this->dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = (int)$this->dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $this->dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}

		// Paging condition; empty for the first batch, afterwards "rows sorting
		// after the last row of the previous batch".
		$batchConds = [];
		do {
			$this->output( "Selecting next $batchSize rows..." );

			// cl_type must be selected as a number for proper paging because
			// on MySQL the range condition built by getBatchCondition() needs
			// the numeric value of the enum.
			if ( $this->dbw->getType() === 'mysql' ) {
				$clType = 'cl_type+0 AS "cl_type_numeric"';
			} else {
				$clType = 'cl_type';
			}
			$res = $this->dbw->select(
				[ 'categorylinks', 'page' ],
				[
					'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', $clType, 'cl_timestamp',
					'page_namespace', 'page_title'
				],
				array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( $res->numRows() ) {
				if ( $this->targetTable ) {
					$this->copyBatch( $res );
				} else {
					$this->updateBatch( $res );
				}
				// Remember where this batch ended so the next one starts after it.
				$res->seek( $res->numRows() - 1 );
				$lastRow = $res->fetchObject();
				$batchConds = [ $this->getBatchCondition( $lastRow, $this->dbw ) ];
			}

			if ( $this->dryRun ) {
				$this->output( "{$this->numRowsProcessed} rows would be updated so far.\n" );
			} else {
				$this->output( "{$this->numRowsProcessed} done.\n" );
			}
			// A short batch means the table has been exhausted.
		} while ( $res->numRows() == $batchSize );

		if ( !$this->dryRun ) {
			$this->output( "{$this->numRowsProcessed} rows processed\n" );
		}

		if ( $this->verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Return an SQL expression selecting rows which sort above the given row,
	 * assuming an ordering of cl_collation, cl_to, cl_type, cl_from
	 * (cl_collation is left out when --previous-collation is used, matching
	 * the ORDER BY chosen in execute()).
	 *
	 * @param stdClass $row
	 * @param IDatabase $dbw
	 * @return string
	 */
	private function getBatchCondition( $row, $dbw ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$fields = [ 'cl_to', 'cl_type', 'cl_from' ];
		} else {
			$fields = [ 'cl_collation', 'cl_to', 'cl_type', 'cl_from' ];
		}
		$conds = [];
		foreach ( $fields as $field ) {
			if ( $dbw->getType() === 'mysql' && $field === 'cl_type' ) {
				// Range conditions with enums are weird in mysql
				// This must be a numeric literal, or it won't work.
				$value = intval( $row->cl_type_numeric );
			} else {
				$value = $row->$field;
			}
			$conds[ $field ] = $value;
		}

		return $dbw->buildComparison( '>', $conds );
	}

	/**
	 * Update a set of rows in the categorylinks table
	 *
	 * In a dry run nothing is written; only rows whose sort key would change
	 * are counted.
	 *
	 * @param IResultWrapper $res The rows to update
	 */
	private function updateBatch( $res ) {
		if ( !$this->dryRun ) {
			$this->beginTransaction( $this->dbw, __METHOD__ );
		}
		foreach ( $res as $row ) {
			$title = Title::newFromRow( $row );
			if ( !$row->cl_collation ) {
				# This is an old-style row, so the sortkey needs to be
				# converted.
				if ( $row->cl_sortkey == $title->getText()
					|| $row->cl_sortkey == $title->getPrefixedText()
				) {
					$prefix = '';
				} else {
					# Custom sortkey, use it as a prefix
					$prefix = $row->cl_sortkey;
				}
			} else {
				$prefix = $row->cl_sortkey_prefix;
			}
			# cl_type will be wrong for lots of pages if cl_collation is 0,
			# so let's update it while we're here.
			$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
			$newSortKey = $this->collation->getSortKey(
				$title->getCategorySortkey( $prefix ) );
			$this->updateSortKeySizeHistogram( $newSortKey );
			// Truncate to 230 bytes to avoid DB error
			$newSortKey = substr( $newSortKey, 0, 230 );

			if ( $this->dryRun ) {
				// Add 1 to the count if the sortkey was changed. (Note that this doesn't count changes in
				// other fields, if any, those usually only happen when upgrading old MediaWikis.)
				if ( $row->cl_sortkey !== $newSortKey ) {
					$this->numRowsProcessed++;
				}
			} else {
				$this->dbw->update(
					'categorylinks',
					[
						'cl_sortkey' => $newSortKey,
						'cl_sortkey_prefix' => $prefix,
						'cl_collation' => $this->collationName,
						'cl_type' => $type,
						// Assign the column to itself so the stored timestamp is kept.
						'cl_timestamp = cl_timestamp',
					],
					[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
					__METHOD__
				);
				$this->numRowsProcessed++;
			}
		}
		if ( !$this->dryRun ) {
			$this->commitTransaction( $this->dbw, __METHOD__ );
		}
	}

	/**
	 * Copy a set of rows to the target table
	 *
	 * Sort keys for the whole batch are requested with a single
	 * getSortKeys() call. In a dry run nothing is inserted.
	 *
	 * @param IResultWrapper $res
	 * @throws MWException If no sort key was returned for one of the rows
	 */
	private function copyBatch( $res ) {
		$sortKeyInputs = [];
		foreach ( $res as $row ) {
			$title = Title::newFromRow( $row );
			$sortKeyInputs[] = $title->getCategorySortkey( $row->cl_sortkey_prefix );
		}
		$sortKeys = $this->collation->getSortKeys( $sortKeyInputs );
		$rowsToInsert = [];
		// $sortKeys is looked up by the iteration key of $res, i.e. by row position.
		foreach ( $res as $i => $row ) {
			if ( !isset( $sortKeys[$i] ) ) {
				throw new MWException( 'Unable to get sort key' );
			}
			$newSortKey = $sortKeys[$i];
			$this->updateSortKeySizeHistogram( $newSortKey );
			// Truncate to 230 bytes to avoid DB error
			$newSortKey = substr( $newSortKey, 0, 230 );
			$type = $this->namespaceInfo->getCategoryLinkType( $row->page_namespace );
			$rowsToInsert[] = [
				'cl_from' => $row->cl_from,
				'cl_to' => $row->cl_to,
				'cl_sortkey' => $newSortKey,
				'cl_sortkey_prefix' => $row->cl_sortkey_prefix,
				'cl_collation' => $this->collationName,
				'cl_type' => $type,
				'cl_timestamp' => $row->cl_timestamp
			];
		}
		if ( $this->dryRun ) {
			$this->numRowsProcessed += count( $rowsToInsert );
		} else {
			$this->beginTransaction( $this->dbw, __METHOD__ );
			$this->dbw->insert( $this->targetTable, $rowsToInsert, __METHOD__, [ 'IGNORE' ] );
			// With IGNORE, count only what the database reports as affected.
			$this->numRowsProcessed += $this->dbw->affectedRows();
			$this->commitTransaction( $this->dbw, __METHOD__ );
		}
	}

	/**
	 * Update the verbose statistics
	 *
	 * Does nothing unless --verbose-stats was given. The length is measured
	 * before the caller truncates the key.
	 *
	 * @param string $key
	 */
	private function updateSortKeySizeHistogram( $key ) {
		if ( !$this->verboseStats ) {
			return;
		}
		$length = strlen( $key );
		if ( !isset( $this->sizeHistogram[$length] ) ) {
			$this->sizeHistogram[$length] = 0;
		}
		$this->sizeHistogram[$length]++;
	}

	/**
	 * Show the verbose statistics
	 *
	 * Prints the raw per-length counts followed by a 20-bin bar chart whose
	 * longest bar is 60 characters wide.
	 */
	private function showSortKeySizeHistogram() {
		if ( !$this->sizeHistogram ) {
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}
		$numBins = 20;
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		$coarseBoundaries = [];
		$boundary = 0;
		// Exclusive upper bound of every bin except the last.
		for ( $i = 0; $i < $numBins - 1; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = (int)round( $boundary );
		}
		// The last bin takes everything up to and including $maxLength.
		$coarseBoundaries[$numBins - 1] = $maxLength + 1;
		$raw = '';
		for ( $i = 0; $i <= $maxLength; $i++ ) {
			if ( $raw !== '' ) {
				$raw .= ', ';
			}
			$val = $this->sizeHistogram[$i] ?? 0;
			for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
				// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
				if ( $coarseBoundaries[$coarseIndex] > $i ) {
					$coarseHistogram[$coarseIndex] += $val;
					break;
				}
			}
			// The loop ran to completion without a match: use the last bin.
			if ( $coarseIndex == $numBins - 1 ) {
				$coarseHistogram[$coarseIndex] += $val;
			}
			$raw .= $val;
		}

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		$maxBinVal = max( $coarseHistogram );
		$scale = 60 / $maxBinVal;
		$prevBoundary = 0;
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			$val = $coarseHistogram[$coarseIndex] ?? 0;
			// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset False positive
			$boundary = $coarseBoundaries[$coarseIndex];
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				// str_repeat() takes an int; truncate the scaled length explicitly.
				str_repeat( '*', (int)( $scale * $val ) ) ) );
			$prevBoundary = $boundary;
		}
	}
}

// Name the maintenance class for the runner included below.
$maintClass = UpdateCollation::class;
require_once RUN_MAINTENANCE_IF_MAIN;