3 use Wikimedia\Rdbms\IDatabase
;
5 require_once __DIR__
. '/Maintenance.php';
8 * Maintenance script that cleans up archive rows with duplicated ar_rev_id,
9 * both within archive and between archive and revision.
11 * @ingroup Maintenance
14 class DeduplicateArchiveRevId
extends LoggedUpdateMaintenance
{
18 * @phan-var array{tables:string[],fields:string[],joins:array}|null
// Join description for the archive user columns. It is filled in by
// doDBUpdates() from ActorMigration::newMigration()->getJoin( 'ar_user' ) and
// its 'tables', 'fields' and 'joins' entries are read by processArRevIds().
// Stays null until doDBUpdates() has reached that assignment.
20 private $arActorQuery = null;
// Running total of archive rows given a new ar_rev_id; incremented in
// processArRevIds() and reported in the final message of doDBUpdates().
23 private $reassigned = 0;
// Calls the parent constructor, registers the script's description text and
// sets the batch size to 10000 (doDBUpdates() reads it back through
// getBatchSize() as the number of ar_rev_id values per batch).
25 public function __construct() {
26 parent
::__construct();
27 $this->addDescription(
28 'Clean up duplicate ar_rev_id, both within archive and between archive and revision.'
30 $this->setBatchSize( 10000 );
// NOTE(review): the body of this method is not shown alongside its signature.
// Presumably it returns the key under which LoggedUpdateMaintenance records
// that this update has already been run; confirm against the full file.
33 protected function getUpdateKey() {
// Entry point of the update. Walks the ar_rev_id range of the archive table
// (MIN to MAX) in batches of getBatchSize() IDs, one transaction per batch,
// and hands the IDs that need attention to processArRevIds(). Finishes by
// printing how many rows were deleted and how many were reassigned.
37 protected function doDBUpdates() {
38 $this->output( "Deduplicating ar_rev_id...\n" );
39 $dbw = $this->getDB( DB_MASTER
);
40 // Sanity check. If this is a new install, we don't need to do anything here.
41 if ( PopulateArchiveRevId
::isNewInstall( $dbw ) ) {
42 $this->output( "New install, nothing to do here.\n" );
46 PopulateArchiveRevId
::checkMysqlAutoIncrementBug( $dbw );
// Lower and upper bound of the ar_rev_id values to scan.
// NOTE(review): presumably both are NULL when archive holds no non-NULL
// ar_rev_id; confirm the loop below behaves acceptably in that case.
48 $minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__
);
49 $maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__
);
50 $batchSize = $this->getBatchSize();
// Join descriptions for the user columns of archive and revision. Their
// 'tables', 'fields' and 'joins' entries are merged into the queries below;
// the archive one is kept on $this for processArRevIds().
// NOTE(review): the "+" array unions below only add entries whose keys are
// not already present, so this assumes getJoin() returns string-keyed
// arrays; verify against ActorMigration.
52 $this->arActorQuery
= ActorMigration
::newMigration()->getJoin( 'ar_user' );
53 $revActorQuery = ActorMigration
::newMigration()->getJoin( 'rev_user' );
// $id is the first ID of the batch; $endId is the last one, inclusive and
// capped at $maxId.
55 for ( $id = $minId; $id <= $maxId; $id +
= $batchSize ) {
56 $endId = min( $maxId, $id +
$batchSize - 1 );
58 $this->beginTransaction( $dbw, __METHOD__
);
60 // Lock the archive and revision table rows for the IDs we're checking
61 // to try to prevent deletions or undeletions from confusing things.
65 [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
72 [ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ],
74 [ 'LOCK IN SHARE MODE' ]
77 // Figure out the ar_rev_ids we actually need to look at
79 [ 'archive', 'revision' ] +
$revActorQuery['tables'],
80 [ 'rev_id', 'rev_timestamp', 'rev_sha1' ] +
$revActorQuery['fields'],
81 [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
84 [ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] +
$revActorQuery['joins']
// Index the revision rows found by the join above by their rev_id.
87 foreach ( $res as $row ) {
88 $revRows[$row->rev_id
] = $row;
// ar_rev_id values in this batch's range that occur on more than one row.
91 $arRevIds = $dbw->selectFieldValues(
94 [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
96 [ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
// Add the IDs that also exist in revision (the keys of $revRows), drop
// repeats and re-index the list.
98 $arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );
101 $this->processArRevIds( $dbw, $arRevIds, $revRows );
104 $this->output( "... $id-$endId\n" );
105 $this->commitTransaction( $dbw, __METHOD__
);
// Summary built from the counters that processArRevIds() accumulates.
109 "Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
110 . "$this->reassigned assigned new IDs.\n"
116 * Process a set of ar_rev_ids
117 * @param IDatabase $dbw
118 * @param int[] $arRevIds IDs to process
119 * @param stdClass[] $revRows Existing revision-table row data
// Loads the archive rows for the given ar_rev_ids and decides, row by row,
// whether the row is kept, deleted or given a new ar_rev_id. The collected
// deletions and reassignments are then applied, and the resulting counts are
// added to $this->deleted and $this->reassigned.
// $revRows is indexed by rev_id; an entry means the ID also exists in revision.
121 private function processArRevIds( IDatabase
$dbw, array $arRevIds, array $revRows ) {
122 // Select all the data we need for deduplication
124 [ 'archive' ] +
$this->arActorQuery
['tables'],
125 [ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
126 +
$this->arActorQuery
['fields'],
127 [ 'ar_rev_id' => $arRevIds ],
130 $this->arActorQuery
['joins']
133 // Determine which rows we need to delete or reassign
// $seen maps a rev_id to an array holding 'first' (a description of the first
// holder of that ID) plus, per change key from getSeenKey(), the ar_id of the
// archive row recorded for that change.
// NOTE(review): presumably $seen, $toDelete and $toReassign start as empty
// arrays just before this loop; confirm.
137 foreach ( $res as $row ) {
138 // Revision-table row exists?
139 if ( isset( $revRows[$row->ar_rev_id
] ) ) {
140 $revRow = $revRows[$row->ar_rev_id
];
142 // Record the rev_id as seen, so the code below will always delete or reassign.
143 if ( !isset( $seen[$revRow->rev_id
] ) ) {
144 $seen[$revRow->rev_id
] = [
145 'first' => "revision row",
149 // Delete the archive row if it seems to be the same regardless
150 // of page, because moves can change IDs and titles.
// "Same" means timestamp, sha1, user id and user name all compare strictly
// equal between the archive row and the revision row.
151 if ( $row->ar_timestamp
=== $revRow->rev_timestamp
&&
152 $row->ar_sha1
=== $revRow->rev_sha1
&&
153 $row->ar_user
=== $revRow->rev_user
&&
154 $row->ar_user_text
=== $revRow->rev_user_text
157 "Row $row->ar_id duplicates revision row for rev_id $revRow->rev_id, deleting\n"
159 $toDelete[] = $row->ar_id
;
// Key identifying this particular change; it separates true duplicates
// (same key) from unrelated rows sharing the ID (different key).
164 $key = $this->getSeenKey( $row );
165 if ( !isset( $seen[$row->ar_rev_id
] ) ) {
166 // This rev_id hasn't even been seen yet, nothing to do besides record it.
167 $seen[$row->ar_rev_id
] = [
168 'first' => "archive row $row->ar_id",
171 } elseif ( !isset( $seen[$row->ar_rev_id
][$key] ) ) {
172 // The rev_id was seen, but not this particular change. Reassign it.
173 $seen[$row->ar_rev_id
][$key] = $row->ar_id
;
175 "Row $row->ar_id conflicts with {$seen[$row->ar_rev_id]['first']} "
176 . "for rev_id $row->ar_rev_id, reassigning\n"
178 $toReassign[] = $row->ar_id
;
180 // The rev_id was seen with a row that matches this change. Delete it.
182 "Row $row->ar_id duplicates archive row {$seen[$row->ar_rev_id][$key]} "
183 . "for rev_id $row->ar_rev_id, deleting\n"
185 $toDelete[] = $row->ar_id
;
189 // Perform the updates
// Deletions are counted from the database's affected-row count.
191 $dbw->delete( 'archive', [ 'ar_id' => $toDelete ], __METHOD__
);
192 $this->deleted +
= $dbw->affectedRows();
// Reassignment is delegated to PopulateArchiveRevId; its return value is
// added to the counter.
195 $this->reassigned +
= PopulateArchiveRevId
::reassignArRevIds( $dbw, $toReassign );
200 * Make a key identifying a "unique" change from a row
201 * @param stdClass $row
// Builds the key as a single string by joining a list of values with "\n".
// NOTE(review): the list of row fields being joined is not shown alongside
// this call; confirm which fields define a "unique" change.
204 private function getSeenKey( $row ) {
205 return implode( "\n", [
// Script footer: names this class in $maintClass and includes the file that
// RUN_MAINTENANCE_IF_MAIN points to.
// NOTE(review): presumably that file runs $maintClass when this script is
// the entry point; confirm against Maintenance.php.
217 $maintClass = DeduplicateArchiveRevId
::class;
218 require_once RUN_MAINTENANCE_IF_MAIN
;