Updating composer/semver (1.5.1 => 1.5.2)
[mediawiki.git] / maintenance / deduplicateArchiveRevId.php
blob3108a77aaddd6cb3a88158a149ae3eaf4a4a8d29
<?php

use Wikimedia\Rdbms\IDatabase;

// Pulls in the maintenance base classes this script extends.
require_once __DIR__ . '/Maintenance.php';

/**
 * Maintenance script that cleans up archive rows with duplicated ar_rev_id,
 * both within archive and between archive and revision.
 *
 * @ingroup Maintenance
 * @since 1.32
 */
class DeduplicateArchiveRevId extends LoggedUpdateMaintenance {

	/**
	 * Actor-migration join info for the archive table. Populated in
	 * doDBUpdates() and read by processArRevIds().
	 *
	 * @var array[]|null
	 * @phan-var array{tables:string[],fields:string[],joins:array}|null
	 */
	private $arActorQuery = null;

	/** @var int Running total of archive rows deleted, reported at the end of the run */
	private $deleted = 0;

	/** @var int Running total reported as "assigned new IDs" at the end of the run */
	private $reassigned = 0;

	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Clean up duplicate ar_rev_id, both within archive and between archive and revision.'
		);
		$this->setBatchSize( 10000 );
	}

	/**
	 * @return string The class name, used as this update's key
	 */
	protected function getUpdateKey() {
		return __CLASS__;
	}

	/**
	 * Walk the archive table in batches of ar_rev_id and deduplicate each batch
	 * inside its own transaction.
	 *
	 * @return bool Always true
	 */
	protected function doDBUpdates() {
		$this->output( "Deduplicating ar_rev_id...\n" );
		$dbw = $this->getDB( DB_MASTER );
		// Sanity check. If this is a new install, we don't need to do anything here.
		if ( PopulateArchiveRevId::isNewInstall( $dbw ) ) {
			$this->output( "New install, nothing to do here.\n" );
			return true;
		}

		PopulateArchiveRevId::checkMysqlAutoIncrementBug( $dbw );

		$minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
		$maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );
		$batchSize = $this->getBatchSize();

		$this->arActorQuery = ActorMigration::newMigration()->getJoin( 'ar_user' );
		$revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );

		// MIN()/MAX() yield no usable value when no archive row has an ar_rev_id;
		// in that case there is nothing to scan, so skip the batch loop entirely
		// instead of running one meaningless batch over the range 0-0.
		$hasIds = $minId !== null && $minId !== false && $maxId !== null && $maxId !== false;
		$minId = (int)$minId;
		$maxId = (int)$maxId;

		for ( $id = $minId; $hasIds && $id <= $maxId; $id += $batchSize ) {
			$endId = min( $maxId, $id + $batchSize - 1 );

			$this->beginTransaction( $dbw, __METHOD__ );

			// Lock the archive and revision table rows for the IDs we're checking
			// to try to prevent deletions or undeletions from confusing things.
			$dbw->selectRowCount(
				'archive',
				'1',
				[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
				__METHOD__,
				[ 'FOR UPDATE' ]
			);
			$dbw->selectRowCount(
				'revision',
				'1',
				[ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ],
				__METHOD__,
				[ 'LOCK IN SHARE MODE' ]
			);

			// Figure out the ar_rev_ids we actually need to look at
			// NOTE(review): the array unions (+) below keep the left-hand numeric
			// keys, so this presumably relies on the actor-query arrays being keyed
			// by alias/field name rather than by position - confirm against
			// ActorMigration::getJoin().
			$res = $dbw->select(
				[ 'archive', 'revision' ] + $revActorQuery['tables'],
				[ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
				[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
				__METHOD__,
				[ 'DISTINCT' ],
				[ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
			);
			// Index the live revision rows that collide with an archive row by rev_id.
			$revRows = [];
			foreach ( $res as $row ) {
				$revRows[$row->rev_id] = $row;
			}

			// ar_rev_ids that occur more than once within archive itself.
			$arRevIds = $dbw->selectFieldValues(
				[ 'archive' ],
				'ar_rev_id',
				[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
				__METHOD__,
				[ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
			);
			// Combine both sources of conflict into one de-duplicated list of IDs.
			$arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );

			if ( $arRevIds ) {
				$this->processArRevIds( $dbw, $arRevIds, $revRows );
			}

			$this->output( "... $id-$endId\n" );
			$this->commitTransaction( $dbw, __METHOD__ );
		}

		$this->output(
			"Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
			. "$this->reassigned assigned new IDs.\n"
		);
		return true;
	}

	/**
	 * Process a set of ar_rev_ids
	 *
	 * For every archive row carrying one of the given IDs, decide whether the
	 * row is kept, deleted as a duplicate, or handed to
	 * PopulateArchiveRevId::reassignArRevIds().
	 *
	 * @param IDatabase $dbw
	 * @param int[] $arRevIds IDs to process
	 * @param stdClass[] $revRows Existing revision-table row data
	 */
	private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
		// Select all the data we need for deduplication
		$res = $dbw->select(
			[ 'archive' ] + $this->arActorQuery['tables'],
			[ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
				+ $this->arActorQuery['fields'],
			[ 'ar_rev_id' => $arRevIds ],
			__METHOD__,
			// No query options; the actor joins must be passed as join conditions.
			[],
			$this->arActorQuery['joins']
		);

		// Determine which rows we need to delete or reassign
		$seen = [];
		$toDelete = [];
		$toReassign = [];
		foreach ( $res as $row ) {
			// Revision-table row exists?
			if ( isset( $revRows[$row->ar_rev_id] ) ) {
				$revRow = $revRows[$row->ar_rev_id];

				// Record the rev_id as seen, so the code below will always delete or reassign.
				if ( !isset( $seen[$revRow->rev_id] ) ) {
					$seen[$revRow->rev_id] = [
						'first' => "revision row",
					];
				}

				// Delete the archive row if it seems to be the same regardless
				// of page, because moves can change IDs and titles.
				if ( $row->ar_timestamp === $revRow->rev_timestamp &&
					$row->ar_sha1 === $revRow->rev_sha1 &&
					$row->ar_user === $revRow->rev_user &&
					$row->ar_user_text === $revRow->rev_user_text
				) {
					$this->output(
						"Row $row->ar_id duplicates revision row for rev_id $revRow->rev_id, deleting\n"
					);
					$toDelete[] = $row->ar_id;
					continue;
				}
			}

			$key = $this->getSeenKey( $row );
			if ( !isset( $seen[$row->ar_rev_id] ) ) {
				// This rev_id hasn't even been seen yet, nothing to do besides record it.
				$seen[$row->ar_rev_id] = [
					'first' => "archive row $row->ar_id",
					$key => $row->ar_id,
				];
			} elseif ( !isset( $seen[$row->ar_rev_id][$key] ) ) {
				// The rev_id was seen, but not this particular change. Reassign it.
				$seen[$row->ar_rev_id][$key] = $row->ar_id;
				$this->output(
					"Row $row->ar_id conflicts with {$seen[$row->ar_rev_id]['first']} "
					. "for rev_id $row->ar_rev_id, reassigning\n"
				);
				$toReassign[] = $row->ar_id;
			} else {
				// The rev_id was seen with a row that matches this change. Delete it.
				$this->output(
					"Row $row->ar_id duplicates archive row {$seen[$row->ar_rev_id][$key]} "
					. "for rev_id $row->ar_rev_id, deleting\n"
				);
				$toDelete[] = $row->ar_id;
			}
		}

		// Perform the updates
		if ( $toDelete ) {
			$dbw->delete( 'archive', [ 'ar_id' => $toDelete ], __METHOD__ );
			$this->deleted += $dbw->affectedRows();
		}
		if ( $toReassign ) {
			// NOTE(review): the return value is added to the "assigned new IDs"
			// total, so it is presumably the number of rows updated - confirm
			// against PopulateArchiveRevId::reassignArRevIds().
			$this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $toReassign );
		}
	}

	/**
	 * Make a key identifying a "unique" change from a row
	 *
	 * The key joins namespace, title, timestamp, sha1, user and user text with
	 * newlines; two archive rows with the same key are treated as the same change.
	 *
	 * @param stdClass $row
	 * @return string
	 */
	private function getSeenKey( $row ) {
		return implode( "\n", [
			$row->ar_namespace,
			$row->ar_title,
			$row->ar_timestamp,
			$row->ar_sha1,
			$row->ar_user,
			$row->ar_user_text,
		] );
	}

}

// Register this class as the script to run, then hand control to the maintenance runner.
$maintClass = DeduplicateArchiveRevId::class;
require_once RUN_MAINTENANCE_IF_MAIN;