Merge "mediawiki.diff: Replace pixel font-size value for relative one"
[mediawiki.git] / maintenance / categoryChangesAsRdf.php
blob8a66f0f5f45b4e2c952226c21cadd0711febbb5e
<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

20 use MediaWiki\Category\CategoriesRdf;
21 use MediaWiki\MainConfigNames;
22 use MediaWiki\Maintenance\Maintenance;
23 use MediaWiki\Utils\MWTimestamp;
24 use Wikimedia\Purtle\RdfWriter;
25 use Wikimedia\Purtle\TurtleRdfWriter;
26 use Wikimedia\Rdbms\IReadableDatabase;

// @codeCoverageIgnoreStart
// Load Maintenance.php from this script's own directory before the class is declared.
require_once __DIR__ . '/Maintenance.php';
// @codeCoverageIgnoreEnd

/**
 * Maintenance script to provide RDF representation of the recent changes in category tree.
 *
 * The script reads the recentchanges table for a given time window and writes
 * SPARQL Update statements (DELETE / INSERT DATA) describing the category
 * changes found in that window to the chosen output.
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class CategoryChangesAsRdf extends Maintenance {
	/**
	 * Insert query. The single %s placeholder receives the accumulated RDF text.
	 */
	private const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

	/**
	 * Delete query. The single %s placeholder receives a space-separated
	 * list of category URIs, each wrapped in <>.
	 */
	private const SPARQL_DELETE = <<<SPARQLD
DELETE {
?category ?x ?y
} WHERE {
   ?category ?x ?y
   VALUES ?category {
     %s
   }
};

SPARQLD;

	/**
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/** @var string */
	private $startTS;
	/** @var string */
	private $endTS;

	/**
	 * List of processed page IDs,
	 * so we don't try to process same thing twice
	 * @var true[]
	 */
	protected $processed = [];

	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of category changes in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false,
			true, 'o' );
		$this->addOption( 'start', 'Starting timestamp (inclusive), in ISO or MediaWiki format.',
			true, true, 's' );
		$this->addOption( 'end', 'Ending timestamp (exclusive), in ISO or MediaWiki format.', true,
			true, 'e' );
	}

	/**
	 * Initialize external service classes.
	 */
	public function initialize() {
		// SPARQL Update syntax is close to Turtle format, so we can use Turtle writer.
		$this->rdfWriter = new TurtleRdfWriter();
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
	}

	/**
	 * Run the script: validate the requested time window, open the output,
	 * write the prefix declarations, then emit updates for each kind of
	 * change followed by the dump timestamp update.
	 */
	public function execute() {
		$this->initialize();
		$startTS = new MWTimestamp( $this->getOption( "start" ) );

		$endTS = new MWTimestamp( $this->getOption( "end" ) );
		$now = new MWTimestamp();
		$rcMaxAge = $this->getConfig()->get( MainConfigNames::RCMaxAge );

		// Both checks only report the problem through error(); there is no
		// return here, so processing continues with the requested window.
		if ( (int)$now->getTimestamp( TS_UNIX ) - (int)$startTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
			$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
		}
		if ( (int)$now->getTimestamp( TS_UNIX ) - (int)$endTS->getTimestamp( TS_UNIX ) > $rcMaxAge ) {
			$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
		}

		$this->startTS = $startTS->getTimestamp();
		$this->endTS = $endTS->getTimestamp();

		$outFile = $this->getOption( 'output', 'php://stdout' );
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'wb' );
		if ( $output === false ) {
			// Without this check every later fwrite() would receive `false`
			// instead of a stream and fail with an unhelpful type error.
			$this->fatalError( "Unable to open output file '$outFile' for writing." );
		}

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$prefixes = $this->getRdf();
		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
		// Also strip dot at the end.
		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
		fwrite( $output, $prefixes );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		// Deletes go first because if the page was deleted, other changes
		// do not matter. This only gets true deletes, i.e. not pages that were restored.
		$this->handleDeletes( $dbr, $output );
		// Moves go before additions because if category is moved, we should not process creation
		// as it would produce wrong data - because create row has old title
		$this->handleMoves( $dbr, $output );
		// We need to handle restores too since delete may have happened in previous update.
		$this->handleRestores( $dbr, $output );
		// Process newly added pages
		$this->handleAdds( $dbr, $output );
		// Process page edits
		$this->handleEdits( $dbr, $output );
		// Process categorization changes
		$this->handleCategorization( $dbr, $output );

		// Update timestamp
		fwrite( $output, $this->updateTS( $this->endTS ) );

		// Release the handle now that everything has been written.
		fclose( $output );
	}

	/**
	 * Get the text of SPARQL INSERT DATA clause
	 * @return string Empty string when no RDF has been accumulated
	 */
	private function getInsertRdf() {
		$rdfText = $this->getRdf();
		if ( !$rdfText ) {
			return "";
		}
		return sprintf( self::SPARQL_INSERT, $rdfText );
	}

	/**
	 * Get SPARQL for updating set of categories
	 * @param IReadableDatabase $dbr
	 * @param string[] $deleteUrls List of URIs to be deleted, with <>
	 * @param string[] $pages List of categories: id => title
	 * @param string $mark Marks which operation requests the query
	 * @return string SPARQL query, or empty string when there is nothing to delete
	 */
	private function getCategoriesUpdate( IReadableDatabase $dbr, $deleteUrls, $pages, $mark ) {
		if ( !$deleteUrls ) {
			return "";
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
		}

		return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) ) .
			$this->getInsertRdf();
	}

	/**
	 * Write parent data for a set of categories.
	 * The list has the child categories.
	 * @param IReadableDatabase $dbr
	 * @param string[] $pages List of child categories: id => title
	 */
	private function writeParentCategories( IReadableDatabase $dbr, $pages ) {
		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ), __METHOD__ ) as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
		}
	}

	/**
	 * Generate SPARQL Update code for updating dump timestamp
	 * @param string|int $timestamp Timestamp for last change
	 * @return string SPARQL Update query for timestamp.
	 */
	public function updateTS( $timestamp ) {
		$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
		$ts = wfTimestamp( TS_ISO_8601, $timestamp );
		$tsQuery = <<<SPARQL
DELETE {
  $dumpUrl schema:dateModified ?o .
}
WHERE {
  $dumpUrl schema:dateModified ?o .
};
INSERT DATA {
  $dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
};

SPARQL;
		return $tsQuery;
	}

	/**
	 * Set up standard iterator for retrieving category changes.
	 * @param IReadableDatabase $dbr
	 * @param string[] $columns List of additional fields to get
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	private function setupChangesIterator(
		IReadableDatabase $dbr,
		array $columns,
		string $fname
	) {
		$it = new BatchRowIterator( $dbr,
			$dbr->newSelectQueryBuilder()
				->from( 'recentchanges' )
				->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ] )
				->leftJoin( 'category', null, [ 'cat_title = rc_title' ] )
				->select( array_merge( $columns, [
					'rc_title',
					'rc_cur_id',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files'
				] ) )
				->caller( $fname ),
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		return $it;
	}

	/**
	 * Fetch newly created categories
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getNewCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = $this->setupChangesIterator( $dbr, [], $fname );
		$it->sqb->conds( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 1,
		] );
		return $it;
	}

	/**
	 * Fetch moved categories
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getMovedCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = $this->setupChangesIterator(
			$dbr,
			[ 'page_title', 'page_namespace' ],
			$fname
		);
		$it->sqb->conds( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'move',
			'rc_type' => RC_LOG,
		] );
		// The page row supplies the current title and namespace of the moved page.
		$it->sqb->join( 'page', null, 'rc_cur_id = page_id' );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch deleted categories
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getDeletedCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = new BatchRowIterator( $dbr,
			$dbr->newSelectQueryBuilder()
				->from( 'recentchanges' )
				->select( [ 'rc_cur_id', 'rc_title' ] )
				->where( [
					'rc_namespace' => NS_CATEGORY,
					'rc_new' => 0,
					'rc_log_type' => 'delete',
					'rc_log_action' => 'delete',
					'rc_type' => RC_LOG,
					// We will fetch ones that do not have page record. If they do,
					// this means they were restored, thus restoring handler will pick it up.
					'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
				] )
				->caller( $fname ),
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch restored categories
	 * @param IReadableDatabase $dbr
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getRestoredCatsIterator( IReadableDatabase $dbr, $fname ) {
		$it = $this->setupChangesIterator( $dbr, [], $fname );
		$it->sqb->conds( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'restore',
			'rc_type' => RC_LOG,
			// We will only fetch ones that have page record
			'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch categorization changes or edits
	 * @param IReadableDatabase $dbr
	 * @param int $type Value matched against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
	 * @param string $fname Name of the calling function
	 * @return BatchRowIterator
	 */
	protected function getChangedCatsIterator( IReadableDatabase $dbr, $type, $fname ) {
		$it = $this->setupChangesIterator( $dbr, [], $fname );
		$it->sqb->conds( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_type' => $type,
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Add timestamp limits to iterator
	 * (start inclusive, end exclusive).
	 * @param BatchRowIterator $it Iterator
	 * @param IReadableDatabase $dbr
	 */
	private function addTimestampConditions( BatchRowIterator $it, IReadableDatabase $dbr ) {
		$it->sqb->conds( [
			$dbr->expr( 'rc_timestamp', '>=', $dbr->timestamp( $this->startTS ) ),
			$dbr->expr( 'rc_timestamp', '<', $dbr->timestamp( $this->endTS ) ),
		] );
	}

	/**
	 * Need to force index, somehow on terbium the optimizer chooses wrong one
	 * @param BatchRowIterator $it Iterator whose query gets the index hint
	 */
	private function addIndex( BatchRowIterator $it ) {
		$it->sqb->options( [
			'USE INDEX' => [ 'recentchanges' => 'rc_new_name_timestamp' ]
		] );
	}

	/**
	 * Get iterator for links for categories.
	 * @param IReadableDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @param string $fname Name of the calling function
	 * @return Traversable
	 */
	protected function getCategoryLinksIterator( IReadableDatabase $dbr, array $ids, $fname ) {
		$it = new BatchRowIterator(
			$dbr,
			$dbr->newSelectQueryBuilder()
				->from( 'categorylinks' )
				->select( [ 'cl_from', 'cl_to' ] )
				->where( [
					'cl_type' => 'subcat',
					'cl_from' => $ids
				] )
				->caller( $fname ),
			[ 'cl_from', 'cl_to' ],
			$this->mBatchSize
		);
		// Flatten the batches so callers iterate over individual rows.
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Get accumulated RDF.
	 * @return string
	 */
	public function getRdf() {
		return $this->rdfWriter->drain();
	}

	/**
	 * Handle category deletes.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleDeletes( IReadableDatabase $dbr, $output ) {
		// This only does "true" deletes - i.e. those where the page stays deleted;
		// the iterator skips rows whose page record still exists.
		foreach ( $this->getDeletedCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// This can produce duplicates, we don't care
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$this->processed[$row->rc_cur_id] = true;
			}
			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" ) );
		}
	}

	/**
	 * Write category data to RDF.
	 * The page count passed on is cat_pages minus subcategories and files.
	 * @param stdClass $row Database row
	 */
	private function writeCategoryData( $row ) {
		$this->categoriesRdf->writeCategoryData(
			$row->rc_title,
			$row->pp_propname === 'hiddencat',
			(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
			(int)$row->cat_subcats
		);
	}

	/**
	 * Handle category moves: delete the data stored under the old title and,
	 * when the page is still in the Category namespace, write it again under
	 * its current title.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleMoves( IReadableDatabase $dbr, $output ) {
		foreach ( $this->getMovedCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// The old title is always scheduled for deletion, even when the
				// page itself has been handled already.
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}

				if ( $row->page_namespace != NS_CATEGORY ) {
					// If page was moved out of Category:, we'll just delete
					continue;
				}
				// From here on the row describes the page under its current title.
				$row->rc_title = $row->page_title;
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->page_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" ) );
		}
	}

	/**
	 * Handle category restores: insert data for restored categories that
	 * have not been processed yet.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleRestores( IReadableDatabase $dbr, $output ) {
		fwrite( $output, "# Restores\n" );

		// This will only find those restores that were not deleted later.
		foreach ( $this->getRestoredCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( !$pages ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );

			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle newly created categories: insert data for new categories that
	 * have not been processed yet.
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleAdds( IReadableDatabase $dbr, $output ) {
		fwrite( $output, "# Additions\n" );

		foreach ( $this->getNewCatsIterator( $dbr, __METHOD__ ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( !$pages ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle edits for category texts
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleEdits( IReadableDatabase $dbr, $output ) {
		// Editing category can change hidden flag and add new parents.
		// TODO: it's pretty expensive to update all edited categories, and most edits
		// aren't actually interesting for us. Some way to know which are interesting?
		// We can capture recategorization on the next step, but not change in hidden status.

		foreach ( $this->getChangedCatsIterator( $dbr, RC_EDIT, __METHOD__ ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// Note that on categorization event, cur_id points to
				// the child page, not the parent category!
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" ) );
		}
	}

	/**
	 * Handles categorization changes
	 * @param IReadableDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleCategorization( IReadableDatabase $dbr, $output ) {
		// Parent titles already written by this method, across all batches.
		$processedTitle = [];

		// Categorization change can add new parents and change counts
		// for the parent category.

		foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE, __METHOD__ ) as $batch ) {
			/*
			 * Note that on categorization event, cur_id points to
			 * the child page, not the parent category!
			 * So we need to have a two-stage process, since we have ID from one
			 * category and title from another, and we need both for proper updates.
			 * TODO: For now, we do full update even though some data hasn't changed,
			 * e.g. parents for parent cat and counts for child cat.
			 */
			$childPages = [];
			$parentCats = [];
			foreach ( $batch as $row ) {
				$childPages[$row->rc_cur_id] = true;
				$parentCats[$row->rc_title] = true;
			}

			$pages = [];
			$deleteUrls = [];

			if ( $childPages ) {
				// Load child rows by ID
				$childRows = $dbr->newSelectQueryBuilder()
					->select( [
						'page_id',
						'rc_title' => 'page_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					] )
					->from( 'page' )
					->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
					->leftJoin( 'category', null, [ 'cat_title = page_title' ] )
					->where( [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ] )
					->caller( __METHOD__ )->fetchResultSet();
				foreach ( $childRows as $row ) {
					if ( isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					if ( $row->page_id ) {
						$pages[$row->page_id] = $row->rc_title;
						$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
						$this->processed[$row->page_id] = true;
					}
				}
			}

			if ( $parentCats ) {
				// Load parent rows by title
				$parentRows = $dbr->newSelectQueryBuilder()
					->select( [
						'page_id',
						'rc_title' => 'cat_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					] )
					->from( 'category' )
					->leftJoin( 'page', null, [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ] )
					->leftJoin( 'page_props', null, [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ] )
					->where( [ 'cat_title' => array_map( 'strval', array_keys( $parentCats ) ) ] )
					->caller( __METHOD__ )->fetchResultSet();
				foreach ( $parentRows as $row ) {
					if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					if ( isset( $processedTitle[$row->rc_title] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					if ( $row->page_id ) {
						$pages[$row->page_id] = $row->rc_title;
						$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
						$this->processed[$row->page_id] = true;
					}
					// page_id comes from a left join and may be empty, so the
					// title is tracked separately.
					$processedTitle[$row->rc_title] = true;
				}
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
		}
	}
}

// @codeCoverageIgnoreStart
// Name this class as the maintenance script, then include the runner file.
$maintClass = CategoryChangesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd