Merge "ResourceLoader: Deprecate ResourceLoader::makeConfigSetScript"
[mediawiki.git] / includes / import / WikiImporter.php
blob05c482aac4483d3120396cc802f5fe9c44ad9d3a
1 <?php
2 /**
3 * MediaWiki page data importer.
5 * Copyright © 2003,2005 Brooke Vibber <bvibber@wikimedia.org>
6 * https://www.mediawiki.org/
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
23 * @file
24 * @ingroup SpecialPage
27 use MediaWiki\Cache\CacheKeyHelper;
28 use MediaWiki\Config\Config;
29 use MediaWiki\Content\Content;
30 use MediaWiki\Content\IContentHandlerFactory;
31 use MediaWiki\Deferred\DeferredUpdates;
32 use MediaWiki\Deferred\SiteStatsUpdate;
33 use MediaWiki\HookContainer\HookContainer;
34 use MediaWiki\HookContainer\HookRunner;
35 use MediaWiki\Language\Language;
36 use MediaWiki\MainConfigNames;
37 use MediaWiki\Page\PageIdentity;
38 use MediaWiki\Page\WikiPageFactory;
39 use MediaWiki\Permissions\Authority;
40 use MediaWiki\Revision\SlotRecord;
41 use MediaWiki\Revision\SlotRoleRegistry;
42 use MediaWiki\Status\Status;
43 use MediaWiki\Title\ForeignTitle;
44 use MediaWiki\Title\ImportTitleFactory;
45 use MediaWiki\Title\NaiveForeignTitleFactory;
46 use MediaWiki\Title\NaiveImportTitleFactory;
47 use MediaWiki\Title\NamespaceAwareForeignTitleFactory;
48 use MediaWiki\Title\NamespaceImportTitleFactory;
49 use MediaWiki\Title\NamespaceInfo;
50 use MediaWiki\Title\SubpageImportTitleFactory;
51 use MediaWiki\Title\Title;
52 use MediaWiki\Title\TitleFactory;
53 use MediaWiki\User\ExternalUserNames;
54 use Wikimedia\AtEase\AtEase;
55 use Wikimedia\Message\MessageParam;
56 use Wikimedia\Message\MessageSpecifier;
57 use Wikimedia\NormalizedException\NormalizedException;
58 use Wikimedia\Rdbms\IDBAccessObject;
60 /**
61 * XML file reader for the page data importer.
63 * implements Special:Import
64 * @ingroup SpecialPage
66 class WikiImporter {
67 /** @var XMLReader|null */
68 private $reader;
70 /** @var string */
71 private $sourceAdapterId;
73 /** @var array|null */
74 private $foreignNamespaces = null;
76 /** @var callable|null */
77 private $mLogItemCallback;
79 /** @var callable */
80 private $mUploadCallback;
82 /** @var callable|null */
83 private $mRevisionCallback;
85 /** @var callable|null */
86 private $mPageCallback;
88 /** @var callable|null */
89 private $mSiteInfoCallback;
91 /** @var callable|null */
92 private $mPageOutCallback;
94 /** @var callable|null */
95 private $mNoticeCallback;
97 /** @var bool|null */
98 private $mDebug;
100 /** @var bool|null */
101 private $mImportUploads;
103 /** @var string|null */
104 private $mImageBasePath;
106 /** @var bool */
107 private $mNoUpdates = false;
109 /** @var int */
110 private $pageOffset = 0;
112 private ImportTitleFactory $importTitleFactory;
113 private ExternalUserNames $externalUserNames;
115 /** @var array */
116 private $countableCache = [];
118 /** @var bool */
119 private $disableStatisticsUpdate = false;
122 * Authority used for permission checks only (to ensure that the user performing the import is
123 * allowed to edit the pages they're importing). To skip the checks, use UltimateAuthority.
125 * If you want to also log the import actions, see ImportReporter.
127 private Authority $performer;
129 private Config $config;
130 private HookRunner $hookRunner;
131 private Language $contentLanguage;
132 private NamespaceInfo $namespaceInfo;
133 private TitleFactory $titleFactory;
134 private WikiPageFactory $wikiPageFactory;
135 private UploadRevisionImporter $uploadRevisionImporter;
136 private IContentHandlerFactory $contentHandlerFactory;
137 private SlotRoleRegistry $slotRoleRegistry;
/**
 * Creates an ImportXMLReader drawing from the source provided.
 *
 * Stores the injected services, registers the "uploadsource" stream wrapper
 * if it is not registered yet, registers $source with UploadSourceAdapter,
 * calls openReader(), and installs the default callbacks, the default
 * (naive) import title factory and the default "imported" username prefix.
 *
 * @param ImportSource $source
 * @param Authority $performer
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 */
public function __construct(
	ImportSource $source,
	Authority $performer,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->performer = $performer;
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// Only register the stream wrapper when it is not in the list yet.
	$knownWrappers = stream_get_wrappers();
	if ( !in_array( 'uploadsource', $knownWrappers ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
/**
 * Get the XML reader this importer is currently using.
 *
 * @return null|XMLReader
 */
public function getReader() {
	return $this->reader;
}
/**
 * Record an XML error.
 *
 * Despite its name this method does not throw: it passes the message to
 * debug() and always writes it to the debug log.
 *
 * @param string $err
 */
public function throwXmlError( $err ) {
	$this->debug( "FAILURE: $err" );
	wfDebug( "WikiImporter XML error: $err" );
}
/**
 * Write a message to the debug log, but only while debug mode is on
 * (see setDebug()).
 *
 * @param string $data
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: $data" );
}
/**
 * Write a warning to the debug log. Unlike debug(), this is not gated on
 * debug mode.
 *
 * @param string $data
 */
public function warn( $data ) {
	wfDebug( "IMPORT: $data" );
}
/**
 * Report a notice message.
 *
 * When a callable notice callback is set it receives the message key and
 * the parameter list (as one array). Otherwise the rendered message text
 * is written to the debug log.
 *
 * @param string $msg
 * @phpcs:ignore Generic.Files.LineLength
 * @param MessageParam|MessageSpecifier|string|int|float|list<MessageParam|MessageSpecifier|string|int|float> ...$params
 *   See Message::params()
 */
public function notice( $msg, ...$params ) {
	if ( is_callable( $this->mNoticeCallback ) ) {
		call_user_func( $this->mNoticeCallback, $msg, $params );
		return;
	}

	// No ImportReporter -> CLI
	// T177997: the command line importers should call setNoticeCallback()
	// for their own custom callback to echo the notice
	$text = wfMessage( $msg, $params )->text();
	wfDebug( $text );
}
/**
 * Turn debug mode on or off. While it is on, debug() writes to the debug log.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
/**
 * Set 'no updates' mode. In this mode, the link tables will not be updated
 * by the importer. The flag is handed on to every WikiRevision this class
 * creates.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
/**
 * Sets the 'pageOffset' value, so that the import skips the first n-1 pages
 * and starts from the nth page (1-based indexing).
 *
 * @param int $nthPage
 * @since 1.29
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
/**
 * Set a callback that displays notice messages.
 *
 * The assignment is delegated to wfSetVar(), whose result is returned
 * unchanged (presumably the previous callback; confirm against wfSetVar()).
 *
 * @param callable $callback
 * @return callable
 */
public function setNoticeCallback( $callback ) {
	$result = wfSetVar( $this->mNoticeCallback, $callback );
	return $result;
}
/**
 * Sets the action to perform as each new page in the stream is reached.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before this call
 */
public function setPageCallback( $callback ) {
	$oldCallback = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $oldCallback;
}
/**
 * Sets the action to perform as each page in the stream is completed.
 * Callback accepts the page title (as a Title object), a second object
 * with the original title form (in case it's been overridden into a
 * local namespace), and a count of revisions.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before this call
 */
public function setPageOutCallback( $callback ) {
	$oldCallback = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $oldCallback;
}
/**
 * Sets the action to perform as each page revision is reached.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before this call
 */
public function setRevisionCallback( $callback ) {
	$oldCallback = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $oldCallback;
}
/**
 * Sets the action to perform as each file upload version is reached.
 *
 * @param callable $callback
 * @return callable The callback that was set before this call
 */
public function setUploadCallback( $callback ) {
	$oldCallback = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $oldCallback;
}
/**
 * Sets the action to perform as each log item is reached.
 *
 * @param callable $callback
 * @return callable The callback that was set before this call
 */
public function setLogItemCallback( $callback ) {
	$oldCallback = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $oldCallback;
}
/**
 * Sets the action to perform when site info is encountered.
 *
 * @param callable $callback
 * @return callable The callback that was set before this call
 */
public function setSiteInfoCallback( $callback ) {
	$oldCallback = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $oldCallback;
}
/**
 * Sets the factory object used to convert ForeignTitle objects into local
 * Title objects.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
/**
 * Set a target namespace to override the defaults.
 *
 * Passing null restores the naive title factory (no override). A
 * non-negative namespace that exists installs a namespace-overriding title
 * factory. Anything else is rejected and leaves the factory untouched.
 *
 * @param null|int $namespace
 * @return bool False if the namespace was rejected, true otherwise
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$naiveFactory = new NaiveImportTitleFactory(
			$this->contentLanguage,
			$this->namespaceInfo,
			$this->titleFactory
		);
		$this->setImportTitleFactory( $naiveFactory );
		return true;
	}

	$isUsable = $namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) );
	if ( !$isUsable ) {
		return false;
	}

	$namespace = intval( $namespace );
	$namespaceFactory = new NamespaceImportTitleFactory(
		$this->namespaceInfo,
		$this->titleFactory,
		$namespace
	);
	$this->setImportTitleFactory( $namespaceFactory );
	return true;
}
/**
 * Set a target root page under which all pages are imported.
 *
 * null restores the naive title factory; an empty string changes nothing.
 * Any other value must parse to a local title in a namespace that has
 * subpages, otherwise the returned status is fatal.
 *
 * @param null|string $rootpage
 * @return Status
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;

	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
		return $status;
	}

	if ( $rootpage === '' ) {
		return $status;
	}

	$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
	$title = Title::newFromText( $rootpage );

	if ( !$title || $title->isExternal() ) {
		$status->fatal( 'import-rootpage-invalid' );
		return $status;
	}

	if ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
		if ( $title->getNamespace() === NS_MAIN ) {
			$displayNSText = wfMessage( 'blanknamespace' )->text();
		} else {
			$displayNSText = $this->contentLanguage->getNsText( $title->getNamespace() );
		}
		$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		return $status;
	}

	// set namespace to 'all', so the namespace check in processTitle() can pass
	$this->setTargetNamespace( null );
	$this->setImportTitleFactory(
		new SubpageImportTitleFactory(
			$nsInfo,
			$this->titleFactory,
			$title
		)
	);

	return $status;
}
/**
 * Set the directory under which handleUpload() looks for files named by an
 * upload's "rel" field.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
/**
 * Choose whether upload entries found in the dump are processed
 * (see handleUpload()).
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
/**
 * Replace the username-prefixing policy applied to imported contributors.
 *
 * @since 1.31
 * @param string $usernamePrefix Prefix to apply to unknown (and possibly also known) usernames
 * @param bool $assignKnownUsers Whether to apply the prefix to usernames that exist locally
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $userNames;
}
/**
 * Turn off the article-count adjustment done in finishImportPage().
 * Statistics updates can take a lot of time.
 *
 * @since 1.29
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
/**
 * Default per-page callback. Remembers whether the page was countable
 * before the import, so that finishImportPage() can adjust the article
 * count statistics afterwards.
 *
 * @param array $titleAndForeignTitle Two-element array, with Title object at
 * index 0 and ForeignTitle object at index 1
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );

	// The key must be built exactly as finishImportPage() builds it
	// ('title_' + CacheKeyHelper key). A key based on the prefixed text is
	// never found there, so the statistics adjustment silently never runs.
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();

	return true;
}
/**
 * Default per-revision callback, performs the import.
 *
 * A revision whose content handler cannot be used on the target title, or
 * whose content fails to unserialize, is reported through notice() and
 * counted as failed.
 *
 * @param WikiRevision $revision
 * @return bool
 */
public function importRevision( $revision ) {
	$usableHere = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );
	if ( !$usableHere ) {
		$this->notice( 'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice( 'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}
}
/**
 * Default per-log-item callback, performs the import by delegating to
 * WikiRevision::importLogItem().
 *
 * @param WikiRevision $revision The log item, carried in a WikiRevision object
 * @return bool
 */
public function importLogItem( $revision ) {
	$imported = $revision->importLogItem();
	return $imported;
}
/**
 * Default per-upload callback. Hands the revision to the injected
 * UploadRevisionImporter and reports whether the resulting status is good.
 *
 * @param WikiRevision $revision
 * @return bool
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
/**
 * Default page-out callback: adjusts the article count statistics if the
 * page's countable state changed during the import, then runs the
 * AfterImportPage hook. Mostly for hook use.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sRevCount
 * @param array $pageInfo
 * @return bool The hook runner's result
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = $this->wikiPageFactory->newFromTitle( $pageIdentity );

		$page->loadPageData( IDBAccessObject::READ_LATEST );
		$latestRevision = $page->getRevisionRecord();
		if ( $latestRevision === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$update = $page->newPageUpdater( $this->performer )->prepareUpdate();
			$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$countable = $update->isCountable();

			// Only pages seen by the page callback have a cached "before" state.
			if ( array_key_exists( $countKey, $this->countableCache ) ) {
				$wasCountable = $this->countableCache[$countKey];
				if ( $countable != $wasCountable ) {
					$delta = (int)$countable - (int)$wasCountable;
					DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
						'articles' => $delta
					] ) );
				}
			}
		}
	}

	$title = Title::newFromPageIdentity( $pageIdentity );
	return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
		$revCount, $sRevCount, $pageInfo );
}
/**
 * Notify the callback function of site info.
 *
 * @param array $siteInfo
 * @return mixed|false The callback's return value, or false if none is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !$this->mSiteInfoCallback ) {
		return false;
	}

	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
/**
 * Notify the callback function when a new "<page>" is reached.
 * Does nothing when no page callback is set.
 *
 * @param array $title
 */
public function pageCallback( $title ) {
	if ( !$this->mPageCallback ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
/**
 * Notify the callback function when a "</page>" is closed.
 * Does nothing when no page-out callback is set.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount Number of revisions for which callback returned true
 * @param array $pageInfo Associative array of page information
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !$this->mPageOutCallback ) {
		return;
	}

	// Forward exactly the arguments this method received.
	$forwarded = func_get_args();
	call_user_func( $this->mPageOutCallback, ...$forwarded );
}
/**
 * Notify the callback function of a revision.
 *
 * @param WikiRevision $revision
 * @return bool|mixed The callback's return value, or false if none is set
 */
private function revisionCallback( $revision ) {
	if ( !$this->mRevisionCallback ) {
		return false;
	}

	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
/**
 * Notify the callback function of a new log item.
 *
 * @param WikiRevision $revision
 * @return mixed|false The callback's return value, or false if none is set
 */
private function logItemCallback( $revision ) {
	if ( !$this->mLogItemCallback ) {
		return false;
	}

	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
/**
 * Retrieves the contents of the named attribute of the current element.
 *
 * @param string $attr The name of the attribute
 * @return string The value of the attribute or an empty string if it is not set in the current
 *     element.
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );
	return $value ?? '';
}
/**
 * Shouldn't something like this be built-in to XMLReader?
 * Fetches text contents of the current element, assuming
 * no sub-elements or such scary things.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated until the
 * first end-element node. If the input runs out before that, the reader is
 * closed and an empty string is returned.
 *
 * @return string
 * @internal
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textNodeTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];

	$buffer = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;
		if ( $nodeType == XMLReader::END_ELEMENT ) {
			return $buffer;
		}
		if ( in_array( $nodeType, $textNodeTypes ) ) {
			$buffer .= $this->reader->value;
		}
	}

	$this->reader->close();
	return '';
}
/**
 * Primary entry point.
 *
 * Checks that the document element is <mediawiki>, then walks its children
 * and dispatches <siteinfo>, <page> and <logitem> to their handlers. The
 * reader is closed when the method finishes, whether normally or through
 * an exception.
 *
 * @throws Exception
 * @return bool Always true when no exception is thrown
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$previousLoaderState = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $previousLoaderState );
			$error = libxml_get_last_error();
			if ( $error ) {
				throw new NormalizedException( "XML error at line {line}: {message}", [
					'line' => $error->line,
					'message' => $error->message,
				] );
			}

			throw new UnexpectedValueException(
				"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
			);
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$keepReading = $this->reader->read();
		$skipSubtree = false;
		$pagesSeen = 0;
		while ( $keepReading ) {
			$tag = $this->reader->localName;

			// Honour setPageOffset(): jump over everything until the nth page.
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				if ( $pagesSeen < $this->pageOffset ) {
					$keepReading = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;

			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skipSubtree = true;
			}

			if ( $skipSubtree ) {
				// next() steps over the unhandled element's whole subtree.
				$keepReading = $this->reader->next();
				$skipSubtree = false;
				$this->debug( "Skip" );
			} else {
				$keepReading = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $previousLoaderState );
		$this->reader->close();
	}

	return true;
}
/**
 * Read a <siteinfo> element: collect the simple fields and the foreign
 * namespace names, then pass them to the site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$siteInfo = [];

	// Fields that can just be stuffed in the siteInfo object
	$normalFields = [ 'sitename', 'base', 'generator', 'case' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// The key attribute has to be read before nodeContents() moves the reader on.
			$namespaceKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$namespaceKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $normalFields ) ) {
			$siteInfo[$tag] = $this->nodeContents();
		}
	}

	$siteInfo['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $siteInfo );
}
/**
 * Read a <logitem> element into an array and hand it to processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$logInfo = [];

	// Fields that can just be stuffed in the pageInfo object
	$normalFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
/**
 * Build a WikiRevision from parsed log item data and pass it to the log
 * item callback.
 *
 * @param array $logInfo Data collected by handleLogItem(); 'type' and
 *   'action' are read unconditionally
 * @return mixed|false
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}
	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logEntry->setTitle( $logTitle );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	// Attribute the entry: named user first, then IP, then a placeholder.
	if ( isset( $logInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] );
		$logEntry->setUsername( $prefixedName );
	} elseif ( isset( $logInfo['contributor']['ip'] ) ) {
		$logEntry->setUserIP( $logInfo['contributor']['ip'] );
	} else {
		$logEntry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $logEntry );
}
/**
 * Read a <page> element.
 *
 * Simple fields are collected into $pageInfo. The local title is resolved
 * when the first <revision> or <upload> is reached. If that fails, the
 * rest of the page is skipped. The page-out callback runs at the end only
 * if a valid title was resolved.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$normalFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skip = false;
	$badTitle = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'page';
		if ( $atClosingTag ) {
			break;
		}

		$skip = false;

		$tag = $this->reader->localName;

		if ( $badTitle ) {
			// The title is invalid, bail out of this page
			$skip = true;
		} elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
		} elseif ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $title ) ) {
				$title = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $title is either an array of two titles or false.
				if ( is_array( $title ) ) {
					$this->pageCallback( $title );
					[ $pageInfo['_title'], $foreignTitle ] = $title;
				} else {
					$badTitle = true;
					$skip = true;
				}
			}

			if ( $title ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skip = true;
		}
	}

	// @note $pageInfo is only set if a valid $title is processed above with
	//       no error. If we have a valid $title, then pageCallback is called
	//       above, $pageInfo['title'] is set and we do pageOutCallback here.
	//       If $pageInfo['_title'] is not set, then $foreignTitle is also not
	//       set since they both come from $title above.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		/** @var Title $title */
		$title = $pageInfo['_title'];
		$this->pageOutCallback(
			$title,
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
/**
 * Read a <revision> element, process it, and update the revision counters
 * in $pageInfo.
 *
 * @param array &$pageInfo 'revisionCount' is always incremented;
 *   'successfulRevisionCount' only when processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		$hookContinues = $this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo );
		if ( !$hookContinues ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	$imported = $this->processRevision( $pageInfo, $revisionInfo );
	if ( $imported ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
/**
 * Read a <content> element (one slot of a revision) into an array.
 *
 * @return array Map of the recognised field names to their text contents
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		$hookContinues = $this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo );
		if ( !$hookContinues ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
/**
 * Turn the parsed data of one slot into a Content object.
 *
 * @param PageIdentity $page
 * @param int $revisionId Only used in the error message; falsy if unknown
 * @param array $contentInfo Must contain 'text'; may contain 'model' and 'role'
 *
 * @return Content
 * @throws InvalidArgumentException If there is no 'text' field
 * @throws RuntimeException If the text is larger than $wgMaxArticleSize allows
 */
private function makeContent( PageIdentity $page, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new InvalidArgumentException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	// NOTE(review): the final '' entry was on a line lost from the copy this
	// was reconstructed from; confirm it against the upstream file.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$modelIsSizeChecked = !isset( $contentInfo['model'] ) ||
		in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $modelIsSizeChecked &&
		strlen( $contentInfo['text'] ) > $maxArticleSize * 1024
	) {
		$subject = $revisionId ?
			"the revision with ID $revisionId" :
			'a revision';
		throw new RuntimeException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	// Without an explicit model, use the default model of the slot role.
	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	$model = $contentInfo['model'] ?? $this->slotRoleRegistry
		->getRoleHandler( $role )
		->getDefaultModel( $page );
	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	$text = $handler->importTransform( $contentInfo['text'] );

	return $handler->unserializeContent( $text );
}
/**
 * Build a WikiRevision from parsed revision data and pass it to the
 * revision callback.
 *
 * @param array $pageInfo Must contain the resolved local title in '_title'
 * @param array $revisionInfo Data collected by handleRevision()
 * @return mixed|false
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$revision = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$revision->setID( $revisionInfo['id'] );
	}

	$title = $pageInfo['_title'];
	$revision->setTitle( $title );

	// Main slot comes from the revision's own fields.
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$revision->setContent( SlotRecord::MAIN, $mainContent );

	// Further slots come from <content> elements and must name their role.
	foreach ( $revisionInfo['content'] ?? [] as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new RuntimeException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$revision->setContent( $slotInfo['role'], $slotContent );
	}
	$revision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$revision->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of the field marks the edit as minor.
	if ( isset( $revisionInfo['minor'] ) ) {
		$revision->setMinor( true );
	}

	// Attribute the revision: named user first, then IP, then a placeholder.
	if ( isset( $revisionInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $revisionInfo['contributor']['username'] );
		$revision->setUsername( $prefixedName );
	} elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $revisionInfo['contributor']['ip'] );
	} else {
		$revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}
	if ( isset( $revisionInfo['sha1'] ) ) {
		$revision->setSha1Base36( $revisionInfo['sha1'] );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $revision );
}
/**
 * Read an <upload> element and, if upload importing is enabled, process it.
 *
 * Base64 <contents> data is decoded into a temporary file. A file found
 * under the image base path (via the "rel" field) takes precedence over it.
 * A temporary file that ends up unused is deleted here: when it is replaced
 * by another source, or when uploads are not being imported.
 *
 * @param array &$pageInfo
 * @return mixed Result of processUpload(), or null when uploads are not imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the reader is still on the opening tag;
			// nodeContents() moves it on to the closing tag.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				// A second <contents> element replaces the first one's temp file.
				$this->discardTempUploadSource( $uploadInfo );
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		// NOTE(review): 'rel' comes straight from the dump and is not
		// normalised here; confirm that "../" segments are acceptable.
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			// The on-disk file wins, so a decoded temp copy is no longer needed.
			$this->discardTempUploadSource( $uploadInfo );
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}

	// Nothing will consume the upload, so do not leave its temp file behind.
	$this->discardTempUploadSource( $uploadInfo );
	return null;
}

/**
 * Delete the temporary source file recorded in $uploadInfo, if there is one.
 *
 * @param array $uploadInfo Only acted on when 'isTempSrc' is truthy and
 *   'fileSrc' names an existing file
 */
private function discardTempUploadSource( array $uploadInfo ): void {
	if ( empty( $uploadInfo['isTempSrc'] ) || !isset( $uploadInfo['fileSrc'] ) ) {
		return;
	}

	if ( is_file( $uploadInfo['fileSrc'] ) ) {
		unlink( $uploadInfo['fileSrc'] );
	}
}
/**
 * Write the given data to a newly created temporary file.
 *
 * @param string $contents Data to write
 * @return string Path of the temporary file
 * @throws RuntimeException If the temporary file cannot be created or written
 */
private function dumpTemp( $contents ) {
	$filename = tempnam( wfTempDir(), 'importupload' );
	if ( $filename === false ) {
		throw new RuntimeException(
			'Could not create a temporary file for the imported upload.'
		);
	}

	// Without this check a failed write would hand back the path of an
	// empty file as if it held the upload data.
	if ( file_put_contents( $filename, $contents ) === false ) {
		unlink( $filename );
		throw new RuntimeException(
			"Could not write the imported upload data to $filename."
		);
	}

	return $filename;
}
/**
 * Build a WikiRevision describing an uploaded file and pass it to the
 * upload callback.
 *
 * @param array $pageInfo Page data; 'id' and '_title' are read unconditionally
 * @param array $uploadInfo Upload data. 'filename' and 'src' are read
 *   unconditionally. Optional keys: 'text', 'timestamp', 'archivename',
 *   'fileSrc', 'isTempSrc', 'sha1base36', 'size', 'comment', 'contributor'
 * @return mixed Whatever the upload callback returns
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$revision = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$revision->setTitle( $title );
	$revision->setID( $revId );
	// Same fallback as processRevision() uses for a missing timestamp
	$revision->setTimestamp( $uploadInfo['timestamp'] ?? wfTimestampNow() );
	$revision->setContent( SlotRecord::MAIN, $content );
	$revision->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$revision->setArchiveName( $uploadInfo['archivename'] );
	}
	$revision->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$revision->setFileSrc( $uploadInfo['fileSrc'],
			!empty( $uploadInfo['isTempSrc'] )
		);
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$revision->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	// A missing size still ends up as 0, now without an undefined-key warning
	$revision->setSize( intval( $uploadInfo['size'] ?? 0 ) );
	// Like processRevision(), only set a comment that is actually present
	// instead of passing null to the setter
	if ( isset( $uploadInfo['comment'] ) ) {
		$revision->setComment( $uploadInfo['comment'] );
	}

	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] )
		);
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $revision );
}
/**
 * Collect the 'id', 'ip' and 'username' children of the current
 * <contributor> element.
 *
 * @return array Map of the recognised child tag names to their text
 *   contents; empty when the element is empty
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	$contributor = [];
	if ( $this->reader->isEmptyElement ) {
		return $contributor;
	}

	while ( $this->reader->read() ) {
		$nodeName = $this->reader->localName;
		$isClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT;
		if ( $isClosingTag && $nodeName == 'contributor' ) {
			// Reached </contributor>
			break;
		}

		// Every other node name is ignored
		switch ( $nodeName ) {
			case 'id':
			case 'ip':
			case 'username':
				$contributor[$nodeName] = $this->nodeContents();
				break;
		}
	}

	return $contributor;
}
/**
 * Turn a title string from the import source into a local title, checking
 * that the page may be imported.
 *
 * A notice is emitted and false returned when no local title can be
 * created, or when the title is external, cannot exist, or cannot be
 * edited by the performer.
 *
 * @param string $text Title text from the import source
 * @param string|null $ns Namespace ID from the import source; cast to int
 * @return array|false [ local title, foreign title ], or false if rejected
 */
private function processTitle( $text, $ns = null ) {
	// Without foreign namespace data, fall back to the naive factory
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, (int)$ns );
	$localTitle = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	if ( $localTitle === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	// The checks run in this order; the first one that fails picks the notice
	if ( $localTitle->isExternal() ) {
		$rejection = 'import-error-interwiki';
	} elseif ( !$localTitle->canExist() ) {
		$rejection = 'import-error-special';
	} elseif ( !$this->performer->definitelyCan( 'edit', $localTitle ) ) {
		# Do not import if the importing wiki user cannot edit this page
		$rejection = 'import-error-edit';
	} else {
		$rejection = null;
	}

	if ( $rejection !== null ) {
		$this->notice( $rejection, $localTitle->getPrefixedText() );
		return false;
	}

	return [ $localTitle, $foreignTitle ];
}
/**
 * Open the XMLReader connected to the source adapter id
 *
 * The libxml entity loader is enabled while the reader is opened and put
 * back to its previous state afterwards, whether or not opening succeeds.
 *
 * @throws RuntimeException If the reader cannot be opened
 * @suppress PhanStaticCallToNonStatic, UnusedSuppression -- for PHP 7.4 support
 */
private function openReader() {
	$uri = 'uploadsource://' . $this->sourceAdapterId;

	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		if ( PHP_VERSION_ID >= 80000 ) {
			// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
			$reader = XMLReader::open( $uri, null, LIBXML_PARSEHUGE );
			$status = $reader instanceof XMLReader;
			if ( $status ) {
				$this->reader = $reader;
			}
		} else {
			// A static call generated a deprecation warning prior to PHP 8.0
			$this->reader = new XMLReader;
			$status = $this->reader->open( $uri, null, LIBXML_PARSEHUGE );
		}

		if ( !$status ) {
			// libxml_get_last_error() returns false when libxml recorded no
			// error, so the message property cannot be read unconditionally
			$error = libxml_get_last_error();
			$detail = $error ? $error->message : 'unknown error';
			throw new RuntimeException(
				'Encountered an internal error while initializing WikiImporter object: ' . $detail
			);
		}
	} finally {
		// Restore the previous entity loader state on every exit path
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
/**
 * Check the syntax of the given xml
 *
 * Does nothing when the source is not seekable. Otherwise the reader is
 * run to the end of the input, any error reported by libxml is turned
 * into an exception, and on success the source is rewound and the reader
 * reopened.
 *
 * @throws RuntimeException If libxml reports an error after reading
 */
private function syntaxCheckXML() {
	if ( !UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId ) ) {
		return;
	}

	AtEase::suppressWarnings();
	$previousLoaderState = libxml_disable_entity_loader( false );
	try {
		// Read through the whole input; the nodes themselves are not used
		do {
			$hasMore = $this->reader->read();
		} while ( $hasMore );

		$parseError = libxml_get_last_error();
		if ( $parseError ) {
			$errorMessage = "XML error at line {$parseError->line}: {$parseError->message}";
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new RuntimeException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $previousLoaderState );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}