3 * MediaWiki page data importer.
5 * Copyright © 2003,2005 Brooke Vibber <bvibber@wikimedia.org>
6 * https://www.mediawiki.org/
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
24 * @ingroup SpecialPage
27 use MediaWiki\Cache\CacheKeyHelper
;
28 use MediaWiki\Config\Config
;
29 use MediaWiki\Content\Content
;
30 use MediaWiki\Content\IContentHandlerFactory
;
31 use MediaWiki\Deferred\DeferredUpdates
;
32 use MediaWiki\Deferred\SiteStatsUpdate
;
33 use MediaWiki\HookContainer\HookContainer
;
34 use MediaWiki\HookContainer\HookRunner
;
35 use MediaWiki\Language\Language
;
36 use MediaWiki\MainConfigNames
;
37 use MediaWiki\Page\PageIdentity
;
38 use MediaWiki\Page\WikiPageFactory
;
39 use MediaWiki\Permissions\Authority
;
40 use MediaWiki\Revision\SlotRecord
;
41 use MediaWiki\Revision\SlotRoleRegistry
;
42 use MediaWiki\Status\Status
;
43 use MediaWiki\Title\ForeignTitle
;
44 use MediaWiki\Title\ImportTitleFactory
;
45 use MediaWiki\Title\NaiveForeignTitleFactory
;
46 use MediaWiki\Title\NaiveImportTitleFactory
;
47 use MediaWiki\Title\NamespaceAwareForeignTitleFactory
;
48 use MediaWiki\Title\NamespaceImportTitleFactory
;
49 use MediaWiki\Title\NamespaceInfo
;
50 use MediaWiki\Title\SubpageImportTitleFactory
;
51 use MediaWiki\Title\Title
;
52 use MediaWiki\Title\TitleFactory
;
53 use MediaWiki\User\ExternalUserNames
;
54 use Wikimedia\AtEase\AtEase
;
55 use Wikimedia\Message\MessageParam
;
56 use Wikimedia\Message\MessageSpecifier
;
57 use Wikimedia\NormalizedException\NormalizedException
;
58 use Wikimedia\Rdbms\IDBAccessObject
;
61 * XML file reader for the page data importer.
63 * implements Special:Import
64 * @ingroup SpecialPage
67 /** @var XMLReader|null */
71 private $sourceAdapterId;
73 /** @var array|null */
74 private $foreignNamespaces = null;
76 /** @var callable|null */
77 private $mLogItemCallback;
80 private $mUploadCallback;
82 /** @var callable|null */
83 private $mRevisionCallback;
85 /** @var callable|null */
86 private $mPageCallback;
88 /** @var callable|null */
89 private $mSiteInfoCallback;
91 /** @var callable|null */
92 private $mPageOutCallback;
94 /** @var callable|null */
95 private $mNoticeCallback;
100 /** @var bool|null */
101 private $mImportUploads;
103 /** @var string|null */
104 private $mImageBasePath;
107 private $mNoUpdates = false;
110 private $pageOffset = 0;
112 private ImportTitleFactory
$importTitleFactory;
113 private ExternalUserNames
$externalUserNames;
116 private $countableCache = [];
119 private $disableStatisticsUpdate = false;
122 * Authority used for permission checks only (to ensure that the user performing the import is
123 * allowed to edit the pages they're importing). To skip the checks, use UltimateAuthority.
125 * If you want to also log the import actions, see ImportReporter.
127 private Authority
$performer;
129 private Config
$config;
130 private HookRunner
$hookRunner;
131 private Language
$contentLanguage;
132 private NamespaceInfo
$namespaceInfo;
133 private TitleFactory
$titleFactory;
134 private WikiPageFactory
$wikiPageFactory;
135 private UploadRevisionImporter
$uploadRevisionImporter;
136 private IContentHandlerFactory
$contentHandlerFactory;
137 private SlotRoleRegistry
$slotRoleRegistry;
140 * Creates an ImportXMLReader drawing from the source provided
142 public function __construct(
143 ImportSource
$source,
144 Authority
$performer,
146 HookContainer
$hookContainer,
147 Language
$contentLanguage,
148 NamespaceInfo
$namespaceInfo,
149 TitleFactory
$titleFactory,
150 WikiPageFactory
$wikiPageFactory,
151 UploadRevisionImporter
$uploadRevisionImporter,
152 IContentHandlerFactory
$contentHandlerFactory,
153 SlotRoleRegistry
$slotRoleRegistry
155 $this->performer
= $performer;
156 $this->config
= $config;
157 $this->hookRunner
= new HookRunner( $hookContainer );
158 $this->contentLanguage
= $contentLanguage;
159 $this->namespaceInfo
= $namespaceInfo;
160 $this->titleFactory
= $titleFactory;
161 $this->wikiPageFactory
= $wikiPageFactory;
162 $this->uploadRevisionImporter
= $uploadRevisionImporter;
163 $this->contentHandlerFactory
= $contentHandlerFactory;
164 $this->slotRoleRegistry
= $slotRoleRegistry;
166 if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
167 stream_wrapper_register( 'uploadsource', UploadSourceAdapter
::class );
169 $this->sourceAdapterId
= UploadSourceAdapter
::registerSource( $source );
174 $this->setPageCallback( [ $this, 'beforeImportPage' ] );
175 $this->setRevisionCallback( [ $this, "importRevision" ] );
176 $this->setUploadCallback( [ $this, 'importUpload' ] );
177 $this->setLogItemCallback( [ $this, 'importLogItem' ] );
178 $this->setPageOutCallback( [ $this, 'finishImportPage' ] );
180 $this->importTitleFactory
= new NaiveImportTitleFactory(
181 $this->contentLanguage
,
182 $this->namespaceInfo
,
185 $this->externalUserNames
= new ExternalUserNames( 'imported', false );
189 * @return null|XMLReader
191 public function getReader() {
192 return $this->reader
;
198 public function throwXmlError( $err ) {
199 $this->debug( "FAILURE: $err" );
200 wfDebug( "WikiImporter XML error: $err" );
204 * @param string $data
206 public function debug( $data ) {
207 if ( $this->mDebug
) {
208 wfDebug( "IMPORT: $data" );
213 * @param string $data
215 public function warn( $data ) {
216 wfDebug( "IMPORT: $data" );
/**
 * Report an import notice, either through the registered notice callback
 * or, when none is callable, through the debug log.
 *
 * @param string $msg Message key
 * @phpcs:ignore Generic.Files.LineLength
 * @param MessageParam|MessageSpecifier|string|int|float|list<MessageParam|MessageSpecifier|string|int|float> ...$params
 *   See Message::params()
 */
public function notice( $msg, ...$params ) {
	$reporter = $this->mNoticeCallback;

	if ( !is_callable( $reporter ) ) {
		// No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	// The callback receives the parameters as a single list, not spread out.
	call_user_func( $reporter, $msg, $params );
}
239 public function setDebug( $debug ) {
240 $this->mDebug
= $debug;
244 * Set 'no updates' mode. In this mode, the link tables will not be updated by the importer
245 * @param bool $noupdates
247 public function setNoUpdates( $noupdates ) {
248 $this->mNoUpdates
= $noupdates;
252 * Sets 'pageOffset' value. So it will skip the first n-1 pages
253 * and start from the nth page. It's 1-based indexing.
254 * @param int $nthPage
257 public function setPageOffset( $nthPage ) {
258 $this->pageOffset
= $nthPage;
262 * Set a callback that displays notice messages
264 * @param callable $callback
267 public function setNoticeCallback( $callback ) {
268 return wfSetVar( $this->mNoticeCallback
, $callback );
272 * Sets the action to perform as each new page in the stream is reached.
273 * @param callable|null $callback
274 * @return callable|null
276 public function setPageCallback( $callback ) {
277 $previous = $this->mPageCallback
;
278 $this->mPageCallback
= $callback;
283 * Sets the action to perform as each page in the stream is completed.
284 * Callback accepts the page title (as a Title object), a second object
285 * with the original title form (in case it's been overridden into a
286 * local namespace), and a count of revisions.
288 * @param callable|null $callback
289 * @return callable|null
291 public function setPageOutCallback( $callback ) {
292 $previous = $this->mPageOutCallback
;
293 $this->mPageOutCallback
= $callback;
298 * Sets the action to perform as each page revision is reached.
299 * @param callable|null $callback
300 * @return callable|null
302 public function setRevisionCallback( $callback ) {
303 $previous = $this->mRevisionCallback
;
304 $this->mRevisionCallback
= $callback;
309 * Sets the action to perform as each file upload version is reached.
310 * @param callable $callback
313 public function setUploadCallback( $callback ) {
314 $previous = $this->mUploadCallback
;
315 $this->mUploadCallback
= $callback;
320 * Sets the action to perform as each log item reached.
321 * @param callable $callback
324 public function setLogItemCallback( $callback ) {
325 $previous = $this->mLogItemCallback
;
326 $this->mLogItemCallback
= $callback;
331 * Sets the action to perform when site info is encountered
332 * @param callable $callback
335 public function setSiteInfoCallback( $callback ) {
336 $previous = $this->mSiteInfoCallback
;
337 $this->mSiteInfoCallback
= $callback;
342 * Sets the factory object to use to convert ForeignTitle objects into local
344 * @param ImportTitleFactory $factory
346 public function setImportTitleFactory( $factory ) {
347 $this->importTitleFactory
= $factory;
351 * Set a target namespace to override the defaults
352 * @param null|int $namespace
355 public function setTargetNamespace( $namespace ) {
356 if ( $namespace === null ) {
357 // Don't override namespaces
358 $this->setImportTitleFactory(
359 new NaiveImportTitleFactory(
360 $this->contentLanguage
,
361 $this->namespaceInfo
,
368 $this->namespaceInfo
->exists( intval( $namespace ) )
370 $namespace = intval( $namespace );
371 $this->setImportTitleFactory(
372 new NamespaceImportTitleFactory(
373 $this->namespaceInfo
,
385 * Set a target root page under which all pages are imported
386 * @param null|string $rootpage
389 public function setTargetRootPage( $rootpage ) {
390 $status = Status
::newGood();
391 $nsInfo = $this->namespaceInfo
;
392 if ( $rootpage === null ) {
394 $this->setImportTitleFactory(
395 new NaiveImportTitleFactory(
396 $this->contentLanguage
,
401 } elseif ( $rootpage !== '' ) {
402 $rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
403 $title = Title
::newFromText( $rootpage );
405 if ( !$title ||
$title->isExternal() ) {
406 $status->fatal( 'import-rootpage-invalid' );
407 } elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
408 $displayNSText = $title->getNamespace() === NS_MAIN
409 ?
wfMessage( 'blanknamespace' )->text()
410 : $this->contentLanguage
->getNsText( $title->getNamespace() );
411 $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
413 // set namespace to 'all', so the namespace check in processTitle() can pass
414 $this->setTargetNamespace( null );
415 $this->setImportTitleFactory(
416 new SubpageImportTitleFactory(
430 public function setImageBasePath( $dir ) {
431 $this->mImageBasePath
= $dir;
435 * @param bool $import
437 public function setImportUploads( $import ) {
438 $this->mImportUploads
= $import;
443 * @param string $usernamePrefix Prefix to apply to unknown (and possibly also known) usernames
444 * @param bool $assignKnownUsers Whether to apply the prefix to usernames that exist locally
446 public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
447 $this->externalUserNames
= new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
451 * Statistics update can cause a lot of time
454 public function disableStatisticsUpdate() {
455 $this->disableStatisticsUpdate
= true;
/**
 * Default per-page callback. Records whether the target page is countable
 * before any revision is imported, so that finishImportPage() can work out
 * how the article count changed.
 *
 * @param array $titleAndForeignTitle Two-element array, with Title object at
 *   index 0 and ForeignTitle object at index 1
 * @return bool
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );

	// The key must be built exactly as finishImportPage() builds it
	// ('title_' . CacheKeyHelper::getKeyForPage()). Keying on the prefixed text
	// instead means the lookup there never matches and the article count
	// adjustment is silently skipped.
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();

	return true;
}
472 * Default per-revision callback, performs the import.
473 * @param WikiRevision $revision
476 public function importRevision( $revision ) {
477 if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
478 $this->notice( 'import-error-bad-location',
479 $revision->getTitle()->getPrefixedText(),
481 $revision->getModel(),
482 $revision->getFormat()
489 return $revision->importOldRevision();
490 } catch ( MWContentSerializationException
$ex ) {
491 $this->notice( 'import-error-unserialize',
492 $revision->getTitle()->getPrefixedText(),
494 $revision->getModel(),
495 $revision->getFormat()
503 * Default per-revision callback, performs the import.
504 * @param WikiRevision $revision
507 public function importLogItem( $revision ) {
508 return $revision->importLogItem();
513 * @param WikiRevision $revision
516 public function importUpload( $revision ) {
517 $status = $this->uploadRevisionImporter
->import( $revision );
518 return $status->isGood();
/**
 * Default page-completed callback. Mostly for hook use.
 *
 * Unless statistics updates are disabled, queues an article-count adjustment
 * when the page's countable state differs from the value cached for it in
 * $this->countableCache, then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sRevCount
 * @param array $pageInfo
 * @return bool Whatever HookRunner::onAfterImportPage() returns
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$wikiPage = $this->wikiPageFactory->newFromTitle( $pageIdentity );
		$wikiPage->loadPageData( IDBAccessObject::READ_LATEST );

		if ( $wikiPage->getRevisionRecord() === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$preparedUpdate = $wikiPage->newPageUpdater( $this->performer )->prepareUpdate();
			$cacheKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$isCountableNow = $preparedUpdate->isCountable();

			// Only adjust when a pre-import value was recorded for this page.
			if ( array_key_exists( $cacheKey, $this->countableCache ) ) {
				$wasCountable = $this->countableCache[$cacheKey];
				if ( $isCountableNow != $wasCountable ) {
					// +1 when the page became countable, -1 when it stopped being so.
					$articleDelta = (int)$isCountableNow - (int)$wasCountable;
					DeferredUpdates::addUpdate(
						SiteStatsUpdate::factory( [ 'articles' => ( $articleDelta ) ] )
					);
				}
			}
		}
	}

	$importedTitle = Title::newFromPageIdentity( $pageIdentity );

	return $this->hookRunner->onAfterImportPage(
		$importedTitle,
		$foreignTitle,
		$revCount,
		$sRevCount,
		$pageInfo
	);
}
567 * Notify the callback function of site info
568 * @param array $siteInfo
569 * @return mixed|false
571 private function siteInfoCallback( $siteInfo ) {
572 if ( $this->mSiteInfoCallback
) {
573 return call_user_func_array(
574 $this->mSiteInfoCallback
,
583 * Notify the callback function when a new "<page>" is reached.
584 * @param array $title
586 public function pageCallback( $title ) {
587 if ( $this->mPageCallback
) {
588 call_user_func( $this->mPageCallback
, $title );
593 * Notify the callback function when a "</page>" is closed.
594 * @param PageIdentity $pageIdentity
595 * @param ForeignTitle $foreignTitle
596 * @param int $revCount
597 * @param int $sucCount Number of revisions for which callback returned true
598 * @param array $pageInfo Associative array of page information
600 private function pageOutCallback( PageIdentity
$pageIdentity, $foreignTitle, $revCount,
601 $sucCount, $pageInfo ) {
602 if ( $this->mPageOutCallback
) {
603 call_user_func_array( $this->mPageOutCallback
, func_get_args() );
608 * Notify the callback function of a revision
609 * @param WikiRevision $revision
612 private function revisionCallback( $revision ) {
613 if ( $this->mRevisionCallback
) {
614 return call_user_func_array(
615 $this->mRevisionCallback
,
624 * Notify the callback function of a new log item
625 * @param WikiRevision $revision
626 * @return mixed|false
628 private function logItemCallback( $revision ) {
629 if ( $this->mLogItemCallback
) {
630 return call_user_func_array(
631 $this->mLogItemCallback
,
640 * Retrieves the contents of the named attribute of the current element.
641 * @param string $attr The name of the attribute
642 * @return string The value of the attribute or an empty string if it is not set in the current
645 public function nodeAttribute( $attr ) {
646 return $this->reader
->getAttribute( $attr ) ??
'';
/**
 * Shouldn't something like this be built-in to XMLReader?
 * Fetches text contents of the current element, assuming
 * no sub-elements or such scary things.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated until the
 * first end-element node is reached. If the stream runs out before an end
 * element is seen, the reader is closed and an empty string is returned.
 *
 * @return string
 */
public function nodeContents() {
	$xml = $this->reader;

	if ( $xml->isEmptyElement ) {
		// Self-closing element: nothing to read, and the reader must not advance.
		return "";
	}

	$collected = "";
	while ( $xml->read() ) {
		$nodeType = $xml->nodeType;

		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $collected;
		}

		if (
			$nodeType === XMLReader::TEXT ||
			$nodeType === XMLReader::CDATA ||
			$nodeType === XMLReader::SIGNIFICANT_WHITESPACE
		) {
			$collected .= $xml->value;
		}
	}

	// read() failed before the element was closed.
	$xml->close();
	return '';
}
678 * Primary entry point
682 public function doImport() {
683 $this->syntaxCheckXML();
685 // Calls to reader->read need to be wrapped in calls to
686 // libxml_disable_entity_loader() to avoid local file
687 // inclusion attacks (T48932).
688 // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
689 $oldDisable = @libxml_disable_entity_loader
( true );
691 $this->reader
->read();
693 if ( $this->reader
->localName
!= 'mediawiki' ) {
694 // phpcs:ignore Generic.PHP.NoSilencedErrors
695 @libxml_disable_entity_loader
( $oldDisable );
696 $error = libxml_get_last_error();
698 throw new NormalizedException( "XML error at line {line}: {message}", [
699 'line' => $error->line
,
700 'message' => $error->message
,
703 throw new UnexpectedValueException(
704 "Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
708 $this->debug( "<mediawiki> tag is correct." );
710 $this->debug( "Starting primary dump processing loop." );
712 $keepReading = $this->reader
->read();
715 while ( $keepReading ) {
716 $tag = $this->reader
->localName
;
717 if ( $this->pageOffset
) {
718 if ( $tag === 'page' ) {
721 if ( $pageCount < $this->pageOffset
) {
722 $keepReading = $this->reader
->next();
726 $type = $this->reader
->nodeType
;
728 if ( !$this->hookRunner
->onImportHandleToplevelXMLTag( $this ) ) {
730 } elseif ( $tag == 'mediawiki' && $type == XMLReader
::END_ELEMENT
) {
732 } elseif ( $tag == 'siteinfo' ) {
733 $this->handleSiteInfo();
734 } elseif ( $tag == 'page' ) {
736 } elseif ( $tag == 'logitem' ) {
737 $this->handleLogItem();
738 } elseif ( $tag != '#text' ) {
739 $this->warn( "Unhandled top-level XML tag $tag" );
745 $keepReading = $this->reader
->next();
747 $this->debug( "Skip" );
749 $keepReading = $this->reader
->read();
753 // phpcs:ignore Generic.PHP.NoSilencedErrors
754 @libxml_disable_entity_loader
( $oldDisable );
755 $this->reader
->close();
/**
 * Reads the children of the "<siteinfo>" element up to its end tag.
 *
 * Namespace names are stored in $this->foreignNamespaces, keyed by the
 * element's "key" attribute. The plain fields (sitename, base, generator,
 * case) are collected and passed, together with the namespaces under
 * '_namespaces', to siteInfoCallback().
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Fields that can just be stuffed in the siteInfo object
	$normalFields = [ 'sitename', 'base', 'generator', 'case' ];
	$siteInfo = [];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;
		$isEndElement = $this->reader->nodeType == XMLReader::END_ELEMENT;

		if ( $isEndElement && $tag == 'siteinfo' ) {
			break;
		}

		if ( $tag == 'namespace' ) {
			// Fetch the attribute before nodeContents(), which may move the reader on.
			$namespaceKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$namespaceKey] = $this->nodeContents();
			continue;
		}

		if ( in_array( $tag, $normalFields ) ) {
			$siteInfo[$tag] = $this->nodeContents();
		}
	}

	$siteInfo['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $siteInfo );
}
788 private function handleLogItem() {
789 $this->debug( "Enter log item handler." );
792 // Fields that can just be stuffed in the pageInfo object
793 $normalFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
794 'logtitle', 'params' ];
796 while ( $this->reader
->read() ) {
797 if ( $this->reader
->nodeType
== XMLReader
::END_ELEMENT
&&
798 $this->reader
->localName
== 'logitem' ) {
802 $tag = $this->reader
->localName
;
804 if ( !$this->hookRunner
->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
806 } elseif ( in_array( $tag, $normalFields ) ) {
807 $logInfo[$tag] = $this->nodeContents();
808 } elseif ( $tag == 'contributor' ) {
809 $logInfo['contributor'] = $this->handleContributor();
810 } elseif ( $tag != '#text' ) {
811 $this->warn( "Unhandled log-item XML tag $tag" );
815 $this->processLogItem( $logInfo );
/**
 * Builds a WikiRevision from the fields collected for one "<logitem>" and
 * passes it to the log item callback.
 *
 * @param array $logInfo Fields read from the log item; 'type' and 'action'
 *   are read unconditionally, every other key is optional
 * @return mixed|false
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}

	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );

	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logEntry->setTitle( $logTitle );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	// Attribute the entry: a named user first, then an IP, otherwise a placeholder name.
	$contributor = $logInfo['contributor'] ?? null;
	if ( isset( $contributor['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $contributor['username'] );
		$logEntry->setUsername( $prefixedName );
	} elseif ( isset( $contributor['ip'] ) ) {
		$logEntry->setUserIP( $contributor['ip'] );
	} else {
		$placeholderName = $this->externalUserNames->addPrefix( 'Unknown user' );
		$logEntry->setUsername( $placeholderName );
	}

	return $this->logItemCallback( $logEntry );
}
861 private function handlePage() {
863 $this->debug( "Enter page handler." );
864 $pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];
866 // Fields that can just be stuffed in the pageInfo object
867 $normalFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];
872 while ( $skip ?
$this->reader
->next() : $this->reader
->read() ) {
873 if ( $this->reader
->nodeType
== XMLReader
::END_ELEMENT
&&
874 $this->reader
->localName
== 'page' ) {
880 $tag = $this->reader
->localName
;
883 // The title is invalid, bail out of this page
885 } elseif ( !$this->hookRunner
->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
887 } elseif ( in_array( $tag, $normalFields ) ) {
891 // <title>Page</title>
892 // <redirect title="NewTitle"/>
894 // Because the redirect tag is built differently, we need special handling for that case.
895 if ( $tag == 'redirect' ) {
896 $pageInfo[$tag] = $this->nodeAttribute( 'title' );
898 $pageInfo[$tag] = $this->nodeContents();
900 } elseif ( $tag == 'revision' ||
$tag == 'upload' ) {
901 if ( !isset( $title ) ) {
902 $title = $this->processTitle( $pageInfo['title'],
903 $pageInfo['ns'] ??
null );
905 // $title is either an array of two titles or false.
906 if ( is_array( $title ) ) {
907 $this->pageCallback( $title );
908 [ $pageInfo['_title'], $foreignTitle ] = $title;
916 if ( $tag == 'revision' ) {
917 $this->handleRevision( $pageInfo );
919 $this->handleUpload( $pageInfo );
922 } elseif ( $tag != '#text' ) {
923 $this->warn( "Unhandled page XML tag $tag" );
928 // @note $pageInfo is only set if a valid $title is processed above with
929 // no error. If we have a valid $title, then pageCallback is called
930 // above, $pageInfo['title'] is set and we do pageOutCallback here.
931 // If $pageInfo['_title'] is not set, then $foreignTitle is also not
932 // set since they both come from $title above.
933 if ( array_key_exists( '_title', $pageInfo ) ) {
934 /** @var Title $title */
935 $title = $pageInfo['_title'];
936 $this->pageOutCallback(
938 // @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
940 $pageInfo['revisionCount'],
941 $pageInfo['successfulRevisionCount'],
948 * @param array &$pageInfo
950 private function handleRevision( &$pageInfo ) {
951 $this->debug( "Enter revision handler" );
954 $normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
955 'model', 'format', 'text', 'sha1' ];
959 while ( $skip ?
$this->reader
->next() : $this->reader
->read() ) {
960 if ( $this->reader
->nodeType
== XMLReader
::END_ELEMENT
&&
961 $this->reader
->localName
== 'revision' ) {
965 $tag = $this->reader
->localName
;
967 if ( !$this->hookRunner
->onImportHandleRevisionXMLTag(
968 $this, $pageInfo, $revisionInfo )
971 } elseif ( in_array( $tag, $normalFields ) ) {
972 $revisionInfo[$tag] = $this->nodeContents();
973 } elseif ( $tag == 'content' ) {
974 // We can have multiple content tags, so make this an array.
975 $revisionInfo[$tag][] = $this->handleContent();
976 } elseif ( $tag == 'contributor' ) {
977 $revisionInfo['contributor'] = $this->handleContributor();
978 } elseif ( $tag != '#text' ) {
979 $this->warn( "Unhandled revision XML tag $tag" );
984 $pageInfo['revisionCount']++
;
985 if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
986 $pageInfo['successfulRevisionCount']++
;
990 private function handleContent() {
991 $this->debug( "Enter content handler" );
994 $normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];
998 while ( $skip ?
$this->reader
->next() : $this->reader
->read() ) {
999 if ( $this->reader
->nodeType
== XMLReader
::END_ELEMENT
&&
1000 $this->reader
->localName
== 'content' ) {
1004 $tag = $this->reader
->localName
;
1006 if ( !$this->hookRunner
->onImportHandleContentXMLTag(
1007 $this, $contentInfo )
1010 } elseif ( in_array( $tag, $normalFields ) ) {
1011 $contentInfo[$tag] = $this->nodeContents();
1012 } elseif ( $tag != '#text' ) {
1013 $this->warn( "Unhandled content XML tag $tag" );
1018 return $contentInfo;
1022 * @param PageIdentity $page
1023 * @param int $revisionId
1024 * @param array $contentInfo
1028 private function makeContent( PageIdentity
$page, $revisionId, $contentInfo ) {
1029 $maxArticleSize = $this->config
->get( MainConfigNames
::MaxArticleSize
);
1031 if ( !isset( $contentInfo['text'] ) ) {
1032 throw new InvalidArgumentException( 'Missing text field in import.' );
1035 // Make sure revisions won't violate $wgMaxArticleSize, which could lead to
1036 // database errors and instability. Testing for revisions with only listed
1037 // content models, as other content models might use serialization formats
1038 // which aren't checked against $wgMaxArticleSize.
1039 if ( ( !isset( $contentInfo['model'] ) ||
1040 in_array( $contentInfo['model'], [
1048 strlen( $contentInfo['text'] ) > $maxArticleSize * 1024
1050 throw new RuntimeException( 'The text of ' .
1052 "the revision with ID $revisionId" :
1054 ) . " exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
1057 $role = $contentInfo['role'] ?? SlotRecord
::MAIN
;
1058 $model = $contentInfo['model'] ??
$this->slotRoleRegistry
1059 ->getRoleHandler( $role )
1060 ->getDefaultModel( $page );
1061 $handler = $this->contentHandlerFactory
->getContentHandler( $model );
1063 $text = $handler->importTransform( $contentInfo['text'] );
1065 return $handler->unserializeContent( $text );
1069 * @param array $pageInfo
1070 * @param array $revisionInfo
1071 * @return mixed|false
1073 private function processRevision( $pageInfo, $revisionInfo ) {
1074 $revision = new WikiRevision();
1076 $revId = $revisionInfo['id'] ??
0;
1078 $revision->setID( $revisionInfo['id'] );
1081 $title = $pageInfo['_title'];
1082 $revision->setTitle( $title );
1084 $content = $this->makeContent( $title, $revId, $revisionInfo );
1085 $revision->setContent( SlotRecord
::MAIN
, $content );
1087 foreach ( $revisionInfo['content'] ??
[] as $slotInfo ) {
1088 if ( !isset( $slotInfo['role'] ) ) {
1089 throw new RuntimeException( "Missing role for imported slot." );
1092 $content = $this->makeContent( $title, $revId, $slotInfo );
1093 $revision->setContent( $slotInfo['role'], $content );
1095 $revision->setTimestamp( $revisionInfo['timestamp'] ??
wfTimestampNow() );
1097 if ( isset( $revisionInfo['comment'] ) ) {
1098 $revision->setComment( $revisionInfo['comment'] );
1101 if ( isset( $revisionInfo['minor'] ) ) {
1102 $revision->setMinor( true );
1104 if ( isset( $revisionInfo['contributor']['username'] ) ) {
1105 $revision->setUsername(
1106 $this->externalUserNames
->applyPrefix( $revisionInfo['contributor']['username'] )
1108 } elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
1109 $revision->setUserIP( $revisionInfo['contributor']['ip'] );
1111 $revision->setUsername( $this->externalUserNames
->addPrefix( 'Unknown user' ) );
1113 if ( isset( $revisionInfo['sha1'] ) ) {
1114 $revision->setSha1Base36( $revisionInfo['sha1'] );
1116 $revision->setNoUpdates( $this->mNoUpdates
);
1118 return $this->revisionCallback( $revision );
1122 * @param array &$pageInfo
1125 private function handleUpload( &$pageInfo ) {
1126 $this->debug( "Enter upload handler" );
1129 $normalFields = [ 'timestamp', 'comment', 'filename', 'text',
1130 'src', 'size', 'sha1base36', 'archivename', 'rel' ];
1134 while ( $skip ?
$this->reader
->next() : $this->reader
->read() ) {
1135 if ( $this->reader
->nodeType
== XMLReader
::END_ELEMENT
&&
1136 $this->reader
->localName
== 'upload' ) {
1140 $tag = $this->reader
->localName
;
1142 if ( !$this->hookRunner
->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
1144 } elseif ( in_array( $tag, $normalFields ) ) {
1145 $uploadInfo[$tag] = $this->nodeContents();
1146 } elseif ( $tag == 'contributor' ) {
1147 $uploadInfo['contributor'] = $this->handleContributor();
1148 } elseif ( $tag == 'contents' ) {
1149 $contents = $this->nodeContents();
1150 $encoding = $this->reader
->getAttribute( 'encoding' );
1151 if ( $encoding === 'base64' ) {
1152 $uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
1153 $uploadInfo['isTempSrc'] = true;
1155 } elseif ( $tag != '#text' ) {
1156 $this->warn( "Unhandled upload XML tag $tag" );
1161 if ( $this->mImageBasePath
&& isset( $uploadInfo['rel'] ) ) {
1162 $path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
1163 if ( file_exists( $path ) ) {
1164 $uploadInfo['fileSrc'] = $path;
1165 $uploadInfo['isTempSrc'] = false;
1169 if ( $this->mImportUploads
) {
1170 return $this->processUpload( $pageInfo, $uploadInfo );
1175 * @param string $contents
1178 private function dumpTemp( $contents ) {
1179 $filename = tempnam( wfTempDir(), 'importupload' );
1180 file_put_contents( $filename, $contents );
1185 * @param array $pageInfo
1186 * @param array $uploadInfo
1189 private function processUpload( $pageInfo, $uploadInfo ) {
1190 $revision = new WikiRevision();
1191 $revId = $pageInfo['id'];
1192 $title = $pageInfo['_title'];
1193 // T292348: text key may be absent, force addition if null
1194 $uploadInfo['text'] ??
= '';
1195 $content = $this->makeContent( $title, $revId, $uploadInfo );
1197 $revision->setTitle( $title );
1198 $revision->setID( $revId );
1199 $revision->setTimestamp( $uploadInfo['timestamp'] );
1200 $revision->setContent( SlotRecord
::MAIN
, $content );
1201 $revision->setFilename( $uploadInfo['filename'] );
1202 if ( isset( $uploadInfo['archivename'] ) ) {
1203 $revision->setArchiveName( $uploadInfo['archivename'] );
1205 $revision->setSrc( $uploadInfo['src'] );
1206 if ( isset( $uploadInfo['fileSrc'] ) ) {
1207 $revision->setFileSrc( $uploadInfo['fileSrc'],
1208 !empty( $uploadInfo['isTempSrc'] )
1211 if ( isset( $uploadInfo['sha1base36'] ) ) {
1212 $revision->setSha1Base36( $uploadInfo['sha1base36'] );
1214 $revision->setSize( intval( $uploadInfo['size'] ) );
1215 $revision->setComment( $uploadInfo['comment'] );
1217 if ( isset( $uploadInfo['contributor']['username'] ) ) {
1218 $revision->setUsername(
1219 $this->externalUserNames
->applyPrefix( $uploadInfo['contributor']['username'] )
1221 } elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
1222 $revision->setUserIP( $uploadInfo['contributor']['ip'] );
1224 $revision->setNoUpdates( $this->mNoUpdates
);
1226 return call_user_func( $this->mUploadCallback
, $revision );
1232 private function handleContributor() {
1233 $this->debug( "Enter contributor handler." );
1235 if ( $this->reader
->isEmptyElement
) {
1239 $fields = [ 'id', 'ip', 'username' ];
1242 while ( $this->reader
->read() ) {
1243 if ( $this->reader
->nodeType
== XMLReader
::END_ELEMENT
&&
1244 $this->reader
->localName
== 'contributor' ) {
1248 $tag = $this->reader
->localName
;
1250 if ( in_array( $tag, $fields ) ) {
1251 $info[$tag] = $this->nodeContents();
1259 * @param string $text
1260 * @param string|null $ns
1261 * @return array|false
1263 private function processTitle( $text, $ns = null ) {
1264 if ( $this->foreignNamespaces
=== null ) {
1265 $foreignTitleFactory = new NaiveForeignTitleFactory(
1266 $this->contentLanguage
1269 $foreignTitleFactory = new NamespaceAwareForeignTitleFactory(
1270 $this->foreignNamespaces
);
1273 $foreignTitle = $foreignTitleFactory->createForeignTitle( $text,
1276 $title = $this->importTitleFactory
->createTitleFromForeignTitle(
1279 if ( $title === null ) {
1280 # Invalid page title? Ignore the page
1281 $this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
1283 } elseif ( $title->isExternal() ) {
1284 $this->notice( 'import-error-interwiki', $title->getPrefixedText() );
1286 } elseif ( !$title->canExist() ) {
1287 $this->notice( 'import-error-special', $title->getPrefixedText() );
1289 } elseif ( !$this->performer
->definitelyCan( 'edit', $title ) ) {
1290 # Do not import if the importing wiki user cannot edit this page
1291 $this->notice( 'import-error-edit', $title->getPrefixedText() );
1295 return [ $title, $foreignTitle ];
1299 * Open the XMLReader connected to the source adapter id
1300 * @suppress PhanStaticCallToNonStatic, UnusedSuppression -- for PHP 7.4 support
1302 private function openReader() {
1303 // Enable the entity loader, as it is needed for loading external URLs via
1304 // XMLReader::open (T86036)
1305 // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
1306 $oldDisable = @libxml_disable_entity_loader
( false );
1308 if ( PHP_VERSION_ID
>= 80000 ) {
1309 // A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
1310 $reader = XMLReader
::open(
1311 'uploadsource://' . $this->sourceAdapterId
, null, LIBXML_PARSEHUGE
);
1312 if ( $reader instanceof XMLReader
) {
1313 $this->reader
= $reader;
1319 // A static call generated a deprecation warning prior to PHP 8.0
1320 $this->reader
= new XMLReader
;
1321 $status = $this->reader
->open(
1322 'uploadsource://' . $this->sourceAdapterId
, null, LIBXML_PARSEHUGE
);
1325 $error = libxml_get_last_error();
1326 // phpcs:ignore Generic.PHP.NoSilencedErrors
1327 @libxml_disable_entity_loader
( $oldDisable );
1328 throw new RuntimeException(
1329 'Encountered an internal error while initializing WikiImporter object: ' . $error->message
1332 // phpcs:ignore Generic.PHP.NoSilencedErrors
1333 @libxml_disable_entity_loader
( $oldDisable );
1337 * Check the syntax of the given xml
1339 private function syntaxCheckXML() {
1340 if ( !UploadSourceAdapter
::isSeekableSource( $this->sourceAdapterId
) ) {
1343 AtEase
::suppressWarnings();
1344 $oldDisable = libxml_disable_entity_loader( false );
1346 while ( $this->reader
->read() );
1347 $error = libxml_get_last_error();
1349 $errorMessage = 'XML error at line ' . $error->line
. ': ' . $error->message
;
1350 wfDebug( __METHOD__
. ': Invalid xml found - ' . $errorMessage );
1351 throw new RuntimeException( $errorMessage );
1354 libxml_disable_entity_loader( $oldDisable );
1355 AtEase
::restoreWarnings();
1356 $this->reader
->close();
1359 // Reopen for the real import
1360 UploadSourceAdapter
::seekSource( $this->sourceAdapterId
, 0 );
1361 $this->openReader();