5 * Copyright © 2003, 2005, 2006 Brooke Vibber <bvibber@wikimedia.org>
6 * https://www.mediawiki.org/
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
26 use MediaWiki\CommentStore\CommentStore
;
27 use MediaWiki\Content\TextContent
;
28 use MediaWiki\Debug\MWDebug
;
29 use MediaWiki\HookContainer\HookContainer
;
30 use MediaWiki\HookContainer\HookRunner
;
31 use MediaWiki\MainConfigNames
;
32 use MediaWiki\MediaWikiServices
;
33 use MediaWiki\Revision\RevisionAccessException
;
34 use MediaWiki\Revision\RevisionRecord
;
35 use MediaWiki\Revision\RevisionStore
;
36 use MediaWiki\Revision\SlotRecord
;
37 use MediaWiki\Revision\SuppressedDataException
;
38 use MediaWiki\Storage\SqlBlobStore
;
39 use MediaWiki\Title\Title
;
40 use MediaWiki\Xml\Xml
;
41 use Wikimedia\Assert\Assert
;
42 use Wikimedia\IPUtils
;
/** Output serialized revision content. */
public const WRITE_CONTENT = 0;

/** Only output stubs for revision content. */
public const WRITE_STUB = 1;

/**
 * Only output stubs for revision content, indicating that the content has been
 * deleted/suppressed.
 */
private const WRITE_STUB_DELETED = 2;

/**
 * @var string[] the schema versions supported for output
 */
public static $supportedSchemas = [
	XML_DUMP_SCHEMA_VERSION_10,
	XML_DUMP_SCHEMA_VERSION_11
];

/**
 * @var string which schema version the generated XML should comply to.
 * One of the values from self::$supportedSchemas, using the SCHEMA_VERSION_XX
 * constants.
 */
private $schemaVersion;

/**
 * Title of the currently processed page
 *
 * @var Title|null
 */
private $currentTitle = null;

/**
 * @var int Whether to output revision content or just stubs. WRITE_CONTENT or WRITE_STUB.
 */
private $contentMode;

/** @var HookRunner */
private $hookRunner;

/** @var CommentStore */
private $commentStore;
/**
 * @param int $contentMode WRITE_CONTENT or WRITE_STUB.
 * @param string $schemaVersion which schema version the generated XML should comply to.
 * One of the values from self::$supportedSchemas, using the XML_DUMP_SCHEMA_VERSION_XX
 * constants.
 * @param HookContainer|null $hookContainer Falls back to the global service when null.
 * @param CommentStore|null $commentStore Falls back to the global service when null.
 */
public function __construct(
	$contentMode = self::WRITE_CONTENT,
	$schemaVersion = XML_DUMP_SCHEMA_VERSION_11,
	?HookContainer $hookContainer = null,
	?CommentStore $commentStore = null
) {
	// WRITE_STUB_DELETED is deliberately not accepted here: it is only
	// selected internally, per revision.
	Assert::parameter(
		in_array( $contentMode, [ self::WRITE_CONTENT, self::WRITE_STUB ], true ),
		'$contentMode',
		'must be one of the following constants: WRITE_CONTENT or WRITE_STUB.'
	);

	Assert::parameter(
		in_array( $schemaVersion, self::$supportedSchemas, true ),
		'$schemaVersion',
		'must be one of the following schema versions: '
		. implode( ',', self::$supportedSchemas )
	);

	$this->contentMode = $contentMode;
	$this->schemaVersion = $schemaVersion;
	$this->hookRunner = new HookRunner(
		$hookContainer ?? MediaWikiServices::getInstance()->getHookContainer()
	);
	$this->commentStore = $commentStore ?? MediaWikiServices::getInstance()->getCommentStore();
}
/**
 * Opens the XML output stream's root "<mediawiki>" element.
 * This does not include an xml directive, so is safe to include
 * as a subelement in a larger XML stream. Namespace and XML Schema
 * references are included.
 *
 * Output will be encoded in UTF-8.
 *
 * @return string The opening root tag followed by the "<siteinfo>" section.
 */
public function openStream() {
	$ver = $this->schemaVersion;
	return Xml::element( 'mediawiki', [
		'xmlns'              => "http://www.mediawiki.org/xml/export-$ver/",
		'xmlns:xsi'          => "http://www.w3.org/2001/XMLSchema-instance",
		/*
		 * When a new version of the schema is created, it needs staging on mediawiki.org.
		 * This requires a change in the operations/mediawiki-config git repo.
		 *
		 * Create a changeset like https://gerrit.wikimedia.org/r/#/c/149643/ in which
		 * you copy in the new xsd file.
		 *
		 * After it is reviewed, merged and deployed (sync-docroot), the index.html needs purging.
		 * echo "https://www.mediawiki.org/xml/index.html" | mwscript purgeList.php --wiki=aawiki
		 */
		'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
			"http://www.mediawiki.org/xml/export-$ver.xsd",
		'version' => $ver,
		'xml:lang' => MediaWikiServices::getInstance()->getContentLanguage()->getHtmlCode() ],
		// null contents: emit only the opening tag, the element is closed by closeStream()
		null ) .
		"\n" .
		$this->siteInfo();
}
/**
 * Builds the "<siteinfo>" section from the individual site-level elements.
 *
 * @return string
 */
private function siteInfo() {
	$info = [
		$this->sitename(),
		$this->dbname(),
		$this->homelink(),
		$this->generator(),
		$this->caseSetting(),
		$this->namespaces() ];
	return "  <siteinfo>\n    " .
		implode( "\n    ", $info ) .
		"\n  </siteinfo>\n";
}
/**
 * Builds the "<sitename>" element from the Sitename configuration setting.
 *
 * @return string
 */
private function sitename() {
	$sitename = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::Sitename );
	return Xml::element( 'sitename', [], $sitename );
}
/**
 * Builds the "<dbname>" element from the DBname configuration setting.
 *
 * @return string
 */
private function dbname() {
	$dbname = MediaWikiServices::getInstance()->getMainConfig()->get( MainConfigNames::DBname );
	return Xml::element( 'dbname', [], $dbname );
}
/**
 * Builds the "<generator>" element naming the MediaWiki version in use.
 *
 * @return string
 */
private function generator() {
	return Xml::element( 'generator', [], 'MediaWiki ' . MW_VERSION );
}
/**
 * Builds the "<base>" element holding the canonical URL of the main page.
 *
 * @return string
 */
private function homelink() {
	return Xml::element( 'base', [], Title::newMainPage()->getCanonicalURL() );
}
/**
 * Builds the "<case>" element from the CapitalLinks configuration setting.
 *
 * @return string
 */
private function caseSetting() {
	$capitalLinks = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::CapitalLinks );
	// "case-insensitive" option is reserved for future
	$sensitivity = $capitalLinks ? 'first-letter' : 'case-sensitive';
	return Xml::element( 'case', [], $sensitivity );
}
/**
 * Builds the "<namespaces>" element, listing every formatted namespace of the
 * content language together with its numeric key and capitalization rule.
 *
 * @return string
 */
private function namespaces() {
	$spaces = "<namespaces>\n";
	$nsInfo = MediaWikiServices::getInstance()->getNamespaceInfo();
	foreach (
		MediaWikiServices::getInstance()->getContentLanguage()->getFormattedNamespaces()
		as $ns => $title
	) {
		$spaces .= '      ' .
			Xml::element( 'namespace',
				[
					'key' => $ns,
					'case' => $nsInfo->isCapitalized( $ns )
						? 'first-letter' : 'case-sensitive',
				], $title ) . "\n";
	}
	$spaces .= "    </namespaces>";
	return $spaces;
}
/**
 * Closes the output stream with the closing root element.
 * Call when finished dumping things.
 *
 * @return string
 */
public function closeStream() {
	return "</mediawiki>\n";
}
/**
 * Opens a "<page>" section on the output stream, with data
 * from the given database row.
 *
 * Also records the page's title as the current title, which closePage()
 * later uses to clear the link cache entry.
 *
 * @param stdClass $row Reads page_namespace, page_id and page_is_redirect
 *  directly; the row is also handed to Title::newFromRow().
 * @return string
 */
public function openPage( $row ) {
	$out = "  <page>\n";
	$this->currentTitle = Title::newFromRow( $row );
	$canonicalTitle = self::canonicalTitle( $this->currentTitle );
	$out .= '    ' . Xml::elementClean( 'title', [], $canonicalTitle ) . "\n";
	$out .= '    ' . Xml::element( 'ns', [], strval( $row->page_namespace ) ) . "\n";
	$out .= '    ' . Xml::element( 'id', [], strval( $row->page_id ) ) . "\n";
	if ( $row->page_is_redirect ) {
		$services = MediaWikiServices::getInstance();
		$page = $services->getWikiPageFactory()->newFromTitle( $this->currentTitle );
		$redirectStore = $services->getRedirectStore();
		// A failing lookup yields null, so the page is still dumped, just
		// without a <redirect> element.
		$redirect = $this->invokeLenient(
			static function () use ( $page, $redirectStore ) {
				return $redirectStore->getRedirectTarget( $page );
			},
			'Failed to get redirect target of page ' . $page->getId()
		);
		$redirect = Title::castFromLinkTarget( $redirect );
		if ( $redirect instanceof Title && $redirect->isValidRedirectTarget() ) {
			$out .= '    ';
			$out .= Xml::element( 'redirect', [ 'title' => self::canonicalTitle( $redirect ) ] );
			$out .= "\n";
		}
	}
	// Hook handlers receive $out and may modify it.
	$this->hookRunner->onXmlDumpWriterOpenPage( $this, $out, $row, $this->currentTitle );

	return $out;
}
/**
 * Closes a "<page>" section on the output stream.
 *
 * @return string
 */
public function closePage() {
	if ( $this->currentTitle !== null ) {
		$linkCache = MediaWikiServices::getInstance()->getLinkCache();
		// In rare cases, link cache has the same key for some pages which
		// might be read as part of the same batch. T220424 and T220316
		$linkCache->clearLink( $this->currentTitle );
	}
	return "  </page>\n";
}
/**
 * Fetches the revision store from the global service container.
 *
 * @return RevisionStore
 */
private function getRevisionStore() {
	return MediaWikiServices::getInstance()->getRevisionStore();
}
/**
 * Fetches the blob store from the global service container.
 *
 * @return SqlBlobStore
 */
private function getBlobStore() {
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return MediaWikiServices::getInstance()->getBlobStore();
}
/**
 * Invokes the given callback, catching and logging any exceptions.
 *
 * A SuppressedDataException is swallowed silently; the other caught
 * exception types are reported through MWDebug::warning(). Exception types
 * not listed in the catch clauses still propagate to the caller.
 *
 * @param callable $callback
 * @param string $warning The warning to output in case of a storage related exception.
 *
 * @return mixed Returns the method's return value, or null in case of an exception.
 */
private function invokeLenient( $callback, $warning ) {
	try {
		return $callback();
	} catch ( SuppressedDataException $ex ) {
		return null;
	} catch ( MWException | RuntimeException | InvalidArgumentException | ErrorException $ex ) {
		MWDebug::warning( $warning . ': ' . $ex->getMessage() );
		return null;
	}
}
/**
 * Dumps a "<revision>" section on the output stream, with
 * data filled in from the given database row.
 *
 * @param stdClass $row
 * @param null|stdClass[] $slotRows
 *
 * @return string
 * @throws RevisionAccessException
 */
public function writeRevision( $row, $slotRows = null ) {
	$rev = $this->getRevisionStore()->newRevisionFromRowAndSlots(
		$row,
		$slotRows,
		0,
		$this->currentTitle
	);

	$out = "    <revision>\n";
	$out .= "      " . Xml::element( 'id', null, strval( $rev->getId() ) ) . "\n";

	if ( $rev->getParentId() ) {
		$out .= "      " . Xml::element( 'parentid', null, strval( $rev->getParentId() ) ) . "\n";
	}

	$out .= $this->writeTimestamp( $rev->getTimestamp() );

	if ( $rev->isDeleted( RevisionRecord::DELETED_USER ) ) {
		$out .= "      " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		// empty values get written out as uid 0, see T224221
		$user = $rev->getUser();
		$out .= $this->writeContributor(
			$user ? $user->getId() : 0,
			$user ? $user->getName() : ''
		);
	}

	if ( $rev->isMinor() ) {
		$out .= "      <minor/>\n";
	}
	if ( $rev->isDeleted( RevisionRecord::DELETED_COMMENT ) ) {
		$out .= "      " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		// An empty comment produces no <comment> element at all.
		if ( $rev->getComment()->text != '' ) {
			$out .= "      "
				. Xml::elementClean( 'comment', [], strval( $rev->getComment()->text ) )
				. "\n";
		}
	}

	// Revisions with deleted text always get a "deleted" stub, whatever
	// content mode this writer was constructed with.
	$contentMode = $rev->isDeleted( RevisionRecord::DELETED_TEXT ) ? self::WRITE_STUB_DELETED
		: $this->contentMode;

	$slots = $rev->getSlots()->getSlots();

	// use predictable order, put main slot first
	ksort( $slots );
	$out .= $this->writeSlot( $slots[SlotRecord::MAIN], $contentMode );

	foreach ( $slots as $role => $slot ) {
		if ( $role === SlotRecord::MAIN ) {
			continue;
		}
		$out .= $this->writeSlot( $slot, $contentMode );
	}

	if ( $rev->isDeleted( RevisionRecord::DELETED_TEXT ) ) {
		$out .= "      <sha1/>\n";
	} else {
		$sha1 = $this->invokeLenient(
			static function () use ( $rev ) {
				return $rev->getSha1();
			},
			'failed to determine sha1 for revision ' . $rev->getId()
		);
		$out .= "      " . Xml::element( 'sha1', null, strval( $sha1 ) ) . "\n";
	}

	// The hook is handed the serialized main slot text, or '' when no
	// content is being written or loading it failed.
	$text = '';
	if ( $contentMode === self::WRITE_CONTENT ) {
		/** @var Content $content */
		$content = $this->invokeLenient(
			static function () use ( $rev ) {
				return $rev->getContent( SlotRecord::MAIN, RevisionRecord::RAW );
			},
			'Failed to load main slot content of revision ' . $rev->getId()
		);

		$text = $content ? $content->serialize() : '';
	}
	$this->hookRunner->onXmlDumpWriterWriteRevision( $this, $out, $row, $text, $rev );

	$out .= "    </revision>\n";

	return $out;
}
/**
 * Writes the elements describing one slot of a revision: origin, model,
 * format and a "<text>" element. Non-main slots are wrapped in a
 * "<content>" element and are skipped entirely for pre-0.11 schemas.
 *
 * @param SlotRecord $slot
 * @param int $contentMode see the WRITE_XXX constants
 *
 * @return string
 */
private function writeSlot( SlotRecord $slot, $contentMode ) {
	$isMain = $slot->getRole() === SlotRecord::MAIN;
	$isV11 = $this->schemaVersion >= XML_DUMP_SCHEMA_VERSION_11;

	if ( !$isV11 && !$isMain ) {
		// ignore extra slots
		return '';
	}

	$out = '';
	$indent = '      ';

	if ( !$isMain ) {
		// non-main slots are wrapped into an additional element.
		$out .= '      ' . Xml::openElement( 'content' ) . "\n";
		$indent .= '  ';
		$out .= $indent . Xml::element( 'role', null, strval( $slot->getRole() ) ) . "\n";
	}

	if ( $isV11 ) {
		$out .= $indent . Xml::element( 'origin', null, strval( $slot->getOrigin() ) ) . "\n";
	}

	$contentModel = $slot->getModel();
	$contentHandler = MediaWikiServices::getInstance()
		->getContentHandlerFactory()
		->getContentHandler( $contentModel );
	$contentFormat = $contentHandler->getDefaultFormat();

	// XXX: The content format is only relevant when actually outputting serialized content.
	// It should probably be an attribute on the text tag.
	$out .= $indent . Xml::element( 'model', null, strval( $contentModel ) ) . "\n";
	$out .= $indent . Xml::element( 'format', null, strval( $contentFormat ) ) . "\n";

	// invokeLenient() returns null on failure; fall back to a printable value.
	$textAttributes = [
		'bytes' => $this->invokeLenient(
			static function () use ( $slot ) {
				return $slot->getSize();
			},
			'failed to determine size for slot ' . $slot->getRole() . ' of revision '
			. $slot->getRevision()
		) ?: '0'
	];

	if ( $isV11 ) {
		$textAttributes['sha1'] = $this->invokeLenient(
			static function () use ( $slot ) {
				return $slot->getSha1();
			},
			'failed to determine sha1 for slot ' . $slot->getRole() . ' of revision '
			. $slot->getRevision()
		) ?: '';
	}

	if ( $contentMode === self::WRITE_CONTENT ) {
		$content = $this->invokeLenient(
			static function () use ( $slot ) {
				return $slot->getContent();
			},
			'failed to load content for slot ' . $slot->getRole() . ' of revision '
			. $slot->getRevision()
		);

		if ( $content === null ) {
			// content could not be loaded: emit an empty <text> element
			$out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
		} else {
			$out .= $this->writeText( $content, $textAttributes, $indent );
		}
	} elseif ( $contentMode === self::WRITE_STUB_DELETED ) {
		// write <text> placeholder tag
		$textAttributes['deleted'] = 'deleted';
		$out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
	} else {
		// write <text> stub tag
		if ( $isV11 ) {
			$textAttributes['location'] = $slot->getAddress();
		}
		$schema = null;

		if ( $isMain ) {
			// Output the numerical text ID if possible, for backwards compatibility.
			// Note that this is currently the ONLY reason we have a BlobStore here at all.
			// When removing this line, check whether the BlobStore has become unused.
			try {
				// NOTE: this will only work for addresses of the form "tt:12345" or "es:DB://cluster1/1234".
				// If we want to support other kinds of addresses in the future,
				// we will have to silently ignore failures here.
				// For now, this fails for "tt:0", which is present in the WMF production
				// database as of July 2019, due to data corruption.
				[ $schema, $textId ] = $this->getBlobStore()->splitBlobAddress( $slot->getAddress() );
			} catch ( InvalidArgumentException $ex ) {
				MWDebug::warning( 'Bad content address for slot ' . $slot->getRole()
					. ' of revision ' . $slot->getRevision() . ': ' . $ex->getMessage() );
			}
		}

		if ( $schema === 'tt' ) {
			$textAttributes['id'] = $textId;
		} elseif ( $schema === 'es' ) {
			$textAttributes['id'] = bin2hex( $textId );
		}

		$out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
	}

	if ( !$isMain ) {
		$out .= '      ' . Xml::closeElement( 'content' ) . "\n";
	}

	return $out;
}
/**
 * Writes a "<text>" element containing the serialized content, after
 * passing it through the content handler's export transform.
 *
 * @param Content $content
 * @param string[] $textAttributes Attributes for the element; 'bytes' and
 *  'xml:space' are overwritten here.
 * @param string $indent
 *
 * @return string
 */
private function writeText( Content $content, $textAttributes, $indent ) {
	$contentHandler = $content->getContentHandler();
	$contentFormat = $contentHandler->getDefaultFormat();

	if ( $content instanceof TextContent ) {
		// HACK: For text based models, bypass the serialization step. This allows extensions (like Flow)
		// that use incompatible combinations of serialization format and content model.
		$data = $content->getText();
	} else {
		$data = $content->serialize( $contentFormat );
	}

	$data = $contentHandler->exportTransform( $data, $contentFormat );
	// make sure to use the actual size
	$textAttributes['bytes'] = strlen( $data );
	$textAttributes['xml:space'] = 'preserve';
	return $indent . Xml::elementClean( 'text', $textAttributes, strval( $data ) ) . "\n";
}
/**
 * Dumps a "<logitem>" section on the output stream, with
 * data filled in from the given database row.
 *
 * @param stdClass $row Reads log_id, log_timestamp, log_deleted, actor_user,
 *  actor_name, log_type, log_action, log_namespace, log_title and log_params
 *  directly; the row is also handed to the comment store.
 * @return string
 */
public function writeLogItem( $row ) {
	$out = "  <logitem>\n";
	$out .= "    " . Xml::element( 'id', null, strval( $row->log_id ) ) . "\n";

	$out .= $this->writeTimestamp( $row->log_timestamp, "    " );

	if ( $row->log_deleted & LogPage::DELETED_USER ) {
		$out .= "    " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		$out .= $this->writeContributor( $row->actor_user, $row->actor_name, "    " );
	}

	if ( $row->log_deleted & LogPage::DELETED_COMMENT ) {
		$out .= "    " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		$comment = $this->commentStore->getComment( 'log_comment', $row )->text;
		// An empty comment produces no <comment> element at all.
		if ( $comment != '' ) {
			$out .= "    " . Xml::elementClean( 'comment', null, strval( $comment ) ) . "\n";
		}
	}

	$out .= "    " . Xml::element( 'type', null, strval( $row->log_type ) ) . "\n";
	$out .= "    " . Xml::element( 'action', null, strval( $row->log_action ) ) . "\n";

	if ( $row->log_deleted & LogPage::DELETED_ACTION ) {
		$out .= "    " . Xml::element( 'text', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		$title = Title::makeTitle( $row->log_namespace, $row->log_title );
		$out .= "    " . Xml::elementClean( 'logtitle', null, self::canonicalTitle( $title ) ) . "\n";
		$out .= "    " . Xml::elementClean( 'params',
			[ 'xml:space' => 'preserve' ],
			strval( $row->log_params ) ) . "\n";
	}

	$out .= "  </logitem>\n";

	return $out;
}
/**
 * Writes a "<timestamp>" element holding the timestamp in ISO 8601 form.
 *
 * @param string $timestamp
 * @param string $indent Default to six spaces
 * @return string
 */
public function writeTimestamp( $timestamp, $indent = "      " ) {
	$ts = wfTimestamp( TS_ISO_8601, $timestamp );
	return $indent . Xml::element( 'timestamp', null, $ts ) . "\n";
}
/**
 * Writes a "<contributor>" element. A contributor with a non-zero id, or
 * whose name is not a valid IP address, is written as username plus id;
 * otherwise the name is written as an IP address.
 *
 * @param int $id
 * @param string $text
 * @param string $indent Default to six spaces
 * @return string
 */
public function writeContributor( $id, $text, $indent = "      " ) {
	$out = $indent . "<contributor>\n";
	if ( $id || !IPUtils::isValid( $text ) ) {
		$out .= $indent . "  " . Xml::elementClean( 'username', null, strval( $text ) ) . "\n";
		$out .= $indent . "  " . Xml::element( 'id', null, strval( $id ) ) . "\n";
	} else {
		$out .= $indent . "  " . Xml::elementClean( 'ip', null, strval( $text ) ) . "\n";
	}
	$out .= $indent . "</contributor>\n";
	return $out;
}
/**
 * Writes "<upload>" sections for every version of the file behind a page
 * in the file namespace: history entries in reversed getHistory() order,
 * followed by the current version.
 *
 * Warning! This data is potentially inconsistent. :(
 * @param stdClass $row Reads page_namespace and page_title.
 * @param bool $dumpContents
 * @return string Empty for non-file pages and for files that do not exist.
 */
public function writeUploads( $row, $dumpContents = false ) {
	if ( $row->page_namespace == NS_FILE ) {
		$img = MediaWikiServices::getInstance()->getRepoGroup()->getLocalRepo()
			->newFile( $row->page_title );
		if ( $img && $img->exists() ) {
			$out = '';
			foreach ( array_reverse( $img->getHistory() ) as $ver ) {
				$out .= $this->writeUpload( $ver, $dumpContents );
			}
			$out .= $this->writeUpload( $img, $dumpContents );
			return $out;
		}
	}
	return '';
}
/**
 * Writes one "<upload>" section describing a single file version.
 *
 * @param File $file
 * @param bool $dumpContents Whether to embed the file contents, base64 encoded.
 * @return string
 */
private function writeUpload( $file, $dumpContents = false ) {
	if ( $file->isOld() ) {
		/** @var OldLocalFile $file */
		'@phan-var OldLocalFile $file';
		$archiveName = "      " .
			Xml::element( 'archivename', null, $file->getArchiveName() ) . "\n";
	} else {
		$archiveName = '';
	}
	if ( $dumpContents ) {
		$be = $file->getRepo()->getBackend();
		# Dump file as base64
		# Uses only XML-safe characters, so does not need escaping
		# @todo Too bad this loads the contents into memory (script might swap)
		$contents = '      <contents encoding="base64">' .
			chunk_split( base64_encode(
				$be->getFileContents( [ 'src' => $file->getPath() ] ) ) ) .
			"      </contents>\n";
	} else {
		$contents = '';
	}
	$uploader = $file->getUploader( File::FOR_PUBLIC );
	if ( $uploader ) {
		$uploader = $this->writeContributor( $uploader->getId(), $uploader->getName() );
	} else {
		// no uploader available for public view: mark the contributor as deleted
		$uploader = Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
	}
	$comment = $file->getDescription( File::FOR_PUBLIC );
	if ( ( $comment ?? '' ) !== '' ) {
		$comment = Xml::elementClean( 'comment', null, $comment );
	} else {
		// a null or empty description is written as a deleted comment
		$comment = Xml::element( 'comment', [ 'deleted' => 'deleted' ] );
	}
	return "    <upload>\n" .
		$this->writeTimestamp( $file->getTimestamp() ) .
		$uploader .
		"      " . $comment . "\n" .
		"      " . Xml::element( 'filename', null, $file->getName() ) . "\n" .
		$archiveName .
		"      " . Xml::element( 'src', null, $file->getCanonicalUrl() ) . "\n" .
		"      " . Xml::element( 'size', null, (string)( $file->getSize() ?: 0 ) ) . "\n" .
		"      " . Xml::element( 'sha1base36', null, $file->getSha1() ) . "\n" .
		"      " . Xml::element( 'rel', null, $file->getRel() ) . "\n" .
		$contents .
		"    </upload>\n";
}
/**
 * Return prefixed text form of title, but using the content language's
 * canonical namespace. This skips any special-casing such as gendered
 * user namespaces -- which while useful, are not yet listed in the
 * XML "<siteinfo>" data so are unsafe in export.
 *
 * @param Title $title
 * @return string
 */
public static function canonicalTitle( Title $title ) {
	if ( $title->isExternal() ) {
		return $title->getPrefixedText();
	}

	$prefix = MediaWikiServices::getInstance()->getContentLanguage()->
		getFormattedNsText( $title->getNamespace() );

	// @todo Emit some kind of warning to the user if $title->getNamespace() !==
	// NS_MAIN and $prefix === '' (viz. pages in an unregistered namespace)

	if ( $prefix !== '' ) {
		$prefix .= ':';
	}

	return $prefix . $title->getText();
}