Non-word characters don't terminate tag names.
[mediawiki.git] / includes / specials / SpecialExport.php
blob61ed34d4733f36f6347219909808d816e81436c2
<?php
/**
 * Implements Special:Export
 *
 * Copyright © 2003-2008 Brion Vibber <brion@pobox.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup SpecialPage
 */
/**
 * A special page that allows users to export pages in an XML file
 *
 * @ingroup SpecialPage
 */
31 class SpecialExport extends SpecialPage {
32 private $curonly, $doExport, $pageLinkDepth, $templates;
33 private $images;
/**
 * Set up the special page under the name 'Export'.
 */
public function __construct() {
	parent::__construct( 'Export' );
}
/**
 * Handle a request to Special:Export.
 *
 * Reads the request, works out which pages and how much history were asked
 * for, and then either streams the XML dump (when an export was requested)
 * or renders the export form pre-filled with the current page list.
 *
 * @param string|null $par Parameter passed to the special page; it is used as
 *   the default page list for non-POST requests (presumably the subpage part
 *   of the title - confirm against SpecialPage)
 */
public function execute( $par ) {
	global $wgSitename, $wgExportAllowListContributors, $wgExportFromNamespaces;
	global $wgExportAllowHistory, $wgExportMaxHistory, $wgExportMaxLinkDepth;
	global $wgExportAllowAll;

	$this->setHeaders();
	$this->outputHeader();

	$request = $this->getRequest();

	// Defaults; the request-handling branches below may override them
	$this->curonly = true;
	$this->doExport = false;
	$this->templates = $request->getCheck( 'templates' );
	$this->images = $request->getCheck( 'images' ); // Doesn't do anything yet
	$this->pageLinkDepth = $this->validateLinkDepth(
		$request->getIntOrNull( 'pagelink-depth' )
	);
	$nsindex = '';
	$exportall = false;

	if ( $request->getCheck( 'addcat' ) ) {
		// "Add category" button: append the category members to the page
		// list and fall through to redisplaying the form
		$page = $request->getText( 'pages' );
		$catname = $request->getText( 'catname' );

		if ( $catname !== '' && $catname !== null && $catname !== false ) {
			$categoryTitle = Title::makeTitleSafe( NS_MAIN, $catname );
			if ( $categoryTitle ) {
				/**
				 * @todo FIXME: This can lead to hitting memory limit for very large
				 * categories. Ideally we would do the lookup synchronously
				 * during the export in a single query.
				 */
				$catpages = $this->getPagesFromCategory( $categoryTitle );
				if ( $catpages ) {
					$page .= "\n" . implode( "\n", $catpages );
				}
			}
		}
	} elseif ( $request->getCheck( 'addns' ) && $wgExportFromNamespaces ) {
		// "Add namespace" button: append the namespace's pages to the list
		$page = $request->getText( 'pages' );
		$nsindex = $request->getText( 'nsindex', '' );

		if ( strval( $nsindex ) !== '' ) {
			/**
			 * Same implementation as above, so same @todo
			 */
			$nspages = $this->getPagesFromNamespace( $nsindex );
			if ( $nspages ) {
				$page .= "\n" . implode( "\n", $nspages );
			}
		}
	} elseif ( $request->getCheck( 'exportall' ) && $wgExportAllowAll ) {
		$this->doExport = true;
		$exportall = true;

		/* Although $page and $history are not used later on, we
		nevertheless set them to avoid that PHP notices about using
		undefined variables foul up our XML output (see call to
		doExport(...) further down) */
		$page = '';
		$history = '';
	} elseif ( $request->wasPosted() && $par == '' ) {
		// Regular form submission
		$page = $request->getText( 'pages' );
		$this->curonly = $request->getCheck( 'curonly' );

		$rawOffset = $request->getVal( 'offset' );
		$offset = $rawOffset ? wfTimestamp( TS_MW, $rawOffset ) : null;
		$limit = $request->getInt( 'limit' );
		$dir = $request->getVal( 'dir' );
		$historyCheck = $request->getCheck( 'history' );

		if ( $this->curonly ) {
			$history = WikiExporter::CURRENT;
		} else {
			$history = array(
				'dir' => 'asc',
				'offset' => false,
				'limit' => $wgExportMaxHistory,
			);

			// Only when the 'history' box is NOT ticked do the
			// limit/offset/dir parameters refine the defaults above
			if ( !$historyCheck ) {
				if ( $limit > 0 && ( $wgExportMaxHistory == 0 || $limit < $wgExportMaxHistory ) ) {
					$history['limit'] = $limit;
				}

				if ( $offset !== null ) {
					$history['offset'] = $offset;
				}

				if ( strtolower( $dir ) == 'desc' ) {
					$history['dir'] = 'desc';
				}
			}
		}

		if ( $page != '' ) {
			$this->doExport = true;
		}
	} else {
		// Default to current-only for GET requests.
		$page = $request->getText( 'pages', $par );
		$history = $request->getCheck( 'history' )
			? WikiExporter::FULL
			: WikiExporter::CURRENT;

		if ( $page != '' ) {
			$this->doExport = true;
		}
	}

	if ( !$wgExportAllowHistory ) {
		// Override
		$history = WikiExporter::CURRENT;
	}

	// The author list is only offered for current-only exports, and only
	// when the site configuration allows it
	$list_authors = $request->getCheck( 'listauthors' );
	if ( !$this->curonly || !$wgExportAllowListContributors ) {
		$list_authors = false;
	}

	if ( $this->doExport ) {
		$this->getOutput()->disable();

		// Cancel output buffering and gzipping if set
		// This should provide safer streaming for pages with history
		wfResetOutputBuffers();
		$response = $request->response();
		$response->header( "Content-type: application/xml; charset=utf-8" );

		if ( $request->getCheck( 'wpDownload' ) ) {
			// Provide a sane filename suggestion
			$filename = urlencode( $wgSitename . '-' . wfTimestampNow() . '.xml' );
			$response->header( "Content-disposition: attachment;filename={$filename}" );
		}

		$this->doExport( $page, $history, $list_authors, $exportall );

		return;
	}

	// No export requested: build and show the form
	$posted = $request->wasPosted();
	$out = $this->getOutput();
	$out->addWikiMsg( 'exporttext' );

	$form = Xml::openElement( 'form', array(
		'method' => 'post',
		'action' => $this->getTitle()->getLocalURL( 'action=submit' ),
	) );

	$form .= Xml::inputLabel(
		$this->msg( 'export-addcattext' )->text(),
		'catname',
		'catname',
		// NOTE(review): this argument was missing from the extracted listing;
		// 40 is assumed - confirm against the upstream file
		40
	) . '&#160;';
	$form .= Xml::submitButton(
		$this->msg( 'export-addcat' )->text(),
		array( 'name' => 'addcat' )
	) . '<br />';

	if ( $wgExportFromNamespaces ) {
		$form .= Html::namespaceSelector(
			array(
				'selected' => $nsindex,
				'label' => $this->msg( 'export-addnstext' )->text()
			), array(
				'name' => 'nsindex',
				'id' => 'namespace',
				'class' => 'namespaceselector',
			)
		) . '&#160;';
		$form .= Xml::submitButton(
			$this->msg( 'export-addns' )->text(),
			array( 'name' => 'addns' )
		) . '<br />';
	}

	if ( $wgExportAllowAll ) {
		$form .= Xml::checkLabel(
			$this->msg( 'exportall' )->text(),
			'exportall',
			'exportall',
			$posted ? $request->getCheck( 'exportall' ) : false
		) . '<br />';
	}

	$form .= Xml::element(
		'textarea',
		array( 'name' => 'pages', 'cols' => 40, 'rows' => 10 ),
		$page,
		false
	);
	$form .= '<br />';

	if ( $wgExportAllowHistory ) {
		$form .= Xml::checkLabel(
			$this->msg( 'exportcuronly' )->text(),
			'curonly',
			'curonly',
			$posted ? $request->getCheck( 'curonly' ) : true
		) . '<br />';
	} else {
		$out->addWikiMsg( 'exportnohistory' );
	}

	$form .= Xml::checkLabel(
		$this->msg( 'export-templates' )->text(),
		'templates',
		'wpExportTemplates',
		$posted ? $request->getCheck( 'templates' ) : false
	) . '<br />';

	if ( $wgExportMaxLinkDepth || $this->userCanOverrideExportDepth() ) {
		$form .= Xml::inputLabel(
			$this->msg( 'export-pagelinks' )->text(),
			'pagelink-depth',
			'pagelink-depth',
			// NOTE(review): the next two arguments were missing from the
			// extracted listing; 20 and 0 are assumed - confirm against the
			// upstream file
			20,
			0
		) . '<br />';
	}

	// Enable this when we can do something useful exporting/importing image information. :)
	//$form .= Xml::checkLabel( $this->msg( 'export-images' )->text(), 'images', 'wpExportImages', false ) . '<br />';
	$form .= Xml::checkLabel(
		$this->msg( 'export-download' )->text(),
		'wpDownload',
		'wpDownload',
		$posted ? $request->getCheck( 'wpDownload' ) : true
	) . '<br />';

	if ( $wgExportAllowListContributors ) {
		$form .= Xml::checkLabel(
			$this->msg( 'exportlistauthors' )->text(),
			'listauthors',
			'listauthors',
			$posted ? $request->getCheck( 'listauthors' ) : false
		) . '<br />';
	}

	$form .= Xml::submitButton(
		$this->msg( 'export-submit' )->text(),
		Linker::tooltipAndAccesskeyAttribs( 'export' )
	);
	$form .= Xml::closeElement( 'form' );

	$out->addHTML( $form );
}
/**
 * Tell whether the current user holds the 'override-export-depth' right.
 *
 * @return bool
 */
private function userCanOverrideExportDepth() {
	$user = $this->getUser();

	return $user->isAllowed( 'override-export-depth' );
}
/**
 * Do the actual page exporting
 *
 * Unless everything is being exported, the newline-separated input is turned
 * into a de-duplicated list of local page titles, optionally expanded with
 * used templates and linked pages, and each page the current user may read
 * is written to the export stream.
 *
 * @param string $page user input on what page(s) to export
 * @param $history Mixed: one of the WikiExporter history export constants
 * @param $list_authors Boolean: Whether to add distinct author list (when
 *   not returning full history)
 * @param $exportall Boolean: Whether to export everything
 */
private function doExport( $page, $history, $list_authors, $exportall ) {
	if ( $exportall ) {
		// If we are grabbing everything, enable full history and ignore the rest
		$history = WikiExporter::FULL;
	} else {
		// Inverted index of all pages to look up
		$pageSet = array();

		// Split up and normalize input
		foreach ( explode( "\n", $page ) as $line ) {
			$title = Title::newFromText( trim( $line ) );
			if ( $title && $title->getInterwiki() == '' && $title->getText() !== '' ) {
				// Only record each page once!
				$pageSet[$title->getPrefixedText()] = true;
			}
		}

		// Set of original pages to pass on to further manipulation...
		$inputPages = array_keys( $pageSet );

		// Look up any linked pages if asked...
		if ( $this->templates ) {
			$pageSet = $this->getTemplates( $inputPages, $pageSet );
		}
		$linkDepth = $this->pageLinkDepth;
		if ( $linkDepth ) {
			$pageSet = $this->getPageLinks( $inputPages, $pageSet, $linkDepth );
		}

		// Enable this when we can do something useful exporting/importing image information.
		// if( $this->images ) ) {
		// 	$pageSet = $this->getImages( $inputPages, $pageSet );
		// }

		// Normalize titles to the same format and remove dupes, see bug 17374.
		// str_replace() on an array rewrites every element and keeps the keys.
		$pages = array_unique( str_replace( " ", "_", array_keys( $pageSet ) ) );
	}

	/* Ok, let's get to it... */
	if ( $history == WikiExporter::CURRENT ) {
		$lb = false;
		$db = wfGetDB( DB_SLAVE );
		$buffer = WikiExporter::BUFFER;
	} else {
		// Use an unbuffered query; histories may be very long!
		$lb = wfGetLBFactory()->newMainLB();
		$db = $lb->getConnection( DB_SLAVE );
		$buffer = WikiExporter::STREAM;

		// This might take a while... :D
		wfSuppressWarnings();
		set_time_limit( 0 );
		wfRestoreWarnings();
	}

	$exporter = new WikiExporter( $db, $history, $buffer );
	$exporter->list_authors = $list_authors;
	$exporter->openStream();

	if ( $exportall ) {
		$exporter->allPages();
	} else {
		foreach ( $pages as $pageName ) {
			$title = Title::newFromText( $pageName );

			# Bug 8824: Only export pages the user can read
			// @todo Perhaps output an <error> tag or something for titles
			// that are invalid or unreadable instead of silently skipping.
			if ( $title !== null && $title->userCan( 'read', $this->getUser() ) ) {
				$exporter->pageByTitle( $title );
			}
		}
	}

	$exporter->closeStream();

	// Only the dedicated load balancer created above needs closing
	if ( $lb ) {
		$lb->closeAll();
	}
}
/**
 * List the pages that belong to a category, as prefixed page names.
 *
 * The category is identified by the DB key of the given title; at most
 * 5000 rows are fetched.
 *
 * @param $title Title
 * @return array
 */
private function getPagesFromCategory( $title ) {
	global $wgContLang;

	$rows = wfGetDB( DB_SLAVE )->select(
		array( 'page', 'categorylinks' ),
		array( 'page_namespace', 'page_title' ),
		array( 'cl_from=page_id', 'cl_to' => $title->getDBkey() ),
		__METHOD__,
		array( 'LIMIT' => '5000' )
	);

	$pages = array();

	foreach ( $rows as $row ) {
		// Pages with a falsy namespace value (the main namespace) get no prefix
		$pages[] = $row->page_namespace
			? $wgContLang->getNsText( $row->page_namespace ) . ':' . $row->page_title
			: $row->page_title;
	}

	return $pages;
}
/**
 * List the pages of one namespace, as prefixed page names.
 *
 * At most 5000 rows are fetched.
 *
 * @param $nsindex int
 * @return array
 */
private function getPagesFromNamespace( $nsindex ) {
	global $wgContLang;

	$rows = wfGetDB( DB_SLAVE )->select(
		'page',
		array( 'page_namespace', 'page_title' ),
		array( 'page_namespace' => $nsindex ),
		__METHOD__,
		array( 'LIMIT' => '5000' )
	);

	$pages = array();

	foreach ( $rows as $row ) {
		// Pages with a falsy namespace value (the main namespace) get no prefix
		$pages[] = $row->page_namespace
			? $wgContLang->getNsText( $row->page_namespace ) . ':' . $row->page_title
			: $row->page_title;
	}

	return $pages;
}
/**
 * Expand a list of pages to include templates used in those pages.
 *
 * Delegates to getLinks() with the 'templatelinks' table.
 *
 * @param $inputPages array, list of titles to look up
 * @param $pageSet array, associative array indexed by titles for output
 * @return array associative array index by titles
 */
private function getTemplates( $inputPages, $pageSet ) {
	$fields = array( 'namespace' => 'tl_namespace', 'title' => 'tl_title' );
	$join = array( 'page_id=tl_from' );

	return $this->getLinks( $inputPages, $pageSet, 'templatelinks', $fields, $join );
}
/**
 * Validate link depth setting, if available.
 *
 * Negative values become 0. Users without the 'override-export-depth' right
 * are limited to $wgExportMaxLinkDepth. In every case the result is capped
 * at 5 and converted to an integer; a null depth therefore yields 0.
 *
 * @param $depth int
 * @return int
 */
private function validateLinkDepth( $depth ) {
	global $wgExportMaxLinkDepth;

	if ( $depth < 0 ) {
		return 0;
	}

	// Clamp to the configured maximum but do NOT return yet: the value must
	// still pass through the hard cap below, otherwise a configured maximum
	// above 5 would let ordinary users recurse deeper than privileged ones.
	if ( !$this->userCanOverrideExportDepth() && $depth > $wgExportMaxLinkDepth ) {
		$depth = $wgExportMaxLinkDepth;
	}

	/*
	 * There's a HARD CODED limit of 5 levels of recursion here to prevent a
	 * crazy-big export from being done by someone setting the depth
	 * number too high. In other words, last resort safety net.
	 */
	return intval( min( $depth, 5 ) );
}
/**
 * Expand a list of pages to include pages linked to from that page.
 *
 * Runs one getLinks() pass over the 'pagelinks' table per level of depth;
 * after each pass the whole accumulated set becomes the input of the next.
 *
 * @param $inputPages array
 * @param $pageSet array
 * @param $depth int
 * @return array
 */
private function getPageLinks( $inputPages, $pageSet, $depth ) {
	$fields = array( 'namespace' => 'pl_namespace', 'title' => 'pl_title' );
	$join = array( 'page_id=pl_from' );

	while ( $depth > 0 ) {
		$pageSet = $this->getLinks( $inputPages, $pageSet, 'pagelinks', $fields, $join );
		$inputPages = array_keys( $pageSet );
		$depth--;
	}

	return $pageSet;
}
/**
 * Expand a list of pages to include images used in those pages.
 *
 * Delegates to getLinks() with the 'imagelinks' table; the namespace field
 * is the constant NS_FILE rather than a column.
 *
 * @param $inputPages array, list of titles to look up
 * @param $pageSet array, associative array indexed by titles for output
 *
 * @return array associative array index by titles
 */
private function getImages( $inputPages, $pageSet ) {
	$fields = array( 'namespace' => NS_FILE, 'title' => 'il_to' );
	$join = array( 'page_id=il_from' );

	return $this->getLinks( $inputPages, $pageSet, 'imagelinks', $fields, $join );
}
/**
 * Expand a list of pages to include items used in those pages.
 *
 * For every input name that parses as a title, the page itself is recorded
 * in the set and one query is run against the given link table joined with
 * 'page'; each returned row (read via its 'namespace' and 'title' fields)
 * is recorded as well.
 *
 * @param array $inputPages Array of page titles
 * @param array $pageSet
 * @param string $table
 * @param array $fields Array of field names
 * @param array $join
 * @return array
 */
private function getLinks( $inputPages, $pageSet, $table, $fields, $join ) {
	$dbr = wfGetDB( DB_SLAVE );
	$tables = array( 'page', $table );

	foreach ( $inputPages as $page ) {
		$title = Title::newFromText( $page );
		if ( !$title ) {
			// Unparseable name: nothing to record or look up
			continue;
		}

		$pageSet[$title->getPrefixedText()] = true;

		/// @todo FIXME: May or may not be more efficient to batch these
		///        by namespace when given multiple input pages.
		$conds = array_merge(
			$join,
			array(
				'page_namespace' => $title->getNamespace(),
				'page_title' => $title->getDBkey()
			)
		);
		$result = $dbr->select( $tables, $fields, $conds, __METHOD__ );

		foreach ( $result as $row ) {
			$linked = Title::makeTitle( $row->namespace, $row->title );
			$pageSet[$linked->getPrefixedText()] = true;
		}
	}

	return $pageSet;
}
/**
 * Return the name of the group this special page belongs to.
 *
 * @return string
 */
protected function getGroupName() {
	return 'pagetools';
}