Merge "Remove not used private member variable mParserWarnings from OutputPage"
[mediawiki.git] / maintenance / backupPrefetch.inc
blob265800ec06cd97a3907f04d1c7a11487cf003390
1 <?php
2 /**
3  * Helper class for the --prefetch option of dumpTextPass.php
4  *
5  * Copyright © 2005 Brion Vibber <brion@pobox.com>
6  * https://www.mediawiki.org/
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License along
19  * with this program; if not, write to the Free Software Foundation, Inc.,
20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21  * http://www.gnu.org/copyleft/gpl.html
22  *
23  * @file
24  * @ingroup Maintenance
25  */
27 /**
28  * Readahead helper for making large MediaWiki data dumps;
29  * reads in a previous XML dump to sequentially prefetch text
30  * records already normalized and decompressed.
31  *
32  * This can save load on the external database servers, hopefully.
33  *
34  * Assumes that dumps will be recorded in the canonical order:
35  * - ascending by page_id
36  * - ascending by rev_id within each page
37  * - text contents are immutable and should not change once
38  *   recorded, so the previous dump is a reliable source
39  *
40  * @ingroup Maintenance
41  */
42 class BaseDump {
43         protected $reader = null;
44         protected $atEnd = false;
45         protected $atPageEnd = false;
46         protected $lastPage = 0;
47         protected $lastRev = 0;
48         protected $infiles = null;
50         public function __construct( $infile ) {
51                 $this->infiles = explode( ';', $infile );
52                 $this->reader = new XMLReader();
53                 $infile = array_shift( $this->infiles );
54                 if ( defined( 'LIBXML_PARSEHUGE' ) ) {
55                         $this->reader->open( $infile, null, LIBXML_PARSEHUGE );
56                 } else {
57                         $this->reader->open( $infile );
58                 }
59         }
61         /**
62          * Attempts to fetch the text of a particular page revision
63          * from the dump stream. May return null if the page is
64          * unavailable.
65          *
66          * @param int $page ID number of page to read
67          * @param int $rev ID number of revision to read
68          * @return string|null
69          */
70         function prefetch( $page, $rev ) {
71                 $page = intval( $page );
72                 $rev = intval( $rev );
73                 while ( $this->lastPage < $page && !$this->atEnd ) {
74                         $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
75                         $this->nextPage();
76                 }
77                 if ( $this->lastPage > $page || $this->atEnd ) {
78                         $this->debug( "BaseDump::prefetch already past page $page "
79                                 . "looking for rev $rev  [$this->lastPage, $this->lastRev]" );
81                         return null;
82                 }
83                 while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
84                         $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
85                                 . "looking for $page, $rev" );
86                         $this->nextRev();
87                 }
88                 if ( $this->lastRev == $rev && !$this->atEnd ) {
89                         $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
91                         return $this->nextText();
92                 } else {
93                         $this->debug( "BaseDump::prefetch already past rev $rev on page $page "
94                                 . "[$this->lastPage, $this->lastRev]" );
96                         return null;
97                 }
98         }
100         function debug( $str ) {
101                 wfDebug( $str . "\n" );
102                 // global $dumper;
103                 // $dumper->progress( $str );
104         }
106         /**
107          * @access private
108          */
109         function nextPage() {
110                 if ( $this->skipTo( 'page', 'mediawiki' ) ) {
111                         if ( $this->skipTo( 'id' ) ) {
112                                 $this->lastPage = intval( $this->nodeContents() );
113                                 $this->lastRev = 0;
114                                 $this->atPageEnd = false;
115                         }
116                 } else {
117                         $this->close();
118                         if ( count( $this->infiles ) ) {
119                                 $infile = array_shift( $this->infiles );
120                                 $this->reader->open( $infile );
121                                 $this->atEnd = false;
122                         }
123                 }
124         }
126         /**
127          * @access private
128          */
129         function nextRev() {
130                 if ( $this->skipTo( 'revision' ) ) {
131                         if ( $this->skipTo( 'id' ) ) {
132                                 $this->lastRev = intval( $this->nodeContents() );
133                         }
134                 } else {
135                         $this->atPageEnd = true;
136                 }
137         }
139         /**
140          * @access private
141          * @return string
142          */
143         function nextText() {
144                 $this->skipTo( 'text' );
146                 return strval( $this->nodeContents() );
147         }
149         /**
150          * @access private
151          * @param string $name
152          * @param string $parent
153          * @return bool|null
154          */
155         function skipTo( $name, $parent = 'page' ) {
156                 if ( $this->atEnd ) {
157                         return false;
158                 }
159                 while ( $this->reader->read() ) {
160                         if ( $this->reader->nodeType == XMLReader::ELEMENT
161                                 && $this->reader->name == $name
162                         ) {
163                                 return true;
164                         }
165                         if ( $this->reader->nodeType == XMLReader::END_ELEMENT
166                                 && $this->reader->name == $parent
167                         ) {
168                                 $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
170                                 return false;
171                         }
172                 }
174                 return $this->close();
175         }
177         /**
178          * Shouldn't something like this be built-in to XMLReader?
179          * Fetches text contents of the current element, assuming
180          * no sub-elements or such scary things.
181          *
182          * @return string
183          * @access private
184          */
185         function nodeContents() {
186                 if ( $this->atEnd ) {
187                         return null;
188                 }
189                 if ( $this->reader->isEmptyElement ) {
190                         return "";
191                 }
192                 $buffer = "";
193                 while ( $this->reader->read() ) {
194                         switch ( $this->reader->nodeType ) {
195                                 case XMLReader::TEXT:
196                                 // case XMLReader::WHITESPACE:
197                                 case XMLReader::SIGNIFICANT_WHITESPACE:
198                                         $buffer .= $this->reader->value;
199                                         break;
200                                 case XMLReader::END_ELEMENT:
201                                         return $buffer;
202                         }
203                 }
205                 return $this->close();
206         }
208         /**
209          * @access private
210          * @return null
211          */
212         function close() {
213                 $this->reader->close();
214                 $this->atEnd = true;
216                 return null;
217         }