Whitelist the <wbr> element.
[mediawiki.git] / maintenance / backupPrefetch.inc
blob04352b9b9fdfb91b84ae4a11afeef29be9ce3fe4
1 <?php
2 /**
3  * Helper class for the --prefetch option of dumpTextPass.php
4  *
5  * Copyright © 2005 Brion Vibber <brion@pobox.com>
6  * http://www.mediawiki.org/
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License along
19  * with this program; if not, write to the Free Software Foundation, Inc.,
20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21  * http://www.gnu.org/copyleft/gpl.html
22  *
23  * @file
24  * @ingroup Maintenance
25  */
27 /**
28  * Readahead helper for making large MediaWiki data dumps;
29  * reads in a previous XML dump to sequentially prefetch text
30  * records already normalized and decompressed.
31  *
32  * This can save load on the external database servers, hopefully.
33  *
34  * Assumes that dumps will be recorded in the canonical order:
35  * - ascending by page_id
36  * - ascending by rev_id within each page
37  * - text contents are immutable and should not change once
38  *   recorded, so the previous dump is a reliable source
39  *
40  * @ingroup Maintenance
41  */
42 class BaseDump {
43         var $reader = null;
44         var $atEnd = false;
45         var $atPageEnd = false;
46         var $lastPage = 0;
47         var $lastRev = 0;
48         var $infiles = null;
50         function BaseDump( $infile ) {
51                 $this->infiles = explode( ';', $infile );
52                 $this->reader = new XMLReader();
53                 $infile = array_shift( $this->infiles );
54                 if ( defined( 'LIBXML_PARSEHUGE' ) ) {
55                         $this->reader->open( $infile, null, LIBXML_PARSEHUGE );
56                 }
57                 else {
58                         $this->reader->open( $infile );
59                 }
60         }
62         /**
63          * Attempts to fetch the text of a particular page revision
64          * from the dump stream. May return null if the page is
65          * unavailable.
66          *
67          * @param $page Integer: ID number of page to read
68          * @param $rev Integer: ID number of revision to read
69          * @return string or null
70          */
71         function prefetch( $page, $rev ) {
72                 $page = intval( $page );
73                 $rev = intval( $rev );
74                 while ( $this->lastPage < $page && !$this->atEnd ) {
75                         $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
76                         $this->nextPage();
77                 }
78                 if ( $this->lastPage > $page || $this->atEnd ) {
79                         $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev  [$this->lastPage, $this->lastRev]" );
80                         return null;
81                 }
82                 while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
83                         $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
84                         $this->nextRev();
85                 }
86                 if ( $this->lastRev == $rev && !$this->atEnd ) {
87                         $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
88                         return $this->nextText();
89                 } else {
90                         $this->debug( "BaseDump::prefetch already past rev $rev on page $page  [$this->lastPage, $this->lastRev]" );
91                         return null;
92                 }
93         }
95         function debug( $str ) {
96                 wfDebug( $str . "\n" );
97                 // global $dumper;
98                 // $dumper->progress( $str );
99         }
101         /**
102          * @access private
103          */
104         function nextPage() {
105                 if ( $this->skipTo( 'page', 'mediawiki' ) ) {
106                         if ( $this->skipTo( 'id' ) ) {
107                                 $this->lastPage = intval( $this->nodeContents() );
108                                 $this->lastRev = 0;
109                                 $this->atPageEnd = false;
110                         }
111                 } else {
112                         $this->close();
113                         if ( count( $this->infiles ) ) {
114                                 $infile = array_shift( $this->infiles );
115                                 $this->reader->open( $infile );
116                                 $this->atEnd = false;
117                         }
118                 }
119         }
121         /**
122          * @access private
123          */
124         function nextRev() {
125                 if ( $this->skipTo( 'revision' ) ) {
126                         if ( $this->skipTo( 'id' ) ) {
127                                 $this->lastRev = intval( $this->nodeContents() );
128                         }
129                 } else {
130                         $this->atPageEnd = true;
131                 }
132         }
134         /**
135          * @access private
136          * @return string
137          */
138         function nextText() {
139                 $this->skipTo( 'text' );
140                 return strval( $this->nodeContents() );
141         }
143         /**
144          * @access private
145          * @param $name string
146          * @param $parent string
147          * @return bool|null
148          */
149         function skipTo( $name, $parent = 'page' ) {
150                 if ( $this->atEnd ) {
151                         return false;
152                 }
153                 while ( $this->reader->read() ) {
154                         if ( $this->reader->nodeType == XMLReader::ELEMENT &&
155                                 $this->reader->name == $name ) {
156                                 return true;
157                         }
158                         if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
159                                 $this->reader->name == $parent ) {
160                                 $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
161                                 return false;
162                         }
163                 }
164                 return $this->close();
165         }
167         /**
168          * Shouldn't something like this be built-in to XMLReader?
169          * Fetches text contents of the current element, assuming
170          * no sub-elements or such scary things.
171          *
172          * @return String
173          * @access private
174          */
175         function nodeContents() {
176                 if ( $this->atEnd ) {
177                         return null;
178                 }
179                 if ( $this->reader->isEmptyElement ) {
180                         return "";
181                 }
182                 $buffer = "";
183                 while ( $this->reader->read() ) {
184                         switch ( $this->reader->nodeType ) {
185                         case XMLReader::TEXT:
186 //                      case XMLReader::WHITESPACE:
187                         case XMLReader::SIGNIFICANT_WHITESPACE:
188                                 $buffer .= $this->reader->value;
189                                 break;
190                         case XMLReader::END_ELEMENT:
191                                 return $buffer;
192                         }
193                 }
194                 return $this->close();
195         }
197         /**
198          * @access private
199          * @return null
200          */
201         function close() {
202                 $this->reader->close();
203                 $this->atEnd = true;
204                 return null;
205         }