Major refactoring of site and user CSS, creating ResourceLoaderUserModule and Resourc...
[mediawiki.git] / maintenance / backupPrefetch.inc
blob14f78a0bcbd2f0a5e752611f9eda9d1e289b0df5
1 <?php
2 /**
3  * Helper class for the --prefetch option of dumpTextPass.php
4  *
5  * @file
6  * @ingroup Maintenance
7  */
// XMLReader's global XMLREADER_* constants were removed in PHP 5.1 and
// replaced with XMLReader::* class constants. The code in this file uses
// the global names, so alias every one that is missing to its
// class-constant equivalent.
$xmlReaderConstants = array(
        "NONE",
        "ELEMENT",
        "ATTRIBUTE",
        "TEXT",
        "CDATA",
        "ENTITY_REF",
        "ENTITY",
        "PI",
        "COMMENT",
        "DOC",
        "DOC_TYPE",
        "DOC_FRAGMENT",
        "NOTATION",
        "WHITESPACE",
        "SIGNIFICANT_WHITESPACE",
        "END_ELEMENT",
        "END_ENTITY",
        "XML_DECLARATION",
        "LOADDTD",
        "DEFAULTATTRS",
        "VALIDATE",
        "SUBST_ENTITIES" );
foreach ( $xmlReaderConstants as $name ) {
        $fullName = "XMLREADER_$name";
        $newName = "XMLReader::$name";
        // Only fill in a global name that does not exist yet, and only when
        // the class constant is available to copy from. If neither form is
        // defined the extension is broken or missing and nothing is aliased.
        if ( !defined( $fullName ) && defined( $newName ) ) {
                define( $fullName, constant( $newName ) );
        }
}
47 /**
48  * Readahead helper for making large MediaWiki data dumps;
49  * reads in a previous XML dump to sequentially prefetch text
50  * records already normalized and decompressed.
51  *
52  * This can save load on the external database servers, hopefully.
53  *
54  * Assumes that dumps will be recorded in the canonical order:
55  * - ascending by page_id
56  * - ascending by rev_id within each page
57  * - text contents are immutable and should not change once
58  *   recorded, so the previous dump is a reliable source
59  *
60  * Requires the XMLReader PECL extension.
61  * @ingroup Maintenance
62  */
63 class BaseDump {
64         var $reader = null;
65         var $atEnd = false;
66         var $atPageEnd = false;
67         var $lastPage = 0;
68         var $lastRev = 0;
70         function BaseDump( $infile ) {
71                 $this->reader = new XMLReader();
72                 $this->reader->open( $infile );
73         }
75         /**
76          * Attempts to fetch the text of a particular page revision
77          * from the dump stream. May return null if the page is
78          * unavailable.
79          *
80          * @param $page Integer: ID number of page to read
81          * @param $rev Integer: ID number of revision to read
82          * @return string or null
83          */
84         function prefetch( $page, $rev ) {
85                 $page = intval( $page );
86                 $rev = intval( $rev );
87                 while ( $this->lastPage < $page && !$this->atEnd ) {
88                         $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
89                         $this->nextPage();
90                 }
91                 if ( $this->lastPage > $page || $this->atEnd ) {
92                         $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev  [$this->lastPage, $this->lastRev]" );
93                         return null;
94                 }
95                 while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
96                         $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
97                         $this->nextRev();
98                 }
99                 if ( $this->lastRev == $rev && !$this->atEnd ) {
100                         $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
101                         return $this->nextText();
102                 } else {
103                         $this->debug( "BaseDump::prefetch already past rev $rev on page $page  [$this->lastPage, $this->lastRev]" );
104                         return null;
105                 }
106         }
108         function debug( $str ) {
109                 wfDebug( $str . "\n" );
110                 // global $dumper;
111                 // $dumper->progress( $str );
112         }
114         /**
115          * @access private
116          */
117         function nextPage() {
118                 if ( $this->skipTo( 'page', 'mediawiki' ) ) {
119                         if ( $this->skipTo( 'id' ) ) {
120                                 $this->lastPage = intval( $this->nodeContents() );
121                                 $this->lastRev = 0;
122                                 $this->atPageEnd = false;
123                         }
124                 } else {
125                         $this->atEnd = true;
126                 }
127         }
129         /**
130          * @access private
131          */
132         function nextRev() {
133                 if ( $this->skipTo( 'revision' ) ) {
134                         if ( $this->skipTo( 'id' ) ) {
135                                 $this->lastRev = intval( $this->nodeContents() );
136                         }
137                 } else {
138                         $this->atPageEnd = true;
139                 }
140         }
142         /**
143          * @access private
144          */
145         function nextText() {
146                 $this->skipTo( 'text' );
147                 return strval( $this->nodeContents() );
148         }
150         /**
151          * @access private
152          */
153         function skipTo( $name, $parent = 'page' ) {
154                 if ( $this->atEnd ) {
155                         return false;
156                 }
157                 while ( $this->reader->read() ) {
158                         if ( $this->reader->nodeType == XMLREADER_ELEMENT &&
159                                 $this->reader->name == $name ) {
160                                 return true;
161                         }
162                         if ( $this->reader->nodeType == XMLREADER_END_ELEMENT &&
163                                 $this->reader->name == $parent ) {
164                                 $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
165                                 return false;
166                         }
167                 }
168                 return $this->close();
169         }
171         /**
172          * Shouldn't something like this be built-in to XMLReader?
173          * Fetches text contents of the current element, assuming
174          * no sub-elements or such scary things.
175          *
176          * @return String
177          * @access private
178          */
179         function nodeContents() {
180                 if ( $this->atEnd ) {
181                         return null;
182                 }
183                 if ( $this->reader->isEmptyElement ) {
184                         return "";
185                 }
186                 $buffer = "";
187                 while ( $this->reader->read() ) {
188                         switch( $this->reader->nodeType ) {
189                         case XMLREADER_TEXT:
190 //                      case XMLREADER_WHITESPACE:
191                         case XMLREADER_SIGNIFICANT_WHITESPACE:
192                                 $buffer .= $this->reader->value;
193                                 break;
194                         case XMLREADER_END_ELEMENT:
195                                 return $buffer;
196                         }
197                 }
198                 return $this->close();
199         }
201         /**
202          * @access private
203          */
204         function close() {
205                 $this->reader->close();
206                 $this->atEnd = true;
207                 return null;
208         }