3 * Helper class for the --prefetch option of dumpTextPass.php
9 // PHP 5.1 replaced XMLReader's global constants (XMLREADER_*) with class
10 // constants (XMLReader::*). The rest of this file uses the global names, so
11 // define any that are missing from their class-constant equivalents.
// For each listed name, make sure a global constant XMLREADER_<name> exists,
// copying its value from the class constant XMLReader::<name> when available.
// NOTE(review): only part of the name list and none of the closing lines of
// this loop are visible in this excerpt.
12 $xmlReaderConstants = array(
27 "SIGNIFICANT_WHITESPACE",
35 foreach ( $xmlReaderConstants as $name ) {
// Global-style spelling, as used further down (e.g. XMLREADER_ELEMENT).
36 $fullName = "XMLREADER_$name";
// Class-constant spelling; defined() and constant() accept "Class::NAME".
37 $newName = "XMLReader::$name";
// Leave an existing global constant untouched.
38 if ( !defined( $fullName ) ) {
39 if ( defined( $newName ) ) {
40 define( $fullName, constant( $newName ) );
42 // Neither spelling exists: the XMLReader extension is missing or broken.
48 * Readahead helper for making large MediaWiki data dumps;
49 * reads in a previous XML dump to sequentially prefetch text
50 * records already normalized and decompressed.
52 * This can save load on the external database servers, hopefully.
54 * Assumes that dumps will be recorded in the canonical order:
55 * - ascending by page_id
56 * - ascending by rev_id within each page
57 * - text contents are immutable and should not change once
58 * recorded, so the previous dump is a reliable source
60 * Requires the XMLReader PECL extension.
61 * @ingroup Maintenance
// End-of-page flag. nextPage() resets it to false; the revision-seeking code
// sets it to true (apparently when no further <revision> is found in the
// current page - TODO confirm). prefetch() stops scanning revisions once set.
66 var $atPageEnd = false;
/**
 * Open a previous XML dump for sequential read-ahead.
 *
 * Declared as __construct() because a method that merely shares the class
 * name is no longer run as the constructor on PHP 8 and later.
 *
 * @param $infile String: location of the XML dump, as accepted by XMLReader::open()
 */
function __construct( $infile ) {
	$this->reader = new XMLReader();
	if ( !$this->reader->open( $infile ) ) {
		// Nothing can be prefetched from a source that failed to open.
		// Every reading method checks atEnd before touching the reader,
		// so flag it here instead of calling read() with no data loaded.
		$this->atEnd = true;
	}
}

/**
 * PHP 4-style constructor name, kept so that explicit calls such as
 * parent::BaseDump( $infile ) keep working.
 *
 * @param $infile String: see __construct()
 */
function BaseDump( $infile ) {
	self::__construct( $infile );
}
76 * Attempts to fetch the text of a particular page revision
77 * from the dump stream. May return null if the page is
80 * @param $page Integer: ID number of page to read
81 * @param $rev Integer: ID number of revision to read
82 * @return string or null
// NOTE(review): several lines of this method are not visible in this excerpt
// (the calls that advance the stream inside the loops, the return statements
// of the miss paths, the closing braces). The comments below describe only
// what the visible lines establish.
84 function prefetch( $page, $rev ) {
// Force both IDs to integers before comparing them with the stream position.
85 $page = intval( $page );
86 $rev = intval( $rev );
// Keep going while the stream is still before the requested page.
87 while ( $this->lastPage < $page && !$this->atEnd ) {
88 $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
// The stream is already beyond the requested page, or has run out.
91 if ( $this->lastPage > $page || $this->atEnd ) {
92 $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
// Keep going while still before the requested revision; stops at end of
// stream or once atPageEnd has been set.
95 while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
96 $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
// Exact hit: hand back the text that follows this revision in the stream.
99 if ( $this->lastRev == $rev && !$this->atEnd ) {
100 $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
101 return $this->nextText();
// Logged when the revision was not matched (presumably the else branch; the
// branch keyword itself is not visible in this excerpt).
103 $this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
// Send a diagnostic message to wfDebug(), appending a newline.
// @param $str String: message text
108 function debug( $str ) {
109 wfDebug( $str . "\n" );
// Disabled alternative that would report through $dumper->progress() instead.
111 // $dumper->progress( $str );
// Move to the next <page> element and record its ID in lastPage.
// NOTE(review): the lines closing the conditionals are not visible in this
// excerpt, so the exact branch each statement belongs to should be confirmed.
117 function nextPage() {
// 'mediawiki' is passed as the enclosing element that bounds the page search.
118 if ( $this->skipTo( 'page', 'mediawiki' ) ) {
// The next <id> element found after the page start is read as the page ID.
119 if ( $this->skipTo( 'id' ) ) {
120 $this->lastPage = intval( $this->nodeContents() );
// Reset the end-of-page flag.
122 $this->atPageEnd = false;
// NOTE(review): the header of the enclosing method is not visible in this
// excerpt. The visible lines look for the next <revision> (with skipTo()'s
// default 'page' parent) and store the following <id> in lastRev.
133 if ( $this->skipTo( 'revision' ) ) {
134 if ( $this->skipTo( 'id' ) ) {
135 $this->lastRev = intval( $this->nodeContents() );
// Sets the end-of-page flag (presumably on the failure branch of the
// <revision> search above - TODO confirm, the branch keyword is not visible).
138 $this->atPageEnd = true;
/**
 * Move forward to the next <text> element and return what it contains.
 *
 * @return string String form of whatever nodeContents() yields
 */
function nextText() {
	// Search result is intentionally not inspected, as in the original code.
	$this->skipTo( 'text' );
	$contents = $this->nodeContents();
	return (string)$contents;
}
// Read forward through the stream looking for the start of an element named
// $name, watching for the end tag of $parent along the way.
// @param $name String: element name to look for
// @param $parent String: enclosing element whose end tag is watched for
// NOTE(review): the return statements of the individual branches are not
// visible in this excerpt; callers use the result as a boolean condition.
153 function skipTo( $name, $parent = 'page' ) {
// Do not touch the reader once the stream has been flagged as finished.
154 if ( $this->atEnd ) {
157 while ( $this->reader->read() ) {
// Start tag with the wanted name.
158 if ( $this->reader->nodeType == XMLREADER_ELEMENT &&
159 $this->reader->name == $name ) {
// End tag of the enclosing element was reached while still searching.
162 if ( $this->reader->nodeType == XMLREADER_END_ELEMENT &&
163 $this->reader->name == $parent ) {
164 $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
// Presumably reached once read() reports no more nodes: shut the reader down
// and return whatever close() returns.
168 return $this->close();
172 * Shouldn't something like this be built-in to XMLReader?
173 * Fetches text contents of the current element, assuming
174 * no sub-elements or such scary things.
// Gather the text content of the current element by reading the nodes that
// follow it and appending their values to $buffer.
// NOTE(review): the initialisation of $buffer, some case labels and the return
// statements are not visible in this excerpt.
// NOTE(review): XMLReader::readString() may cover this use case - verify
// availability on the supported PHP/libxml versions before switching.
179 function nodeContents() {
// Do not touch the reader once the stream has been flagged as finished.
180 if ( $this->atEnd ) {
// A self-closing element has no child nodes to read.
183 if ( $this->reader->isEmptyElement ) {
187 while ( $this->reader->read() ) {
188 switch( $this->reader->nodeType ) {
// Plain whitespace nodes are not collected (this case label is disabled).
190 // case XMLREADER_WHITESPACE:
// Significant whitespace is part of the collected text.
191 case XMLREADER_SIGNIFICANT_WHITESPACE:
192 $buffer .= $this->reader->value;
// End tag: handled as its own case (its body is not visible here).
194 case XMLREADER_END_ELEMENT:
// Presumably reached when read() fails before an end tag was seen: shut the
// reader down and return whatever close() returns.
198 return $this->close();
// Release the underlying XMLReader.
// NOTE(review): the header of the enclosing method is not visible in this
// excerpt; skipTo() and nodeContents() return the result of $this->close().
205 $this->reader->close();