3 // Some smart guy removed XMLReader's global constants from PHP 5.1
4 // and replaced them with class constants. Breaking source compatibility
5 // is SUPER awesome, and I love languages which do this constantly!
// Back-compat shim: for every listed constant name, make sure the legacy
// global constant XMLREADER_<name> exists by copying the value of the class
// constant XMLReader::<name> when only the latter is available.
// Only part of the name list is visible in this view.
6 $xmlReaderConstants = array(
21 "SIGNIFICANT_WHITESPACE",
29 foreach( $xmlReaderConstants as $name ) {
// Old-style global name and new-style class-constant name for this entry.
30 $fullName = "XMLREADER_$name";
31 $newName = "XMLReader::$name";
// Leave the global constant alone when it is already defined.
32 if( !defined( $fullName ) ) {
// defined() and constant() both accept the "Class::CONST" string form.
33 if( defined( $newName ) ) {
34 define( $fullName, constant( $newName ) );
// Neither form exists: nothing is defined for this name.
36 // broken or missing the extension...
42 * Readahead helper for making large MediaWiki data dumps;
43 * reads in a previous XML dump to sequentially prefetch text
44 * records already normalized and decompressed.
46 * This can save load on the external database servers, hopefully.
48 * Assumes that dumps will be recorded in the canonical order:
49 * - ascending by page_id
50 * - ascending by rev_id within each page
51 * - text contents are immutable and should not change once
52 * recorded, so the previous dump is a reliable source
54 * Requires PHP 5 and the XMLReader PECL extension.
59 var $atPageEnd = false;
// Creates an XMLReader and opens $infile with it, storing the reader in
// $this->reader.
// $infile is passed straight to XMLReader::open().
// NOTE(review): presumably the enclosing class is BaseDump (the debug strings
// use the prefix "BaseDump::"), which would make this a PHP 4-style
// constructor; PHP 8 no longer treats such methods as constructors -- confirm
// the target PHP version.
// NOTE(review): the return value of open() is not checked in the visible code.
63 function BaseDump( $infile ) {
64 $this->reader = new XMLReader();
65 $this->reader->open( $infile );
69 * Attempts to fetch the text of a particular page revision
70 * from the dump stream. May return null if the page is
73 * @param int $page ID number of page to read
74 * @param int $rev ID number of revision to read
75 * @return string|null
// Looks for revision $rev of page $page in the dump stream and, on an exact
// match, returns its text via nextText(). The loop bodies and the non-matching
// return paths are not visible in this view.
77 function prefetch( $page, $rev ) {
// Normalise both IDs to integers so the comparisons below are numeric.
78 $page = intval( $page );
79 $rev = intval( $rev );
// Runs while the stream position is still before the requested page and the
// end of the stream has not been reached.
80 while( $this->lastPage < $page && !$this->atEnd ) {
81 $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
// The stream is already beyond the requested page, or is exhausted.
84 if( $this->lastPage > $page || $this->atEnd ) {
85 $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
// Runs while the stream position is before the requested revision, stopping
// at the end of the current page or of the whole stream.
88 while( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
89 $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
// Exact revision match with data still available: hand back its text.
92 if( $this->lastRev == $rev && !$this->atEnd ) {
93 $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
94 return $this->nextText();
// Otherwise only the miss is logged here.
96 $this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
// Logging helper: passes $str with a trailing newline to wfDebug(), which is
// defined outside this view.
101 function debug( $str ) {
102 wfDebug( $str . "\n" );
// Alternative output path, left disabled.
104 //$dumper->progress( $str );
// Moves to the next 'page' element and records its ID in $this->lastPage.
// What happens when no further page is found is not visible in this view.
110 function nextPage() {
// Search for 'page', passing 'mediawiki' as the enclosing parent element.
111 if( $this->skipTo( 'page', 'mediawiki' ) ) {
// Search for 'id' with the default parent ('page').
112 if( $this->skipTo( 'id' ) ) {
// Store the element's text content as an integer page ID.
113 $this->lastPage = intval( $this->nodeContents() );
// A new page has been entered, so clear the end-of-page flag.
115 $this->atPageEnd = false;
126 if( $this->skipTo( 'revision' ) ) {
127 if( $this->skipTo( 'id' ) ) {
128 $this->lastRev = intval( $this->nodeContents() );
131 $this->atPageEnd = true;
// Seeks to the next 'text' element and returns its contents as a string.
// The result of skipTo() is not inspected; whatever nodeContents() yields is
// converted with strval().
138 function nextText() {
139 $this->skipTo( 'text' );
140 return strval( $this->nodeContents() );
// Reads forward through the XML stream looking for an opening element called
// $name, giving up when the closing tag of $parent (default 'page') is seen.
// The statements executed on a match and after the debug call are not visible
// in this view; callers use the result as a boolean.
146 function skipTo( $name, $parent='page' ) {
// read() advances the reader one node at a time until the input runs out.
150 while( $this->reader->read() ) {
// Case 1: the current node is an opening element with the wanted name.
151 if( $this->reader->nodeType == XMLREADER_ELEMENT &&
152 $this->reader->name == $name ) {
// Case 2: the current node is the closing tag of the parent element.
155 if( $this->reader->nodeType == XMLREADER_END_ELEMENT &&
156 $this->reader->name == $parent ) {
157 $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
// The input ran out without hitting either case: return close()'s result.
161 return $this->close();
165 * Shouldn't something like this be built-in to XMLReader?
166 * Fetches text contents of the current element, assuming
167 * no sub-elements or such scary things.
// Collects the text content of the current element by reading following nodes
// and appending their values to $buffer. The initialisation of $buffer, the
// other case labels and the return statements are not visible in this view.
171 function nodeContents() {
// Self-closing elements are handled separately, before any reading.
175 if( $this->reader->isEmptyElement ) {
// Read node by node until the input runs out.
179 while( $this->reader->read() ) {
180 switch( $this->reader->nodeType ) {
// Plain whitespace nodes are deliberately not matched (label disabled below).
182 // case XMLREADER_WHITESPACE:
183 case XMLREADER_SIGNIFICANT_WHITESPACE:
// Append the value of the matched node to the accumulated text.
184 $buffer .= $this->reader->value;
// A closing tag is matched as its own case.
186 case XMLREADER_END_ELEMENT:
// The input ran out before the loop was left: return close()'s result.
190 return $this->close();
197 $this->reader->close();