4 * Readahead helper for making large MediaWiki data dumps;
5 * reads in a previous XML dump to sequentially prefetch text
6 * records already normalized and decompressed.
8 * This can save load on the external database servers, hopefully.
10 * Assumes that dumps will be recorded in the canonical order:
11 * - ascending by page_id
12 * - ascending by rev_id within each page
13 * - text contents are immutable and should not change once
14 * recorded, so the previous dump is a reliable source
16 * Requires PHP 5 and the XMLReader PECL extension.
21 var $atPageEnd = false;
25 function BaseDump( $infile ) {
26 $this->reader = new XMLReader();
27 $this->reader->open( $infile );
31 * Attempts to fetch the text of a particular page revision
32 * from the dump stream. May return null if the page is
35 * @param int $page ID number of page to read
36 * @param int $rev ID number of revision to read
37 * @return string|null
39 function prefetch( $page, $rev ) {
40 $page = intval( $page );
41 $rev = intval( $rev );
42 while( $this->lastPage < $page && !$this->atEnd ) {
43 $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
46 if( $this->lastPage > $page || $this->atEnd ) {
47 $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
50 while( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
51 $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
54 if( $this->lastRev == $rev && !$this->atEnd ) {
55 $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
56 return $this->nextText();
58 $this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
63 function debug( $str ) {
64 wfDebug( $str . "\n" );
66 //$dumper->progress( $str );
73 if( $this->skipTo( 'page', 'mediawiki' ) ) {
74 if( $this->skipTo( 'id' ) ) {
75 $this->lastPage = intval( $this->nodeContents() );
77 $this->atPageEnd = false;
88 if( $this->skipTo( 'revision' ) ) {
89 if( $this->skipTo( 'id' ) ) {
90 $this->lastRev = intval( $this->nodeContents() );
93 $this->atPageEnd = true;
100 function nextText() {
101 $this->skipTo( 'text' );
102 return strval( $this->nodeContents() );
108 function skipTo( $name, $parent='page' ) {
112 while( $this->reader->read() ) {
113 if( $this->reader->nodeType == XMLREADER_ELEMENT &&
114 $this->reader->name == $name ) {
117 if( $this->reader->nodeType == XMLREADER_END_ELEMENT &&
118 $this->reader->name == $parent ) {
119 $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
123 return $this->close();
127 * Shouldn't something like this be built into XMLReader?
128 * Fetches text contents of the current element, assuming
129 * no sub-elements or such scary things.
133 function nodeContents() {
137 if( $this->reader->isEmptyElement ) {
141 while( $this->reader->read() ) {
142 switch( $this->reader->nodeType ) {
144 // case XMLREADER_WHITESPACE:
145 case XMLREADER_SIGNIFICANT_WHITESPACE:
146 $buffer .= $this->reader->value;
148 case XMLREADER_END_ELEMENT:
152 return $this->close();
159 $this->reader->close();