3 * Helper class for the --prefetch option of dumpTextPass.php
10 * Readahead helper for making large MediaWiki data dumps;
11 * reads in a previous XML dump to sequentially prefetch text
12 * records already normalized and decompressed.
14 * This can save load on the external database servers, hopefully.
16 * Assumes that dumps will be recorded in the canonical order:
17 * - ascending by page_id
18 * - ascending by rev_id within each page
19 * - text contents are immutable and should not change once
20 * recorded, so the previous dump is a reliable source
22 * Requires the XMLReader PECL extension.
23 * @ingroup Maintenance
// Flag consulted by prefetch()'s revision loop. It is cleared when a new page
// ID is read and set to true elsewhere in this class.
// NOTE(review): presumably it means "no further revisions on the current
// page" — the code that sets it is only partly visible here; confirm.
28 var $atPageEnd = false;
// Create the XMLReader used by this object and open $infile with it.
// NOTE(review): the result of open() is not checked in the visible lines.
// NOTE(review): if this is meant as a PHP 4-style constructor (a method named
// after its class), PHP 8 no longer invokes such methods on `new` — confirm
// against the class declaration, which is not visible in this extract.
32 function BaseDump( $infile ) {
33 $this->reader = new XMLReader();
34 $this->reader->open( $infile );
38 * Attempts to fetch the text of a particular page revision
39 * from the dump stream. May return null if the page is unavailable.
42 * @param $page Integer: ID number of page to read
43 * @param $rev Integer: ID number of revision to read
44 * @return string or null
// Look for revision $rev of page $page in the dump stream; on an exact match
// the revision text is returned via nextText().
// NOTE(review): some lines of this method are missing from this extract (the
// loop bodies and the return paths for a miss are not visible) — confirm
// against the complete file before relying on the notes below.
46 function prefetch( $page, $rev ) {
// Coerce both IDs to integers before comparing them with lastPage / lastRev.
47 $page = intval( $page );
48 $rev = intval( $rev );
// Keep going while the stream position is still before the requested page
// and the stream is not exhausted.
49 while ( $this->lastPage < $page && !$this->atEnd ) {
50 $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
// The stream is already beyond the requested page, or has ended: log the miss.
53 if ( $this->lastPage > $page || $this->atEnd ) {
54 $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
// On the requested page: keep going while the current revision is before
// $rev, stopping at the end of the page or of the stream.
57 while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
58 $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
// Exact revision match with the stream still open: hand back its text.
61 if ( $this->lastRev == $rev && !$this->atEnd ) {
62 $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
63 return $this->nextText();
// Otherwise the requested revision is logged as already passed.
65 $this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
// Pass a diagnostic message to wfDebug(), with a newline appended.
70 function debug( $str ) {
71 wfDebug( $str . "\n" );
// Disabled alternative: send the same message to $dumper->progress().
73 // $dumper->progress( $str );
// NOTE(review): the header of the enclosing method is not visible in this
// extract; the notes below describe only the statements shown.
// Advance to the next <page> element, with 'mediawiki' passed as the parent
// whose closing tag skipTo() watches for.
80 if ( $this->skipTo( 'page', 'mediawiki' ) ) {
// Within the page, advance to the next <id> element and record its contents
// as the current page ID.
81 if ( $this->skipTo( 'id' ) ) {
82 $this->lastPage = intval( $this->nodeContents() );
// A new page ID has been read, so the end-of-page flag is cleared.
84 $this->atPageEnd = false;
// NOTE(review): the header of the enclosing method is not visible in this
// extract; the notes below describe only the statements shown.
// Advance to the next <revision> element (skipTo()'s default parent is 'page').
95 if ( $this->skipTo( 'revision' ) ) {
// Advance to the next <id> element and record its contents as the current
// revision ID.
96 if ( $this->skipTo( 'id' ) ) {
97 $this->lastRev = intval( $this->nodeContents() );
// NOTE(review): presumably the branch taken when no further <revision> was
// found — the surrounding branch structure is not visible here; confirm.
100 $this->atPageEnd = true;
// Move the reader to the next <text> element and return its contents.
// The result of skipTo() is not checked here.
107 function nextText() {
108 $this->skipTo( 'text' );
// strval() makes the return value a string whatever nodeContents() yields.
109 return strval( $this->nodeContents() );
// Read forward through the XML stream looking for an opening <$name> element.
// Meeting the closing tag of $parent (default 'page') first is logged via
// debug().
// NOTE(review): the return statements of the individual branches are not
// visible in this extract — confirm the exact return values in the full file.
115 function skipTo( $name, $parent = 'page' ) {
// Guard for a stream that has already been exhausted.
116 if ( $this->atEnd ) {
// Pull nodes from the XMLReader one at a time.
119 while ( $this->reader->read() ) {
// An opening tag with the requested name.
120 if ( $this->reader->nodeType == XMLReader::ELEMENT &&
121 $this->reader->name == $name ) {
// The closing tag of the parent element was reached while still searching.
124 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
125 $this->reader->name == $parent ) {
126 $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
// NOTE(review): presumably reached once read() has no more nodes; the result
// of close() is passed back to the caller.
130 return $this->close();
134 * Shouldn't something like this be built-in to XMLReader?
135 * Fetches text contents of the current element, assuming
136 * no sub-elements or such scary things.
// Accumulate the text content of the current element into $buffer.
// NOTE(review): the initialisation of $buffer and the return statements of
// the individual branches are not visible in this extract — confirm in the
// full file.
// NOTE(review): XMLReader::readString() may be a built-in alternative, but
// its handling of whitespace and child nodes could differ — verify first.
141 function nodeContents() {
// Guard for a stream that has already been exhausted.
142 if ( $this->atEnd ) {
// A self-closing element has no child nodes to read.
145 if ( $this->reader->isEmptyElement ) {
// Walk the nodes that follow the opening tag.
149 while ( $this->reader->read() ) {
150 switch( $this->reader->nodeType ) {
// Text and significant-whitespace nodes both append their value to the
// buffer; the plain WHITESPACE case is commented out, so those nodes are not
// appended.
151 case XMLReader::TEXT:
152 // case XMLReader::WHITESPACE:
153 case XMLReader::SIGNIFICANT_WHITESPACE:
154 $buffer .= $this->reader->value;
// A closing tag ends the element being read.
156 case XMLReader::END_ELEMENT:
// NOTE(review): presumably reached when read() runs out of nodes before a
// closing tag is seen; the result of close() is passed back to the caller.
160 return $this->close();
// Close the input that the XMLReader is currently parsing.
// NOTE(review): the enclosing method header is not visible in this extract;
// presumably this belongs to the close() method that skipTo() and
// nodeContents() call — confirm.
167 $this->reader->close();