* Fix for short_open_tag off again; please don't break this, guys
[mediawiki.git] / maintenance / backupPrefetch.inc
blobf40bc89350c941267f748348cd6b05e913a0aa92
1 <?php
3 /**
4  * Readahead helper for making large MediaWiki data dumps;
5  * reads in a previous XML dump to sequentially prefetch text
6  * records already normalized and decompressed.
7  *
8  * This can save load on the external database servers, hopefully.
9  *
10  * Assumes that dumps will be recorded in the canonical order:
11  * - ascending by page_id
12  * - ascending by rev_id within each page
13  * - text contents are immutable and should not change once
14  *   recorded, so the previous dump is a reliable source
15  *
16  * Requires PHP 5 and the XMLReader PECL extension.
17  */
18 class BaseDump {
/** XMLReader streaming the previous dump; assigned by the constructor */
public $reader = null;

/** Set once the reader has been closed or no further page could be found */
public $atEnd = false;

/** Set when no further revision was found in the current page; cleared on each new page */
public $atPageEnd = false;

/** ID of the page most recently read from the dump (0 before the first page) */
public $lastPage = 0;

/** ID of the revision most recently read from the dump; reset to 0 on each new page */
public $lastRev = 0;
24         
/**
 * Opens the previous XML dump for sequential reading.
 *
 * Declared as __construct() because PHP 8 no longer treats a method
 * named after its class as the constructor; under the old name the
 * reader would never be created there.
 *
 * @param string $infile Path or URI of the previous XML dump
 */
function __construct( $infile ) {
        $this->reader = new XMLReader();
        if( !$this->reader->open( $infile ) ) {
                // Nothing can be prefetched from an unreadable dump. Flagging the
                // stream as finished makes every prefetch() call return null
                // instead of reading from a reader that was never opened.
                $this->atEnd = true;
                $this->debug( "BaseDump::__construct could not open $infile" );
        }
}
29         
/**
 * Looks up the text of one revision in the previous dump.
 *
 * The dump is only ever read forwards: pages are skipped until $page
 * is reached, then revisions until $rev is reached. A request for
 * something the stream has already moved past, or that is not in the
 * dump at all, yields null.
 *
 * @param int $page ID number of page to read
 * @param int $rev ID number of revision to read
 * @return string or null
 */
function prefetch( $page, $rev ) {
        $page = (int)$page;
        $rev = (int)$rev;

        // Walk forward page by page until the wanted page is reached or passed
        while( !( $this->atEnd || $this->lastPage >= $page ) ) {
                $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
                $this->nextPage();
        }
        if( $this->atEnd || $this->lastPage > $page ) {
                $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev  [$this->lastPage, $this->lastRev]" );
                return null;
        }

        // Same again for the revisions inside that page
        while( !( $this->atEnd || $this->atPageEnd || $this->lastRev >= $rev ) ) {
                $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
                $this->nextRev();
        }

        if( $this->atEnd || $this->lastRev != $rev ) {
                $this->debug( "BaseDump::prefetch already past rev $rev on page $page  [$this->lastPage, $this->lastRev]" );
                return null;
        }

        $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
        return $this->nextText();
}
62         
/**
 * Passes a diagnostic message to wfDebug() with a newline appended.
 *
 * @param string $str Message text without a trailing newline
 */
function debug( $str ) {
        $line = $str . "\n";
        wfDebug( $line );
        // Disabled alternative: report through the global $dumper instead.
        //global $dumper;
        //$dumper->progress( $str );
}
68         
/**
 * Moves the reader to the next <page> and records its ID.
 *
 * When </mediawiki> or the end of input comes first, the dump is
 * flagged as finished. A page in which no <id> is found leaves the
 * recorded position untouched.
 *
 * @access private
 */
function nextPage() {
        if( !$this->skipTo( 'page', 'mediawiki' ) ) {
                $this->atEnd = true;
                return;
        }
        if( !$this->skipTo( 'id' ) ) {
                return;
        }
        $pageId = $this->nodeContents();
        $this->lastPage = intval( $pageId );
        // A new page starts with no revision seen yet
        $this->lastRev = 0;
        $this->atPageEnd = false;
}
83         
/**
 * Moves the reader to the next <revision> and records its ID.
 *
 * When no further revision is found, the current page is flagged as
 * exhausted. A revision in which no <id> is found leaves the recorded
 * revision ID untouched.
 *
 * @access private
 */
function nextRev() {
        if( !$this->skipTo( 'revision' ) ) {
                $this->atPageEnd = true;
                return;
        }
        if( !$this->skipTo( 'id' ) ) {
                return;
        }
        $revId = $this->nodeContents();
        $this->lastRev = intval( $revId );
}
96         
/**
 * Reads the <text> of the revision the reader is currently inside.
 *
 * The search stops at the closing </revision>, so a revision without
 * a <text> element (or a second request for a text that was already
 * consumed) yields null rather than the text of a later revision.
 * null is also returned when the dump ends before the text is
 * complete, so a truncated dump is not mistaken for an empty text.
 *
 * @return string or null
 * @access private
 */
function nextText() {
        if( !$this->skipTo( 'text', 'revision' ) ) {
                return null;
        }
        $text = $this->nodeContents();
        // nodeContents() gives null only when the input ran out mid-element
        return is_null( $text ) ? null : strval( $text );
}
104         
/**
 * Advances the reader to the next opening tag called $name.
 *
 * The search is abandoned when the closing tag of $parent is met
 * first, or when the input runs out; in the latter case the reader is
 * closed and the dump is flagged as finished.
 *
 * @param string $name Element to look for
 * @param string $parent Enclosing element that bounds the search
 * @return bool True when positioned on <$name>, false otherwise
 * @access private
 */
function skipTo( $name, $parent='page' ) {
        if( $this->atEnd ) {
                return false;
        }
        while( $this->reader->read() ) {
                // Node types are compared against the XMLReader class constants;
                // the bundled extension does not define global XMLREADER_* names.
                $type = $this->reader->nodeType;
                $tag = $this->reader->name;
                if( $type === XMLReader::ELEMENT && $tag === $name ) {
                        return true;
                }
                if( $type === XMLReader::END_ELEMENT && $tag === $parent ) {
                        $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
                        return false;
                }
        }
        // Input exhausted without finding either tag
        $this->close();
        return false;
}
125         
/**
 * Collects the text content of the element the reader is positioned on.
 *
 * Assumes the element holds no child elements: reading stops at the
 * first closing tag encountered. Text, CDATA and significant
 * whitespace nodes are concatenated; plain (insignificant) whitespace
 * nodes are left out.
 *
 * @return string or null Null when the dump is already finished or
 *   the input runs out before the closing tag
 * @access private
 */
function nodeContents() {
        if( $this->atEnd ) {
                return null;
        }
        if( $this->reader->isEmptyElement ) {
                // <tag/> has no content and no separate closing tag to wait for
                return "";
        }
        $buffer = "";
        while( $this->reader->read() ) {
                switch( $this->reader->nodeType ) {
                case XMLReader::TEXT:
                case XMLReader::CDATA:
                case XMLReader::SIGNIFICANT_WHITESPACE:
                        $buffer .= $this->reader->value;
                        break;
                case XMLReader::END_ELEMENT:
                        return $buffer;
                }
        }
        // Input ran out before the closing tag: discard the partial content
        $this->close();
        return null;
}
154         
/**
 * Shuts the reader down and flags the dump as finished.
 *
 * @return null Always null, so callers can write "return $this->close();"
 * @access private
 */
function close() {
        $this->atEnd = true;
        $this->reader->close();
        return null;
}