simple script to generate phpdocumentor documentation and troubleshoot its generation
[mediawiki.git] / maintenance / rebuildlinks.inc
blob12148d6b234ccf6325886992240b32a4bb5e0ef6
1 <?php
2 /**
3  * Functions for rebuilding the link tracking tables; must
4  * be included within a script that also includes the Setup.
5  * See rebuildlinks.php for an example.
6  *
7  * @deprecated
8  * @todo document
9  * @package MediaWiki
10  * @subpackage Maintenance
11  */
13 /** */
# Execution stops here as soon as this file is included: die() prints the
# message and terminates the script, so the assignment below never runs
# until this guard is removed.
die( "rebuildLinks.inc needs to be updated for the new schema\n" );

# Buffer this many rows before inserting them all in one sweep. More
# than about 1000 will probably not increase speed significantly on
# most setups.
# Read by rebuildLinkTables() through `global $rowbuf_size` and handed to
# each InsertBuffer as its buffer size.
/* private */ $rowbuf_size = 1000; // 1000 rows ~40 kB
/**
 * Rebuild the links, brokenlinks and imagelinks tables from scratch by
 * scanning the text of every row in cur.
 *
 * Outline:
 *  1. Warn the operator and wait 15 seconds.
 *  2. Copy (cur_namespace, cur_title, cur_id) into an indexed temporary
 *     table, cur_fast, used for title -> id lookups.
 *  3. Lock the tables involved and empty the three link tables.
 *  4. Walk cur in id ranges of 100 (SelectPulser), extract every [[...]]
 *     target from cur_text, resolve each target to an article id (through
 *     an MRUCache of Title objects, falling back to one cur_fast query per
 *     article) and queue a row in the matching InsertBuffer.
 *  5. Flush the buffers and unlock the tables.
 *
 * Progress is printed to standard output. Nothing is returned.
 *
 * NOTE(review): `Namespace` is a reserved word from PHP 5.3 onwards, so the
 * Namespace:: calls below will not parse there. They are left untouched
 * because the replacement API is not visible from this file; port them
 * together with the schema update.
 */
function rebuildLinkTables()
{
	global $wgLang, $rowbuf_size;

	error_reporting( E_ALL );

	print "This script may take several hours to complete. If you abort during that time,\n";
	print "your wiki will be in an inconsistent state. If you are going to abort, this is\n";
	print "the time to do it.\n\n";
	print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
	sleep( 15 );

	$count = 0;
	print "Rebuilding link tables.\n";

	print "Setting AUTOCOMMIT=1\n";
	wfQuery( "SET SESSION AUTOCOMMIT=1", DB_MASTER );

	print "Extracting often used data from cur (may take a few minutes)\n";
	wfQuery( "CREATE TEMPORARY TABLE cur_fast SELECT cur_namespace, cur_title, cur_id FROM cur", DB_MASTER );
	wfQuery( "ALTER TABLE cur_fast ADD INDEX(cur_namespace, cur_title)", DB_MASTER );

	print "Locking tables\n";
	wfQuery(
		"LOCK TABLES cur READ, cur_fast READ, interwiki READ, user_newtalk READ, " .
		"links WRITE, brokenlinks WRITE, imagelinks WRITE",
		DB_MASTER
	);

	foreach ( array( 'links', 'brokenlinks', 'imagelinks' ) as $table ) {
		print "Deleting old data in $table table.\n";
		wfQuery( "DELETE FROM $table", DB_MASTER );
	}

	print "Finding number of articles to process... ";
	$res = wfQuery( "SELECT COUNT(*) as count FROM cur", DB_SLAVE );
	$obj = wfFetchObject( $res );
	$total = $obj->count;
	print "$total\n";

	print "Finding highest article id\n";
	$res = wfQuery( "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur", DB_SLAVE );
	$obj = wfFetchObject( $res );

	// The pulser appends " BETWEEN x AND y" to this fragment for each batch.
	$cur_pulser = new SelectPulser(
		"SELECT cur_id,cur_namespace,cur_title,cur_text " .
		"FROM cur WHERE cur_id ",
		$obj->min, $obj->max, 100 );

	$brokenlinks_inserter = new InsertBuffer(
		"INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size );

	$links_inserter = new InsertBuffer(
		"INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size );

	$imagelinks_inserter = new InsertBuffer(
		"INSERT IGNORE INTO imagelinks (il_from,il_to) VALUES ", $rowbuf_size );

	print "Starting processing\n";

	// Localised image namespace name; $inslen also covers the trailing ':'.
	$ins = $wgLang->getNsText( Namespace::getImage() );
	$inslen = strlen( $ins ) + 1;

	$tc = Title::legalChars();

	$titleCache = new MRUCache( 10000 );
	$titlecount = 0;
	$start_time = time();

	while ( $row = $cur_pulser->next() ) {

		$from_id = intval( $row->cur_id );
		$ns = $wgLang->getNsText( $row->cur_namespace );
		$from_full_title = $row->cur_title;
		if ( "" != $ns ) {
			$from_full_title = "$ns:{$from_full_title}";
		}
		$from_full_title_with_slashes = addslashes( $from_full_title );
		$text = $row->cur_text;

		$numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
			$m, PREG_PATTERN_ORDER );

		$seen_dbtitles = array(); // seen links, keyed "namespace:dbkey"
		$titles_ready_for_insertion = array();
		$titles_needing_curdata = array();
		$titles_needing_curdata_pos = array();
		$links_corresponding_to_titles = array();

		for ( $i = 0; $i < $numlinks; ++$i ) {
			$link = $m[1][$i];
			if ( preg_match( '/^(http|https|ftp|mailto|news):/', $link ) ) {
				# an URL link; not for us!
				continue;
			}

			# FIXME: Handle subpage links
			$nt = $titleCache->get( $link );
			if ( $nt != false ) {
				// Cached titles already carry their article id.
				// Only process each unique link once per page.
				// The ':' separator keeps e.g. ("Foo1", ns 0) and ("Foo", ns 10)
				// apart; plain concatenation mapped both to "Foo10".
				$nt_key = $nt->getNamespace() . ':' . $nt->getDBkey();
				if ( isset( $seen_dbtitles[$nt_key] ) ) {
					continue;
				}
				$seen_dbtitles[$nt_key] = 1;

				$titles_ready_for_insertion[] = $nt;
				continue;
			}

			$nt = Title::newFromText( $link );
			if ( !$nt ) {
				// Invalid link, probably something like "[[  ]]"
				continue;
			}

			// Only process each unique link once per page
			$nt_key = $nt->getNamespace() . ':' . $nt->getDBkey();
			if ( isset( $seen_dbtitles[$nt_key] ) ) {
				continue;
			}
			$seen_dbtitles[$nt_key] = 1;

			if ( $nt->getInterwiki() != "" ) {
				# Interwiki links are not stored in the link tables
				continue;
			}
			if ( $nt->getNamespace() == Namespace::getSpecial() ) {
				# Special links not stored in link tables
				continue;
			}
			if ( $nt->getNamespace() == Namespace::getMedia() ) {
				# treat media: links as image: links
				$nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
			}
			$nt->mArticleID = 0; // assume broken link until proven otherwise

			// Remember where this title sits so the cur_fast result rows
			// can be matched back to it. Keyed after the media -> image
			// rewrite, because that is the namespace the query will return.
			$pos = array_push( $titles_needing_curdata, $nt ) - 1;
			$titles_needing_curdata_pos[$nt->getNamespace() . ':' . $nt->getDBkey()] = $pos;
			$links_corresponding_to_titles[] = $link;
		}

		if ( count( $titles_needing_curdata ) > 0 ) {
			// Resolve all uncached titles of this article in a single query.
			$parts = array();
			foreach ( $titles_needing_curdata as $nt ) {
				$parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " .
					"cur_title='" . wfStrencode( $nt->getDBkey() ) . "')";
			}
			$sql = "SELECT cur_namespace, cur_title, cur_id FROM cur_fast WHERE " .
				implode( " OR ", $parts );
			$res = wfQuery( $sql, DB_MASTER );
			while ( $curRow = wfFetchObject( $res ) ) {
				$curKey = $curRow->cur_namespace . ':' . $curRow->cur_title;
				if ( !isset( $titles_needing_curdata_pos[$curKey] ) ) {
					// A row that maps to no requested title must not create a
					// stray array element; that would desynchronise the
					// parallel arrays walked below.
					continue;
				}
				$pos = $titles_needing_curdata_pos[$curKey];
				$titles_needing_curdata[$pos]->mArticleID = intval( $curRow->cur_id );
			}
			wfFreeResult( $res );

			$needed = count( $titles_needing_curdata );
			for ( $k = 0; $k < $needed; $k++ ) {
				$titleCache->set( $links_corresponding_to_titles[$k], $titles_needing_curdata[$k] );
				$titles_ready_for_insertion[] = $titles_needing_curdata[$k];
			}
		}

		foreach ( $titles_ready_for_insertion as $nt ) {
			$dest_noslashes = $nt->getPrefixedDBkey();
			$dest = addslashes( $dest_noslashes );
			$dest_id = $nt->getArticleID();
			$from = $from_full_title_with_slashes;

			# print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n";

			if ( 0 == strncmp( "$ins:", $dest_noslashes, $inslen ) ) {
				// Image link: store the image name without its namespace prefix.
				$iname = addslashes( substr( $dest_noslashes, $inslen ) );
				$imagelinks_inserter->insert( "('{$from}','{$iname}')" );
			} else if ( 0 == $dest_id ) {
				// Target does not exist.
				$brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
			} else {
				$links_inserter->insert( "('{$from}',{$dest_id})" );
			}
			$titlecount++;
		}

		if ( ( $count % 20 ) == 0 ) {
			print ".";
		}

		if ( ( ++$count % 1000 ) == 0 ) {
			$dt = time() - $start_time;
			$start_time = time();
			$rps = persec( 1000, $dt );
			$tps = persec( $titlecount, $dt );
			$titlecount = 0;
			print "\n$count of $total articles scanned ({$rps} articles ".
				"and {$tps} titles per second)\n";
			print "Title cache hits: " . $titleCache->getPerformance() . "%\n";
		}
	}

	print "\nFlushing insertion buffers...";
	$imagelinks_inserter->flush();
	$links_inserter->flush();
	$brokenlinks_inserter->flush();
	print "ok\n";

	print "$count articles scanned.\n";

	wfQuery( "UNLOCK TABLES", DB_MASTER );
	print "Done\n";
}
/**
 * Format a rate of $n events over $t seconds for the progress output.
 *
 * @param int $n number of events
 * @param int $t elapsed seconds
 * @return mixed "zero" when $n is 0, "lots of" when $t is 0, otherwise
 *               the integer part of $n / $t
 */
/* private */ function persec($n, $t)
{
	if ( 0 == $n ) {
		$rate = "zero";
	} elseif ( 0 == $t ) {
		// Avoid dividing by zero when the batch took less than a second.
		$rate = "lots of";
	} else {
		$rate = intval( $n / $t );
	}
	return $rate;
}
/**
 * InsertBuffer increases performance slightly by inserting many rows
 * at once. The gain is small (<5%) when running against a local, idle
 * database, but may be significant in other circumstances. It also
 * puts an upper limit on the number of rows per statement, which should
 * avoid problems with huge articles and certain MySQL settings that limit
 * the size of queries. It's also convenient.
 *
 * Usage: construct with the statement prefix up to and including
 * "VALUES ", call insert() with one "(...)" tuple per row, and call
 * flush() once at the end to write whatever is still buffered.
 *
 * @deprecated
 * @package MediaWiki
 * @subpackage Maintenance
 */
class InsertBuffer {
	/* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize;

	/**
	 * @param string $sql     statement prefix, e.g. "INSERT INTO t (a,b) VALUES "
	 * @param int    $bufsize maximum number of rows sent in one statement
	 */
	function __construct( $sql, $bufsize ) {
		$this->mSql = $sql;
		$this->mBuf = array();
		$this->mBufcount = 0;
		$this->mMaxsize = $bufsize;
	}

	/**
	 * PHP 4 style constructor. PHP 8 no longer treats a method named after
	 * its class as a constructor, so the real work lives in __construct()
	 * and this only forwards for interpreters that still call it.
	 */
	function InsertBuffer( $sql, $bufsize ) {
		$this->__construct( $sql, $bufsize );
	}

	/**
	 * Queue one row and write the buffer out once it holds $bufsize rows.
	 *
	 * @param string $value a complete, already escaped "(v1,v2,...)" tuple
	 */
	function insert( $value ) {
		// print $this->mSql . " -> " . $value . "\n";
		$this->mBuf[] = $value;
		$this->mBufcount++;
		// ">=" rather than ">": the buffer must never exceed the configured
		// size, otherwise each statement carries one row more than asked for.
		if ( $this->mBufcount >= $this->mMaxsize ) {
			$this->flush();
		}
	}

	/**
	 * Send all buffered rows in a single statement and empty the buffer.
	 * Does nothing when the buffer is empty.
	 */
	function flush() {
		if ( $this->mBufcount > 0 ) {
			$sql = $this->mSql . implode( ",", $this->mBuf );
			wfQuery( $sql, DB_MASTER );
			$this->mBuf = array();
			$this->mBufcount = 0;
			// print "Wrote query of size " . strlen( $sql ) . "\n";
		}
	}
}
/**
 * Select parts from a large table by using the "BETWEEN X AND Y"
 * operator on the id column. Avoids buffering the whole thing in
 * RAM. It's also convenient.
 *
 * The SQL passed in must end with the id column of a WHERE clause (for
 * example "SELECT ... FROM cur WHERE cur_id "); each batch appends
 * " BETWEEN <from> AND <to>" to it.
 *
 * @deprecated
 * @package MediaWiki
 * @subpackage Maintenance
 */
class SelectPulser {
	/* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;

	/**
	 * @param string   $sql     query prefix ending in the id column name
	 * @param int|null $min     lowest id to fetch (null: nothing to fetch)
	 * @param int|null $max     highest id to fetch (null: nothing to fetch)
	 * @param int      $setsize width of the id range fetched per query
	 */
	function __construct( $sql, $min, $max, $setsize ) {
		$this->mSql = $sql;
		$this->mSet = array();
		$this->mPos = $min;
		$this->mMax = $max;
		// A non-positive step would never advance mPos and loop forever.
		$this->mSetsize = ( $setsize < 1 ) ? 1 : $setsize;
	}

	/**
	 * PHP 4 style constructor. PHP 8 no longer treats a method named after
	 * its class as a constructor, so the real work lives in __construct()
	 * and this only forwards for interpreters that still call it.
	 */
	function SelectPulser( $sql, $min, $max, $setsize ) {
		$this->__construct( $sql, $min, $max, $setsize );
	}

	/**
	 * Return the next row, fetching further id ranges as needed.
	 *
	 * @return mixed the next row object, or false once every id range up to
	 *               the maximum has been consumed
	 */
	function next() {
		$row = current( $this->mSet );
		while ( false === $row ) {
			// MIN()/MAX() over an empty table yield NULL; without this check
			// the query below would read "BETWEEN  AND 99" and fail.
			if ( null === $this->mPos || null === $this->mMax || $this->mPos > $this->mMax ) {
				return false;
			}

			$sql = $this->mSql . " BETWEEN " . $this->mPos .
				" AND " . ( $this->mPos + $this->mSetsize - 1 );
			$this->mPos += $this->mSetsize;

			$this->mSet = array();
			$res = wfQuery( $sql, DB_SLAVE );
			while ( $fetched = wfFetchObject( $res ) ) {
				$this->mSet[] = $fetched;
			}
			wfFreeResult( $res );

			// An id range may be empty (gaps from deleted rows); in that case
			// reset() returns false and the loop moves on to the next range.
			$row = reset( $this->mSet );
		}
		next( $this->mSet );
		return $row;
	}
}
/**
 * A simple MRU (most recently used) cache for general caching.
 *
 * Every get() hit and every set() stamps the key with an increasing
 * counter. Every $purgefreq-th counter value reached by set() triggers
 * purge(), which drops the entries with the oldest stamps until at most
 * $size remain. Between purges the cache may therefore temporarily hold
 * more than $size entries.
 *
 * Values must not be false, because get() returns false for a miss.
 *
 * @deprecated
 * @package MediaWiki
 * @subpackage Maintenance
 */
class MRUCache {
	/* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti;
	/* private */ var $hits, $misses;

	/**
	 * @param int $size      number of entries kept after a purge
	 * @param int $purgefreq purge interval in counter ticks; -1 (default)
	 *                       means one tenth of $size, and anything that ends
	 *                       up <= 0 is raised to 1
	 */
	function __construct( $size, $purgefreq = -1 ) {
		// purgefreq is 1/10 of $size if not stated
		if ( $purgefreq == -1 ) {
			$purgefreq = intval( $size / 10 );
		}
		if ( $purgefreq <= 0 ) {
			$purgefreq = 1;
		}

		$this->mSize = $size;
		$this->mMru = array();
		$this->mCache = array();
		$this->mPurgefreq = $purgefreq;
		$this->nexti = 1;
		$this->hits = 0;
		$this->misses = 0;
		print "purgefreq = " . $this->mPurgefreq . "\n";
	}

	/**
	 * PHP 4 style constructor. PHP 8 no longer treats a method named after
	 * its class as a constructor, so the real work lives in __construct()
	 * and this only forwards for interpreters that still call it.
	 */
	function MRUCache( $size, $purgefreq = -1 ) {
		$this->__construct( $size, $purgefreq );
	}

	/**
	 * Look up $key and mark it as most recently used.
	 *
	 * @return mixed the cached value, or false when the key is not cached
	 */
	function get( $key ) {
		if ( !array_key_exists( $key, $this->mCache ) ) {
			$this->misses++;
			return false;
		}
		$this->hits++;
		$this->mMru[$key] = $this->nexti++;
		return $this->mCache[$key];
	}

	/**
	 * Store $value under $key, mark it as most recently used, and purge
	 * when the counter reaches a multiple of the purge frequency.
	 */
	function set( $key, $value ) {
		$this->mMru[$key] = $this->nexti++;
		$this->mCache[$key] = $value;

		if ( $this->nexti % $this->mPurgefreq == 0 ) {
			$this->purge();
		}
	}

	/**
	 * Evict the least recently used entries until at most $size remain.
	 */
	function purge() {
		$to_remove = count( $this->mMru ) - $this->mSize;
		if ( $to_remove <= 0 ) {
			return;
		}
		// Oldest stamps first.
		asort( $this->mMru );
		// Remove by key. array_splice() must not be used here: PHP stores
		// numeric-string keys such as "1999" as integers and array_splice()
		// renumbers integer keys, which used to scramble mMru and leave the
		// matching mCache entries in place forever.
		foreach ( array_keys( $this->mMru ) as $key ) {
			if ( $to_remove <= 0 ) {
				break;
			}
			unset( $this->mMru[$key] );
			unset( $this->mCache[$key] );
			$to_remove--;
		}
	}

	/**
	 * @return int hit rate as a whole percentage (0 when nothing was looked up)
	 */
	function getPerformance() {
		$tot = $this->hits + $this->misses;
		if ( $tot > 0 ) {
			return intval( 100.0 * $this->hits / $tot );
		}
		return 0;
	}
}