Add missing getMagicWords()
[mediawiki.git] / maintenance / rebuildlinks.inc
blob7db35a284f8750c45f7c3e8c650bc6b1f50f5ceb
<?php

# Functions for rebuilding the link tracking tables; must
# be included within a script that also includes the Setup.
# See rebuildlinks.php, for example.
#
# The full "<?php" open tag is used so that the file is still parsed as
# code when the short_open_tag ini setting is disabled.

# Turn this on if you've got memory to burn
$wgUseMemoryTables = false;

# Buffer this many rows before inserting them all in one sweep. More
# than about 1000 will probably not increase speed significantly on
# most setups.
/* private */ $rowbuf_size = 1000; // 1000 rows ~40 kB
# Rebuild the links, brokenlinks and imagelinks tables from the text of
# every row in cur.
#
# The three tables are emptied, then each article's text is scanned for
# [[...]] links. Every distinct local target of a page is written to
#   imagelinks  if the target's prefixed key starts with the Image namespace,
#   brokenlinks if no row in cur matches the target,
#   links       otherwise.
# Interwiki and Special: links are skipped; Media: links are treated as
# Image: links.
#
# Takes no parameters and returns nothing. Progress is printed to standard
# output, and the function sleeps 15 seconds before touching any data so
# the operator can still abort.
function rebuildLinkTables()
{
	global $wgLang, $rowbuf_size;

	error_reporting (E_ALL);

	print "This script may take several hours to complete. If you abort during that time,\n";
	print "your wiki will be in an inconsistent state. If you are going to abort, this is\n";
	print "the time to do it.\n\n";
	print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
	sleep(15);

	$count = 0;
	print "Rebuilding link tables.\n";

	print "Setting AUTOCOMMIT=1\n";
	wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE);

	print "Locking tables\n";
	$sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " .
		"links WRITE, brokenlinks WRITE, imagelinks WRITE";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in links table.\n";
	$sql = "DELETE FROM links";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in brokenlinks table.\n";
	$sql = "DELETE FROM brokenlinks";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in imagelinks table.\n";
	$sql = "DELETE FROM imagelinks";
	wfQuery( $sql, DB_WRITE );

	print "Finding number of articles to process... ";
	$sql = "SELECT COUNT(*) as count FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );
	$total = $obj->count;
	print "$total\n";

	print "Finding highest article id\n";
	$sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );

	# Walk cur in slices of 100 ids so the whole table is never held in RAM.
	$cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " .
		"FROM cur WHERE cur_id ",
		$obj->min, $obj->max, 100);

	$brokenlinks_inserter = new InsertBuffer(
		"INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size);

	$links_inserter = new InsertBuffer(
		"INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size);

	$imagelinks_inserter = new InsertBuffer("INSERT IGNORE INTO imagelinks ".
		"(il_from,il_to) VALUES ", $rowbuf_size);

	print "Starting processing\n";

	# Image namespace name and the length of "<name>:", used below to
	# recognise image links by their prefixed key.
	$ins = $wgLang->getNsText( Namespace::getImage() );
	$inslen = strlen($ins)+1;

	$tc = Title::legalChars();

	# Maps raw link text to an already resolved Title (article id filled in).
	$titleCache = new MRUCache( 10000 );
	$titlecount = 0;
	$start_time = time();

	while ( $row = $cur_pulser->next() ) {

		$from_id = intval($row->cur_id);
		$ns = $wgLang->getNsText( $row->cur_namespace );
		$from_full_title = $row->cur_title;
		if ( "" != $ns ) {
			$from_full_title = "$ns:{$from_full_title}";
		}
		$from_full_title_with_slashes = addslashes( $from_full_title );
		$text = $row->cur_text;

		$numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
		  $m, PREG_PATTERN_ORDER );

		$seen_dbtitles = array(); // "namespace:dbkey" of every target already handled on this page
		$titles_ready_for_insertion = array();
		$titles_needing_curdata = array();
		$titles_needing_curdata_pos = array(); // "namespace:dbkey" => index in $titles_needing_curdata
		$links_corresponding_to_titles = array();

		for ( $i = 0 ; $i < $numlinks; ++$i ) {
			$link = $m[1][$i];
			if( preg_match( '/^(http|https|ftp|mailto|news):/', $link ) ) {
				# an URL link; not for us!
				continue;
			}

			# FIXME: Handle subpage links
			$nt = $titleCache->get( $link );
			$from_cache = ( $nt != false );
			if( !$from_cache ) {
				$nt = Title::newFromText( $link );
				if (! $nt) {
					# $from_full_title already carries the namespace prefix
					print "\nInvalid link in page '{$from_full_title}': '$link'\n";
					continue;
				}
				# These skips come before the duplicate check so that a
				# skipped link (e.g. an interwiki link) cannot mark a local
				# title with the same key and namespace as already seen.
				if( $nt->getInterwiki() != "" ) {
					# Interwiki links are not stored in the link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getSpecial() ) {
					# Special links not stored in link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getMedia() ) {
					# treat media: links as image: links
					$nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
				}
			}

			// Only process each unique link once per page. The separator
			// keeps e.g. key "Foo1" in namespace 0 apart from key "Foo" in
			// namespace 10.
			$nt_key = intval( $nt->getNamespace() ) . ':' . $nt->getDBkey();
			if( isset( $seen_dbtitles[$nt_key] ) ) {
				continue;
			}
			$seen_dbtitles[$nt_key] = 1;

			if( $from_cache ) {
				$titles_ready_for_insertion[] = $nt;
			} else {
				$nt->mArticleID = 0; // assume broken link until proven otherwise

				$pos = array_push($titles_needing_curdata, $nt) - 1;
				$titles_needing_curdata_pos[$nt_key] = $pos;
				$links_corresponding_to_titles[] = $link;
			}
		}

		# Resolve all uncached titles of this page with a single query.
		if ( count( $titles_needing_curdata ) > 0 ){
			$parts = array();
			foreach ($titles_needing_curdata as $nt ) {
				$parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " .
					"cur_title='" . wfStrencode( $nt->getDBkey() ) . "')";
			}
			# cur_namespace is selected too: the same title text can exist
			# in several namespaces, so cur_title alone does not identify
			# which requested title a row belongs to.
			$sql = "SELECT cur_namespace, cur_title, cur_id FROM cur WHERE " .
				implode(" OR ", $parts);
			$res = wfQuery( $sql, DB_WRITE );
			while( $found = wfFetchObject( $res ) ){
				$found_key = intval( $found->cur_namespace ) . ':' . $found->cur_title;
				if( isset( $titles_needing_curdata_pos[$found_key] ) ) {
					$pos = $titles_needing_curdata_pos[$found_key];
					$titles_needing_curdata[$pos]->mArticleID = intval($found->cur_id);
				}
			}
			wfFreeResult( $res );

			for( $k = 0; $k < count( $titles_needing_curdata ) ; $k++) {
				$tmplink = $links_corresponding_to_titles[$k];
				$titleCache->set( $tmplink, $titles_needing_curdata[$k] );
				$titles_ready_for_insertion[] = $titles_needing_curdata[$k];
			}
		}

		foreach ( $titles_ready_for_insertion as $nt ) {
			$dest_noslashes = $nt->getPrefixedDBkey();
			$dest = addslashes( $dest_noslashes );
			$dest_id = $nt->getArticleID();
			$from = $from_full_title_with_slashes;

			# print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n";

			if ( 0 == strncmp( "$ins:", $dest_noslashes, $inslen ) ) {
				# image link: store the image name without the namespace prefix
				$iname = addslashes( substr( $dest_noslashes, $inslen ) );
				$imagelinks_inserter->insert( "('{$from}','{$iname}')" );
			} else if ( 0 == $dest_id ) {
				$brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
			} else {
				$links_inserter->insert( "('{$from}',{$dest_id})" );
			}
			$titlecount++;
		}

		if ( ( $count % 20 ) == 0 )
			print ".";

		if ( ( ++$count % 1000 ) == 0 ) {
			$dt = time() - $start_time;
			$start_time = time();
			$rps = persec(1000, $dt);
			$tps = persec($titlecount, $dt);
			$titlecount = 0;
			print "\n$count of $total articles scanned ({$rps} articles ".
				"and {$tps} titles per second)\n";
			print "Title cache hits: " . $titleCache->getPerformance() . "%\n";

		}

	}

	print "\nFlushing insertion buffers...";
	$imagelinks_inserter->flush();
	$links_inserter->flush();
	$brokenlinks_inserter->flush();
	print "ok\n";

	print "$count articles scanned.\n";

	$sql = "UNLOCK TABLES";
	wfQuery( $sql, DB_WRITE );
	print "Done\n";
}
# Format a rate of $n events in $t seconds for the progress report.
# Returns the string "zero" when nothing happened, "lots of" when no
# measurable time elapsed, and otherwise the rate truncated to an integer.
/* private */ function persec($n, $t){
	if ( 0 == $n ) {
		return "zero";
	}
	return ( 0 == $t ) ? "lots of" : (int)( $n / $t );
}
# InsertBuffer increases performance slightly by inserting many rows
# at once. The gain is small (<5%) when running against a local, idle
# database, but may be significant in other circumstances. It also
# puts an upper bound on the number of rows per statement, which should
# avoid problems with huge articles and certain MySQL settings that limit
# the size of queries. It's also convenient.
class InsertBuffer {
	# mBuf:      value tuples waiting to be written
	# mSql:      statement prefix the joined tuples are appended to
	# mBufcount: number of tuples currently in mBuf
	# mMaxsize:  number of buffered tuples that triggers a flush
	/* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize;

	# PHP 4 style constructor, kept so the class still works there. It
	# delegates to __construct(), which is the one newer PHP versions call.
	function InsertBuffer( $sql, $bufsize ){
		$this->__construct( $sql, $bufsize );
	}

	# $sql:     statement prefix, e.g. "INSERT ... VALUES "
	# $bufsize: how many rows to collect before writing them out
	function __construct( $sql, $bufsize ){
		$this->mSql = $sql;
		$this->mBuf = array();
		$this->mBufcount = 0;
		$this->mMaxsize = $bufsize;
	}

	# Queue one value tuple, e.g. "(1,'Foo')". The caller is responsible
	# for escaping. The buffer is written out as soon as it holds
	# mMaxsize tuples, so no statement carries more rows than that.
	function insert( $value ){
		// print $this->mSql . " -> " . $value . "\n";
		$this->mBuf[] = $value;
		$this->mBufcount++;
		if($this->mBufcount >= $this->mMaxsize){
			$this->flush();
		}
	}

	# Write all queued tuples in a single statement and empty the buffer.
	# Does nothing when the buffer is empty.
	function flush(){
		if( $this->mBufcount > 0 ){
			$sql = $this->mSql . implode(",", $this->mBuf);
			wfQuery( $sql, DB_WRITE );
			$this->mBuf = array();
			$this->mBufcount = 0;
			// print "Wrote query of size " . strlen( $sql ) . "\n";
		}
	}

}
# Select parts from a large table by using the "BETWEEN X AND Y"
# operator on the id column. Avoids buffering the whole thing in
# RAM. It's also convenient.
class SelectPulser {
	# mSql:     query text ending in the id column expression
	# mSetsize: width of the id range fetched per query
	# mPos:     first id of the next range to fetch
	# mMax:     highest id to fetch
	# mSet:     rows of the current range not yet handed out
	/* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;

	# PHP 4 style constructor, kept so the class still works there. It
	# delegates to __construct(), which is the one newer PHP versions call.
	function SelectPulser( $sql, $min, $max, $setsize) {
		$this->__construct( $sql, $min, $max, $setsize );
	}

	# $sql:     query that " BETWEEN a AND b" can be appended to
	# $min:     lowest id to fetch
	# $max:     highest id to fetch
	# $setsize: number of ids covered by each query
	function __construct( $sql, $min, $max, $setsize) {
		$this->mSql = $sql;
		$this->mSet = array();
		if( is_null( $min ) || is_null( $max ) ){
			# MIN()/MAX() of an empty table are NULL: use an empty range
			# so that next() returns false without running a query.
			$this->mPos = 1;
			$this->mMax = 0;
		} else {
			$this->mPos = intval( $min );
			$this->mMax = intval( $max );
		}
		# A step below 1 would never advance mPos.
		$setsize = intval( $setsize );
		$this->mSetsize = ( $setsize < 1 ? 1 : $setsize );
	}

	# Return the next row object, or false once ids up to mMax have been
	# covered.
	function next(){
		# Hand out the buffered rows first, using the array's internal pointer.
		$result = current( $this->mSet );
		next( $this->mSet );
		if( false !== $result ){
			return $result;
		}
		# Buffer exhausted: fetch following id ranges until one yields rows.
		while( $this->mPos <= $this->mMax ){
			$this->mSet = array();
			$sql = $this->mSql . " BETWEEN " . $this->mPos .
				" AND " . ($this->mPos + $this->mSetsize - 1);
			$this->mPos += $this->mSetsize;

			$res = wfQuery( $sql, DB_READ );
			while ( $row = wfFetchObject( $res ) ) {
				$this->mSet[] = $row;
			}
			wfFreeResult( $res );
			if( count( $this->mSet ) > 0 ){
				return $this->next();
			}
		}
		return false;
	}
}
# A simple MRU for general caching.
class MRUCache {
	# mMru:       key => sequence number of the last get()/set() on that key
	# mCache:     key => cached value
	# mSize:      number of entries purge() trims the cache down to
	# mPurgefreq: purge() is attempted when nexti is a multiple of this
	# nexti:      next sequence number to hand out
	/* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti;
	/* private */ var $hits = 0, $misses = 0;

	# PHP 4 style constructor, kept so the class still works there. It
	# delegates to __construct(), which is the one newer PHP versions call.
	function MRUCache( $size, $purgefreq = -1 ) {
		$this->__construct( $size, $purgefreq );
	}

	# $size:      number of entries kept after a purge
	# $purgefreq: purge interval; -1 (the default) means one tenth of
	#             $size, and anything that ends up below 1 becomes 1
	function __construct( $size, $purgefreq = -1 ) {
		// purgefreq is 1/10 of $size if not stated
		$purgefreq = ($purgefreq == -1 ? intval($size/10) : $purgefreq);
		$purgefreq = ($purgefreq <= 0 ? 1 : $purgefreq);

		$this->mSize = $size;
		$this->mMru = array();
		$this->mCache = array();
		$this->mPurgefreq = $purgefreq;
		$this->nexti = 1;
		$this->hits = 0;
		$this->misses = 0;
		print "purgefreq = " . $this->mPurgefreq . "\n";
	}

	# Return the value stored under $key, or false if there is none.
	# A hit also marks the entry as most recently used.
	function get( $key ){
		if ( ! array_key_exists( $key, $this->mCache) ){
			$this->misses++;
			return false;
		}
		$this->hits++;
		$this->mMru[$key] = $this->nexti++;
		return $this->mCache[$key];
	}

	# Store $value under $key and mark it as most recently used.
	function set( $key, $value ){
		$this->mMru[$key] = $this->nexti++;
		$this->mCache[$key] = $value;

		if($this->nexti % $this->mPurgefreq == 0)
			$this->purge();
	}

	# Drop the least recently used entries until at most mSize remain.
	function purge(){
		$to_remove = count( $this->mMru ) - $this->mSize;
		if( $to_remove <= 0 ){
			return;
		}
		# Oldest sequence numbers first.
		asort( $this->mMru );
		# The entries are removed with unset() rather than array_splice():
		# array_splice() renumbers integer keys, and PHP stores numeric
		# string keys such as "1999" as integers, so those entries would
		# stay in mCache while mMru lost track of them.
		$victims = array_slice( array_keys( $this->mMru ), 0, $to_remove );
		foreach( $victims as $key ){
			unset( $this->mMru[$key] );
			unset( $this->mCache[$key] );
		}
	}

	# Return the share of get() calls that were hits, as an integer
	# percentage; 0 if get() has not been called yet.
	function getPerformance(){
		$tot = $this->hits + $this->misses;
		if($tot > 0)
			return intval(100.0 * $this->hits / $tot);
		else
			return 0;
	}
}