maintenance/archives/importUseModWiki.php

   1 <?php
   2 /**
   3  * @deprecated
   4  * @package MediaWiki
   5  * @subpackage MaintenanceArchive
   6  */
   7
   8 /** */
   9 print "This script is obsolete!";
  10 print "It is retained in the source here in case some of its
  11 code might be useful for ad-hoc conversion tasks, but it is
  12 not maintained and probably won't even work as is.";
  13 exit();
  14
  15 /*
  16         Import data from a UseModWiki into a PediaWiki wiki
  17         2003-02-09 Brion VIBBER <brion@pobox.com>
  18         Based loosely on Magnus's code from 2001-2002
  19
  20           Pass one: collect data on links & title case, users
          Pass two: spit out SQL for the users and pages
  22           Separately, be sure to run the link & index rebuilding scripts!
  23
  24   */
  25
  26 /* globals
  27         */
  28 $wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
  29 $wgFieldSeparator = "\xb3"; # Some wikis may use different char
  30         $FS = $wgFieldSeparator ;
  31         $FS1 = $FS."1" ;
  32         $FS2 = $FS."2" ;
  33         $FS3 = $FS."3" ;
  34
  35 # Images to import
  36 $imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';
  37
  38 # Number of *seconds to add* to timestamp to get UTC/GMT
  39 #$wgTimezoneCorrection = 0;             # GMT
  40 $wgTimezoneCorrection = 8*3600; # PST - California
  41
  42 # Other options...
  43 $historyonly = false;           # Don't add converted revisions to cur table; just get old histories
  44 $lasthistoryonly = false;       # Only add the _original_ form of the _current_ revision
  45
  46 /* Vary by language */
  47 $namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
  48 => "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
  49 $talkending = "Talk";
  50 $mediatext = "Media";
  51 $conversionscript = "Conversion script";
  52 $conversioncomment = "Automatic conversion";
  53 $redirectcomment = "Automatic converion, moved to \$1";
  54 $conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp
  55
  56 # Stats and caches
  57 $oldtitles = array();
  58 $usercache = array();
  59 $titlecache = array();
  60 $linkcache = array();
  61
  62 /**
  63  * Some oversimplified test types
  64  *
  65  * @deprecated
  66  * @package MediaWiki
  67  * @subpackage MaintenanceArchive
  68  */
  69 class Title {
  70         var $title, $namespace;
  71         function fromData( $namespace, $title ) {
  72                 $x = new Title;
  73                 $x->namespace = $namespace;
  74                 $x->title = $title;
  75                 return $x;
  76         }
  77 }
  78
  79 # See tests in importTests.php
  80 if( ! $testingonly ) {
  81         firstPass();
  82         secondPass();
  83 }
  84
  85 # ------------------------------------------------------------------------------
  86
  87 /* First pass:
  88         Information please!
  89         */
  90 function firstPass()
  91 {
  92         global $wgRootDirectory, $oldtitles;
  93
  94         $letters = array(
  95                 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
  96                 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
  97                 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
  98         foreach( $letters as $letter ) {
  99                 firstPassDirectory( "$wgRootDirectory/page/$letter" );
 100         }
 101 }
 102
 103 function firstPassDirectory( $dir )
 104 {
 105         global $titlecache;
 106
 107         $mydir = opendir( $dir );
 108         while( $entry = readdir( $mydir ) ) {
 109                 if( $entry != '.' && $entry != '..' ) {
 110                         if( is_dir( "$dir/$entry" ) ) {
 111                                 firstPassDirectory( "$dir/$entry" );
 112                         }
 113                 } elseif( preg_match( '/$(.+)\.db$/', $entry, $m ) ) {
 114                         $titlecache[$title] = transformTitle( $m[1] );
 115                         countLinksFrom( $title );
 116                 } else {
 117                         echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
 118                 }
 119         }
 120 }
 121
 122 /* Second pass:
 123         make the dang SQL
 124         */
 125 function secondPass()
 126 {
 127         global $titlecache, $usercache, $redirects;
 128
 129         foreach( $usercache as $oldname => $user ) {
 130                 echo importUser( $oldname );
 131         }
 132         foreach( $titlecache as $oldtitle => $newtitle ) {
 133                 echo importPage( $oldtitle );
 134         }
 135
 136         echo "\n-- Done!\n";
 137 }
 138
 139
 140 # ------------------------------------------------------------------------------
 141
 142 /* fetch_ functions
 143         Grab a given item from the database
 144         */
 145 function fetchUser( $uid )
 146 {
 147         global $FS,$FS2,$FS3, $wgRootDirectory;
 148
 149         $fname = $wgRootDirectory . "/pages/" . $title;
 150         if( !file_exists( $fname ) ) return false;
 151
 152         $data = splitHash( implode( "", file( $fname ) ) );
 153         # enough?
 154
 155         return $data;
 156 }
 157
 158 function fetchPage( $title )
 159 {
 160         global $FS,$FS2,$FS3, $wgRootDirectory;
 161
 162         $fname = $wgRootDirectory . "/pages/" . $title;
 163         if( !file_exists( $fname ) ) return false;
 164
 165         $page = splitHash( implode( "", file( $fname ) ) );
 166         $section = splitHash( $FS2, $page["text_default"] );
 167         $text = splitHash( $FS3, $section["data"] );
 168
 169         return array ( "text" => $text["text"] , "summary" => $text["summary"] ,
 170                 "minor" => $text["minor"] , "ts" => $section["ts"] ,
 171                 "username" => $section["username"] , "host" => $section["host"] ) ;
 172 }
 173
 174 function fetchKeptPages( $title )
 175 {
 176         global $FS,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;
 177
 178         $fname = $wgRootDirectory . "/keep/" . $title . ".kp";
 179         if( !file_exists( $fname ) ) return array();
 180
 181         $keptlist = explode( $FS1, implode( "", file( $fname ) ) );
 182         array_shift( $keptlist ); # Drop the junk at beginning of file
 183
 184         $revisions = array();
 185         foreach( $keptlist as $rev ) {
 186                 $section = splitHash( $FS2, $rev );
 187                 $text = splitHash( $FS3, $section["data"] );
 188                 if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
 189                         array_push( $revisions, array ( "text" => $text["text"] , "summary" => $text["summary"] ,
 190                                 "minor" => $text["minor"] , "ts" => $section["ts"] ,
 191                                 "username" => $section["username"] , "host" => $section["host"] ) );
 192                 } else {
 193                         echo "-- skipped a bad old revision\n";
 194                 }
 195         }
 196         return $revisions;
 197 }
 198
 199 function splitHash ( $sep , $str ) {
 200         $temp = explode ( $sep , $str ) ;
 201         $ret = array () ;
 202         for ( $i = 0; $i+1 < count ( $temp ) ; $i++ ) {
 203                 $ret[$temp[$i]] = $temp[++$i] ;
 204                 }
 205         return $ret ;
 206         }
 207
 208
 209 /* import_ functions
 210         Take a fetched item and produce SQL
 211         */
 212
 213 /* importUser
 214         $uid is the UseMod user id number.
 215         The new ones will be assigned arbitrarily and are for internal use only.
 216
 217         THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
 218         */
 219 function importUser( $uid )
 220 {
 221         global $last_uid, $user_list, $wgTimestampCorrection;
 222
 223         return "";
 224
 225         $stuff = fetchUser( $uid );
 226         $last_uid++;
 227
 228         $name = wfStrencode( $stuff->username );
 229         $hash = md5hash( $stuff->password ); # Doable?
 230         $tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
 231         $hideminor = ($stuff['rcall'] ? 0 : 1);
 232         $options = "cols={$stuff['editcols']}
 233 rows={$stuff['editrows']}
 234 rcdays={$stuff['rcdays']}
 235 timecorrection={$tzoffset}
 236 hideminor={$hideminor}
 237         ";
 238
 239         $sql = "INSERT
 240                 INTO user (user_id,user_name,user_password,user_options)
 241                 VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
 242         return $sql;
 243 }
 244
 245 function checkUserCache( $name, $host )
 246 {
 247         global $usercache;
 248
 249         if( $name ) {
 250                 if( in_array( $name, $usercache ) ) {
 251                         $userid = $usercache[$name];
 252                 } else {
 253                         # If we haven't imported user accounts
 254                         $userid = 0;
 255                 }
 256                 $username = wfStrencode( $name );
 257         } else {
 258                 $userid = 0;
 259                 $username = wfStrencode( $host );
 260         }
 261         return array( $userid, $username );
 262 }
 263
 264 function importPage( $title )
 265 {
 266         global $wgTimezoneCorrection, $titlecache, $usercache;
 267         global $conversionscript, $conversioncomment, $conversiontime;
 268         global $historyonly, $lasthistoryonly;
 269
 270         $page = fetchPage( $title );
 271
 272         $newtext = wfStrencode( rewritePage( $title, $page->text ) );
 273         $t = renamePage( $title );
 274         $newtitle = wfStrencode( $t->title );
 275         $namespace = $t->namespace;
 276
 277         # Current revision:
 278         $text = wfStrencode( $page->text );
 279         $minor = ($page->minor ? 1 : 0);
 280         list( $userid, $username ) = checkUserCache( $page->username, $page->host );
 281         $timestamp = wfUnix2Timestamp( $page->timestamp + $wgTimezoneCorrection );
 282         $redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
 283         $sql = "\n";
 284         if( !$historyonly ) {
 285                 $sql .= "INSERT
 286                 INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
 287                 VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
 288         }
 289         $sql .= "INSERT
 290                 INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
 291                 VALUES";
 292         $sqlfinal = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)\n";
 293
 294         # History
 295         if( !$lasthistoryonly ) {
 296                 $revisions = fetchKeptPages( $title );
 297                 foreach( $revisions as $rev ) {
 298                         $text = wfStrencode( $rev->text );
 299                         $minor = ($rev->minor ? 1 : 0);
 300                         list( $userid, $username ) = checkUserCache( $rev->username, $rev->host );
 301                         $timestamp = wfUnix2Timestamp( $rev->timestamp + $wgTimezoneCorrection );
 302                         $sql .= "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$redirect,$minor),\n";
 303                 }
 304         }
 305         return $sql . $sqlfinal;
 306 }
 307
 308
 309 # Count up basic links
 310 function countLinksFrom( $title )
 311 {
 312         $page = fetchPage( $title );
 313         $page->text = preg_replace(
 314                 '/<nowiki>.*<\/nowiki>/sDU',
 315                 '',
 316                 $page->text );
 317         $page->text = preg_replace(
 318                 '/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/e',
 319                 'countLinkTo( ucfirst( "$1" ) )',
 320                 $page->text );
 321 }
 322
 323 function countLinkTo( $title )
 324 {
 325         global $linkcache;
 326         $t = transformTitle( $title );
 327         $linkform = FreeToNormal( $t->title );
 328         $x = $linkcache[$title];
 329         if ( count ( $x ) ) {
 330                 $y = $x[$linkform] ;
 331                 if ( $y ) $y++; else $y = 1 ;
 332                 $x[$linkform] = $y ;
 333         } else {
 334                 $x = array ( $linkform => 1 ) ;
 335         }
 336         $linkcache[$title] = $x;
 337 }
 338
 339 # Preferentially change case
 340 function renamePage( $title )
 341 {
 342         global $linkcache;
 343         $t = transformTitle( $title );
 344
 345         # We want to use the most frequently linked-to form as the title
 346         $maxcount = 0 ; $maxform = $t->title ;
 347         foreach ( $linkcache[$title] as $linkform => $count ) {
 348                 if ( $count > $maxcount ) {
 349                         $maxcount = $count ;
 350                         $maxform = $linkform ;
 351                 }
 352         }
 353         if( $maxform != $t->title) {
 354                 doRenamePage( $t, $maxform );
 355         }
 356 }
 357
 358 function doRenamePage( $title, $maxform )
 359 {
 360         global $linkcache, $redirectcomment, $conversionscript, $conversiontime;
 361         $sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
 362         VALUES ";
 363         $redirsql = array();
 364         foreach( $linkcache[$title] as $linkform => $count ) {
 365                 if( $linkform != $maxform ) {
 366                         $comment = wfStrencode( str_replace( "$1", $maxform, $redirectcomment ) );
 367                         array_push( $redirsql, "($namespace,'$redirtitle','$comment',0,'$conversionscript','$conversiontime',1,1)" );
 368                 }
 369         }
 370         $sql .= implode( ",\n\t", $redirsql ) . ";\n";
 371         return $sql;
 372 }
 373
 374 # Account for syntax changes
 375 function rewritePage( $title, $text )
 376 {
 377         # ...
 378         $text = removeTalkLink( $text );
 379         $text = preg_replace( '/(^|<nowiki>).+?(<\/nowiki>|$)/esD',
 380                 'rewritePageBits( $title, "$1")',
 381                 $text );
 382         return $text;
 383 }
 384
 385 function rewritePageBits( $title, $text ) {
 386         $text = fixSubpages( $title, $text );
 387         $text = fixMedialinks( $text );
 388         $text = fixImagelinks( $text );
 389         return $text;
 390 }
 391
 392 function removeTalkLink( &$text ) {
 393         global $talkending;
 394         return preg_replace( "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi", '', $text );
 395 }
 396
 397 function fixSubpages( $text, &$title ) {
 398         $old = preg_quote( $text );
 399         $text = preg_replace( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
 400                 "$1[[$title/$2|/$2]]", $text );
 401         $text = preg_replace( "<\[\[/([^|]*?)\]\]>e",
 402                 "\"[[$title/\" . ucfirst( \"$1|/$1]]\" )", $text );
 403         $text = preg_replace( "<\[\[/(.*?)\]\]>e",
 404                 "\"[[$title/\" . ucfirst( \"$1]]\" )", $text );
 405         return $text;
 406 }
 407
 408 function fixImagelinks( &$text ) {
 409         global $imageimport, $namespaces;
 410         return preg_replace( "/$imageimport/e",
 411                 '"[[{$namespaces[6]}" . fetchMediaFile( "$1", "$2" ) . "]]"',
 412                 $text );
 413 }
 414
 415 function fixMedialinks( &$text ) {
 416         global $imageimport, $mediatext;
 417         $text = preg_replace( "/\[$imageimport\]/e",
 418                 '"[[$mediatext:" . fetchMediaFile( "$1", "$2" ) . "]]"',
 419                 $text );
 420         return preg_replace( "/\[$imageimport (.+?)\]/e",
 421                 '"[[$mediatext:" . fetchMediaFile( "$1", "$2" ) . "|$3]]"',
 422                 $text );
 423 }
 424
 425 function fetchMediaFile( $url, $filename )
 426 {
 427         # Copy an image file into local upload space
 428         # FIXME
 429         return ucfirst( $filename );
 430 }
 431
 432 # Simple move of talk pages, etc
 433 function transformTitle( $title, $dorename = false )
 434 {
 435         global $talkending;
 436         if( preg_match( "/^(.+)[ _]?\\/[ _]?($talkending)/i", $title, $m ) ) {
 437                 $thetitle = $m[1];
 438                 $namespace = 1;
 439         } else {
 440                 $thetitle = $title;
 441                 $namespace = 0;
 442         }
 443         return Title::fromData( $namespace, $thetitle );
 444 }
 445
 446 # Translated out of old usemod wiki...
 447 function FreeToNormal ( $id , $FreeUpper = true ) {
 448   $id = str_replace ( " ", "_", $id ) ;
 449   $id = ucfirst($id);
 450   if (strstr($id, '_') != false) {  # Quick check for any space/underscores
 451     $id = preg_replace ( '/__+/' , "_" , $id ) ;
 452     $id = preg_replace ( '/^_/' , "", $id ) ;
 453     $id = preg_replace ( '/_$/' , "", $id ) ;
 454     #if ($UseSubpage) {
 455       $id = preg_replace ( '|_/|', "/" , $id ) ;
 456       $id = preg_replace ( '|/_|', "/" , $id ) ;
 457     #}
 458   }
 459   if ($FreeUpper) {
 460     # Note that letters after ' are *not* capitalized
 461     if (preg_match ( '|[-_.,\(\)/][a-z]|' , $id ) ) { # Quick check for non-canon
 462       $id = preg_replace ( '|([-_.,\(\)/])([a-z])|e' , '"$1" . strtoupper("$2")' , $id ) ;
 463     }
 464   }
 465   return $id;
 466 }
 467
 468 # Whee!
 469 function recodeInput( $text )
 470 {
 471         return $text;
 472 }
 473
 474 function wfUnix2Timestamp( $unixtime ) {
 475         return gmdate( "YmdHis", $timestamp );
 476 }
 477
 478 function wfTimestamp2Unix( $ts )
 479 {
 480         return gmmktime( ( (int)substr( $ts, 8, 2) ),
 481                   (int)substr( $ts, 10, 2 ), (int)substr( $ts, 12, 2 ),
 482                   (int)substr( $ts, 4, 2 ), (int)substr( $ts, 6, 2 ),
 483                   (int)substr( $ts, 0, 4 ) );
 484 }
 485
 486 ?>