Some changes to the link tables. They now all use a key on cur_id for the *_from...
[mediawiki.git] / maintenance / archives / importUseModWiki.php
blob09eb025f5505aee057536fabf3b7764101ae6652
<?php

# This importer is obsolete: say so and stop before any work is done.
# Everything after the exit() is kept purely as reference material.
print "This script is obsolete!\n";
print "It is retained in the source here in case some of its
code might be useful for ad-hoc conversion tasks, but it is
not maintained and probably won't even work as is.\n";
exit();

/*
	Import data from a UseModWiki into a PediaWiki wiki
	2003-02-09 Brion VIBBER <brion@pobox.com>
	Based loosely on Magnus's code from 2001-2002

	Pass one: collect data on links & title case, users
	Pass two: spit out SQL for the users and pages
	Separately, be sure to run the link & index rebuilding scripts!
*/

/* globals */

# Root of the UseModWiki database directory to read from.
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
# Nested separators: the base separator byte followed by a level digit.
$FS = $wgFieldSeparator ;
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;

# Images to import
# (regex fragment without delimiters; group 1 is the full URL, group 2 the file name)
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';

# Number of *seconds to add* to timestamp to get UTC/GMT
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8*3600; # PST - California

# Other options...
$historyonly = false; # Don't add converted revisions to cur table; just get old histories
$lasthistoryonly = false; # Only add the _original_ form of the _current_ revision

/* Vary by language */
$namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
=> "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
$talkending = "Talk";
$mediatext = "Media";
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp

# Stats and caches
$oldtitles = array();
$usercache = array();
$titlecache = array();
$linkcache = array();
# Some oversimplified test types

# Minimal stand-in for a page title: a namespace number plus the title text.
class Title {
	var $title, $namespace;

	# Build a Title from its two parts.
	# Declared static because the rest of this script calls it as
	# Title::fromData(); a non-static method cannot be called that way
	# on current PHP versions.
	static function fromData( $namespace, $title ) {
		$x = new Title;
		$x->namespace = $namespace;
		$x->title = $title;
		return $x;
	}
}
# See tests in importTests.php
# $testingonly is not defined anywhere in this file (presumably the test
# harness sets it before including the script - confirm). empty() treats
# "not set" like false without raising an undefined-variable warning.
if( empty( $testingonly ) ) {
	firstPass();
	secondPass();
}
# ------------------------------------------------------------------------------

/* First pass:
	Information please!
*/
# Walks every per-letter bucket under "$wgRootDirectory/page" (A through Z,
# then "other") and hands each one to firstPassDirectory().
function firstPass()
{
	global $wgRootDirectory, $oldtitles;

	$buckets = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $buckets as $bucket ) {
		firstPassDirectory( $wgRootDirectory . "/page/" . $bucket );
	}
}
# Recursively scan $dir. Every "<name>.db" file is registered in $titlecache
# under <name> and its outgoing links are counted; sub-directories are
# descended into; anything else is reported on stdout and skipped.
function firstPassDirectory( $dir )
{
	global $titlecache;

	$mydir = is_dir( $dir ) ? opendir( $dir ) : false;
	if( !$mydir ) {
		echo "-- Can't read directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the loop early.
	while( false !== ( $entry = readdir( $mydir ) ) ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			# The page title is the file name without its ".db" suffix.
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
/* Second pass:
	make the dang SQL
*/
# Prints the SQL produced by importUser() for every key of $usercache, then
# by importPage() for every key of $titlecache, then a closing marker.
function secondPass()
{
	global $titlecache, $usercache, $redirects;

	foreach( array_keys( $usercache ) as $oldname ) {
		echo importUser( $oldname );
	}

	foreach( array_keys( $titlecache ) as $oldtitle ) {
		echo importPage( $oldtitle );
	}

	echo "\n-- Done!\n";
}
# ------------------------------------------------------------------------------

/* fetch_ functions
	Grab a given item from the database
*/

# Load the stored record for user $uid.
# Returns the record as a key => value array, or false when no file exists.
function fetchUser( $uid )
{
	global $FS1, $wgRootDirectory;

	# NOTE(review): this path was copied from fetchPage(); the real location
	# of user records inside a UseMod database is not visible here - confirm
	# before relying on it.
	$fname = $wgRootDirectory . "/pages/" . $uid;
	if( !file_exists( $fname ) ) return false;

	# splitHash() needs the separator as its first argument; top-level
	# records are assumed to be $FS1-delimited, as in fetchKeptPages().
	$data = splitHash( $FS1, implode( "", file( $fname ) ) );
	# enough?

	return $data;
}
# Load the current revision of page $title.
# Returns false when the page file does not exist, otherwise an array with
# the keys text, summary, minor, ts, username and host (null when the stored
# record lacks a field).
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	# NOTE(review): firstPass() scans "$wgRootDirectory/page/<letter>/*.db"
	# while this reads "$wgRootDirectory/pages/<title>" - the two layouts do
	# not agree; confirm which one matches the database being imported.
	$fname = $wgRootDirectory . "/pages/" . $title;
	if( !file_exists( $fname ) ) return false;

	# Three nested key/value levels, one separator per level.
	$page = splitHash( $FS1, implode( "", file( $fname ) ) );
	$section = splitHash( $FS2, isset( $page["text_default"] ) ? $page["text_default"] : "" );
	$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

	$out = array();
	foreach( array( "text", "summary", "minor" ) as $key ) {
		$out[$key] = isset( $text[$key] ) ? $text[$key] : null;
	}
	foreach( array( "ts", "username", "host" ) as $key ) {
		$out[$key] = isset( $section[$key] ) ? $section[$key] : null;
	}
	return $out;
}
# Load the kept (old) revisions of page $title from its ".kp" file.
# Returns a list of arrays shaped like fetchPage()'s result; an empty list
# when the file is missing. Revisions without text, without a minor flag or
# without a positive timestamp are reported on stdout and dropped.
function fetchKeptPages( $title )
{
	# $FS1 must be imported too: it is the separator between kept revisions.
	global $FS1, $FS2, $FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, implode( "", file( $fname ) ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

		$usable = !empty( $text["text"] )
			&& isset( $text["minor"] ) && $text["minor"] != ""
			&& isset( $section["ts"] ) && ( (int)$section["ts"] > 0 );

		if ( $usable ) {
			array_push( $revisions, array(
				"text" => $text["text"],
				"summary" => isset( $text["summary"] ) ? $text["summary"] : null,
				"minor" => $text["minor"],
				"ts" => $section["ts"],
				"username" => isset( $section["username"] ) ? $section["username"] : null,
				"host" => isset( $section["host"] ) ? $section["host"] : null ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
# Turn "key<sep>value<sep>key<sep>value..." into an associative array.
# Pieces are consumed in pairs; a trailing piece without a partner is dropped.
function splitHash ( $sep , $str ) {
	$pieces = explode( $sep, $str );
	$total = count( $pieces );
	$hash = array();
	for( $v = 1; $v < $total; $v += 2 ) {
		# $pieces[$v - 1] is the key, $pieces[$v] its value.
		$hash[$pieces[$v - 1]] = $pieces[$v];
	}
	return $hash;
}
/* import_ functions
	Take a fetched item and produce SQL
*/

/* importUser
	$uid is the UseMod user id number.
	The new ones will be assigned arbitrarily and are for internal use only.

	THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
*/
function importUser( $uid )
{
	global $last_uid, $user_list, $wgTimezoneCorrection;

	# User import is switched off (see the header comment), so an empty
	# string is always returned. Nothing below this line runs; it is kept
	# as a sketch of the intended conversion.
	return "";

	$stuff = fetchUser( $uid );
	$last_uid++;

	# fetchUser() returns an array, so fields are read with [] rather than ->.
	$name = wfStrencode( $stuff['username'] );
	$hash = md5hash( $stuff['password'] ); # Doable?
	# The correction is kept in seconds in $wgTimezoneCorrection.
	$tzoffset = $stuff['tzoffset'] - ($wgTimezoneCorrection / 3600); # -8 to 0; +9 to +1
	$hideminor = ($stuff['rcall'] ? 0 : 1);
	$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";

	$sql = "INSERT
	INTO user (user_id,user_name,user_password,user_options)
	VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}
# Work out the (user id, escaped user text) pair to store with a revision.
# With a user name: the id is looked up in $usercache (0 when unknown) and
# the text is the name. Without one: the id is 0 and the text is the host.
function checkUserCache( $name, $host )
{
	global $usercache;

	if( !$name ) {
		return array( 0, wfStrencode( $host ) );
	}

	# $usercache is keyed by user name, so test the key; in_array() would
	# search the stored values instead.
	# If we haven't imported user accounts the lookup misses and the id is 0.
	$userid = isset( $usercache[$name] ) ? $usercache[$name] : 0;
	return array( $userid, wfStrencode( $name ) );
}
# Produce the SQL for one page: optionally a "cur" row holding the rewritten
# text attributed to the conversion script, plus one "old" row per kept
# revision and one for the original form of the current revision.
# Returns the SQL text; a lone SQL comment when the page cannot be read.
function importPage( $title )
{
	global $wgTimezoneCorrection, $titlecache, $usercache;
	global $conversionscript, $conversioncomment, $conversiontime;
	global $historyonly, $lasthistoryonly;

	$page = fetchPage( $title );
	if( !is_array( $page ) ) {
		return "\n-- Could not read page '$title'. Skipping.\n";
	}

	$newtext = wfStrencode( rewritePage( $title, $page["text"] ) );
	$t = renamePage( $title );
	if( !is_object( $t ) ) {
		# Fall back to the plain transformation if no Title came back.
		$t = transformTitle( $title );
	}
	$newtitle = wfStrencode( $t->title );
	$namespace = $t->namespace;

	# Current revision:
	# fetchPage() returns an array whose timestamp field is called "ts".
	$text = wfStrencode( $page["text"] );
	$comment = wfStrencode( $page["summary"] );
	$minor = ($page["minor"] ? 1 : 0);
	list( $userid, $username ) = checkUserCache( $page["username"], $page["host"] );
	$timestamp = wfUnix2Timestamp( (int)$page["ts"] + $wgTimezoneCorrection );
	$redirect = ( preg_match( '/^#REDIRECT/', $page["text"] ) ? 1 : 0 );

	$sql = "\n";
	if( !$historyonly ) {
		$sql .= "INSERT
	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
	}

	# Row for the current revision; emitted last, after the history rows.
	$lastrow = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";

	# History
	$rows = array();
	if( !$lasthistoryonly ) {
		foreach( fetchKeptPages( $title ) as $rev ) {
			$text = wfStrencode( $rev["text"] );
			$comment = wfStrencode( $rev["summary"] );
			$minor = ($rev["minor"] ? 1 : 0);
			list( $userid, $username ) = checkUserCache( $rev["username"], $rev["host"] );
			$timestamp = wfUnix2Timestamp( (int)$rev["ts"] + $wgTimezoneCorrection );
			# Eight values, matching the eight columns listed below
			# (the old table has no is_redirect column).
			$rows[] = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";
		}
	}
	$rows[] = $lastrow;

	$sql .= "INSERT
	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
	VALUES\n" . implode( ",\n", $rows ) . ";\n";

	return $sql;
}
# Count up basic links
# Reads page $title, ignores anything inside <nowiki>...</nowiki>, and calls
# countLinkTo() once for every [[target]] / [[target|label]] link found,
# passing the target with its first letter upper-cased.
function countLinksFrom( $title )
{
	$page = fetchPage( $title );
	if( !is_array( $page ) || !isset( $page["text"] ) ) {
		return;
	}

	$text = preg_replace(
		'/<nowiki>.*<\/nowiki>/sDU',
		'',
		$page["text"] );

	# preg_match_all() visits the same non-overlapping matches the former
	# /e replacement did, without needing evaluated replacement code.
	$found = array();
	preg_match_all(
		'/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/',
		$text,
		$found );
	foreach( $found[1] as $target ) {
		countLinkTo( ucfirst( $target ) );
	}
}
# Record one link to $title: increments
# $linkcache[$title][<normalised form of the title>], creating entries as
# needed.
function countLinkTo( $title )
{
	global $linkcache;
	$t = transformTitle( $title );
	$linkform = FreeToNormal( $t->title );

	# Create the per-title bucket first; calling count() on a missing
	# entry is an error on current PHP.
	if( !isset( $linkcache[$title] ) || !is_array( $linkcache[$title] ) ) {
		$linkcache[$title] = array();
	}

	if( !empty( $linkcache[$title][$linkform] ) ) {
		$linkcache[$title][$linkform]++;
	} else {
		$linkcache[$title][$linkform] = 1;
	}
}
# Preferentially change case
# Looks up the link forms counted for $title and picks the most frequent
# one. Returns the Title produced by transformTitle(); callers read its
# ->title and ->namespace.
function renamePage( $title )
{
	global $linkcache;
	$t = transformTitle( $title );

	# Pages nobody links to have no cache entry.
	$forms = array();
	if( isset( $linkcache[$title] ) && is_array( $linkcache[$title] ) ) {
		$forms = $linkcache[$title];
	}

	# We want to use the most frequently linked-to form as the title
	$maxcount = 0 ; $maxform = $t->title ;
	foreach ( $forms as $linkform => $count ) {
		if ( $count > $maxcount ) {
			$maxcount = $count ;
			$maxform = $linkform ;
		}
	}

	if( $maxform != $t->title) {
		# NOTE(review): the redirect SQL returned here is discarded and
		# $maxform is never applied to $t, exactly as before; decide
		# whether the result should be emitted and the title switched.
		doRenamePage( $t, $maxform );
	}

	return $t;
}
# Build the SQL that inserts a redirect row for every counted link form of
# $title other than $maxform.
# $title may be a Title-like object (->title, ->namespace) or a plain title
# string. Returns "" when there is nothing to redirect.
function doRenamePage( $title, $maxform )
{
	global $linkcache, $redirectcomment, $conversionscript, $conversiontime;

	# An object cannot be used as an array key, so pull out its parts.
	if( is_object( $title ) ) {
		$key = $title->title;
		$namespace = $title->namespace;
	} else {
		$key = $title;
		$namespace = 0;
	}
	if( !isset( $linkcache[$key] ) || !is_array( $linkcache[$key] ) ) {
		return "";
	}

	$comment = wfStrencode( str_replace( '$1', $maxform, $redirectcomment ) );
	# NOTE(review): the column list includes cur_text but no text value was
	# ever supplied; a "#REDIRECT [[target]]" body is assumed here - confirm.
	$redirtext = wfStrencode( "#REDIRECT [[" . $maxform . "]]" );

	$redirsql = array();
	foreach( $linkcache[$key] as $linkform => $count ) {
		if( $linkform != $maxform ) {
			$redirtitle = wfStrencode( $linkform );
			array_push( $redirsql, "($namespace,'$redirtitle','$redirtext','$comment',0,'$conversionscript','$conversiontime',1,1)" );
		}
	}
	if( count( $redirsql ) == 0 ) {
		# "INSERT ... VALUES ;" would be invalid SQL.
		return "";
	}

	$sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES ";
	$sql .= implode( ",\n\t", $redirsql ) . ";\n";
	return $sql;
}
# Account for syntax changes
# Strips the talk link, then runs rewritePageBits() over every stretch of
# text that lies outside <nowiki>...</nowiki>; nowiki sections are passed
# through unchanged. Returns the rewritten text.
function rewritePage( $title, $text )
{
	# ...
	$text = removeTalkLink( $text );

	# With PREG_SPLIT_DELIM_CAPTURE the even-numbered pieces are ordinary
	# text and the odd-numbered ones are the captured nowiki sections.
	$pieces = preg_split( '/(<nowiki>.*?<\/nowiki>)/s', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
	foreach( $pieces as $i => $piece ) {
		if( $i % 2 == 0 && $piece != '' ) {
			$pieces[$i] = rewritePageBits( $title, $piece );
		}
	}
	return implode( '', $pieces );
}
# Apply the individual syntax fixes (subpage links, media links, image links)
# to one stretch of page text and return the result.
function rewritePageBits( $title, $text ) {
	# fixSubpages() is declared as ( $text, &$title ): text first.
	$text = fixSubpages( $text, $title );
	$text = fixMedialinks( $text );
	$text = fixImagelinks( $text );
	return $text;
}
# Returns $text with every "/<talkending>" link removed (bare or wrapped in
# [[ ]], matched case-insensitively), along with the newlines directly
# before it and the whitespace directly after it.
function removeTalkLink( &$text ) {
	global $talkending;
	$talkpattern = "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi";
	$stripped = preg_replace( $talkpattern, '', $text );
	return $stripped;
}
# Rewrite UseMod-style relative subpage links in $text into links qualified
# with $title:
#   " /Sub"        -> " [[Title/Sub|/Sub]]"
#   "[[/sub]]"     -> "[[Title/Sub|/sub]]"
#   "[[/sub|lbl]]" -> "[[Title/Sub|lbl]]"
# Returns the rewritten text.
function fixSubpages( $text, &$title ) {
	$page = $title;

	# Bare "/Sub" at the start of the text or after whitespace.
	$text = preg_replace_callback( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
		function( $m ) use ( $page ) {
			return $m[1] . "[[" . $page . "/" . $m[2] . "|/" . $m[2] . "]]";
		},
		$text );

	# Bracketed link without a label: keep the original text as the label.
	$text = preg_replace_callback( "<\[\[/([^|]*?)\]\]>",
		function( $m ) use ( $page ) {
			return "[[" . $page . "/" . ucfirst( $m[1] . "|/" . $m[1] . "]]" );
		},
		$text );

	# Bracketed link that already carries a label.
	$text = preg_replace_callback( "<\[\[/(.*?)\]\]>",
		function( $m ) use ( $page ) {
			return "[[" . $page . "/" . ucfirst( $m[1] . "]]" );
		},
		$text );

	return $text;
}
# Replace every URL matched by $imageimport with an image link built from
# the $namespaces[6] prefix and the name returned by fetchMediaFile().
# Returns the rewritten text.
function fixImagelinks( &$text ) {
	global $imageimport, $namespaces;
	$prefix = $namespaces[6];
	# In $imageimport, group 1 is the whole URL and group 2 the file name.
	return preg_replace_callback( "/$imageimport/",
		function( $m ) use ( $prefix ) {
			return "[[" . $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
}
# Turn bracketed external links to uploaded files into media links:
#   "[<url>]"       -> "[[<mediatext>:<file>]]"
#   "[<url> label]" -> "[[<mediatext>:<file>|label]]"
# $text is taken by reference and, as before, receives the result of the
# first step; the fully rewritten text is the return value.
function fixMedialinks( &$text ) {
	global $imageimport, $mediatext;
	$prefix = $mediatext;

	# In $imageimport, group 1 is the whole URL and group 2 the file name.
	$text = preg_replace_callback( "/\[$imageimport\]/",
		function( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );

	# Group 3 is the label that follows the URL.
	return preg_replace_callback( "/\[$imageimport (.+?)\]/",
		function( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "|" . $m[3] . "]]";
		},
		$text );
}
# Placeholder for copying an uploaded file into local upload space.
# Nothing is copied and $url is ignored: the local name is just $filename
# with its first letter upper-cased.
function fetchMediaFile( $url, $filename )
{
	# Copy an image file into local upload space
	# FIXME
	$localname = ucfirst( $filename );
	return $localname;
}
# Simple move of talk pages, etc
# A title of the form "<page>/<talkending>" becomes namespace 1 with title
# <page>; any other title is namespace 0, unchanged. $dorename is not used.
function transformTitle( $title, $dorename = false )
{
	global $talkending;

	$talkpattern = "/^(.+)[ _]?\\/[ _]?($talkending)/i";
	if( !preg_match( $talkpattern, $title, $m ) ) {
		return Title::fromData( 0, $title );
	}
	return Title::fromData( 1, $m[1] );
}
# Translated out of old usemod wiki...
# Normalise a free-form link into its canonical page id: spaces become
# underscores, the first character is upper-cased, runs of underscores are
# collapsed, leading/trailing underscores and underscores next to "/" are
# removed. With $FreeUpper set, a lower-case letter directly after one of
# - _ . , ( ) / is upper-cased as well.
function FreeToNormal ( $id , $FreeUpper = true ) {
	$id = str_replace ( " ", "_", $id ) ;
	$id = ucfirst($id);
	if (strpos($id, '_') !== false) { # Quick check for any space/underscores
		$id = preg_replace ( '/__+/' , "_" , $id ) ;
		$id = preg_replace ( '/^_/' , "", $id ) ;
		$id = preg_replace ( '/_$/' , "", $id ) ;
		#if ($UseSubpage) {
		$id = preg_replace ( '|_/|', "/" , $id ) ;
		$id = preg_replace ( '|/_|', "/" , $id ) ;
		#}
	}
	if ($FreeUpper) {
		# Note that letters after ' are *not* capitalized
		if (preg_match ( '|[-_.,\(\)/][a-z]|' , $id ) ) { # Quick check for non-canon
			# A callback replaces the former /e (evaluated) replacement.
			$id = preg_replace_callback ( '|([-_.,\(\)/])([a-z])|' ,
				function( $m ) { return $m[1] . strtoupper( $m[2] ); } ,
				$id ) ;
		}
	}
	return $id;
}
# Whee!
# Character-set conversion hook; currently hands the text back untouched.
function recodeInput( $text )
{
	$recoded = $text;
	return $recoded;
}
# Convert a Unix timestamp (seconds since the epoch) into a 14-digit
# "YYYYMMDDHHMMSS" string in GMT.
function wfUnix2Timestamp( $unixtime ) {
	# Format the argument itself; the undefined $timestamp used here before
	# made gmdate() fall back to something other than the requested time.
	return gmdate( "YmdHis", (int)$unixtime );
}
# Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp string into a Unix
# timestamp.
function wfTimestamp2Unix( $ts )
{
	# Fixed-width fields, cut out by position.
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}