3 print "This script is obsolete!";
4 print "It is retained in the source here in case some of its
5 code might be useful for ad-hoc conversion tasks, but it is
6 not maintained and probably won't even work as is.";
10 Import data from a UseModWiki into a PediaWiki wiki
11 2003-02-09 Brion VIBBER <brion@pobox.com>
12 Based loosely on Magnus's code from 2001-2002
14 Pass one: collect data on links & title case, users
15 Pass two: spit out SQL for the users, current pages and old revisions
16 Separately, be sure to run the link & index rebuilding scripts!
22 $wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
23 $wgFieldSeparator = "\xb3"; # Some wikis may use different char
24 $FS = $wgFieldSeparator ;
30 $imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';
32 # Number of *seconds to add* to timestamp to get UTC/GMT
33 #$wgTimezoneCorrection = 0; # GMT
34 $wgTimezoneCorrection = 8*3600; # PST - California
37 $historyonly = false; # Don't add converted revisions to cur table; just get old histories
38 $lasthistoryonly = false; # Only add the _original_ form of the _current_ revision
40 /* Vary by language */
41 $namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
42 => "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
# Attribution stamped on the rows this script generates.
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# "$1" is a placeholder; doRenamePage() substitutes the redirect target for it.
# (Fixed typo: the comment previously read "converion".)
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp
53 $titlecache = array();
56 # Some oversimplified test types
58 var $title, $namespace;
59 function fromData( $namespace, $title ) {
61 $x->namespace = $namespace;
67 # See tests in importTests.php
68 if( ! $testingonly ) {
73 # ------------------------------------------------------------------------------
80 global $wgRootDirectory, $oldtitles;
83 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
84 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
85 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
86 foreach( $letters as $letter ) {
87 firstPassDirectory( "$wgRootDirectory/page/$letter" );
# First-pass directory walker.
#
# Recursively scans $dir. For every "<title>.db" file it records the
# transformed title in the global $titlecache (keyed by the old title, i.e.
# the file name without ".db") and counts the links leaving that page via
# countLinksFrom(). Any other file is reported in an SQL comment and skipped.
#
# $dir: path of the directory to scan.
# Returns nothing; output goes to stdout and to $titlecache.
function firstPassDirectory( $dir )
{
	global $titlecache;

	$mydir = opendir( $dir );
	if( $mydir === false ) {
		# Not every letter directory necessarily exists; don't loop on a bad handle.
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and would
	# otherwise end the scan early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			# The pattern used to start with '$' (end-of-subject) and so never
			# matched; '^' anchors it to the start of the file name.
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
113 function secondPass()
115 global $titlecache, $usercache, $redirects;
117 foreach( $usercache as $oldname => $user ) {
118 echo importUser( $oldname );
120 foreach( $titlecache as $oldtitle => $newtitle ) {
121 echo importPage( $oldtitle );
128 # ------------------------------------------------------------------------------
131 Grab a given item from the database
133 function fetchUser( $uid )
135 global $FS,$FS2,$FS3, $wgRootDirectory;
137 $fname = $wgRootDirectory . "/pages/" . $title;
138 if( !file_exists( $fname ) ) return false;
140 $data = splitHash( implode( "", file( $fname ) ) );
# Load the current revision of a page from the UseMod flat-file database.
#
# The file is split on $FS1 into the page hash, its "text_default" entry is
# split on $FS2 into the section hash, and the section's "data" entry is split
# on $FS3 into the text hash.
#
# $title: page file name relative to "$wgRootDirectory/pages/".
# Returns false when the file does not exist, otherwise an array with the keys
# "text", "summary", "minor", "ts", "username" and "host".
#
# NOTE(review): firstPass() scans "$wgRootDirectory/page/<letter>" for "*.db"
# files, while this reads "$wgRootDirectory/pages/<title>" -- confirm the path.
# NOTE(review): importPage() and countLinksFrom() access the result with object
# syntax ($page->text, $page->timestamp); that does not match this array.
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/pages/" . $title;
	if( !file_exists( $fname ) ) return false;

	# splitHash() takes ( separator, string ); the separator was missing here.
	# $FS1 is presumably defined next to $FS2/$FS3 -- TODO confirm.
	$page = splitHash( $FS1, implode( "", file( $fname ) ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	return array ( "text" => $text["text"] , "summary" => $text["summary"] ,
		"minor" => $text["minor"] , "ts" => $section["ts"] ,
		"username" => $section["username"] , "host" => $section["host"] ) ;
}
# Load the kept (old) revisions of a page from "$wgRootDirectory/keep/<title>.kp".
#
# The file is split on $FS1 into one chunk per revision; the first chunk is
# discarded. Each chunk is decoded with splitHash() on $FS2 (section) and $FS3
# (text). A revision is kept only when it has text, a non-empty "minor" flag
# and a positive numeric "ts"; anything else is reported in an SQL comment.
#
# $title: page name used to build the ".kp" file name.
# Returns an array of revisions, each an array with the keys "text", "summary",
# "minor", "ts", "username" and "host"; empty when the file does not exist.
function fetchKeptPages( $title )
{
	# $FS1 was used below without being imported, leaving explode() with an
	# empty delimiter. It is presumably defined next to $FS2/$FS3 -- TODO confirm.
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, implode( "", file( $fname ) ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, $section["data"] );
		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
			array_push( $revisions, array ( "text" => $text["text"] , "summary" => $text["summary"] ,
				"minor" => $text["minor"] , "ts" => $section["ts"] ,
				"username" => $section["username"] , "host" => $section["host"] ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
# Decode a flat "key SEP value SEP key SEP value ..." string into an array.
#
# $sep: separator string the fields are joined with.
# $str: the encoded string.
# Returns an associative array mapping each even-positioned field to the field
# that follows it. A trailing key without a value is dropped, and an empty
# input yields an empty array.
function splitHash ( $sep , $str ) {
	$temp = explode ( $sep , $str ) ;
	$ret = array () ; # always return an array, even when no pair is found
	$count = count ( $temp ) ;
	# Walk the fields two at a time: key at $i, value at $i + 1.
	for ( $i = 0; $i + 1 < $count ; $i += 2 ) {
		$ret[$temp[$i]] = $temp[$i + 1] ;
	}
	return $ret ;
}
198 Take a fetched item and produce SQL
202 $uid is the UseMod user id number.
203 The new ones will be assigned arbitrarily and are for internal use only.
205 THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
207 function importUser( $uid )
209 global $last_uid, $user_list, $wgTimestampCorrection;
213 $stuff = fetchUser( $uid );
216 $name = wfStrencode( $stuff->username
);
217 $hash = md5hash( $stuff->password
); # Doable?
218 $tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
219 $hideminor = ($stuff['rcall'] ?
0 : 1);
220 $options = "cols={$stuff['editcols']}
221 rows={$stuff['editrows']}
222 rcdays={$stuff['rcdays']}
223 timecorrection={$tzoffset}
224 hideminor={$hideminor}
228 INTO user (user_id,user_name,user_password,user_options)
229 VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
233 function checkUserCache( $name, $host )
238 if( in_array( $name, $usercache ) ) {
239 $userid = $usercache[$name];
241 # If we haven't imported user accounts
244 $username = wfStrencode( $name );
247 $username = wfStrencode( $host );
249 return array( $userid, $username );
252 function importPage( $title )
254 global $wgTimezoneCorrection, $titlecache, $usercache;
255 global $conversionscript, $conversioncomment, $conversiontime;
256 global $historyonly, $lasthistoryonly;
258 $page = fetchPage( $title );
260 $newtext = wfStrencode( rewritePage( $title, $page->text
) );
261 $t = renamePage( $title );
262 $newtitle = wfStrencode( $t->title
);
263 $namespace = $t->namespace;
266 $text = wfStrencode( $page->text
);
267 $minor = ($page->minor ?
1 : 0);
268 list( $userid, $username ) = checkUserCache( $page->username
, $page->host
);
269 $timestamp = wfUnix2Timestamp( $page->timestamp +
$wgTimezoneCorrection );
270 $redirect = ( preg_match( '/^#REDIRECT/', $page->text
) ?
1 : 0 );
272 if( !$historyonly ) {
274 INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
275 VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
278 INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
280 $sqlfinal = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)\n";
283 if( !$lasthistoryonly ) {
284 $revisions = fetchKeptPages( $title );
285 foreach( $revisions as $rev ) {
286 $text = wfStrencode( $rev->text
);
287 $minor = ($rev->minor ?
1 : 0);
288 list( $userid, $username ) = checkUserCache( $rev->username
, $rev->host
);
289 $timestamp = wfUnix2Timestamp( $rev->timestamp +
$wgTimezoneCorrection );
290 $sql .= "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$redirect,$minor),\n";
293 return $sql . $sqlfinal;
297 # Count up basic links
298 function countLinksFrom( $title )
300 $page = fetchPage( $title );
301 $page->text
= preg_replace(
302 '/<nowiki>.*<\/nowiki>/sDU',
305 $page->text
= preg_replace(
306 '/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/e',
307 'countLinkTo( ucfirst( "$1" ) )',
311 function countLinkTo( $title )
314 $t = transformTitle( $title );
315 $linkform = FreeToNormal( $t->title
);
316 $x = $linkcache[$title];
317 if ( count ( $x ) ) {
319 if ( $y ) $y++
; else $y = 1 ;
322 $x = array ( $linkform => 1 ) ;
324 $linkcache[$title] = $x;
327 # Preferentially change case
328 function renamePage( $title )
331 $t = transformTitle( $title );
333 # We want to use the most frequently linked-to form as the title
334 $maxcount = 0 ; $maxform = $t->title
;
335 foreach ( $linkcache[$title] as $linkform => $count ) {
336 if ( $count > $maxcount ) {
338 $maxform = $linkform ;
341 if( $maxform != $t->title
) {
342 doRenamePage( $t, $maxform );
346 function doRenamePage( $title, $maxform )
348 global $linkcache, $redirectcomment, $conversionscript, $conversiontime;
349 $sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
352 foreach( $linkcache[$title] as $linkform => $count ) {
353 if( $linkform != $maxform ) {
354 $comment = wfStrencode( str_replace( "$1", $maxform, $redirectcomment ) );
355 array_push( $redirsql, "($namespace,'$redirtitle','$comment',0,'$conversionscript','$conversiontime',1,1)" );
358 $sql .= implode( ",\n\t", $redirsql ) . ";\n";
362 # Account for syntax changes
363 function rewritePage( $title, $text )
366 $text = removeTalkLink( $text );
367 $text = preg_replace( '/(^|<nowiki>).+?(<\/nowiki>|$)/esD',
368 'rewritePageBits( $title, "$1")',
373 function rewritePageBits( $title, $text ) {
374 $text = fixSubpages( $title, $text );
375 $text = fixMedialinks( $text );
376 $text = fixImagelinks( $text );
380 function removeTalkLink( &$text ) {
382 return preg_replace( "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi", '', $text );
385 function fixSubpages( $text, &$title ) {
386 $old = preg_quote( $text );
387 $text = preg_replace( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
388 "$1[[$title/$2|/$2]]", $text );
389 $text = preg_replace( "<\[\[/([^|]*?)\]\]>e",
390 "\"[[$title/\" . ucfirst( \"$1|/$1]]\" )", $text );
391 $text = preg_replace( "<\[\[/(.*?)\]\]>e",
392 "\"[[$title/\" . ucfirst( \"$1]]\" )", $text );
# Replace bare uploaded-image URLs (as matched by $imageimport) with image links.
#
# $text: page text, taken by reference but not modified here.
# Returns the text with every matching URL rewritten to
# "[[<$namespaces[6]><file name>]]", where the file name comes from
# fetchMediaFile().
#
# Uses preg_replace_callback instead of the /e modifier: /e evaluated the
# matched wiki text as PHP code and is no longer supported by PHP 7+.
function fixImagelinks( &$text ) {
	global $imageimport, $namespaces;
	return preg_replace_callback( "/$imageimport/",
		'_fixImagelinksCallback',
		$text );
}

# Callback for fixImagelinks(): $m[1] is the whole URL, $m[2] the file name.
function _fixImagelinksCallback( $m ) {
	global $namespaces;
	return "[[{$namespaces[6]}" . fetchMediaFile( $m[1], $m[2] ) . "]]";
}
# Replace bracketed uploaded-file URLs with media links.
#
# Two forms are handled, in this order:
#   [URL]        becomes [[<$mediatext>:<file name>]]
#   [URL label]  becomes [[<$mediatext>:<file name>|label]]
# where URL is anything matched by $imageimport and the file name comes from
# fetchMediaFile().
#
# $text: page text, taken by reference; it is updated in place by the first
#        replacement, and the fully rewritten text is returned.
#
# Uses preg_replace_callback instead of the /e modifier: /e evaluated the
# matched wiki text as PHP code and is no longer supported by PHP 7+.
# NOTE(review): $mediatext is not defined in the visible part of this file.
function fixMedialinks( &$text ) {
	global $imageimport, $mediatext;
	$text = preg_replace_callback( "/\[$imageimport\]/",
		'_fixMedialinksCallback',
		$text );
	return preg_replace_callback( "/\[$imageimport (.+?)\]/",
		'_fixMedialinksCallback',
		$text );
}

# Callback for fixMedialinks(): $m[1] is the whole URL, $m[2] the file name
# and $m[3], when present, the link label.
function _fixMedialinksCallback( $m ) {
	global $mediatext;
	$link = "[[$mediatext:" . fetchMediaFile( $m[1], $m[2] );
	if( isset( $m[3] ) ) {
		$link .= "|" . $m[3];
	}
	return $link . "]]";
}
# Map an uploaded file to the name it will carry in the new wiki.
#
# $url:      original location of the file (unused by the visible code).
# $filename: file name extracted from the URL.
# Returns $filename with its first character upper-cased.
function fetchMediaFile( $url, $filename )
{
	# Copy an image file into local upload space
	$localname = ucfirst( $filename );
	return $localname;
}
420 # Simple move of talk pages, etc
421 function transformTitle( $title, $dorename = false )
424 if( preg_match( "/^(.+)[ _]?\\/[ _]?($talkending)/i", $title, $m ) ) {
431 return Title
::fromData( $namespace, $thetitle );
434 # Translated out of old usemod wiki...
435 function FreeToNormal ( $id , $FreeUpper = true ) {
436 $id = str_replace ( " ", "_", $id ) ;
438 if (strstr($id, '_') != false) { # Quick check for any space/underscores
439 $id = preg_replace ( '/__+/' , "_" , $id ) ;
440 $id = preg_replace ( '/^_/' , "", $id ) ;
441 $id = preg_replace ( '/_$/' , "", $id ) ;
443 $id = preg_replace ( '|_/|', "/" , $id ) ;
444 $id = preg_replace ( '|/_|', "/" , $id ) ;
448 # Note that letters after ' are *not* capitalized
449 if (preg_match ( '|[-_.,\(\)/][a-z]|' , $id ) ) { # Quick check for non-canon
450 $id = preg_replace ( '|([-_.,\(\)/])([a-z])|e' , '"$1" . strtoupper("$2")' , $id ) ;
457 function recodeInput( $text )
# Convert a Unix timestamp to a 14-digit "YYYYMMDDHHMMSS" string in GMT.
#
# $unixtime: seconds since the Unix epoch.
# Returns the formatted timestamp string.
function wfUnix2Timestamp( $unixtime ) {
	# Previously formatted the undefined variable $timestamp, so the argument
	# was ignored.
	return gmdate( "YmdHis", $unixtime );
}
# Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp string to a Unix timestamp.
#
# $ts: timestamp string; fields are read by fixed position.
# Returns the number of seconds since the Unix epoch.
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	# gmmktime() takes the time-of-day fields first, then month, day, year.
	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}