4 Import data from a UseModWiki into a PediaWiki wiki
5 2003-02-09 Brion VIBBER <brion@pobox.com>
6 Based loosely on Magnus's code from 2001-2002
8 Pass one: collect data on links & title case, users
9 Pass two: spit out SQL for the users and pages collected in pass one
10 Separately, be sure to run the link & index rebuilding scripts!
# Root of the UseModWiki database directory that is being imported.
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";

# Field separator byte used inside the UseModWiki data files.
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
$FS = $wgFieldSeparator;

# Regex fragment matching URLs of images uploaded to the old site.
# Capture 1 is the whole URL, capture 2 is the file name part.
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';

# Number of *seconds to add* to timestamp to get UTC/GMT
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8 * 3600; # PST - California

$historyonly = false; # Don't add converted revisions to cur table; just get old histories
$lasthistoryonly = false; # Only add the _original_ form of the _current_ revision

/* Vary by language */
$namespaces = array(
	0 => "",
	1 => "Talk:",
	2 => "User:",
	3 => "User_talk:",
	4 => "Wikipedia:",
	5 => "Wikipedia_talk:",
	6 => "Image:",
	7 => "Image_talk:"
);

# Attribution and comments attached to rows generated by the conversion.
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# "$1" is substituted with the target title when redirects are generated.
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp

# Maps each old page title to its transformed title; filled in pass one.
$titlecache = array();
50 # Some oversimplified test types
52 var $title, $namespace;
53 function fromData( $namespace, $title ) {
55 $x->namespace = $namespace;
61 # See tests in importTests.php
62 if( ! $testingonly ) {
67 # ------------------------------------------------------------------------------
74 global $wgRootDirectory, $oldtitles;
77 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
78 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
79 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
80 foreach( $letters as $letter ) {
81 firstPassDirectory( "$wgRootDirectory/page/$letter" );
# Recursively walk one UseModWiki page directory for the first pass.
#
# Every "<title>.db" file found is recorded in the global $titlecache
# (old title => transformed title) and its outgoing links are counted.
# Anything else that is not a directory is reported as an SQL comment
# on the output and skipped.
#
# $dir: path of the directory to scan.
function firstPassDirectory( $dir )
{
	global $titlecache;

	$mydir = opendir( $dir );
	if( !$mydir ) {
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the loop early.
	while( false !== ( $entry = readdir( $mydir ) ) ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			# The file name without its ".db" suffix is the old page title.
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
107 function secondPass()
109 global $titlecache, $usercache, $redirects;
111 foreach( $usercache as $oldname => $user ) {
112 echo importUser( $oldname );
114 foreach( $titlecache as $oldtitle => $newtitle ) {
115 echo importPage( $oldtitle );
122 # ------------------------------------------------------------------------------
125 Grab a given item from the database
127 function fetchUser( $uid )
129 global $FS,$FS2,$FS3, $wgRootDirectory;
131 $fname = $wgRootDirectory . "/pages/" . $title;
132 if( !file_exists( $fname ) ) return false;
134 $data = splitHash( implode( "", file( $fname ) ) );
# Load the current revision of a page from the UseModWiki database.
#
# $title: old (UseModWiki) page title, appended to the page directory path.
#
# Returns false when the page file does not exist; otherwise an array
# with the keys "text", "summary", "minor", "ts", "username" and "host".
function fetchPage( $title )
{
	# assumes $FS1 is defined at file level alongside $FS2/$FS3 -- TODO confirm
	global $FS, $FS1, $FS2, $FS3, $wgRootDirectory;

	# NOTE(review): the first pass scans "$wgRootDirectory/page/<letter>"
	# while this reads from "/pages/" -- confirm which path is intended.
	$fname = $wgRootDirectory . "/pages/" . $title;
	if( !file_exists( $fname ) ) return false;

	# splitHash() needs the separator as its first argument; the page file
	# is split on $FS1, its sections on $FS2 and the text record on $FS3.
	$page = splitHash( $FS1, implode( "", file( $fname ) ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	return array ( "text" => $text["text"] , "summary" => $text["summary"] ,
		"minor" => $text["minor"] , "ts" => $section["ts"] ,
		"username" => $section["username"] , "host" => $section["host"] ) ;
}
# Load the kept (old) revisions of a page from its ".kp" file.
#
# $title: old (UseModWiki) page title.
#
# Returns an array of revisions, each an array with the keys "text",
# "summary", "minor", "ts", "username" and "host". An empty array is
# returned when no keep file exists. Revisions with empty text, an empty
# minor flag or a non-positive timestamp are reported as an SQL comment
# on the output and skipped.
function fetchKeptPages( $title )
{
	# $FS1 must be imported as well: it is the delimiter handed to
	# explode() below. assumes it is defined at file level next to
	# $FS2/$FS3 -- TODO confirm
	global $FS, $FS1, $FS2, $FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, implode( "", file( $fname ) ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, $section["data"] );
		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
			array_push( $revisions, array ( "text" => $text["text"] , "summary" => $text["summary"] ,
				"minor" => $text["minor"] , "ts" => $section["ts"] ,
				"username" => $section["username"] , "host" => $section["host"] ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
# Turn a "key SEP value SEP key SEP value ..." string into an
# associative array.
#
# $sep: separator string the record is split on.
# $str: the raw record.
#
# Returns an array mapping each key to the piece that follows it; a
# trailing key with no value after it is dropped.
function splitHash ( $sep , $str ) {
	$pieces = explode ( $sep , $str ) ;
	$total = count ( $pieces ) ;
	$ret = array () ;
	# Consume the pieces two at a time: even index = key, odd index = value.
	for ( $k = 0 ; $k + 1 < $total ; $k += 2 ) {
		$ret[$pieces[$k]] = $pieces[$k + 1] ;
	}
	return $ret ;
}
192 Take a fetched item and produce SQL
196 $uid is the UseMod user id number.
197 The new ones will be assigned arbitrarily and are for internal use only.
199 THIS IS DELAYED SINCE PUBLIC DUMPS DON'T INCLUDE THE USER DIR
201 function importUser( $uid )
203 global $last_uid, $user_list, $wgTimestampCorrection;
207 $stuff = fetchUser( $uid );
210 $name = wfStrencode( $stuff->username
);
211 $hash = md5hash( $stuff->password
); # Doable?
212 $tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
213 $hideminor = ($stuff['rcall'] ?
0 : 1);
214 $options = "cols={$stuff['editcols']}
215 rows={$stuff['editrows']}
216 rcdays={$stuff['rcdays']}
217 timecorrection={$tzoffset}
218 hideminor={$hideminor}
222 INTO user (user_id,user_name,user_password,user_options)
223 VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
227 function checkUserCache( $name, $host )
232 if( in_array( $name, $usercache ) ) {
233 $userid = $usercache[$name];
235 # If we haven't imported user accounts
238 $username = wfStrencode( $name );
241 $username = wfStrencode( $host );
243 return array( $userid, $username );
246 function importPage( $title )
248 global $wgTimezoneCorrection, $titlecache, $usercache;
249 global $conversionscript, $conversioncomment, $conversiontime;
250 global $historyonly, $lasthistoryonly;
252 $page = fetchPage( $title );
254 $newtext = wfStrencode( rewritePage( $title, $page->text
) );
255 $t = renamePage( $title );
256 $newtitle = wfStrencode( $t->title
);
257 $namespace = $t->namespace;
260 $text = wfStrencode( $page->text
);
261 $minor = ($page->minor ?
1 : 0);
262 list( $userid, $username ) = checkUserCache( $page->username
, $page->host
);
263 $timestamp = wfUnix2Timestamp( $page->timestamp +
$wgTimezoneCorrection );
264 $redirect = ( preg_match( '/^#REDIRECT/', $page->text
) ?
1 : 0 );
266 if( !$historyonly ) {
268 INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
269 VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
272 INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
274 $sqlfinal = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)\n";
277 if( !$lasthistoryonly ) {
278 $revisions = fetchKeptPages( $title );
279 foreach( $revisions as $rev ) {
280 $text = wfStrencode( $rev->text
);
281 $minor = ($rev->minor ?
1 : 0);
282 list( $userid, $username ) = checkUserCache( $rev->username
, $rev->host
);
283 $timestamp = wfUnix2Timestamp( $rev->timestamp +
$wgTimezoneCorrection );
284 $sql .= "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$redirect,$minor),\n";
287 return $sql . $sqlfinal;
291 # Count up basic links
292 function countLinksFrom( $title )
294 $page = fetchPage( $title );
295 $page->text
= preg_replace(
296 '/<nowiki>.*<\/nowiki>/sDU',
299 $page->text
= preg_replace(
300 '/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/e',
301 'countLinkTo( ucfirst( "$1" ) )',
305 function countLinkTo( $title )
308 $t = transformTitle( $title );
309 $linkform = FreeToNormal( $t->title
);
310 $x = $linkcache[$title];
311 if ( count ( $x ) ) {
313 if ( $y ) $y++
; else $y = 1 ;
316 $x = array ( $linkform => 1 ) ;
318 $linkcache[$title] = $x;
321 # Preferentially change case
322 function renamePage( $title )
325 $t = transformTitle( $title );
327 # We want to use the most frequently linked-to form as the title
328 $maxcount = 0 ; $maxform = $t->title
;
329 foreach ( $linkcache[$title] as $linkform => $count ) {
330 if ( $count > $maxcount ) {
332 $maxform = $linkform ;
335 if( $maxform != $t->title
) {
336 doRenamePage( $t, $maxform );
340 function doRenamePage( $title, $maxform )
342 global $linkcache, $redirectcomment, $conversionscript, $conversiontime;
343 $sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
346 foreach( $linkcache[$title] as $linkform => $count ) {
347 if( $linkform != $maxform ) {
348 $comment = wfStrencode( str_replace( "$1", $maxform, $redirectcomment ) );
349 array_push( $redirsql, "($namespace,'$redirtitle','$comment',0,'$conversionscript','$conversiontime',1,1)" );
352 $sql .= implode( ",\n\t", $redirsql ) . ";\n";
356 # Account for syntax changes
357 function rewritePage( $title, $text )
360 $text = removeTalkLink( $text );
361 $text = preg_replace( '/(^|<nowiki>).+?(<\/nowiki>|$)/esD',
362 'rewritePageBits( $title, "$1")',
367 function rewritePageBits( $title, $text ) {
368 $text = fixSubpages( $title, $text );
369 $text = fixMedialinks( $text );
370 $text = fixImagelinks( $text );
374 function removeTalkLink( &$text ) {
376 return preg_replace( "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi", '', $text );
379 function fixSubpages( $text, &$title ) {
380 $old = preg_quote( $text );
381 $text = preg_replace( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
382 "$1[[$title/$2|/$2]]", $text );
383 $text = preg_replace( "<\[\[/([^|]*?)\]\]>e",
384 "\"[[$title/\" . ucfirst( \"$1|/$1]]\" )", $text );
385 $text = preg_replace( "<\[\[/(.*?)\]\]>e",
386 "\"[[$title/\" . ucfirst( \"$1]]\" )", $text );
390 function fixImagelinks( &$text ) {
391 global $imageimport, $namespaces;
392 return preg_replace( "/$imageimport/e",
393 '"[[{$namespaces[6]}" . fetchMediaFile( "$1", "$2" ) . "]]"',
397 function fixMedialinks( &$text ) {
398 global $imageimport, $mediatext;
399 $text = preg_replace( "/\[$imageimport\]/e",
400 '"[[$mediatext:" . fetchMediaFile( "$1", "$2" ) . "]]"',
402 return preg_replace( "/\[$imageimport (.+?)\]/e",
403 '"[[$mediatext:" . fetchMediaFile( "$1", "$2" ) . "|$3]]"',
# Produce the local file name for an image referenced by URL.
#
# $url:      full URL of the image on the old site (not used by the
#            visible code).
# $filename: file name part of that URL.
#
# Returns $filename with its first character upper-cased.
function fetchMediaFile( $url, $filename )
{
	# Copy an image file into local upload space
	# NOTE(review): no copy is performed by the code visible here; only
	# the capitalized name is produced.
	$localname = ucfirst( $filename );
	return $localname;
}
414 # Simple move of talk pages, etc
415 function transformTitle( $title, $dorename = false )
418 if( preg_match( "/^(.+)[ _]?\\/[ _]?($talkending)/i", $title, $m ) ) {
425 return Title
::fromData( $namespace, $thetitle );
428 # Translated out of old usemod wiki...
429 function FreeToNormal ( $id , $FreeUpper = true ) {
430 $id = str_replace ( " ", "_", $id ) ;
432 if (strstr($id, '_') != false) { # Quick check for any space/underscores
433 $id = preg_replace ( '/__+/' , "_" , $id ) ;
434 $id = preg_replace ( '/^_/' , "", $id ) ;
435 $id = preg_replace ( '/_$/' , "", $id ) ;
437 $id = preg_replace ( '|_/|', "/" , $id ) ;
438 $id = preg_replace ( '|/_|', "/" , $id ) ;
442 # Note that letters after ' are *not* capitalized
443 if (preg_match ( '|[-_.,\(\)/][a-z]|' , $id ) ) { # Quick check for non-canon
444 $id = preg_replace ( '|([-_.,\(\)/])([a-z])|e' , '"$1" . strtoupper("$2")' , $id ) ;
451 function recodeInput( $text )
# Convert a Unix timestamp into a 14-digit "YYYYMMDDHHMMSS" string in GMT.
#
# $unixtime: seconds since the Unix epoch.
#
# Returns the formatted timestamp string.
function wfUnix2Timestamp( $unixtime ) {
	# Format the parameter itself; formatting an unset variable would not
	# reflect the time that was passed in.
	return gmdate( "YmdHis", $unixtime );
}
# Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp string into a Unix
# timestamp.
#
# $ts: timestamp string; fields are read by fixed position.
#
# Returns the number of seconds since the Unix epoch.
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}