5 * @subpackage MaintenanceArchive
9 print "This script is obsolete!";
10 print "It is retained in the source here in case some of its
11 code might be useful for ad-hoc conversion tasks, but it is
12 not maintained and probably won't even work as is.";
16 Import data from a UseModWiki into a PediaWiki wiki
17 2003-02-09 Brion VIBBER <brion@pobox.com>
18 Based loosely on Magnus's code from 2001-2002
20 Pass one: collect data on links & title case, users
21 Pass two: spit out SQL for
22 Separately, be sure to run the link & index rebuilding scripts!
28 $wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
29 $wgFieldSeparator = "\xb3"; # Some wikis may use different char
30 $FS = $wgFieldSeparator ;
36 $imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';
38 # Number of *seconds to add* to timestamp to get UTC/GMT
39 #$wgTimezoneCorrection = 0; # GMT
40 $wgTimezoneCorrection = 8*3600; # PST - California
43 $historyonly = false; # Don't add converted revisions to cur table; just get old histories
44 $lasthistoryonly = false; # Only add the _original_ form of the _current_ revision
46 /* Vary by language */
47 $namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
48 => "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
# Attribution and edit summaries stamped on the rows this script generates.
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# "$1" is substituted with the target title when redirect rows are built.
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp
59 $titlecache = array();
63 * Some oversimplified test types
67 * @subpackage MaintenanceArchive
70 var $title, $namespace;
71 function fromData( $namespace, $title ) {
73 $x->namespace = $namespace;
79 # See tests in importTests.php
80 if( ! $testingonly ) {
85 # ------------------------------------------------------------------------------
92 global $wgRootDirectory, $oldtitles;
95 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
96 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
97 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
98 foreach( $letters as $letter ) {
99 firstPassDirectory( "$wgRootDirectory/page/$letter" );
103 function firstPassDirectory( $dir )
107 $mydir = opendir( $dir );
108 while( $entry = readdir( $mydir ) ) {
109 if( $entry != '.' && $entry != '..' ) {
110 if( is_dir( "$dir/$entry" ) ) {
111 firstPassDirectory( "$dir/$entry" );
113 } elseif( preg_match( '/$(.+)\.db$/', $entry, $m ) ) {
114 $titlecache[$title] = transformTitle( $m[1] );
115 countLinksFrom( $title );
117 echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
125 function secondPass()
127 global $titlecache, $usercache, $redirects;
129 foreach( $usercache as $oldname => $user ) {
130 echo importUser( $oldname );
132 foreach( $titlecache as $oldtitle => $newtitle ) {
133 echo importPage( $oldtitle );
140 # ------------------------------------------------------------------------------
143 Grab a given item from the database
145 function fetchUser( $uid )
147 global $FS,$FS2,$FS3, $wgRootDirectory;
149 $fname = $wgRootDirectory . "/pages/" . $title;
150 if( !file_exists( $fname ) ) return false;
152 $data = splitHash( implode( "", file( $fname ) ) );
/**
 * Read the current revision of a page from its UseModWiki database file.
 *
 * The file is decoded as three nested hashes: the page record is split on
 * $FS1, its "text_default" section on $FS2, and that section's "data"
 * field on $FS3.
 *
 * @param string $title File name of the page below "$wgRootDirectory/pages/"
 * @return array|false Array with keys text, summary, minor, ts, username
 *                     and host, or false when the page file does not exist
 */
function fetchPage( $title )
{
	# $FS1 is the outermost separator (the one fetchKeptPages() explodes on).
	# It was neither imported here nor passed to splitHash(), which takes
	# ( $sep, $str ).
	# NOTE(review): assumes $FS1 is defined next to $FS/$FS2/$FS3 - confirm.
	global $FS, $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/pages/" . $title;
	if( !file_exists( $fname ) ) return false;

	$page = splitHash( $FS1, implode( "", file( $fname ) ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	return array ( "text" => $text["text"] , "summary" => $text["summary"] ,
		"minor" => $text["minor"] , "ts" => $section["ts"] ,
		"username" => $section["username"] , "host" => $section["host"] ) ;
}
/**
 * Read the archived ("kept") revisions of a page from its .kp file.
 *
 * The file is split into revisions on $FS1; each revision is decoded as a
 * hash on $FS2 and its "data" field as a hash on $FS3. Revisions with empty
 * text, an empty "minor" field or a non-positive timestamp are skipped, and
 * a notice is echoed for each one.
 *
 * @param string $title Page name; the file read is "keep/$title.kp"
 * @return array List of revisions, each an array with keys text, summary,
 *               minor, ts, username and host; empty when no .kp file exists
 */
function fetchKeptPages( $title )
{
	# $FS1 is used for the explode() below, so it has to be imported too;
	# without it the separator is undefined inside the function.
	global $FS, $FS1, $FS2, $FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, implode( "", file( $fname ) ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, $section["data"] );
		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
			array_push( $revisions, array ( "text" => $text["text"] , "summary" => $text["summary"] ,
				"minor" => $text["minor"] , "ts" => $section["ts"] ,
				"username" => $section["username"] , "host" => $section["host"] ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
/**
 * Decode a flat "key SEP value SEP key SEP value ..." string into an
 * associative array.
 *
 * Pieces are consumed two at a time; a trailing piece without a partner
 * is ignored.
 *
 * @param string $sep Separator between consecutive keys and values
 * @param string $str Encoded string
 * @return array Map of key => value
 */
function splitHash ( $sep , $str ) {
	$pieces = explode ( $sep , $str ) ;
	$hash = array () ;
	$last = count ( $pieces ) - 1 ;
	for ( $k = 0 ; $k < $last ; $k += 2 ) {
		$hash[$pieces[$k]] = $pieces[$k + 1] ;
	}
	return $hash ;
}
210 Take a fetched item and produce SQL
214 $uid is the UseMod user id number.
215 The new ones will be assigned arbitrarily and are for internal use only.
217 THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
219 function importUser( $uid )
221 global $last_uid, $user_list, $wgTimestampCorrection;
225 $stuff = fetchUser( $uid );
228 $name = wfStrencode( $stuff->username
);
229 $hash = md5hash( $stuff->password
); # Doable?
230 $tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
231 $hideminor = ($stuff['rcall'] ?
0 : 1);
232 $options = "cols={$stuff['editcols']}
233 rows={$stuff['editrows']}
234 rcdays={$stuff['rcdays']}
235 timecorrection={$tzoffset}
236 hideminor={$hideminor}
240 INTO user (user_id,user_name,user_password,user_options)
241 VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
245 function checkUserCache( $name, $host )
250 if( in_array( $name, $usercache ) ) {
251 $userid = $usercache[$name];
253 # If we haven't imported user accounts
256 $username = wfStrencode( $name );
259 $username = wfStrencode( $host );
261 return array( $userid, $username );
264 function importPage( $title )
266 global $wgTimezoneCorrection, $titlecache, $usercache;
267 global $conversionscript, $conversioncomment, $conversiontime;
268 global $historyonly, $lasthistoryonly;
270 $page = fetchPage( $title );
272 $newtext = wfStrencode( rewritePage( $title, $page->text
) );
273 $t = renamePage( $title );
274 $newtitle = wfStrencode( $t->title
);
275 $namespace = $t->namespace;
278 $text = wfStrencode( $page->text
);
279 $minor = ($page->minor ?
1 : 0);
280 list( $userid, $username ) = checkUserCache( $page->username
, $page->host
);
281 $timestamp = wfUnix2Timestamp( $page->timestamp +
$wgTimezoneCorrection );
282 $redirect = ( preg_match( '/^#REDIRECT/', $page->text
) ?
1 : 0 );
284 if( !$historyonly ) {
286 INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
287 VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
290 INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
292 $sqlfinal = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)\n";
295 if( !$lasthistoryonly ) {
296 $revisions = fetchKeptPages( $title );
297 foreach( $revisions as $rev ) {
298 $text = wfStrencode( $rev->text
);
299 $minor = ($rev->minor ?
1 : 0);
300 list( $userid, $username ) = checkUserCache( $rev->username
, $rev->host
);
301 $timestamp = wfUnix2Timestamp( $rev->timestamp +
$wgTimezoneCorrection );
302 $sql .= "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$redirect,$minor),\n";
305 return $sql . $sqlfinal;
309 # Count up basic links
310 function countLinksFrom( $title )
312 $page = fetchPage( $title );
313 $page->text
= preg_replace(
314 '/<nowiki>.*<\/nowiki>/sDU',
317 $page->text
= preg_replace(
318 '/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/e',
319 'countLinkTo( ucfirst( "$1" ) )',
323 function countLinkTo( $title )
326 $t = transformTitle( $title );
327 $linkform = FreeToNormal( $t->title
);
328 $x = $linkcache[$title];
329 if ( count ( $x ) ) {
331 if ( $y ) $y++
; else $y = 1 ;
334 $x = array ( $linkform => 1 ) ;
336 $linkcache[$title] = $x;
339 # Preferentially change case
340 function renamePage( $title )
343 $t = transformTitle( $title );
345 # We want to use the most frequently linked-to form as the title
346 $maxcount = 0 ; $maxform = $t->title
;
347 foreach ( $linkcache[$title] as $linkform => $count ) {
348 if ( $count > $maxcount ) {
350 $maxform = $linkform ;
353 if( $maxform != $t->title
) {
354 doRenamePage( $t, $maxform );
358 function doRenamePage( $title, $maxform )
360 global $linkcache, $redirectcomment, $conversionscript, $conversiontime;
361 $sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
364 foreach( $linkcache[$title] as $linkform => $count ) {
365 if( $linkform != $maxform ) {
366 $comment = wfStrencode( str_replace( "$1", $maxform, $redirectcomment ) );
367 array_push( $redirsql, "($namespace,'$redirtitle','$comment',0,'$conversionscript','$conversiontime',1,1)" );
370 $sql .= implode( ",\n\t", $redirsql ) . ";\n";
374 # Account for syntax changes
375 function rewritePage( $title, $text )
378 $text = removeTalkLink( $text );
379 $text = preg_replace( '/(^|<nowiki>).+?(<\/nowiki>|$)/esD',
380 'rewritePageBits( $title, "$1")',
385 function rewritePageBits( $title, $text ) {
386 $text = fixSubpages( $title, $text );
387 $text = fixMedialinks( $text );
388 $text = fixImagelinks( $text );
392 function removeTalkLink( &$text ) {
394 return preg_replace( "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi", '', $text );
397 function fixSubpages( $text, &$title ) {
398 $old = preg_quote( $text );
399 $text = preg_replace( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
400 "$1[[$title/$2|/$2]]", $text );
401 $text = preg_replace( "<\[\[/([^|]*?)\]\]>e",
402 "\"[[$title/\" . ucfirst( \"$1|/$1]]\" )", $text );
403 $text = preg_replace( "<\[\[/(.*?)\]\]>e",
404 "\"[[$title/\" . ucfirst( \"$1]]\" )", $text );
408 function fixImagelinks( &$text ) {
409 global $imageimport, $namespaces;
410 return preg_replace( "/$imageimport/e",
411 '"[[{$namespaces[6]}" . fetchMediaFile( "$1", "$2" ) . "]]"',
415 function fixMedialinks( &$text ) {
416 global $imageimport, $mediatext;
417 $text = preg_replace( "/\[$imageimport\]/e",
418 '"[[$mediatext:" . fetchMediaFile( "$1", "$2" ) . "]]"',
420 return preg_replace( "/\[$imageimport (.+?)\]/e",
421 '"[[$mediatext:" . fetchMediaFile( "$1", "$2" ) . "|$3]]"',
425 function fetchMediaFile( $url, $filename )
427 # Copy an image file into local upload space
429 return ucfirst( $filename );
432 # Simple move of talk pages, etc
433 function transformTitle( $title, $dorename = false )
436 if( preg_match( "/^(.+)[ _]?\\/[ _]?($talkending)/i", $title, $m ) ) {
443 return Title
::fromData( $namespace, $thetitle );
446 # Translated out of old usemod wiki...
447 function FreeToNormal ( $id , $FreeUpper = true ) {
448 $id = str_replace ( " ", "_", $id ) ;
450 if (strstr($id, '_') != false) { # Quick check for any space/underscores
451 $id = preg_replace ( '/__+/' , "_" , $id ) ;
452 $id = preg_replace ( '/^_/' , "", $id ) ;
453 $id = preg_replace ( '/_$/' , "", $id ) ;
455 $id = preg_replace ( '|_/|', "/" , $id ) ;
456 $id = preg_replace ( '|/_|', "/" , $id ) ;
460 # Note that letters after ' are *not* capitalized
461 if (preg_match ( '|[-_.,\(\)/][a-z]|' , $id ) ) { # Quick check for non-canon
462 $id = preg_replace ( '|([-_.,\(\)/])([a-z])|e' , '"$1" . strtoupper("$2")' , $id ) ;
469 function recodeInput( $text )
/**
 * Format a Unix timestamp as a 14-digit "YmdHis" string in GMT.
 *
 * @param int $unixtime Seconds since the Unix epoch
 * @return string Timestamp such as "20030209123456"
 */
function wfUnix2Timestamp( $unixtime ) {
	# Format the parameter itself; the previous code passed an undefined
	# $timestamp variable to gmdate().
	return gmdate( "YmdHis", $unixtime );
}
/**
 * Convert a 14-digit "YmdHis" timestamp, read as GMT, to a Unix timestamp.
 *
 * @param string $ts Timestamp such as "20030209123456"
 * @return int Seconds since the Unix epoch
 */
function wfTimestamp2Unix( $ts )
{
	# Fixed-width fields: YYYY MM DD HH MM SS
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}