Fixme note
[mediawiki.git] / maintenance / archives / importUseModWiki.php
blob5e033a55aa15a7dc3d69f1626a854a410acf2d07
1 <?php
/*
	Import data from a UseModWiki into a PediaWiki wiki
	2003-02-09 Brion VIBBER <brion@pobox.com>
	Based loosely on Magnus's code from 2001-2002

	Pass one: collect data on links & title case, users
	Pass two: spit out SQL for the collected users and pages

	Separately, be sure to run the link & index rebuilding scripts!
	*/
/* globals
	*/
# Root of the UseModWiki database directory to read from.
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
# $FS1..$FS3 are the nested record separators used when splitting page data.
$FS = $wgFieldSeparator ;
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;

# Images to import
# (regex body without delimiters: group 1 is the whole URL, group 2 the file name)
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';

# Number of *seconds to add* to timestamp to get UTC/GMT
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8*3600; # PST - California

# Other options...
$historyonly = false; # Don't add converted revisions to cur table; just get old histories
$lasthistoryonly = false; # Only add the _original_ form of the _current_ revision

/* Vary by language */
$namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
=> "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
$talkending = "Talk";
$mediatext = "Media";
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# "$1" is replaced with the target title when a redirect is generated.
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp

# Stats and caches
$oldtitles = array();
$usercache = array();
$titlecache = array();
$linkcache = array();
# Some oversimplified test types
# Minimal title holder: a namespace index plus the title text.
class Title {
	var $title, $namespace;

	# Build a Title from a namespace index and a title string.
	# Declared static because it is only ever invoked as Title::fromData();
	# calling a non-static method that way is an error on current PHP.
	static function fromData( $namespace, $title ) {
		$x = new Title;
		$x->namespace = $namespace;
		$x->title = $title;
		return $x;
	}
}
# See tests in importTests.php
# Run both passes unless a test harness has set $testingonly beforehand.
# empty() keeps the same truthiness test without a warning when the
# variable was never defined.
if( empty( $testingonly ) ) {
	firstPass();
	secondPass();
}
67 # ------------------------------------------------------------------------------
/* First pass:
	Information please!
	*/
# Scans page/A ... page/Z and then page/other under $wgRootDirectory,
# handing each directory to firstPassDirectory().
function firstPass()
{
	global $wgRootDirectory, $oldtitles;

	$subdirs = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $subdirs as $subdir ) {
		firstPassDirectory( $wgRootDirectory . "/page/" . $subdir );
	}
}
# Recursively walk $dir. Every "<name>.db" file is recorded in $titlecache
# (old name => transformed Title) and its outgoing links are counted.
# Anything else that is not a directory is reported in an SQL comment.
function firstPassDirectory( $dir )
{
	global $titlecache;

	$mydir = opendir( $dir );
	if( $mydir === false ) {
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the loop early.
	while( false !== ( $entry = readdir( $mydir ) ) ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			# The title is the file name without its ".db" suffix.
			# NOTE(review): files found in nested directories lose their
			# parent directory name here -- confirm how subpages are stored.
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
/* Second pass:
	make the dang SQL
	*/
# Prints the SQL for every cached user, then for every cached page,
# followed by a closing "-- Done!" comment.
function secondPass()
{
	global $titlecache, $usercache, $redirects;

	# Only the cache keys (the old names) are passed on.
	foreach( array_keys( $usercache ) as $oldname ) {
		echo importUser( $oldname );
	}

	foreach( array_keys( $titlecache ) as $oldtitle ) {
		echo importPage( $oldtitle );
	}

	echo "\n-- Done!\n";
}
122 # ------------------------------------------------------------------------------
/* fetch_ functions
	Grab a given item from the database
	*/
# Load the stored data for UseMod user id $uid as a key => value array.
# Returns false when the user file does not exist.
# NOTE(review): importUser() currently returns before calling this, so the
# path layout and separator below are untested. They follow UseModWiki's
# user/<uid mod 10>/<uid>.db convention -- confirm against a real data dir.
function fetchUser( $uid )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/user/" . ( (int)$uid % 10 ) . "/" . $uid . ".db";
	if( !file_exists( $fname ) ) return false;

	$data = splitHash( $FS1, implode( "", file( $fname ) ) );
	# enough?

	return $data;
}
# Load the current revision of page $title.
# Returns false when the page file is missing, otherwise an array with the
# keys text, summary, minor, ts, username and host (null when absent).
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	# firstPass() finds articles as page/<A-Z or other>/<title>.db, so look
	# in the same place rather than in a flat directory.
	if( preg_match( '/^[A-Za-z]/', $title ) ) {
		$subdir = strtoupper( substr( $title, 0, 1 ) );
	} else {
		$subdir = "other";
	}
	$fname = $wgRootDirectory . "/page/" . $subdir . "/" . $title . ".db";
	if( !file_exists( $fname ) ) return false;

	# Three nested levels: page fields ($FS1), section fields ($FS2),
	# text fields ($FS3).
	$page = splitHash( $FS1, implode( "", file( $fname ) ) );
	$section = splitHash( $FS2, isset( $page["text_default"] ) ? $page["text_default"] : "" );
	$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

	$result = array();
	foreach( array( "text", "summary", "minor" ) as $key ) {
		$result[$key] = isset( $text[$key] ) ? $text[$key] : null;
	}
	foreach( array( "ts", "username", "host" ) as $key ) {
		$result[$key] = isset( $section[$key] ) ? $section[$key] : null;
	}
	return $result;
}
# Load the kept (old) revisions of page $title.
# Returns a list of arrays with the keys text, summary, minor, ts, username
# and host; an empty list when there is no keep file. Revisions lacking
# text, a minor flag or a positive timestamp are skipped with an SQL comment.
function fetchKeptPages( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;

	# NOTE(review): keep files are assumed to sit in the same per-letter
	# directories as page files (UseModWiki's KeepFileName) -- confirm.
	if( preg_match( '/^[A-Za-z]/', $title ) ) {
		$subdir = strtoupper( substr( $title, 0, 1 ) );
	} else {
		$subdir = "other";
	}
	$fname = $wgRootDirectory . "/keep/" . $subdir . "/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();

	# $FS1 must be declared global above; without it explode() gets an
	# empty delimiter.
	$keptlist = explode( $FS1, implode( "", file( $fname ) ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

		$hastext = !empty( $text["text"] );
		$hasminor = isset( $text["minor"] ) && $text["minor"] != "";
		$hasts = isset( $section["ts"] ) && ( (int)$section["ts"] > 0 );

		if ( $hastext && $hasminor && $hasts ) {
			array_push( $revisions, array (
				"text" => $text["text"] ,
				"summary" => isset( $text["summary"] ) ? $text["summary"] : null ,
				"minor" => $text["minor"] ,
				"ts" => $section["ts"] ,
				"username" => isset( $section["username"] ) ? $section["username"] : null ,
				"host" => isset( $section["host"] ) ? $section["host"] : null ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
# Split $str on $sep and read the pieces as alternating key, value, key,
# value... A trailing key with no value is dropped.
function splitHash ( $sep , $str ) {
	$pieces = explode ( $sep , $str ) ;
	$hash = array () ;
	$last = count ( $pieces ) - 1 ;
	for ( $k = 0 ; $k < $last ; $k += 2 ) {
		$hash[$pieces[$k]] = $pieces[$k + 1] ;
	}
	return $hash ;
}
/* import_ functions
	Take a fetched item and produce SQL
	*/
/* importUser
	$uid is the UseMod user id number.
	The new ones will be assigned arbitrarily and are for internal use only.

	THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
	*/
# Currently always returns "" (see the early return). The code after it is
# the not-yet-enabled user conversion, kept so it can be switched on later.
function importUser( $uid )
{
	global $last_uid, $user_list, $wgTimezoneCorrection;

	# User import is disabled on purpose; nothing below runs.
	return "";

	$stuff = fetchUser( $uid );
	$last_uid++;

	# fetchUser() returns an array, so use array access throughout.
	$name = wfStrencode( $stuff['username'] );
	# NOTE(review): md5hash() is not defined in this file -- confirm where
	# it comes from before enabling this code.
	$hash = md5hash( $stuff['password'] ); # Doable?
	$tzoffset = $stuff['tzoffset'] - ($wgTimezoneCorrection / 3600); # -8 to 0; +9 to +1
	$hideminor = ($stuff['rcall'] ? 0 : 1);
	$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";

	$sql = "INSERT
	INTO user (user_id,user_name,user_password,user_options)
	VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}
# Work out the (user id, escaped user text) pair to store for an edit.
# A named editor gets the id recorded in $usercache under that name, or 0
# when the name is not cached; an anonymous edit gets id 0 and the host.
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# $usercache is keyed by user name, so test the key; in_array()
		# would search the stored ids instead.
		if( is_array( $usercache ) && array_key_exists( $name, $usercache ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
# Produce the SQL for one page: optionally a converted current revision for
# the cur table, plus one INSERT into old holding the kept history followed
# by the original form of the current revision.
# Returns an SQL comment line when the page cannot be loaded.
function importPage( $title )
{
	global $wgTimezoneCorrection, $titlecache, $usercache;
	global $conversionscript, $conversioncomment, $conversiontime;
	global $historyonly, $lasthistoryonly;

	$page = fetchPage( $title );
	if( !is_array( $page ) ) {
		return "-- Couldn't load page '$title'. Skipping.\n";
	}

	$newtext = wfStrencode( rewritePage( $title, $page['text'] ) );
	$t = renamePage( $title );
	if( !is_object( $t ) ) {
		# Fall back to the plain transformed title if no Title came back.
		$t = transformTitle( $title );
	}
	$newtitle = wfStrencode( $t->title );
	$namespace = $t->namespace;

	# Current revision:
	# fetchPage() returns an array whose timestamp key is "ts".
	$text = wfStrencode( $page['text'] );
	$comment = wfStrencode( $page['summary'] );
	$minor = ($page['minor'] ? 1 : 0);
	list( $userid, $username ) = checkUserCache( $page['username'], $page['host'] );
	$timestamp = wfUnix2Timestamp( (int)$page['ts'] + $wgTimezoneCorrection );
	$redirect = ( preg_match( '/^#REDIRECT/', (string)$page['text'] ) ? 1 : 0 );
	$sql = "\n";
	if( !$historyonly ) {
		$sql .= "INSERT
	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
	}
	$sql .= "INSERT
	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
	VALUES";
	# Last row of the statement, so it carries the terminating semicolon.
	$sqlfinal = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor);\n";

	# History
	if( !$lasthistoryonly ) {
		$revisions = fetchKeptPages( $title );
		foreach( $revisions as $rev ) {
			$text = wfStrencode( $rev['text'] );
			$comment = wfStrencode( $rev['summary'] );
			$minor = ($rev['minor'] ? 1 : 0);
			list( $userid, $username ) = checkUserCache( $rev['username'], $rev['host'] );
			$timestamp = wfUnix2Timestamp( (int)$rev['ts'] + $wgTimezoneCorrection );
			# Eight values for the eight old_* columns (no redirect flag).
			$sql .= "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor),\n";
		}
	}
	return $sql . $sqlfinal;
}
# Count up basic links
# Reads page $title, ignores anything inside <nowiki>...</nowiki>, and calls
# countLinkTo() once for every [[link]] or [[link|label]] target found.
# Does nothing when the page cannot be loaded.
function countLinksFrom( $title )
{
	$page = fetchPage( $title );
	if( !is_array( $page ) || !isset( $page['text'] ) ) {
		return;
	}

	$text = preg_replace(
		'/<nowiki>.*<\/nowiki>/sDU',
		'',
		$page['text'] );

	# Collect the link targets directly instead of evaluating code inside
	# a regex replacement (the /e modifier no longer exists).
	$found = preg_match_all(
		'/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/',
		(string)$text,
		$matches );
	if( $found ) {
		foreach( $matches[1] as $target ) {
			countLinkTo( ucfirst( $target ) );
		}
	}
}
# Record one more link to $title: increments
# $linkcache[$title][<normalized form>], starting at 1 the first time.
function countLinkTo( $title )
{
	global $linkcache;
	$t = transformTitle( $title );
	$linkform = FreeToNormal( $t->title );

	# isset() covers both a title and a link form seen for the first time,
	# so nothing undefined is ever read or counted.
	if( isset( $linkcache[$title][$linkform] ) && $linkcache[$title][$linkform] ) {
		$linkcache[$title][$linkform]++;
	} else {
		$linkcache[$title][$linkform] = 1;
	}
}
# Preferentially change case
# Returns the Title to import $title under: the transformed title, with its
# text replaced by the most frequently linked-to form when one was counted.
function renamePage( $title )
{
	global $linkcache;
	$t = transformTitle( $title );

	# We want to use the most frequently linked-to form as the title
	$maxcount = 0 ; $maxform = $t->title ;
	if( isset( $linkcache[$title] ) && is_array( $linkcache[$title] ) ) {
		foreach ( $linkcache[$title] as $linkform => $count ) {
			if ( $count > $maxcount ) {
				$maxcount = $count ;
				$maxform = $linkform ;
			}
		}
	}

	if( $maxform != $t->title) {
		# NOTE(review): the redirect SQL built here is discarded, exactly
		# as before -- decide whether it should be emitted.
		doRenamePage( $t, $maxform );
		$t->title = (string)$maxform;
	}
	# importPage() reads ->title and ->namespace from the result.
	return $t;
}
# Build one INSERT creating a redirect to $maxform for every other link form
# recorded for $title. Returns "" when there is nothing to redirect.
# $title is the Title object handed over by renamePage(); a plain string is
# accepted too and is then placed in namespace 0.
function doRenamePage( $title, $maxform )
{
	global $linkcache, $redirectcomment, $conversionscript, $conversiontime;

	if( is_object( $title ) ) {
		# NOTE(review): $linkcache is keyed by the untransformed title, which
		# can differ from ->title for talk pages -- confirm the intended key.
		$key = $title->title;
		$namespace = $title->namespace;
	} else {
		$key = $title;
		$namespace = 0;
	}
	if( !isset( $linkcache[$key] ) || !is_array( $linkcache[$key] ) ) {
		return "";
	}

	$comment = wfStrencode( str_replace( '$1', $maxform, $redirectcomment ) );
	# cur_text was missing from the row; importPage() recognises redirects
	# by a leading "#REDIRECT".
	$redirtext = wfStrencode( "#REDIRECT [[" . $maxform . "]]" );

	$redirsql = array();
	foreach( $linkcache[$key] as $linkform => $count ) {
		if( (string)$linkform !== (string)$maxform ) {
			$redirtitle = wfStrencode( (string)$linkform );
			array_push( $redirsql, "($namespace,'$redirtitle','$redirtext','$comment',0,'$conversionscript','$conversiontime',1,1)" );
		}
	}
	if( !$redirsql ) {
		# An INSERT with an empty VALUES list would be invalid SQL.
		return "";
	}

	$sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES ";
	$sql .= implode( ",\n\t", $redirsql ) . ";\n";
	return $sql;
}
# Account for syntax changes
# Strips the talk link, then applies rewritePageBits() to every stretch of
# text outside <nowiki>...</nowiki>; nowiki sections are kept verbatim.
function rewritePage( $title, $text )
{
	# ...
	$text = removeTalkLink( $text );

	# With the delimiter captured, even indexes are ordinary text and odd
	# indexes are complete <nowiki> sections. The former /e replacement
	# passed only "$1" on and so threw the page text away.
	$parts = preg_split( '/(<nowiki>.*?<\/nowiki>)/s', (string)$text, -1, PREG_SPLIT_DELIM_CAPTURE );
	if( $parts === false ) {
		return $text;
	}
	foreach( $parts as $i => $part ) {
		if( $i % 2 == 0 ) {
			$parts[$i] = rewritePageBits( $title, $part );
		}
	}
	return implode( '', $parts );
}
# Apply the link conversions to one piece of page text: subpage links first,
# then bracketed media links, then bare image URLs.
function rewritePageBits( $title, $text ) {
	# fixSubpages() is declared as ( $text, &$title ): text first.
	$text = fixSubpages( $text, $title );
	$text = fixMedialinks( $text );
	$text = fixImagelinks( $text );
	return $text;
}
# Returns $text with every "/<talkending>" link removed (optionally wrapped
# in [[ ]]), together with the newlines before it and whitespace after it.
# Matching is case-insensitive. The argument itself is left unmodified.
function removeTalkLink( &$text ) {
	global $talkending;
	$pattern = "[\\n*(?:\[\[)?/" . $talkending . "(?:\]\])?\\s*]sDi";
	$stripped = preg_replace( $pattern, '', $text );
	return $stripped;
}
# Turn UseMod subpage links into full links below $title:
#   " /Sub"        -> " [[Title/Sub|/Sub]]"
#   "[[/sub]]"     -> "[[Title/Sub|/sub]]"
#   "[[/sub|txt]]" -> "[[Title/Sub|txt]]"
# Callbacks replace the /e modifier, so neither the title nor the matched
# text is ever evaluated as PHP code.
function fixSubpages( $text, &$title ) {
	$base = $title;

	# Bare "/Name" at the start of the text or after whitespace.
	$text = preg_replace_callback( '<(^|\s)/([A-Z\xc0-\xdf].*?)\b>',
		function( $m ) use ( $base ) {
			return $m[1] . "[[" . $base . "/" . $m[2] . "|/" . $m[2] . "]]";
		},
		$text );

	# "[[/name]]" without a label: keep the original form as link text.
	$text = preg_replace_callback( '<\[\[/([^|]*?)\]\]>',
		function( $m ) use ( $base ) {
			return "[[" . $base . "/" . ucfirst( $m[1] . "|/" . $m[1] . "]]" );
		},
		$text );

	# "[[/name|label]]": only the target needs its first letter raised.
	$text = preg_replace_callback( '<\[\[/(.*?)\]\]>',
		function( $m ) use ( $base ) {
			return "[[" . $base . "/" . ucfirst( $m[1] . "]]" );
		},
		$text );
	return $text;
}
# Replace every URL matching $imageimport with a link into namespace 6,
# using the name returned by fetchMediaFile( url, filename ).
function fixImagelinks( &$text ) {
	global $imageimport, $namespaces;
	$prefix = "[[" . $namespaces[6];
	# Callback instead of the /e modifier: the URL is passed as data and is
	# never evaluated as PHP code.
	return preg_replace_callback( "/$imageimport/",
		function( $m ) use ( $prefix ) {
			return $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
}
# Convert bracketed external links to importable files into media links:
#   "[url]"       -> "[[<mediatext>:File]]"
#   "[url label]" -> "[[<mediatext>:File|label]]"
# As before, the by-reference argument ends up holding the result of the
# first replacement.
function fixMedialinks( &$text ) {
	global $imageimport, $mediatext;
	$prefix = "[[" . $mediatext . ":";

	# Callbacks instead of the /e modifier: matched text stays plain data.
	$text = preg_replace_callback( "/\[$imageimport\]/",
		function( $m ) use ( $prefix ) {
			return $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
	return preg_replace_callback( "/\[$imageimport (.+?)\]/",
		function( $m ) use ( $prefix ) {
			return $prefix . fetchMediaFile( $m[1], $m[2] ) . "|" . $m[3] . "]]";
		},
		$text );
}
# Placeholder for copying an image into local upload space: for now it only
# returns the local name, i.e. $filename with its first letter upper-cased.
# $url is not used yet.
function fetchMediaFile( $url, $filename )
{
	# Copy an image file into local upload space
	# FIXME
	$localname = ucfirst( $filename );
	return $localname;
}
# Simple move of talk pages, etc
# A title of the form "<page>/<talkending>" (case-insensitive, optional
# space or underscore around the slash) becomes <page> in namespace 1;
# everything else stays as it is in namespace 0. $dorename is not used.
function transformTitle( $title, $dorename = false )
{
	global $talkending;

	$pattern = "/^(.+)[ _]?\\/[ _]?($talkending)/i";
	if( !preg_match( $pattern, $title, $m ) ) {
		return Title::fromData( 0, $title );
	}
	return Title::fromData( 1, $m[1] );
}
# Translated out of old usemod wiki...
# Normalize a free-form link: spaces become underscores, the first letter is
# upper-cased, runs of underscores collapse, and underscores at either end or
# next to a slash are dropped. With $FreeUpper, a lower-case ASCII letter
# following any of - _ . , ( ) / is upper-cased as well.
function FreeToNormal ( $id , $FreeUpper = true ) {
	$id = str_replace ( " ", "_", $id ) ;
	$id = ucfirst($id);
	if (strstr($id, '_') != false) { # Quick check for any space/underscores
		$id = preg_replace ( '/__+/' , "_" , $id ) ;
		$id = preg_replace ( '/^_/' , "", $id ) ;
		$id = preg_replace ( '/_$/' , "", $id ) ;
		#if ($UseSubpage) {
		$id = preg_replace ( '|_/|', "/" , $id ) ;
		$id = preg_replace ( '|/_|', "/" , $id ) ;
		#}
	}
	if ($FreeUpper) {
		# Note that letters after ' are *not* capitalized
		if (preg_match ( '|[-_.,\(\)/][a-z]|' , $id ) ) { # Quick check for non-canon
			# Callback instead of the /e modifier, which PHP no longer supports.
			$id = preg_replace_callback ( '|([-_.,\(\)/])([a-z])|' ,
				function ( $m ) { return $m[1] . strtoupper ( $m[2] ) ; } ,
				$id ) ;
		}
	}
	return $id;
}
# Whee!
# Hook for character-set conversion of imported text; currently hands the
# text back untouched.
function recodeInput( $text )
{
	$recoded = $text;
	return $recoded;
}
# Format a Unix timestamp as a 14-digit "YmdHis" string in GMT.
function wfUnix2Timestamp( $unixtime ) {
	# Format the argument itself; the body used to read an undefined
	# $timestamp variable instead.
	return gmdate( "YmdHis", (int)$unixtime );
}
# Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp string into a Unix
# timestamp.
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}