Test inline interwiki link
[mediawiki.git] / maintenance / archives / importUseModWiki.php
blob755acc14068e8a6691776b870c70fbf9e815cc38
<?php
/**
 * Obsolete UseModWiki import script, kept in the tree as reference material.
 *
 * @deprecated
 * @package MediaWiki
 * @subpackage MaintenanceArchive
 */

/** */
# Refuse to run. Everything after exit() is retained only so that pieces of
# it can be borrowed for ad-hoc conversions. The "\n" keeps the two
# messages from being printed fused together.
print "This script is obsolete!\n";
print "It is retained in the source here in case some of its
code might be useful for ad-hoc conversion tasks, but it is
not maintained and probably won't even work as is.";
exit();
/*
	Import data from a UseModWiki into a PediaWiki wiki
	2003-02-09 Brion VIBBER <brion@pobox.com>
	Based loosely on Magnus's code from 2001-2002

	Pass one: collect data on links & title case, users
	Pass two: spit out SQL for the users and pages
	Separately, be sure to run the link & index rebuilding scripts!
*/

/* globals */
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
# Nested separators: the base character followed by a level digit
$FS = $wgFieldSeparator ;
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;

# Images to import
# Capture group 1 is the whole URL, group 2 the file name
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';

# Number of *seconds to add* to timestamp to get UTC/GMT
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8*3600; # PST - California

# Other options...
$historyonly = false; # Don't add converted revisions to cur table; just get old histories
$lasthistoryonly = false; # Only add the _original_ form of the _current_ revision

/* Vary by language */
$namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
=> "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
$talkending = "Talk";
$mediatext = "Media";
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# "$1" is replaced with the redirect target when the comment is used
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp

# Stats and caches
$oldtitles = array();
$usercache = array();
$titlecache = array();
$linkcache = array();
/**
 * Some oversimplified test types
 *
 * Bare-bones page title: a namespace index paired with the title text.
 *
 * @deprecated
 * @package MediaWiki
 * @subpackage MaintenanceArchive
 */
class Title {
	/** Title text, without any namespace prefix */
	public $title;
	/** Namespace index, as used for the keys of $namespaces */
	public $namespace;

	/**
	 * Build a Title from its two parts.
	 *
	 * Declared static because it is invoked as Title::fromData(); calling a
	 * non-static method that way is an error on current PHP.
	 *
	 * @param mixed $namespace namespace index
	 * @param string $title title text
	 * @return Title
	 */
	public static function fromData( $namespace, $title ) {
		$x = new Title;
		$x->namespace = $namespace;
		$x->title = $title;
		return $x;
	}
}
# See tests in importTests.php
# Run both passes unless $testingonly is set to a true value (presumably by
# the test file before it includes this one -- confirm against
# importTests.php). empty() avoids an undefined-variable notice when the
# flag was never set.
if( empty( $testingonly ) ) {
	firstPass();
	secondPass();
}
# ------------------------------------------------------------------------------

/* First pass:
	Information please!
*/
/**
 * Scan each per-letter directory under "$wgRootDirectory/page" by handing
 * it to firstPassDirectory().
 */
function firstPass()
{
	global $wgRootDirectory;

	# Directory names are 'A' through 'Z' plus 'other'
	$letters = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $letters as $letter ) {
		firstPassDirectory( "$wgRootDirectory/page/$letter" );
	}
}
/**
 * Recursively scan one directory of page files.
 *
 * For every entry named "<title>.db" the transformed title is stored in
 * $titlecache under the old title and the page's outgoing links are
 * counted. Subdirectories are descended into; anything else is reported
 * as an SQL comment on standard output.
 *
 * @param string $dir directory to scan
 */
function firstPassDirectory( $dir )
{
	global $titlecache;

	$mydir = is_dir( $dir ) ? opendir( $dir ) : false;
	if( $mydir === false ) {
		echo "-- Directory '$dir' can't be read. Skipping.\n";
		return;
	}

	# Compare with false explicitly: a file named "0" is a falsy string and
	# would otherwise end the loop early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			# The old title is the file name minus its ".db" suffix
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
/* Second pass:
	make the dang SQL
*/
/**
 * Print the SQL for every cached user and then every cached page, followed
 * by a closing "-- Done!" comment.
 */
function secondPass()
{
	global $titlecache, $usercache;

	# Only the keys (old names / old titles) are needed here
	foreach( array_keys( $usercache ) as $oldname ) {
		echo importUser( $oldname );
	}
	foreach( array_keys( $titlecache ) as $oldtitle ) {
		echo importPage( $oldtitle );
	}

	echo "\n-- Done!\n";
}
# ------------------------------------------------------------------------------

/* fetch_ functions
	Grab a given item from the database
*/
/**
 * Load one user's record as a key => value array.
 *
 * @param mixed $uid UseMod user id
 * @return array|false the decoded record, or false if the file is missing
 */
function fetchUser( $uid )
{
	global $FS1, $wgRootDirectory;

	# NOTE(review): the "/pages/" location is carried over unchanged from the
	# original, which built the name from an undefined $title; the real
	# location of user files needs confirming.
	$fname = $wgRootDirectory . "/pages/" . $uid;
	if( !file_exists( $fname ) ) return false;

	# splitHash() needs a separator; assumes user files use the same
	# top-level $FS1 separator as kept-page files -- TODO confirm.
	$data = splitHash( $FS1, file_get_contents( $fname ) );
	# enough?

	return $data;
}
/**
 * Load the current revision of a page.
 *
 * The file is decoded in three nested levels with splitHash(): the page
 * record, its "text_default" section, and that section's "data".
 *
 * @param string $title old page title
 * @return array|false array with keys text, summary, minor, ts, username
 *         and host (null where the file lacks a field), or false if the
 *         file is missing
 */
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	# NOTE(review): firstPass() scans "$wgRootDirectory/page/<letter>/*.db"
	# while this reads "$wgRootDirectory/pages/<title>"; the two layouts
	# disagree and the path here needs confirming.
	$fname = $wgRootDirectory . "/pages/" . $title;
	if( !file_exists( $fname ) ) return false;

	# The outermost level was split without any separator in the original;
	# $FS1 is assumed from its use as the top-level separator in
	# fetchKeptPages() -- TODO confirm.
	$page = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, isset( $page["text_default"] ) ? $page["text_default"] : "" );
	$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

	$result = array();
	foreach( array( "text", "summary", "minor" ) as $key ) {
		$result[$key] = isset( $text[$key] ) ? $text[$key] : null;
	}
	foreach( array( "ts", "username", "host" ) as $key ) {
		$result[$key] = isset( $section[$key] ) ? $section[$key] : null;
	}
	return $result;
}
/**
 * Load the kept (historical) revisions of a page from its ".kp" file.
 *
 * Revisions lacking text, a minor flag or a positive timestamp are skipped
 * and reported as an SQL comment on standard output.
 *
 * @param string $title old page title
 * @return array list of arrays with keys text, summary, minor, ts,
 *         username and host; empty if the file does not exist
 */
function fetchKeptPages( $title )
{
	# $FS1 must be imported too: it splits the file into revisions
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

		# The (int) cast keeps a non-numeric timestamp from raising an error
		$usable = !empty( $text["text"] )
			&& isset( $text["minor"] ) && $text["minor"] != ""
			&& isset( $section["ts"] ) && (int)$section["ts"] > 0;

		if ( $usable ) {
			$revisions[] = array(
				"text" => $text["text"],
				"summary" => isset( $text["summary"] ) ? $text["summary"] : null,
				"minor" => $text["minor"],
				"ts" => $section["ts"],
				"username" => isset( $section["username"] ) ? $section["username"] : null,
				"host" => isset( $section["host"] ) ? $section["host"] : null );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
/**
 * Decode a flat "key SEP value SEP key SEP value ..." string.
 *
 * A trailing key that has no value after it is ignored.
 *
 * @param string $sep separator between fields
 * @param string $str encoded string
 * @return array key => value pairs in the order they appear
 */
function splitHash ( $sep , $str ) {
	$fields = explode( $sep, $str );
	$ret = array();
	# Walk the fields two at a time; stop before a dangling final key
	$lastIndex = count( $fields ) - 1;
	for ( $i = 0; $i < $lastIndex; $i += 2 ) {
		$ret[$fields[$i]] = $fields[$i + 1];
	}
	return $ret;
}
/* import_ functions
	Take a fetched item and produce SQL
*/

/* importUser
	$uid is the UseMod user id number.
	The new ones will be assigned arbitrarily and are for internal use only.

	THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
*/
/**
 * Produce the SQL for one user. Currently always returns an empty string;
 * see the early return below.
 *
 * @param mixed $uid UseMod user id
 * @return string SQL text
 */
function importUser( $uid )
{
	global $last_uid, $wgTimezoneCorrection;

	# User import is switched off (see the note above), so nothing past this
	# return executes. The code is kept for whoever re-enables it.
	return "";

	$stuff = fetchUser( $uid );
	$last_uid++;

	# fetchUser() returns an array, so fields are read with [] throughout
	$name = wfStrencode( $stuff['username'] );
	$hash = md5hash( $stuff['password'] ); # Doable?
	$tzoffset = $stuff['tzoffset'] - ($wgTimezoneCorrection / 3600); # -8 to 0; +9 to +1
	$hideminor = ($stuff['rcall'] ? 0 : 1);
	$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";

	$sql = "INSERT
	INTO user (user_id,user_name,user_password,user_options)
	VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}
/**
 * Work out the user id and escaped display name to record for an edit.
 *
 * @param string $name user name; empty for an anonymous edit
 * @param string $host host recorded for the edit, used when $name is empty
 * @return array ( user id, escaped name or host ); the id is 0 for
 *         anonymous edits and for names not present in $usercache
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( !$name ) {
		return array( 0, wfStrencode( $host ) );
	}

	# $usercache is keyed by user name, so the key has to be tested;
	# in_array() would search the stored values instead.
	# A miss means user accounts were not imported.
	$userid = isset( $usercache[$name] ) ? $usercache[$name] : 0;

	return array( $userid, wfStrencode( $name ) );
}
/**
 * Produce the SQL for one page.
 *
 * Unless $historyonly is set, a row for the converted text goes into cur.
 * The original current revision always goes into old, preceded by the
 * kept revisions unless $lasthistoryonly is set.
 *
 * @param string $title old page title
 * @return string SQL text
 */
function importPage( $title )
{
	global $wgTimezoneCorrection;
	global $conversionscript, $conversioncomment, $conversiontime;
	global $historyonly, $lasthistoryonly;

	$page = fetchPage( $title );
	if( $page === false ) {
		return "\n-- Page '$title' could not be read. Skipping.\n";
	}

	$newtext = wfStrencode( rewritePage( $title, $page["text"] ) );
	$t = renamePage( $title );
	if( !is_object( $t ) ) {
		# Fall back to the plain transformation if no Title came back
		$t = transformTitle( $title );
	}
	$newtitle = wfStrencode( $t->title );
	$namespace = $t->namespace;

	# Current revision:
	# fetchPage() returns an array whose timestamp key is "ts"
	$text = wfStrencode( $page["text"] );
	$comment = wfStrencode( $page["summary"] );
	$minor = ($page["minor"] ? 1 : 0);
	list( $userid, $username ) = checkUserCache( $page["username"], $page["host"] );
	$timestamp = wfUnix2Timestamp( (int)$page["ts"] + $wgTimezoneCorrection );
	$redirect = ( preg_match( '/^#REDIRECT/', $page["text"] ) ? 1 : 0 );

	$sql = "\n";
	if( !$historyonly ) {
		$sql .= "INSERT
	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
	}

	# This row is emitted last, after any history rows
	$finalrow = "($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";

	# History
	# Each tuple has exactly the eight values the column list names
	$rows = array();
	if( !$lasthistoryonly ) {
		foreach( fetchKeptPages( $title ) as $rev ) {
			$text = wfStrencode( $rev["text"] );
			$comment = wfStrencode( $rev["summary"] );
			$minor = ($rev["minor"] ? 1 : 0);
			list( $userid, $username ) = checkUserCache( $rev["username"], $rev["host"] );
			$timestamp = wfUnix2Timestamp( (int)$rev["ts"] + $wgTimezoneCorrection );
			$rows[] = "($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";
		}
	}
	$rows[] = $finalrow;

	$sql .= "INSERT
	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
	VALUES\n\t\t" . implode( ",\n\t\t", $rows ) . ";\n";

	return $sql;
}
# Count up basic links
/**
 * Tally every [[link]] found in a page by passing its target, first letter
 * upper-cased, to countLinkTo(). Text inside <nowiki> is ignored.
 *
 * @param string $title old page title
 */
function countLinksFrom( $title )
{
	$page = fetchPage( $title );
	if( $page === false ) {
		return;
	}

	# fetchPage() returns an array, so the text is read with []
	$text = preg_replace(
		'/<nowiki>.*<\/nowiki>/sDU',
		'',
		$page["text"] );

	# Group 1 is the link target; an optional "|label" part is matched but
	# not used. Matches are collected and iterated instead of being
	# evaluated through the removed /e modifier.
	$found = preg_match_all(
		'/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/',
		$text,
		$matches );
	if( $found ) {
		foreach( $matches[1] as $target ) {
			countLinkTo( ucfirst( $target ) );
		}
	}
}
/**
 * Record one link to $title in $linkcache.
 *
 * $linkcache[$title] maps each normalised spelling of the link to the
 * number of times that spelling has been seen.
 *
 * @param string $title link target as written, first letter upper-cased
 */
function countLinkTo( $title )
{
	global $linkcache;

	$t = transformTitle( $title );
	$linkform = FreeToNormal( $t->title );

	# Create the counter on first sight so no unset entry is ever read
	if( !isset( $linkcache[$title][$linkform] ) ) {
		$linkcache[$title][$linkform] = 0;
	}
	$linkcache[$title][$linkform]++;
}
# Preferentially change case
/**
 * Choose the title a page should be imported under.
 *
 * Starts from transformTitle() and switches the title text to the
 * spelling that $linkcache shows is linked to most often.
 *
 * @param string $title old page title
 * @return Title the title to import under
 */
function renamePage( $title )
{
	global $linkcache;
	$t = transformTitle( $title );

	# We want to use the most frequently linked-to form as the title
	$maxcount = 0;
	$maxform = $t->title;
	# A page nobody links to has no cache entry
	$forms = isset( $linkcache[$title] ) ? $linkcache[$title] : array();
	foreach ( $forms as $linkform => $count ) {
		if ( $count > $maxcount ) {
			$maxcount = $count;
			$maxform = $linkform;
		}
	}

	if( $maxform != $t->title ) {
		# doRenamePage() indexes $linkcache with its first argument, so it
		# gets the title string rather than the Title object.
		# NOTE(review): the redirect SQL it returns was never used by the
		# original either; decide whether it should be emitted.
		doRenamePage( $title, $maxform );
		$t->title = $maxform;
	}

	# importPage() reads ->title and ->namespace from this value
	return $t;
}
/**
 * Build the SQL that turns every less-used spelling of a title into a
 * redirect to the preferred one.
 *
 * @param string|Title $title old page title (a Title object is accepted
 *        too, in which case its title text is used as the cache key)
 * @param string $maxform preferred spelling
 * @return string one INSERT statement, or "" when no redirect is needed
 */
function doRenamePage( $title, $maxform )
{
	global $linkcache, $redirectcomment, $conversionscript, $conversiontime;

	if( is_object( $title ) ) {
		$key = $title->title;
		$namespace = $title->namespace;
	} else {
		$key = $title;
		$t = transformTitle( $title );
		$namespace = $t->namespace;
	}
	if( empty( $linkcache[$key] ) ) {
		return "";
	}

	$comment = wfStrencode( str_replace( "$1", $maxform, $redirectcomment ) );
	# cur_text was missing from the original tuple, leaving eight values
	# for nine columns.
	# NOTE(review): the "#REDIRECT [[target]]" body is assumed from the
	# '/^#REDIRECT/' test in importPage() -- confirm the exact form wanted.
	$redirtext = wfStrencode( "#REDIRECT [[" . $maxform . "]]" );

	$redirsql = array();
	foreach( $linkcache[$key] as $linkform => $count ) {
		if( (string)$linkform !== (string)$maxform ) {
			$redirtitle = wfStrencode( $linkform );
			$redirsql[] = "($namespace,'$redirtitle','$redirtext','$comment',0,'$conversionscript','$conversiontime',1,1)";
		}
	}
	if( !$redirsql ) {
		# "VALUES ;" would not be valid SQL
		return "";
	}

	return "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES " . implode( ",\n\t", $redirsql ) . ";\n";
}
# Account for syntax changes
/**
 * Convert page text to the new syntax.
 *
 * Talk links are stripped first; then everything outside
 * <nowiki>...</nowiki> is passed through rewritePageBits().
 *
 * @param string $title old page title
 * @param string $text page text
 * @return string converted text
 */
function rewritePage( $title, $text )
{
	# ...
	$text = removeTalkLink( $text );

	# With the delimiter captured, even indexes hold ordinary text and odd
	# indexes hold complete <nowiki> sections, which stay untouched. The
	# original handed only the opening delimiter to rewritePageBits(), so
	# the page text itself would have been discarded.
	$chunks = preg_split( '/(<nowiki>.*?<\/nowiki>)/s', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
	foreach( $chunks as $i => $chunk ) {
		if( $i % 2 == 0 ) {
			$chunks[$i] = rewritePageBits( $title, $chunk );
		}
	}
	return implode( '', $chunks );
}
/**
 * Apply the individual syntax fixes to one stretch of page text, in order:
 * subpage links, media links, image links.
 *
 * @param string $title old page title
 * @param string $text text to convert
 * @return string converted text
 */
function rewritePageBits( $title, $text ) {
	$text = fixSubpages( $title, $text );
	$text = fixMedialinks( $text );
	return fixImagelinks( $text );
}
/**
 * Strip "/Talk" links from the text, whether bare or wrapped in [[ ]],
 * together with any newlines before them and whitespace after them.
 * The word matched is $talkending, compared case-insensitively.
 *
 * @param string $text page text (taken by reference but not modified)
 * @return string text with the links removed
 */
function removeTalkLink( &$text ) {
	global $talkending;
	# The pattern uses [ ... ] as its delimiters; the escaped brackets
	# inside are literal.
	$pattern = "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi";
	return preg_replace( $pattern, '', $text );
}
/**
 * Rewrite relative subpage links so that they name their parent page.
 *
 *   /Sub          becomes  [[Title/Sub|/Sub]]
 *   [[/sub]]      becomes  [[Title/Sub|/sub]]
 *   [[/sub|text]] becomes  [[Title/Sub|text]]
 *
 * The parameters are in the order the caller passes them (title first);
 * the original declared them the other way round and so rewrote the title
 * instead of the text.
 *
 * @param string $title title of the page the text belongs to
 * @param string $text text to convert
 * @return string converted text
 */
function fixSubpages( $title, $text ) {
	# Callbacks stand in for the removed /e modifier

	# Bare "/Subpage" at the start of the text or after whitespace
	$text = preg_replace_callback( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
		function ( $m ) use ( $title ) {
			return $m[1] . "[[" . $title . "/" . $m[2] . "|/" . $m[2] . "]]";
		}, $text );

	# [[/sub]] without a label: keep the original spelling as the label
	$text = preg_replace_callback( "<\[\[/([^|]*?)\]\]>",
		function ( $m ) use ( $title ) {
			return "[[" . $title . "/" . ucfirst( $m[1] . "|/" . $m[1] . "]]" );
		}, $text );

	# Whatever is left carries its own label
	$text = preg_replace_callback( "<\[\[/(.*?)\]\]>",
		function ( $m ) use ( $title ) {
			return "[[" . $title . "/" . ucfirst( $m[1] . "]]" );
		}, $text );

	return $text;
}
/**
 * Turn bare image URLs matched by $imageimport into image links, using the
 * prefix stored in $namespaces[6] and the name fetchMediaFile() returns.
 *
 * @param string $text text to convert (taken by reference but not modified)
 * @return string converted text
 */
function fixImagelinks( &$text ) {
	global $imageimport, $namespaces;

	$prefix = $namespaces[6];
	# Match group 1 is the URL, group 2 the file name. A callback stands in
	# for the removed /e modifier.
	return preg_replace_callback( "/$imageimport/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
}
/**
 * Turn bracketed image URLs into media links.
 *
 *   [url]       becomes  [[Media:File]]
 *   [url text]  becomes  [[Media:File|text]]
 *
 * "Media" stands for the value of $mediatext and File for the name
 * fetchMediaFile() returns.
 *
 * @param string $text text to convert (taken by reference but not modified)
 * @return string converted text
 */
function fixMedialinks( &$text ) {
	global $imageimport, $mediatext;

	$prefix = $mediatext;
	# Callbacks stand in for the removed /e modifier. Group 1 is the URL,
	# group 2 the file name and group 3 the link text.
	$converted = preg_replace_callback( "/\[$imageimport\]/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );

	return preg_replace_callback( "/\[$imageimport (.+?)\]/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "|" . $m[3] . "]]";
		},
		$converted );
}
/**
 * Return the name an imported media file should be linked under: the file
 * name with its first letter upper-cased.
 *
 * @param string $url source URL (not used yet)
 * @param string $filename file name taken from the URL
 * @return string
 */
function fetchMediaFile( $url, $filename )
{
	# Copy an image file into local upload space
	# FIXME
	return ucfirst( $filename );
}
# Simple move of talk pages, etc
/**
 * Map an old title onto a namespace and a title.
 *
 * "Name/Talk" (optionally with a space or underscore on either side of the
 * slash, any letter case) becomes namespace 1 with title "Name"; everything
 * else becomes namespace 0 with the title unchanged.
 *
 * @param string $title old title
 * @param bool $dorename not used
 * @return Title
 */
function transformTitle( $title, $dorename = false )
{
	global $talkending;

	# Quote the configured word so it is always matched literally
	$ending = preg_quote( $talkending, '/' );
	if( preg_match( "/^(.+)[ _]?\\/[ _]?($ending)/i", $title, $m ) ) {
		return Title::fromData( 1, $m[1] );
	}
	return Title::fromData( 0, $title );
}
# Translated out of old usemod wiki...
/**
 * Normalise a free-form link into a canonical page id.
 *
 * Spaces become underscores, the first letter is upper-cased, runs of
 * underscores collapse to one, and underscores at either end or next to a
 * slash are removed. With $FreeUpper, a lower-case letter following any of
 * - _ . , ( ) / is upper-cased as well.
 *
 * @param string $id link text
 * @param bool $FreeUpper also capitalise after punctuation
 * @return string normalised id
 */
function FreeToNormal ( $id , $FreeUpper = true ) {
	$id = ucfirst( str_replace( " ", "_", $id ) );

	if ( strpos( $id, '_' ) !== false ) { # Quick check for any space/underscores
		$id = preg_replace( '/__+/', "_", $id );
		$id = preg_replace( '/^_/', "", $id );
		$id = preg_replace( '/_$/', "", $id );
		#if ($UseSubpage) {
		$id = preg_replace( '|_/|', "/", $id );
		$id = preg_replace( '|/_|', "/", $id );
		#}
	}

	if ( $FreeUpper ) {
		# Note that letters after ' are *not* capitalized
		# A callback stands in for the removed /e modifier; when nothing
		# matches the string comes back unchanged.
		$id = preg_replace_callback( '|([-_.,\(\)/])([a-z])|',
			function ( $m ) {
				return $m[1] . strtoupper( $m[2] );
			},
			$id );
	}
	return $id;
}
# Whee!
/**
 * Character-set conversion hook for imported text. Currently returns the
 * text unchanged.
 *
 * @param string $text
 * @return string
 */
function recodeInput( $text )
{
	return $text;
}
/**
 * Format a Unix time as a 14-digit "YmdHis" timestamp in GMT.
 *
 * @param int $unixtime seconds since the Unix epoch
 * @return string
 */
function wfUnix2Timestamp( $unixtime ) {
	# Format the argument; the original read an undefined $timestamp
	return gmdate( "YmdHis", $unixtime );
}
/**
 * Convert a 14-digit "YmdHis" GMT timestamp into a Unix time.
 *
 * @param string $ts timestamp such as "20030209000000"
 * @return int seconds since the Unix epoch
 */
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	# gmmktime() takes its arguments as hour, minute, second, month, day, year
	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}