<?php

/**
 * Import data from a UseModWiki into a MediaWiki wiki
 * 2003-02-09 Brion VIBBER <brion@pobox.com>
 * Based loosely on Magnus's code from 2001-2002
 *
 * Updated limited version to get something working temporarily
 * 2003-10-09
 * Be sure to run the link & index rebuilding scripts!
 *
 * Some more munging for charsets etc
 * 2003-11-28
 *
 * Partial fix for pages starting with lowercase letters (??)
 * and CamelCase and /Subpage link conversion
 * 2004-11-17
 *
 * @todo document
 * @package MediaWiki
 * @subpackage Maintenance
 */
/** Set these correctly! */
$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";

/* On a large wiki, you might run out of memory */
@ini_set( 'memory_limit', '40M' );
/* globals */
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
$FS = $wgFieldSeparator ;
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;
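
/*
 * Rough sketch of the record layout this script assumes, inferred from the
 * parsing code below rather than from UseModWiki documentation: a page/*.db
 * file is a flat list of key/value pairs separated by $FS1; the value of
 * "text_default" is itself a pair list separated by $FS2, and its "data"
 * value is a pair list separated by $FS3 holding the actual "text",
 * "summary" and "minor" fields.
 */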

$conversiontime = wfTimestampNow(); # Conversions will be marked with this timestamp
$usercache = array();

wfSeedRandom();
importPages();
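
/*
 * Typical usage (assumed, not prescribed by this script): run it from the
 * command line and pipe the generated SQL into the MediaWiki database, e.g.
 *
 *   php importUseModWiki.php > importedPages.sql
 *   mysql -u wikiuser -p wikidb < importedPages.sql
 *
 * The database name, user and file name above are placeholders. Remember to
 * run the link & index rebuilding scripts afterwards, as noted in the header.
 */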

# ------------------------------------------------------------------------------

function importPages()
{
	global $wgRootDirectory;

	$letters = array(
		'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
		'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
		'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
	foreach( $letters as $letter ) {
		$dir = "$wgRootDirectory/page/$letter";
		if( is_dir( $dir ) )
			importPageDirectory( $dir );
	}
}

function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";
	$mydir = opendir( $dir );
	while( $entry = readdir( $mydir ) ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} else {
			if( is_dir( "$dir/$entry" ) ) {
				if( $entry != '.' && $entry != '..' ) {
					importPageDirectory( "$dir/$entry", "$entry/" );
				}
			} else {
				echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
			}
		}
	}
}

# ------------------------------------------------------------------------------

/* fetch_ functions
	Grab a given item from the database
	*/

function fetchUser( $uid )
{
	die( "fetchUser not implemented" );

	# Unreachable leftover sketch; note that $title is not defined in this scope.
	global $FS,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . $title;
	if( !file_exists( $fname ) ) return false;

	$data = splitHash( implode( "", file( $fname ) ) );
	# enough?

	return $data;
}

function useModFilename( $title ) {
	$c = substr( $title, 0, 1 );
	if( preg_match( '/[A-Z]/i', $c ) ) {
		return strtoupper( $c ) . "/$title";
	}
	return "other/$title";
}
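
/*
 * Illustrative mapping (hedged; the exact split depends on the source wiki's
 * settings and locale): useModFilename( "HomePage" ) gives "H/HomePage",
 * while a title starting with a digit falls through to "other/...".
 * fetchPage() appends ".db" and fetchKeptPages() appends ".kp" to this path.
 */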

function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	$page = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	return array2object( array( "text" => $text["text"] , "summary" => $text["summary"] ,
		"minor" => $text["minor"] , "ts" => $section["ts"] ,
		"username" => $section["username"] , "host" => $section["host"] ) );
}

function fetchKeptPages( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, $section["data"] );
		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
			array_push( $revisions, array2object( array ( "text" => $text["text"] , "summary" => $text["summary"] ,
				"minor" => $text["minor"] , "ts" => $section["ts"] ,
				"username" => $section["username"] , "host" => $section["host"] ) ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}

function splitHash( $sep, $str ) {
	$temp = explode( $sep, $str );
	$ret = array();
	for( $i = 0; $i + 1 < count( $temp ); $i++ ) {
		$ret[$temp[$i]] = $temp[++$i];
	}
	return $ret;
}
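
/*
 * Never-called sketch showing what splitHash() yields; the sample record is
 * invented for illustration and does not come from a real UseModWiki database.
 */
function exampleSplitHashUsage() {
	global $FS3;
	# A $FS3-separated "data" section alternates keys and values:
	$raw = "text" . $FS3 . "Hello, world.\n" . $FS3 . "summary" . $FS3 . "first edit" . $FS3 . "minor" . $FS3 . "0";
	$fields = splitHash( $FS3, $raw );
	# $fields["text"] is "Hello, world.\n", $fields["summary"] is "first edit",
	# and $fields["minor"] is "0".
	return $fields;
}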

/* import_ functions
	Take a fetched item and produce SQL
	*/

/* importUser
	$uid is the UseMod user id number.
	The new ones will be assigned arbitrarily and are for internal use only.

	THIS IS DELAYED SINCE PUBLIC DUMPS DON'T INCLUDE THE USER DIR
	*/
function importUser( $uid )
{
	global $last_uid, $user_list, $wgTimestampCorrection;
	die( "importUser NYI" );
	return "";

	$stuff = fetchUser( $uid );
	$last_uid++;

	$name = wfStrencode( $stuff->username );
	$hash = md5hash( $stuff->password ); # Doable?
	$tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
	$hideminor = ($stuff['rcall'] ? 0 : 1);
	$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";

	$sql = "INSERT
	INTO user (user_id,user_name,user_password,user_options)
	VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}

function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		if( isset( $usercache[$name] ) ) { # Look the name up as a cache key
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
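
/*
 * Example (assumed values): with an empty $usercache, checkUserCache( "Lee", "10.0.0.1" )
 * returns array( 0, "Lee" ), and checkUserCache( "", "10.0.0.1" ) returns
 * array( 0, "10.0.0.1" ), that is, anonymous edits are credited to the host.
 */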

function importPage( $title )
{
	global $usercache;
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$newtitle = wfStrencode( recodeText( $title ) );
	$namespace = 0;

	$munged = mungeFormat( $page->text );
	if( $munged != $page->text ) {
		/**
		 * Save a *new* revision with the conversion, and put the
		 * previous last version into the history.
		 */
		$text = wfStrencode( recodeText( $munged ) );
		$comment = "link fix";
		$minor = 1;
		$userid = 0;
		$username = "Conversion script";
		$timestamp = wfUnix2Timestamp( time() );
		$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
		$random = mt_rand() / mt_getrandmax();
		$inverse = wfInvertTimestamp( $timestamp );

		$revisions = array( $page );
	} else {
		/**
		 * Current revision:
		 */
		$text = wfStrencode( recodeText( $page->text ) );
		$comment = wfStrencode( recodeText( $page->summary ) );
		$minor = ($page->minor ? 1 : 0);
		list( $userid, $username ) = checkUserCache( $page->username, $page->host );
		$username = wfStrencode( recodeText( $username ) );
		$timestamp = wfUnix2Timestamp( $page->ts );
		$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
		$random = mt_rand() / mt_getrandmax();
		$inverse = wfInvertTimestamp( $timestamp );

		$revisions = array();
	}
	$sql = "
INSERT
	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
	($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";

	# History
	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );
	if( count( $revisions ) == 0 ) {
		return $sql;
	}

	$any = false;
	$sql .= "INSERT
	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
	foreach( $revisions as $rev ) {
		$text = wfStrencode( recodeText( $rev->text ) );
		$minor = ($rev->minor ? 1 : 0);
		list( $userid, $username ) = checkUserCache( $rev->username, $rev->host );
		$username = wfStrencode( recodeText( $username ) );
		$timestamp = wfUnix2Timestamp( $rev->ts );
		$inverse = wfInvertTimestamp( $timestamp );
		$comment = wfStrencode( recodeText( $rev->summary ) );

		if( $any ) $sql .= ",";
		$sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
		$any = true;
	}
	$sql .= ";\n\n";
	return $sql;
}
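
/*
 * Shape of the generated SQL (values invented for illustration only):
 *
 *   INSERT
 *     INTO cur (cur_namespace,cur_title,...) VALUES
 *     (0,'HomePage','Welcome...','',0,'10.0.0.1','20031128123456',...);
 *   INSERT
 *     INTO old (old_namespace,old_title,...) VALUES
 *     (0,'HomePage','Older text...','',0,'10.0.0.1','20031127093000',...);
 *
 * This targets the pre-1.5 cur/old schema, so it only suits the MediaWiki
 * versions this script was written against.
 */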

# Whee!
function recodeText( $string ) {
	global $wgImportEncoding;
	# For currently latin-1 wikis
	$string = str_replace( "\r\n", "\n", $string );
	$string = @iconv( $wgImportEncoding, "UTF-8", $string );
	$string = wfMungeToUtf8( $string ); # Any old &#1234; stuff
	return $string;
}

function wfUtf8Sequence( $codepoint ) {
	if( $codepoint < 0x80 ) return chr( $codepoint );
	if( $codepoint < 0x800 ) return chr( $codepoint >> 6 & 0x3f | 0xc0 ) .
		chr( $codepoint & 0x3f | 0x80 );
	if( $codepoint < 0x10000 ) return chr( $codepoint >> 12 & 0x0f | 0xe0 ) .
		chr( $codepoint >> 6 & 0x3f | 0x80 ) .
		chr( $codepoint & 0x3f | 0x80 );
	if( $codepoint < 0x100000 ) return chr( $codepoint >> 18 & 0x07 | 0xf0 ) . # Double-check this
		chr( $codepoint >> 12 & 0x3f | 0x80 ) .
		chr( $codepoint >> 6 & 0x3f | 0x80 ) .
		chr( $codepoint & 0x3f | 0x80 );
	# Doesn't yet handle outside the BMP
	return "&#$codepoint;";
}
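
/*
 * Worked example: wfUtf8Sequence( 0xE9 ) ("é", U+00E9) hits the two-byte
 * branch and returns chr(0xC3) . chr(0xA9), i.e. the UTF-8 bytes "\xC3\xA9".
 * Code points at or above 0x100000 fall through to the numeric-entity
 * fallback on the last line.
 */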

function wfMungeToUtf8( $string ) {
	$string = preg_replace( '/&#([0-9]+);/e', 'wfUtf8Sequence($1)', $string );
	$string = preg_replace( '/&#x([0-9a-f]+);/ie', 'wfUtf8Sequence(0x$1)', $string );
	# Should also do named entities here
	return $string;
}

function wfStrencode( $string ) {
	return mysql_escape_string( $string );
}

function wfUnix2Timestamp( $unixtime ) {
	return gmdate( "YmdHis", $unixtime );
}
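
/*
 * Example: wfUnix2Timestamp( 0 ) returns "19700101000000", the 14-digit
 * YYYYMMDDHHMMSS format used by the cur/old timestamp columns;
 * wfTimestamp2Unix() below is its inverse.
 */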

function wfTimestamp2Unix( $ts )
{
	return gmmktime( (int)substr( $ts, 8, 2 ),
		(int)substr( $ts, 10, 2 ), (int)substr( $ts, 12, 2 ),
		(int)substr( $ts, 4, 2 ), (int)substr( $ts, 6, 2 ),
		(int)substr( $ts, 0, 4 ) );
}

function wfTimestampNow() {
	# return NOW
	return gmdate( "YmdHis" );
}

# Sorting hack for MySQL 3, which doesn't use index sorts for DESC
function wfInvertTimestamp( $ts ) {
	return strtr(
		$ts,
		"0123456789",
		"9876543210"
	);
}
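
/*
 * Example: wfInvertTimestamp( "20031128123456" ) returns "79968871876543";
 * each digit d becomes 9-d, so later timestamps sort earlier, which is what
 * the inverse_timestamp columns rely on.
 */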

function wfSeedRandom()
{
	$seed = hexdec( substr( md5( microtime() ), -8 ) ) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}

function array2object( $arr ) {
	$o = (object)0;
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
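
/*
 * Example (assumed input): array2object( array( "ts" => "20031128123456" ) )
 * yields an object $o with $o->ts === "20031128123456", mirroring how
 * fetchPage() and fetchKeptPages() package their results.
 */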

/**
 * Make CamelCase and /Talk links work
 */
function mungeFormat( $text ) {
	global $nowiki;
	$nowiki = array();
	$staged = preg_replace_callback(
		'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
		'nowikiPlaceholder', $text );

	# This is probably not 100% correct, I'm just
	# glancing at the UseModWiki code.
	$upper = "[A-Z]";
	$lower = "[a-z_0-9]";
	$any = "[A-Za-z_0-9]";
	$camel = "(?:$upper+$lower+$upper+$any*)";
	$subpage = "(?:\\/$any+)";
	$substart = "(?:\\/$upper$any*)";

	$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
		'[[$1]]', $staged );

	$final = preg_replace( '/' . preg_quote( placeholder() ) . '/es',
		'array_shift( $nowiki )', $munged );
	return $final;
}
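
/*
 * Illustrative result (my reading of the regexes above, not guaranteed to
 * match UseModWiki exactly): "See SandBox and /Archive for details" becomes
 * "See [[SandBox]] and [[/Archive]] for details", while text inside <nowiki>
 * tags, URLs and existing [[...]] links is protected by the placeholder pass.
 */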

function placeholder( $x = null ) {
	return '\xffplaceholder\xff';
}

function nowikiPlaceholder( $matches ) {
	global $nowiki;
	$nowiki[] = $matches[1];
	return placeholder();
}

?>