MediaWiki namespace
[mediawiki.git] / maintenance / importUseModWiki.php
blob37b4f10a66437e7a7ab2fc0697fd05bc6682e6b1
1 <?php
/*
	Import data from a UseModWiki into a PediaWiki wiki
	2003-02-09 Brion VIBBER <brion@pobox.com>
	Based loosely on Magnus's code from 2001-2002

	Updated limited version to get something working temporarily
	2003-10-09
	Be sure to run the link & index rebuilding scripts!

	Some more munging for charsets etc
	2003-11-28
*/
/* Site-specific settings: set these correctly before running! */
# Charset of the source wiki; we convert all text to UTF-8
$wgImportEncoding = "CP1252";
# Location of the UseModWiki data directory
$wgRootDirectory = "/home/usemod/wiki-fi/lib-http/db/wiki";

/* globals */
# Field separator byte; some wikis may use a different char
$wgFieldSeparator = "\xb3";
$FS = $wgFieldSeparator;
# Separators for the successive nesting levels of a page record
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";

# Conversions will be marked with this timestamp
$conversiontime = wfTimestampNow();
# User name => user id lookup consulted by checkUserCache()
$usercache = array();

/* Run the import */
wfSeedRandom();
importPages();
34 # ------------------------------------------------------------------------------
/**
 * Import every page bucket of the UseModWiki database.
 *
 * Looks under "$wgRootDirectory/page" for the sub-directories 'A' through
 * 'Z' plus 'other', and hands each one that exists to importPageDirectory().
 */
function importPages()
{
	global $wgRootDirectory;

	$buckets = range( 'A', 'Z' );
	$buckets[] = 'other';

	foreach( $buckets as $bucket ) {
		$path = "$wgRootDirectory/page/$bucket";
		if( !is_dir( $path ) ) {
			# Missing buckets are silently skipped
			continue;
		}
		importPageDirectory( $path );
	}
}
/**
 * Import every "*.db" page file in a directory, recursing into
 * sub-directories, and echo the SQL produced by importPage().
 *
 * @param string $dir    Directory to scan
 * @param string $prefix Text prepended to each page title found in $dir.
 *                       When recursing, the prefix passed down is the name
 *                       of the immediate sub-directory followed by "/".
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";
	$mydir = opendir( $dir );
	if( $mydir === false ) {
		# Keep going with the other directories rather than looping on a bad handle
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry literally named "0" is
	# falsy and would otherwise end the scan early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}

	closedir( $mydir );
}
71 # ------------------------------------------------------------------------------
/* fetch_ functions
   Grab a given item from the database
*/
/**
 * Fetch a UseModWiki user record. Not implemented: always aborts the script.
 *
 * An earlier draft body used to follow the die(). It was unreachable and
 * could not have worked as written (it read an undefined $title and called
 * splitHash() without a separator), so it has been dropped. Its outline was:
 * build a file name under "$wgRootDirectory/page/", return false if that
 * file does not exist, otherwise return the splitHash()-parsed contents.
 *
 * @param mixed $uid UseMod user id number (unused until this is implemented)
 */
function fetchUser( $uid )
{
	die ("fetchUser not implemented" );
}
/**
 * Map a page title to its path relative to the page/keep directories.
 *
 * Titles whose first byte is an uppercase ASCII letter live in a bucket
 * named after that letter; everything else goes into "other".
 *
 * @param string $title Page title
 * @return string "<bucket>/<title>", without a file extension
 */
function useModFilename( $title ) {
	$initial = substr( $title, 0, 1 );
	$bucket = preg_match( '/[A-Z]/', $initial ) ? $initial : "other";
	return "$bucket/$title";
}
/**
 * Load the current revision of a page from its ".db" file.
 *
 * The file is unpacked in three nested steps: the page record is split on
 * $FS1, its "text_default" section on $FS2, and that section's "data" on
 * $FS3. Aborts the script if the file does not exist.
 *
 * @param string $title Page title
 * @return object Object with the properties text, summary, minor, ts,
 *                username and host
 */
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = "$wgRootDirectory/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	$raw = file_get_contents( $fname );
	$page = splitHash( $FS1, $raw );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	$fields = array();
	$fields["text"] = $text["text"];
	$fields["summary"] = $text["summary"];
	$fields["minor"] = $text["minor"];
	$fields["ts"] = $section["ts"];
	$fields["username"] = $section["username"];
	$fields["host"] = $section["host"];

	return array2object( $fields );
}
/**
 * Load the kept (old) revisions of a page from its ".kp" file.
 *
 * The file is split on $FS1 and the first chunk is discarded; each
 * remaining chunk is unpacked with $FS2 and its "data" field with $FS3.
 * A revision is kept only if it has non-empty text, a non-empty minor flag
 * and a timestamp greater than zero; anything else, including records with
 * missing fields, is reported and skipped.
 *
 * @param string $title Page title
 * @return array List of objects with the properties text, summary, minor,
 *               ts, username and host; empty if there is no ".kp" file
 */
function fetchKeptPages( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = isset( $section["data"] ) ? splitHash( $FS3, $section["data"] ) : array();

		# Check that the fields exist before reading them, and cast the
		# timestamp instead of multiplying it, so a malformed record is
		# skipped rather than raising notices or a TypeError.
		$usable = isset( $text["text"], $text["minor"], $section["ts"] )
			&& $text["text"]
			&& $text["minor"] != ""
			&& (float)$section["ts"] > 0;

		if ( !$usable ) {
			echo "-- skipped a bad old revision\n";
			continue;
		}

		array_push( $revisions, array2object( array (
			"text" => $text["text"],
			"summary" => isset( $text["summary"] ) ? $text["summary"] : "",
			"minor" => $text["minor"],
			"ts" => $section["ts"],
			"username" => isset( $section["username"] ) ? $section["username"] : "",
			"host" => isset( $section["host"] ) ? $section["host"] : "" ) ) );
	}
	return $revisions;
}
/**
 * Turn a flat "key SEP value SEP key SEP value ..." string into an
 * associative array.
 *
 * @param string $sep Separator between the items
 * @param string $str Packed string
 * @return array key => value map; a trailing key with no value is ignored,
 *               and a repeated key keeps its last value
 */
function splitHash ( $sep , $str ) {
	$hash = array();
	foreach ( array_chunk( explode( $sep, $str ), 2 ) as $pair ) {
		if ( count( $pair ) < 2 ) {
			# Dangling key without a value; only the last chunk can be short
			break;
		}
		list( $key, $value ) = $pair;
		$hash[$key] = $value;
	}
	return $hash;
}
/* import_ functions
   Take a fetched item and produce SQL
*/
/* importUser
   $uid is the UseMod user id number.
   The new ones will be assigned arbitrarily and are for internal use only.

   THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR

   Not yet implemented: calling this aborts the script.

   The unreachable draft that used to follow the die() has been removed. It
   mixed object and array access on the fetched record and called an
   md5hash() helper that is not defined in the visible part of this file.
   Its outline, for whoever implements this: fetch the record with
   fetchUser(), increment $last_uid, and emit an INSERT into the user table
   with user_id = $last_uid, user_name from the record's username,
   user_password from a hash of its password, and user_options built from
   editcols, editrows, rcdays, tzoffset (adjusted by $wgTimestampCorrection
   / 3600) and hideminor (the inverse of rcall).
*/
function importUser( $uid )
{
	die("importUser NYI");
}
/**
 * Resolve the user id and SQL-escaped user text for an edit.
 *
 * @param string $name User name recorded for the edit; may be empty
 * @param string $host Host recorded for the edit, used when $name is empty
 * @return array array( user id, escaped user text ). The id is 0 when the
 *               name is not in $usercache or when only a host is available.
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# $usercache is read as name => id, so the name has to be tested as
		# a key; in_array() would search the stored ids instead. The id
		# stays 0 if we haven't imported user accounts.
		$userid = isset( $usercache[$name] ) ? $usercache[$name] : 0;
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
/**
 * Resolve the user id and the recoded, SQL-escaped user text of a revision.
 *
 * The id comes from checkUserCache(); the text is rebuilt here from the raw
 * value so that it is recoded first and escaped exactly once.
 *
 * @param object $rev Revision object with username and host properties
 * @return array array( user id, escaped user text )
 */
function importPageUserFields( $rev )
{
	list( $userid ) = checkUserCache( $rev->username, $rev->host );
	# Same choice checkUserCache() makes: the user name if set, else the host
	$display = $rev->username ? $rev->username : $rev->host;
	return array( $userid, wfStrencode( recodeText( $display ) ) );
}

/**
 * Build the SQL that imports one page: an INSERT into cur for the current
 * revision and, if the page has kept revisions, one multi-row INSERT into
 * old for its history.
 *
 * Also echoes a "-- Importing page" SQL comment as a progress marker.
 *
 * @param string $title Page title as used in the UseModWiki file names
 * @return string SQL statements
 */
function importPage( $title )
{
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$newtitle = wfStrencode( recodeText( $title ) );
	$namespace = 0;

	# Current revision:
	$text = wfStrencode( recodeText( $page->text ) );
	$comment = wfStrencode( recodeText( $page->summary ) );
	$minor = ($page->minor ? 1 : 0);
	list( $userid, $username ) = importPageUserFields( $page );
	$timestamp = wfUnix2Timestamp( $page->ts );
	$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
	$random = mt_rand() / mt_getrandmax();
	$inverse = wfInvertTimestamp( $timestamp );
	$sql = "
INSERT
	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
	($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";

	# History
	$revisions = fetchKeptPages( $title );
	if(count( $revisions ) == 0 ) {
		return $sql;
	}

	$rows = array();
	foreach( $revisions as $rev ) {
		$text = wfStrencode( recodeText( $rev->text ) );
		$minor = ($rev->minor ? 1 : 0);
		# Recode, then escape once; escaping an already-escaped name would
		# double the backslashes in the stored user text.
		list( $userid, $username ) = importPageUserFields( $rev );
		$timestamp = wfUnix2Timestamp( $rev->ts );
		$inverse = wfInvertTimestamp( $timestamp );
		$comment = wfStrencode( recodeText( $rev->summary ) );

		$rows[] = "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
	}

	$sql .= "INSERT
	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
	$sql .= implode( ",", $rows );
	$sql .= ";\n\n";
	return $sql;
}
258 # Whee!
/**
 * Convert text from the source wiki's encoding to UTF-8.
 *
 * Normalises CRLF line ends to LF, converts from $wgImportEncoding with
 * iconv(), then expands numeric character references via wfMungeToUtf8().
 *
 * @param string $string Text in $wgImportEncoding
 * @return string Converted text
 */
function recodeText( $string ) {
	global $wgImportEncoding;

	$string = str_replace( "\r\n", "\n", $string );

	# For currently latin-1 wikis
	$converted = iconv( $wgImportEncoding, "UTF-8", $string );
	if( $converted === false ) {
		# iconv() returns false for the whole string if any byte is invalid
		# in the source charset; passing that on would silently turn the
		# text into an empty string.
		echo "-- Warning: text is not valid $wgImportEncoding; dropping unconvertible bytes\n";
		$converted = iconv( $wgImportEncoding, "UTF-8//IGNORE", $string );
		if( $converted === false ) {
			# NOTE(review): last resort keeps the unconverted bytes rather
			# than losing the text; confirm this is preferable for your data.
			$converted = $string;
		}
	}

	return wfMungeToUtf8( $converted ); # Any old &#1234; stuff
}
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param int $codepoint Code point number
 * @return string One to four bytes for values below 0x110000; for anything
 *                larger, the decimal character reference "&#N;" unchanged.
 *                Surrogate values (0xD800-0xDFFF) are not treated specially.
 */
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) {
		return chr($codepoint);
	}
	if($codepoint < 0x800) {
		return chr((($codepoint >> 6) & 0x3f) | 0xc0) .
			chr(($codepoint & 0x3f) | 0x80);
	}
	if($codepoint < 0x10000) {
		return chr((($codepoint >> 12) & 0x0f) | 0xe0) .
			chr((($codepoint >> 6) & 0x3f) | 0x80) .
			chr(($codepoint & 0x3f) | 0x80);
	}
	# Unicode ends at U+10FFFF, so the four-byte form covers everything
	# below 0x110000 (the previous bound of 0x100000 left out the last plane).
	if($codepoint < 0x110000) {
		return chr((($codepoint >> 18) & 0x07) | 0xf0) .
			chr((($codepoint >> 12) & 0x3f) | 0x80) .
			chr((($codepoint >> 6) & 0x3f) | 0x80) .
			chr(($codepoint & 0x3f) | 0x80);
	}
	# Not a valid code point: leave it as a character reference
	return "&#$codepoint;";
}
/**
 * preg_replace_callback() handler for decimal references such as "&#1234;".
 *
 * @param array $matches Match array: [0] whole reference, [1] the digits
 * @return string UTF-8 bytes, or the reference unchanged if out of range
 */
function wfMungeDecimalEntityCallback( $matches ) {
	# Compare as float so absurdly long digit strings cannot overflow
	$value = (float)$matches[1];
	if( $value > 0x10FFFF ) {
		return $matches[0];
	}
	return wfUtf8Sequence( (int)$value );
}

/**
 * preg_replace_callback() handler for hex references such as "&#x4d2;".
 *
 * @param array $matches Match array: [0] whole reference, [1] the hex digits
 * @return string UTF-8 bytes, or the reference unchanged if out of range
 */
function wfMungeHexEntityCallback( $matches ) {
	$value = hexdec( $matches[1] );
	if( $value > 0x10FFFF ) {
		return $matches[0];
	}
	return wfUtf8Sequence( (int)$value );
}

/**
 * Replace numeric character references in a string with UTF-8 bytes.
 *
 * Decimal references are expanded first, then hexadecimal ones (matched
 * case-insensitively), in two separate passes. Uses callbacks because the
 * "/e" modifier is no longer available as of PHP 7.
 *
 * @param string $string Text possibly containing "&#N;" / "&#xN;"
 * @return string Text with those references expanded
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#([0-9]+);/', 'wfMungeDecimalEntityCallback', $string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntityCallback', $string );
	# Should also do named entities here
	return $string;
}
/**
 * Escape a string for use inside a quoted SQL literal.
 *
 * mysql_escape_string() no longer exists as of PHP 7.0, and this script
 * writes SQL text without a database connection, so the same set of
 * characters that function escaped is handled here directly.
 *
 * @param string $string Raw text
 * @return string Text with NUL, LF, CR, backslash, both quote characters
 *                and Ctrl-Z backslash-escaped
 */
function wfStrencode( $string ) {
	# The array form of strtr() replaces in a single pass, so the
	# backslashes it inserts are never escaped a second time.
	return strtr( $string, array(
		"\x00" => "\\0",
		"\n" => "\\n",
		"\r" => "\\r",
		"\\" => "\\\\",
		"'" => "\\'",
		"\"" => "\\\"",
		"\x1a" => "\\Z" ) );
}
/**
 * Format a Unix time as a 14-digit "YYYYMMDDHHMMSS" timestamp in GMT.
 *
 * @param int $unixtime Seconds since the Unix epoch
 * @return string Timestamp string
 */
function wfUnix2Timestamp( $unixtime ) {
	$stamp = gmdate( "YmdHis", $unixtime );
	return $stamp;
}
/**
 * Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp to a Unix time.
 *
 * @param string $ts Timestamp string
 * @return int Seconds since the Unix epoch
 */
function wfTimestamp2Unix( $ts )
{
	$year = (int)substr( $ts, 0, 4 );
	$month = (int)substr( $ts, 4, 2 );
	$day = (int)substr( $ts, 6, 2 );
	$hour = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
/**
 * Current time as a 14-digit "YYYYMMDDHHMMSS" timestamp in GMT.
 *
 * @return string Timestamp string
 */
function wfTimestampNow() {
	$now = gmdate( "YmdHis" );
	return $now;
}
/**
 * Replace every digit d of a timestamp with 9 - d; other bytes are kept.
 *
 * Sorting hack for MySQL 3, which doesn't use index sorts for DESC.
 *
 * @param string $ts Timestamp string
 * @return string Digit-inverted timestamp
 */
function wfInvertTimestamp( $ts ) {
	$digits = "0123456789";
	$inverted = "9876543210";
	return strtr( $ts, $digits, $inverted );
}
/**
 * Seed the Mersenne Twister generator from the current microtime and
 * record in the global $wgRandomSeeded that this has been done.
 */
function wfSeedRandom()
{
	# Without this declaration the flag below is a throwaway local
	global $wgRandomSeeded;

	# Last 8 hex digits of the hash, masked to a non-negative 31-bit value
	$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}
/**
 * Copy the entries of an array onto a new object as properties.
 *
 * @param array $arr key => value pairs
 * @return object stdClass with one property per array key and nothing else
 */
function array2object( $arr ) {
	# Start from an empty object; casting a scalar, as in (object)0, would
	# add a stray "scalar" property to every result.
	$o = new stdClass();
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}