* td background set to White to avoid header border shining through
[mediawiki.git] / maintenance / importUseModWiki.php
blobd7416fabd89c8de40f24de7429a12f724666eca9
1 <?php
/*
	Import data from a UseModWiki into a PediaWiki wiki.
	2003-02-09 Brion VIBBER <brion@pobox.com>
	Based loosely on Magnus's code from 2001-2002.

	2003-10-09
	Updated limited version to get something working temporarily.
	Be sure to run the link & index rebuilding scripts afterwards!

	2003-11-28
	Some more munging for charsets etc.
*/
# ---- Site-specific settings: set these correctly before running ----
# Root of the UseMod database; pages are read from its "page" and "keep" subdirectories.
$wgRootDirectory = "/home/usemod/wiki-ia/lib-http/db/wiki";
# Character set of the source wiki; recodeText() converts everything to UTF-8.
$wgImportEncoding = "CP1252";

# ---- Globals shared by the functions below ----
# Base field separator; some wikis may use a different character.
$wgFieldSeparator = "\xb3";
$FS = $wgFieldSeparator;
# Three separators (base separator followed by 1, 2 or 3), used by fetchPage()
# and fetchKeptPages() to split successive levels of a record.
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";

# Every converted page is marked with this one timestamp (written to cur_touched).
$conversiontime = wfTimestampNow();
# Name => user id map consulted by checkUserCache(); starts out empty.
$usercache = array();

# ---- Run the import ----
wfSeedRandom();
importPages();
34 # ------------------------------------------------------------------------------
/**
 * Import every page bucket of the UseMod page store.
 *
 * Looks under "$wgRootDirectory/page" for the subdirectories "A" to "Z" and
 * "other", and hands each one that exists to importPageDirectory().
 * Buckets that are not directories are skipped silently.
 */
function importPages()
{
	global $wgRootDirectory;

	$buckets = range( 'A', 'Z' );
	$buckets[] = 'other';

	foreach( $buckets as $bucket ) {
		$path = "$wgRootDirectory/page/$bucket";
		if( !is_dir( $path ) ) {
			continue;
		}
		importPageDirectory( $path );
	}
}
/**
 * Scan one UseMod page directory and echo the import SQL for every page in it.
 *
 * Entries named "<something>.db" are passed to importPage() and the result is
 * echoed. Sub-directories are scanned recursively, with the directory name
 * appended to the title prefix. Anything else is reported and skipped.
 *
 * @param string $dir    Directory to scan
 * @param string $prefix Title prefix accumulated from enclosing sub-directories
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";

	$mydir = opendir( $dir );
	if( $mydir === false ) {
		# Report and carry on instead of calling readdir() on a non-handle.
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare with false explicitly: an entry named "0" is falsy and would
	# otherwise end the scan before the remaining entries are seen.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}

		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			# Keep the outer prefix so titles stay correct below the first level.
			importPageDirectory( "$dir/$entry", "$prefix$entry/" );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}

	closedir( $mydir );
}
71 # ------------------------------------------------------------------------------
/* fetch_ functions
   Each one grabs a given item from the UseMod database files.
*/
/**
 * Placeholder for loading a UseMod user record. Not implemented.
 *
 * Always terminates the script with the message "fetchUser not implemented".
 *
 * An earlier draft followed the die() call but could never run, and would not
 * have worked if it had: it built a path under "$wgRootDirectory/page/" from an
 * undefined $title and called splitHash() with one argument instead of two.
 * It was removed rather than kept as dead code.
 *
 * @param mixed $uid UseMod user id (currently unused)
 */
function fetchUser( $uid )
{
	die ("fetchUser not implemented" );
}
/**
 * Map a page title to its path relative to the UseMod "page" or "keep" directory.
 *
 * Titles that start with an ASCII letter live in the directory named after that
 * letter in upper case; everything else lives in "other". The letter is
 * upper-cased so that a title with a lower-case initial does not end up in "other".
 *
 * @param string $title Page title, possibly containing "/" for subpages
 * @return string "<LETTER>/<title>" or "other/<title>" (no file extension)
 */
function useModFilename( $title ) {
	$c = substr( $title, 0, 1 );
	if( preg_match( '/^[A-Za-z]$/', $c ) ) {
		return strtoupper( $c ) . "/$title";
	}
	return "other/$title";
}
/**
 * Load the current revision of a page from its UseMod ".db" file.
 *
 * The file is split three times with splitHash(): by $FS1 into the page record,
 * by $FS2 on its "text_default" entry into the section, and by $FS3 on the
 * section's "data" entry into the text fields. The script dies if the file
 * does not exist.
 *
 * @param string $title Page title
 * @return object Object with properties text, summary, minor, ts, username, host
 */
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$path = "$wgRootDirectory/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $path ) ) {
		die( "Couldn't open file '$path' for page '$title'.\n" );
	}

	$page = splitHash( $FS1, file_get_contents( $path ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	$fields = array();
	$fields["text"] = $text["text"];
	$fields["summary"] = $text["summary"];
	$fields["minor"] = $text["minor"];
	$fields["ts"] = $section["ts"];
	$fields["username"] = $section["username"];
	$fields["host"] = $section["host"];

	return array2object( $fields );
}
/**
 * Load the kept (old) revisions of a page from its UseMod ".kp" file.
 *
 * The file is split on $FS1 and the first piece is discarded. Every remaining
 * piece is split by $FS2 into a section, whose "data" entry is split by $FS3.
 * A revision is kept only if its text is truthy, its "minor" field is not an
 * empty string and its timestamp is greater than zero; others are reported and
 * dropped.
 *
 * @param string $title Page title
 * @return array List of objects with properties text, summary, minor, ts,
 *               username, host; empty if there is no ".kp" file
 */
function fetchKeptPages( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$path = "$wgRootDirectory/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $path ) ) {
		return array();
	}

	$chunks = explode( $FS1, file_get_contents( $path ) );
	array_shift( $chunks ); # Drop the junk at beginning of file

	$history = array();
	foreach( $chunks as $chunk ) {
		$section = splitHash( $FS2, $chunk );
		$text = splitHash( $FS3, $section["data"] );

		$usable = $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 );
		if( !$usable ) {
			echo "-- skipped a bad old revision\n";
			continue;
		}

		$history[] = array2object( array(
			"text" => $text["text"],
			"summary" => $text["summary"],
			"minor" => $text["minor"],
			"ts" => $section["ts"],
			"username" => $section["username"],
			"host" => $section["host"] ) );
	}
	return $history;
}
/**
 * Turn a flat "key SEP value SEP key SEP value ..." string into an array.
 *
 * Fields are taken in pairs; a trailing key without a value is ignored, and a
 * repeated key keeps its last value.
 *
 * @param string $sep Separator between fields
 * @param string $str Serialized record
 * @return array Map of key => value
 */
function splitHash ( $sep , $str ) {
	$hash = array();
	foreach( array_chunk( explode( $sep, $str ), 2 ) as $pair ) {
		# The last chunk holds a lone key when the field count is odd.
		if( count( $pair ) == 2 ) {
			$hash[$pair[0]] = $pair[1];
		}
	}
	return $hash;
}
/* import_ functions
   Each one takes a fetched item and produces SQL.
*/
/* importUser
   $uid is the UseMod user id number.
   The new ones will be assigned arbitrarily and are for internal use only.

   THIS IS DELAYED SINCE PUBLIC DUMPS DON'T INCLUDE THE USER DIR.

   Not implemented: the function always terminates the script with
   "importUser NYI".

   An unreachable draft used to follow the die() call. It could not have run as
   written: it called fetchUser() (which itself dies) and md5hash() (not defined
   in this file), and read the fetched record both as an object and as an array.
   Its intent, for whoever implements this, was one row per user:
     INSERT INTO user (user_id,user_name,user_password,user_options)
   with user_id taken from an incrementing $last_uid, and user_options holding
   one "key=value" line each for cols, rows, rcdays, timecorrection and
   hideminor. Those were derived from the UseMod fields editcols, editrows,
   rcdays, tzoffset (adjusted by $wgTimestampCorrection / 3600) and rcall
   (hideminor = 0 when rcall is set, else 1).
*/
function importUser( $uid )
{
	die("importUser NYI");
}
/**
 * Resolve the author of a revision to a (user id, user text) pair.
 *
 * With a user name, the id comes from the global $usercache (name => id) and
 * falls back to 0 when the name is not cached, e.g. because user accounts were
 * not imported. Without a user name the edit is anonymous: the id is 0 and the
 * host stands in for the name.
 *
 * The cache is looked up by key. The previous in_array() call searched the
 * cached ids instead of the names, so it never matched a real entry.
 *
 * The returned text is NOT SQL-escaped. Both callers in this file pass it
 * through recodeText() and wfStrencode() themselves; escaping here as well
 * doubled every backslash and quote in the stored name.
 *
 * @param string $name User name recorded on the revision, may be empty
 * @param string $host Host recorded on the revision
 * @return array array( user id, unescaped user text )
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( !$name ) {
		return array( 0, $host );
	}

	if( is_array( $usercache ) && array_key_exists( $name, $usercache ) ) {
		$userid = $usercache[$name];
	} else {
		# If we haven't imported user accounts
		$userid = 0;
	}
	return array( $userid, $name );
}
/**
 * Produce the SQL that imports one UseMod page and its kept history.
 *
 * The current revision from fetchPage() becomes one INSERT into "cur". If
 * fetchKeptPages() returns any revisions, a single multi-row INSERT into "old"
 * follows. All text values go through recodeText() and then wfStrencode().
 * Everything is imported into namespace 0.
 *
 * @param string $title Page title as found in the UseMod page directory
 * @return string SQL statements for this page
 */
function importPage( $title )
{
	global $usercache;
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$namespace = 0;
	$newtitle = wfStrencode( recodeText( $title ) );

	# Current revision:
	$text = wfStrencode( recodeText( $page->text ) );
	$comment = wfStrencode( recodeText( $page->summary ) );
	if( $page->minor ) {
		$minor = 1;
	} else {
		$minor = 0;
	}
	list( $userid, $author ) = checkUserCache( $page->username, $page->host );
	$username = wfStrencode( recodeText( $author ) );
	$timestamp = wfUnix2Timestamp( $page->ts );
	if( preg_match( '/^#REDIRECT/', $page->text ) ) {
		$redirect = 1;
	} else {
		$redirect = 0;
	}
	$random = mt_rand() / mt_getrandmax();
	$inverse = wfInvertTimestamp( $timestamp );

	$sql = "
INSERT
INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";

	# History
	$revisions = fetchKeptPages( $title );
	if( count( $revisions ) == 0 ) {
		return $sql;
	}

	# One value tuple per kept revision, joined with commas into a single INSERT.
	$tuples = array();
	foreach( $revisions as $rev ) {
		$text = wfStrencode( recodeText( $rev->text ) );
		$minor = ( $rev->minor ? 1 : 0 );
		list( $userid, $author ) = checkUserCache( $rev->username, $rev->host );
		$username = wfStrencode( recodeText( $author ) );
		$timestamp = wfUnix2Timestamp( $rev->ts );
		$inverse = wfInvertTimestamp( $timestamp );
		$comment = wfStrencode( recodeText( $rev->summary ) );

		$tuples[] = "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
	}

	$sql .= "INSERT
INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
	$sql .= implode( ",", $tuples );
	$sql .= ";\n\n";
	return $sql;
}
/**
 * Convert a piece of imported text to the form stored in the new wiki.
 *
 * Three steps, in order: CRLF line ends become LF, the text is converted with
 * iconv() from $wgImportEncoding to UTF-8, and numeric character references
 * (the "&#1234;" kind) are expanded by wfMungeToUtf8().
 *
 * @param string $string Text in the source wiki's encoding
 * @return string Converted text
 */
function recodeText( $string ) {
	global $wgImportEncoding;

	$unixLines = str_replace( "\r\n", "\n", $string );
	$utf8 = iconv( $wgImportEncoding, "UTF-8", $unixLines );
	return wfMungeToUtf8( $utf8 );
}
/**
 * Encode one Unicode code point as a UTF-8 byte sequence.
 *
 * Handles the whole Unicode range up to U+10FFFF. The earlier four-byte bound
 * of 0x100000 left out U+100000..U+10FFFF. Values that have no valid UTF-8
 * form are returned as a decimal character reference "&#N;" instead: anything
 * above U+10FFFF, and the surrogate range U+D800..U+DFFF.
 *
 * @param int $codepoint Unicode code point
 * @return string UTF-8 bytes, or "&#N;" when the value cannot be encoded
 */
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) {
		return chr($codepoint);
	}
	if($codepoint < 0x800) {
		return chr(0xc0 | ($codepoint >> 6)) .
			chr(0x80 | ($codepoint & 0x3f));
	}
	if($codepoint >= 0xd800 && $codepoint <= 0xdfff) {
		# Surrogates are not characters; encoding them yields invalid UTF-8.
		return "&#$codepoint;";
	}
	if($codepoint < 0x10000) {
		return chr(0xe0 | ($codepoint >> 12)) .
			chr(0x80 | (($codepoint >> 6) & 0x3f)) .
			chr(0x80 | ($codepoint & 0x3f));
	}
	if($codepoint < 0x110000) {
		return chr(0xf0 | ($codepoint >> 18)) .
			chr(0x80 | (($codepoint >> 12) & 0x3f)) .
			chr(0x80 | (($codepoint >> 6) & 0x3f)) .
			chr(0x80 | ($codepoint & 0x3f));
	}
	# Beyond the Unicode range
	return "&#$codepoint;";
}
/**
 * Replace numeric character references in a string with UTF-8 characters.
 *
 * Both decimal ("&#1234;") and hexadecimal ("&#x4d2;", case-insensitive) forms
 * are expanded through wfUtf8Sequence(). References above U+10FFFF are left
 * exactly as written. Named entities are not handled.
 *
 * Uses preg_replace_callback() rather than the "/e" modifier, which PHP 7
 * removed. The digits are parsed with an explicit base, so a decimal
 * reference with leading zeros is no longer read as octal, as it was when the
 * replacement was eval'd as PHP code.
 *
 * @param string $string Text possibly containing numeric references
 * @return string Text with the references expanded
 */
function wfMungeToUtf8($string) {
	$expand = function ( $match, $base ) {
		$digits = ltrim( $match[1], '0' );
		# 0x10FFFF has 7 decimal and 6 hex digits; anything longer is out of
		# range and might overflow intval(), so don't even parse it.
		if( strlen( $digits ) > 7 ) {
			return $match[0];
		}
		$codepoint = intval( $digits, $base );
		if( $codepoint > 0x10ffff ) {
			return $match[0];
		}
		return wfUtf8Sequence( $codepoint );
	};

	$string = preg_replace_callback( '/&#([0-9]+);/',
		function ( $match ) use ( $expand ) { return $expand( $match, 10 ); },
		$string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i',
		function ( $match ) use ( $expand ) { return $expand( $match, 16 ); },
		$string );
	# Should also do named entities here
	return $string;
}
/**
 * Escape a string for use inside a quoted SQL string literal.
 *
 * Uses mysql_escape_string() where it exists. That function was removed in
 * PHP 7, so otherwise the same characters are escaped by hand: NUL, newline,
 * carriage return, backslash, single quote, double quote and Ctrl-Z.
 *
 * The fallback works byte by byte. That is safe for the UTF-8 text this script
 * produces, because none of the escaped bytes can occur inside a multi-byte
 * UTF-8 sequence.
 *
 * @param string $string Raw value
 * @return string Escaped value (without surrounding quotes)
 */
function wfStrencode( $string ) {
	if( function_exists( 'mysql_escape_string' ) ) {
		return mysql_escape_string( $string );
	}
	return strtr( (string)$string, array(
		"\x00" => "\\0",
		"\n" => "\\n",
		"\r" => "\\r",
		"\\" => "\\\\",
		"'" => "\\'",
		"\"" => "\\\"",
		"\x1a" => "\\Z" ) );
}
/**
 * Format a Unix time as a 14-digit "YYYYMMDDHHMMSS" timestamp in GMT.
 *
 * @param int $unixtime Seconds since the Unix epoch
 * @return string Timestamp string
 */
function wfUnix2Timestamp( $unixtime ) {
	$format = "YmdHis";
	$stamp = gmdate( $format, $unixtime );
	return $stamp;
}
/**
 * Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp to a Unix time.
 *
 * The fields are cut out by fixed position and handed to gmmktime().
 *
 * @param string $ts Timestamp string
 * @return int Seconds since the Unix epoch
 */
function wfTimestamp2Unix( $ts )
{
	$year = (int)substr( $ts, 0, 4 );
	$month = (int)substr( $ts, 4, 2 );
	$day = (int)substr( $ts, 6, 2 );
	$hour = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
/**
 * Current time as a 14-digit "YYYYMMDDHHMMSS" timestamp in GMT.
 *
 * @return string Timestamp string
 */
function wfTimestampNow() {
	$now = time();
	return gmdate( "YmdHis", $now );
}
/**
 * Replace every digit d of a timestamp by 9 - d.
 *
 * Sorting hack for MySQL 3, which doesn't use index sorts for DESC: ascending
 * order on the inverted value is descending order on the real timestamp.
 * Characters other than digits pass through unchanged.
 *
 * @param string $ts Timestamp string
 * @return string Timestamp with every digit inverted
 */
function wfInvertTimestamp( $ts ) {
	$digits = "0123456789";
	$mirrored = "9876543210";
	return strtr( $ts, $digits, $mirrored );
}
/**
 * Seed the Mersenne Twister generator once per run.
 *
 * The seed is the low 31 bits of the last eight hex digits of
 * md5(microtime()). The global $wgRandomSeeded records that seeding has
 * happened, and later calls do nothing. The flag used to be assigned as a local
 * variable because the "global" declaration was missing, so it was never
 * actually set.
 */
function wfSeedRandom()
{
	global $wgRandomSeeded;

	if( !empty( $wgRandomSeeded ) ) {
		return;
	}

	$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}
/**
 * Build a plain object whose properties are the entries of an array.
 *
 * Starts from an empty stdClass. The earlier "(object)0" cast created an
 * object that already carried a property named "scalar" with value 0, which
 * then leaked into every result.
 *
 * @param array $arr Map of property name => value
 * @return object stdClass with one property per array entry
 */
function array2object( $arr ) {
	$o = new stdClass();
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}