4 Import data from a UseModWiki into a PediaWiki wiki
5 2003-02-09 Brion VIBBER <brion@pobox.com>
6 Based loosely on Magnus's code from 2001-2002
8 Updated limited version to get something working temporarily
10 Be sure to run the link & index rebuilding scripts!
12 Some more munging for charsets etc
17 /* Set these correctly! */
18 $wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
19 $wgRootDirectory = "/home/usemod/wiki-fi/lib-http/db/wiki";
22 $wgFieldSeparator = "\xb3"; # Some wikis may use different char
23 $FS = $wgFieldSeparator ;
28 $conversiontime = wfTimestampNow(); # Conversions will be marked with this timestamp
34 # ------------------------------------------------------------------------------
# Walk the UseMod page store and import every per-letter directory.
#
# Pages are looked up under "$wgRootDirectory/page/<X>", where <X> is one
# of the upper-case letters A-Z or the catch-all directory "other".
# Directories that do not exist on disk are skipped silently.
function importPages()
{
	global $wgRootDirectory;

	$letters = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $letters as $letter ) {
		$dir = "$wgRootDirectory/page/$letter";
		# A wiki need not have pages under every initial; do not hand a
		# missing directory to opendir().
		if( is_dir( $dir ) ) {
			importPageDirectory( $dir );
		}
	}
}
# Import every page database found in one directory of the page store.
#
# Each "<name>.db" entry is imported as page "$prefix<name>" and the SQL
# returned by importPage() is echoed. Subdirectories are scanned
# recursively with "<subdirectory>/" as the title prefix. Anything else
# is reported in an SQL comment and skipped.
#
# $dir    - directory to scan
# $prefix - string prepended to the page names found in $dir
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";
	$mydir = opendir( $dir );
	if( $mydir === false ) {
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: a directory entry named "0" is
	# falsy and would otherwise end the scan early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
71 # ------------------------------------------------------------------------------
74 Grab a given item from the database
# Fetch a UseMod user record.
#
# Not implemented: calling this always terminates the script.
#
# $uid - UseMod user id number
function fetchUser( $uid )
{
	die ("fetchUser not implemented" );

	# Unfinished draft, kept as a comment because it could never run and
	# is not correct as written: it reads $title, which is not defined
	# here, and calls splitHash() without its separator argument.
	#
	#   global $FS,$FS2,$FS3, $wgRootDirectory;
	#   $fname = $wgRootDirectory . "/page/" . $title;
	#   if( !file_exists( $fname ) ) return false;
	#   $data = splitHash( implode( "", file( $fname ) ) );
}
# Map a page title to its path relative to the page or keep directory.
#
# Titles whose first byte is an upper-case ASCII letter are filed under
# that letter; every other title is filed under "other". The result has
# no file extension; callers append ".db" or ".kp".
#
# $title - page title as used by UseMod
function useModFilename( $title ) {
	$c = substr( $title, 0, 1 );
	if(preg_match( '/[A-Z]/', $c ) ) {
		return "$c/$title";
	}
	return "other/$title";
}
# Load the current revision of a page from its UseMod ".db" file.
#
# The file is decoded as three nested key/value hashes, each split with
# its own separator ($FS1, then $FS2, then $FS3). Terminates the script
# if the page file does not exist.
#
# $title - page title as used by UseMod
#
# Returns the result of array2object() for the fields text, summary,
# minor, ts, username and host.
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	# Outer hash -> "text_default" section -> "data" payload.
	$page = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	$fields = array(
		"text" => $text["text"],
		"summary" => $text["summary"],
		"minor" => $text["minor"],
		"ts" => $section["ts"],
		"username" => $section["username"],
		"host" => $section["host"],
	);
	return array2object( $fields );
}
# Load the old revisions of a page from its UseMod ".kp" keep file.
#
# The file is split on $FS1; the first chunk is discarded and every
# remaining chunk is decoded as one revision ($FS2 hash containing a
# $FS3 hash under "data"). A revision is kept only if it has non-empty
# text, a non-empty "minor" field and a positive timestamp; anything
# else is reported in an SQL comment and skipped.
#
# $title - page title as used by UseMod
#
# Returns an array (possibly empty) of array2object() results with the
# fields text, summary, minor, ts, username and host.
function fetchKeptPages( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = isset( $section["data"] ) ? splitHash( $FS3, $section["data"] ) : array();

		# Test for presence before use: a truncated chunk has none of
		# these keys, and arithmetic on a non-numeric timestamp string
		# is a fatal TypeError in PHP 8.
		$usable = !empty( $text["text"] )
			&& isset( $text["minor"] ) && $text["minor"] != ""
			&& isset( $section["ts"] ) && (int)$section["ts"] > 0;
		if ( !$usable ) {
			echo "-- skipped a bad old revision\n";
			continue;
		}

		$revisions[] = array2object( array (
			"text" => $text["text"],
			"summary" => isset( $text["summary"] ) ? $text["summary"] : null,
			"minor" => $text["minor"],
			"ts" => $section["ts"],
			"username" => isset( $section["username"] ) ? $section["username"] : null,
			"host" => isset( $section["host"] ) ? $section["host"] : null,
		) );
	}
	return $revisions;
}
# Decode a flat "key SEP value SEP key SEP value ..." string.
#
# $sep - separator string
# $str - encoded string
#
# Returns an associative array key => value. A trailing key with no
# value is ignored; input without any complete pair yields an empty
# array.
function splitHash ( $sep , $str ) {
	$temp = explode ( $sep , $str ) ;
	$ret = array () ;
	$count = count ( $temp ) ;
	# Consume the pieces two at a time: even index = key, odd = value.
	for ( $i = 0 ; $i + 1 < $count ; $i += 2 ) {
		$ret[$temp[$i]] = $temp[$i + 1] ;
	}
	return $ret ;
}
153 Take a fetched item and produce SQL
157 $uid is the UseMod user id number.
158 The new ones will be assigned arbitrarily and are for internal use only.
160 THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
162 function importUser( $uid )
164 global $last_uid, $user_list, $wgTimestampCorrection;
165 die("importUser NYI");
168 $stuff = fetchUser( $uid );
171 $name = wfStrencode( $stuff->username
);
172 $hash = md5hash( $stuff->password
); # Doable?
173 $tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
174 $hideminor = ($stuff['rcall'] ?
0 : 1);
175 $options = "cols={$stuff['editcols']}
176 rows={$stuff['editrows']}
177 rcdays={$stuff['rcdays']}
178 timecorrection={$tzoffset}
179 hideminor={$hideminor}
183 INTO user (user_id,user_name,user_password,user_options)
184 VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
# Work out the user id and escaped user text to store for an edit.
#
# $name - user name recorded for the edit; may be empty
# $host - host recorded for the edit; used when there is no name
#
# Returns array( user id, escaped user text ). The id is 0 unless the
# name is a key of $usercache.
#
# NOTE(review): the reviewed copy of this function did not show where
# $usercache comes from or the condition choosing between name and host;
# a global $usercache and an "is $name non-empty" test are assumed here.
# Confirm against the full file.
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# The cache is indexed by name, so test the key. in_array()
		# searches the stored values and would miss every cached name.
		if( isset( $usercache[$name] ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
207 function importPage( $title )
210 global $conversiontime;
212 echo "\n-- Importing page $title\n";
213 $page = fetchPage( $title );
215 $newtitle = wfStrencode( recodeText( $title ) );
219 $text = wfStrencode( recodeText( $page->text
) );
220 $comment = wfStrencode( recodeText( $page->summary
) );
221 $minor = ($page->minor ?
1 : 0);
222 list( $userid, $username ) = checkUserCache( $page->username
, $page->host
);
223 $timestamp = wfUnix2Timestamp( $page->ts
);
224 $redirect = ( preg_match( '/^#REDIRECT/', $page->text
) ?
1 : 0 );
225 $random = mt_rand() / mt_getrandmax();
226 $inverse = wfInvertTimestamp( $timestamp );
229 INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
230 ($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";
233 $revisions = fetchKeptPages( $title );
234 if(count( $revisions ) == 0 ) {
240 INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
241 foreach( $revisions as $rev ) {
242 $text = wfStrencode( recodeText( $rev->text
) );
243 $minor = ($rev->minor ?
1 : 0);
244 list( $userid, $username ) = checkUserCache( $rev->username
, $rev->host
);
245 $username = wfStrencode( recodeText( $username ) );
246 $timestamp = wfUnix2Timestamp( $rev->ts
);
247 $inverse = wfInvertTimestamp( $timestamp );
248 $comment = wfStrencode( recodeText( $rev->summary
) );
250 if($any) $sql .= ",";
251 $sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
# Normalise imported text: CRLF line ends become LF, the bytes are
# converted from $wgImportEncoding to UTF-8, and numeric character
# references are replaced via wfMungeToUtf8().
#
# $string - raw text as read from the UseMod files
#
# Returns the converted text.
function recodeText( $string ) {
	global $wgImportEncoding;
	# For currently latin-1 wikis
	$string = str_replace( "\r\n", "\n", $string );
	$converted = iconv( $wgImportEncoding, "UTF-8", $string );
	if( $converted === false ) {
		# iconv() returns false when it cannot convert the input. Using
		# that result would silently replace the whole text with an
		# empty string, so keep the unconverted bytes and say so.
		echo "-- Warning: could not convert text from $wgImportEncoding; keeping original bytes.\n";
	} else {
		$string = $converted;
	}
	$string = wfMungeToUtf8( $string ); # Any old &#NNNN; stuff
	return $string;
}
# Encode one Unicode code point as a UTF-8 byte sequence.
#
# $codepoint - integer code point
#
# Returns the 1-4 byte UTF-8 encoding for code points up to U+10FFFF.
# Larger values are returned as a decimal character reference
# ("&#NNN;") so that nothing is lost.
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) return chr($codepoint);
	if($codepoint < 0x800) return chr(0xc0 | ($codepoint >> 6 & 0x3f)) .
	                              chr(0x80 | ($codepoint & 0x3f));
	if($codepoint < 0x10000) return chr(0xe0 | ($codepoint >> 12 & 0x0f)) .
	                                chr(0x80 | ($codepoint >> 6 & 0x3f)) .
	                                chr(0x80 | ($codepoint & 0x3f));
	# Unicode ends at U+10FFFF, so the four-byte form must accept
	# everything below 0x110000 (a bound of 0x100000 drops plane 16).
	if($codepoint < 0x110000) return chr(0xf0 | ($codepoint >> 18 & 0x07)) .
	                                 chr(0x80 | ($codepoint >> 12 & 0x3f)) .
	                                 chr(0x80 | ($codepoint >> 6 & 0x3f)) .
	                                 chr(0x80 | ($codepoint & 0x3f));
	# Not a Unicode code point; leave it as a character reference.
	return "&#$codepoint;";
}
# preg_replace_callback() helper: decode one decimal reference "&#NNN;".
# $m is the match array; $m[1] holds the digits.
function wfMungeDecimalEntity( $m ) {
	# Strip leading zeros and cast, so "&#0065;" is read as decimal 65.
	$digits = ltrim( $m[1], '0' );
	# More than 7 digits is far outside Unicode; leave the text alone
	# instead of overflowing the integer conversion.
	if( strlen( $digits ) > 7 ) return $m[0];
	return wfUtf8Sequence( (int)$digits );
}

# preg_replace_callback() helper: decode one hex reference "&#xHHH;".
# $m is the match array; $m[1] holds the hex digits.
function wfMungeHexEntity( $m ) {
	$digits = ltrim( $m[1], '0' );
	# More than 6 hex digits is far outside Unicode; leave untouched.
	if( strlen( $digits ) > 6 ) return $m[0];
	return wfUtf8Sequence( hexdec( $digits ) );
}

# Replace decimal ("&#NNN;") and hexadecimal ("&#xHHH;") character
# references in $string with the output of wfUtf8Sequence().
#
# Uses callbacks rather than the /e (eval) pattern modifier: /e is gone
# from PHP 7, and evaluating the digits as PHP source treated a leading
# zero as an octal prefix.
#
# Returns the converted string.
function wfMungeToUtf8($string) {
	$string = preg_replace_callback ( '/&#([0-9]+);/', 'wfMungeDecimalEntity', $string );
	$string = preg_replace_callback ( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntity', $string );
	# Should also do named entities here
	return $string;
}
# Escape a string for use inside a quoted MySQL string literal.
#
# Uses mysql_escape_string() where the PHP build still provides it and
# otherwise applies the same backslash escapes by hand, so the script
# also runs on PHP 7+, where the mysql extension no longer exists.
#
# $string - raw text
#
# Returns the escaped text (without surrounding quotes).
function wfStrencode( $string ) {
	if( function_exists( 'mysql_escape_string' ) ) {
		return mysql_escape_string( $string );
	}
	# Same character set that mysql_escape_string() escapes.
	return strtr( (string)$string, array(
		"\\"   => "\\\\",
		"\0"   => "\\0",
		"\n"   => "\\n",
		"\r"   => "\\r",
		"'"    => "\\'",
		"\""   => "\\\"",
		"\x1a" => "\\Z",
	) );
}
# Format a Unix time as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
#
# $unixtime - seconds since the Unix epoch
function wfUnix2Timestamp( $unixtime ) {
	$format = "YmdHis";
	return gmdate( $format, $unixtime );
}
# Convert a 14-digit "YYYYMMDDHHMMSS" GMT string to a Unix time.
#
# $ts - timestamp string; fields are read by fixed position
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
# Current time as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
function wfTimestampNow() {
	$now = gmdate( "YmdHis" );
	return $now;
}
311 # Sorting hack for MySQL 3, which doesn't use index sorts for DESC
312 function wfInvertTimestamp( $ts ) {
320 function wfSeedRandom()
322 $seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
324 $wgRandomSeeded = true;
327 function array2object( $arr ) {
329 foreach( $arr as $x => $y ) {