4 * Import data from a UseModWiki into a PediaWiki wiki
5 * 2003-02-09 Brion VIBBER <brion@pobox.com>
6 * Based loosely on Magnus's code from 2001-2002
8 * Updated limited version to get something working temporarily
10 * Be sure to run the link & index rebuilding scripts!
12 * Some more munging for charsets etc
17 * @subpackage Maintenance
20 /** Set these correctly! */
# Character set of the UseMod data files; recodeText() hands this to iconv() as the source encoding.
21 $wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
# Root of the UseMod database tree; the importer reads the "page/" and "keep/" directories beneath it.
22 $wgRootDirectory = "/home/usemod/wiki-ia/lib-http/db/wiki";
25 $wgFieldSeparator = "\xb3"; # Some wikis may use different char
# Short alias for the field separator; the fetch functions declare it as a global.
26 $FS = $wgFieldSeparator ;
# importPage() writes this value into the cur_touched column of every imported page.
31 $conversiontime = wfTimestampNow(); # Conversions will be marked with this timestamp
37 # ------------------------------------------------------------------------------
/**
 * Import every page of the UseMod database.
 *
 * Walks the per-letter page directories ("A" .. "Z" plus "other") under
 * $wgRootDirectory/page and hands each one to importPageDirectory().
 * Letter directories that do not exist are skipped silently, so a wiki
 * that has no pages under some letter does not trigger directory errors.
 */
function importPages()
{
	global $wgRootDirectory;

	$letters = array(
		'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
		'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
		'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
	foreach( $letters as $letter ) {
		$dir = "$wgRootDirectory/page/$letter";
		if( !is_dir( $dir ) ) {
			# Nothing stored under this letter.
			continue;
		}
		importPageDirectory( $dir );
	}
}
/**
 * Import every page stored in one UseMod page directory.
 *
 * Each "*.db" file is imported through importPage() and the returned SQL is
 * echoed. Sub-directories (other than "." and "..") are scanned recursively
 * with the directory name plus "/" as the title prefix. Any other file is
 * reported as skipped.
 *
 * @param string $dir    Directory to scan.
 * @param string $prefix Text prepended to each page name found in $dir.
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";
	$mydir = opendir( $dir );
	if( $mydir === false ) {
		# Reported as an SQL comment, like the other diagnostics in this script.
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the scan before the directory is exhausted.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
74 # ------------------------------------------------------------------------------
77 Grab a given item from the database
/**
 * Fetch the stored record of one UseMod user.
 *
 * Not implemented: the function always aborts the script.
 *
 * An earlier draft left statements after the die() call. They could never
 * run, and they read an undefined $title and called splitHash() without its
 * separator argument, so they have been removed rather than kept as
 * misleading dead code.
 *
 * @param mixed $uid UseMod user id number (unused until this is implemented).
 */
function fetchUser( $uid )
{
	die ("fetchUser not implemented" );
}
/**
 * Map a page title to its file path (without extension) relative to the
 * "page" or "keep" directory.
 *
 * Titles whose first character is an ASCII capital letter live in the
 * directory named after that letter; everything else lives in "other".
 *
 * @param string $title Page title.
 * @return string "<letter>/<title>" or "other/<title>".
 */
function useModFilename( $title ) {
	$initial = substr( $title, 0, 1 );
	$bucket = preg_match( '/[A-Z]/', $initial ) ? $initial : "other";
	return "$bucket/$title";
}
/**
 * Load the current revision of a page from its UseMod ".db" file.
 *
 * The file is split into nested key/value sets with splitHash(): the page
 * record ($FS1), its "text_default" section ($FS2) and that section's
 * "data" ($FS3). Aborts the script if the page file does not exist.
 *
 * @param string $title Page title.
 * @return mixed Result of array2object() for the fields text, summary,
 *               minor, ts, username and host.
 */
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if( file_exists( $fname ) === false ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	$raw = file_get_contents( $fname );
	$page = splitHash( $FS1, $raw );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	# Same field order as always: text-level fields first, then section-level.
	$fields = array();
	foreach( array( "text", "summary", "minor" ) as $key ) {
		$fields[$key] = $text[$key];
	}
	foreach( array( "ts", "username", "host" ) as $key ) {
		$fields[$key] = $section[$key];
	}
	return array2object( $fields );
}
/**
 * Load the kept (historical) revisions of a page from its ".kp" file.
 *
 * The file is cut on $FS1; the first piece is discarded and each remaining
 * piece is parsed with splitHash(). A revision is kept only when it has
 * text, a non-empty "minor" field and a positive timestamp; anything else
 * is reported and dropped.
 *
 * @param string $title Page title.
 * @return array List of array2object() results, empty when the page has no
 *               ".kp" file.
 */
function fetchKeptPages( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) {
		return array();
	}

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $chunk ) {
		$section = splitHash( $FS2, $chunk );
		$text = splitHash( $FS3, $section["data"] );

		$usable = $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 );
		if( !$usable ) {
			echo "-- skipped a bad old revision\n";
			continue;
		}

		$revisions[] = array2object( array(
			"text" => $text["text"],
			"summary" => $text["summary"],
			"minor" => $text["minor"],
			"ts" => $section["ts"],
			"username" => $section["username"],
			"host" => $section["host"] ) );
	}
	return $revisions;
}
/**
 * Turn a separator-delimited "key SEP value SEP key SEP value ..." string
 * into an associative array.
 *
 * Pieces are paired in order; a trailing piece with no partner is ignored.
 * A later occurrence of a key overwrites an earlier one.
 *
 * @param string $sep Separator between pieces.
 * @param string $str String to split.
 * @return array Map of key => value (empty when $str holds no complete pair).
 */
function splitHash ( $sep , $str ) {
	$temp = explode ( $sep , $str ) ;
	$ret = array () ;
	$n = count ( $temp ) ;
	# Step over one key/value pair per pass instead of bumping the counter
	# in both the loop header and the loop body.
	for ( $i = 0 ; $i + 1 < $n ; $i += 2 ) {
		$ret[$temp[$i]] = $temp[$i + 1] ;
	}
	return $ret ;
}
156 Take a fetched item and produce SQL
160 $uid is the UseMod user id number.
161 The new ones will be assigned arbitrarily and are for internal use only.
163 THIS IS DELAYED SINCE PUBLIC DUMPS DON'T INCLUDE USER DIR
/**
 * Produce the SQL that creates the account of one UseMod user.
 *
 * Not implemented: the function always aborts the script.
 *
 * The abandoned draft that used to follow the die() call could never run and
 * accessed the fetched record both as an object and as an array. Its intent,
 * kept here for whoever implements this, was to emit
 *   INSERT INTO user (user_id,user_name,user_password,user_options)
 * with user_options built from the UseMod fields editcols (cols),
 * editrows (rows), rcdays (rcdays), tzoffset minus
 * $wgTimestampCorrection / 3600 (timecorrection) and the inverse of
 * rcall (hideminor).
 *
 * @param mixed $uid UseMod user id number (unused until this is implemented).
 */
function importUser( $uid )
{
	die("importUser NYI");
}
/**
 * Resolve the author of a revision to a (user id, user name) pair.
 *
 * When $name is set, the id is looked up in the global $usercache, which is
 * indexed by user name; unknown names get id 0. When $name is empty the
 * edit is treated as anonymous: id 0 and $host as the name.
 *
 * The lookup uses the name as an array key. The previous in_array() test
 * searched the cache's values instead, so it did not match the key-based
 * read that followed it.
 *
 * @param string $name UseMod user name, possibly empty.
 * @param string $host Host recorded for the edit.
 * @return array array( user id, name already passed through wfStrencode() ).
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		if( is_array( $usercache ) && isset( $usercache[$name] ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
/**
 * Build the SQL that imports one page.
 *
 * Fetches the current revision with fetchPage() and emits a row for the
 * "cur" table, then fetches the kept revisions with fetchKeptPages() and
 * emits one row per revision for the "old" table. Title, text, comment and
 * user name are recoded with recodeText() and escaped with wfStrencode()
 * before being placed in the SQL.
 *
 * Callers echo the return value (see importPageDirectory()).
 *
 * @param string $title UseMod page title.
 */
210 function importPage( $title )
213 global $conversiontime;
215 echo "\n-- Importing page $title\n";
# Current revision.
216 $page = fetchPage( $title );
218 $newtitle = wfStrencode( recodeText( $title ) );
222 $text = wfStrencode( recodeText( $page->text
) );
223 $comment = wfStrencode( recodeText( $page->summary
) );
224 $minor = ($page->minor ?
1 : 0);
225 list( $userid, $username ) = checkUserCache( $page->username
, $page->host
);
# NOTE(review): checkUserCache() already passes the name through wfStrencode(); escaping it again here looks like double escaping - confirm.
226 $username = wfStrencode( recodeText( $username ) );
227 $timestamp = wfUnix2Timestamp( $page->ts
);
228 $redirect = ( preg_match( '/^#REDIRECT/', $page->text
) ?
1 : 0 );
229 $random = mt_rand() / mt_getrandmax();
230 $inverse = wfInvertTimestamp( $timestamp );
233 INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
234 ($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";
237 $revisions = fetchKeptPages( $title );
238 if(count( $revisions ) == 0 ) {
244 INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
# One value tuple per kept revision; tuples after the first are preceded by a comma.
245 foreach( $revisions as $rev ) {
246 $text = wfStrencode( recodeText( $rev->text
) );
247 $minor = ($rev->minor ?
1 : 0);
248 list( $userid, $username ) = checkUserCache( $rev->username
, $rev->host
);
249 $username = wfStrencode( recodeText( $username ) );
250 $timestamp = wfUnix2Timestamp( $rev->ts
);
251 $inverse = wfInvertTimestamp( $timestamp );
252 $comment = wfStrencode( recodeText( $rev->summary
) );
254 if($any) $sql .= ",";
255 $sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
/**
 * Convert a piece of imported text to UTF-8.
 *
 * Normalises CRLF line endings to LF, transcodes from $wgImportEncoding
 * with iconv(), then lets wfMungeToUtf8() expand numeric character
 * references.
 *
 * @param string $string Text in the import encoding.
 * @return mixed Whatever wfMungeToUtf8() returns for the converted text.
 */
function recodeText( $string ) {
	global $wgImportEncoding;
	# For currently latin-1 wikis
	$unixEol = str_replace( "\r\n", "\n", $string );
	$utf8 = iconv( $wgImportEncoding, "UTF-8", $unixEol );
	# Any old &#1234; stuff
	return wfMungeToUtf8( $utf8 );
}
/**
 * Encode one Unicode code point as a UTF-8 byte sequence.
 *
 * Covers the full range U+0000..U+10FFFF. Values that are not encodable
 * (negative numbers, UTF-16 surrogates U+D800..U+DFFF, anything above
 * U+10FFFF) are returned as a decimal numeric character reference instead
 * of producing invalid UTF-8.
 *
 * @param int $codepoint Unicode code point.
 * @return string UTF-8 bytes, or "&#N;" when the value cannot be encoded.
 */
function wfUtf8Sequence($codepoint) {
	# Surrogates are reserved for UTF-16 and have no UTF-8 encoding.
	if($codepoint < 0 || ($codepoint >= 0xd800 && $codepoint <= 0xdfff)) {
		return "&#$codepoint;";
	}
	if($codepoint < 0x80) return chr($codepoint);
	if($codepoint < 0x800) return chr($codepoint >> 6 & 0x3f | 0xc0) .
		chr($codepoint & 0x3f | 0x80);
	if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
		chr($codepoint >> 6 & 0x3f | 0x80) .
		chr($codepoint & 0x3f | 0x80);
	# The last valid code point is U+10FFFF, so the bound is 0x110000.
	if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
		chr($codepoint >> 12 & 0x3f | 0x80) .
		chr($codepoint >> 6 & 0x3f | 0x80) .
		chr($codepoint & 0x3f | 0x80);
	# Beyond the Unicode range: leave it as a character reference.
	return "&#$codepoint;";
}
/**
 * Replace numeric character references (&#1234; and &#x4d2;) with the
 * UTF-8 encoding of the character they name.
 *
 * Uses preg_replace_callback() rather than the /e (eval) pattern modifier,
 * which PHP 7 removed. Going through a callback also means a decimal
 * reference with leading zeros is read as decimal instead of being
 * evaluated as an octal literal.
 *
 * @param string $string Text that may contain numeric character references.
 * @return string Text with the references expanded.
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback ( '/&#([0-9]+);/', 'wfMungeDecimalEntity', $string );
	$string = preg_replace_callback ( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntity', $string );
	# Should also do named entities here
	return $string;
}

/** Callback for wfMungeToUtf8(): expand one decimal reference match. */
function wfMungeDecimalEntity( $m ) {
	$digits = ltrim( $m[1], '0' );
	# More than 7 significant digits is far above U+10FFFF: keep the text as written.
	if( strlen( $digits ) > 7 ) return $m[0];
	return wfUtf8Sequence( (int)$digits );
}

/** Callback for wfMungeToUtf8(): expand one hexadecimal reference match. */
function wfMungeHexEntity( $m ) {
	$digits = ltrim( $m[1], '0' );
	# More than 6 significant hex digits is above U+10FFFF: keep the text as written.
	if( strlen( $digits ) > 6 ) return $m[0];
	return wfUtf8Sequence( hexdec( $digits ) );
}
/**
 * Escape a string for use inside a quoted MySQL string literal.
 *
 * Uses mysql_escape_string() where the old mysql extension provides it.
 * On PHP versions without that function, the same characters are escaped
 * by hand: NUL, newline, carriage return, backslash, single quote, double
 * quote and Ctrl-Z.
 *
 * @param string $string Raw text.
 * @return string Escaped text (without surrounding quotes).
 */
function wfStrencode( $string ) {
	if( function_exists( 'mysql_escape_string' ) ) {
		return mysql_escape_string( $string );
	}
	# strtr() with an array makes a single pass, so the backslashes it
	# inserts are never escaped a second time.
	return strtr( (string)$string, array(
		"\\" => "\\\\",
		"\0" => "\\0",
		"\n" => "\\n",
		"\r" => "\\r",
		"'" => "\\'",
		"\"" => "\\\"",
		"\x1a" => "\\Z" ) );
}
/**
 * Format a Unix timestamp as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
 *
 * @param int $unixtime Seconds since the Unix epoch.
 * @return string Timestamp string.
 */
function wfUnix2Timestamp( $unixtime ) {
	$format = "YmdHis";
	return gmdate( $format, $unixtime );
}
/**
 * Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp string to a Unix
 * timestamp.
 *
 * @param string $ts Timestamp string.
 * @return int Seconds since the Unix epoch, as computed by gmmktime().
 */
function wfTimestamp2Unix( $ts )
{
	$year = (int)substr( $ts, 0, 4 );
	$month = (int)substr( $ts, 4, 2 );
	$day = (int)substr( $ts, 6, 2 );
	$hour = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
/**
 * Current time as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
 *
 * @return string Timestamp string.
 */
function wfTimestampNow() {
	$now = time();
	return gmdate( "YmdHis", $now );
}
315 # Sorting hack for MySQL 3, which doesn't use index sorts for DESC
# importPage() stores the result in the inverse_timestamp column next to the
# plain timestamp.
# NOTE(review): the body is not visible in this excerpt; presumably it maps
# each timestamp to a value that sorts in the opposite order - confirm.
316 function wfInvertTimestamp( $ts ) {
/**
 * Seed the Mersenne Twister generator used by mt_rand() once per run.
 *
 * The seed is taken from the last eight hex digits of the MD5 of
 * microtime(), masked to a non-negative 31-bit value. The global flag
 * $wgRandomSeeded records that seeding has happened, so repeated calls do
 * nothing. The flag used to be assigned as a function-local variable, which
 * made it invisible to everything else.
 */
function wfSeedRandom()
{
	global $wgRandomSeeded;

	if( $wgRandomSeeded ) {
		return;
	}
	$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}
331 function array2object( $arr ) {
333 foreach( $arr as $x => $y ) {