<?php
/**
 * Import data from a UseModWiki into a MediaWiki wiki
 * 2003-02-09 Brion VIBBER <brion@pobox.com>
 * Based loosely on Magnus's code from 2001-2002
 *
 * Updated limited version to get something working temporarily
 * Be sure to run the link & index rebuilding scripts!
 *
 * Some more munging for charsets etc
 *
 * Partial fix for pages starting with lowercase letters (??)
 * and CamelCase and /Subpage link conversion
 *
 * Rewrite output to create Special:Export format for import
 * instead of raw SQL. Should be 'future-proof' against future
 * schema changes.
 *
 * @ingroup Maintenance
 */
# The script is configured by editing the variables below and prints its
# result to standard output, so refuse to run under anything but the CLI.
if( php_sapi_name() != 'cli' ) {
	echo "Please customize the settings and run me from the command line.";
	die( -1 );
}

/** Set these correctly! */
$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";

/* On a large wiki, you might run out of memory */
# NOTE(review): if the configured limit is already above 40M this call
# lowers it -- confirm the value suits the wiki being imported.
@ini_set( 'memory_limit', '40M' );

/* globals */
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
$FS = $wgFieldSeparator ;
# Level-specific separators read (as globals) by fetchPage() and
# fetchKeptPages(): the base separator followed by the nesting level.
$FS1 = $FS . "1" ;
$FS2 = $FS . "2" ;
$FS3 = $FS . "3" ;

# Unicode sanitization tools
require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' );

# User name => user id map consulted by checkUserCache(); empty because no
# user accounts are imported by this script.
$usercache = array();

# Entry point: emit the whole export document.
importPages();
# ------------------------------------------------------------------------------
/**
 * Print the complete Special:Export document to standard output: the XML
 * prologue and opening <mediawiki> element, then every page directory found
 * under "$wgRootDirectory/page", then the closing tag.
 *
 * Reads the global $wgRootDirectory. Returns nothing.
 */
function importPages()
{
	global $wgRootDirectory;

	# The closing bracket of the XML declaration is interpolated so that the
	# literal PHP close-tag sequence never appears in this file.
	$gt = '>';
	echo <<<XML
<?xml version="1.0" encoding="UTF-8" ?$gt
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
                               http://www.mediawiki.org/xml/export-0.1.xsd"
           version="0.1"
           xml:lang="en">
<!-- generated by importUseModWiki.php -->

XML;

	# One directory per initial letter, plus 'other' for everything else
	# (the same bucketing useModFilename() applies).
	$letters = array(
		'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
		'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
		'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
	foreach( $letters as $letter ) {
		$dir = "$wgRootDirectory/page/$letter";
		# A wiki need not have pages under every letter.
		if( is_dir( $dir ) ) {
			importPageDirectory( $dir );
		}
	}

	echo <<<XML
</mediawiki>

XML;
}
/**
 * Import every "*.db" page file in a directory and descend into its
 * subdirectories, echoing the generated XML (and XML comments describing
 * progress) to standard output.
 *
 * @param string $dir    Directory to scan.
 * @param string $prefix Text prepended to each page name found in $dir;
 *                       subdirectories are scanned with "<entry>/" as prefix.
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";
	$mydir = opendir( $dir );
	if( $mydir === false ) {
		# Nothing can be read from an unopenable directory; note it and go on.
		echo "<!-- Could not open directory " . xmlCommentSafe( $dir ) . ". Skipping. -->\n";
		return;
	}

	# Compare against false explicitly: an entry literally named "0" is
	# falsy and would otherwise end the loop early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		$m = array();
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
		}
	}

	# Release the handle; the recursion would otherwise keep one open per level.
	closedir( $mydir );
}
# ------------------------------------------------------------------------------

# Grab a given item from the database
/**
 * Map a page title to its path relative to the page/keep directories:
 * "<UPPERCASED FIRST LETTER>/<title>" when the title starts with an ASCII
 * letter, "other/<title>" otherwise.
 *
 * @param string $title Page title.
 * @return string Relative path without file extension.
 */
function useModFilename( $title ) {
	$c = substr( $title, 0, 1 );
	if( preg_match( '/[A-Z]/i', $c ) ) {
		return strtoupper( $c ) . "/$title";
	}
	return "other/$title";
}
/**
 * Load the current revision of a page from its ".db" file.
 *
 * The file is unpacked in three nested levels with splitHash(): the page
 * record ($FS1), its "text_default" section ($FS2) and that section's
 * "data" record ($FS3).
 *
 * Terminates the script if the page file does not exist.
 *
 * @param string $title Page title.
 * @return object Object with properties text, summary, minor, ts, username
 *                and host.
 */
function fetchPage( $title )
{
	global $FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		echo "Couldn't open file '$fname' for page '$title'.\n";
		# There is nothing to read; stop instead of carrying on with an
		# empty page.
		die( -1 );
	}

	$page = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	return array2object( array( "text" => $text["text"] , "summary" => $text["summary"] ,
		"minor" => $text["minor"] , "ts" => $section["ts"] ,
		"username" => $section["username"] , "host" => $section["host"] ) );
}
/**
 * Load the older ("kept") revisions of a page from its ".kp" file.
 *
 * A revision is accepted only when it has non-empty text, a non-empty minor
 * flag and a positive timestamp; anything else is reported in an XML comment
 * and skipped.
 *
 * @param string $title Page title.
 * @return array List of revision objects (text, summary, minor, ts,
 *               username, host); empty when the page has no ".kp" file.
 */
function fetchKeptPages( $title )
{
	global $FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		# A damaged record may lack whole fields; test for their presence
		# before reading them so a bad revision is skipped, not an error.
		$text = isset( $section["data"] ) ? splitHash( $FS3, $section["data"] ) : array();

		$usable = isset( $text["text"], $text["minor"], $section["ts"] )
			&& $text["text"]
			&& $text["minor"] != ""
			&& is_numeric( $section["ts"] )
			&& ( $section["ts"]*1 > 0 );

		if ( $usable ) {
			array_push( $revisions, array2object( array (
				"text" => $text["text"] ,
				"summary" => isset( $text["summary"] ) ? $text["summary"] : "" ,
				"minor" => $text["minor"] ,
				"ts" => $section["ts"] ,
				"username" => isset( $section["username"] ) ? $section["username"] : "" ,
				"host" => isset( $section["host"] ) ? $section["host"] : "" ) ) );
		} else {
			echo "<!-- skipped a bad old revision -->\n";
		}
	}
	return $revisions;
}
/**
 * Split a flat "key SEP value SEP key SEP value ..." string into an
 * associative array.
 *
 * Fields are consumed in pairs; a trailing key with no value is ignored.
 *
 * @param string $sep Separator between fields.
 * @param string $str Serialized record.
 * @return array Map of key => value (empty if fewer than two fields).
 */
function splitHash ( $sep , $str ) {
	$fields = explode( $sep, $str );
	$ret = array();
	# Stop before an unpaired final field.
	$last = count( $fields ) - 1;
	for ( $i = 0; $i < $last; $i += 2 ) {
		$ret[$fields[$i]] = $fields[$i + 1];
	}
	return $ret;
}
# Take a fetched item and produce Special:Export XML
/**
 * Resolve the contributor of a revision to a (user id, display name) pair.
 *
 * @param string $name User name recorded with the revision (may be empty).
 * @param string $host Host recorded with the revision; used as the display
 *                     name when $name is empty.
 * @return array array( $userid, $username ). The id is taken from the global
 *               $usercache when $name is one of its keys, otherwise 0.
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# $usercache is indexed by name, so test for the key; in_array()
		# would search the stored ids instead.
		if( isset( $usercache[$name] ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = str_replace( '_', ' ', $name );
	} else {
		$userid = 0;
		$username = $host;
	}
	return array( $userid, $username );
}
/**
 * Build the <page> element for one page, including all of its revisions.
 *
 * The current revision comes from fetchPage(). If mungeFormat() changes its
 * text, an extra revision by 'Conversion script' holding the converted text
 * is added. The kept revisions from fetchKeptPages() follow.
 *
 * Also echoes an XML comment announcing the page.
 *
 * @param string $title Page title as stored by UseModWiki.
 * @return string XML fragment for the page.
 */
function importPage( $title )
{
	echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
	$page = fetchPage( $title );

	$newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );

	$munged = mungeFormat( $page->text );
	# Strict comparison: loose != treats numeric-looking strings as numbers.
	if( $munged !== $page->text ) {
		/**
		 * Save a *new* revision with the conversion, and put the
		 * previous last version into the history.
		 */
		$next = array2object( array(
			'text'     => $munged,
			'minor'    => 1,
			'username' => 'Conversion script',
			'host'     => '127.0.0.1',
			'ts'       => time(),
			'summary'  => 'link fix',
			) );
		$revisions = array( $page, $next );
	} else {
		/**
		 * Current revision:
		 */
		$revisions = array( $page );
	}

	$xml = <<<XML
	<page>
		<title>$newtitle</title>

XML;

	# History. $revisions already holds the current revision, so the merged
	# list is never empty.
	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );

	foreach( $revisions as $rev ) {
		$text      = xmlsafe( recodeText( $rev->text ) );
		$minor     = ( $rev->minor ? '<minor/>' : '' );
		list( /* $userid */ , $username ) = checkUserCache( $rev->username, $rev->host );
		$username  = xmlsafe( recodeText( $username ) );
		$timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
		$comment   = xmlsafe( recodeText( $rev->summary ) );

		$xml .= <<<XML
		<revision>
			<timestamp>$timestamp</timestamp>
			<contributor><username>$username</username></contributor>
			$minor
			<comment>$comment</comment>
			<text>$text</text>
		</revision>

XML;
	}
	$xml .= "</page>\n\n";
	return $xml;
}
/**
 * Convert text from the wiki's encoding ($wgImportEncoding) to UTF-8.
 *
 * CRLF line endings become LF, and numeric character references are
 * replaced by the characters they denote (via wfMungeToUtf8()).
 *
 * @param string $string Text in the source encoding.
 * @return string Converted text.
 */
function recodeText( $string ) {
	global $wgImportEncoding;
	# For currently latin-1 wikis
	$string = str_replace( "\r\n", "\n", $string );
	$converted = @iconv( $wgImportEncoding, "UTF-8", $string );
	if( $converted !== false ) {
		$string = $converted;
	}
	# else: iconv() rejected the input. Keep the unconverted bytes rather than
	# replacing the whole text with an empty string; every caller in this
	# file passes the result through xmlsafe() afterwards.
	$string = wfMungeToUtf8( $string ); # Any old &#1234; stuff
	return $string;
}
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param int $codepoint Code point number (numeric strings are accepted).
 * @return string The 1-4 byte UTF-8 encoding, or the decimal character
 *                reference "&#N;" when the value is above U+10FFFF, the
 *                last code point Unicode defines.
 */
function wfUtf8Sequence($codepoint) {
	# Callers may hand over digits captured by a regular expression.
	$codepoint = (int)$codepoint;

	if($codepoint < 0x80) {
		return chr($codepoint);
	}
	if($codepoint < 0x800) {
		return chr($codepoint >> 6 & 0x3f | 0xc0) .
			chr($codepoint & 0x3f | 0x80);
	}
	if($codepoint < 0x10000) {
		return chr($codepoint >> 12 & 0x0f | 0xe0) .
			chr($codepoint >> 6 & 0x3f | 0x80) .
			chr($codepoint & 0x3f | 0x80);
	}
	# Four-byte form covers U+10000..U+10FFFF, so the bound is 0x110000.
	if($codepoint < 0x110000) {
		return chr($codepoint >> 18 & 0x07 | 0xf0) .
			chr($codepoint >> 12 & 0x3f | 0x80) .
			chr($codepoint >> 6 & 0x3f | 0x80) .
			chr($codepoint & 0x3f | 0x80);
	}
	# Not a Unicode code point; leave it as a character reference.
	return "&#$codepoint;";
}
/**
 * Replace decimal (&#NNN;) and hexadecimal (&#xHHH;) character references
 * in a string with the output of wfUtf8Sequence() for that code point.
 *
 * Uses callbacks instead of the /e (eval) pattern modifier, which PHP 7
 * removed. Evaluating the captured digits as PHP source also misread
 * references with leading zeros as octal literals.
 *
 * @param string $string Text possibly containing character references.
 * @return string Text with the references replaced.
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#([0-9]+);/', 'wfMungeDecimalEntity', $string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntity', $string );
	# Should also do named entities here
	return $string;
}

/** preg_replace_callback() helper: convert one decimal character reference. */
function wfMungeDecimalEntity( $matches ) {
	return wfUtf8Sequence( intval( $matches[1], 10 ) );
}

/** preg_replace_callback() helper: convert one hexadecimal character reference. */
function wfMungeHexEntity( $matches ) {
	return wfUtf8Sequence( hexdec( $matches[1] ) );
}
/**
 * Format a Unix timestamp as a UTC ISO 8601 date-time,
 * e.g. 2003-08-05T18:30:02Z.
 *
 * @param int|string $ts Seconds since the Unix epoch; cast to integer
 *                       because callers pass the value as read from disk.
 * @return string Formatted timestamp.
 */
function timestamp2ISO8601( $ts ) {
	# \T and \Z are escaped so gmdate() emits them literally.
	return gmdate( 'Y-m-d\TH:i:s\Z', (int)$ts );
}
/**
 * Make a string safe to embed in the XML output.
 *
 * @param string $string Text to sanitize.
 * @return string Text passed through UtfNormal::cleanUp() and then
 *                htmlspecialchars().
 */
function xmlsafe( $string ) {
	/**
	 * The page may contain old data which has not been properly normalized.
	 * Invalid UTF-8 sequences or forbidden control characters will make our
	 * XML output invalid, so be sure to strip them out.
	 */
	$string = UtfNormal::cleanUp( $string );

	$string = htmlspecialchars( $string );
	return $string;
}
/**
 * Prepare text for use inside an XML comment.
 *
 * The text is recoded and escaped like any other output, then every "--"
 * is rewritten as "\-\-", since a double hyphen may not appear inside a
 * comment.
 *
 * @param string $text Raw text in the source encoding.
 * @return string Comment-safe text.
 */
function xmlCommentSafe( $text ) {
	return str_replace( '--', '\\-\\-', xmlsafe( recodeText( $text ) ) );
}
/**
 * Copy an associative array into a plain object, one property per key.
 *
 * @param array $arr Map of property name => value.
 * @return object Object carrying the array's entries as properties.
 */
function array2object( $arr ) {
	$o = new stdClass();
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
/**
 * Make CamelCase and /Talk links work
 *
 * Bare CamelCase words and /Subpage references are wrapped in [[ ]].
 * <nowiki> sections, http/https/ftp URLs and existing [[...]] links are
 * swapped for a placeholder first and put back afterwards, so nothing
 * inside them is rewritten.
 *
 * The protected spans are kept in the global $nowiki while the function
 * runs.
 *
 * @param string $text Page text in UseModWiki markup.
 * @return string Text with the links converted.
 */
function mungeFormat( $text ) {
	global $nowiki;
	$nowiki = array();
	$staged = preg_replace_callback(
		'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
		'nowikiPlaceholder', $text );

	# This is probably not 100% correct, I'm just
	# glancing at the UseModWiki code.
	$upper   = "[A-Z]";
	$lower   = "[a-z_0-9]";
	$any     = "[A-Za-z_0-9]";
	$camel   = "(?:$upper+$lower+$upper+$any*)";
	$subpage = "(?:\\/$any+)";
	$substart = "(?:\\/$upper$any*)";

	$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
		'[[$1]]', $staged );

	# Put the protected spans back in the order they were taken out. A
	# callback replaces the /e (eval) modifier, which PHP 7 removed.
	$final = preg_replace_callback( '/' . preg_quote( placeholder(), '/' ) . '/s',
		'nowikiRestore', $munged );
	return $final;
}

/** preg_replace_callback() helper: hand back the next span saved in $nowiki. */
function nowikiRestore( $matches ) {
	global $nowiki;
	return array_shift( $nowiki );
}
/**
 * Return the marker that temporarily stands in for a protected span while
 * mungeFormat() rewrites links.
 *
 * NOTE(review): the literal is single-quoted, so the marker is the
 * 19-character text \xffplaceholder\xff, not a string delimited by 0xFF
 * bytes. Both users obtain it from this function, so they agree either
 * way; confirm which form was intended before changing it.
 *
 * @param mixed $x Unused.
 * @return string The marker text.
 */
function placeholder( $x = null ) {
	return '\xffplaceholder\xff';
}
/**
 * preg_replace_callback() handler used by mungeFormat(): remember the
 * matched span in the global $nowiki list and substitute the placeholder.
 *
 * @param array $matches Regex match; $matches[1] is the span to protect.
 * @return string The placeholder marker.
 */
function nowikiPlaceholder( $matches ) {
	global $nowiki;
	$nowiki[] = $matches[1];
	return placeholder();
}