upgrade-files.pl

   1 #! /usr/bin/perl -w
   2
   3 use CGI qw/:standard/;
   4 use CGI::Carp qw(fatalsToBrowser);
   5
   6 if (param('separator') eq 'UseMod 0.92' or param('separator') eq 'UseMod 1.00') {
   7   $FS = "\xb3";
   8 } elsif (param('separator') eq 'UseMod 1.00 with $NewFS set') {
   9   $FS = "\x1e\xff\xfe\x1e";
  10 } else {
  11   $FS = "\x1e";
  12 }
  13
  14 $NewFS = "\x1e";
  15
  16 # override $FS if you want!
  17
  18 print header() . start_html('Upgrading Files'), p;
  19 print q{Upgrade version: $Id: upgrade-files.pl,v 1.16 2010/11/06 11:51:18 as Exp $}, "\n";
  20 if (not param('dir')) {
  21   print start_form, p, '$DataDir: ', textfield('dir', '/tmp/oddmuse'),
  22     p, radio_group('separator', ['Oddmuse', 'UseMod 0.92', 'UseMod 1.00',
  23                                  'UseMod 1.00 with $NewFS set']),
  24     p, checkbox('convert', 'checked', 'on', 'Convert Latin-1 to UTF-8'),
  25     p, submit('Ok'), "\n", end_form;
  26 } elsif (param('dir') and not param('sure')) {
  27   print start_form, hidden('sure', 'yes'), hidden('dir', param('dir')),
  28     hidden('separator', param('separator')), hidden('convert', param('convert')),
  29     p, '$DataDir: ', param('dir'),
  30     p, 'separator used when reading pages: ',
  31     join(', ', map { sprintf('0x%x', ord($_)) } split (//, $FS)),
  32     p, 'separator used when writing pages: ',
  33     join(', ', map { sprintf('0x%x', ord($_)) } split (//, $NewFS)),
  34     p, 'Convert Latin-1 to UTF-8: ', param('convert') ? 'Yes' : 'No',
  35     p, submit('Confirm'), "\n", end_form;
  36 } else {
  37   rewrite(param('dir'));
  38 }
  39 print end_html();
  40
  41 sub rewrite {
  42   my ($directory) = @_;
  43   $FS1 = $FS . "1";
  44   $FS2 = $FS . "2";
  45   $FS3 = $FS . "3";
  46   my @files = glob("$directory/page/*/*.db");
  47   if (not @files) {
  48     print "$directory does not seem to be a data directory.\n";
  49     return;
  50   }
  51   print '<pre>';
  52   foreach my $file (@files) {
  53     print "Reading page $file...\n";
  54     my %page = split(/$FS1/, read_file($file), -1);
  55     %section = split(/$FS2/, $page{text_default}, -1);
  56     %text = split(/$FS3/, $section{data}, -1);
  57     $file =~ s/\.db$/.pg/ or die "Invalid page name\n";
  58     print "Writing $file...\n";
  59     write_page_file($file);
  60   }
  61   print '</pre>';
  62   @files = glob("$directory/referer/*/*.rb");
  63   print '<pre>';
  64   foreach my $file (@files) {
  65     print "Reading refer $file...\n";
  66     my $data = read_file($file);
  67     $data =~ s/$FS1/$NewFS/g;
  68     $file =~ s/\.rb$/.rf/ or die "Invalid page name\n";
  69     print "Writing $file...\n";
  70     write_file($file, $data);
  71   }
  72   print '</pre>';
  73   @files = glob("$directory/keep/*/*.kp");
  74   foreach my $file (@files) {
  75     print '<pre>';
  76     print "Reading keep $file...\n";
  77     my $data = read_file($file);
  78     my @list = split(/$FS1/, $data);
  79     my $out = $file;
  80     $out =~ s/\.kp$// or die "Invalid keep name\n";
  81     # We introduce a new variable $dir, here, instead of using $out,
  82     # because $out will be part of the filename later on, and the
  83     # filename will be converted in write_file.  To convert $out to
  84     # utf8 would double-encode the directory part of the filename.
  85     my $dir = param('convert') ? utf8($out) : $out;
  86     print "Creating $out...\n";
  87     mkdir($dir) or die "Cannot create directory $dir\n" unless -d $dir;
  88     foreach my $keep (@list) {
  89       next unless $keep;
  90       %section = split(/$FS2/, $keep, -1);
  91       %text = split(/$FS3/, $section{data}, -1);
  92       my $current = "$out/$section{'revision'}.kp";
  93       print "Writing $current...\n";
  94       write_keep_file($current);
  95     }
  96     print '</pre>';
  97   }
  98   @files = glob("$directory/*rclog");
  99   print '<pre>';
 100   foreach my $file (@files) {
 101     print "Reading $file...\n";
 102     my $data = read_file($file);
 103     @rc = split(/\n/, $data);
 104     foreach (@rc) {
 105       my ($ts, $pagename, $summary, $minor, $host, $kind, $extraTemp)
 106         = split(/$FS3/, $_);
 107       my %extra = split(/$FS2/, $extraTemp, -1);
 108       foreach ('name', 'revision', 'languages', 'cluster') {
 109         $extra{$_} = '' unless $extra{$_};
 110       }
 111       $extra{languages} =~ s/$FS1/,/g;
 112       $_ = join($NewFS, $ts, $pagename, $minor, $summary, $host,
 113                 $extra{name}, $extra{revision}, $extra{languages}, $extra{cluster});
 114     }
 115     $data = join("\n", @rc) . "\n";
 116     $file =~ s/log$/.log/;
 117     print "Writing $file...\n";
 118     write_file($file, $data);
 119   }
 120   print '</pre>';
 121   print p, "Done.\n";
 122 }
 123
 124 sub read_file {
 125   my ($filename) = @_;
 126   my ($data);
 127   local $/ = undef;             # Read complete files
 128   open(F, "<$filename") or die "can't read $filename: $!";
 129   $data=<F>;
 130   close F;
 131   return $data;
 132 }
 133
 134 sub write_file {
 135   my ($filename, $data) = @_;
 136   if (param('convert')) {
 137     $filename = utf8($filename);
 138     $data = utf8($data);
 139   }
 140   open(F, ">$filename") or die "can't write $filename: $!";
 141   print F $data;
 142   close F;
 143 }
 144
 145 sub cache {
 146   $_ = shift;
 147   return "" unless $_;
 148   my ($block, $flag) = split(/$FS2/, $_);
 149   my @blocks = split(/$FS3/, $block);
 150   my @flags = split(/$FS3/, $flag);
 151   return 'blocks: ' . escape_newlines(join($NewFS, @blocks)) . "\n"
 152     . 'flags: ' . escape_newlines(join($NewFS, @flags)) . "\n";
 153 }
 154
 155 sub escape_newlines {
 156   $_ = shift;
 157   $_ =~ s/\n/\n\t/g if $_;
 158   return $_;
 159 }
 160
 161 # Skip the info encoded in the filename (page name).  We need the info
 162 # stored in the rclog (summary, ip, host, username) for the history
 163 # page.  Don't trust the modification dates of the files themselves,
 164 # which is why we have the timestamp in the file, too.  We need the
 165 # timestamp when expiring old keep files.  We need all the info in the
 166 # page file that will eventually end up in the keep file.
 167
 168 sub basic_data {
 169   my $data = 'ts: ' . $section{ts} . "\n" if $section{ts};
 170   $data .= 'keep-ts: ' . $section{keepts} . "\n" if $section{keepts};
 171   $data .= 'revision: ' . $section{revision} . "\n" if $section{revision};
 172   $data .= 'summary: ' . $section{summary} . "\n" if $section{summary};
 173   $data .= 'summary: ' . $text{summary} . "\n" if $text{summary} and not $section{summary};
 174   $data .= 'username: ' . $section{username} . "\n" if $section{username};
 175   $data .= 'ip: ' . $section{ip} . "\n" if $section{ip};
 176   $data .= 'host: ' . $section{host} . "\n" if $section{host};
 177   $data .= 'minor: ' . $text{minor} . "\n" if $text{minor};
 178   # $data .= 'oldmajor: ' . $page{cache_oldmajor} . "\n" if $page{cache_oldmajor};
 179   $data .= 'text: ' . escape_newlines($text{text}) . "\n";
 180   return $data;
 181 }
 182
 183 sub write_page_file {
 184   my $file = shift;
 185   my $data = basic_data();
 186   $data .= cache($page{cache_blocks});
 187   $data .= 'diff-major: ' . escape_newlines($page{cache_diff_default_major}) . "\n"
 188     if $page{cache_diff_default_major};
 189   $data .= 'diff-minor: ' . escape_newlines($page{cache_diff_default_minor}) . "\n"
 190     if $page{cache_diff_default_minor};
 191   write_file($file, $data);
 192 }
 193
 194 sub write_keep_file {
 195   my $file = shift;
 196   my $data = basic_data();
 197   write_file($file, $data);
 198 }
 199
 200
 201 # This Latin-1 to UTF-8 conversion was written by Skalman on the
 202 # Oddmuse Wiki.  He says:  I added a quick, dirty and completely
 203 # unreadable hack to convert all characters above 0x7F:
 204
 205 #  s/([\x80-\xff])/chr(0xc0+(ord($1)>>6)).chr(ord($1)&0b00111111|0b10000000)/ge;
 206
 207 # Reading the UTF-8 and Unicode FAQ, I convert every character to
 208 # (binary) 110xxxxx 10xxxxxx where the 'x' marks the bits of the
 209 # original ISO-8859-1 character. That is: take the two most
 210 # significant bits of the caracter and add them to 0xC0 (first byte),
 211 # then replace the first two bits with 10 (second byte).
 212
 213 sub utf8 {
 214   $_ = shift;
 215   s/([\x80-\xff])/chr(0xc0+(ord($1)>>6)).chr(ord($1)&0b00111111|0b10000000)/ge;
 216   return $_;
 217 }