wiki.pl: Port some fixes from upstream
[Orgmuse.git] / upgrade-files.pl
blob1cdf08103339777a1637501770248314d83e1d2f
1 #! /usr/bin/perl -w
3 use CGI qw/:standard/;
4 use CGI::Carp qw(fatalsToBrowser);
6 if (param('separator') eq 'UseMod 0.92' or param('separator') eq 'UseMod 1.00') {
7 $FS = "\xb3";
8 } elsif (param('separator') eq 'UseMod 1.00 with $NewFS set') {
9 $FS = "\x1e\xff\xfe\x1e";
10 } else {
11 $FS = "\x1e";
14 $NewFS = "\x1e";
16 # override $FS if you want!
18 print header() . start_html('Upgrading Files'), p;
19 print q{Upgrade version: $Id: upgrade-files.pl,v 1.16 2010/11/06 11:51:18 as Exp $}, "\n";
20 if (not param('dir')) {
21 print start_form, p, '$DataDir: ', textfield('dir', '/tmp/oddmuse'),
22 p, radio_group('separator', ['Oddmuse', 'UseMod 0.92', 'UseMod 1.00',
23 'UseMod 1.00 with $NewFS set']),
24 p, checkbox('convert', 'checked', 'on', 'Convert Latin-1 to UTF-8'),
25 p, submit('Ok'), "\n", end_form;
26 } elsif (param('dir') and not param('sure')) {
27 print start_form, hidden('sure', 'yes'), hidden('dir', param('dir')),
28 hidden('separator', param('separator')), hidden('convert', param('convert')),
29 p, '$DataDir: ', param('dir'),
30 p, 'separator used when reading pages: ',
31 join(', ', map { sprintf('0x%x', ord($_)) } split (//, $FS)),
32 p, 'separator used when writing pages: ',
33 join(', ', map { sprintf('0x%x', ord($_)) } split (//, $NewFS)),
34 p, 'Convert Latin-1 to UTF-8: ', param('convert') ? 'Yes' : 'No',
35 p, submit('Confirm'), "\n", end_form;
36 } else {
37 rewrite(param('dir'));
39 print end_html();
41 sub rewrite {
42 my ($directory) = @_;
43 $FS1 = $FS . "1";
44 $FS2 = $FS . "2";
45 $FS3 = $FS . "3";
46 my @files = glob("$directory/page/*/*.db");
47 if (not @files) {
48 print "$directory does not seem to be a data directory.\n";
49 return;
51 print '<pre>';
52 foreach my $file (@files) {
53 print "Reading page $file...\n";
54 my %page = split(/$FS1/, read_file($file), -1);
55 %section = split(/$FS2/, $page{text_default}, -1);
56 %text = split(/$FS3/, $section{data}, -1);
57 $file =~ s/\.db$/.pg/ or die "Invalid page name\n";
58 print "Writing $file...\n";
59 write_page_file($file);
61 print '</pre>';
62 @files = glob("$directory/referer/*/*.rb");
63 print '<pre>';
64 foreach my $file (@files) {
65 print "Reading refer $file...\n";
66 my $data = read_file($file);
67 $data =~ s/$FS1/$NewFS/g;
68 $file =~ s/\.rb$/.rf/ or die "Invalid page name\n";
69 print "Writing $file...\n";
70 write_file($file, $data);
72 print '</pre>';
73 @files = glob("$directory/keep/*/*.kp");
74 foreach my $file (@files) {
75 print '<pre>';
76 print "Reading keep $file...\n";
77 my $data = read_file($file);
78 my @list = split(/$FS1/, $data);
79 my $out = $file;
80 $out =~ s/\.kp$// or die "Invalid keep name\n";
81 # We introduce a new variable $dir, here, instead of using $out,
82 # because $out will be part of the filename later on, and the
83 # filename will be converted in write_file. To convert $out to
84 # utf8 would double-encode the directory part of the filename.
85 my $dir = param('convert') ? utf8($out) : $out;
86 print "Creating $out...\n";
87 mkdir($dir) or die "Cannot create directory $dir\n" unless -d $dir;
88 foreach my $keep (@list) {
89 next unless $keep;
90 %section = split(/$FS2/, $keep, -1);
91 %text = split(/$FS3/, $section{data}, -1);
92 my $current = "$out/$section{'revision'}.kp";
93 print "Writing $current...\n";
94 write_keep_file($current);
96 print '</pre>';
98 @files = glob("$directory/*rclog");
99 print '<pre>';
100 foreach my $file (@files) {
101 print "Reading $file...\n";
102 my $data = read_file($file);
103 @rc = split(/\n/, $data);
104 foreach (@rc) {
105 my ($ts, $pagename, $summary, $minor, $host, $kind, $extraTemp)
106 = split(/$FS3/, $_);
107 my %extra = split(/$FS2/, $extraTemp, -1);
108 foreach ('name', 'revision', 'languages', 'cluster') {
109 $extra{$_} = '' unless $extra{$_};
111 $extra{languages} =~ s/$FS1/,/g;
112 $_ = join($NewFS, $ts, $pagename, $minor, $summary, $host,
113 $extra{name}, $extra{revision}, $extra{languages}, $extra{cluster});
115 $data = join("\n", @rc) . "\n";
116 $file =~ s/log$/.log/;
117 print "Writing $file...\n";
118 write_file($file, $data);
120 print '</pre>';
121 print p, "Done.\n";
124 sub read_file {
125 my ($filename) = @_;
126 my ($data);
127 local $/ = undef; # Read complete files
128 open(F, "<$filename") or die "can't read $filename: $!";
129 $data=<F>;
130 close F;
131 return $data;
134 sub write_file {
135 my ($filename, $data) = @_;
136 if (param('convert')) {
137 $filename = utf8($filename);
138 $data = utf8($data);
140 open(F, ">$filename") or die "can't write $filename: $!";
141 print F $data;
142 close F;
145 sub cache {
146 $_ = shift;
147 return "" unless $_;
148 my ($block, $flag) = split(/$FS2/, $_);
149 my @blocks = split(/$FS3/, $block);
150 my @flags = split(/$FS3/, $flag);
151 return 'blocks: ' . escape_newlines(join($NewFS, @blocks)) . "\n"
152 . 'flags: ' . escape_newlines(join($NewFS, @flags)) . "\n";
155 sub escape_newlines {
156 $_ = shift;
157 $_ =~ s/\n/\n\t/g if $_;
158 return $_;
161 # Skip the info encoded in the filename (page name). We need the info
162 # stored in the rclog (summary, ip, host, username) for the history
163 # page. Don't trust the modification dates of the files themselves,
164 # which is why we have the timestamp in the file, too. We need the
165 # timestamp when expiring old keep files. We need all the info in the
166 # page file that will eventually end up in the keep file.
168 sub basic_data {
169 my $data = 'ts: ' . $section{ts} . "\n" if $section{ts};
170 $data .= 'keep-ts: ' . $section{keepts} . "\n" if $section{keepts};
171 $data .= 'revision: ' . $section{revision} . "\n" if $section{revision};
172 $data .= 'summary: ' . $section{summary} . "\n" if $section{summary};
173 $data .= 'summary: ' . $text{summary} . "\n" if $text{summary} and not $section{summary};
174 $data .= 'username: ' . $section{username} . "\n" if $section{username};
175 $data .= 'ip: ' . $section{ip} . "\n" if $section{ip};
176 $data .= 'host: ' . $section{host} . "\n" if $section{host};
177 $data .= 'minor: ' . $text{minor} . "\n" if $text{minor};
178 # $data .= 'oldmajor: ' . $page{cache_oldmajor} . "\n" if $page{cache_oldmajor};
179 $data .= 'text: ' . escape_newlines($text{text}) . "\n";
180 return $data;
183 sub write_page_file {
184 my $file = shift;
185 my $data = basic_data();
186 $data .= cache($page{cache_blocks});
187 $data .= 'diff-major: ' . escape_newlines($page{cache_diff_default_major}) . "\n"
188 if $page{cache_diff_default_major};
189 $data .= 'diff-minor: ' . escape_newlines($page{cache_diff_default_minor}) . "\n"
190 if $page{cache_diff_default_minor};
191 write_file($file, $data);
194 sub write_keep_file {
195 my $file = shift;
196 my $data = basic_data();
197 write_file($file, $data);
201 # This Latin-1 to UTF-8 conversion was written by Skalman on the
202 # Oddmuse Wiki. He says: I added a quick, dirty and completely
203 # unreadable hack to convert all characters above 0x7F:
205 # s/([\x80-\xff])/chr(0xc0+(ord($1)>>6)).chr(ord($1)&0b00111111|0b10000000)/ge;
207 # Reading the UTF-8 and Unicode FAQ, I convert every character to
208 # (binary) 110xxxxx 10xxxxxx where the 'x' marks the bits of the
209 # original ISO-8859-1 character. That is: take the two most
210 # significant bits of the caracter and add them to 0xC0 (first byte),
211 # then replace the first two bits with 10 (second byte).
213 sub utf8 {
214 $_ = shift;
215 s/([\x80-\xff])/chr(0xc0+(ord($1)>>6)).chr(ord($1)&0b00111111|0b10000000)/ge;
216 return $_;