3 #=======================================================================
5 # File ID: cab2ee1e-5d46-11df-8988-90e6ba3022ac
# Converts from UTF-8 charset to HTML numeric entities (&#x263A; and
10 # Character set: UTF-8
11 # ©opyleft 2001– Øyvind A. Holm <sunny@sunbase.org>
12 # License: GNU General Public License version 2 or later, see end of
13 # file for legal stuff.
14 #=======================================================================
37 $progname =~ s/^.*\/(.*?)$/$1/;
38 our $VERSION = '0.1.0';
# Enable option bundling so single-letter options can be combined
# (for example -dl instead of -d -l).
Getopt::Long::Configure('bundling');
43 'ampersand|a' => \
$Opt{'ampersand'},
44 'decimal|d' => \
$Opt{'decimal'},
45 'help|h' => \
$Opt{'help'},
46 'invalid|i' => \
$Opt{'invalid'},
47 'latin1|l' => \
$Opt{'latin1'},
48 'quiet|q+' => \
$Opt{'quiet'},
49 'standard|s' => \
$Opt{'standard'},
50 'verbose|v+' => \
$Opt{'verbose'},
51 'version' => \
$Opt{'version'},
53 ) || die("$progname: Option error. Use -h for help.\n");
# Every -q cancels one -v: the effective verbosity is the difference
# between the two counters.
$Opt{'verbose'} = $Opt{'verbose'} - $Opt{'quiet'};

# -h/--help: hand over to usage() with argument 0.
usage(0) if $Opt{'help'};
57 if ($Opt{'version'}) {
# Replacement text for a literal '&' when --ampersand is given. It must be a
# numeric character reference, not a bare "&" (which would make the
# substitution a no-op). The notation follows --decimal, matching the
# "&#%u;" / "&#x%X;" formats used elsewhere in this file; U+0026 (38) is '&'.
my $amp_ent = $Opt{'decimal'} ? "&#38;" : "&#x26;";
# Rewrite the current line ($_) in place.
#
# '&' is handled before any entity is generated, so the ampersands of
# entities produced further down are never touched. When --standard is
# active without --latin1, the ASCII pass already turns '&' into an entity
# by itself; running the dedicated ampersand pass as well would make the
# ASCII pass re-encode every character of the freshly inserted $amp_ent
# (double encoding), so it is skipped for that combination.
if ($Opt{'ampersand'} && !($Opt{'standard'} && !$Opt{'latin1'})) {
    s/&/$amp_ent/g;
}

# Optional pass over printable ASCII (U+0020 through U+007F).
$Opt{'standard'} && s/([\x20-\x7F])/decode_char($1)/ge;

# Multi-byte sequences, one pass per sequence length: lead byte followed by
# the matching number of continuation bytes (0x80-0xBF), from six bytes
# down to two. The generated entities are pure ASCII, so later passes
# cannot match them again.
s/([\xFC-\xFD][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
s/([\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
s/([\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
s/([\xE0-\xEF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
s/([\xC0-\xDF][\x80-\xBF])/decode_char($1)/ge;
89 if ($Msg =~ /^([\x20-\x7F])$/) {
91 } elsif ($Msg =~ /^([\xC0-\xDF])([\x80-\xBF])/) {
92 if (!$Opt{'invalid'} && $Msg =~ /^[\xC0-\xC1]/) {
95 $Val = ((ord($1) & 0x1F) << 6) | (ord($2) & 0x3F);
97 } elsif ($Msg =~ /^([\xE0-\xEF])([\x80-\xBF])([\x80-\xBF])/) {
98 if (!$Opt{'invalid'} && $Msg =~ /^\xE0[\x80-\x9F]/) {
101 $Val = ((ord($1) & 0x0F) << 12) |
102 ((ord($2) & 0x3F) << 6) |
105 } elsif ($Msg =~ /^([\xF0-\xF7])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/) {
106 if (!$Opt{'invalid'} && $Msg =~ /^\xF0[\x80-\x8F]/) {
109 $Val = ((ord($1) & 0x07) << 18) |
110 ((ord($2) & 0x3F) << 12) |
111 ((ord($3) & 0x3F) << 6) |
114 } elsif ($Msg =~ /^([\xF8-\xFB])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/) {
115 if (!$Opt{'invalid'} && $Msg =~ /^\xF8[\x80-\x87]/) {
118 $Val = ((ord($1) & 0x03) << 24) |
119 ((ord($2) & 0x3F) << 18) |
120 ((ord($3) & 0x3F) << 12) |
121 ((ord($4) & 0x3F) << 6) |
124 } elsif ($Msg =~ /^([\xFC-\xFD])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/) {
125 if (!$Opt{'invalid'} && $Msg =~ /^\xFC[\x80-\x83]/) {
128 $Val = ((ord($1) & 0x01) << 30) |
129 ((ord($2) & 0x3F) << 24) |
130 ((ord($3) & 0x3F) << 18) |
131 ((ord($4) & 0x3F) << 12) |
132 ((ord($5) & 0x3F) << 6) |
136 unless ($Opt{'invalid'}) {
# Surrogates (U+D800 through U+DFFF) and U+FFFE / U+FFFF. $Val is a
# number, so compare with == instead of the string operator eq.
if (($Val >= 0xD800 && $Val <= 0xDFFF) || ($Val == 0xFFFE) || ($Val == 0xFFFF)) {
# With --latin1, code points up to U+00FF are returned as the single
# character itself instead of an entity.
if ($Opt{'latin1'} && ($Val <= 0xFF)) {
    return chr($Val);
}

# Everything else becomes a numeric character reference, decimal when
# --decimal is given and upper-case hexadecimal otherwise.
return sprintf("&#%u;", $Val) if $Opt{'decimal'};
return sprintf("&#x%X;", $Val);
146 # Print program version {{{
147 print("$progname $VERSION\n");
153 # Send the help message to stdout {{{
156 if ($Opt{'verbose'}) {
162 Converts from UTF-8 charset to HTML numeric entities (☺ and
165 Usage: $progname [options] [file [files [...]]]
170 Convert Ampersand into entity.
176 Accept invalid sequences (overlong sequences and surrogates).
178 Convert U+0080 through U+00FF to latin-1 instead of entities.
180 Be more quiet. Can be repeated to increase silence.
182 Also convert standard ascii U+0020 through U+007F.
184 Increase level of verbosity. Can be repeated.
186 Print version information.
194 # Print a status message to stderr based on verbosity level {{{
195 my ($verbose_level, $Txt) = @_;
197 if ($Opt{'verbose'} >= $verbose_level) {
198 print(STDERR
"$progname: $Txt\n");
206 # This program is free software; you can redistribute it and/or modify
207 # it under the terms of the GNU General Public License as published by
208 # the Free Software Foundation; either version 2 of the License, or (at
209 # your option) any later version.
211 # This program is distributed in the hope that it will be useful, but
212 # WITHOUT ANY WARRANTY; without even the implied warranty of
213 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
214 # See the GNU General Public License for more details.
216 # You should have received a copy of the GNU General Public License
217 # along with this program.
218 # If not, see L<http://www.gnu.org/licenses/>.
220 # vim: set fenc=UTF-8 ft=perl fdm=marker ts=4 sw=4 sts=4 et fo+=w :