move point to raise exception
[sunny256-utils.git] / u2h
blobb8e21715d28e54b913d39f6a4b0cab96c79c30cf
1 #!/usr/bin/env perl
3 #=======================================================================
4 # u2h
5 # File ID: cab2ee1e-5d46-11df-8988-90e6ba3022ac
7 # Converts from UTF-8 charset to HTML numeric entities (☺ and
8 # ☺).
10 # Character set: UTF-8
11 # ©opyleft 2001– Øyvind A. Holm <sunny@sunbase.org>
12 # License: GNU General Public License version 2 or later, see end of
13 # file for legal stuff.
14 #=======================================================================
16 use strict;
17 use warnings;
18 use Getopt::Long;
20 local $| = 1;
22 our %Opt = (
24 'ampersand' => 0,
25 'decimal' => 0,
26 'help' => 0,
27 'invalid' => 0,
28 'latin1' => 0,
29 'quiet' => 0,
30 'standard' => 0,
31 'verbose' => 0,
32 'version' => 0,
36 our $progname = $0;
37 $progname =~ s/^.*\/(.*?)$/$1/;
38 our $VERSION = '0.1.0';
40 Getopt::Long::Configure('bundling');
41 GetOptions(
43 'ampersand|a' => \$Opt{'ampersand'},
44 'decimal|d' => \$Opt{'decimal'},
45 'help|h' => \$Opt{'help'},
46 'invalid|i' => \$Opt{'invalid'},
47 'latin1|l' => \$Opt{'latin1'},
48 'quiet|q+' => \$Opt{'quiet'},
49 'standard|s' => \$Opt{'standard'},
50 'verbose|v+' => \$Opt{'verbose'},
51 'version' => \$Opt{'version'},
53 ) || die("$progname: Option error. Use -h for help.\n");
55 $Opt{'verbose'} -= $Opt{'quiet'};
56 $Opt{'help'} && usage(0);
57 if ($Opt{'version'}) {
58 print_version();
59 exit(0);
62 exit(main());
64 sub main {
65 # {{{
66 my $Retval = 0;
68 my $amp_ent = $Opt{'decimal'} ? "&#38;" : "&#x26;";
70 while (<>) {
71 $Opt{'ampersand'} && s/&/$amp_ent/g;
72 $Opt{'standard'} && s/([\x20-\x7F])/decode_char($1)/ge;
73 s/([\xFC-\xFD][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
74 s/([\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
75 s/([\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
76 s/([\xE0-\xEF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
77 s/([\xC0-\xDF][\x80-\xBF])/decode_char($1)/ge;
78 print;
81 return $Retval;
82 # }}}
83 } # main()
85 sub decode_char {
86 # {{{
87 my $Msg = shift;
88 my $Val = "";
89 if ($Msg =~ /^([\x20-\x7F])$/) {
90 $Val = ord($1);
91 } elsif ($Msg =~ /^([\xC0-\xDF])([\x80-\xBF])/) {
92 if (!$Opt{'invalid'} && $Msg =~ /^[\xC0-\xC1]/) {
93 $Val = 0xFFFD;
94 } else {
95 $Val = ((ord($1) & 0x1F) << 6) | (ord($2) & 0x3F);
97 } elsif ($Msg =~ /^([\xE0-\xEF])([\x80-\xBF])([\x80-\xBF])/) {
98 if (!$Opt{'invalid'} && $Msg =~ /^\xE0[\x80-\x9F]/) {
99 $Val = 0xFFFD;
100 } else {
101 $Val = ((ord($1) & 0x0F) << 12) |
102 ((ord($2) & 0x3F) << 6) |
103 ( ord($3) & 0x3F);
105 } elsif ($Msg =~ /^([\xF0-\xF7])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/) {
106 if (!$Opt{'invalid'} && $Msg =~ /^\xF0[\x80-\x8F]/) {
107 $Val = 0xFFFD;
108 } else {
109 $Val = ((ord($1) & 0x07) << 18) |
110 ((ord($2) & 0x3F) << 12) |
111 ((ord($3) & 0x3F) << 6) |
112 ( ord($4) & 0x3F);
114 } elsif ($Msg =~ /^([\xF8-\xFB])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/) {
115 if (!$Opt{'invalid'} && $Msg =~ /^\xF8[\x80-\x87]/) {
116 $Val = 0xFFFD;
117 } else {
118 $Val = ((ord($1) & 0x03) << 24) |
119 ((ord($2) & 0x3F) << 18) |
120 ((ord($3) & 0x3F) << 12) |
121 ((ord($4) & 0x3F) << 6) |
122 ( ord($5) & 0x3F);
124 } elsif ($Msg =~ /^([\xFC-\xFD])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/) {
125 if (!$Opt{'invalid'} && $Msg =~ /^\xFC[\x80-\x83]/) {
126 $Val = 0xFFFD;
127 } else {
128 $Val = ((ord($1) & 0x01) << 30) |
129 ((ord($2) & 0x3F) << 24) |
130 ((ord($3) & 0x3F) << 18) |
131 ((ord($4) & 0x3F) << 12) |
132 ((ord($5) & 0x3F) << 6) |
133 ( ord($6) & 0x3F);
136 unless ($Opt{'invalid'}) {
137 if (($Val >= 0xD800 && $Val <= 0xDFFF) || ($Val eq 0xFFFE) || ($Val eq 0xFFFF)) {
138 $Val = 0xFFFD;
141 return ($Opt{'latin1'} && ($Val <= 0xFF)) ? chr($Val) : sprintf(($Opt{'decimal'} ? "&#%u;" : "&#x%X;"), $Val);
142 # }}}
143 } # decode_char()
145 sub print_version {
146 # Print program version {{{
147 print("$progname $VERSION\n");
148 return;
149 # }}}
150 } # print_version()
152 sub usage {
153 # Send the help message to stdout {{{
154 my $Retval = shift;
156 if ($Opt{'verbose'}) {
157 print("\n");
158 print_version();
160 print(<<"END");
162 Converts from UTF-8 charset to HTML numeric entities (&#x263A; and
163 &#9786;).
165 Usage: $progname [options] [file [files [...]]]
167 Options:
169 -a, --ampersand
170 Convert Ampersand into entity.
171 -d, --decimal
172 Use decimal values.
173 -h, --help
174 Show this help.
175 -i, --invalid
176 Accept invalid sequences (overlong sequences and surrogates).
177 -l, --latin1
178 Convert U+0080 through U+00FF to latin-1 instead of entities.
179 -q, --quiet
180 Be more quiet. Can be repeated to increase silence.
181 -s, --standard
182 Also convert standard ascii U+0020 through U+007F.
183 -v, --verbose
184 Increase level of verbosity. Can be repeated.
185 --version
186 Print version information.
189 exit($Retval);
190 # }}}
191 } # usage()
193 sub msg {
194 # Print a status message to stderr based on verbosity level {{{
195 my ($verbose_level, $Txt) = @_;
197 if ($Opt{'verbose'} >= $verbose_level) {
198 print(STDERR "$progname: $Txt\n");
200 return;
201 # }}}
202 } # msg()
204 __END__
206 # This program is free software; you can redistribute it and/or modify
207 # it under the terms of the GNU General Public License as published by
208 # the Free Software Foundation; either version 2 of the License, or (at
209 # your option) any later version.
211 # This program is distributed in the hope that it will be useful, but
212 # WITHOUT ANY WARRANTY; without even the implied warranty of
213 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
214 # See the GNU General Public License for more details.
216 # You should have received a copy of the GNU General Public License
217 # along with this program.
218 # If not, see L<http://www.gnu.org/licenses/>.
220 # vim: set fenc=UTF-8 ft=perl fdm=marker ts=4 sw=4 sts=4 et fo+=w :