clipboard on new cygwin
[sunny256-utils.git] / find_inv_utf8
blob6ecc7eff106fc8dad64785065e3eead5558af674
1 #!/usr/bin/env perl
3 #=======================================================================
4 # find_inv_utf8
5 # File ID: d971a582-f742-11dd-9aeb-000475e441b9
# Read lines from stdin and print those that contain eight-bit
# characters not encoded in UTF-8.
10 # Character set: UTF-8
11 # ©opyleft 2001– Øyvind A. Holm <sunny@sunbase.org>
12 # License: GNU General Public License version 2 or later, see end of
13 # file for legal stuff.
14 #=======================================================================
16 use strict;
17 use warnings;
18 use Getopt::Long;
# Unbuffered output, so printed lines show up as soon as they are found.
local $| = 1;

# Command line option values, shared with the subroutines below.
our %Opt = (
    'help'         => 0,
    'quiet'        => 0,
    'skip-invalid' => 0,
    'verbose'      => 0,
    'version'      => 0,
    'zero'         => 0,
);

# Program name without any leading directory components.
our $progname = $0;
$progname =~ s/^.*\/(.*?)$/$1/;
our $VERSION = '0.1.0';

# 'bundling' lets single-letter options be combined, e.g. -qz.
Getopt::Long::Configure('bundling');
my $options_ok = GetOptions(
    'help|h'         => \$Opt{'help'},
    'quiet|q+'       => \$Opt{'quiet'},
    'skip-invalid|s' => \$Opt{'skip-invalid'},
    'verbose|v+'     => \$Opt{'verbose'},
    'version'        => \$Opt{'version'},
    'zero|z'         => \$Opt{'zero'},
);
if (!$options_ok) {
    die("$progname: Option error. Use -h for help.\n");
}

# Each -q cancels one -v; the net verbosity may end up negative.
$Opt{'verbose'} -= $Opt{'quiet'};

if ($Opt{'help'}) {
    usage(0);
}
if ($Opt{'version'}) {
    print_version();
    exit(0);
}

# The return value of main() becomes the exit status of the program.
exit(main());
sub main {
    # Filter the input and return the exit status of the program {{{
    #
    # Default mode prints every record that is not valid UTF-8.
    # With --skip-invalid it prints the valid records instead.
    # Returns 1 if at least one invalid record was read, otherwise 0.
    my $Retval = 0;

    # -z: records are terminated by NUL instead of newline. Records are
    # printed exactly as read, so the output uses the same separator.
    if ($Opt{'zero'}) {
        $/ = "\x00";
    }

    # The read loop is duplicated, one copy per mode, for speed: the
    # mode is then tested once instead of once per record.
    if (!$Opt{'skip-invalid'}) {
        while (my $record = <>) {
            next if is_utf8($record);
            $Retval = 1;
            # In quiet mode the first invalid record settles the
            # result, so the rest of the input is not read.
            last if $Opt{'quiet'};
            print $record;
        }
    } else {
        while (my $record = <>) {
            if (!is_utf8($record)) {
                $Retval = 1;
                next;
            }
            print $record unless $Opt{'quiet'};
        }
    }

    return $Retval;
    # }}}
} # main()
sub is_utf8 {
    # Check whether a byte string is well-formed UTF-8 {{{
    #
    # Takes the string to check as its only argument.
    # Returns 1 if it consists solely of valid UTF-8 sequences (an
    # empty string qualifies), otherwise 0.
    my ($text) = @_;

    # UTF-8 regexp copied from linux-2.6.git/scripts/checkpatch.pl in
    # commit ddb503b42960792f3be580f98959add669241a04. Originally from
    # http://www.w3.org/International/questions/qa-forms-utf-8.en.php .
    # Modified to include U+007F and everything below U+0020, not only
    # \t, \n and \r.
    my $UTF8 = qr {
        [\x00-\x7F]                          # ASCII
        | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
        |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
    }x;

    # Delete every valid sequence from the local copy. Any bytes that
    # survive are not part of a valid sequence.
    $text =~ s/$UTF8//gs;

    if (length($text)) {
        return(0);
    }
    return(1);
    # }}}
} # is_utf8()
sub print_version {
    # Write the program name and version number to stdout {{{
    my $version_line = join(' ', $progname, $VERSION) . "\n";
    print($version_line);
    return;
    # }}}
} # print_version()
sub usage {
    # Send the help message to stdout and terminate the program {{{
    #
    # Takes the exit status to terminate with as its only argument.
    # Does not return.
    my $Retval = shift;

    # Any non-zero verbosity also prints the version banner. That
    # includes a negative value, which happens when -q is given more
    # often than -v.
    if ($Opt{'verbose'}) {
        print("\n");
        print_version();
    }
    # The long form of -z is --zero, matching the 'zero|z' option spec.
    print(<<"END");

Usage: $progname [options] [file [files [...]]]

Print lines containing invalid UTF-8. Returns 1 if invalid UTF-8 is
found, otherwise 0 is returned.

Options:

  -h, --help
    Show this help.
  -q, --quiet
    Be more quiet. Can be repeated to increase silence.
    Don't produce any output, use only return value.
  -s, --skip-invalid
    Vice versa, skip lines with invalid UTF-8 and only print those
    containing proper UTF-8.
  -v, --verbose
    Increase level of verbosity. Can be repeated.
  -z, --zero
    Use NUL character (0x00) as separator instead of \\n. This affects
    both input and output.
  --version
    Print version information.

END
    exit($Retval);
    # }}}
} # usage()
sub msg {
    # Print a status message to stderr based on verbosity level {{{
    #
    # Takes the minimum verbosity level and the message text. The
    # message is written, prefixed with the program name, only when the
    # current verbosity is at least that level.
    my ($verbose_level, $text) = @_;

    unless ($Opt{'verbose'} >= $verbose_level) {
        return;
    }
    print(STDERR "$progname: $text\n");
    return;
    # }}}
} # msg()
173 __END__
175 # This program is free software; you can redistribute it and/or modify
176 # it under the terms of the GNU General Public License as published by
177 # the Free Software Foundation; either version 2 of the License, or (at
178 # your option) any later version.
180 # This program is distributed in the hope that it will be useful, but
181 # WITHOUT ANY WARRANTY; without even the implied warranty of
182 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
183 # See the GNU General Public License for more details.
185 # You should have received a copy of the GNU General Public License
186 # along with this program.
187 # If not, see L<http://www.gnu.org/licenses/>.
189 # vim: set fenc=UTF-8 ft=perl fdm=marker ts=4 sw=4 sts=4 et fo+=w :