3 #=======================================================================
5 # File ID: d971a582-f742-11dd-9aeb-000475e441b9
7 # Read lines from stdin and print those that contain eight-bit
8 # characters not encoded in UTF-8.
10 # Character set: UTF-8
11 # ©opyleft 2001– Øyvind A. Holm <sunny@sunbase.org>
12 # License: GNU General Public License version 2 or later, see end of
13 # file for legal stuff.
14 #=======================================================================
34 $progname =~ s/^.*\/(.*?)$/$1/;
35 our $VERSION = '0.1.0';
37 Getopt
::Long
::Configure
('bundling');
40 'help|h' => \
$Opt{'help'},
41 'quiet|q+' => \
$Opt{'quiet'},
42 'skip-invalid|s' => \
$Opt{'skip-invalid'},
43 'verbose|v+' => \
$Opt{'verbose'},
44 'version' => \
$Opt{'version'},
45 'zero|z' => \
$Opt{'zero'},
47 ) || die("$progname: Option error. Use -h for help.\n");
49 $Opt{'verbose'} -= $Opt{'quiet'};
50 $Opt{'help'} && usage
(0);
51 if ($Opt{'version'}) {
62 $Opt{'zero'} && ($/ = "\x00");
64 # Doubled up to increase speed.
66 if ($Opt{'skip-invalid'}) {
70 $Opt{'quiet'} || print $Orig;
78 if (!is_utf8
($Orig)) {
80 $Opt{'quiet'} && last || print $Orig;
92 # UTF-8 regexp copied from linux-2.6.git/scripts/checkpatch.pl in
93 # commit ddb503b42960792f3be580f98959add669241a04. Originally from
94 # http://www.w3.org/International/questions/qa-forms-utf-8.en.php .
95 # Modified by me to include U+007F and everything below U+0020, not
100 | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
101 | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
102 | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
103 | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
104 | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
105 | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
106 | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
110 $text =~ s/$UTF8//gs;
111 $retval = length($text) ?
0 : 1;
118 # Print program version {{{
119 print("$progname $VERSION\n");
125 # Send the help message to stdout {{{
128 if ($Opt{'verbose'}) {
134 Usage: $progname [options] [file [files [...]]]
136 Print lines containing invalid UTF-8. Returns 1 if invalid UTF-8 is
137 found, otherwise 0 is returned.
144 Be more quiet. Can be repeated to increase silence.
145 Don't produce any output, use only return value.
147 Vice versa, skip lines with invalid UTF-8 and only print those
148 containing proper UTF-8.
150 Increase level of verbosity. Can be repeated.
152 Use NUL character (0x00) as separator instead of \\n. This affects
153 both input and output.
155 Print version information.
163 # Print a status message to stderr based on verbosity level {{{
164 my ($verbose_level, $Txt) = @_;
166 if ($Opt{'verbose'} >= $verbose_level) {
167 print(STDERR
"$progname: $Txt\n");
175 # This program is free software; you can redistribute it and/or modify
176 # it under the terms of the GNU General Public License as published by
177 # the Free Software Foundation; either version 2 of the License, or (at
178 # your option) any later version.
180 # This program is distributed in the hope that it will be useful, but
181 # WITHOUT ANY WARRANTY; without even the implied warranty of
182 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
183 # See the GNU General Public License for more details.
185 # You should have received a copy of the GNU General Public License
186 # along with this program.
187 # If not, see L<http://www.gnu.org/licenses/>.
189 # vim: set fenc=UTF-8 ft=perl fdm=marker ts=4 sw=4 sts=4 et fo+=w :