cksum: ensure appropriate "binary" mode with --untagged
[coreutils.git] / tests / expr / expr-multibyte.pl
blobdb18d673d3f3fe01daa3204bc0410b0899ecabfd
1 #!/usr/bin/perl
2 # Exercise expr with multibyte input
4 # Copyright (C) 2017-2024 Free Software Foundation, Inc.
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 3 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19 use strict;
21 (my $ME = $0) =~ s|.*/||;
23 my $limits = getlimits ();
24 my $UINTMAX_OFLOW = $limits->{UINTMAX_OFLOW};
26 (my $program_name = $0) =~ s|.*/||;
27 my $prog = 'expr';
29 my $locale = $ENV{LOCALE_FR_UTF8};
30 ! defined $locale || $locale eq 'none'
31 and CuSkip::skip "$ME: this test requires FR-UTF8 locale\n";
34 =pod
35 ἔκφρασις (ekphrasis) - "expression" in Ancient Greek.
36 =cut
37 my $expression = "\x{1F14}\x{3BA}\x{3C6}\x{3C1}\x{3B1}\x{3C3}\x{3B9}\x{3C2}";
40 ## NOTE about tests locales:
41 ## Tests starting with 'mb' will have {ENV=>"LC_ALL=$locale"}
42 ## added to them automatically - results are multibyte-aware.
43 ## Tests starting with 'sb' have the same input but will be
44 ## run under C locale and will be treated as single-bytes.
45 ## This enables interleaving C/UTF8 tests
46 ## (for easier comparison of expected results).
48 my @Tests =
50 ### length expressions ###
52 # sanity check
53 ['mb-l1', 'length abcdef', {OUT=>"6"}],
54 ['st-l1', 'length abcdef', {OUT=>"6"}],
56 # A single multibyte character in the beginning of the string
57 # \xCE\xB1 is UTF-8 for "U+03B1 GREEK SMALL LETTER ALPHA"
58 ['mb-l2', "length \xCE\xB1bcdef", {OUT=>"6"}],
59 ['st-l2', "length \xCE\xB1bcdef", {OUT=>"7"}],
61 # A single multibyte character in the middle of the string
62 # \xCE\xB4 is UTF-8 for "U+03B4 GREEK SMALL LETTER DELTA"
63 ['mb-l3', "length abc\xCE\xB4ef", {OUT=>"6"}],
64 ['st-l3', "length abc\xCE\xB4ef", {OUT=>"7"}],
66 # A single multibyte character in the end of the string
67 ['mb-l4', "length fedcb\xCE\xB1", {OUT=>"6"}],
68 ['st-l4', "length fedcb\xCE\xB1", {OUT=>"7"}],
70 # A invalid multibyte sequence
71 ['mb-l5', "length \xB1aaa", {OUT=>"4"}],
72 ['st-l5', "length \xB1aaa", {OUT=>"4"}],
74 # An incomplete multibyte sequence at the end of the string
75 ['mb-l6', "length aaa\xCE", {OUT=>"4"}],
76 ['st-l6', "length aaa\xCE", {OUT=>"4"}],
78 # An incomplete multibyte sequence at the end of the string
79 ['mb-l7', "length $expression", {OUT=>"8"}],
80 ['st-l7', "length $expression", {OUT=>"17"}],
84 ### index expressions ###
86 # sanity check
87 ['mb-i1', 'index abcdef fb', {OUT=>"2"}],
88 ['st-i1', 'index abcdef fb', {OUT=>"2"}],
90 # Search for a single-octet
91 ['mb-i2', "index \xCE\xB1bc\xCE\xB4ef b", {OUT=>"2"}],
92 ['st-i2', "index \xCE\xB1bc\xCE\xB4ef b", {OUT=>"3"}],
93 ['mb-i3', "index \xCE\xB1bc\xCE\xB4ef f", {OUT=>"6"}],
94 ['st-i3', "index \xCE\xB1bc\xCE\xB4ef f", {OUT=>"8"}],
96 # Search for multibyte character.
97 # In the C locale, the search string is treated as two octets.
98 # the first of them (\xCE) matches the first octet of the input string.
99 ['mb-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4", {OUT=>"4"}],
100 ['st-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4", {OUT=>"1"}],
102 # Invalid multibyte sequence in the input string, treated as a single octet.
103 ['mb-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4", {OUT=>"4"}],
104 ['st-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4", {OUT=>"1"}],
106 # Invalid multibyte sequence in the search string, treated as a single octet.
107 # In multibyte locale, there should be no match, expr returns and prints
108 # zero, and terminates with exit-code 1 (as per POSIX).
109 ['mb-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4", {OUT=>"0"}, {EXIT=>1}],
110 ['st-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4", {OUT=>"6"}],
112 # Edge-case: invalid multibyte sequence BOTH in the input string
113 # and in the search string: expr should find a match.
114 ['mb-i7', "index \xCE\xB1bc\xB4ef \xB4", {OUT=>"4"}],
117 ### substr expressions ###
119 # sanity check
120 ['mb-s1', 'substr abcdef 2 3', {OUT=>"bcd"}],
121 ['st-s1', 'substr abcdef 2 3', {OUT=>"bcd"}],
123 ['mb-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1", {OUT=>"\xCE\xB1"}],
124 ['st-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1", {OUT=>"\xCE"}],
126 ['mb-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2", {OUT=>"c\xCE\xB4"}],
127 ['st-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2", {OUT=>"bc"}],
129 ['mb-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1", {OUT=>"\xCE\xB4"}],
130 ['st-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1", {OUT=>"c"}],
132 ['mb-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2", {OUT=>"\xCE\xB4e"}],
133 ['st-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2", {OUT=>"c\xCE"}],
135 ['mb-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1", {OUT=>"f"}],
136 ['st-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1", {OUT=>"\xB4"}],
138 ['mb-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1", {OUT=>""}, {EXIT=>1}],
139 ['st-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1", {OUT=>"e"}],
141 # Invalid multibyte sequences
142 ['mb-s8', "substr \xCE\xB1bc\xB4ef 3 3", {OUT=>"c\xB4e"}],
143 ['st-s8', "substr \xCE\xB1bc\xB4ef 3 3", {OUT=>"bc\xB4"}],
146 ### match expressions ###
148 # sanity check
149 ['mb-m1', 'match abcdef ab', {OUT=>"2"}],
150 ['st-m1', 'match abcdef ab', {OUT=>"2"}],
151 ['mb-m2', 'match abcdef "\(ab\)"', {OUT=>"ab"}],
152 ['st-m2', 'match abcdef "\(ab\)"', {OUT=>"ab"}],
154 # The regex engine should match the '.' to the first multibyte character.
155 ['mb-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"3"}],
156 ['st-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"0"}, {EXIT=>1}],
158 # The opposite of the previous test: two dots should only match
159 # the two octets in single-byte locale.
160 ['mb-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"0"}, {EXIT=>1}],
161 ['st-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"4"}],
163 # Match with grouping - a single dot should return the two octets
164 ['mb-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>"\xCE\xB1b"}],
165 ['st-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>""}, {EXIT=>1}],
167 # Invalid multibyte sequences - regex should not match in multibyte locale
168 # (POSIX requirement)
169 ['mb-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>""}, {EXIT=>1}],
170 ['st-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>"\xCE"}],
173 # Character classes: in the multibyte case, the regex engine understands
174 # there is a single multibyte character in the brackets.
175 # In the single byte case, the regex engine sees two octets in the character
176 # class ('\xCE' and '\xB1') - and it matches the first one.
177 ['mb-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE\xB1"}],
178 ['st-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE"}],
183 # Append a newline to end of each expected 'OUT' string.
184 my $t;
185 foreach $t (@Tests)
187 my $arg1 = $t->[1];
188 my $e;
189 foreach $e (@$t)
191 $e->{OUT} .= "\n"
192 if ref $e eq 'HASH' and exists $e->{OUT};
197 # Force multibyte locale in all tests.
199 # NOTE about the ERR_SUBST:
200 # The error tests above (e1/e2/e3/e4) expect error messages in C locale
201 # having single-quote character (ASCII 0x27).
202 # In UTF-8 locale, the error messages will use:
203 # 'LEFT SINGLE QUOTATION MARK' (U+2018) (UTF8: 0xE2 0x80 0x98)
204 # 'RIGHT SINGLE QUOTATION MARK' (U+2019) (UTF8: 0xE2 0x80 0x99)
205 # So we replace them with ascii single-quote and the results will
206 # match the expected error string.
207 if ($locale ne 'C')
209 my @new;
210 foreach my $t (@Tests)
212 my ($tname) = @$t;
213 if ($tname =~ /^mb/)
215 push @$t, ({ENV => "LC_ALL=$locale"},
216 {ERR_SUBST => "s/\xe2\x80[\x98\x99]/'/g"});
222 my $save_temps = $ENV{DEBUG};
223 my $verbose = $ENV{VERBOSE};
225 my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
226 exit $fail;