tests/expr/expr-multibyte.pl

   1 #!/usr/bin/perl
   2 # Exercise expr with multibyte input
   3
   4 # Copyright (C) 2017-2024 Free Software Foundation, Inc.
   5
   6 # This program is free software: you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation, either version 3 of the License, or
   9 # (at your option) any later version.
  10
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
  18
  19 use strict;
  20
  21 (my $ME = $0) =~ s|.*/||;
  22
  23 my $limits = getlimits ();
  24 my $UINTMAX_OFLOW = $limits->{UINTMAX_OFLOW};
  25
  26 (my $program_name = $0) =~ s|.*/||;
  27 my $prog = 'expr';
  28
  29 my $locale = $ENV{LOCALE_FR_UTF8};
  30 ! defined $locale || $locale eq 'none'
  31   and CuSkip::skip "$ME: this test requires FR-UTF8 locale\n";
  32
  33
  34 =pod
  35 ἔκφρασις (ekphrasis) - "expression" in Ancient Greek.
  36 =cut
  37 my $expression = "\x{1F14}\x{3BA}\x{3C6}\x{3C1}\x{3B1}\x{3C3}\x{3B9}\x{3C2}";
  38
  39
  40 ## NOTE about tests locales:
  41 ## Tests starting with 'mb' will have {ENV=>"LC_ALL=$locale"}
  42 ## added to them automatically - results are multibyte-aware.
  43 ## Tests starting with 'sb' have the same input but will be
  44 ## run under C locale and will be treated as single-bytes.
  45 ## This enables interleaving C/UTF8 tests
  46 ## (for easier comparison of expected results).
  47
  48 my @Tests =
  49   (
  50    ### length expressions ###
  51
  52    # sanity check
  53    ['mb-l1', 'length abcdef',   {OUT=>"6"}],
  54    ['st-l1', 'length abcdef',   {OUT=>"6"}],
  55
  56    # A single multibyte character in the beginning of the string
  57    # \xCE\xB1 is UTF-8 for "U+03B1 GREEK SMALL LETTER ALPHA"
  58    ['mb-l2', "length \xCE\xB1bcdef",   {OUT=>"6"}],
  59    ['st-l2', "length \xCE\xB1bcdef",   {OUT=>"7"}],
  60
  61    # A single multibyte character in the middle of the string
  62    # \xCE\xB4 is UTF-8 for "U+03B4 GREEK SMALL LETTER DELTA"
  63    ['mb-l3', "length abc\xCE\xB4ef",   {OUT=>"6"}],
  64    ['st-l3', "length abc\xCE\xB4ef",   {OUT=>"7"}],
  65
  66    # A single multibyte character in the end of the string
  67    ['mb-l4', "length fedcb\xCE\xB1",   {OUT=>"6"}],
  68    ['st-l4', "length fedcb\xCE\xB1",   {OUT=>"7"}],
  69
  70    # A invalid multibyte sequence
  71    ['mb-l5', "length \xB1aaa",   {OUT=>"4"}],
  72    ['st-l5', "length \xB1aaa",   {OUT=>"4"}],
  73
  74    # An incomplete multibyte sequence at the end of the string
  75    ['mb-l6', "length aaa\xCE",   {OUT=>"4"}],
  76    ['st-l6', "length aaa\xCE",   {OUT=>"4"}],
  77
  78    # An incomplete multibyte sequence at the end of the string
  79    ['mb-l7', "length $expression",   {OUT=>"8"}],
  80    ['st-l7', "length $expression",   {OUT=>"17"}],
  81
  82
  83
  84    ### index expressions ###
  85
  86    # sanity check
  87    ['mb-i1', 'index abcdef fb',   {OUT=>"2"}],
  88    ['st-i1', 'index abcdef fb',   {OUT=>"2"}],
  89
  90    # Search for a single-octet
  91    ['mb-i2', "index \xCE\xB1bc\xCE\xB4ef b",   {OUT=>"2"}],
  92    ['st-i2', "index \xCE\xB1bc\xCE\xB4ef b",   {OUT=>"3"}],
  93    ['mb-i3', "index \xCE\xB1bc\xCE\xB4ef f",   {OUT=>"6"}],
  94    ['st-i3', "index \xCE\xB1bc\xCE\xB4ef f",   {OUT=>"8"}],
  95
  96    # Search for multibyte character.
  97    # In the C locale, the search string is treated as two octets.
  98    # the first of them (\xCE) matches the first octet of the input string.
  99    ['mb-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4",   {OUT=>"4"}],
 100    ['st-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4",   {OUT=>"1"}],
 101
 102    # Invalid multibyte sequence in the input string, treated as a single octet.
 103    ['mb-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4",   {OUT=>"4"}],
 104    ['st-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4",   {OUT=>"1"}],
 105
 106    # Invalid multibyte sequence in the search string, treated as a single octet.
 107    # In multibyte locale, there should be no match, expr returns and prints
 108    # zero, and terminates with exit-code 1 (as per POSIX).
 109    ['mb-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4",   {OUT=>"0"}, {EXIT=>1}],
 110    ['st-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4",   {OUT=>"6"}],
 111
 112    # Edge-case: invalid multibyte sequence BOTH in the input string
 113    # and in the search string: expr should find a match.
 114    ['mb-i7', "index \xCE\xB1bc\xB4ef \xB4",       {OUT=>"4"}],
 115
 116
 117    ### substr expressions ###
 118
 119    # sanity check
 120    ['mb-s1', 'substr abcdef 2 3',   {OUT=>"bcd"}],
 121    ['st-s1', 'substr abcdef 2 3',   {OUT=>"bcd"}],
 122
 123    ['mb-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1",   {OUT=>"\xCE\xB1"}],
 124    ['st-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1",   {OUT=>"\xCE"}],
 125
 126    ['mb-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2",   {OUT=>"c\xCE\xB4"}],
 127    ['st-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2",   {OUT=>"bc"}],
 128
 129    ['mb-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1",   {OUT=>"\xCE\xB4"}],
 130    ['st-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1",   {OUT=>"c"}],
 131
 132    ['mb-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2",   {OUT=>"\xCE\xB4e"}],
 133    ['st-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2",   {OUT=>"c\xCE"}],
 134
 135    ['mb-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1",   {OUT=>"f"}],
 136    ['st-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1",   {OUT=>"\xB4"}],
 137
 138    ['mb-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1",   {OUT=>""}, {EXIT=>1}],
 139    ['st-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1",   {OUT=>"e"}],
 140
 141    # Invalid multibyte sequences
 142    ['mb-s8', "substr \xCE\xB1bc\xB4ef 3 3",   {OUT=>"c\xB4e"}],
 143    ['st-s8', "substr \xCE\xB1bc\xB4ef 3 3",   {OUT=>"bc\xB4"}],
 144
 145
 146    ### match expressions ###
 147
 148    # sanity check
 149    ['mb-m1', 'match abcdef ab',   {OUT=>"2"}],
 150    ['st-m1', 'match abcdef ab',   {OUT=>"2"}],
 151    ['mb-m2', 'match abcdef "\(ab\)"',   {OUT=>"ab"}],
 152    ['st-m2', 'match abcdef "\(ab\)"',   {OUT=>"ab"}],
 153
 154    # The regex engine should match the '.' to the first multibyte character.
 155    ['mb-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"3"}],
 156    ['st-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"0"}, {EXIT=>1}],
 157
 158    # The opposite of the previous test: two dots should only match
 159    # the two octets in single-byte locale.
 160    ['mb-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"0"}, {EXIT=>1}],
 161    ['st-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"4"}],
 162
 163    # Match with grouping - a single dot should return the two octets
 164    ['mb-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>"\xCE\xB1b"}],
 165    ['st-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>""}, {EXIT=>1}],
 166
 167    # Invalid multibyte sequences - regex should not match in multibyte locale
 168    # (POSIX requirement)
 169    ['mb-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>""}, {EXIT=>1}],
 170    ['st-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>"\xCE"}],
 171
 172
 173    # Character classes: in the multibyte case, the regex engine understands
 174    # there is a single multibyte character in the brackets.
 175    # In the single byte case, the regex engine sees two octets in the character
 176    # class ('\xCE' and '\xB1') - and it matches the first one.
 177    ['mb-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE\xB1"}],
 178    ['st-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE"}],
 179
 180   );
 181
 182
 183 # Append a newline to end of each expected 'OUT' string.
 184 my $t;
 185 foreach $t (@Tests)
 186   {
 187     my $arg1 = $t->[1];
 188     my $e;
 189     foreach $e (@$t)
 190       {
 191         $e->{OUT} .= "\n"
 192           if ref $e eq 'HASH' and exists $e->{OUT};
 193       }
 194   }
 195
 196
 197 # Force multibyte locale in all tests.
 198 #
 199 # NOTE about the ERR_SUBST:
 200 # The error tests above (e1/e2/e3/e4) expect error messages in C locale
 201 # having single-quote character (ASCII 0x27).
 202 # In UTF-8 locale, the error messages will use:
 203 #  'LEFT SINGLE QUOTATION MARK'  (U+2018) (UTF8: 0xE2 0x80 0x98)
 204 #  'RIGHT SINGLE QUOTATION MARK' (U+2019) (UTF8: 0xE2 0x80 0x99)
 205 # So we replace them with ascii single-quote and the results will
 206 # match the expected error string.
 207 if ($locale ne 'C')
 208   {
 209     my @new;
 210     foreach my $t (@Tests)
 211       {
 212         my ($tname) = @$t;
 213         if ($tname =~ /^mb/)
 214           {
 215             push @$t, ({ENV => "LC_ALL=$locale"},
 216                        {ERR_SUBST => "s/\xe2\x80[\x98\x99]/'/g"});
 217           }
 218       }
 219   }
 220
 221
 222 my $save_temps = $ENV{DEBUG};
 223 my $verbose = $ENV{VERBOSE};
 224
 225 my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
 226 exit $fail;