t/op/stringu.t

   1 #!perl
   2 # Copyright (C) 2001-2009, Parrot Foundation.
   3 # $Id$
   4
   5 use strict;
   6 use warnings;
   7 use lib qw( . lib ../lib ../../lib );
   8 use Test::More;
   9 use Parrot::Test tests => 34;
  10 use Parrot::Config;
  11
  12 =head1 NAME
  13
  14 t/op/stringu.t - Unicode String Test
  15
  16 =head1 SYNOPSIS
  17
  18         % prove t/op/stringu.t
  19
  20 =head1 DESCRIPTION
  21
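  22 Tests Parrot's Unicode string system: escape sequences in string literals, UTF-8 output, character classes and operations on strings of mixed encodings.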
  23
  24 =cut
  25
  26 pir_output_is( <<'CODE', <<OUTPUT, "angstrom" );    # chr 0x212B printed via utf8 stdout; <<OUTPUT interpolates \x..
  27 .include 'stdio.pasm'
  28 .sub main :main
  29     $P0 = getinterp
  30     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
  31     $P1.'encoding'("utf8")
  32     chr $S0, 0x212B
  33     print $S0
  34     print "\n"
  35     end
  36 .end
  37 CODE
  38 \xe2\x84\xab
  39 OUTPUT
  40
  41 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom" );    # \x{212b} escape alone in a unicode: literal
  42 .include 'stdio.pasm'
  43 .sub main :main
  44     $P0 = getinterp
  45     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
  46     $P1.'encoding'("utf8")
  47     set $S0, unicode:"\x{212b}"
  48     print $S0
  49     print "\n"
  50     end
  51 .end
  52 CODE
  53 \xe2\x84\xab
  54 OUTPUT
  55
  56 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 2" );    # \x{212b} escape after ASCII text
  57 .include 'stdio.pasm'
  58 .sub main :main
  59     $P0 = getinterp
  60     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
  61     $P1.'encoding'("utf8")
  62     set $S0, unicode:"aaaaaa\x{212b}"
  63     print $S0
  64     print "\n"
  65     end
  66 .end
  67 CODE
  68 aaaaaa\xe2\x84\xab
  69 OUTPUT
  70
  71 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 3" );    # \x{212b} escape between two ASCII runs
  72 .include 'stdio.pasm'
  73 .sub main :main
  74     $P0 = getinterp
  75     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
  76     $P1.'encoding'("utf8")
  77     set $S0, unicode:"aaaaaa\x{212b}-aaaaaa"
  78     print $S0
  79     print "\n"
  80     end
  81 .end
  82 CODE
  83 aaaaaa\xe2\x84\xab-aaaaaa
  84 OUTPUT
  85
  86 pir_output_is( <<'CODE', <<OUTPUT, 'escaped angstrom 3 \uhhhh' );    # same string, written with a \u212b escape
  87 .include 'stdio.pasm'
  88 .sub main :main
  89     $P0 = getinterp
  90     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
  91     $P1.'encoding'("utf8")
  92     set $S0, unicode:"aaaaaa\u212b-aaaaaa"
  93     print $S0
  94     print "\n"
  95     end
  96 .end
  97 CODE
  98 aaaaaa\xe2\x84\xab-aaaaaa
  99 OUTPUT
 100
 101 pir_output_is( <<'CODE', <<OUTPUT, "MATHEMATICAL BOLD CAPITAL A" );    # \x{1d400} is expected as four output bytes
 102 .include 'stdio.pasm'
 103 .sub main :main
 104     $P0 = getinterp
 105     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
 106     $P1.'encoding'("utf8")
 107     set $S0, unicode:"aaaaaa\x{1d400}-aaaaaa"
 108     print $S0
 109     print "\n"
 110     end
 111 .end
 112 CODE
 113 aaaaaa\xf0\x9d\x90\x80-aaaaaa
 114 OUTPUT
 115
 116 pir_output_is( <<'CODE', <<OUTPUT, 'MATHEMATICAL BOLD CAPITAL A \U' );    # same string, written with \U0001d400
 117 .include 'stdio.pasm'
 118 .sub main :main
 119     $P0 = getinterp
 120     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
 121     $P1.'encoding'("utf8")
 122     set $S0, unicode:"aaaaaa\U0001d400-aaaaaa"
 123     print $S0
 124     print "\n"
 125     end
 126 .end
 127 CODE
 128 aaaaaa\xf0\x9d\x90\x80-aaaaaa
 129 OUTPUT
 130
 131 pir_output_is( <<'CODE', <<OUTPUT, "two upscales" );    # \x{212b} then \x{1d400}; length is 22 characters
 132 .include 'stdio.pasm'
 133 .sub main :main
 134     $P0 = getinterp
 135     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
 136     $P1.'encoding'("utf8")
 137     set $S0, unicode:"aaaaaa\x{212b}-bbbbbb\x{1d400}-cccccc"
 138     print $S0
 139     print "\n"
 140     length $I0, $S0
 141     print $I0
 142     print "\n"
 143     end
 144 .end
 145 CODE
 146 aaaaaa\xe2\x84\xab-bbbbbb\xf0\x9d\x90\x80-cccccc
 147 22
 148 OUTPUT
 149
 150 pir_output_is( <<'CODE', <<OUTPUT, "two upscales - don't downscale" );    # same escapes in the opposite order
 151 .include 'stdio.pasm'
 152 .sub main :main
 153     $P0 = getinterp
 154     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
 155     $P1.'encoding'("utf8")
 156     set $S0, unicode:"aaaaaa\x{1d400}-bbbbbb\x{212b}-cccccc"
 157     print $S0
 158     print "\n"
 159     length $I0, $S0
 160     print $I0
 161     print "\n"
 162     end
 163 .end
 164 CODE
 165 aaaaaa\xf0\x9d\x90\x80-bbbbbb\xe2\x84\xab-cccccc
 166 22
 167 OUTPUT
 168
 169 pir_output_is( <<'CODE', <<OUTPUT, '\cX, \ooo' );    # \cJ, \012, \12, \x0a and \xa are each expected to print a newline
 170 .include 'stdio.pasm'
 171 .sub main :main
 172     $P0 = getinterp
 173     $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
 174     $P1.'encoding'("utf8")
 175     set $S0, "ok 1\cJ"
 176     print $S0
 177     set $S0, "ok 2\012"
 178     print $S0
 179     set $S0, "ok 3\12"
 180     print $S0
 181     set $S0, "ok 4\x0a"
 182     print $S0
 183     set $S0, "ok 5\xa"
 184     print $S0
 185     end
 186 .end
 187 CODE
 188 ok 1
 189 ok 2
 190 ok 3
 191 ok 4
 192 ok 5
 193 OUTPUT
 194
 195 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u' );    # \u with no hex digits after it
 196     set S0, "x\uy"
 197     print "never\n"
 198     end
 199 CODE
 200 /Illegal escape sequence in/
 201 OUTPUT
 202
 203 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u123' );    # \u followed by only three hex digits
 204     set S0, "x\u123y"
 205     print "never\n"
 206     end
 207 CODE
 208 /Illegal escape sequence in/
 209 OUTPUT
 210
 211 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \U123' );    # \U followed by only three hex digits
 212     set S0, "x\U123y"
 213     print "never\n"
 214     end
 215 CODE
 216 /Illegal escape sequence in/
 217 OUTPUT
 218
 219 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \x' );    # \x with no hex digits after it
 220     set S0, "x\xy"
 221     print "never\n"
 222     end
 223 CODE
 224 /Illegal escape sequence in/
 225 OUTPUT
 226
 227 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );    # literal « in the source: length 1, printed as \xc2\xab
 228     set S0, utf8:unicode:"«"
 229     length I0, S0
 230     print I0
 231     print "\n"
 232     print S0
 233     print "\n"
 234     end
 235 CODE
 236 1
 237 \xc2\xab
 238 OUTPUT
 239
 240 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals - escaped bytes" );    # « written as the escaped bytes \xc2\xab
 241     set S0, utf8:unicode:"\xc2\xab"
 242     length I0, S0
 243     print I0
 244     print "\n"
 245     print S0
 246     print "\n"
 247     end
 248 CODE
 249 1
 250 \xc2\xab
 251 OUTPUT
 252
 253 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );    # bytes \xf2\xab must be rejected as malformed
 254     set S0, utf8:unicode:"\xf2\xab"
 255     length I0, S0
 256     print I0
 257     print "\n"
 258     print S0
 259     print "\n"
 260     end
 261 CODE
 262 /Malformed UTF-8 string/
 263 OUTPUT
 264
 265 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );    # literal « inside an ascii: string must error
 266     set S0, ascii:"«"
 267     length I0, S0
 268     print I0
 269     print "\n"
 270     end
 271 CODE
 272 /Malformed string/
 273 OUTPUT
 274
 275 pasm_output_is( <<'CODE', <<OUTPUT, "substr with a UTF8 replacement #36794" );    # replace 5 chars at offset 10 with chr 0x666
 276     set S0, "AAAAAAAAAA\\u666"
 277     set I0, 0x666
 278     chr S1, I0
 279     replace S0, S0, 10, 5, S1
 280     print S0
 281     print "\n"
 282     end
 283 CODE
 284 AAAAAAAAAA\xd9\xa6
 285 OUTPUT
 286
 287 SKIP: {
 288     skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};    # 3 = number of tests in this block
 289     pir_output_is( <<'CODE', <<OUTPUT, "downcase changes string behind scenes" );    # substr of str is the same before and after downcase
 290 .sub main
 291     .local string str
 292     .local string rest
 293
 294     str = unicode:".xyz"
 295     rest = substr str, 1
 296     print rest
 297     print "\n"
 298
 299     str = unicode:".xyz"
 300     $S99 = downcase str
 301     rest = substr str, 1
 302     print rest
 303     print "\n"
 304
 305 .end
 306 CODE
 307 xyz
 308 xyz
 309 OUTPUT
 310
 311     pir_output_is( <<'CODE', <<OUTPUT, "downcase asciish" );    # ".XYZ" is expected to downcase to ".xyz"
 312 .sub main
 313     .local string str
 314     .local string rest
 315     str = unicode:".XYZ"
 316     $S0 = downcase str
 317     print $S0
 318     print "\n"
 319 .end
 320 CODE
 321 .xyz
 322 OUTPUT
 323
 324     # escape does not produce utf8, just a raw sequence of chars; NOTE(review): name says utf16 but the code transcodes to utf8 - confirm
 325     pir_output_is( <<"CODE", <<'OUTPUT', "escape utf16" );    # <<"CODE" interpolates (\xf6, \\n); <<'OUTPUT' is taken literally
 326 .sub main
 327     .local string s, t
 328     .local int i
 329     s = iso-8859-1:"T\xf6tsch"
 330     i = find_encoding "utf8"
 331     s = trans_encoding s, i
 332     t = upcase s
 333     escape t, t
 334     print t
 335     print "\\n"
 336 .end
 337 CODE
 338 T\x{d6}TSCH
 339 OUTPUT
 340 }
 341
 342 # Tests for .CCLASS_WHITESPACE: is_cclass, find_cclass and find_not_cclass on a 9-character unicode: string
 343 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_WHITESPACE in unicode" );    # the seven printed results run together on one line
 344 .sub main
 345     .include 'cclass.pasm'
 346     .local string s
 347     s = unicode:" \t\u207babc\n\u2000\u2009"
 348     $I9 = length s
 349     $I0 = is_cclass .CCLASS_WHITESPACE, s, 0
 350     print $I0
 351     $I0 = is_cclass .CCLASS_WHITESPACE, s, 1
 352     print $I0
 353     $I0 = is_cclass .CCLASS_WHITESPACE, s, 2
 354     print $I0
 355     $I0 = find_not_cclass .CCLASS_WHITESPACE, s, 0, $I9
 356     print $I0
 357     $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
 358     print $I0
 359     $I0 = find_cclass .CCLASS_WHITESPACE, s, $I0, $I9
 360     print $I0
 361     $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
 362     print $I0
 363     print "\n"
 364 .end
 365 CODE
 366 1102269
 367 OUTPUT
 368
 369 # Tests for .CCLASS_ANY on the same string, including an is_cclass probe at offset == length
 370 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_ANY in unicode" );    # the eight printed results run together on one line
 371 .sub main
 372     .include 'cclass.pasm'
 373     .local string s
 374     s = unicode:" \t\u207babc\n\u2000\u2009"
 375     $I9 = length s
 376     $I0 = is_cclass .CCLASS_ANY, s, 0
 377     print $I0
 378     $I0 = is_cclass .CCLASS_ANY, s, 1
 379     print $I0
 380     $I0 = is_cclass .CCLASS_ANY, s, 2
 381     print $I0
 382     $I0 = is_cclass .CCLASS_ANY, s, $I9
 383     print $I0
 384     $I0 = find_not_cclass .CCLASS_ANY, s, 0, $I9
 385     print $I0
 386     $I0 = find_not_cclass .CCLASS_ANY, s, $I0, $I9
 387     print $I0
 388     $I0 = find_cclass .CCLASS_ANY, s, $I0, $I9
 389     print $I0
 390     $I0 = find_cclass .CCLASS_ANY, s, 2, $I9
 391     print $I0
 392     print "\n"
 393 .end
 394 CODE
 395 11109992
 396 OUTPUT
 397
 398 SKIP: {
 399     skip "Tests seem to fail on big endian machines with icu", 2 if $PConfig{byteorder} eq '4321';
 400
 401     # Tests for .CCLASS_NUMERIC: ASCII digits, a non-digit run, then three \u escapes expected to count as numeric
 402     pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_NUMERIC in unicode" );    # the seven printed results run together on one line
 403 .sub main
 404     .include 'cclass.pasm'
 405     .local string s
 406     s = unicode:"01\u207bxyz\u0660\u17e1\u19d9"
 407     $I9 = length s
 408     $I0 = is_cclass .CCLASS_NUMERIC, s, 0
 409     print $I0
 410     $I0 = is_cclass .CCLASS_NUMERIC, s, 1
 411     print $I0
 412     $I0 = is_cclass .CCLASS_NUMERIC, s, 2
 413     print $I0
 414     $I0 = find_not_cclass .CCLASS_NUMERIC, s, 0, $I9
 415     print $I0
 416     $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
 417     print $I0
 418     $I0 = find_cclass .CCLASS_NUMERIC, s, $I0, $I9
 419     print $I0
 420     $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
 421     print $I0
 422     print "\n"
 423 .end
 424 CODE
 425 1102269
 426 OUTPUT
 427
 428     # Concatenate a unicode: string with ascii:, unicode: and iso-8859-1: strings in turn
 429     pir_output_is(
 430         <<'CODE', <<"OUTPUT", "Concat unicode with iso-8859-1" );    # each of the three concatenations must print "AB"
 431 .sub main
 432     $S0 = unicode:"A"
 433     $S1 = ascii:"B"
 434     $S2 = concat $S0, $S1
 435     print $S2
 436     print "\n"
 437
 438     $S0 = unicode:"A"
 439     $S1 = unicode:"B"
 440     $S2 = concat $S0, $S1
 441     print $S2
 442     print "\n"
 443
 444     $S0 = unicode:"A"
 445     $S1 = iso-8859-1:"B"
 446     $S2 = concat $S0, $S1
 447     print $S2
 448     print "\n"
 449 .end
 450 CODE
 451 AB
 452 AB
 453 AB
 454 OUTPUT
 455 }
 456
 457 pir_output_is( <<'CODE', <<OUTPUT, "UTF-8 and Unicode hash keys");    # unicode: and iso-8859-1: keys must compare equal and fetch the same value
 458 .sub 'main'
 459     .local string str0, str1
 460     str0 = unicode:"\u00ab"
 461     str1 = iso-8859-1:"\xab"
 462
 463     .local pmc hash
 464     hash = new 'Hash'
 465     hash[str0] = 'hello'
 466
 467     $I0 = iseq str0, str1
 468     say $I0
 469
 470     $S0 = hash[str0]
 471     $S1 = hash[str1]
 472     $I0 = iseq $S0, $S1
 473     say $I0
 474     say $S0
 475     say $S1
 476 .end
 477 CODE
 478 1
 479 1
 480 hello
 481 hello
 482 OUTPUT
 483
 484 pir_output_is( <<'CODE', <<OUTPUT, "UTF-8 and Unicode hash keys, full bucket" );    # same check after adding 200 more 'infix:N' keys
 485 .sub 'main'
 486     .local string str0, str1
 487     str0 = unicode:"infix:\u00b1"
 488     str1 = iso-8859-1:"infix:\xb1"
 489
 490     .local pmc hash
 491     hash = new 'Hash'
 492     hash[str0] = 'hello'
 493
 494     $I0 = 0
 495   fill_loop:
 496     unless $I0 < 200 goto fill_done
 497     inc $I0
 498     $S0 = $I0
 499     $S0 = concat 'infix:', $S0
 500     hash[$S0] = 'foo'
 501     goto fill_loop
 502   fill_done:
 503
 504     $I0 = iseq str0, str1
 505     #print "iseq str0, str1               => "
 506     say $I0
 507
 508     $S0 = hash[str0]
 509     $S1 = hash[str1]
 510     $I0 = iseq $S0, $S1
 511     #print "iseq hash[str0], hash[str1]   => "
 512     say $I0
 513     say $S0
 514     say $S1
 515 .end
 516 CODE
 517 1
 518 1
 519 hello
 520 hello
 521 OUTPUT
 522
 523
 524 SKIP: {
 525     skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};    # 3 = number of tests in this block
 526 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to int' );    # "140" -> 140 before and after trans_encoding to ucs2
 527 .sub main :main
 528      $S0 = "140"
 529      $I0 = $S0
 530      say $I0
 531      $I0 = find_encoding 'ucs2'
 532      $S0 = trans_encoding $S0, $I0
 533      $I0 = $S0
 534      say $I0
 535 .end
 536 CODE
 537 140
 538 140
 539 OUT
 540
 541 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to float' );    # same check, assigning to a num register
 542 .sub main :main
 543      $S0 = "140"
 544      $N0 = $S0
 545      say $N0
 546      $I0 = find_encoding 'ucs2'
 547      $S0 = trans_encoding $S0, $I0
 548      $N0 = $S0
 549      say $N0
 550 .end
 551 CODE
 552 140
 553 140
 554 OUT
 555
 556 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings float mixed' );    # digits followed by non-ASCII text still give 140
 557 .sub main :main
 558     $S0 = unicode:"140 r\x{e9}sum\x{e9}s"
 559     $N0 = $S0
 560     say $N0
 561     $I0 = find_encoding 'ucs2'
 562     $S0 = trans_encoding $S0, $I0
 563     $N0 = $S0
 564     say $N0
 565 .end
 566 CODE
 567 140
 568 140
 569 OUT
 570 }
 571
 572 pir_output_is( <<'CODE', <<'OUT', 'concatenation of utf8 and iso-8859-1 (TT #752)' );    # concat of chr 0xe5 and chr 0x263b must equal the literal, in both orders
 573 .sub 'main'
 574
 575     $S1 = chr 0xe5
 576     $S2 = chr 0x263b
 577
 578     $S0 = unicode:"\u00e5\u263b"
 579     $S3 = concat $S1, $S2
 580     if $S0 == $S3 goto equal_1
 581     print "not "
 582   equal_1:
 583     say "equal"
 584
 585     $S0 = unicode:"\u263b\u00e5"
 586     $S3 = concat $S2, $S1
 587     if $S0 == $S3 goto equal_2
 588     print "not "
 589   equal_2:
 590     say "equal"
 591 .end
 592 CODE
 593 equal
 594 equal
 595 OUT
 596
 597 pir_output_is( <<'CODE', <<'OUT', 'join mixed encodings' );    # joining three one-character strings must give length 3
 598 .sub 'main'
 599     new $P0, 'ResizablePMCArray'
 600     push $P0, ascii:"a"
 601     push $P0, unicode:"\x{e1}" # a acute
 602     push $P0, iso-8859-1:"\x{e1}" # a acute
 603     join $S0, "", $P0
 604     $I0 = length $S0
 605     say $I0
 606 .end
 607 CODE
 608 3
 609 OUT
 610
 611 SKIP: {
 612     skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};    # 1 = number of tests in this block
 613
 614 pir_output_is( <<'CODE', <<'OUT', 'find_codepoint opcode (experimental)');    # unknown name is expected to give -1, known names their codepoint
 615 .sub 'main'
 616     $I1 = find_codepoint 'THISISNOTTHENAMEOFNOTHING'
 617     say $I1
 618
 619     .const string cpf = "0x%04x"
 620     $P0 = new 'FixedIntegerArray', 1
 621     $I0 = find_codepoint 'LATIN CAPITAL LETTER C'
 622     $P0[0] = $I0
 623     $S0 = sprintf cpf, $P0
 624     say $S0
 625     $I0 = find_codepoint 'MUSIC FLAT SIGN'
 626     $P0[0] = $I0
 627     $S0 = sprintf cpf, $P0
 628     say $S0
 629     $I0 = find_codepoint 'RECYCLING SYMBOL FOR TYPE-1 PLASTICS'
 630     $P0[0] = $I0
 631     $S0 = sprintf cpf, $P0
 632     say $S0
 633 .end
 634 CODE
 635 -1
 636 0x0043
 637 0x266d
 638 0x2673
 639 OUT
 640 }
 641
 642 # Local Variables:
 643 #   mode: cperl
 644 #   cperl-indent-level: 4
 645 #   fill-column: 100
 646 # End:
 647 # vim: expandtab shiftwidth=4: