2 # Copyright (C) 2001-2009, Parrot Foundation.
7 use lib qw( . lib ../lib ../../lib );
9 use Parrot::Test tests => 34;
14 t/op/stringu.t - Unicode String Test
18 % prove t/op/stringu.t
22 Tests Parrot's Unicode string system.
26 pir_output_is( <<'CODE', <<OUTPUT, "angstrom" );
30 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
31 $P1.'encoding'("utf8")
41 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom" );
45 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
46 $P1.'encoding'("utf8")
47 set $S0, unicode:"\x{212b}"
56 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 2" );
60 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
61 $P1.'encoding'("utf8")
62 set $S0, unicode:"aaaaaa\x{212b}"
71 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 3" );
75 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
76 $P1.'encoding'("utf8")
77 set $S0, unicode:"aaaaaa\x{212b}-aaaaaa"
83 aaaaaa\xe2\x84\xab-aaaaaa
86 pir_output_is( <<'CODE', <<OUTPUT, 'escaped angstrom 3 \uhhhh' );
90 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
91 $P1.'encoding'("utf8")
92 set $S0, unicode:"aaaaaa\u212b-aaaaaa"
98 aaaaaa\xe2\x84\xab-aaaaaa
101 pir_output_is( <<'CODE', <<OUTPUT, "MATHEMATICAL BOLD CAPITAL A" );
102 .include 'stdio.pasm'
105 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
106 $P1.'encoding'("utf8")
107 set $S0, unicode:"aaaaaa\x{1d400}-aaaaaa"
113 aaaaaa\xf0\x9d\x90\x80-aaaaaa
116 pir_output_is( <<'CODE', <<OUTPUT, 'MATHEMATICAL BOLD CAPITAL A \U' );
117 .include 'stdio.pasm'
120 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
121 $P1.'encoding'("utf8")
122 set $S0, unicode:"aaaaaa\U0001d400-aaaaaa"
128 aaaaaa\xf0\x9d\x90\x80-aaaaaa
131 pir_output_is( <<'CODE', <<OUTPUT, "two upscales" );
132 .include 'stdio.pasm'
135 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
136 $P1.'encoding'("utf8")
137 set $S0, unicode:"aaaaaa\x{212b}-bbbbbb\x{1d400}-cccccc"
146 aaaaaa\xe2\x84\xab-bbbbbb\xf0\x9d\x90\x80-cccccc
150 pir_output_is( <<'CODE', <<OUTPUT, "two upscales - don't downscale" );
151 .include 'stdio.pasm'
154 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
155 $P1.'encoding'("utf8")
156 set $S0, unicode:"aaaaaa\x{1d400}-bbbbbb\x{212b}-cccccc"
165 aaaaaa\xf0\x9d\x90\x80-bbbbbb\xe2\x84\xab-cccccc
169 pir_output_is( <<'CODE', <<OUTPUT, '\cX, \ooo' );
170 .include 'stdio.pasm'
173 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
174 $P1.'encoding'("utf8")
195 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u' );
200 /Illegal escape sequence in/
203 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u123' );
208 /Illegal escape sequence in/
211 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \U123' );
216 /Illegal escape sequence in/
219 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \x' );
224 /Illegal escape sequence in/
227 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
228 set S0, utf8:unicode:"«"
240 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
241 set S0, utf8:unicode:"\xc2\xab"
253 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );
254 set S0, utf8:unicode:"\xf2\xab"
262 /Malformed UTF-8 string/
265 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );
275 pasm_output_is( <<'CODE', <<OUTPUT, "substr with a UTF8 replacement #36794" );
276 set S0, "AAAAAAAAAA\\u666"
279 replace S0, S0, 10, 5, S1
288 skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
289 pir_output_is( <<'CODE', <<OUTPUT, "downcase changes string behind scenes" );
311 pir_output_is( <<'CODE', <<OUTPUT, "downcase asciish" );
324 # escape does not produce utf8, just a raw sequence of chars
325 pir_output_is( <<"CODE", <<'OUTPUT', "escape utf16" );
329 s = iso-8859-1:"T\xf6tsch"
330 i = find_encoding "utf8"
331 s = trans_encoding s, i
342 # Tests for .CCLASS_WHITESPACE
343 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_WHITESPACE in unicode" );
345 .include 'cclass.pasm'
347 s = unicode:" \t\u207babc\n\u2000\u2009"
349 $I0 = is_cclass .CCLASS_WHITESPACE, s, 0
351 $I0 = is_cclass .CCLASS_WHITESPACE, s, 1
353 $I0 = is_cclass .CCLASS_WHITESPACE, s, 2
355 $I0 = find_not_cclass .CCLASS_WHITESPACE, s, 0, $I9
357 $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
359 $I0 = find_cclass .CCLASS_WHITESPACE, s, $I0, $I9
361 $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
369 # Tests for .CCLASS_ANY
370 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_ANY in unicode" );
372 .include 'cclass.pasm'
374 s = unicode:" \t\u207babc\n\u2000\u2009"
376 $I0 = is_cclass .CCLASS_ANY, s, 0
378 $I0 = is_cclass .CCLASS_ANY, s, 1
380 $I0 = is_cclass .CCLASS_ANY, s, 2
382 $I0 = is_cclass .CCLASS_ANY, s, $I9
384 $I0 = find_not_cclass .CCLASS_ANY, s, 0, $I9
386 $I0 = find_not_cclass .CCLASS_ANY, s, $I0, $I9
388 $I0 = find_cclass .CCLASS_ANY, s, $I0, $I9
390 $I0 = find_cclass .CCLASS_ANY, s, 2, $I9
399 skip "Tests seem to fail on big endian machines with icu", 2 if $PConfig{byteorder} eq '4321';
401 # Tests for .CCLASS_NUMERIC
402 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_NUMERIC in unicode" );
404 .include 'cclass.pasm'
406 s = unicode:"01\u207bxyz\u0660\u17e1\u19d9"
408 $I0 = is_cclass .CCLASS_NUMERIC, s, 0
410 $I0 = is_cclass .CCLASS_NUMERIC, s, 1
412 $I0 = is_cclass .CCLASS_NUMERIC, s, 2
414 $I0 = find_not_cclass .CCLASS_NUMERIC, s, 0, $I9
416 $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
418 $I0 = find_cclass .CCLASS_NUMERIC, s, $I0, $I9
420 $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
428 # Concatenate unicode: with iso-8859-1
430 <<'CODE', <<"OUTPUT", "Concat unicode with iso-8859-1" );
434 $S2 = concat $S0, $S1
440 $S2 = concat $S0, $S1
446 $S2 = concat $S0, $S1
457 pir_output_is( <<'CODE', <<OUTPUT, "UTF-8 and Unicode hash keys");
459 .local string str0, str1
460 str0 = unicode:"\u00ab"
461 str1 = iso-8859-1:"\xab"
467 $I0 = iseq str0, str1
484 pir_output_is( <<'CODE', <<OUTPUT, "UTF-8 and Unicode hash keys, full bucket" );
486 .local string str0, str1
487 str0 = unicode:"infix:\u00b1"
488 str1 = iso-8859-1:"infix:\xb1"
496 unless $I0 < 200 goto fill_done
499 $S0 = concat 'infix:', $S0
504 $I0 = iseq str0, str1
505 #print "iseq str0, str1 => "
511 #print "iseq hash[str0], hash[str1] => "
525 skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
526 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to int' );
531 $I0 = find_encoding 'ucs2'
532 $S0 = trans_encoding $S0, $I0
541 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to float' );
546 $I0 = find_encoding 'ucs2'
547 $S0 = trans_encoding $S0, $I0
556 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings float mixed' );
558 $S0 = unicode:"140 r\x{e9}sum\x{e9}s"
561 $I0 = find_encoding 'ucs2'
562 $S0 = trans_encoding $S0, $I0
572 pir_output_is( <<'CODE', <<'OUT', 'concatenation of utf8 and iso-8859-1 (TT #752)' );
578 $S0 = unicode:"\u00e5\u263b"
579 $S3 = concat $S1, $S2
580 if $S0 == $S3 goto equal_1
585 $S0 = unicode:"\u263b\u00e5"
586 $S3 = concat $S2, $S1
587 if $S0 == $S3 goto equal_2
597 pir_output_is( <<'CODE', <<'OUT', 'join mixed encodings' );
599 new $P0, 'ResizablePMCArray'
601 push $P0, unicode:"\x{e1}" # a acute
602 push $P0, iso-8859-1:"\x{e1}" # a acute
612 skip( 'no ICU lib', 1 ) unless $PConfig{has_icu};
614 pir_output_is( <<'CODE', <<'OUT', 'find_codepoint opcode (experimental)');
616 $I1 = find_codepoint 'THISISNOTTHENAMEOFNOTHING'
619 .const string cpf = "0x%04x"
620 $P0 = new 'FixedIntegerArray', 1
621 $I0 = find_codepoint 'LATIN CAPITAL LETTER C'
623 $S0 = sprintf cpf, $P0
625 $I0 = find_codepoint 'MUSIC FLAT SIGN'
627 $S0 = sprintf cpf, $P0
629 $I0 = find_codepoint 'RECYCLING SYMBOL FOR TYPE-1 PLASTICS'
631 $S0 = sprintf cpf, $P0
644 # cperl-indent-level: 4
647 # vim: expandtab shiftwidth=4: