2 # Copyright (C) 2001-2008, Parrot Foundation.
7 use lib qw( . lib ../lib ../../lib );
9 use Parrot::Test tests => 49;
14 t/op/string_cs.t - String Charset Tests
18 % prove t/op/string_cs.t
22 Tests encoding support.
# Encoding-prefixed string literals: the same kind of literal is written with
# the ascii:, binary: and iso-8859-1: prefixes.
26 pasm_output_is( <<'CODE', <<OUTPUT, "basic syntax" );
27 set S0, ascii:"ok 1\n"
29 set S0, binary:"ok 2\n"
31 set S0, iso-8859-1:"ok 3\n"
# Test described as "encoding name" (its PASM body lies outside this excerpt).
40 pasm_output_is( <<'CODE', <<OUTPUT, "encoding name" );
# find_encoding takes an encoding name and writes its result to an integer
# register; exercised for the three encodings used by the literals above.
51 pasm_output_is( <<'CODE', <<OUTPUT, "find_encoding" );
52 find_encoding I0, "iso-8859-1"
54 find_encoding I0, "ascii"
56 find_encoding I0, "binary"
# An unknown encoding name must produce error output matching the regex below.
65 pasm_error_output_like( <<'CODE', <<OUTPUT, "find_encoding - not existing" );
66 find_encoding I0, "no_such"
69 /encoding 'no_such' not found/
# Case-mapping tests on iso-8859-1 literals that contain non-ASCII letters.
# Only the input strings are visible here; the case ops and expected output
# fall outside this excerpt.
72 pasm_output_is( <<'CODE', <<OUTPUT, "downcase" );
73 set S0, iso-8859-1:"AEIOU_ÄÖÜ\n"
# The upcase input additionally ends with a sharp s.
81 pasm_output_is( <<'CODE', <<OUTPUT, "upcase" );
82 set S0, iso-8859-1:"aeiou_äöüß\n"
# The titlecase input starts with a lowercase letter followed by uppercase ones.
90 pasm_output_is( <<'CODE', <<OUTPUT, "titlecase" );
91 set S0, iso-8859-1:"zAEIOU_ÄÖÜ\n"
# Character-class membership tests.  Each PASM body includes cclass.pasm for
# the .CCLASS_* constants and calls
#     is_cclass <int dest>, <class>, <string>, <offset>
#
# "is_whitespace": probes offsets 0-3 of an iso-8859-1 string with literal
# offsets, plus one probe whose offset is taken from I4; it then probes
# offsets 0-4 of an ascii string, where offset 4 is past the end of the
# four-character string.
99 pasm_output_is( <<'CODE', <<OUTPUT, "is_whitespace" );
100 set S0, iso-8859-1:"a\t\n \xa0" # is 0xa0 a whitespace in iso-8859-1??
101 .include "cclass.pasm"
102 is_cclass I0, .CCLASS_WHITESPACE, S0, 0
103 is_cclass I1, .CCLASS_WHITESPACE, S0, 1
104 is_cclass I2, .CCLASS_WHITESPACE, S0, 2
105 is_cclass I3, .CCLASS_WHITESPACE, S0, 3
107 is_cclass I4, .CCLASS_WHITESPACE, S0, I4
114 set S0, ascii:"a\t\n "
115 is_cclass I0, .CCLASS_WHITESPACE, S0, 0
116 is_cclass I1, .CCLASS_WHITESPACE, S0, 1
117 is_cclass I2, .CCLASS_WHITESPACE, S0, 2
118 is_cclass I3, .CCLASS_WHITESPACE, S0, 3
119 is_cclass I4, .CCLASS_WHITESPACE, S0, 4 # access past string boundary: not a whitespace
# The next three tests query one class each with the offset held in I2
# (presumably a loop counter -- the loop is not visible in this excerpt).
132 pasm_output_is( <<'CODE', <<OUTPUT, "is_wordchar" );
133 .include "cclass.pasm"
138 is_cclass I0, .CCLASS_WORD, S0, I2
148 pasm_output_is( <<'CODE', <<OUTPUT, "is_digit" );
149 .include "cclass.pasm"
154 is_cclass I0, .CCLASS_NUMERIC, S0, I2
164 pasm_output_is( <<'CODE', <<OUTPUT, "is_punctuation" );
165 .include "cclass.pasm"
170 is_cclass I0, .CCLASS_PUNCTUATION, S0, I2
# "is_newline": probes offsets 0 and 1 against .CCLASS_NEWLINE.
180 pasm_output_is( <<'CODE', <<OUTPUT, "is_newline" );
181 .include "cclass.pasm"
183 is_cclass I0, .CCLASS_NEWLINE, S0, 0
185 is_cclass I0, .CCLASS_NEWLINE, S0, 1
# Character-class search tests.  Each body includes cclass.pasm and calls
#     find_cclass I0, <class>, S0, I0, I1
# so the result register I0 is also passed as the fourth operand.
# NOTE(review): the fourth and fifth operands are presumably the start offset
# and the search length -- confirm against the op documentation.
193 pasm_output_is( <<'CODE', <<OUTPUT, "find_wordchar" );
194 .include "cclass.pasm"
199 find_cclass I0, .CCLASS_WORD, S0, I0, I1
212 pasm_output_is( <<'CODE', <<OUTPUT, "find_digit" );
213 .include "cclass.pasm"
218 find_cclass I0, .CCLASS_NUMERIC, S0, I0, I1
231 pasm_output_is( <<'CODE', <<OUTPUT, "find_punctuation" );
232 .include "cclass.pasm"
237 find_cclass I0, .CCLASS_PUNCTUATION, S0, I0, I1
# trans_encoding tests.  The common pattern is: look the target encoding up
# with find_encoding (result in I0), then
#     trans_encoding <dest string>, <source string>, I0
#
# Transcode S0 to iso-8859-1 (the source literal is not visible here).
250 pasm_output_is( <<'CODE', <<OUTPUT, "trans_encoding_s_s_i" );
252 find_encoding I0, "iso-8859-1"
253 trans_encoding S1, S0, I0
# Converting an iso-8859-1 string that contains a non-ASCII letter to ascii
# must produce error output matching the regex below.
266 pasm_error_output_like( <<'CODE', <<OUTPUT, "trans_encoding_s_s_i - lossy" );
267 set S1, iso-8859-1:"abcä"
268 find_encoding I0, "ascii"
269 trans_encoding S2, S1, I0
273 /lossy conversion to ascii/
# Conversions of plain "abc"-style strings into binary and iso-8859-1.
276 pasm_output_is( <<'CODE', <<OUTPUT, "trans_encoding_s_s_i iso-8859-1 to binary" );
277 set S0, iso-8859-1:"abc"
278 find_encoding I0, "binary"
279 trans_encoding S1, S0, I0
292 pasm_output_is( <<'CODE', <<OUTPUT, "trans_encoding_s_s_i ascii to binary" );
294 find_encoding I0, "binary"
295 trans_encoding S1, S0, I0
308 pasm_output_is( <<'CODE', <<OUTPUT, "trans_encoding_s_s_i ascii to iso-8859-1" );
310 find_encoding I0, "iso-8859-1"
311 trans_encoding S1, S0, I0
# Round trip of a string containing one non-ASCII letter: iso-8859-1 -> utf8,
# then utf8 -> iso-8859-1.  The second test also reads the byte length of the
# utf8 source into I2 before converting.
324 pasm_output_is( <<'CODE', <<OUTPUT, "trans_encoding_s_s_i iso-8859-1 to utf8" );
325 set S0, iso-8859-1:"abc_ä_"
326 find_encoding I0, "utf8"
327 trans_encoding S1, S0, I0
344 pasm_output_is( <<'CODE', <<OUTPUT, "trans_encoding_s_s_i utf8 to iso-8859-1" );
345 set S0, utf8:"abc_\xe4_"
346 bytelength I2, S0 # XXX its 7 for utf8 only
349 find_encoding I0, "iso-8859-1"
350 trans_encoding S1, S0, I0
# PIR-level regression tests involving utf8 literals.
#
# A utf8 literal containing an escaped double quote, a bracket and a newline
# escape (ticket number taken from the test description).
368 pir_output_is( <<'CODE', <<'OUTPUT', "bug #34661 literal" );
370 $S0 = utf8:"\"]\nif I3 == "
# A global stored under a utf8-encoded name is fetched back using a plain
# (unprefixed) literal with the same characters.
377 pir_output_is( <<'CODE', <<'OUTPUT', "todo #34660 hash" );
381 set_global ['Foo'], utf8:"Bar", $P0
383 $P1 = get_global ['Foo'], "Bar"
# Concatenation of an ascii and a utf8 string; only the declaration of the
# three string locals is visible in this excerpt.
394 pir_output_is( <<'CODE', <<'OUTPUT', "concat ascii, utf8" );
396 .local string s, t, u
# Skip the ICU-dependent tests when $PConfig{has_icu} is false.
# NOTE(review): skip() is normally called inside a SKIP: { ... } block; the
# enclosing block is not visible in this excerpt -- confirm it exists.
# NOTE(review): 19 presumably equals the number of ICU-dependent tests that
# follow; keep it in sync when tests are added or removed.
412 skip( 'no ICU lib', 19 ) unless $PConfig{has_icu};

# Sets the stdout handle's encoding to utf8, loads the precompiled
# test_strings.pbc and loops until $P1 is false, reading an encoding name
# via encodingname (ticket number taken from the test description).
414 pir_output_is( <<'CODE', <<OUTPUT, "literal encoding persistence - TT #468" );
415 .include 'stdio.pasm'
417 # set output encoding to normalize printed strings
419 $P1 = $P0.'stdhandle'(.PIO_STDOUT_FILENO)
420 $P1.'encoding'('utf8')
422 load_bytecode 't/op/testlib/test_strings.pbc'
427 unless $P1 goto end_loop
442 $S0 = encodingname $I0
# Same loop shape, but the data comes from the 'get_empties' sub called after
# loading the same bytecode file.
453 pir_output_is( <<'CODE', <<OUTPUT, "empty literal encoding persistence - TT #1791");
455 load_bytecode 't/op/testlib/test_strings.pbc'
456 $P0 = 'get_empties'()
460 unless $P1 goto end_loop
475 $S0 = encodingname $I0
# Downcase tests on a string transcoded from iso-8859-1 to utf8.
#
# PIR variant: stdout is switched to utf8 output before printing.
486 pir_output_is( <<'CODE', <<"OUTPUT", "unicode downcase" );
488 set $S0, iso-8859-1:"TÖTSCH"
489 find_encoding $I0, "utf8"
490 trans_encoding $S1, $S0, $I0
492 getstdout $P0 # need to convert back to utf8
493 $P0.'encoding'("utf8") # set utf8 output
# PASM variant: the utf8 string is transcoded back to iso-8859-1 in place
# (S1 -> S1) instead of changing the output encoding.
502 pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase, trans_encoding_s_s_i" );
503 set S0, iso-8859-1:"TÖTSCH"
504 find_encoding I0, "utf8"
505 trans_encoding S1, S0, I0
507 find_encoding I0, "iso-8859-1"
508 trans_encoding S1, S1, I0
# Passing -1 as the encoding number must produce error output matching the
# regex below.
516 pasm_error_output_like( <<'CODE', <<"OUTPUT", "negative encoding number" );
517 trans_encoding S2, 'foo', -1
520 /encoding #-1 not found/
# Transcodes to utf8 twice: S0 -> S1, then S1 -> S2 with the same target.
523 pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase - transencoding" );
524 set S0, iso-8859-1:"TÖTSCH"
525 find_encoding I0, "utf8"
526 trans_encoding S1, S0, I0
528 find_encoding I0, "utf8"
529 trans_encoding S2, S1, I0
# String operations on strings transcoded from iso-8859-1 to utf16 (and one
# utf8 case).  Several tests read .STRINGINFO_BUFUSED via stringinfo, and
# most transcode the result to utf8 at the end.
#
# Transcodes S1 to utf16 in place; the ord/length calls named in the
# description are outside this excerpt.
537 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 ord, length" );
538 set S1, iso-8859-1:"TÖTSCH"
539 find_encoding I0, "utf16"
540 trans_encoding S1, S1, I0
# utf8 string ending in two non-ASCII letters; BUFUSED is read afterwards
# (the chopn call itself is not visible here).
558 pasm_output_is( <<'CODE', <<"OUTPUT", "chopn utf8" );
559 set S0, iso-8859-1:"TTÖÖ"
560 find_encoding I0, "utf8"
561 trans_encoding S1, S0, I0
568 .include "stringinfo.pasm"
569 stringinfo I0, S1, .STRINGINFO_BUFUSED
# Reads BUFUSED of the utf16 string S1, then transcodes S1 to utf8 into S2.
577 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 append" );
578 set S1, iso-8859-1:"Tötsch"
579 find_encoding I0, "utf16"
580 trans_encoding S1, S1, I0
585 .include "stringinfo.pasm"
586 stringinfo I0, S1, .STRINGINFO_BUFUSED
589 find_encoding I0, "utf8"
590 trans_encoding S2, S1, I0
# Concatenates an unprefixed literal onto the utf16 string, reads BUFUSED of
# the result and transcodes it to utf8.
599 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 concat" );
600 set S1, iso-8859-1:"Tötsch"
601 find_encoding I0, "utf16"
602 trans_encoding S1, S1, I0
603 concat S2, S1, " Leo"
607 .include "stringinfo.pasm"
608 stringinfo I0, S2, .STRINGINFO_BUFUSED
611 find_encoding I0, "utf8"
612 trans_encoding S2, S2, I0
# S2 is transcoded to utf8; the statement that fills S2 is not visible here.
621 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 substr" );
622 set S1, iso-8859-1:"Tötsch"
623 find_encoding I0, "utf16"
624 trans_encoding S1, S1, I0
626 find_encoding I0, "utf8"
627 trans_encoding S2, S2, I0
# replace is called on the utf16 string with offset 1, length 1 and the
# replacement "oe"; afterwards both S2 and S1 are transcoded to utf8.  The
# statement that fills S2 is not visible here.
635 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 replace" );
636 set S1, iso-8859-1:"Tötsch"
637 find_encoding I0, "utf16"
638 trans_encoding S1, S1, I0
640 replace S1, S1, 1, 1, "oe"
641 find_encoding I0, "utf8"
642 trans_encoding S2, S2, I0
643 trans_encoding S1, S1, I0
# Search tests: an iso-8859-1 string is transcoded to utf8 and an iso-8859-1
# needle is prepared in S2 (the index call itself is outside this excerpt).
# NOTE(review): both tests below carry the identical description
# "utf16 index, latin1 search", while the visible code transcodes to utf8,
# not utf16 -- confirm the intended names and give each test a distinct
# description so failures can be told apart.
654 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
655 set S0, iso-8859-1:"TÖTSCH"
656 find_encoding I0, "utf8"
657 trans_encoding S1, S0, I0
659 set S2, iso-8859-1:"öt"
668 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
669 set S0, iso-8859-1:"TÖTSCH"
670 find_encoding I0, "utf8"
671 trans_encoding S1, S0, I0
673 set S2, iso-8859-1:"öt"
# Upcase / titlecase tests in PIR.  Every test switches stdout to utf8 output
# before printing.
#
# Lowercase iso-8859-1 input transcoded to utf8.
687 pir_output_is( <<'CODE', <<"OUTPUT", "unicode upcase" );
689 set $S0, iso-8859-1:"tötsch"
690 find_encoding $I0, "utf8"
691 trans_encoding $S1, $S0, $I0
693 getstdout $P0 # need to convert back to utf8
694 $P0.'encoding'("utf8") # set utf8 output
# Input ends with U+01F0, written as a \u escape in a utf8 literal.
703 pir_output_is( <<'CODE', <<"OUTPUT", "unicode upcase to combined char" );
705 set $S1, utf8:"hacek j \u01f0"
707 getstdout $P0 # need to convert back to utf8
708 $P0.'encoding'("utf8") # set utf8 output
719 # 106 dest_len = u_strToUpper(src->strstart, dest_len,
724 # (gdb) x /8h src->strstart
725 # 0x844fb60: 0x005f 0x005f 0x005f 0x01f0 0x0031 0x0032 0x0033 0x0000
727 # 110 src->bufused = dest_len * sizeof(UChar);
730 # (gdb) x /8h src->strstart
731 # 0x844fb60: 0x005f 0x005f 0x005f 0x004a 0x030c 0x0031 0x0032 0x0000
# The two gdb dumps above show the buffer around the u_strToUpper call: in the
# second dump 0x01f0 has become 0x004a 0x030c and 0x0033 no longer appears
# before the terminating 0x0000.  The test below uses the same input string.
733 pir_output_is( <<'CODE', <<"OUTPUT", "unicode upcase to combined char 3.2 bug?" );
735 set $S1, utf8:"___\u01f0123"
737 getstdout $P0 # need to convert back to utf8
738 $P0.'encoding'("utf8") # set utf8 output
# Two-word input; the expected output line shows each word capitalised, with
# the non-ASCII letter spelled as its two utf8 bytes.
747 pir_output_is( <<'CODE', <<"OUTPUT", "unicode titlecase" );
749 set $S0, iso-8859-1:"tötsch leo"
750 find_encoding $I0, "utf8"
751 trans_encoding $S1, $S0, $I0
753 getstdout $P0 # need to convert back to utf8
754 $P0.'encoding'("utf8") # set utf8 output
760 T\x{c3}\x{b6}tsch Leo
# Case round trip of a utf8 string containing U+01F0.  Per the inline notes:
# upcase decomposes the character into J + hacek, making the string one
# character longer; downcase yields j + hacek; the length is then read again
# for comparison with the original.  stdout is switched to utf8 output before
# printing.
763 pir_output_is( <<'CODE', <<OUTPUT, "compose combined char" );
765 set $S1, utf8:"___\u01f0___"
767 upcase $S1, $S1 # decompose J+hacek
768 length $I1, $S1 # 1 longer
769 downcase $S1, $S1 # j+hacek
772 length $I3, $S1 # back at original string
773 getstdout $P0 # need to convert back to utf8
774 $P0.'encoding'("utf8") # set utf8 output
# String-escaping tests.  Only the input literals and some expected output
# lines are visible; the escaping call itself is outside this excerpt.
#
# Plain ASCII input with a trailing newline escape.
794 pasm_output_is( <<'CODE', <<'OUTPUT', "escape ascii" );
795 set S0, "abcdefghi\n"
# Control characters are expected back in \x{..} form without leading zeros.
804 pasm_output_is( <<'CODE', <<'OUTPUT', "escape ctrl" );
805 set S0, "\x00\x01\x1f\x7f"
811 \x{0}\x{1}\x{1f}\x{7f}
# iso-8859-1 input containing one non-ASCII letter.
814 pasm_output_is( <<'CODE', <<'OUTPUT', "escape latin1" );
815 set S0, iso-8859-1:"tötsch leo"
# utf8 input given as \u and \x{} escapes; the expected output line repeats
# the same escapes verbatim.
824 pasm_output_is( <<'CODE', <<'OUTPUT', "escape unicode" );
825 set S0, utf8:"\u2001\u2002\u2003\u2004\x{e01ef}\u0114"
831 \u2001\u2002\u2003\u2004\x{e01ef}\u0114
# \u escapes immediately followed by literal ASCII text that begins "_09",
# i.e. a literal 0 shortly after an escape sequence.
834 pir_output_is(<<'CODE', <<'OUTPUT', 'escape unicode w/ literal 0' );
836 $S0 = utf8:"x/\u0445\u0440\u0435\u043d\u044c_09-10.txt"
841 x/\u0445\u0440\u0435\u043d\u044c_09-10.txt
846 # cperl-indent-level: 4
849 # vim: expandtab shiftwidth=4: