add tests.
[ruby-svn.git] / test / ruby / test_econv.rb
blob3ae46b184c135e37fce8db02ba141f17f4f3371a
1 require 'test/unit'
3 class TestEncodingConverter < Test::Unit::TestCase
# Runs one +primitive_convert+ step on +ec+ and asserts its outcome.
#
# edst:: expected contents of +dst+ after the call
# esrc:: expected contents of +src+ after the call
# eres:: expected return value of +primitive_convert+
# dst::  destination buffer handed to the converter
# src::  source buffer handed to the converter
# ec::   object responding to +primitive_convert+
# off, len, flags:: forwarded unchanged as the 3rd to 5th arguments of
#                   +primitive_convert+ (+flags+ defaults to 0)
#
# All four strings are compared through ASCII-8BIT copies, so only their
# bytes matter and the arguments themselves keep their encodings.
def check_ec(edst, esrc, eres, dst, src, ec, off, len, flags=0)
  res = ec.primitive_convert(src, dst, off, len, flags)
  assert_equal([edst.dup.force_encoding("ASCII-8BIT"),
                esrc.dup.force_encoding("ASCII-8BIT"),
                eres],
               [dst.dup.force_encoding("ASCII-8BIT"),
                src.dup.force_encoding("ASCII-8BIT"),
                res])
end
# Converts +consumed+ + +rest+ in one +primitive_convert+ call into a fresh
# output string and asserts the result.
#
# converted::     expected output string
# eres::          expected return value of +primitive_convert+
# obuf_bytesize:: passed as the 4th argument of +primitive_convert+
#                 (the 3rd argument is always 0)
# ec::            a converter, or an Array of arguments from which a new
#                 Encoding::Converter is built
# consumed, rest:: concatenated to form the input; after the call the input
#                  string is expected to equal +rest+
# flags::         passed as the 5th argument (default 0)
def assert_econv(converted, eres, obuf_bytesize, ec, consumed, rest, flags=0)
  ec = Encoding::Converter.new(*ec) if Array === ec
  i = consumed + rest
  o = ""
  ret = ec.primitive_convert(i, o, 0, obuf_bytesize, flags)
  assert_equal([converted,    eres,       rest],
               [o,            ret,           i])
end
23   def test_new
24     assert_kind_of(Encoding::Converter, Encoding::Converter.new("UTF-8", "EUC-JP"))
25     assert_kind_of(Encoding::Converter, Encoding::Converter.new(Encoding::UTF_8, Encoding::EUC_JP))
26   end
# Unknown encoding names raise ArgumentError, and the failed attempt must not
# leave those names behind in Encoding.list.
def test_new_fail
  name1 = "encoding-which-is-not-exist-1"
  name2 = "encoding-which-is-not-exist-2"

  assert_raise(ArgumentError) {
    Encoding::Converter.new(name1, name2)
  }

  # Checked after the failed construction on purpose.
  encoding_list = Encoding.list.map {|e| e.name }
  assert(!encoding_list.include?(name1))
  assert(!encoding_list.include?(name2))
end
# source_encoding / destination_encoding report the constructor arguments
# as Encoding objects.
def test_get_encoding
  ec = Encoding::Converter.new("UTF-8", "EUC-JP")
  assert_equal(Encoding::UTF_8, ec.source_encoding)
  assert_equal(Encoding::EUC_JP, ec.destination_encoding)
end
# primitive_convert retags the destination buffer with the converter's
# destination encoding, whatever encoding it had before.
def test_result_encoding
  ec = Encoding::Converter.new("UTF-8", "EUC-JP")
  dst = "".force_encoding("ASCII-8BIT")
  assert_equal(Encoding::ASCII_8BIT, dst.encoding)
  ec.primitive_convert("\u{3042}", dst, nil, 10)
  assert_equal(Encoding::EUC_JP, dst.encoding)
end
# Exercises the offset (3rd) and size (4th) arguments of primitive_convert
# against a one-byte destination buffer "b".
def test_output_region
  ec = Encoding::Converter.new("UTF-8", "EUC-JP")
  # nil offset: output is appended after the existing content.
  ec.primitive_convert(src="a", dst="b", nil, 1, Encoding::Converter::PARTIAL_INPUT)
  assert_equal("ba", dst)
  # offset 0: existing content is overwritten.
  ec.primitive_convert(src="a", dst="b", 0, 1, Encoding::Converter::PARTIAL_INPUT)
  assert_equal("a", dst)
  # offset equal to the buffer size: same result as appending.
  ec.primitive_convert(src="a", dst="b", 1, 1, Encoding::Converter::PARTIAL_INPUT)
  assert_equal("ba", dst)
  # offset past the end of the buffer is rejected.
  assert_raise(ArgumentError) {
    ec.primitive_convert(src="a", dst="b", 2, 1, Encoding::Converter::PARTIAL_INPUT)
  }
  # negative offset is rejected.
  assert_raise(ArgumentError) {
    ec.primitive_convert(src="a", dst="b", -1, 1, Encoding::Converter::PARTIAL_INPUT)
  }
  # negative size is rejected.
  assert_raise(ArgumentError) {
    ec.primitive_convert(src="a", dst="b", 1, -1, Encoding::Converter::PARTIAL_INPUT)
  }
end
# With empty input, PARTIAL_INPUT yields :source_buffer_empty, while the
# default flags yield :finished.
def test_partial_input
  ec = Encoding::Converter.new("UTF-8", "EUC-JP")
  ret = ec.primitive_convert(src="", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
  assert_equal(:source_buffer_empty, ret)
  ret = ec.primitive_convert(src="", dst="", nil, 10)
  assert_equal(:finished, ret)
end
# Repeated conversion with a size argument of 1: every call is expected to
# add exactly one byte to the shared destination buffer until :finished.
def test_accumulate_dst1
  ec = Encoding::Converter.new("UTF-8", "EUC-JP")
  # The same dst/src strings are reused by every check_ec call below.
  a =     ["", "abc\u{3042}def", ec, nil, 1]
  check_ec("a",  "c\u{3042}def", :destination_buffer_full, *a)
  check_ec("ab",  "\u{3042}def", :destination_buffer_full, *a)
  check_ec("abc",         "def", :destination_buffer_full, *a)
  check_ec("abc\xA4",     "def", :destination_buffer_full, *a)
  check_ec("abc\xA4\xA2",  "ef", :destination_buffer_full, *a)
  check_ec("abc\xA4\xA2d",  "f", :destination_buffer_full, *a)
  check_ec("abc\xA4\xA2de",  "", :destination_buffer_full, *a)
  check_ec("abc\xA4\xA2def", "", :finished,  *a)
end
# Same input as test_accumulate_dst1 but with a size argument of 2: every
# call is expected to add two bytes to the shared destination buffer.
def test_accumulate_dst2
  ec = Encoding::Converter.new("UTF-8", "EUC-JP")
  # The same dst/src strings are reused by every check_ec call below.
  a =     ["", "abc\u{3042}def", ec, nil, 2]
  check_ec("ab",  "\u{3042}def", :destination_buffer_full, *a)
  check_ec("abc\xA4",     "def", :destination_buffer_full, *a)
  check_ec("abc\xA4\xA2d",  "f", :destination_buffer_full, *a)
  check_ec("abc\xA4\xA2def", "", :finished,  *a)
end
# Empty and single-ASCII-character input finish in one call.
# NOTE(review): despite the method name, the converter is built with
# ("UTF-8", "EUC-JP"), the same argument order as the other tests. Confirm
# which direction was intended.
def test_eucjp_to_utf8
  assert_econv("", :finished, 100, ["UTF-8", "EUC-JP"], "", "")
  assert_econv("a", :finished, 100, ["UTF-8", "EUC-JP"], "a", "")
end
# A ("Shift_JIS", "ISO-2022-JP") converter finishes with empty output when
# given empty input.
def test_iso2022jp
  assert_econv("", :finished, 100, ["Shift_JIS", "ISO-2022-JP"], "", "")
end
# Feeds an ("EUC-JP", "ISO-2022-JP") converter piece by piece with
# PARTIAL_INPUT and checks the accumulated output after every piece.
def test_iso2022jp_encode
  ec = Encoding::Converter.new("EUC-JP", "ISO-2022-JP")
  # src is shared with the argument array, so appending to it feeds the
  # next check_ec call.
  a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
  src << "a";        check_ec("a",                           "", :source_buffer_empty, *a)
  src << "\xA2";     check_ec("a",                           "", :source_buffer_empty, *a)
  src << "\xA4";     check_ec("a\e$B\"$",                    "", :source_buffer_empty, *a)
  src << "\xA1";     check_ec("a\e$B\"$",                    "", :source_buffer_empty, *a)
  src << "\xA2";     check_ec("a\e$B\"$!\"",                 "", :source_buffer_empty, *a)
  src << "b";        check_ec("a\e$B\"$!\"\e(Bb",            "", :source_buffer_empty, *a)
  src << "\xA2\xA6"; check_ec("a\e$B\"$!\"\e(Bb\e$B\"&",     "", :source_buffer_empty, *a)
  # Dropping PARTIAL_INPUT ends the input; a trailing "\e(B" is expected.
  a[-1] = 0;         check_ec("a\e$B\"$!\"\e(Bb\e$B\"&\e(B", "", :finished, *a)
end
# Feeds an ("ISO-2022-JP", "EUC-JP") converter one byte at a time with
# PARTIAL_INPUT and checks the accumulated output after every byte.
def test_iso2022jp_decode
  ec = Encoding::Converter.new("ISO-2022-JP", "EUC-JP")
  # src is shared with the argument array, so appending to it feeds the
  # next check_ec call.
  a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
  src << "a";         check_ec("a",                   "", :source_buffer_empty, *a)
  src << "\e";        check_ec("a",                   "", :source_buffer_empty, *a)
  src << "$";         check_ec("a",                   "", :source_buffer_empty, *a)
  src << "B";         check_ec("a",                   "", :source_buffer_empty, *a)
  src << "\x21";      check_ec("a",                   "", :source_buffer_empty, *a)
  src << "\x22";      check_ec("a\xA1\xA2",           "", :source_buffer_empty, *a)
  # "\n" at this point is reported as invalid and adds nothing to the output.
  src << "\n";        check_ec("a\xA1\xA2",           "", :invalid_byte_sequence, *a)
  src << "\x23";      check_ec("a\xA1\xA2",           "", :source_buffer_empty, *a)
  src << "\x24";      check_ec("a\xA1\xA2\xA3\xA4",   "", :source_buffer_empty, *a)
  src << "\e";        check_ec("a\xA1\xA2\xA3\xA4",   "", :source_buffer_empty, *a)
  src << "(";         check_ec("a\xA1\xA2\xA3\xA4",   "", :source_buffer_empty, *a)
  src << "B";         check_ec("a\xA1\xA2\xA3\xA4",   "", :source_buffer_empty, *a)
  src << "c";         check_ec("a\xA1\xA2\xA3\xA4c",  "", :source_buffer_empty, *a)
  src << "\n";        check_ec("a\xA1\xA2\xA3\xA4c\n","", :source_buffer_empty, *a)
end
# Conversion stops with :invalid_byte_sequence at an invalid byte: the
# output holds what preceded it and the input holds what follows it.
def test_invalid
  assert_econv("", :invalid_byte_sequence,    100, ["UTF-8", "EUC-JP"], "\x80", "")
  assert_econv("a", :invalid_byte_sequence,   100, ["UTF-8", "EUC-JP"], "a\x80", "")
  assert_econv("a", :invalid_byte_sequence,   100, ["UTF-8", "EUC-JP"], "a\x80", "\x80")
  assert_econv("abc", :invalid_byte_sequence, 100, ["UTF-8", "EUC-JP"], "abc\xFF", "def")
  assert_econv("abc", :invalid_byte_sequence, 100, ["Shift_JIS", "EUC-JP"], "abc\xFF", "def")
  assert_econv("abc", :invalid_byte_sequence, 100, ["ISO-2022-JP", "EUC-JP"], "abc\xFF", "def")
end
# Invalid byte in the middle of the input with a size argument of 1:
# conversion reports the error once and then continues with the rest.
def test_invalid2
  ec = Encoding::Converter.new("Shift_JIS", "EUC-JP")
  # The same dst/src strings are reused by every check_ec call below.
  a =     ["", "abc\xFFdef", ec, nil, 1]
  check_ec("a",  "c\xFFdef", :destination_buffer_full, *a)
  check_ec("ab",  "\xFFdef", :destination_buffer_full, *a)
  check_ec("abc",     "def", :invalid_byte_sequence, *a)
  check_ec("abcd",      "f", :destination_buffer_full, *a)
  check_ec("abcde",      "", :destination_buffer_full, *a)
  check_ec("abcdef",     "", :finished, *a)
end
# Same input as test_invalid2 with a size argument of 10: one call stops at
# the invalid byte, the next one finishes the remaining input.
def test_invalid3
  ec = Encoding::Converter.new("Shift_JIS", "EUC-JP")
  # The same dst/src strings are reused by both check_ec calls below.
  a =     ["", "abc\xFFdef", ec, nil, 10]
  check_ec("abc",     "def", :invalid_byte_sequence, *a)
  check_ec("abcdef",     "", :finished, *a)
end
# Same input as test_invalid2 with OUTPUT_FOLLOWED_BY_INPUT: the converter
# is expected to return after each output character, and to still report
# the invalid byte in between.
def test_invalid4
  ec = Encoding::Converter.new("Shift_JIS", "EUC-JP")
  # The same dst/src strings are reused by every check_ec call below.
  a =     ["", "abc\xFFdef", ec, nil, 10, Encoding::Converter::OUTPUT_FOLLOWED_BY_INPUT]
  check_ec("a", "bc\xFFdef", :output_followed_by_input, *a)
  check_ec("ab", "c\xFFdef", :output_followed_by_input, *a)
  check_ec("abc", "\xFFdef", :output_followed_by_input, *a)
  check_ec("abc",     "def", :invalid_byte_sequence, *a)
  check_ec("abcd",     "ef", :output_followed_by_input, *a)
  check_ec("abcde",     "f", :output_followed_by_input, *a)
  check_ec("abcdef",     "", :output_followed_by_input, *a)
  check_ec("abcdef",     "", :finished, *a)
end
# Feeds a ("UTF-16LE", "UTF-8") converter one byte at a time with
# PARTIAL_INPUT; the expected output and result are checked after each byte,
# including two points where :invalid_byte_sequence is reported.
def test_invalid_utf16le
  ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
  # src is shared with the argument array, so appending to it feeds the
  # next check_ec call.
  a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
  src << "A";         check_ec("",                            "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",                           "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",                           "", :source_buffer_empty, *a)
  src << "\xd8";      check_ec("A",                           "", :source_buffer_empty, *a)
  src << "\x01";      check_ec("A",                           "", :source_buffer_empty, *a)
  src << "\x02";      check_ec("A",                           "", :invalid_byte_sequence, *a)
  src << "\x03";      check_ec("A\u{0201}",                   "", :source_buffer_empty, *a)
  src << "\x04";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\xd8";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\xd8";      check_ec("A\u{0201}\u{0403}",           "", :invalid_byte_sequence, *a)
  src << "\x00";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\xdc";      check_ec("A\u{0201}\u{0403}\u{10000}",  "", :source_buffer_empty, *a)
end
# Feeds a ("UTF-16BE", "UTF-8") converter one byte at a time with
# PARTIAL_INPUT; the expected output and result are checked after each byte,
# including two points where :invalid_byte_sequence is reported.
def test_invalid_utf16be
  ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
  # src is shared with the argument array, so appending to it feeds the
  # next check_ec call.
  a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
  src << "\x00";      check_ec("",                            "", :source_buffer_empty, *a)
  src << "A";         check_ec("A",                           "", :source_buffer_empty, *a)
  src << "\xd8";      check_ec("A",                           "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",                           "", :source_buffer_empty, *a)
  src << "\x02";      check_ec("A",                           "", :invalid_byte_sequence, *a)
  src << "\x01";      check_ec("A\u{0201}",                   "", :source_buffer_empty, *a)
  src << "\x04";      check_ec("A\u{0201}",                   "", :source_buffer_empty, *a)
  src << "\x03";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\xd8";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\xd8";      check_ec("A\u{0201}\u{0403}",           "", :invalid_byte_sequence, *a)
  src << "\x00";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\xdc";      check_ec("A\u{0201}\u{0403}",           "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A\u{0201}\u{0403}\u{10000}",  "", :source_buffer_empty, *a)
end
# Feeds a ("UTF-32BE", "UTF-8") converter one byte at a time with
# PARTIAL_INPUT, in groups of four bytes. The second group is expected to
# be reported as :invalid_byte_sequence; the others yield "A", "B" and "C".
def test_invalid_utf32be
  ec = Encoding::Converter.new("UTF-32BE", "UTF-8")
  # src is shared with the argument array, so appending to it feeds the
  # next check_ec call.
  a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
  src << "\x00";      check_ec("",    "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("",    "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("",    "", :source_buffer_empty, *a)
  src << "A";         check_ec("A",   "", :source_buffer_empty, *a)

  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\xdc";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",   "", :invalid_byte_sequence, *a)

  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "B";         check_ec("AB",  "", :source_buffer_empty, *a)

  src << "\x00";      check_ec("AB",  "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("AB",  "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("AB",  "", :source_buffer_empty, *a)
  src << "C";         check_ec("ABC", "", :source_buffer_empty, *a)
end
# Feeds a ("UTF-32LE", "UTF-8") converter one byte at a time with
# PARTIAL_INPUT, in groups of four bytes. The second group is expected to
# be reported as :invalid_byte_sequence; the others yield "A", "B" and "C".
def test_invalid_utf32le
  ec = Encoding::Converter.new("UTF-32LE", "UTF-8")
  # src is shared with the argument array, so appending to it feeds the
  # next check_ec call.
  a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
  src << "A";         check_ec("",    "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("",    "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("",    "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)

  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\xdc";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",   "", :invalid_byte_sequence, *a)

  src << "B";         check_ec("A",   "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("A",   "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("AB",  "", :source_buffer_empty, *a)

  src << "C";         check_ec("AB",  "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("AB",  "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("AB",  "", :source_buffer_empty, *a)
  src << "\x00";      check_ec("ABC", "", :source_buffer_empty, *a)
end
# One input that triggers both error results in turn: first
# :undefined_conversion, then :invalid_byte_sequence, then :finished.
def test_errors
  ec = Encoding::Converter.new("UTF-16BE", "EUC-JP")
  # The same dst/src strings are reused by every check_ec call below.
  a =     ["", "\xFF\xFE\x00A\xDC\x00\x00B", ec, nil, 10]
  check_ec("",         "\x00A\xDC\x00\x00B", :undefined_conversion, *a)
  check_ec("A",                     "\x00B", :invalid_byte_sequence, *a) # \xDC\x00 is invalid as UTF-16BE
  check_ec("AB",                         "", :finished, *a)
end
# Same input as test_errors with OUTPUT_FOLLOWED_BY_INPUT: an extra
# :output_followed_by_input result is expected after each output character.
def test_errors2
  ec = Encoding::Converter.new("UTF-16BE", "EUC-JP")
  # The same dst/src strings are reused by every check_ec call below.
  a =     ["", "\xFF\xFE\x00A\xDC\x00\x00B", ec, nil, 10, Encoding::Converter::OUTPUT_FOLLOWED_BY_INPUT]
  check_ec("",         "\x00A\xDC\x00\x00B", :undefined_conversion, *a)
  check_ec("A",             "\xDC\x00\x00B", :output_followed_by_input, *a)
  check_ec("A",                     "\x00B", :invalid_byte_sequence, *a)
  check_ec("AB",                         "", :output_followed_by_input, *a)
  check_ec("AB",                         "", :finished, *a)
end
# With UNIVERSAL_NEWLINE_DECODER, "\r\n", "\n" and "\r" in the input are all
# expected to come out as "\n", including a "\r\n" pair split across two
# chunks of input.
def test_universal_newline
  ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNIVERSAL_NEWLINE_DECODER)
  # src is shared with the argument array, so appending to it feeds the
  # next check_ec call.
  a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
  src << "abc\r\ndef"; check_ec("abc\ndef",                             "", :source_buffer_empty, *a)
  src << "ghi\njkl";   check_ec("abc\ndefghi\njkl",                     "", :source_buffer_empty, *a)
  src << "mno\rpqr";   check_ec("abc\ndefghi\njklmno\npqr",             "", :source_buffer_empty, *a)
  # "\r" ends this chunk and "\n" starts the next; only one "\n" is expected.
  src << "stu\r";      check_ec("abc\ndefghi\njklmno\npqrstu\n",        "", :source_buffer_empty, *a)
  src << "\nvwx";      check_ec("abc\ndefghi\njklmno\npqrstu\nvwx",     "", :source_buffer_empty, *a)
  src << "\nyz";       check_ec("abc\ndefghi\njklmno\npqrstu\nvwx\nyz", "", :source_buffer_empty, *a)
end
# With CRLF_NEWLINE_ENCODER, "\n" in the input is expected to come out as
# "\r\n".
def test_crlf_newline
  ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::CRLF_NEWLINE_ENCODER)
  assert_econv("abc\r\ndef", :finished, 50, ec, "abc\ndef", "")
end
# With CR_NEWLINE_ENCODER, "\n" in the input is expected to come out as "\r".
def test_cr_newline
  ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::CR_NEWLINE_ENCODER)
  assert_econv("abc\rdef", :finished, 50, ec, "abc\ndef", "")
end
# With OUTPUT_FOLLOWED_BY_INPUT the converter is expected to return after
# every output character even though the size argument (100) leaves room
# for the whole result; :finished comes only on the final call.
def test_output_followed_by_input
  ec = Encoding::Converter.new("UTF-8", "EUC-JP")
  # The same dst/src strings are reused by every check_ec call below.
  a =     ["",  "abc\u{3042}def", ec, nil, 100, Encoding::Converter::OUTPUT_FOLLOWED_BY_INPUT]
  check_ec("a",  "bc\u{3042}def", :output_followed_by_input, *a)
  check_ec("ab",  "c\u{3042}def", :output_followed_by_input, *a)
  check_ec("abc",  "\u{3042}def", :output_followed_by_input, *a)
  check_ec("abc\xA4\xA2",  "def", :output_followed_by_input, *a)
  check_ec("abc\xA4\xA2d",  "ef", :output_followed_by_input, *a)
  check_ec("abc\xA4\xA2de",  "f", :output_followed_by_input, *a)
  check_ec("abc\xA4\xA2def",  "", :output_followed_by_input, *a)
  check_ec("abc\xA4\xA2def",  "", :finished, *a)
end