# Test case for Encoding::Converter; the tests below drive it mainly through
# Encoding::Converter#primitive_convert and check buffers and result symbols.
3 class TestEncodingConverter < Test::Unit::TestCase
# Helper: performs a single ec.primitive_convert(src, dst, off, len, flags)
# call and compares the buffers afterwards with the expected ones.
#
# edst  - expected content of dst after the call
# esrc  - expected content of src after the call
# eres  - expected result of primitive_convert
# dst   - output string handed to primitive_convert
# src   - input string handed to primitive_convert
# ec    - the converter under test
# off   - output offset argument (callers in this file pass nil)
# len   - output length argument
# flags - flags argument, 0 by default
#
# All four strings are dup'ed and forced to ASCII-8BIT before comparison, so
# the comparison is on raw bytes and the originals are left untouched.
# NOTE(review): the tail of each compared array is not visible in this
# extract; presumably `res` is compared with `eres` there - confirm.
4 def check_ec(edst, esrc, eres, dst, src, ec, off, len, flags=0)
5 res = ec.primitive_convert(src, dst, off, len, flags)
6 assert_equal([edst.dup.force_encoding("ASCII-8BIT"),
7 esrc.dup.force_encoding("ASCII-8BIT"),
9 [dst.dup.force_encoding("ASCII-8BIT"),
10 src.dup.force_encoding("ASCII-8BIT"),
# Helper: one-shot conversion check.
#
# converted     - expected output
# eres          - expected result of primitive_convert
# obuf_bytesize - passed as the output length argument of primitive_convert
# ec            - a converter, or an Array of arguments for
#                 Encoding::Converter.new (built on the fly in that case)
# consumed/rest - input pieces; `rest` is what is expected to remain
# flags         - flags argument, 0 by default
#
# NOTE(review): `i` and `o` are assigned on lines not visible in this
# extract; presumably i is consumed + rest and o is a fresh output string -
# confirm against the full file.
14 def assert_econv(converted, eres, obuf_bytesize, ec, consumed, rest, flags=0)
15 ec = Encoding::Converter.new(*ec) if Array === ec
18 ret = ec.primitive_convert(i, o, 0, obuf_bytesize, flags)
19 assert_equal([converted, eres, rest],
# (enclosing `def` line is not visible in this extract)
# Encoding::Converter.new accepts encoding names given as Strings as well as
# Encoding objects; both forms must yield an Encoding::Converter.
24 assert_kind_of(Encoding::Converter, Encoding::Converter.new("UTF-8", "EUC-JP"))
25 assert_kind_of(Encoding::Converter, Encoding::Converter.new(Encoding::UTF_8, Encoding::EUC_JP))
# (enclosing `def` line is not visible in this extract)
# Building a converter between two encoding names that do not exist must
# raise ArgumentError.
29 name1 = "encoding-which-is-not-exist-1"
30 name2 = "encoding-which-is-not-exist-2"
32 assert_raise(ArgumentError) {
33 Encoding::Converter.new(name1, name2)
# After the failed construction neither name may show up in Encoding.list
# (presumably guarding against the lookup registering the names - confirm).
36 encoding_list = Encoding.list.map {|e| e.name }
37 assert(!encoding_list.include?(name1))
38 assert(!encoding_list.include?(name2))
# (enclosing `def` line is not visible in this extract)
# The first constructor argument is reported as source_encoding and the
# second as destination_encoding.
42 ec = Encoding::Converter.new("UTF-8", "EUC-JP")
43 assert_equal(Encoding::UTF_8, ec.source_encoding)
44 assert_equal(Encoding::EUC_JP, ec.destination_encoding)
# primitive_convert re-tags the output string: dst starts out as ASCII-8BIT
# and must carry the converter's destination encoding (EUC-JP) afterwards.
47 def test_result_encoding
48 ec = Encoding::Converter.new("UTF-8", "EUC-JP")
49 dst = "".force_encoding("ASCII-8BIT")
# Sanity check of the starting state before converting.
50 assert_equal(Encoding::ASCII_8BIT, dst.encoding)
51 ec.primitive_convert("\u{3042}", dst, nil, 10)
52 assert_equal(Encoding::EUC_JP, dst.encoding)
# Exercises the output offset and length arguments (3rd and 4th) of
# primitive_convert, each time with src "a" and a dst that already holds "b".
55 def test_output_region
56 ec = Encoding::Converter.new("UTF-8", "EUC-JP")
# nil offset: output lands after the existing content.
57 ec.primitive_convert(src="a", dst="b", nil, 1, Encoding::Converter::PARTIAL_INPUT)
58 assert_equal("ba", dst)
# offset 0: the existing content is replaced.
59 ec.primitive_convert(src="a", dst="b", 0, 1, Encoding::Converter::PARTIAL_INPUT)
60 assert_equal("a", dst)
# offset 1 (== current length of dst): same result as the nil case.
61 ec.primitive_convert(src="a", dst="b", 1, 1, Encoding::Converter::PARTIAL_INPUT)
62 assert_equal("ba", dst)
# offset 2 is past the one-byte dst and must be rejected.
63 assert_raise(ArgumentError) {
64 ec.primitive_convert(src="a", dst="b", 2, 1, Encoding::Converter::PARTIAL_INPUT)
# A negative offset must be rejected.
66 assert_raise(ArgumentError) {
67 ec.primitive_convert(src="a", dst="b", -1, 1, Encoding::Converter::PARTIAL_INPUT)
# A negative length must be rejected.
69 assert_raise(ArgumentError) {
70 ec.primitive_convert(src="a", dst="b", 1, -1, Encoding::Converter::PARTIAL_INPUT)
# With empty input the result depends on the PARTIAL_INPUT flag:
# :source_buffer_empty when it is set, :finished when it is not.
74 def test_partial_input
75 ec = Encoding::Converter.new("UTF-8", "EUC-JP")
76 ret = ec.primitive_convert(src="", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
77 assert_equal(:source_buffer_empty, ret)
78 ret = ec.primitive_convert(src="", dst="", nil, 10)
79 assert_equal(:finished, ret)
# Converts "abc\u{3042}def" with an output length of 1 per call.
# The same dst and src objects (held in `a`) are passed to every check_ec
# call, so the expectations show dst growing and src shrinking step by step.
82 def test_accumulate_dst1
83 ec = Encoding::Converter.new("UTF-8", "EUC-JP")
# a = [dst, src, ec, off, len] - the trailing arguments of check_ec.
84 a = ["", "abc\u{3042}def", ec, nil, 1]
85 check_ec("a", "c\u{3042}def", :destination_buffer_full, *a)
86 check_ec("ab", "\u{3042}def", :destination_buffer_full, *a)
87 check_ec("abc", "def", :destination_buffer_full, *a)
# The two output bytes \xA4\xA2 for \u{3042} arrive one per call, although
# the input character is already gone from src at this point.
88 check_ec("abc\xA4", "def", :destination_buffer_full, *a)
89 check_ec("abc\xA4\xA2", "ef", :destination_buffer_full, *a)
90 check_ec("abc\xA4\xA2d", "f", :destination_buffer_full, *a)
91 check_ec("abc\xA4\xA2de", "", :destination_buffer_full, *a)
92 check_ec("abc\xA4\xA2def", "", :finished, *a)
# Same input as the length-1 variant, but with an output length of 2 per
# call; dst and src (held in `a`) are reused across the check_ec calls.
95 def test_accumulate_dst2
96 ec = Encoding::Converter.new("UTF-8", "EUC-JP")
# a = [dst, src, ec, off, len] - the trailing arguments of check_ec.
97 a = ["", "abc\u{3042}def", ec, nil, 2]
98 check_ec("ab", "\u{3042}def", :destination_buffer_full, *a)
# Only the first output byte of \u{3042} fits next to "c" in this step.
99 check_ec("abc\xA4", "def", :destination_buffer_full, *a)
100 check_ec("abc\xA4\xA2d", "f", :destination_buffer_full, *a)
101 check_ec("abc\xA4\xA2def", "", :finished, *a)
# Trivial conversions (empty string and a single ASCII letter) must finish
# with the input passed through unchanged.
# NOTE(review): the method name reads "EUC-JP to UTF-8", but the converter
# is built with ("UTF-8", "EUC-JP"), i.e. the argument order that other
# tests in this file use for UTF-8 as the source - confirm which is intended.
104 def test_eucjp_to_utf8
105 assert_econv("", :finished, 100, ["UTF-8", "EUC-JP"], "", "")
106 assert_econv("a", :finished, 100, ["UTF-8", "EUC-JP"], "a", "")
# (enclosing `def` line is not visible in this extract)
# Empty input through a Shift_JIS / ISO-2022-JP converter yields empty
# output and :finished.
110 assert_econv("", :finished, 100, ["Shift_JIS", "ISO-2022-JP"], "", "")
# Feeds input to an ("EUC-JP", "ISO-2022-JP") converter a byte or two at a
# time with PARTIAL_INPUT set and checks the accumulated output after each
# step. `src` is the same object stored in `a`, so `src << ...` extends the
# input seen by the next check_ec call.
113 def test_iso2022jp_encode
114 ec = Encoding::Converter.new("EUC-JP", "ISO-2022-JP")
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
115 a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
116 src << "a"; check_ec("a", "", :source_buffer_empty, *a)
# First byte of a two-byte character: consumed, nothing emitted yet.
117 src << "\xA2"; check_ec("a", "", :source_buffer_empty, *a)
# Character complete: the \e$B escape is emitted ahead of its two bytes.
118 src << "\xA4"; check_ec("a\e$B\"$", "", :source_buffer_empty, *a)
119 src << "\xA1"; check_ec("a\e$B\"$", "", :source_buffer_empty, *a)
# A second two-byte character needs no further escape.
120 src << "\xA2"; check_ec("a\e$B\"$!\"", "", :source_buffer_empty, *a)
# A single-byte character is preceded by the \e(B escape.
121 src << "b"; check_ec("a\e$B\"$!\"\e(Bb", "", :source_buffer_empty, *a)
122 src << "\xA2\xA6"; check_ec("a\e$B\"$!\"\e(Bb\e$B\"&", "", :source_buffer_empty, *a)
# Clearing the flags (no PARTIAL_INPUT) ends the conversion; a closing
# \e(B is appended to the output.
123 a[-1] = 0; check_ec("a\e$B\"$!\"\e(Bb\e$B\"&\e(B", "", :finished, *a)
# Feeds input to an ("ISO-2022-JP", "EUC-JP") converter one byte at a time
# with PARTIAL_INPUT set and checks the accumulated output after each step.
# `src` is the same object stored in `a`, so `src << ...` extends the input
# seen by the next check_ec call.
126 def test_iso2022jp_decode
127 ec = Encoding::Converter.new("ISO-2022-JP", "EUC-JP")
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
128 a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
129 src << "a"; check_ec("a", "", :source_buffer_empty, *a)
# The three bytes of the \e$B escape are consumed without producing output.
130 src << "\e"; check_ec("a", "", :source_buffer_empty, *a)
131 src << "$"; check_ec("a", "", :source_buffer_empty, *a)
132 src << "B"; check_ec("a", "", :source_buffer_empty, *a)
# Output appears only once both bytes of a character have arrived.
133 src << "\x21"; check_ec("a", "", :source_buffer_empty, *a)
134 src << "\x22"; check_ec("a\xA1\xA2", "", :source_buffer_empty, *a)
# A newline at this point (after \e$B, before \e(B) is reported as invalid;
# the following steps show that conversion carries on afterwards.
135 src << "\n"; check_ec("a\xA1\xA2", "", :invalid_byte_sequence, *a)
136 src << "\x23"; check_ec("a\xA1\xA2", "", :source_buffer_empty, *a)
137 src << "\x24"; check_ec("a\xA1\xA2\xA3\xA4", "", :source_buffer_empty, *a)
# The three bytes of the \e(B escape are consumed without producing output.
138 src << "\e"; check_ec("a\xA1\xA2\xA3\xA4", "", :source_buffer_empty, *a)
139 src << "("; check_ec("a\xA1\xA2\xA3\xA4", "", :source_buffer_empty, *a)
140 src << "B"; check_ec("a\xA1\xA2\xA3\xA4", "", :source_buffer_empty, *a)
141 src << "c"; check_ec("a\xA1\xA2\xA3\xA4c", "", :source_buffer_empty, *a)
# After \e(B a newline passes through unchanged.
142 src << "\n"; check_ec("a\xA1\xA2\xA3\xA4c\n","", :source_buffer_empty, *a)
# (enclosing `def` line is not visible in this extract)
# Invalid input bytes stop the conversion with :invalid_byte_sequence.
# Per assert_econv's signature the arguments are: expected output, expected
# result, output size, converter arguments, consumed input, remaining input.
146 assert_econv("", :invalid_byte_sequence, 100, ["UTF-8", "EUC-JP"], "\x80", "")
147 assert_econv("a", :invalid_byte_sequence, 100, ["UTF-8", "EUC-JP"], "a\x80", "")
148 assert_econv("a", :invalid_byte_sequence, 100, ["UTF-8", "EUC-JP"], "a\x80", "\x80")
# The same \xFF case is checked with three different source encodings.
149 assert_econv("abc", :invalid_byte_sequence, 100, ["UTF-8", "EUC-JP"], "abc\xFF", "def")
150 assert_econv("abc", :invalid_byte_sequence, 100, ["Shift_JIS", "EUC-JP"], "abc\xFF", "def")
151 assert_econv("abc", :invalid_byte_sequence, 100, ["ISO-2022-JP", "EUC-JP"], "abc\xFF", "def")
# (enclosing `def` line is not visible in this extract)
# Invalid byte \xFF in the middle of the input, converted with an output
# length of 1 per call. dst and src (held in `a`) are reused across calls.
155 ec = Encoding::Converter.new("Shift_JIS", "EUC-JP")
# a = [dst, src, ec, off, len] - the trailing arguments of check_ec.
156 a = ["", "abc\xFFdef", ec, nil, 1]
157 check_ec("a", "c\xFFdef", :destination_buffer_full, *a)
158 check_ec("ab", "\xFFdef", :destination_buffer_full, *a)
# The pending "c" is written and \xFF is consumed and reported as invalid.
159 check_ec("abc", "def", :invalid_byte_sequence, *a)
# Further calls on the same converter continue with the remaining input.
160 check_ec("abcd", "f", :destination_buffer_full, *a)
161 check_ec("abcde", "", :destination_buffer_full, *a)
162 check_ec("abcdef", "", :finished, *a)
# (enclosing `def` line is not visible in this extract)
# Same invalid input, but with an output length of 10: one call stops at the
# invalid byte, the next call converts the remainder and finishes.
166 ec = Encoding::Converter.new("Shift_JIS", "EUC-JP")
# a = [dst, src, ec, off, len] - the trailing arguments of check_ec.
167 a = ["", "abc\xFFdef", ec, nil, 10]
168 check_ec("abc", "def", :invalid_byte_sequence, *a)
169 check_ec("abcdef", "", :finished, *a)
# (enclosing `def` line is not visible in this extract)
# Same invalid input with the OUTPUT_FOLLOWED_BY_INPUT flag: each call
# returns :output_followed_by_input after a single character of output, even
# though the output length (10) would allow more.
173 ec = Encoding::Converter.new("Shift_JIS", "EUC-JP")
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
174 a = ["", "abc\xFFdef", ec, nil, 10, Encoding::Converter::OUTPUT_FOLLOWED_BY_INPUT]
175 check_ec("a", "bc\xFFdef", :output_followed_by_input, *a)
176 check_ec("ab", "c\xFFdef", :output_followed_by_input, *a)
177 check_ec("abc", "\xFFdef", :output_followed_by_input, *a)
# The invalid byte is consumed and reported in a step of its own.
178 check_ec("abc", "def", :invalid_byte_sequence, *a)
179 check_ec("abcd", "ef", :output_followed_by_input, *a)
180 check_ec("abcde", "f", :output_followed_by_input, *a)
181 check_ec("abcdef", "", :output_followed_by_input, *a)
# One more call with nothing left to do reports :finished.
182 check_ec("abcdef", "", :finished, *a)
# Feeds UTF-16LE input to a ("UTF-16LE", "UTF-8") converter one byte at a
# time with PARTIAL_INPUT set, checking output and result after every byte.
# `src` is the same object stored in `a`, so `src << ...` extends the input
# seen by the next check_ec call. Two steps are expected to report
# :invalid_byte_sequence, after which conversion carries on.
185 def test_invalid_utf16le
186 ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
187 a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
# "A" appears in the output only after its second byte has arrived.
188 src << "A"; check_ec("", "", :source_buffer_empty, *a)
189 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
190 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
191 src << "\xd8"; check_ec("A", "", :source_buffer_empty, *a)
192 src << "\x01"; check_ec("A", "", :source_buffer_empty, *a)
# \x00\xd8 followed by \x01\x02 is reported as invalid ...
193 src << "\x02"; check_ec("A", "", :invalid_byte_sequence, *a)
# ... and \u{0201} shows up in the output on the next step.
194 src << "\x03"; check_ec("A\u{0201}", "", :source_buffer_empty, *a)
195 src << "\x04"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
196 src << "\x00"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
197 src << "\xd8"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
198 src << "\x00"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
# \x00\xd8 followed by another \x00\xd8 is reported as invalid.
199 src << "\xd8"; check_ec("A\u{0201}\u{0403}", "", :invalid_byte_sequence, *a)
200 src << "\x00"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
# Once \x00\xdc has been appended, \u{10000} is emitted.
201 src << "\xdc"; check_ec("A\u{0201}\u{0403}\u{10000}", "", :source_buffer_empty, *a)
# Feeds UTF-16BE input to a ("UTF-16BE", "UTF-8") converter one byte at a
# time with PARTIAL_INPUT set, checking output and result after every byte.
# `src` is the same object stored in `a`, so `src << ...` extends the input
# seen by the next check_ec call. Two steps are expected to report
# :invalid_byte_sequence, after which conversion carries on.
204 def test_invalid_utf16be
205 ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
206 a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
# "A" appears in the output only after its second byte has arrived.
207 src << "\x00"; check_ec("", "", :source_buffer_empty, *a)
208 src << "A"; check_ec("A", "", :source_buffer_empty, *a)
209 src << "\xd8"; check_ec("A", "", :source_buffer_empty, *a)
210 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
# \xd8\x00 followed by a byte \x02 is reported as invalid ...
211 src << "\x02"; check_ec("A", "", :invalid_byte_sequence, *a)
# ... and \x02\x01 is then converted to \u{0201}.
212 src << "\x01"; check_ec("A\u{0201}", "", :source_buffer_empty, *a)
213 src << "\x04"; check_ec("A\u{0201}", "", :source_buffer_empty, *a)
214 src << "\x03"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
215 src << "\xd8"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
216 src << "\x00"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
# \xd8\x00 followed by another \xd8 is reported as invalid.
217 src << "\xd8"; check_ec("A\u{0201}\u{0403}", "", :invalid_byte_sequence, *a)
218 src << "\x00"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
219 src << "\xdc"; check_ec("A\u{0201}\u{0403}", "", :source_buffer_empty, *a)
# Once \xdc\x00 is complete, \u{10000} is emitted.
220 src << "\x00"; check_ec("A\u{0201}\u{0403}\u{10000}", "", :source_buffer_empty, *a)
# Feeds UTF-32BE input to a ("UTF-32BE", "UTF-8") converter one byte at a
# time with PARTIAL_INPUT set, checking output and result after every byte.
# `src` is the same object stored in `a`, so `src << ...` extends the input
# seen by the next check_ec call.
223 def test_invalid_utf32be
224 ec = Encoding::Converter.new("UTF-32BE", "UTF-8")
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
225 a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
# Four bytes \x00\x00\x00A: "A" is emitted with the fourth byte.
226 src << "\x00"; check_ec("", "", :source_buffer_empty, *a)
227 src << "\x00"; check_ec("", "", :source_buffer_empty, *a)
228 src << "\x00"; check_ec("", "", :source_buffer_empty, *a)
229 src << "A"; check_ec("A", "", :source_buffer_empty, *a)
# Four bytes \x00\x00\xdc\x00: reported as invalid with the fourth byte.
231 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
232 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
233 src << "\xdc"; check_ec("A", "", :source_buffer_empty, *a)
234 src << "\x00"; check_ec("A", "", :invalid_byte_sequence, *a)
# Conversion continues normally after the invalid unit.
236 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
237 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
238 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
239 src << "B"; check_ec("AB", "", :source_buffer_empty, *a)
241 src << "\x00"; check_ec("AB", "", :source_buffer_empty, *a)
242 src << "\x00"; check_ec("AB", "", :source_buffer_empty, *a)
243 src << "\x00"; check_ec("AB", "", :source_buffer_empty, *a)
244 src << "C"; check_ec("ABC", "", :source_buffer_empty, *a)
# Feeds UTF-32LE input to a ("UTF-32LE", "UTF-8") converter one byte at a
# time with PARTIAL_INPUT set, checking output and result after every byte.
# `src` is the same object stored in `a`, so `src << ...` extends the input
# seen by the next check_ec call.
247 def test_invalid_utf32le
248 ec = Encoding::Converter.new("UTF-32LE", "UTF-8")
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
249 a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
# Four bytes A\x00\x00\x00: "A" is emitted with the fourth byte.
250 src << "A"; check_ec("", "", :source_buffer_empty, *a)
251 src << "\x00"; check_ec("", "", :source_buffer_empty, *a)
252 src << "\x00"; check_ec("", "", :source_buffer_empty, *a)
253 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
# Four bytes \x00\xdc\x00\x00: reported as invalid with the fourth byte.
255 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
256 src << "\xdc"; check_ec("A", "", :source_buffer_empty, *a)
257 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
258 src << "\x00"; check_ec("A", "", :invalid_byte_sequence, *a)
# Conversion continues normally after the invalid unit.
260 src << "B"; check_ec("A", "", :source_buffer_empty, *a)
261 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
262 src << "\x00"; check_ec("A", "", :source_buffer_empty, *a)
263 src << "\x00"; check_ec("AB", "", :source_buffer_empty, *a)
265 src << "C"; check_ec("AB", "", :source_buffer_empty, *a)
266 src << "\x00"; check_ec("AB", "", :source_buffer_empty, *a)
267 src << "\x00"; check_ec("AB", "", :source_buffer_empty, *a)
268 src << "\x00"; check_ec("ABC", "", :source_buffer_empty, *a)
# (enclosing `def` line is not visible in this extract)
# One input containing two different kinds of error, converted with an
# output length of 10. dst and src (held in `a`) are reused across calls.
272 ec = Encoding::Converter.new("UTF-16BE", "EUC-JP")
# a = [dst, src, ec, off, len] - the trailing arguments of check_ec.
273 a = ["", "\xFF\xFE\x00A\xDC\x00\x00B", ec, nil, 10]
# The leading \xFF\xFE is consumed and reported as :undefined_conversion
# (presumably because that character has no EUC-JP counterpart - confirm).
274 check_ec("", "\x00A\xDC\x00\x00B", :undefined_conversion, *a)
275 check_ec("A", "\x00B", :invalid_byte_sequence, *a) # \xDC\x00 is invalid as UTF-16BE
276 check_ec("AB", "", :finished, *a)
# (enclosing `def` line is not visible in this extract)
# Same two-error input as a plain length-10 conversion would see, but with
# OUTPUT_FOLLOWED_BY_INPUT: every produced character ends a call, so output
# and error reports arrive in separate steps.
280 ec = Encoding::Converter.new("UTF-16BE", "EUC-JP")
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
281 a = ["", "\xFF\xFE\x00A\xDC\x00\x00B", ec, nil, 10, Encoding::Converter::OUTPUT_FOLLOWED_BY_INPUT]
282 check_ec("", "\x00A\xDC\x00\x00B", :undefined_conversion, *a)
# "A" is output first; \xDC\x00 is still unread at this point.
283 check_ec("A", "\xDC\x00\x00B", :output_followed_by_input, *a)
284 check_ec("A", "\x00B", :invalid_byte_sequence, *a)
285 check_ec("AB", "", :output_followed_by_input, *a)
# One more call with nothing left to do reports :finished.
286 check_ec("AB", "", :finished, *a)
# With UNIVERSAL_NEWLINE_DECODER, "\r\n", "\n" and "\r" in the input all come
# out as "\n". Input is appended in chunks with PARTIAL_INPUT set; `src` is
# the same object stored in `a`, so `src << ...` extends the input seen by
# the next check_ec call.
289 def test_universal_newline
290 ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNIVERSAL_NEWLINE_DECODER)
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
291 a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT]
292 src << "abc\r\ndef"; check_ec("abc\ndef", "", :source_buffer_empty, *a)
293 src << "ghi\njkl"; check_ec("abc\ndefghi\njkl", "", :source_buffer_empty, *a)
294 src << "mno\rpqr"; check_ec("abc\ndefghi\njklmno\npqr", "", :source_buffer_empty, *a)
# A chunk ending in "\r" already yields "\n" ...
295 src << "stu\r"; check_ec("abc\ndefghi\njklmno\npqrstu\n", "", :source_buffer_empty, *a)
# ... and a "\n" opening the next chunk does not add a second newline.
296 src << "\nvwx"; check_ec("abc\ndefghi\njklmno\npqrstu\nvwx", "", :source_buffer_empty, *a)
# A "\n" that does not follow "\r" is kept.
297 src << "\nyz"; check_ec("abc\ndefghi\njklmno\npqrstu\nvwx\nyz", "", :source_buffer_empty, *a)
# With CRLF_NEWLINE_ENCODER, "\n" in the input is written as "\r\n".
300 def test_crlf_newline
301 ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::CRLF_NEWLINE_ENCODER)
302 assert_econv("abc\r\ndef", :finished, 50, ec, "abc\ndef", "")
# (enclosing `def` line is not visible in this extract)
# With CR_NEWLINE_ENCODER, "\n" in the input is written as "\r".
306 ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::CR_NEWLINE_ENCODER)
307 assert_econv("abc\rdef", :finished, 50, ec, "abc\ndef", "")
# With OUTPUT_FOLLOWED_BY_INPUT each primitive_convert call returns
# :output_followed_by_input after one character of output, although the
# output length (100) would hold the whole result. dst and src (held in `a`)
# are reused across the check_ec calls.
310 def test_output_followed_by_input
311 ec = Encoding::Converter.new("UTF-8", "EUC-JP")
# a = [dst, src, ec, off, len, flags] - the trailing arguments of check_ec.
312 a = ["", "abc\u{3042}def", ec, nil, 100, Encoding::Converter::OUTPUT_FOLLOWED_BY_INPUT]
313 check_ec("a", "bc\u{3042}def", :output_followed_by_input, *a)
314 check_ec("ab", "c\u{3042}def", :output_followed_by_input, *a)
315 check_ec("abc", "\u{3042}def", :output_followed_by_input, *a)
# The two output bytes of \u{3042} are delivered together in one step.
316 check_ec("abc\xA4\xA2", "def", :output_followed_by_input, *a)
317 check_ec("abc\xA4\xA2d", "ef", :output_followed_by_input, *a)
318 check_ec("abc\xA4\xA2de", "f", :output_followed_by_input, *a)
319 check_ec("abc\xA4\xA2def", "", :output_followed_by_input, *a)
# One more call with nothing left to do reports :finished.
320 check_ec("abc\xA4\xA2def", "", :finished, *a)