1 /**********************************************************************
6 created at: Tue Oct 30 16:10:22 JST 2007
8 Copyright (C) 2007 Martin Duerst
10 **********************************************************************/
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
15 #include "transcode_data.h"
/*
 * Symbols used when reading the options Hash: sym_invalid is the key looked
 * up with rb_hash_aref(), sym_ignore is the value it is compared against.
 * Both are assigned with ID2SYM(rb_intern(...)) during initialization.
 */
18 static VALUE sym_invalid
, sym_ignore
;
/* Option bit set when the invalid: option equals sym_ignore; tested in transcode_loop(). */
19 #define INVALID_IGNORE 0x1
22 * Dispatch data and logic
/*
 * transcoder_table receives rb_transcoder pointers (see
 * rb_register_transcoder()); transcoder_lib_table receives library names
 * (see declare_transcoder()). Both are keyed by the string built by
 * transcoder_key() and are created with st_init_strcasetable().
 */
25 static st_table
*transcoder_table
, *transcoder_lib_table
;
/* Byte written between the two encoding names inside a table key. */
27 #define TRANSCODER_INTERNAL_SEPARATOR '\t'
/*
 * Builds the table key for a pair of encoding names. The key is an
 * xmalloc'ed buffer laid out as: to_e, TRANSCODER_INTERNAL_SEPARATOR,
 * from_e, NUL.
 * NOTE(review): the return statement is not visible in this view; callers
 * assign the result to a char * key -- confirm.
 */
30 transcoder_key(const char *from_e
, const char *to_e
)
32 int to_len
= strlen(to_e
);
33 int from_len
= strlen(from_e
);
34 char *const key
= xmalloc(to_len
+ from_len
+ 2); /* +2: one separator byte plus the terminating NUL */
36 memcpy(key
, to_e
, to_len
);
37 memcpy(key
+ to_len
+ 1, from_e
, from_len
+ 1); /* from_len + 1 also copies from_e's terminating NUL */
/* fill the one byte left open between the two names */
38 key
[to_len
] = TRANSCODER_INTERNAL_SEPARATOR
;
/*
 * Registers transcoder tr in transcoder_table under the key built from its
 * from_encoding and to_encoding names. Raises ArgumentError when an entry
 * for that key already exists.
 */
43 rb_register_transcoder(const rb_transcoder
*tr
)
46 const char *const from_e
= tr
->from_encoding
;
47 const char *const to_e
= tr
->to_encoding
;
48 char *const key
= transcoder_key(from_e
, to_e
);
/* refuse a second registration for the same encoding pair */
50 if (st_lookup(transcoder_table
, (st_data_t
)key
, &val
)) {
52 rb_raise(rb_eArgError
, "transcoder from %s to %s has been already registered",
/* remove any library entry stored for this pair in transcoder_lib_table */
56 if (st_delete(transcoder_lib_table
, &k
, &val
)) {
/* the key buffer itself is stored as the table key */
59 st_insert(transcoder_table
, (st_data_t
)key
, (st_data_t
)tr
);
/*
 * Stores lib in transcoder_lib_table under the key for the given encoding
 * pair, after first deleting any entry already present under that key.
 * NOTE(review): transcoder_key() names its parameters (from_e, to_e) but is
 * called here with (to, from) -- confirm the argument order is intended.
 */
63 declare_transcoder(const char *to
, const char *from
, const char *lib
)
65 const char *const key
= transcoder_key(to
, from
);
66 st_data_t k
= (st_data_t
)key
, val
;
/* st_delete() takes the key by address and may overwrite k */
68 if (st_delete(transcoder_lib_table
, &k
, &val
)) {
/* the lib pointer is stored as given, not copied */
71 st_insert(transcoder_lib_table
, (st_data_t
)key
, (st_data_t
)lib
);
/* Longest library name accepted, measured with strlen(). */
74 #define MAX_TRANSCODER_LIBNAME_LEN 64
/* Prefix placed in front of a library name to form the path given to rb_require(). */
75 static const char transcoder_lib_prefix
[] = "enc/trans/";
/*
 * Declares that library lib provides the transcoders between enc1 and enc2;
 * the declaration is recorded once for each argument order.
 * Raises ArgumentError when lib is NULL or longer than
 * MAX_TRANSCODER_LIBNAME_LEN.
 */
78 rb_declare_transcoder(const char *enc1
, const char *enc2
, const char *lib
)
80 if (!lib
|| strlen(lib
) > MAX_TRANSCODER_LIBNAME_LEN
) {
81 rb_raise(rb_eArgError
, "invalid library name - %s",
82 lib
? lib
: "(null)");
/* record the library for both argument orders */
84 declare_transcoder(enc1
, enc2
, lib
);
85 declare_transcoder(enc2
, enc1
, lib
);
/* Non-zero when the two encoding name strings compare equal under STRCASECMP. */
88 #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
/*
 * Looks up the transcoder that converts from_encoding to to_encoding.
 *
 * While the pair is missing from transcoder_table but has an entry in
 * transcoder_lib_table, that entry is removed and the library it names is
 * loaded with rb_require() from under transcoder_lib_prefix. If the pair is
 * still missing and neither encoding is UTF-8, a two-step conversion via
 * UTF-8 is tried: when a UTF-8 -> to_encoding transcoder exists, the
 * from_encoding -> UTF-8 transcoder is returned as the first step.
 *
 * Returns NULL when the library name is too long or rb_require() returns a
 * false value.
 * NOTE(review): no release of the key buffer is visible in this view --
 * confirm who owns it.
 */
90 static const rb_transcoder
*
91 transcode_dispatch(const char* from_encoding
, const char* to_encoding
)
93 char *const key
= transcoder_key(from_encoding
, to_encoding
);
/* k is reset to key on every pass because st_delete() takes it by address */
96 while (!st_lookup(transcoder_table
, (k
= (st_data_t
)key
), &val
) &&
97 st_delete(transcoder_lib_table
, &k
, &val
)) {
98 const char *const lib
= (const char *)val
;
99 int len
= strlen(lib
);
/* sizeof the prefix array includes its NUL, so the buffer fits prefix + longest name + NUL */
100 char path
[sizeof(transcoder_lib_prefix
) + MAX_TRANSCODER_LIBNAME_LEN
];
103 if (len
> MAX_TRANSCODER_LIBNAME_LEN
) return NULL
;
/* copy the prefix without its NUL, then the name with its NUL */
104 memcpy(path
, transcoder_lib_prefix
, sizeof(transcoder_lib_prefix
) - 1);
105 memcpy(path
+ sizeof(transcoder_lib_prefix
) - 1, lib
, len
+ 1);
106 if (!rb_require(path
)) return NULL
;
109 if (!st_lookup(transcoder_table
, (st_data_t
)key
, &val
)) {
110 /* multistep logic, via UTF-8 */
111 if (!encoding_equal(from_encoding
, "UTF-8") &&
112 !encoding_equal(to_encoding
, "UTF-8") &&
113 transcode_dispatch("UTF-8", to_encoding
)) { /* check that we have a second step */
114 return transcode_dispatch(from_encoding
, "UTF-8"); /* return first step */
119 return (rb_transcoder
*)val
;
124 * Transcoding engine logic
/*
 * Core conversion loop. Reads input bytes starting at *in_pos up to
 * in_stop and writes converted bytes starting at *out_pos, looking each
 * input byte up in my_transcoder's BYTE_LOOKUP tables (base[] gives an
 * offset, info[] the action for it) and switching on the low five bits of
 * the resulting info value.
 *
 * When the output position reaches the margin kept for one maximal output
 * sequence (my_transcoder->max_output), the buffer is enlarged through
 * my_transcoding->flush_func.
 *
 * Raises RuntimeError for an undefined conversion, and for an invalid byte
 * sequence unless INVALID_IGNORE is set in opt.
 * NOTE(review): most case labels and braces are not visible in this view;
 * the inline notes below describe only the statements that are.
 */
127 transcode_loop(unsigned char **in_pos
, unsigned char **out_pos
,
128 unsigned char *in_stop
, unsigned char *out_stop
,
129 const rb_transcoder
*my_transcoder
,
130 rb_transcoding
*my_transcoding
,
133 unsigned char *in_p
= *in_pos
, *out_p
= *out_pos
;
134 const BYTE_LOOKUP
*conv_tree_start
= my_transcoder
->conv_tree_start
;
135 const BYTE_LOOKUP
*next_table
;
136 unsigned char *char_start
;
137 unsigned int next_offset
;
139 unsigned char next_byte
;
140 int from_utf8
= my_transcoder
->from_utf8
;
/* out_s: once out_p reaches it, the buffer is grown before converting more */
141 unsigned char *out_s
= out_stop
- my_transcoder
->max_output
+ 1;
142 while (in_p
< in_stop
) {
/* every character starts again at the root lookup table */
144 next_table
= conv_tree_start
;
145 if (out_p
>= out_s
) {
146 int len
= (out_p
- *out_pos
);
/* new size: twice (bytes written so far + one maximal output) */
147 int new_len
= (len
+ my_transcoder
->max_output
) * 2;
/* flush_func returns the new buffer start; positions are recomputed from it */
148 *out_pos
= (*my_transcoding
->flush_func
)(my_transcoding
, len
, new_len
);
149 out_p
= *out_pos
+ len
;
150 out_s
= *out_pos
+ new_len
- my_transcoder
->max_output
;
152 next_byte
= (unsigned char)*in_p
++;
/* two-level lookup: base[] maps the byte to an offset, info[] holds the action */
154 next_offset
= next_table
->base
[next_byte
];
155 next_info
= (VALUE
)next_table
->info
[next_offset
];
157 switch (next_info
& 0x1F) {
159 *out_p
++ = next_byte
;
161 case 0x00: case 0x04: case 0x08: case 0x0C:
162 case 0x10: case 0x14: case 0x18: case 0x1C:
163 if (in_p
>= in_stop
) {
164 /* todo: deal with the case of backtracking */
165 /* todo: deal with incomplete input (streaming) */
168 next_byte
= (unsigned char)*in_p
++;
/* tests for the bit pattern 10xxxxxx -- presumably a UTF-8 continuation byte; confirm */
170 if ((next_byte
&0xC0) == 0x80)
173 in_p
--; /* may need to add more code later to revert other things */
/* in this branch the info value is itself a pointer to the next lookup table */
177 next_table
= (const BYTE_LOOKUP
*)next_info
;
179 /* maybe rewrite the following cases to use fallthrough???? */
180 case ZERObt
: /* drop input */
183 *out_p
++ = getBT1(next_info
);
186 *out_p
++ = getBT1(next_info
);
187 *out_p
++ = getBT2(next_info
);
190 *out_p
++ = getBT0(next_info
);
191 case THREEbt
: /* fall through */
192 *out_p
++ = getBT1(next_info
);
193 *out_p
++ = getBT2(next_info
);
194 *out_p
++ = getBT3(next_info
);
/* func_ii: maps the info value to a new info value */
197 next_info
= (VALUE
)(*my_transcoder
->func_ii
)(next_info
);
/* func_si: computes an info value from the bytes at char_start */
200 next_info
= (VALUE
)(*my_transcoder
->func_si
)(char_start
);
/* func_io: writes output for the info value; its result advances out_p */
204 out_p
+= (VALUE
)(*my_transcoder
->func_io
)(next_info
, out_p
);
/* func_so: writes output for the bytes at char_start; its result advances out_p */
207 out_p
+= (VALUE
)(*my_transcoder
->func_so
)(char_start
, out_p
);
212 /* todo: add code for alternate behaviors */
213 rb_raise(rb_eRuntimeError
/*@@@change exception*/, "conversion undefined for byte sequence (maybe invalid byte sequence)");
218 /* deal with invalid byte sequence */
219 /* todo: add more alternative behaviors */
220 if (opt
&INVALID_IGNORE
) {
223 rb_raise(rb_eRuntimeError
/*change exception*/, "invalid byte sequence");
233 * String-specific code
236 static unsigned char *
237 str_transcoding_resize(rb_transcoding
*my_transcoding
, int len
, int new_len
)
239 VALUE dest_string
= my_transcoding
->ruby_string_dest
;
240 rb_str_resize(dest_string
, new_len
);
241 return (unsigned char *)RSTRING_PTR(dest_string
);
/*
 * Shared worker behind String#encode and String#encode!.
 *
 * Arguments (argc/argv): the target encoding, optionally the source
 * encoding, optionally followed by an options Hash. In the Hash, the
 * invalid: entry is read; the value ignore sets INVALID_IGNORE, any other
 * value visible here raises ArgumentError.
 *
 * Each encoding argument is resolved with rb_to_encoding_index(); when that
 * fails the argument is used as a plain name string. Without a source
 * argument the encoding of str is used.
 *
 * Conversion runs in a loop so that a multistep conversion (see
 * transcode_dispatch()) is carried out one step at a time; each step runs
 * the transcoder's preprocessor (if any), transcode_loop(), and the
 * postprocessor (if any), each into a fresh temporary string.
 *
 * Raises ArgumentError for a wrong argument count, an unsupported
 * conversion, or input that was not fully consumed.
 *
 * NOTE(review): the caller treats the int result as an encoding index and a
 * negative value as "nothing changed"; self presumably points at the string
 * to convert and receives the result -- the lines that read and write it
 * are not visible in this view, confirm.
 */
245 str_transcode(int argc
, VALUE
*argv
, VALUE
*self
)
250 unsigned char *buf
, *bp
, *sp
, *fromp
;
251 rb_encoding
*from_enc
, *to_enc
;
252 const char *from_e
, *to_e
;
253 int from_encidx
, to_encidx
;
254 VALUE from_encval
, to_encval
;
255 const rb_transcoder
*my_transcoder
;
256 rb_transcoding my_transcoding
;
257 int final_encoding
= 0;
/* try to convert the last argument to an options Hash */
261 opt
= rb_check_convert_type(argv
[argc
-1], T_HASH
, "Hash", "to_hash");
266 v
= rb_hash_aref(opt
, sym_invalid
);
268 rb_raise(rb_eArgError
, "unknown value for invalid: setting");
270 else if (v
==sym_ignore
) {
271 options
|= INVALID_IGNORE
;
274 if (argc
< 1 || argc
> 2) {
275 rb_raise(rb_eArgError
, "wrong number of arguments (%d for 1..2)", argc
);
/* target encoding: no encoding index found, fall back to the name string */
277 if ((to_encidx
= rb_to_encoding_index(to_encval
= argv
[0])) < 0) {
280 to_e
= StringValueCStr(to_encval
);
283 to_enc
= rb_enc_from_index(to_encidx
);
284 to_e
= rb_enc_name(to_enc
);
/* source encoding taken from the string itself */
287 from_encidx
= rb_enc_get_index(str
);
288 from_enc
= rb_enc_from_index(from_encidx
);
289 from_e
= rb_enc_name(from_enc
);
/* source encoding given explicitly as the second argument */
291 else if ((from_encidx
= rb_to_encoding_index(from_encval
= argv
[1])) < 0) {
293 from_e
= StringValueCStr(from_encval
);
296 from_enc
= rb_enc_from_index(from_encidx
);
297 from_e
= rb_enc_name(from_enc
);
/* the next three tests detect source == target and 7-bit strings between ASCII-compatible encodings */
300 if (from_enc
&& from_enc
== to_enc
) {
303 if (from_enc
&& to_enc
&& rb_enc_asciicompat(from_enc
) && rb_enc_asciicompat(to_enc
)) {
304 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
) {
308 if (encoding_equal(from_e
, to_e
)) {
312 while (!final_encoding
) { /* loop for multistep transcoding */
313 /* later, maybe use smaller intermediate strings for very long strings */
314 if (!(my_transcoder
= transcode_dispatch(from_e
, to_e
))) {
315 rb_raise(rb_eArgError
, "transcoding not supported (from %s to %s)", from_e
, to_e
);
318 my_transcoding
.transcoder
= my_transcoder
;
/* optional preprocessing pass into a fresh temporary string */
320 if (my_transcoder
->preprocessor
) {
321 fromp
= sp
= (unsigned char *)RSTRING_PTR(str
);
322 slen
= RSTRING_LEN(str
);
323 blen
= slen
+ 30; /* len + margin */
324 dest
= rb_str_tmp_new(blen
);
325 bp
= (unsigned char *)RSTRING_PTR(dest
);
326 my_transcoding
.ruby_string_dest
= dest
;
327 (*my_transcoder
->preprocessor
)(&fromp
, &bp
, (sp
+slen
), (bp
+blen
), &my_transcoding
);
328 if (fromp
!= sp
+slen
) {
329 rb_raise(rb_eArgError
, "not fully converted, %d bytes left", sp
+slen
-fromp
);
/* re-read the buffer start and trim dest to the bytes written */
331 buf
= (unsigned char *)RSTRING_PTR(dest
);
333 rb_str_set_len(dest
, bp
- buf
);
/* main conversion pass */
336 fromp
= sp
= (unsigned char *)RSTRING_PTR(str
);
337 slen
= RSTRING_LEN(str
);
338 blen
= slen
+ 30; /* len + margin */
339 dest
= rb_str_tmp_new(blen
);
340 bp
= (unsigned char *)RSTRING_PTR(dest
);
341 my_transcoding
.ruby_string_dest
= dest
;
/* transcode_loop() grows dest through this callback when it runs short */
342 my_transcoding
.flush_func
= str_transcoding_resize
;
344 transcode_loop(&fromp
, &bp
, (sp
+slen
), (bp
+blen
), my_transcoder
, &my_transcoding
, options
);
345 if (fromp
!= sp
+slen
) {
346 rb_raise(rb_eArgError
, "not fully converted, %d bytes left", sp
+slen
-fromp
);
348 buf
= (unsigned char *)RSTRING_PTR(dest
);
350 rb_str_set_len(dest
, bp
- buf
);
/* optional postprocessing pass into another fresh temporary string */
351 if (my_transcoder
->postprocessor
) {
353 fromp
= sp
= (unsigned char *)RSTRING_PTR(str
);
354 slen
= RSTRING_LEN(str
);
355 blen
= slen
+ 30; /* len + margin */
356 dest
= rb_str_tmp_new(blen
);
357 bp
= (unsigned char *)RSTRING_PTR(dest
);
358 my_transcoding
.ruby_string_dest
= dest
;
359 (*my_transcoder
->postprocessor
)(&fromp
, &bp
, (sp
+slen
), (bp
+blen
), &my_transcoding
);
360 if (fromp
!= sp
+slen
) {
361 rb_raise(rb_eArgError
, "not fully converted, %d bytes left", sp
+slen
-fromp
);
363 buf
= (unsigned char *)RSTRING_PTR(dest
);
365 rb_str_set_len(dest
, bp
- buf
);
/* compare this step's output encoding with the requested target */
368 if (encoding_equal(my_transcoder
->to_encoding
, to_e
)) {
/* the next step starts from this step's output encoding */
372 from_e
= my_transcoder
->to_encoding
;
/* define a dummy encoding named to_e and keep its index */
378 to_encidx
= rb_define_dummy_encoding(to_e
);
387 * str.encode!(encoding [, options] ) => str
388 * str.encode!(to_encoding, from_encoding [, options] ) => str
390 * The first form transcodes the contents of <i>str</i> from
391 * str.encoding to +encoding+.
392 * The second form transcodes the contents of <i>str</i> from
393 * from_encoding to to_encoding.
394 * The options Hash gives details for conversion. See String#encode
396 * Returns the string even if no changes were made.
/*
 * Implementation of String#encode! (bound to this function during
 * initialization). Runs str_transcode(); when it yields a negative encoding
 * index, str is returned as is. Otherwise str takes over the contents of the
 * new string, is associated with the resulting encoding, and has its code
 * range set.
 */
400 rb_str_transcode_bang(int argc
, VALUE
*argv
, VALUE str
)
403 int encidx
= str_transcode(argc
, argv
, &newstr
);
406 if (encidx
< 0) return str
;
407 rb_str_shared_replace(str
, newstr
);
408 rb_enc_associate_index(str
, encidx
);
410 /* a transcoded string is never broken. */
/* ASCII-compatible result: scan the bytes to determine the code range */
411 if (rb_enc_asciicompat(rb_enc_from_index(encidx
))) {
412 rb_str_coderange_scan_restartable(RSTRING_PTR(str
), RSTRING_END(str
), 0, &cr
);
415 cr
= ENC_CODERANGE_VALID
;
417 ENC_CODERANGE_SET(str
, cr
);
423 * str.encode(encoding [, options] ) => str
424 * str.encode(to_encoding, from_encoding [, options] ) => str
426 * The first form returns a copy of <i>str</i> transcoded
427 * to encoding +encoding+.
428 * The second form returns a copy of <i>str</i> transcoded
429 * from from_encoding to to_encoding.
430 * The options Hash gives details for conversion. Details
/*
 * Implementation of String#encode (bound to this function during
 * initialization): duplicates str and runs rb_str_transcode_bang() on the
 * duplicate, returning its result.
 */
435 rb_str_transcode(int argc
, VALUE
*argv
, VALUE str
)
/* rebinding the parameter to the duplicate leaves the caller's string alone */
437 str
= rb_str_dup(str
);
438 return rb_str_transcode_bang(argc
, argv
, str
);
/*
 * Initialization statements (the enclosing function header is not visible
 * in this view): create both lookup tables, intern the option symbols, and
 * define String#encode and String#encode!.
 */
444 transcoder_table
= st_init_strcasetable();
445 transcoder_lib_table
= st_init_strcasetable();
447 sym_invalid
= ID2SYM(rb_intern("invalid"));
448 sym_ignore
= ID2SYM(rb_intern("ignore"));
/* -1: the methods receive their arguments as (argc, argv, self) */
450 rb_define_method(rb_cString
, "encode", rb_str_transcode
, -1);
451 rb_define_method(rb_cString
, "encode!", rb_str_transcode_bang
, -1);