1 /**********************************************************************
6 created at: Tue Oct 30 16:10:22 JST 2007
8 Copyright (C) 2007 Martin Duerst
10 **********************************************************************/
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
15 #include "transcode_data.h"
/*
 * Symbols used when reading the options hash (they are initialised with
 * ID2SYM(rb_intern(...)) for "invalid", "undef", "ignore" and "replace"),
 * followed by the bit flags that those options are translated into.
 */
18 static VALUE sym_invalid
, sym_undef
, sym_ignore
, sym_replace
;
/* Flags tested when an invalid byte sequence is met in the input. */
19 #define INVALID_IGNORE 0x1
20 #define INVALID_REPLACE 0x2
/* Flags tested when a valid input character has no mapping in the target. */
21 #define UNDEF_IGNORE 0x10
22 #define UNDEF_REPLACE 0x20
25 * Dispatch data and logic
/*
 * transcoder_table     : key -> registered rb_transcoder
 * transcoder_lib_table : key -> name of the library declared for that key
 * Both are created with st_init_strcasetable(); keys are built by
 * transcoder_key().
 */
28 static st_table
*transcoder_table
, *transcoder_lib_table
;
/* Byte placed between the two encoding names inside a table key. */
30 #define TRANSCODER_INTERNAL_SEPARATOR '\t'
/*
 * Build the table key for an encoding pair.
 *
 * Layout of the xmalloc'ed buffer: the bytes of to_e, then
 * TRANSCODER_INTERNAL_SEPARATOR, then from_e including its terminating NUL
 * (which is why the allocation adds 2 to the two string lengths).
 *
 * NOTE(review): the return type, the braces and the return statement are not
 * visible in this listing.  Callers store the result in a `char *const key`,
 * so the buffer is presumably returned to (and owned by) the caller -- confirm.
 */
33 transcoder_key(const char *from_e
, const char *to_e
)
35 int to_len
= strlen(to_e
);
36 int from_len
= strlen(from_e
);
37 char *const key
= xmalloc(to_len
+ from_len
+ 2);
/* Target name first, without its terminator. */
39 memcpy(key
, to_e
, to_len
);
/* Source name after a one-byte gap; from_len + 1 also copies its NUL. */
40 memcpy(key
+ to_len
+ 1, from_e
, from_len
+ 1);
/* Fill the gap between the two names with the separator. */
41 key
[to_len
] = TRANSCODER_INTERNAL_SEPARATOR
;
/*
 * Register the transcoder `tr` under the key built from its from_encoding
 * and to_encoding.
 *
 * Raises ArgumentError when transcoder_table already has an entry for that
 * key.  An st_delete on transcoder_lib_table is attempted for the key and the
 * transcoder is stored in transcoder_table.
 *
 * NOTE(review): the declarations of `k` and `val`, the remaining rb_raise
 * arguments and the bodies/closing braces of both `if` statements are not
 * visible in this listing.
 */
46 rb_register_transcoder(const rb_transcoder
*tr
)
49 const char *const from_e
= tr
->from_encoding
;
50 const char *const to_e
= tr
->to_encoding
;
51 char *const key
= transcoder_key(from_e
, to_e
);
/* Refuse to register the same encoding pair twice. */
53 if (st_lookup(transcoder_table
, (st_data_t
)key
, &val
)) {
55 rb_raise(rb_eArgError
, "transcoder from %s to %s has been already registered",
/* Remove the library declaration for this key, if there is one. */
59 if (st_delete(transcoder_lib_table
, &k
, &val
)) {
62 st_insert(transcoder_table
, (st_data_t
)key
, (st_data_t
)tr
);
/*
 * Record in transcoder_lib_table that library `lib` is associated with the
 * key built from (to, from).
 *
 * An existing entry for the same key is removed with st_delete first; what is
 * done with the removed entry is not visible in this listing.
 *
 * NOTE(review): `to` is passed as transcoder_key()'s first parameter (from_e)
 * and `from` as its second (to_e).  rb_declare_transcoder() calls this
 * function with both argument orders, so both keys end up declared; confirm
 * the swap is intentional.
 */
66 declare_transcoder(const char *to
, const char *from
, const char *lib
)
68 const char *const key
= transcoder_key(to
, from
);
69 st_data_t k
= (st_data_t
)key
, val
;
71 if (st_delete(transcoder_lib_table
, &k
, &val
)) {
74 st_insert(transcoder_lib_table
, (st_data_t
)key
, (st_data_t
)lib
);
/*
 * Upper bound on the length of a declared library name, and the prefix that
 * is put in front of that name to form the path handed to rb_require().
 */
77 #define MAX_TRANSCODER_LIBNAME_LEN 64
78 static const char transcoder_lib_prefix
[] = "enc/trans/";
/*
 * Associate library `lib` with the encoding pair (enc1, enc2) in both orders.
 *
 * Raises ArgumentError ("invalid library name") when lib is NULL or longer
 * than MAX_TRANSCODER_LIBNAME_LEN.
 *
 * NOTE(review): the return type and the braces are not visible in this
 * listing.
 */
81 rb_declare_transcoder(const char *enc1
, const char *enc2
, const char *lib
)
83 if (!lib
|| strlen(lib
) > MAX_TRANSCODER_LIBNAME_LEN
) {
84 rb_raise(rb_eArgError
, "invalid library name - %s",
85 lib
? lib
: "(null)");
/* Declare the library for both key orders. */
87 declare_transcoder(enc1
, enc2
, lib
);
88 declare_transcoder(enc2
, enc1
, lib
);
/* True when STRCASECMP reports the two encoding names as equal. */
91 #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
/*
 * Find the transcoder to use for from_encoding -> to_encoding.
 *
 * While transcoder_table has no entry for the key but transcoder_lib_table
 * does, the library entry is removed and "enc/trans/<lib>" is loaded with
 * rb_require(); the lookup is then retried (presumably the loaded library
 * registers its transcoders -- confirm).  NULL is returned when the library
 * name is longer than MAX_TRANSCODER_LIBNAME_LEN or rb_require() yields a
 * false value.
 *
 * When there is still no entry and neither side is UTF-8, a two-step route
 * via UTF-8 is tried: if a "UTF-8" -> to_encoding transcoder can be
 * dispatched, the from_encoding -> "UTF-8" transcoder is returned as the
 * first step.  Otherwise the value found in transcoder_table is returned.
 *
 * NOTE(review): `key` is allocated by transcoder_key(); no release of it is
 * visible in this listing, in particular on the early `return NULL` paths.
 * The declarations of `k` and `val` are not visible either.
 */
93 static const rb_transcoder
*
94 transcode_dispatch(const char *from_encoding
, const char *to_encoding
)
96 char *const key
= transcoder_key(from_encoding
, to_encoding
);
99 while (!st_lookup(transcoder_table
, (k
= (st_data_t
)key
), &val
) &&
100 st_delete(transcoder_lib_table
, &k
, &val
)) {
101 const char *const lib
= (const char *)val
;
102 int len
= strlen(lib
);
/* sizeof(prefix) counts the prefix's NUL, which leaves room for lib's NUL. */
103 char path
[sizeof(transcoder_lib_prefix
) + MAX_TRANSCODER_LIBNAME_LEN
];
106 if (len
> MAX_TRANSCODER_LIBNAME_LEN
) return NULL
;
/* Prefix without its terminator, then the library name with its terminator. */
107 memcpy(path
, transcoder_lib_prefix
, sizeof(transcoder_lib_prefix
) - 1);
108 memcpy(path
+ sizeof(transcoder_lib_prefix
) - 1, lib
, len
+ 1);
109 if (!rb_require(path
)) return NULL
;
112 if (!st_lookup(transcoder_table
, (st_data_t
)key
, &val
)) {
114 /* multistep logic, via UTF-8 */
115 if (!encoding_equal(from_encoding
, "UTF-8") &&
116 !encoding_equal(to_encoding
, "UTF-8") &&
117 transcode_dispatch("UTF-8", to_encoding
)) { /* check that we have a second step */
118 return transcode_dispatch(from_encoding
, "UTF-8"); /* return first step */
124 return (rb_transcoder
*)val
;
/*
 * Judging by its name and its call sites (the INVALID_REPLACE and
 * UNDEF_REPLACE branches of transcode_loop), this writes a replacement
 * character at *out_pp in a form chosen by `enc`.
 *
 * The UTF-16BE/LE and UTF-32BE/LE encoding handles are looked up with
 * rb_enc_find() on the first call and kept in function-local statics; only
 * utf16be_encoding is tested to decide whether the lookup is still needed.
 *
 * NOTE(review): the bodies of all five encoding branches, any fallback
 * branch and the write-back to *out_pp are not visible in this listing.
 */
128 output_replacement_character(unsigned char **out_pp
, rb_encoding
*enc
)
130 unsigned char *out_p
= *out_pp
;
131 static rb_encoding
*utf16be_encoding
, *utf16le_encoding
;
132 static rb_encoding
*utf32be_encoding
, *utf32le_encoding
;
/* One-time lookup of the encodings compared against below. */
133 if (!utf16be_encoding
) {
134 utf16be_encoding
= rb_enc_find("UTF-16BE");
135 utf16le_encoding
= rb_enc_find("UTF-16LE");
136 utf32be_encoding
= rb_enc_find("UTF-32BE");
137 utf32le_encoding
= rb_enc_find("UTF-32LE");
/* Select the output form by comparing encoding handles. */
139 if (rb_utf8_encoding() == enc
) {
144 else if (utf16be_encoding
== enc
) {
148 else if (utf16le_encoding
== enc
) {
152 else if (utf32be_encoding
== enc
) {
158 else if (utf32le_encoding
== enc
) {
172 * Transcoding engine logic
/*
 * Core conversion loop: consumes bytes from *in_pos up to in_stop and writes
 * converted bytes starting at *out_pos, driven by the lookup tables of
 * my_transcoder.
 *
 * For each input byte the current table's base[] array gives an offset into
 * its info[] array; the low five bits of that info value select the action
 * (follow another table, emit one to four bytes, call one of the transcoder's
 * func_ii/func_si/func_io/func_so hooks, or handle an invalid/undefined
 * sequence according to the INVALID_xxx and UNDEF_xxx bits of `opt`).
 *
 * When the output cursor reaches the safety threshold out_s, the destination
 * is grown through my_transcoding->flush_func and the local cursors are
 * recomputed from the pointer it returns.
 *
 * NOTE(review): this listing is missing lines -- the return type, the `opt`
 * parameter, the declaration of next_info, most `case` labels, `break`s and
 * the write-back of in_p/out_p are not visible.
 * NOTE(review): out_s is initialised as out_stop - max_output + 1, but after
 * a resize it is set to *out_pos + new_len - max_output (no "+ 1"); confirm
 * which threshold is intended.
 */
175 transcode_loop(const unsigned char **in_pos
, unsigned char **out_pos
,
176 const unsigned char *in_stop
, unsigned char *out_stop
,
177 const rb_transcoder
*my_transcoder
,
178 rb_transcoding
*my_transcoding
,
181 const unsigned char *in_p
= *in_pos
;
182 unsigned char *out_p
= *out_pos
;
183 const BYTE_LOOKUP
*conv_tree_start
= my_transcoder
->conv_tree_start
;
184 const BYTE_LOOKUP
*next_table
;
185 const unsigned char *char_start
;
186 unsigned int next_offset
;
188 unsigned char next_byte
;
189 int from_utf8
= my_transcoder
->from_utf8
;
/* Output cursor value at which the destination must be grown. */
190 unsigned char *out_s
= out_stop
- my_transcoder
->max_output
+ 1;
191 rb_encoding
*from_encoding
= rb_enc_find(my_transcoder
->from_encoding
);
192 rb_encoding
*to_encoding
= rb_enc_find(my_transcoder
->to_encoding
);
194 while (in_p
< in_stop
) {
/* Every character starts at the root of the conversion tree. */
196 next_table
= conv_tree_start
;
197 if (out_p
>= out_s
) {
/* Grow the destination and rebase the cursors on the new buffer. */
198 int len
= (out_p
- *out_pos
);
199 int new_len
= (len
+ my_transcoder
->max_output
) * 2;
200 *out_pos
= (*my_transcoding
->flush_func
)(my_transcoding
, len
, new_len
);
201 out_p
= *out_pos
+ len
;
202 out_s
= *out_pos
+ new_len
- my_transcoder
->max_output
;
204 next_byte
= (unsigned char)*in_p
++;
/* Two-level lookup: byte -> offset -> action/info word. */
206 next_offset
= next_table
->base
[next_byte
];
207 next_info
= (VALUE
)next_table
->info
[next_offset
];
209 switch (next_info
& 0x1F) {
211 *out_p
++ = next_byte
;
/* Info words with the two low bits clear are used as pointers to the next table. */
213 case 0x00: case 0x04: case 0x08: case 0x0C:
214 case 0x10: case 0x14: case 0x18: case 0x1C:
215 if (in_p
>= in_stop
) {
216 /* todo: deal with the case of backtracking */
217 /* todo: deal with incomplete input (streaming) */
220 next_byte
= (unsigned char)*in_p
++;
222 if ((next_byte
&0xC0) == 0x80)
225 in_p
--; /* may need to add more code later to revert other things */
229 next_table
= (const BYTE_LOOKUP
*)next_info
;
231 /* maybe rewrite the following cases to use fallthrough???? */
232 case ZERObt
: /* drop input */
235 *out_p
++ = getBT1(next_info
);
238 *out_p
++ = getBT1(next_info
);
239 *out_p
++ = getBT2(next_info
);
242 *out_p
++ = getBT0(next_info
);
243 case THREEbt
: /* fall through */
244 *out_p
++ = getBT1(next_info
);
245 *out_p
++ = getBT2(next_info
);
246 *out_p
++ = getBT3(next_info
);
/* Transcoder hooks: func_ii/func_si yield a new info word, func_io/func_so advance out_p. */
249 next_info
= (VALUE
)(*my_transcoder
->func_ii
)(next_info
);
252 next_info
= (VALUE
)(*my_transcoder
->func_si
)(char_start
);
256 out_p
+= (VALUE
)(*my_transcoder
->func_io
)(next_info
, out_p
);
259 out_p
+= (VALUE
)(*my_transcoder
->func_so
)(char_start
, out_p
);
268 /* deal with invalid byte sequence */
269 /* todo: add more alternative behaviors */
270 if (opt
&INVALID_IGNORE
) {
273 else if (opt
&INVALID_REPLACE
) {
274 output_replacement_character(&out_p
, to_encoding
);
277 rb_raise(TRANSCODE_ERROR
, "invalid byte sequence");
280 /* valid character in from encoding
281 * but no related character(s) in to encoding */
282 /* todo: add more alternative behaviors */
/* Skip the remaining bytes of the unconvertible character. */
284 int len
= rb_enc_mbclen((const char *)char_start
, (const char *)in_stop
, from_encoding
);
285 while (in_p
< char_start
+ len
) in_p
++;
287 if (opt
&UNDEF_IGNORE
) {
290 else if (opt
&UNDEF_REPLACE
) {
291 output_replacement_character(&out_p
, to_encoding
);
294 rb_raise(TRANSCODE_ERROR
, "conversion undefined for byte sequence (maybe invalid byte sequence)");
304 * String-specific code
/*
 * Resize the destination Ruby string of `my_transcoding` to new_len bytes and
 * return a pointer to its data.  str_transcode installs this function as
 * flush_func, and transcode_loop recomputes its output cursors from the
 * returned pointer.
 *
 * `len` is not used in the lines visible here.
 */
307 static unsigned char *
308 str_transcoding_resize(rb_transcoding
*my_transcoding
, int len
, int new_len
)
310 VALUE dest_string
= my_transcoding
->ruby_string_dest
;
311 rb_str_resize(dest_string
, new_len
);
312 return (unsigned char *)RSTRING_PTR(dest_string
);
/*
 * Shared implementation behind String#encode and String#encode!.
 *
 * Visible steps:
 *   1. If the last argument converts to a Hash, its :invalid and :undef
 *      entries are translated into INVALID_xxx / UNDEF_xxx option bits
 *      (ArgumentError for an unknown value).
 *   2. After that, 1 or 2 arguments are accepted: the target encoding and,
 *      optionally, the source encoding.  Each is resolved with
 *      rb_to_encoding_index(); when that fails the argument's string value is
 *      used as the encoding name.
 *   3. Several shortcut checks follow (same encoding object, both encodings
 *      ASCII-compatible with a 7-bit string, equal encoding names).
 *   4. A do/while loop performs the conversion, possibly in several steps:
 *      each step obtains a transcoder from transcode_dispatch(), runs its
 *      optional preprocessor, transcode_loop(), and its optional
 *      postprocessor, each into a fresh temporary string, and raises
 *      ArgumentError if input bytes are left over.
 *
 * Callers treat a negative return value as "nothing to transcode" and
 * otherwise use it as the encoding index for the string stored through
 * `self`.
 *
 * NOTE(review): many lines are missing from this listing (return type,
 * declarations of str/dest/opt/v/options/slen/blen, the bodies of the
 * shortcut branches, the hand-over of `dest` to `str` between stages, and the
 * final stores and return), so the points above describe only what is shown.
 */
316 str_transcode(int argc
, VALUE
*argv
, VALUE
*self
)
321 unsigned char *buf
, *bp
, *sp
;
322 const unsigned char *fromp
;
323 rb_encoding
*from_enc
, *to_enc
;
324 const char *from_e
, *to_e
;
325 int from_encidx
, to_encidx
;
326 VALUE from_encval
, to_encval
;
327 const rb_transcoder
*my_transcoder
;
328 rb_transcoding my_transcoding
;
329 int final_encoding
= 0;
/* Try to interpret the last argument as the options hash. */
333 opt
= rb_check_convert_type(argv
[argc
-1], T_HASH
, "Hash", "to_hash");
/* :invalid option */
338 v
= rb_hash_aref(opt
, sym_invalid
);
341 else if (v
==sym_ignore
) {
342 options
|= INVALID_IGNORE
;
344 else if (v
==sym_replace
) {
345 options
|= INVALID_REPLACE
;
346 v
= rb_hash_aref(opt
, sym_replace
);
349 rb_raise(rb_eArgError
, "unknown value for invalid: setting");
/* :undef option */
351 v
= rb_hash_aref(opt
, sym_undef
);
354 else if (v
==sym_ignore
) {
355 options
|= UNDEF_IGNORE
;
357 else if (v
==sym_replace
) {
358 options
|= UNDEF_REPLACE
;
361 rb_raise(rb_eArgError
, "unknown value for undef: setting");
364 if (argc
< 1 || argc
> 2) {
365 rb_raise(rb_eArgError
, "wrong number of arguments (%d for 1..2)", argc
);
/* Target encoding: a known encoding index, or else just a name. */
367 if ((to_encidx
= rb_to_encoding_index(to_encval
= argv
[0])) < 0) {
370 to_e
= StringValueCStr(to_encval
);
373 to_enc
= rb_enc_from_index(to_encidx
);
374 to_e
= rb_enc_name(to_enc
);
/* Source encoding: taken from the string itself ... */
377 from_encidx
= rb_enc_get_index(str
);
378 from_enc
= rb_enc_from_index(from_encidx
);
379 from_e
= rb_enc_name(from_enc
);
/* ... or from the second argument. */
381 else if ((from_encidx
= rb_to_encoding_index(from_encval
= argv
[1])) < 0) {
383 from_e
= StringValueCStr(from_encval
);
386 from_enc
= rb_enc_from_index(from_encidx
);
387 from_e
= rb_enc_name(from_enc
);
/* Shortcut checks; their bodies are not visible in this listing. */
390 if (from_enc
&& from_enc
== to_enc
) {
393 if (from_enc
&& to_enc
&& rb_enc_asciicompat(from_enc
) && rb_enc_asciicompat(to_enc
)) {
394 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
) {
398 if (encoding_equal(from_e
, to_e
)) {
402 do { /* loop for multistep transcoding */
403 /* later, maybe use smaller intermediate strings for very long strings */
404 if (!(my_transcoder
= transcode_dispatch(from_e
, to_e
))) {
405 rb_raise(rb_eArgError
, "transcoding not supported (from %s to %s)", from_e
, to_e
);
408 my_transcoding
.transcoder
= my_transcoder
;
/* Optional preprocessing stage into a temporary string. */
410 if (my_transcoder
->preprocessor
) {
411 fromp
= sp
= (unsigned char *)RSTRING_PTR(str
);
412 slen
= RSTRING_LEN(str
);
413 blen
= slen
+ 30; /* len + margin */
414 dest
= rb_str_tmp_new(blen
);
415 bp
= (unsigned char *)RSTRING_PTR(dest
);
416 my_transcoding
.ruby_string_dest
= dest
;
417 (*my_transcoder
->preprocessor
)(&fromp
, &bp
, (sp
+slen
), (bp
+blen
), &my_transcoding
);
418 if (fromp
!= sp
+slen
) {
419 rb_raise(rb_eArgError
, "not fully converted, %"PRIdPTRDIFF
" bytes left", sp
+slen
-fromp
);
421 rb_str_set_len(dest
, (char *)bp
- RSTRING_PTR(dest
));
/* Main table-driven conversion stage. */
424 fromp
= sp
= (unsigned char *)RSTRING_PTR(str
);
425 slen
= RSTRING_LEN(str
);
426 blen
= slen
+ 30; /* len + margin */
427 dest
= rb_str_tmp_new(blen
);
428 bp
= (unsigned char *)RSTRING_PTR(dest
);
429 my_transcoding
.ruby_string_dest
= dest
;
/* transcode_loop calls this to grow `dest` when it runs out of room. */
430 my_transcoding
.flush_func
= str_transcoding_resize
;
432 transcode_loop(&fromp
, &bp
, (sp
+slen
), (bp
+blen
), my_transcoder
, &my_transcoding
, options
);
433 if (fromp
!= sp
+slen
) {
434 rb_raise(rb_eArgError
, "not fully converted, %"PRIdPTRDIFF
" bytes left", sp
+slen
-fromp
);
/* Re-read the data pointer: dest may have been resized during the loop. */
436 buf
= (unsigned char *)RSTRING_PTR(dest
);
438 rb_str_set_len(dest
, bp
- buf
);
/* Optional postprocessing stage into another temporary string. */
439 if (my_transcoder
->postprocessor
) {
441 fromp
= sp
= (unsigned char *)RSTRING_PTR(str
);
442 slen
= RSTRING_LEN(str
);
443 blen
= slen
+ 30; /* len + margin */
444 dest
= rb_str_tmp_new(blen
);
445 bp
= (unsigned char *)RSTRING_PTR(dest
);
446 my_transcoding
.ruby_string_dest
= dest
;
447 (*my_transcoder
->postprocessor
)(&fromp
, &bp
, (sp
+slen
), (bp
+blen
), &my_transcoding
);
448 if (fromp
!= sp
+slen
) {
449 rb_raise(rb_eArgError
, "not fully converted, %"PRIdPTRDIFF
" bytes left", sp
+slen
-fromp
);
451 buf
= (unsigned char *)RSTRING_PTR(dest
);
452 rb_str_set_len(dest
, bp
- buf
);
/* Check whether this step reached the requested target encoding; the next step starts from this step's output encoding. */
455 if (encoding_equal(my_transcoder
->to_encoding
, to_e
)) {
459 from_e
= my_transcoder
->to_encoding
;
462 } while (!final_encoding
);
/* NOTE(review): the condition guarding this dummy-encoding definition is not visible here. */
465 to_encidx
= rb_define_dummy_encoding(to_e
);
/*
 * Attach encoding index `encidx` to `str` and set the string's code range.
 *
 * For an ASCII-compatible encoding the code range is computed by scanning the
 * string; the other visible assignment sets it to ENC_CODERANGE_VALID
 * (presumably the `else` branch -- confirm).
 *
 * NOTE(review): the return type, the declaration of `cr`, the `else` and the
 * return statement are not visible in this listing; callers return this
 * function's result as a VALUE.
 */
473 str_encode_associate(VALUE str
, int encidx
)
477 rb_enc_associate_index(str
, encidx
);
479 /* a transcoded string is never broken. */
480 if (rb_enc_asciicompat(rb_enc_from_index(encidx
))) {
481 rb_str_coderange_scan_restartable(RSTRING_PTR(str
), RSTRING_END(str
), 0, &cr
);
484 cr
= ENC_CODERANGE_VALID
;
486 ENC_CODERANGE_SET(str
, cr
);
492 * str.encode!(encoding [, options] ) => str
493 * str.encode!(to_encoding, from_encoding [, options] ) => str
495 * The first form transcodes the contents of <i>str</i> from
496 * str.encoding to +encoding+.
497 * The second form transcodes the contents of <i>str</i> from
498 * from_encoding to to_encoding.
499 * The options Hash gives details for conversion. See String#encode
501 * Returns the string even if no changes were made.
/*
 * Implementation registered as String#encode!.
 *
 * Runs str_transcode(); a negative result returns `str` untouched.
 * Otherwise the contents of `str` are replaced by the transcoded string and
 * the resulting encoding index is associated with `str`.
 *
 * NOTE(review): the return type and the declaration of `newstr` are not
 * visible in this listing.
 */
505 str_encode_bang(int argc
, VALUE
*argv
, VALUE str
)
508 int encidx
= str_transcode(argc
, argv
, &newstr
);
510 if (encidx
< 0) return str
;
511 rb_str_shared_replace(str
, newstr
);
512 return str_encode_associate(str
, encidx
);
517 * str.encode(encoding [, options] ) => str
518 * str.encode(to_encoding, from_encoding [, options] ) => str
520 * The first form returns a copy of <i>str</i> transcoded
521 * to encoding +encoding+.
522 * The second form returns a copy of <i>str</i> transcoded
523 * from from_encoding to to_encoding.
524 * The options Hash gives details for conversion. Details
/*
 * Implementation registered as String#encode.
 *
 * Runs str_transcode(); a negative result returns a duplicate of `str`.
 * Otherwise the transcoded string is given the class of `str`, associated
 * with the resulting encoding index and returned.
 *
 * NOTE(review): the return type and the declaration of `newstr` are not
 * visible in this listing.
 */
529 str_encode(int argc
, VALUE
*argv
, VALUE str
)
532 int encidx
= str_transcode(argc
, argv
, &newstr
);
534 if (encidx
< 0) return rb_str_dup(str
);
/* Make the result an instance of the receiver's class. */
535 RBASIC(newstr
)->klass
= rb_obj_class(str
);
536 return str_encode_associate(newstr
, encidx
);
/*
 * Convenience entry point: transcode `str` to encoding `to` by calling
 * str_encode() with `to` as its only argument.
 */
540 rb_str_transcode(VALUE str
, VALUE to
)
542 return str_encode(1, &to
, str
);
/*
 * Initialisation statements (the enclosing function header is not visible in
 * this listing): create the two transcoder tables, intern the option
 * symbols, and define String#encode and String#encode!.
 */
548 transcoder_table
= st_init_strcasetable();
549 transcoder_lib_table
= st_init_strcasetable();
551 sym_invalid
= ID2SYM(rb_intern("invalid"));
552 sym_undef
= ID2SYM(rb_intern("undef"));
553 sym_ignore
= ID2SYM(rb_intern("ignore"));
554 sym_replace
= ID2SYM(rb_intern("replace"));
/* Arity -1: the methods receive their arguments as (argc, argv, self). */
556 rb_define_method(rb_cString
, "encode", str_encode
, -1);
557 rb_define_method(rb_cString
, "encode!", str_encode_bang
, -1);