1 /**********************************************************************
6 created at: Tue Oct 30 16:10:22 JST 2007
8 Copyright (C) 2007 Martin Duerst
10 **********************************************************************/
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
15 #include "transcode_data.h"
/* Symbols used with the options Hash in str_transcode (looked up as keys or
 * compared against option values); they are interned in Init_transcode. */
18 static VALUE sym_invalid
, sym_undef
, sym_ignore
, sym_replace
;
/* Option bits. str_transcode sets INVALID_* from the :invalid option and
 * UNDEF_* from the :undef option; transcode_loop tests them when a conversion
 * reports invalid input or an undefined conversion. PARTIAL_INPUT is tested
 * by the converter when it runs out of input bytes. */
19 #define INVALID_IGNORE 0x1
20 #define INVALID_REPLACE 0x2
21 #define UNDEF_IGNORE 0x10
22 #define UNDEF_REPLACE 0x20
23 #define PARTIAL_INPUT 0x100
26 * Dispatch data and logic
/* Members of a transcoder table entry (used below as transcoder_entry_t).
 * NOTE(review): the enclosing typedef lines are not part of this extract. */
32 const char *lib
; /* May be NULL, which means there is no library to load. */
/* The registered transcoder, or NULL while none has been registered. */
33 const rb_transcoder
*transcoder
;
/* Two-level registry: source encoding name -> (st_table mapping destination
 * encoding name -> transcoder_entry_t *). Created in Init_transcode with
 * st_init_strcasetable(). */
36 static st_table
*transcoder_table
;
/*
 * Return the table entry for the (from, to) encoding pair, creating whatever
 * is missing: the inner table for `from` is created and registered when
 * absent, and a new entry (with transcoder set to NULL) is allocated and
 * stored when `to` is not yet in that inner table.
 *
 * NOTE(review): the declarations of val/table2 and the closing braces are not
 * part of this extract.
 */
38 static transcoder_entry_t
*
39 make_transcoder_entry(const char *from
, const char *to
)
44 if (!st_lookup(transcoder_table
, (st_data_t
)from
, &val
)) {
/* No inner table for `from` yet: create one and register it. */
45 val
= (st_data_t
)st_init_strcasetable();
46 st_add_direct(transcoder_table
, (st_data_t
)from
, val
);
48 table2
= (st_table
*)val
;
49 if (!st_lookup(table2
, (st_data_t
)to
, &val
)) {
/* No entry for `to` yet: allocate one with no transcoder attached. */
50 transcoder_entry_t
*entry
= ALLOC(transcoder_entry_t
);
54 entry
->transcoder
= NULL
;
55 val
= (st_data_t
)entry
;
56 st_add_direct(table2
, (st_data_t
)to
, val
);
58 return (transcoder_entry_t
*)val
;
/*
 * Look up the existing entry for the (from, to) encoding pair: first the
 * inner table for `from` in transcoder_table, then `to` in that inner table.
 * Unlike make_transcoder_entry, nothing is created here.
 *
 * NOTE(review): the statements executed when either lookup fails are not part
 * of this extract.
 */
61 static transcoder_entry_t
*
62 get_transcoder_entry(const char *from
, const char *to
)
67 if (!st_lookup(transcoder_table
, (st_data_t
)from
, &val
)) {
70 table2
= (st_table
*)val
;
71 if (!st_lookup(table2
, (st_data_t
)to
, &val
)) {
74 return (transcoder_entry_t
*)val
;
/*
 * Register the transcoder `tr` under its from_encoding/to_encoding names.
 * The entry is obtained (and created if necessary) with
 * make_transcoder_entry; if it already carries a transcoder an ArgumentError
 * is raised, otherwise `tr` is stored in it.
 */
78 rb_register_transcoder(const rb_transcoder
*tr
)
80 const char *const from_e
= tr
->from_encoding
;
81 const char *const to_e
= tr
->to_encoding
;
83 transcoder_entry_t
*entry
;
85 entry
= make_transcoder_entry(from_e
, to_e
);
/* Refuse to overwrite a transcoder that is already registered. */
86 if (entry
->transcoder
) {
87 rb_raise(rb_eArgError
, "transcoder from %s to %s has been already registered",
91 entry
->transcoder
= tr
;
/*
 * Obtain (creating it if necessary) the table entry for converting `from`
 * into `to`. Note the parameter order: destination first, then source.
 *
 * NOTE(review): what is done with `lib` is not visible in this extract.
 */
95 declare_transcoder(const char *to
, const char *from
, const char *lib
)
97 transcoder_entry_t
*entry
;
99 entry
= make_transcoder_entry(from
, to
);
/* Longest library name accepted by rb_declare_transcoder and load_transcoder;
 * it also sizes the path buffer in load_transcoder. */
103 #define MAX_TRANSCODER_LIBNAME_LEN 64
/* Prefix put in front of the library name to form the path handed to
 * rb_require in load_transcoder. */
104 static const char transcoder_lib_prefix
[] = "enc/trans/";
/*
 * Declare that library `lib` covers conversion between enc1 and enc2.
 * Raises ArgumentError when `lib` is NULL or longer than
 * MAX_TRANSCODER_LIBNAME_LEN; otherwise declare_transcoder is called once for
 * each direction.
 */
107 rb_declare_transcoder(const char *enc1
, const char *enc2
, const char *lib
)
109 if (!lib
|| strlen(lib
) > MAX_TRANSCODER_LIBNAME_LEN
) {
110 rb_raise(rb_eArgError
, "invalid library name - %s",
111 lib
? lib
: "(null)");
/* Declare both directions. */
113 declare_transcoder(enc1
, enc2
, lib
);
114 declare_transcoder(enc2
, enc1
, lib
);
/* True when two encoding names compare equal under STRCASECMP. */
117 #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
/* Node of the singly linked work queue used by transcode_search_path.
 * NOTE(review): only the `next` link is visible in this extract; the search
 * code also reads an `enc` member of this type. */
119 typedef struct search_path_queue_tag
{
120 struct search_path_queue_tag
*next
;
122 } search_path_queue_t
;
/* Members of the search state shared between transcode_search_path and its
 * st_foreach callback (used below as search_path_bfs_t).
 * NOTE(review): the enclosing typedef lines and the `visited` member are not
 * part of this extract. */
/* Head of the queue of encodings still to be expanded. */
126 search_path_queue_t
*queue
;
/* Address of the link where the next node is to be appended. */
127 search_path_queue_t
**queue_last_ptr
;
/* Encoding currently being expanded; recorded as the predecessor of each
 * newly reached encoding. */
128 const char *base_enc
;
/*
 * st_foreach callback used by transcode_search_path on the inner table of the
 * encoding being expanded. `key` is a destination encoding name and `arg` is
 * the search_path_bfs_t state.
 *
 * An encoding not yet present in bfs->visited gets a new queue node appended
 * through queue_last_ptr and is recorded in visited with bfs->base_enc as its
 * predecessor.
 *
 * NOTE(review): the branch taken when the encoding was already visited, the
 * initialization of the new node and the return value are not part of this
 * extract.
 */
132 transcode_search_path_i(st_data_t key
, st_data_t val
, st_data_t arg
)
134 const char *to
= (const char *)key
;
135 search_path_bfs_t
*bfs
= (search_path_bfs_t
*)arg
;
136 search_path_queue_t
*q
;
138 if (st_lookup(bfs
->visited
, (st_data_t
)to
, &val
)) {
142 q
= ALLOC(search_path_queue_t
);
/* Append the node at the tail and move the tail pointer to its link. */
145 *bfs
->queue_last_ptr
= q
;
146 bfs
->queue_last_ptr
= &q
->next
;
148 st_add_direct(bfs
->visited
, (st_data_t
)to
, (st_data_t
)bfs
->base_enc
);
/*
 * Search transcoder_table for a chain of conversions leading from `from` to
 * `to`, driven by a work queue (state type search_path_bfs_t).
 *
 * bfs.visited maps every reached encoding name to the encoding it was reached
 * from; `from` itself maps to NULL. For each dequeued encoding the inner
 * table is consulted: if it contains `to`, the predecessor of `to` is
 * recorded, otherwise every entry of the inner table is visited through
 * transcode_search_path_i.
 *
 * Afterwards the chain is walked backwards from `to` through bfs.visited, and
 * `callback` is invoked for each step as
 * callback(predecessor, encoding, --depth, arg). bfs.visited is freed at the
 * end.
 *
 * NOTE(review): loop headers, queue node release, depth counting and the
 * return value are not part of this extract.
 */
153 transcode_search_path(const char *from
, const char *to
,
154 void (*callback
)(const char *from
, const char *to
, int depth
, void *arg
),
157 search_path_bfs_t bfs
;
158 search_path_queue_t
*q
;
/* Seed the queue with a first node. */
163 q
= ALLOC(search_path_queue_t
);
166 bfs
.queue_last_ptr
= &q
->next
;
169 bfs
.visited
= st_init_strcasetable();
170 st_add_direct(bfs
.visited
, (st_data_t
)from
, (st_data_t
)NULL
);
176 bfs
.queue_last_ptr
= &bfs
.queue
;
178 if (!st_lookup(transcoder_table
, (st_data_t
)q
->enc
, &val
)) {
182 table2
= (st_table
*)val
;
/* Direct conversion from q->enc to the target exists: record it. */
184 if (st_lookup(table2
, (st_data_t
)to
, &val
)) {
185 st_add_direct(bfs
.visited
, (st_data_t
)to
, (st_data_t
)q
->enc
);
/* Otherwise enqueue every encoding reachable from q->enc. */
191 bfs
.base_enc
= q
->enc
;
192 st_foreach(table2
, transcode_search_path_i
, (st_data_t
)&bfs
);
/* Walk the predecessor chain backwards, starting at the target. */
207 const char *enc
= to
;
210 st_lookup(bfs
.visited
, (st_data_t
)enc
, &val
);
214 enc
= (const char *)val
;
218 st_lookup(bfs
.visited
, (st_data_t
)enc
, &val
);
221 callback((const char *)val
, enc
, --depth
, arg
);
222 enc
= (const char *)val
;
226 st_free_table(bfs
.visited
);
/*
 * Return the transcoder attached to `entry`, loading its library on demand.
 *
 * If entry->transcoder is already set it is returned directly. Otherwise the
 * path transcoder_lib_prefix + entry->lib is assembled in a stack buffer and
 * passed to rb_require; entry->transcoder is then returned if it has been set
 * in the meantime.
 *
 * NOTE(review): the guard on entry->lib, the failure branches and the final
 * fallback return are not part of this extract.
 */
231 static const rb_transcoder
*
232 load_transcoder(transcoder_entry_t
*entry
)
234 if (entry
->transcoder
)
235 return entry
->transcoder
;
238 const char *lib
= entry
->lib
;
/* NOTE(review): strlen() returns size_t; it is stored in an int here. */
239 int len
= strlen(lib
);
/* sizeof(prefix) includes its NUL, so the buffer fits prefix + name + NUL. */
240 char path
[sizeof(transcoder_lib_prefix
) + MAX_TRANSCODER_LIBNAME_LEN
];
244 if (len
> MAX_TRANSCODER_LIBNAME_LEN
)
/* Copy the prefix without its NUL, then the name including its NUL. */
246 memcpy(path
, transcoder_lib_prefix
, sizeof(transcoder_lib_prefix
) - 1);
247 memcpy(path
+ sizeof(transcoder_lib_prefix
) - 1, lib
, len
+ 1);
248 if (!rb_require(path
))
252 if (entry
->transcoder
)
253 return entry
->transcoder
;
/*
 * Return the byte sequence used as replacement character for encoding `enc`.
 *
 * The UTF-16/UTF-32 encoding objects are looked up once with rb_enc_find and
 * cached in function-level statics. Visible results: "\xEF\xBF\xBD" for
 * UTF-8, "\x00\x00\xFF\xFD" for UTF-32BE and "\xFD\xFF\x00\x00" for UTF-32LE.
 *
 * NOTE(review): the stores to *len_ret, the UTF-16 return values and the
 * fallback for other encodings are not part of this extract.
 */
259 get_replacement_character(rb_encoding
*enc
, int *len_ret
)
261 static rb_encoding
*utf16be_encoding
, *utf16le_encoding
;
262 static rb_encoding
*utf32be_encoding
, *utf32le_encoding
;
/* First call: resolve and cache the encoding objects. */
263 if (!utf16be_encoding
) {
264 utf16be_encoding
= rb_enc_find("UTF-16BE");
265 utf16le_encoding
= rb_enc_find("UTF-16LE");
266 utf32be_encoding
= rb_enc_find("UTF-32BE");
267 utf32le_encoding
= rb_enc_find("UTF-32LE");
269 if (rb_utf8_encoding() == enc
) {
271 return "\xEF\xBF\xBD";
273 else if (utf16be_encoding
== enc
) {
277 else if (utf16le_encoding
== enc
) {
281 else if (utf32be_encoding
== enc
) {
283 return "\x00\x00\xFF\xFD";
285 else if (utf32le_encoding
== enc
) {
287 return "\xFD\xFF\x00\x00";
296 * Transcoding engine logic
/*
 * Locate the first byte of the character currently being converted and store
 * its length so far in *char_len_ptr.
 *
 * tc->recognized_len bytes of the character have already been recognized. If
 * fewer than that many bytes lie between in_start and inchar_start, the bytes
 * [inchar_start, in_p) are appended to the transcoding's read buffer after
 * the recognized bytes and the read buffer is used as the character start.
 * Otherwise the start is inchar_start - tc->recognized_len.
 *
 * NOTE(review): the `else` line and the return statement are not part of
 * this extract.
 */
299 static const unsigned char *
300 transcode_char_start(rb_transcoding
*tc
,
301 const unsigned char *in_start
,
302 const unsigned char *inchar_start
,
303 const unsigned char *in_p
,
304 size_t *char_len_ptr
)
306 const unsigned char *ptr
;
307 if (inchar_start
- in_start
< tc
->recognized_len
) {
308 MEMCPY(TRANSCODING_READBUF(tc
) + tc
->recognized_len
,
309 inchar_start
, unsigned char, in_p
- inchar_start
);
310 ptr
= TRANSCODING_READBUF(tc
);
313 ptr
= inchar_start
- tc
->recognized_len
;
315 *char_len_ptr
= tc
->recognized_len
+ (in_p
- inchar_start
);
/*
 * Resumable conversion core.
 *
 * Reads bytes starting at *in_pos (bounded by in_stop) and writes converted
 * bytes through out_p (bounded by out_stop). Each character starts at the
 * transcoder's conv_tree_start table; the action selected by next_info
 * decides whether to read another byte, emit one to four bytes, call one of
 * the transcoder's func_ii/func_si/func_io/func_so hooks, or report an error.
 *
 * The function can stop at many points. The SUSPEND macro stores the resume
 * position, the bytes read so far for the current character and
 * next_table/next_info/next_byte in `tc`, and declares resume_label<num>;
 * the switch on tc->resume_position jumps back to that label on re-entry.
 *
 * NOTE(review): many original lines (braces, several case labels, the
 * macro's return statement, the remaining parameters) are not part of this
 * extract; the comments describe only what is visible.
 */
319 static rb_trans_result_t
320 transcode_restartable0(const unsigned char **in_pos
, unsigned char **out_pos
,
321 const unsigned char *in_stop
, unsigned char *out_stop
,
326 const rb_transcoder
*tr
= tc
->transcoder
;
327 int unitlen
= tr
->input_unit_length
;
328 int readagain_len
= 0;
330 const unsigned char *inchar_start
;
331 const unsigned char *in_p
;
333 unsigned char *out_p
;
334 const BYTE_LOOKUP
*next_table
;
336 unsigned char next_byte
;
/* Dummy one-byte buffer substituted for in_pos / out_pos / out_stop below.
 * NOTE(review): the conditions guarding these substitutions are not visible. */
338 unsigned char empty_buf
;
339 unsigned char *empty_ptr
= &empty_buf
;
342 in_pos
= (const unsigned char **)&empty_ptr
;
347 out_pos
= &empty_ptr
;
348 out_stop
= empty_ptr
;
351 in_p
= inchar_start
= *in_pos
;
/* Restore the table-walk state saved by the previous SUSPEND. */
354 next_table
= tc
->next_table
;
355 next_info
= tc
->next_info
;
356 next_byte
= tc
->next_byte
;
/* Save the converter state in tc so that a later call can continue at
 * resume_label<num>: unread bytes of the current character are moved into
 * the read buffer, and readagain_len bytes are marked to be read again. */
358 #define SUSPEND(ret, num) \
360 tc->resume_position = (num); \
361 if (0 < in_p - inchar_start) \
362 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
363 inchar_start, unsigned char, in_p - inchar_start); \
366 tc->recognized_len += in_p - inchar_start; \
367 if (readagain_len) { \
368 tc->recognized_len -= readagain_len; \
369 tc->readagain_len = readagain_len; \
371 tc->next_table = next_table; \
372 tc->next_info = next_info; \
373 tc->next_byte = next_byte; \
375 resume_label ## num:; \
378 switch (tc
->resume_position
) {
380 case 1: goto resume_label1
;
381 case 2: goto resume_label2
;
382 case 3: goto resume_label3
;
383 case 4: goto resume_label4
;
384 case 5: goto resume_label5
;
385 case 6: goto resume_label6
;
386 case 7: goto resume_label7
;
387 case 8: goto resume_label8
;
388 case 9: goto resume_label9
;
389 case 10: goto resume_label10
;
390 case 11: goto resume_label11
;
391 case 12: goto resume_label12
;
392 case 13: goto resume_label13
;
393 case 14: goto resume_label14
;
/* Out of input before a new character is started. */
397 if (in_stop
<= in_p
) {
398 if (!(opt
& PARTIAL_INPUT
))
400 SUSPEND(transcode_ibuf_empty
, 7);
/* Start a new character at the root of the conversion tree. */
404 tc
->recognized_len
= 0;
406 next_table
= tr
->conv_tree_start
;
407 next_byte
= (unsigned char)*in_p
++;
/* base[0]..base[1] is the byte range covered by the current table. */
409 if (next_byte
< next_table
->base
[0] || next_table
->base
[1] < next_byte
)
412 unsigned int next_offset
= next_table
->base
[2+next_byte
-next_table
->base
[0]];
413 next_info
= (VALUE
)next_table
->info
[next_offset
];
/* The low five bits of next_info select the action. */
416 switch (next_info
& 0x1F) {
417 case NOMAP
: /* xxx: copy last byte only? */
418 while (out_stop
- out_p
< 1) { SUSPEND(transcode_obuf_full
, 3); }
419 *out_p
++ = next_byte
;
/* These values mean next_info is a pointer to the next lookup table:
 * fetch another input byte and continue the walk there. */
421 case 0x00: case 0x04: case 0x08: case 0x0C:
422 case 0x10: case 0x14: case 0x18: case 0x1C:
423 while (in_p
>= in_stop
) {
424 if (!(opt
& PARTIAL_INPUT
))
426 SUSPEND(transcode_ibuf_empty
, 5);
428 next_byte
= (unsigned char)*in_p
++;
429 next_table
= (const BYTE_LOOKUP
*)next_info
;
431 case ZERObt
: /* drop input */
/* One-byte output. */
434 while (out_stop
- out_p
< 1) { SUSPEND(transcode_obuf_full
, 9); }
435 *out_p
++ = getBT1(next_info
);
/* Two-byte output. */
438 while (out_stop
- out_p
< 2) { SUSPEND(transcode_obuf_full
, 10); }
439 *out_p
++ = getBT1(next_info
);
440 *out_p
++ = getBT2(next_info
);
/* Three-byte output. */
443 while (out_stop
- out_p
< 3) { SUSPEND(transcode_obuf_full
, 11); }
444 *out_p
++ = getBT1(next_info
);
445 *out_p
++ = getBT2(next_info
);
446 *out_p
++ = getBT3(next_info
);
/* Four-byte output. */
449 while (out_stop
- out_p
< 4) { SUSPEND(transcode_obuf_full
, 12); }
450 *out_p
++ = getBT0(next_info
);
451 *out_p
++ = getBT1(next_info
);
452 *out_p
++ = getBT2(next_info
);
453 *out_p
++ = getBT3(next_info
);
/* Transcoder hook: info -> info. */
456 next_info
= (VALUE
)(*tr
->func_ii
)(tc
, next_info
);
/* Transcoder hook: source bytes -> info. */
460 const unsigned char *char_start
;
462 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
463 next_info
= (VALUE
)(*tr
->func_si
)(tc
, char_start
, (size_t)char_len
);
/* Transcoder hook: info -> output bytes (needs max_output bytes of room). */
467 while (out_stop
- out_p
< tr
->max_output
) { SUSPEND(transcode_obuf_full
, 13); }
468 out_p
+= (VALUE
)(*tr
->func_io
)(tc
, next_info
, out_p
);
/* Transcoder hook: source bytes -> output bytes. */
472 const unsigned char *char_start
;
474 while (out_stop
- out_p
< tr
->max_output
) { SUSPEND(transcode_obuf_full
, 14); }
475 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
476 out_p
+= (VALUE
)(*tr
->func_so
)(tc
, char_start
, (size_t)char_len
, out_p
);
/* Invalid input that does not yet span more than one input unit: wait for
 * (or take what exists of) the rest of the unit. */
480 if (tc
->recognized_len
+ (in_p
- inchar_start
) <= unitlen
) {
481 while ((opt
& PARTIAL_INPUT
) && tc
->recognized_len
+ (in_stop
- inchar_start
) < unitlen
) {
483 SUSPEND(transcode_ibuf_empty
, 8);
485 if (tc
->recognized_len
+ (in_stop
- inchar_start
) <= unitlen
) {
489 in_p
= inchar_start
+ (unitlen
- tc
->recognized_len
);
/* Otherwise discard whole input units and mark the remaining bytes to be
 * read again (readagain_len). */
493 int invalid_len
; /* including the last byte which causes invalid */
495 invalid_len
= tc
->recognized_len
+ (in_p
- inchar_start
);
496 discard_len
= ((invalid_len
- 1) / unitlen
) * unitlen
;
497 readagain_len
= invalid_len
- discard_len
;
506 SUSPEND(transcode_invalid_input
, 1);
510 SUSPEND(transcode_undefined_conversion
, 2);
/* Let the transcoder emit trailing output once max_output bytes of room
 * are available. */
515 if (tr
->finish_func
) {
516 while (out_stop
- out_p
< tr
->max_output
) {
517 SUSPEND(transcode_obuf_full
, 4);
519 out_p
+= tr
->finish_func(tc
, out_p
);
522 SUSPEND(transcode_finished
, 6);
/*
 * Wrapper around transcode_restartable0 that first replays bytes marked for
 * re-reading.
 *
 * When tc->readagain_len is non-zero, those bytes are copied from the read
 * buffer into a stack buffer (ALLOCA_N), readagain_len is reset, and the copy
 * is fed to transcode_restartable0 with PARTIAL_INPUT added to `opt`. If that
 * call ends with anything other than transcode_ibuf_empty, the bytes it did
 * not consume are put back into the read buffer and added to readagain_len.
 * Finally the caller's input is converted.
 *
 * NOTE(review): the return of `res` after putting bytes back and the
 * remaining parameters are not part of this extract.
 */
526 static rb_trans_result_t
527 transcode_restartable(const unsigned char **in_pos
, unsigned char **out_pos
,
528 const unsigned char *in_stop
, unsigned char *out_stop
,
532 if (tc
->readagain_len
) {
533 unsigned char *readagain_buf
= ALLOCA_N(unsigned char, tc
->readagain_len
);
534 const unsigned char *readagain_pos
= readagain_buf
;
535 const unsigned char *readagain_stop
= readagain_buf
+ tc
->readagain_len
;
536 rb_trans_result_t res
;
/* The bytes to re-read sit right after the recognized bytes. */
538 MEMCPY(readagain_buf
, TRANSCODING_READBUF(tc
) + tc
->recognized_len
,
539 unsigned char, tc
->readagain_len
);
540 tc
->readagain_len
= 0;
541 res
= transcode_restartable0(&readagain_pos
, out_pos
, readagain_stop
, out_stop
, tc
, opt
|PARTIAL_INPUT
);
/* Stopped early: keep the unconsumed replay bytes for the next call. */
542 if (res
!= transcode_ibuf_empty
) {
543 MEMCPY(TRANSCODING_READBUF(tc
) + tc
->recognized_len
+ tc
->readagain_len
,
544 readagain_pos
, unsigned char, readagain_stop
- readagain_pos
);
545 tc
->readagain_len
+= readagain_stop
- readagain_pos
;
549 return transcode_restartable0(in_pos
, out_pos
, in_stop
, out_stop
, tc
, opt
);
/*
 * Create the conversion state for a single registered step from `from` to
 * `to`: look up the table entry, load its transcoder, allocate an
 * rb_transcoding and reset its stateful area, resume position and byte
 * counters. When the transcoder's max_input exceeds the inline read buffer
 * (readbuf.ary), a heap buffer of max_input bytes is allocated instead.
 *
 * NOTE(review): the failure returns, the store of `tr` into the new state and
 * the final return are not part of this extract; `flags` is not used in the
 * visible lines.
 */
552 static rb_transcoding
*
553 rb_transcoding_open(const char *from
, const char *to
, int flags
)
556 const rb_transcoder
*tr
;
558 transcoder_entry_t
*entry
;
560 entry
= get_transcoder_entry(from
, to
);
564 tr
= load_transcoder(entry
);
568 tc
= ALLOC(rb_transcoding
);
571 memset(tc
->stateful
, 0, sizeof(tc
->stateful
));
572 tc
->resume_position
= 0;
573 tc
->recognized_len
= 0;
574 tc
->readagain_len
= 0;
/* rb_transcoding_close frees this buffer under the same condition. */
575 if (sizeof(tc
->readbuf
.ary
) < tr
->max_input
) {
576 tc
->readbuf
.ptr
= xmalloc(tr
->max_input
);
/*
 * Convert with a single transcoding: forwards the input and output ranges to
 * transcode_restartable and returns its result.
 * NOTE(review): the trailing parameters and arguments are not part of this
 * extract.
 */
581 static rb_trans_result_t
582 rb_transcoding_convert(rb_transcoding
*tc
,
583 const unsigned char **input_ptr
, const unsigned char *input_stop
,
584 unsigned char **output_ptr
, unsigned char *output_stop
,
587 return transcode_restartable(
588 input_ptr
, output_ptr
,
589 input_stop
, output_stop
,
/*
 * Release a transcoding created by rb_transcoding_open. The read buffer is
 * freed only when it was heap-allocated, i.e. under the same size condition
 * that rb_transcoding_open uses to allocate it.
 * NOTE(review): any further cleanup is not part of this extract.
 */
594 rb_transcoding_close(rb_transcoding
*tc
)
596 const rb_transcoder
*tr
= tc
->transcoder
;
597 if (sizeof(tc
->readbuf
.ary
) < tr
->max_input
)
598 xfree(tc
->readbuf
.ptr
);
/*
 * Callback handed to transcode_search_path by rb_trans_open; `arg` is the
 * address of the rb_trans_t pointer being built.
 *
 * It allocates an rb_trans_t with depth+1 elements and clears every element
 * (no transcoding, no buffer, last_result = transcode_ibuf_empty). It then
 * opens the transcoding for step `depth` and, for every step except the last
 * one, allocates an intermediate output buffer of `bufsize` bytes that starts
 * out empty.
 *
 * NOTE(review): the condition deciding when the rb_trans_t is allocated and
 * the definition of `bufsize` are not part of this extract.
 */
603 trans_open_i(const char *from
, const char *to
, int depth
, void *arg
)
605 rb_trans_t
**tsp
= (rb_trans_t
**)arg
;
610 ts
= *tsp
= ALLOC(rb_trans_t
);
611 ts
->num_trans
= depth
+1;
612 ts
->elems
= ALLOC_N(rb_trans_elem_t
, ts
->num_trans
);
613 ts
->num_finished
= 0;
614 for (i
= 0; i
< ts
->num_trans
; i
++) {
615 ts
->elems
[i
].tc
= NULL
;
616 ts
->elems
[i
].out_buf_start
= NULL
;
617 ts
->elems
[i
].out_data_start
= NULL
;
618 ts
->elems
[i
].out_data_end
= NULL
;
619 ts
->elems
[i
].out_buf_end
= NULL
;
620 ts
->elems
[i
].last_result
= transcode_ibuf_empty
;
627 ts
->elems
[depth
].tc
= rb_transcoding_open(from
, to
, 0);
/* Every step but the last writes into its own intermediate buffer. */
628 if (depth
< ts
->num_trans
-1) {
631 p
= xmalloc(bufsize
);
632 ts
->elems
[depth
].out_buf_start
= p
;
633 ts
->elems
[depth
].out_buf_end
= p
+ bufsize
;
634 ts
->elems
[depth
].out_data_start
= p
;
635 ts
->elems
[depth
].out_data_end
= p
;
/*
 * Build the chain of transcodings converting `from` into `to`: runs
 * transcode_search_path with trans_open_i as callback, which fills in `ts`.
 * NOTE(review): the handling of the search result and the return statement
 * are not part of this extract; `flags` is not used in the visible lines.
 */
640 rb_trans_open(const char *from
, const char *to
, int flags
)
642 rb_trans_t
*ts
= NULL
;
644 transcode_search_path(from
, to
, trans_open_i
, (void *)&ts
);
/*
 * Run the conversion steps of `ts` from index `start` up to the last one.
 *
 * For a step with a predecessor, the input is the predecessor's pending
 * output data (out_data_start .. out_data_end). A step that owns an
 * intermediate buffer first moves its pending data to the start of that
 * buffer and then appends at out_data_end up to out_buf_end. Each step's
 * result is stored in its last_result; a step reporting transcode_finished
 * advances ts->num_finished.
 *
 * NOTE(review): the input/output selection for the first and last step, the
 * flag computation, the progress loop and the return values are not part of
 * this extract. rb_trans_conv compares the return value against -1 and
 * against the index of the last step.
 */
653 trans_sweep(rb_trans_t
*ts
,
654 const unsigned char **input_ptr
, const unsigned char *input_stop
,
655 unsigned char **output_ptr
, unsigned char *output_stop
,
662 const unsigned char **ipp
, *is
, *iold
;
663 unsigned char **opp
, *os
, *oold
;
664 rb_trans_result_t res
;
669 for (i
= start
; i
< ts
->num_trans
; i
++) {
670 rb_trans_elem_t
*te
= &ts
->elems
[i
];
/* Input comes from the previous step's pending output. */
677 rb_trans_elem_t
*prev_te
= &ts
->elems
[i
-1];
678 ipp
= (const unsigned char **)&prev_te
->out_data_start
;
679 is
= prev_te
->out_data_end
;
682 if (!te
->out_buf_start
) {
/* Compact: move pending data to the front of the intermediate buffer. */
687 if (te
->out_buf_start
!= te
->out_data_start
) {
688 int len
= te
->out_data_end
- te
->out_data_start
;
689 int off
= te
->out_data_start
- te
->out_buf_start
;
690 MEMMOVE(te
->out_buf_start
, te
->out_data_start
, unsigned char, len
);
691 te
->out_data_start
= te
->out_buf_start
;
692 te
->out_data_end
-= off
;
694 opp
= &te
->out_data_end
;
695 os
= te
->out_buf_end
;
699 if (ts
->num_finished
!= i
)
703 te
->last_result
= res
= rb_transcoding_convert(te
->tc
, ipp
, is
, opp
, os
, f
);
704 if (iold
!= *ipp
|| oold
!= *opp
)
708 case transcode_invalid_input
:
709 case transcode_undefined_conversion
:
712 case transcode_obuf_full
:
713 case transcode_ibuf_empty
:
716 case transcode_finished
:
717 ts
->num_finished
= i
+1;
/*
 * Convert through the whole chain of steps in `ts`.
 *
 * A one-byte dummy buffer can stand in for the input and output ranges. The
 * steps are scanned from last to first for one whose last_result is
 * transcode_invalid_input or transcode_undefined_conversion. trans_sweep is
 * then called repeatedly, each time starting one step after the index it last
 * returned, until it returns -1 or the index of the last step. The result
 * returned is the last_result of one of the steps.
 *
 * NOTE(review): the conditions guarding the dummy-buffer substitution, the
 * body of the error scan and the conditions selecting which return statement
 * is taken are not part of this extract.
 */
725 static rb_trans_result_t
726 rb_trans_conv(rb_trans_t
*ts
,
727 const unsigned char **input_ptr
, const unsigned char *input_stop
,
728 unsigned char **output_ptr
, unsigned char *output_stop
,
732 int start
, err_index
, no_error
;
734 unsigned char empty_buf
;
735 unsigned char *empty_ptr
= &empty_buf
;
738 input_ptr
= (const unsigned char **)&empty_ptr
;
739 input_stop
= empty_ptr
;
743 output_ptr
= &empty_ptr
;
744 output_stop
= empty_ptr
;
/* Look for a step that stopped on an error, last step first. */
749 for (i
= ts
->num_trans
-1; 0 <= i
; i
--) {
750 if (ts
->elems
[i
].last_result
== transcode_invalid_input
||
751 ts
->elems
[i
].last_result
== transcode_undefined_conversion
) {
757 /* second last error */
765 start
= err_index
+ 1;
766 err_index
= trans_sweep(ts
, input_ptr
, input_stop
, output_ptr
, output_stop
, flags
, start
);
767 } while (err_index
!= -1 && err_index
!= ts
->num_trans
-1);
769 if (err_index
== ts
->num_trans
-1)
770 return ts
->elems
[ts
->num_trans
-1].last_result
;
772 return ts
->elems
[ts
->num_trans
-1].last_result
;
774 return ts
->elems
[start
-1].last_result
;
/*
 * Tear down a chain created by rb_trans_open: closes every step's
 * transcoding and frees each intermediate output buffer that was allocated.
 * NOTE(review): the release of ts->elems and of `ts` itself is not part of
 * this extract.
 */
778 rb_trans_close(rb_trans_t
*ts
)
782 for (i
= 0; i
< ts
->num_trans
; i
++) {
783 rb_transcoding_close(ts
->elems
[i
].tc
);
784 if (ts
->elems
[i
].out_buf_start
)
785 xfree(ts
->elems
[i
].out_buf_start
);
/*
 * NOTE(review): the line naming this function is not part of this extract;
 * the parameter list matches the calls to more_output_buffer() further down,
 * so this is presumably its definition.
 *
 * Grow the destination buffer. `len` is the number of bytes already written
 * (*out_pos - *out_start_ptr); the new size is twice the sum of `len` and the
 * max_output of the last step's transcoder. After resize_destination returns
 * the (possibly moved) buffer, the start, position and stop pointers are
 * recomputed relative to it.
 */
795 unsigned char *(*resize_destination
)(VALUE
, int, int),
797 unsigned char **out_start_ptr
,
798 unsigned char **out_pos
,
799 unsigned char **out_stop_ptr
)
801 size_t len
= (*out_pos
- *out_start_ptr
);
802 size_t new_len
= (len
+ ts
->elems
[ts
->num_trans
-1].tc
->transcoder
->max_output
) * 2;
803 *out_start_ptr
= resize_destination(destination
, len
, new_len
);
804 *out_pos
= *out_start_ptr
+ len
;
805 *out_stop_ptr
= *out_start_ptr
+ new_len
;
/*
 * Write the replacement character of the destination encoding at *out_pos.
 *
 * The destination encoding is found from the to_encoding name of the
 * transcoder in use. If that transcoder has a resetstate_func, room for
 * max_output bytes is ensured and the function's output is emitted first.
 * Room is ensured once more before the bytes obtained from
 * get_replacement_character are copied to *out_pos.
 *
 * NOTE(review): the assignment of `tr`, the remaining parameters and the
 * advance of *out_pos after the copy are not part of this extract.
 */
809 output_replacement_character(
811 unsigned char *(*resize_destination
)(VALUE
, int, int),
813 unsigned char **out_start_ptr
,
814 unsigned char **out_pos
,
815 unsigned char **out_stop_ptr
)
819 const rb_transcoder
*tr
;
822 const char *replacement
;
/* Work with the last step of the chain: it produces the final encoding. */
825 tc
= ts
->elems
[ts
->num_trans
-1].tc
;
827 max_output
= tr
->max_output
;
828 enc
= rb_enc_find(tr
->to_encoding
);
831 * Assumption for stateful encoding:
833 * - The replacement character can be output on resetted state and doesn't
835 * - it is acceptable that extra state changing sequence if the replacement
836 * character contains a state changing sequence.
838 * Currently the replacement character for stateful encoding such as
839 * ISO-2022-JP is "?" and it has no state changing sequence.
840 * So the extra state changing sequence don't occur.
842 * Thease assumption may be removed in future.
843 * It needs to scan the replacement character to check
844 * state changing sequences in the replacement character.
847 if (tr
->resetstate_func
) {
848 if (*out_stop_ptr
- *out_pos
< max_output
)
849 more_output_buffer(destination
, resize_destination
, ts
, out_start_ptr
, out_pos
, out_stop_ptr
);
850 *out_pos
+= tr
->resetstate_func(tc
, *out_pos
);
853 if (*out_stop_ptr
- *out_pos
< max_output
)
854 more_output_buffer(destination
, resize_destination
, ts
, out_start_ptr
, out_pos
, out_stop_ptr
);
856 replacement
= get_replacement_character(enc
, &len
);
858 memcpy(*out_pos
, replacement
, len
);
/*
 * Convert the whole input range from from_encoding to to_encoding, growing
 * the destination as needed.
 *
 * A conversion chain is opened with rb_trans_open; ArgumentError
 * ("transcoding not supported") is raised on the failure path. The result of
 * rb_trans_conv is then handled as follows:
 *   - transcode_invalid_input: ignored with INVALID_IGNORE, replaced through
 *     output_replacement_character with INVALID_REPLACE, otherwise
 *     TRANSCODE_ERROR is raised;
 *   - transcode_undefined_conversion: the same with UNDEF_IGNORE and
 *     UNDEF_REPLACE;
 *   - transcode_obuf_full: the output buffer is enlarged.
 *
 * NOTE(review): the retry jumps, the cleanup of `ts` and the remaining
 * parameters are not part of this extract.
 */
866 transcode_loop(const unsigned char **in_pos
, unsigned char **out_pos
,
867 const unsigned char *in_stop
, unsigned char *out_stop
,
869 unsigned char *(*resize_destination
)(VALUE
, int, int),
870 const char *from_encoding
,
871 const char *to_encoding
,
875 rb_trans_result_t ret
;
876 unsigned char *out_start
= *out_pos
;
879 ts
= rb_trans_open(from_encoding
, to_encoding
, 0);
881 rb_raise(rb_eArgError
, "transcoding not supported (from %s to %s)", from_encoding
, to_encoding
);
883 max_output
= ts
->elems
[ts
->num_trans
-1].tc
->transcoder
->max_output
;
886 ret
= rb_trans_conv(ts
, in_pos
, in_stop
, out_pos
, out_stop
, opt
);
887 if (ret
== transcode_invalid_input
) {
888 /* deal with invalid byte sequence */
889 /* todo: add more alternative behaviors */
890 if (opt
&INVALID_IGNORE
) {
893 else if (opt
&INVALID_REPLACE
) {
894 output_replacement_character(destination
, resize_destination
, ts
, &out_start
, out_pos
, &out_stop
);
898 rb_raise(TRANSCODE_ERROR
, "invalid byte sequence");
900 if (ret
== transcode_undefined_conversion
) {
901 /* valid character in from encoding
902 * but no related character(s) in to encoding */
903 /* todo: add more alternative behaviors */
904 if (opt
&UNDEF_IGNORE
) {
907 else if (opt
&UNDEF_REPLACE
) {
908 output_replacement_character(destination
, resize_destination
, ts
, &out_start
, out_pos
, &out_stop
);
912 rb_raise(TRANSCODE_ERROR
, "conversion undefined for byte sequence (maybe invalid byte sequence)");
914 if (ret
== transcode_obuf_full
) {
915 more_output_buffer(destination
, resize_destination
, ts
, &out_start
, out_pos
, &out_stop
);
923 /* sample transcode_loop implementation in byte-by-byte stream style */
/*
 * Alternative transcode_loop with the same parameters and the same handling
 * of invalid input, undefined conversions and a full output buffer as the
 * variant above. Input is handed to rb_trans_conv through a one-byte local
 * (input_byte) with PARTIAL_INPUT set, and rb_trans_conv is also called with
 * NULL input pointers, once with flags 0 and once with PARTIAL_INPUT. The
 * loop runs until the result is transcode_finished.
 *
 * NOTE(review): the preprocessor lines selecting between the two variants,
 * the code that fills input_byte and the branch conditions around the three
 * rb_trans_conv calls are not part of this extract.
 */
925 transcode_loop(const unsigned char **in_pos
, unsigned char **out_pos
,
926 const unsigned char *in_stop
, unsigned char *out_stop
,
928 unsigned char *(*resize_destination
)(VALUE
, int, int),
929 const char *from_encoding
,
930 const char *to_encoding
,
934 rb_trans_result_t ret
;
935 unsigned char *out_start
= *out_pos
;
936 const unsigned char *ptr
;
939 ts
= rb_trans_open(from_encoding
, to_encoding
, 0);
941 rb_raise(rb_eArgError
, "transcoding not supported (from %s to %s)", from_encoding
, to_encoding
);
943 max_output
= ts
->elems
[ts
->num_trans
-1].tc
->transcoder
->max_output
;
945 ret
= transcode_ibuf_empty
;
947 while (ret
!= transcode_finished
) {
948 unsigned char input_byte
;
949 const unsigned char *p
= &input_byte
;
951 if (ret
== transcode_ibuf_empty
) {
954 ret
= rb_trans_conv(ts
, &p
, p
+1, out_pos
, out_stop
, PARTIAL_INPUT
);
957 ret
= rb_trans_conv(ts
, NULL
, NULL
, out_pos
, out_stop
, 0);
961 ret
= rb_trans_conv(ts
, NULL
, NULL
, out_pos
, out_stop
, PARTIAL_INPUT
);
/* Advance the source position by however much of input_byte was consumed. */
963 if (&input_byte
!= p
)
964 ptr
+= p
- &input_byte
;
966 case transcode_invalid_input
:
967 /* deal with invalid byte sequence */
968 /* todo: add more alternative behaviors */
969 if (opt
&INVALID_IGNORE
) {
972 else if (opt
&INVALID_REPLACE
) {
973 output_replacement_character(destination
, resize_destination
, ts
, &out_start
, out_pos
, &out_stop
);
977 rb_raise(TRANSCODE_ERROR
, "invalid byte sequence");
980 case transcode_undefined_conversion
:
981 /* valid character in from encoding
982 * but no related character(s) in to encoding */
983 /* todo: add more alternative behaviors */
984 if (opt
&UNDEF_IGNORE
) {
987 else if (opt
&UNDEF_REPLACE
) {
988 output_replacement_character(destination
, resize_destination
, ts
, &out_start
, out_pos
, &out_stop
);
992 rb_raise(TRANSCODE_ERROR
, "conversion undefined for byte sequence (maybe invalid byte sequence)");
995 case transcode_obuf_full
:
996 more_output_buffer(destination
, resize_destination
, ts
, &out_start
, out_pos
, &out_stop
);
999 case transcode_ibuf_empty
:
1002 case transcode_finished
:
1014 * String-specific code
/*
 * resize_destination callback for String destinations: resizes `destination`
 * to new_len bytes with rb_str_resize and returns its (possibly relocated)
 * data pointer. `len` is not used.
 */
1017 static unsigned char *
1018 str_transcoding_resize(VALUE destination
, int len
, int new_len
)
1020 rb_str_resize(destination
, new_len
);
1021 return (unsigned char *)RSTRING_PTR(destination
);
/*
 * Shared implementation of String#encode and String#encode!.
 *
 * Steps visible in this extract:
 *   - A trailing argument convertible to Hash is taken as options. :invalid
 *     and :undef accept :ignore or :replace and set the matching option
 *     bits; other values raise ArgumentError.
 *   - One or two encoding arguments are accepted, otherwise ArgumentError is
 *     raised. argv[0] is the destination encoding; the source encoding is
 *     argv[1] or the string's own encoding. An encoding that
 *     rb_to_encoding_index does not know is used by its name as given.
 *   - Identical encodings, ASCII-compatible encodings with a 7-bit string,
 *     and equal encoding names are tested before any conversion is done.
 *   - Otherwise a temporary string of slen + 30 bytes is allocated and
 *     transcode_loop converts into it. ArgumentError is raised if input
 *     remains unconverted, and the result length is set to the bytes written.
 *   - rb_define_dummy_encoding(to_e) supplies an encoding index.
 *
 * The callers in this file treat a negative return value as "nothing to do"
 * and otherwise use it as the encoding index to associate with the result.
 *
 * NOTE(review): the branch bodies of the early tests, the store of the result
 * through `self` and the return statements are not part of this extract.
 */
1025 str_transcode(int argc
, VALUE
*argv
, VALUE
*self
)
1030 unsigned char *buf
, *bp
, *sp
;
1031 const unsigned char *fromp
;
1032 rb_encoding
*from_enc
, *to_enc
;
1033 const char *from_e
, *to_e
;
1034 int from_encidx
, to_encidx
;
1035 VALUE from_encval
, to_encval
;
/* Options Hash, if the last argument converts to one. */
1039 opt
= rb_check_convert_type(argv
[argc
-1], T_HASH
, "Hash", "to_hash");
1044 v
= rb_hash_aref(opt
, sym_invalid
);
1047 else if (v
==sym_ignore
) {
1048 options
|= INVALID_IGNORE
;
1050 else if (v
==sym_replace
) {
1051 options
|= INVALID_REPLACE
;
1052 v
= rb_hash_aref(opt
, sym_replace
);
1055 rb_raise(rb_eArgError
, "unknown value for invalid: setting");
1057 v
= rb_hash_aref(opt
, sym_undef
);
1060 else if (v
==sym_ignore
) {
1061 options
|= UNDEF_IGNORE
;
1063 else if (v
==sym_replace
) {
1064 options
|= UNDEF_REPLACE
;
1067 rb_raise(rb_eArgError
, "unknown value for undef: setting");
1070 if (argc
< 1 || argc
> 2) {
1071 rb_raise(rb_eArgError
, "wrong number of arguments (%d for 1..2)", argc
);
/* Destination encoding: known encoding index, or just a name. */
1073 if ((to_encidx
= rb_to_encoding_index(to_encval
= argv
[0])) < 0) {
1076 to_e
= StringValueCStr(to_encval
);
1079 to_enc
= rb_enc_from_index(to_encidx
);
1080 to_e
= rb_enc_name(to_enc
);
/* Source encoding: the string's own encoding, or argv[1]. */
1083 from_encidx
= rb_enc_get_index(str
);
1084 from_enc
= rb_enc_from_index(from_encidx
);
1085 from_e
= rb_enc_name(from_enc
);
1087 else if ((from_encidx
= rb_to_encoding_index(from_encval
= argv
[1])) < 0) {
1089 from_e
= StringValueCStr(from_encval
);
1092 from_enc
= rb_enc_from_index(from_encidx
);
1093 from_e
= rb_enc_name(from_enc
);
1096 if (from_enc
&& from_enc
== to_enc
) {
1099 if (from_enc
&& to_enc
&& rb_enc_asciicompat(from_enc
) && rb_enc_asciicompat(to_enc
)) {
1100 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
) {
1104 if (encoding_equal(from_e
, to_e
)) {
/* Convert into a temporary string; transcode_loop grows it on demand. */
1108 fromp
= sp
= (unsigned char *)RSTRING_PTR(str
);
1109 slen
= RSTRING_LEN(str
);
1110 blen
= slen
+ 30; /* len + margin */
1111 dest
= rb_str_tmp_new(blen
);
1112 bp
= (unsigned char *)RSTRING_PTR(dest
);
1114 transcode_loop(&fromp
, &bp
, (sp
+slen
), (bp
+blen
), dest
, str_transcoding_resize
, from_e
, to_e
, options
);
1115 if (fromp
!= sp
+slen
) {
1116 rb_raise(rb_eArgError
, "not fully converted, %"PRIdPTRDIFF
" bytes left", sp
+slen
-fromp
);
/* Re-read the data pointer: dest may have been resized during conversion. */
1118 buf
= (unsigned char *)RSTRING_PTR(dest
);
1120 rb_str_set_len(dest
, bp
- buf
);
1124 to_encidx
= rb_define_dummy_encoding(to_e
);
/*
 * Associate `str` with the encoding index `encidx` and set its code range.
 * For an ASCII-compatible encoding the code range is computed with
 * rb_str_coderange_scan_restartable; otherwise it is set to
 * ENC_CODERANGE_VALID.
 * NOTE(review): the declaration of `cr` and the return statement are not part
 * of this extract.
 */
1132 str_encode_associate(VALUE str
, int encidx
)
1136 rb_enc_associate_index(str
, encidx
);
1138 /* transcoded string never be broken. */
1139 if (rb_enc_asciicompat(rb_enc_from_index(encidx
))) {
1140 rb_str_coderange_scan_restartable(RSTRING_PTR(str
), RSTRING_END(str
), 0, &cr
);
1143 cr
= ENC_CODERANGE_VALID
;
1145 ENC_CODERANGE_SET(str
, cr
);
1151 * str.encode!(encoding [, options] ) => str
1152 * str.encode!(to_encoding, from_encoding [, options] ) => str
1154 * The first form transcodes the contents of <i>str</i> from
1155 * str.encoding to +encoding+.
1156 * The second form transcodes the contents of <i>str</i> from
1157 * from_encoding to to_encoding.
1158 * The options Hash gives details for conversion. See String#encode
1160 * Returns the string even if no changes were made.
/*
 * Implementation of String#encode!. Transcodes through str_transcode; a
 * negative result returns `str` unchanged. Otherwise the contents of `str`
 * are replaced with the new string and the resulting encoding index is
 * associated with it.
 * NOTE(review): the declaration of `newstr` is not part of this extract.
 */
1164 str_encode_bang(int argc
, VALUE
*argv
, VALUE str
)
1167 int encidx
= str_transcode(argc
, argv
, &newstr
);
1169 if (encidx
< 0) return str
;
1170 rb_str_shared_replace(str
, newstr
);
1171 return str_encode_associate(str
, encidx
);
1176 * str.encode(encoding [, options] ) => str
1177 * str.encode(to_encoding, from_encoding [, options] ) => str
1179 * The first form returns a copy of <i>str</i> transcoded
1180 * to encoding +encoding+.
1181 * The second form returns a copy of <i>str</i> transcoded
1182 * from from_encoding to to_encoding.
1183 * The options Hash gives details for conversion. Details
/*
 * Implementation of String#encode. Transcodes through str_transcode; a
 * negative result returns a duplicate of `str`. Otherwise the new string is
 * given the class of `str` and the resulting encoding index is associated
 * with it.
 * NOTE(review): the declaration of `newstr` is not part of this extract.
 */
1188 str_encode(int argc
, VALUE
*argv
, VALUE str
)
1191 int encidx
= str_transcode(argc
, argv
, &newstr
);
1193 if (encidx
< 0) return rb_str_dup(str
);
1194 RBASIC(newstr
)->klass
= rb_obj_class(str
);
1195 return str_encode_associate(newstr
, encidx
);
/* C-level convenience wrapper: equivalent to calling str_encode with the
 * single argument `to`. */
1199 rb_str_transcode(VALUE str
, VALUE to
)
1201 return str_encode(1, &to
, str
);
/*
 * Module initialization: creates the transcoder registry table, interns the
 * option symbols (:invalid, :undef, :ignore, :replace) and defines
 * String#encode and String#encode!, both taking a variable number of
 * arguments.
 */
1205 Init_transcode(void)
1207 transcoder_table
= st_init_strcasetable();
1209 sym_invalid
= ID2SYM(rb_intern("invalid"));
1210 sym_undef
= ID2SYM(rb_intern("undef"));
1211 sym_ignore
= ID2SYM(rb_intern("ignore"));
1212 sym_replace
= ID2SYM(rb_intern("replace"));
1214 rb_define_method(rb_cString
, "encode", str_encode
, -1);
1215 rb_define_method(rb_cString
, "encode!", str_encode_bang
, -1);