1 /**********************************************************************
6 created at: Tue Oct 30 16:10:22 JST 2007
8 Copyright (C) 2007 Martin Duerst
10 **********************************************************************/
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
15 #include "transcode_data.h"
18 VALUE rb_eConversionUndefined
;
19 VALUE rb_eInvalidByteSequence
;
21 VALUE rb_cEncodingConverter
;
23 static VALUE sym_invalid
, sym_undef
, sym_ignore
, sym_replace
;
/*
 * Option bits controlling how the transcoding loop reacts to problem input.
 * The INVALID_* bits are tested when a conversion step reports an invalid
 * byte sequence; the UNDEF_* bits are tested when it reports an undefined
 * conversion. "IGNORE" selects the ignore branch, "REPLACE" selects the
 * branch that emits a replacement character; if neither bit of a pair is
 * set, the loop raises.
 */
24 #define INVALID_IGNORE 0x1
25 #define INVALID_REPLACE 0x2
26 #define UNDEF_IGNORE 0x10
27 #define UNDEF_REPLACE 0x20
30 * Dispatch data and logic
36 const char *lib
; /* may be NULL, meaning that no library should be loaded. */
37 const rb_transcoder
*transcoder
;
40 static st_table
*transcoder_table
;
42 static transcoder_entry_t
*
43 make_transcoder_entry(const char *from
, const char *to
)
48 if (!st_lookup(transcoder_table
, (st_data_t
)from
, &val
)) {
49 val
= (st_data_t
)st_init_strcasetable();
50 st_add_direct(transcoder_table
, (st_data_t
)from
, val
);
52 table2
= (st_table
*)val
;
53 if (!st_lookup(table2
, (st_data_t
)to
, &val
)) {
54 transcoder_entry_t
*entry
= ALLOC(transcoder_entry_t
);
58 entry
->transcoder
= NULL
;
59 val
= (st_data_t
)entry
;
60 st_add_direct(table2
, (st_data_t
)to
, val
);
62 return (transcoder_entry_t
*)val
;
65 static transcoder_entry_t
*
66 get_transcoder_entry(const char *from
, const char *to
)
71 if (!st_lookup(transcoder_table
, (st_data_t
)from
, &val
)) {
74 table2
= (st_table
*)val
;
75 if (!st_lookup(table2
, (st_data_t
)to
, &val
)) {
78 return (transcoder_entry_t
*)val
;
82 rb_register_transcoder(const rb_transcoder
*tr
)
84 const char *const from_e
= tr
->from_encoding
;
85 const char *const to_e
= tr
->to_encoding
;
87 transcoder_entry_t
*entry
;
89 entry
= make_transcoder_entry(from_e
, to_e
);
90 if (entry
->transcoder
) {
91 rb_raise(rb_eArgError
, "transcoder from %s to %s has been already registered",
95 entry
->transcoder
= tr
;
99 declare_transcoder(const char *to
, const char *from
, const char *lib
)
101 transcoder_entry_t
*entry
;
103 entry
= make_transcoder_entry(from
, to
);
/*
 * Upper bound on the length of a transcoder library name. Names longer
 * than this are rejected with ArgumentError at declaration time, and the
 * path buffer used when loading a transcoder is sized from this value plus
 * the prefix below.
 */
107 #define MAX_TRANSCODER_LIBNAME_LEN 64
/* Directory prefix placed in front of the library name to build the path that is require'd. */
108 static const char transcoder_lib_prefix
[] = "enc/trans/";
111 rb_declare_transcoder(const char *enc1
, const char *enc2
, const char *lib
)
113 if (!lib
|| strlen(lib
) > MAX_TRANSCODER_LIBNAME_LEN
) {
114 rb_raise(rb_eArgError
, "invalid library name - %s",
115 lib
? lib
: "(null)");
117 declare_transcoder(enc1
, enc2
, lib
);
118 declare_transcoder(enc2
, enc1
, lib
);
/* True when the two encoding names are equal, compared case-insensitively via STRCASECMP. */
121 #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
123 typedef struct search_path_queue_tag
{
124 struct search_path_queue_tag
*next
;
126 } search_path_queue_t
;
130 search_path_queue_t
*queue
;
131 search_path_queue_t
**queue_last_ptr
;
132 const char *base_enc
;
136 transcode_search_path_i(st_data_t key
, st_data_t val
, st_data_t arg
)
138 const char *to
= (const char *)key
;
139 search_path_bfs_t
*bfs
= (search_path_bfs_t
*)arg
;
140 search_path_queue_t
*q
;
142 if (st_lookup(bfs
->visited
, (st_data_t
)to
, &val
)) {
146 q
= ALLOC(search_path_queue_t
);
149 *bfs
->queue_last_ptr
= q
;
150 bfs
->queue_last_ptr
= &q
->next
;
152 st_add_direct(bfs
->visited
, (st_data_t
)to
, (st_data_t
)bfs
->base_enc
);
157 transcode_search_path(const char *from
, const char *to
,
158 void (*callback
)(const char *from
, const char *to
, int depth
, void *arg
),
161 search_path_bfs_t bfs
;
162 search_path_queue_t
*q
;
168 q
= ALLOC(search_path_queue_t
);
171 bfs
.queue_last_ptr
= &q
->next
;
174 bfs
.visited
= st_init_strcasetable();
175 st_add_direct(bfs
.visited
, (st_data_t
)from
, (st_data_t
)NULL
);
181 bfs
.queue_last_ptr
= &bfs
.queue
;
183 if (!st_lookup(transcoder_table
, (st_data_t
)q
->enc
, &val
)) {
187 table2
= (st_table
*)val
;
189 if (st_lookup(table2
, (st_data_t
)to
, &val
)) {
190 st_add_direct(bfs
.visited
, (st_data_t
)to
, (st_data_t
)q
->enc
);
196 bfs
.base_enc
= q
->enc
;
197 st_foreach(table2
, transcode_search_path_i
, (st_data_t
)&bfs
);
212 const char *enc
= to
;
216 st_lookup(bfs
.visited
, (st_data_t
)enc
, &val
);
220 enc
= (const char *)val
;
225 st_lookup(bfs
.visited
, (st_data_t
)enc
, &val
);
228 callback((const char *)val
, enc
, --depth
, arg
);
229 enc
= (const char *)val
;
233 st_free_table(bfs
.visited
);
241 static const rb_transcoder
*
242 load_transcoder_entry(transcoder_entry_t
*entry
)
244 if (entry
->transcoder
)
245 return entry
->transcoder
;
248 const char *lib
= entry
->lib
;
249 int len
= strlen(lib
);
250 char path
[sizeof(transcoder_lib_prefix
) + MAX_TRANSCODER_LIBNAME_LEN
];
254 if (len
> MAX_TRANSCODER_LIBNAME_LEN
)
256 memcpy(path
, transcoder_lib_prefix
, sizeof(transcoder_lib_prefix
) - 1);
257 memcpy(path
+ sizeof(transcoder_lib_prefix
) - 1, lib
, len
+ 1);
258 if (!rb_require(path
))
262 if (entry
->transcoder
)
263 return entry
->transcoder
;
269 get_replacement_character(rb_encoding
*enc
, int *len_ret
)
271 static rb_encoding
*utf16be_encoding
, *utf16le_encoding
;
272 static rb_encoding
*utf32be_encoding
, *utf32le_encoding
;
273 if (!utf16be_encoding
) {
274 utf16be_encoding
= rb_enc_find("UTF-16BE");
275 utf16le_encoding
= rb_enc_find("UTF-16LE");
276 utf32be_encoding
= rb_enc_find("UTF-32BE");
277 utf32le_encoding
= rb_enc_find("UTF-32LE");
279 if (rb_utf8_encoding() == enc
) {
281 return "\xEF\xBF\xBD";
283 else if (utf16be_encoding
== enc
) {
287 else if (utf16le_encoding
== enc
) {
291 else if (utf32be_encoding
== enc
) {
293 return "\x00\x00\xFF\xFD";
295 else if (utf32le_encoding
== enc
) {
297 return "\xFD\xFF\x00\x00";
306 * Transcoding engine logic
309 static const unsigned char *
310 transcode_char_start(rb_transcoding
*tc
,
311 const unsigned char *in_start
,
312 const unsigned char *inchar_start
,
313 const unsigned char *in_p
,
314 size_t *char_len_ptr
)
316 const unsigned char *ptr
;
317 if (inchar_start
- in_start
< tc
->recognized_len
) {
318 MEMCPY(TRANSCODING_READBUF(tc
) + tc
->recognized_len
,
319 inchar_start
, unsigned char, in_p
- inchar_start
);
320 ptr
= TRANSCODING_READBUF(tc
);
323 ptr
= inchar_start
- tc
->recognized_len
;
325 *char_len_ptr
= tc
->recognized_len
+ (in_p
- inchar_start
);
329 static rb_econv_result_t
330 transcode_restartable0(const unsigned char **in_pos
, unsigned char **out_pos
,
331 const unsigned char *in_stop
, unsigned char *out_stop
,
336 const rb_transcoder
*tr
= tc
->transcoder
;
337 int unitlen
= tr
->input_unit_length
;
338 int readagain_len
= 0;
340 const unsigned char *inchar_start
;
341 const unsigned char *in_p
;
343 unsigned char *out_p
;
345 unsigned char empty_buf
;
346 unsigned char *empty_ptr
= &empty_buf
;
349 in_pos
= (const unsigned char **)&empty_ptr
;
354 out_pos
= &empty_ptr
;
355 out_stop
= empty_ptr
;
358 in_p
= inchar_start
= *in_pos
;
362 #define SUSPEND(ret, num) \
364 tc->resume_position = (num); \
365 if (0 < in_p - inchar_start) \
366 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
367 inchar_start, unsigned char, in_p - inchar_start); \
370 tc->recognized_len += in_p - inchar_start; \
371 if (readagain_len) { \
372 tc->recognized_len -= readagain_len; \
373 tc->readagain_len = readagain_len; \
376 resume_label ## num:; \
378 #define SUSPEND_OBUF(num) \
380 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
383 #define SUSPEND_OUTPUT_FOLLOWED_BY_INPUT(num) \
384 if ((opt & ECONV_OUTPUT_FOLLOWED_BY_INPUT) && *out_pos != out_p) { \
385 SUSPEND(econv_output_followed_by_input, num); \
388 #define next_table (tc->next_table)
389 #define next_info (tc->next_info)
390 #define next_byte (tc->next_byte)
391 #define writebuf_len (tc->writebuf_len)
392 #define writebuf_off (tc->writebuf_off)
394 switch (tc
->resume_position
) {
396 case 1: goto resume_label1
;
397 case 2: goto resume_label2
;
398 case 3: goto resume_label3
;
399 case 4: goto resume_label4
;
400 case 5: goto resume_label5
;
401 case 6: goto resume_label6
;
402 case 7: goto resume_label7
;
403 case 8: goto resume_label8
;
404 case 9: goto resume_label9
;
405 case 10: goto resume_label10
;
406 case 11: goto resume_label11
;
407 case 12: goto resume_label12
;
408 case 13: goto resume_label13
;
409 case 14: goto resume_label14
;
410 case 15: goto resume_label15
;
411 case 16: goto resume_label16
;
412 case 17: goto resume_label17
;
413 case 18: goto resume_label18
;
414 case 19: goto resume_label19
;
415 case 20: goto resume_label20
;
416 case 21: goto resume_label21
;
417 case 22: goto resume_label22
;
418 case 23: goto resume_label23
;
419 case 24: goto resume_label24
;
420 case 25: goto resume_label25
;
421 case 26: goto resume_label26
;
425 SUSPEND_OUTPUT_FOLLOWED_BY_INPUT(24);
426 if (in_stop
<= in_p
) {
427 if (!(opt
& ECONV_PARTIAL_INPUT
))
429 SUSPEND(econv_source_buffer_empty
, 7);
433 tc
->recognized_len
= 0;
435 next_table
= tr
->conv_tree_start
;
436 next_byte
= (unsigned char)*in_p
++;
438 if (next_byte
< next_table
->base
[0] || next_table
->base
[1] < next_byte
)
441 unsigned int next_offset
= next_table
->base
[2+next_byte
-next_table
->base
[0]];
442 next_info
= (VALUE
)next_table
->info
[next_offset
];
445 switch (next_info
& 0x1F) {
446 case NOMAP
: /* xxx: copy last byte only? */
447 SUSPEND_OBUF(3); *out_p
++ = next_byte
;
449 case 0x00: case 0x04: case 0x08: case 0x0C:
450 case 0x10: case 0x14: case 0x18: case 0x1C:
451 SUSPEND_OUTPUT_FOLLOWED_BY_INPUT(25);
452 while (in_p
>= in_stop
) {
453 if (!(opt
& ECONV_PARTIAL_INPUT
))
455 SUSPEND(econv_source_buffer_empty
, 5);
457 next_byte
= (unsigned char)*in_p
++;
458 next_table
= (const BYTE_LOOKUP
*)next_info
;
460 case ZERObt
: /* drop input */
463 SUSPEND_OBUF(9); *out_p
++ = getBT1(next_info
);
466 SUSPEND_OBUF(10); *out_p
++ = getBT1(next_info
);
467 SUSPEND_OBUF(21); *out_p
++ = getBT2(next_info
);
470 SUSPEND_OBUF(11); *out_p
++ = getBT1(next_info
);
471 SUSPEND_OBUF(15); *out_p
++ = getBT2(next_info
);
472 SUSPEND_OBUF(16); *out_p
++ = getBT3(next_info
);
475 SUSPEND_OBUF(12); *out_p
++ = getBT0(next_info
);
476 SUSPEND_OBUF(17); *out_p
++ = getBT1(next_info
);
477 SUSPEND_OBUF(18); *out_p
++ = getBT2(next_info
);
478 SUSPEND_OBUF(19); *out_p
++ = getBT3(next_info
);
481 next_info
= (VALUE
)(*tr
->func_ii
)(tc
, next_info
);
485 const unsigned char *char_start
;
487 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
488 next_info
= (VALUE
)(*tr
->func_si
)(tc
, char_start
, (size_t)char_len
);
493 if (tr
->max_output
<= out_stop
- out_p
)
494 out_p
+= (VALUE
)(*tr
->func_io
)(tc
, next_info
, out_p
);
496 writebuf_len
= (VALUE
)(*tr
->func_io
)(tc
, next_info
, TRANSCODING_WRITEBUF(tc
));
498 while (writebuf_off
< writebuf_len
) {
500 *out_p
++ = TRANSCODING_WRITEBUF(tc
)[writebuf_off
++];
506 const unsigned char *char_start
;
509 if (tr
->max_output
<= out_stop
- out_p
) {
510 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
511 out_p
+= (VALUE
)(*tr
->func_so
)(tc
, char_start
, (size_t)char_len
, out_p
);
514 char_start
= transcode_char_start(tc
, *in_pos
, inchar_start
, in_p
, &char_len
);
515 writebuf_len
= (VALUE
)(*tr
->func_so
)(tc
, char_start
, (size_t)char_len
, TRANSCODING_WRITEBUF(tc
));
517 while (writebuf_off
< writebuf_len
) {
519 *out_p
++ = TRANSCODING_WRITEBUF(tc
)[writebuf_off
++];
525 if (tc
->recognized_len
+ (in_p
- inchar_start
) <= unitlen
) {
526 if (tc
->recognized_len
+ (in_p
- inchar_start
) < unitlen
)
527 SUSPEND_OUTPUT_FOLLOWED_BY_INPUT(26);
528 while ((opt
& ECONV_PARTIAL_INPUT
) && tc
->recognized_len
+ (in_stop
- inchar_start
) < unitlen
) {
530 SUSPEND(econv_source_buffer_empty
, 8);
532 if (tc
->recognized_len
+ (in_stop
- inchar_start
) <= unitlen
) {
536 in_p
= inchar_start
+ (unitlen
- tc
->recognized_len
);
540 int invalid_len
; /* including the last byte, which makes the sequence invalid */
542 invalid_len
= tc
->recognized_len
+ (in_p
- inchar_start
);
543 discard_len
= ((invalid_len
- 1) / unitlen
) * unitlen
;
544 readagain_len
= invalid_len
- discard_len
;
553 SUSPEND(econv_invalid_byte_sequence
, 1);
557 SUSPEND(econv_undefined_conversion
, 2);
562 if (tr
->finish_func
) {
564 if (tr
->max_output
<= out_stop
- out_p
) {
565 out_p
+= tr
->finish_func(tc
, out_p
);
568 writebuf_len
= tr
->finish_func(tc
, TRANSCODING_WRITEBUF(tc
));
570 while (writebuf_off
< writebuf_len
) {
572 *out_p
++ = TRANSCODING_WRITEBUF(tc
)[writebuf_off
++];
577 SUSPEND(econv_finished
, 6);
586 static rb_econv_result_t
587 transcode_restartable(const unsigned char **in_pos
, unsigned char **out_pos
,
588 const unsigned char *in_stop
, unsigned char *out_stop
,
592 if (tc
->readagain_len
) {
593 unsigned char *readagain_buf
= ALLOCA_N(unsigned char, tc
->readagain_len
);
594 const unsigned char *readagain_pos
= readagain_buf
;
595 const unsigned char *readagain_stop
= readagain_buf
+ tc
->readagain_len
;
596 rb_econv_result_t res
;
598 MEMCPY(readagain_buf
, TRANSCODING_READBUF(tc
) + tc
->recognized_len
,
599 unsigned char, tc
->readagain_len
);
600 tc
->readagain_len
= 0;
601 res
= transcode_restartable0(&readagain_pos
, out_pos
, readagain_stop
, out_stop
, tc
, opt
|ECONV_PARTIAL_INPUT
);
602 if (res
!= econv_source_buffer_empty
) {
603 MEMCPY(TRANSCODING_READBUF(tc
) + tc
->recognized_len
+ tc
->readagain_len
,
604 readagain_pos
, unsigned char, readagain_stop
- readagain_pos
);
605 tc
->readagain_len
+= readagain_stop
- readagain_pos
;
609 return transcode_restartable0(in_pos
, out_pos
, in_stop
, out_stop
, tc
, opt
);
612 static rb_transcoding
*
613 rb_transcoding_open_by_transcoder(const rb_transcoder
*tr
, int flags
)
617 tc
= ALLOC(rb_transcoding
);
620 memset(tc
->stateful
, 0, sizeof(tc
->stateful
));
621 tc
->resume_position
= 0;
622 tc
->recognized_len
= 0;
623 tc
->readagain_len
= 0;
624 tc
->writebuf_len
= 0;
625 tc
->writebuf_off
= 0;
626 if (sizeof(tc
->readbuf
.ary
) < tr
->max_input
) {
627 tc
->readbuf
.ptr
= xmalloc(tr
->max_input
);
629 if (sizeof(tc
->writebuf
.ary
) < tr
->max_output
) {
630 tc
->writebuf
.ptr
= xmalloc(tr
->max_output
);
635 static rb_econv_result_t
636 rb_transcoding_convert(rb_transcoding
*tc
,
637 const unsigned char **input_ptr
, const unsigned char *input_stop
,
638 unsigned char **output_ptr
, unsigned char *output_stop
,
641 return transcode_restartable(
642 input_ptr
, output_ptr
,
643 input_stop
, output_stop
,
648 rb_transcoding_close(rb_transcoding
*tc
)
650 const rb_transcoder
*tr
= tc
->transcoder
;
651 if (sizeof(tc
->readbuf
.ary
) < tr
->max_input
)
652 xfree(tc
->readbuf
.ptr
);
653 if (sizeof(tc
->writebuf
.ary
) < tr
->max_output
)
654 xfree(tc
->writebuf
.ptr
);
659 rb_econv_open_by_transcoder_entries(int n
, transcoder_entry_t
**entries
)
664 for (i
= 0; i
< n
; i
++) {
665 const rb_transcoder
*tr
;
666 tr
= load_transcoder_entry(entries
[i
]);
671 ec
= ALLOC(rb_econv_t
);
673 ec
->elems
= ALLOC_N(rb_econv_elem_t
, ec
->num_trans
);
674 ec
->num_finished
= 0;
676 ec
->source_encoding
= NULL
;
677 ec
->destination_encoding
= NULL
;
678 for (i
= 0; i
< ec
->num_trans
; i
++) {
679 const rb_transcoder
*tr
= load_transcoder_entry(entries
[i
]);
680 ec
->elems
[i
].from
= tr
->from_encoding
;
681 ec
->elems
[i
].to
= tr
->to_encoding
;
682 ec
->elems
[i
].tc
= rb_transcoding_open_by_transcoder(tr
, 0);
683 ec
->elems
[i
].out_buf_start
= NULL
;
684 ec
->elems
[i
].out_data_start
= NULL
;
685 ec
->elems
[i
].out_data_end
= NULL
;
686 ec
->elems
[i
].out_buf_end
= NULL
;
687 ec
->elems
[i
].last_result
= econv_source_buffer_empty
;
689 ec
->last_tc
= ec
->elems
[ec
->num_trans
-1].tc
;
691 for (i
= 0; i
< ec
->num_trans
-1; i
++) {
694 p
= xmalloc(bufsize
);
695 ec
->elems
[i
].out_buf_start
= p
;
696 ec
->elems
[i
].out_buf_end
= p
+ bufsize
;
697 ec
->elems
[i
].out_data_start
= p
;
698 ec
->elems
[i
].out_data_end
= p
;
705 trans_open_i(const char *from
, const char *to
, int depth
, void *arg
)
707 transcoder_entry_t
***entries_ptr
= arg
;
708 transcoder_entry_t
**entries
;
711 entries
= ALLOC_N(transcoder_entry_t
*, depth
+1+2);
712 *entries_ptr
= entries
;
715 entries
= *entries_ptr
;
717 entries
[depth
] = get_transcoder_entry(from
, to
);
721 rb_econv_open(const char *from
, const char *to
, int flags
)
723 transcoder_entry_t
**entries
= NULL
;
725 static rb_econv_t
*ec
;
727 num_trans
= transcode_search_path(from
, to
, trans_open_i
, (void *)&entries
);
729 if (num_trans
< 0 || !entries
)
732 if (flags
& (ECONV_CRLF_NEWLINE_ENCODER
|ECONV_CR_NEWLINE_ENCODER
)) {
733 const char *name
= (flags
& ECONV_CRLF_NEWLINE_ENCODER
) ? "crlf_newline" : "cr_newline";
734 transcoder_entry_t
*e
= get_transcoder_entry("", name
);
737 MEMMOVE(entries
+1, entries
, transcoder_entry_t
*, num_trans
);
742 if (flags
& ECONV_UNIVERSAL_NEWLINE_DECODER
) {
743 transcoder_entry_t
*e
= get_transcoder_entry("universal_newline", "");
746 entries
[num_trans
++] = e
;
749 ec
= rb_econv_open_by_transcoder_entries(num_trans
, entries
);
751 if (flags
& ECONV_UNIVERSAL_NEWLINE_DECODER
) {
752 ec
->last_tc
= ec
->elems
[ec
->num_trans
-2].tc
;
759 trans_sweep(rb_econv_t
*ec
,
760 const unsigned char **input_ptr
, const unsigned char *input_stop
,
761 unsigned char **output_ptr
, unsigned char *output_stop
,
768 const unsigned char **ipp
, *is
, *iold
;
769 unsigned char **opp
, *os
, *oold
;
770 rb_econv_result_t res
;
775 for (i
= start
; i
< ec
->num_trans
; i
++) {
776 rb_econv_elem_t
*te
= &ec
->elems
[i
];
783 rb_econv_elem_t
*prev_te
= &ec
->elems
[i
-1];
784 ipp
= (const unsigned char **)&prev_te
->out_data_start
;
785 is
= prev_te
->out_data_end
;
788 if (!te
->out_buf_start
) {
793 if (te
->out_buf_start
!= te
->out_data_start
) {
794 int len
= te
->out_data_end
- te
->out_data_start
;
795 int off
= te
->out_data_start
- te
->out_buf_start
;
796 MEMMOVE(te
->out_buf_start
, te
->out_data_start
, unsigned char, len
);
797 te
->out_data_start
= te
->out_buf_start
;
798 te
->out_data_end
-= off
;
800 opp
= &te
->out_data_end
;
801 os
= te
->out_buf_end
;
805 if (ec
->num_finished
!= i
)
806 f
|= ECONV_PARTIAL_INPUT
;
807 if (i
== 0 && (flags
& ECONV_OUTPUT_FOLLOWED_BY_INPUT
)) {
809 flags
&= ~ECONV_OUTPUT_FOLLOWED_BY_INPUT
;
812 f
&= ~ECONV_OUTPUT_FOLLOWED_BY_INPUT
;
815 te
->last_result
= res
= rb_transcoding_convert(te
->tc
, ipp
, is
, opp
, os
, f
);
816 if (iold
!= *ipp
|| oold
!= *opp
)
820 case econv_invalid_byte_sequence
:
821 case econv_undefined_conversion
:
822 case econv_output_followed_by_input
:
825 case econv_destination_buffer_full
:
826 case econv_source_buffer_empty
:
830 ec
->num_finished
= i
+1;
838 static rb_econv_result_t
839 rb_trans_conv(rb_econv_t
*ec
,
840 const unsigned char **input_ptr
, const unsigned char *input_stop
,
841 unsigned char **output_ptr
, unsigned char *output_stop
,
845 int needreport_index
;
848 unsigned char empty_buf
;
849 unsigned char *empty_ptr
= &empty_buf
;
852 input_ptr
= (const unsigned char **)&empty_ptr
;
853 input_stop
= empty_ptr
;
857 output_ptr
= &empty_ptr
;
858 output_stop
= empty_ptr
;
861 if (ec
->elems
[0].last_result
== econv_output_followed_by_input
)
862 ec
->elems
[0].last_result
= econv_source_buffer_empty
;
864 needreport_index
= -1;
865 for (i
= ec
->num_trans
-1; 0 <= i
; i
--) {
866 switch (ec
->elems
[i
].last_result
) {
867 case econv_invalid_byte_sequence
:
868 case econv_undefined_conversion
:
869 case econv_output_followed_by_input
:
872 needreport_index
= i
;
873 goto found_needreport
;
875 case econv_destination_buffer_full
:
876 case econv_source_buffer_empty
:
880 rb_bug("unexpected transcode last result");
884 /* /^[io]+$/ is confirmed. but actually /^i*o*$/. */
886 if (ec
->elems
[ec
->num_trans
-1].last_result
== econv_destination_buffer_full
&&
887 (flags
& ECONV_OUTPUT_FOLLOWED_BY_INPUT
)) {
888 rb_econv_result_t res
;
890 res
= rb_trans_conv(ec
, NULL
, NULL
, output_ptr
, output_stop
,
891 (flags
& ~ECONV_OUTPUT_FOLLOWED_BY_INPUT
)|ECONV_PARTIAL_INPUT
);
893 if (res
== econv_source_buffer_empty
)
894 return econv_output_followed_by_input
;
903 needreport_index
= trans_sweep(ec
, input_ptr
, input_stop
, output_ptr
, output_stop
, flags
, sweep_start
);
904 sweep_start
= needreport_index
+ 1;
905 } while (needreport_index
!= -1 && needreport_index
!= ec
->num_trans
-1);
907 for (i
= ec
->num_trans
-1; 0 <= i
; i
--) {
908 if (ec
->elems
[i
].last_result
!= econv_source_buffer_empty
) {
909 rb_econv_result_t res
= ec
->elems
[i
].last_result
;
910 if (res
== econv_invalid_byte_sequence
||
911 res
== econv_undefined_conversion
||
912 res
== econv_output_followed_by_input
) {
913 ec
->elems
[i
].last_result
= econv_source_buffer_empty
;
918 return econv_source_buffer_empty
;
922 rb_econv_convert(rb_econv_t
*ec
,
923 const unsigned char **input_ptr
, const unsigned char *input_stop
,
924 unsigned char **output_ptr
, unsigned char *output_stop
,
927 rb_econv_result_t res
;
929 if ((flags
& ECONV_OUTPUT_FOLLOWED_BY_INPUT
) ||
931 return rb_trans_conv(ec
, input_ptr
, input_stop
, output_ptr
, output_stop
, flags
);
933 flags
|= ECONV_OUTPUT_FOLLOWED_BY_INPUT
;
935 res
= rb_trans_conv(ec
, input_ptr
, input_stop
, output_ptr
, output_stop
, flags
);
936 } while (res
== econv_output_followed_by_input
);
941 rb_econv_output(rb_econv_t
*ec
,
942 const unsigned char *str
, size_t len
, /* string in destination encoding */
943 unsigned char **destination_buffer_ptr
, unsigned char *destination_buffer_end
,
944 size_t *required_size
)
946 size_t reset_len
, total_len
;
947 rb_transcoding
*tc
= ec
->last_tc
;
948 const rb_transcoder
*tr
= tc
->transcoder
;
951 * Assumptions for stateful encodings:
953 * - str can be output in the reset state and doesn't change the state.
954 * - it is acceptable to emit an extra state-changing sequence if str
955 * contains a state-changing sequence.
957 * Currently the replacement character for stateful encodings such as
958 * ISO-2022-JP is "?", which has no state-changing sequence.
959 * So the extra state-changing sequence doesn't occur when
960 * rb_econv_output is used for replacement characters.
962 * These assumptions may be removed in the future.
963 * Doing so requires scanning str to check for state-changing sequences in it.
967 if (tr
->resetsize_func
) {
968 reset_len
= tr
->resetsize_func(tc
);
971 total_len
= reset_len
+ len
;
976 *required_size
= total_len
;
979 if (destination_buffer_end
- *destination_buffer_ptr
< total_len
)
983 *destination_buffer_ptr
+= tr
->resetstate_func(tc
, *destination_buffer_ptr
);
986 memcpy(*destination_buffer_ptr
, str
, len
);
987 *destination_buffer_ptr
+= len
;
993 rb_econv_close(rb_econv_t
*ec
)
997 for (i
= 0; i
< ec
->num_trans
; i
++) {
998 rb_transcoding_close(ec
->elems
[i
].tc
);
999 if (ec
->elems
[i
].out_buf_start
)
1000 xfree(ec
->elems
[i
].out_buf_start
);
1010 unsigned char *(*resize_destination
)(VALUE
, int, int),
1012 unsigned char **out_start_ptr
,
1013 unsigned char **out_pos
,
1014 unsigned char **out_stop_ptr
)
1016 size_t len
= (*out_pos
- *out_start_ptr
);
1017 size_t new_len
= (len
+ max_output
) * 2;
1018 *out_start_ptr
= resize_destination(destination
, len
, new_len
);
1019 *out_pos
= *out_start_ptr
+ len
;
1020 *out_stop_ptr
= *out_start_ptr
+ new_len
;
1024 output_replacement_character(
1026 unsigned char *(*resize_destination
)(VALUE
, int, int),
1028 unsigned char **out_start_ptr
,
1029 unsigned char **out_pos
,
1030 unsigned char **out_stop_ptr
)
1033 rb_transcoding
*tc
= ec
->last_tc
;
1034 const rb_transcoder
*tr
;
1036 const unsigned char *replacement
;
1038 size_t required_size
;
1040 tr
= tc
->transcoder
;
1041 enc
= rb_enc_find(tr
->to_encoding
);
1043 replacement
= (const unsigned char *)get_replacement_character(enc
, &len
);
1045 if (rb_econv_output(ec
, replacement
, len
, out_pos
, *out_stop_ptr
, &required_size
) == 0)
1048 if (required_size
< len
)
1049 return -1; /* overflow */
1051 more_output_buffer(destination
, resize_destination
, required_size
, out_start_ptr
, out_pos
, out_stop_ptr
);
1053 if (rb_econv_output(ec
, replacement
, len
, out_pos
, *out_stop_ptr
, &required_size
) == 0)
1061 transcode_loop(const unsigned char **in_pos
, unsigned char **out_pos
,
1062 const unsigned char *in_stop
, unsigned char *out_stop
,
1064 unsigned char *(*resize_destination
)(VALUE
, int, int),
1065 const char *from_encoding
,
1066 const char *to_encoding
,
1070 rb_transcoding
*last_tc
;
1071 rb_econv_result_t ret
;
1072 unsigned char *out_start
= *out_pos
;
1075 ec
= rb_econv_open(from_encoding
, to_encoding
, 0);
1077 rb_raise(rb_eArgError
, "transcoding not supported (from %s to %s)", from_encoding
, to_encoding
);
1079 last_tc
= ec
->last_tc
;
1080 max_output
= last_tc
->transcoder
->max_output
;
1083 ret
= rb_econv_convert(ec
, in_pos
, in_stop
, out_pos
, out_stop
, opt
);
1084 if (ret
== econv_invalid_byte_sequence
) {
1085 /* deal with invalid byte sequence */
1086 /* todo: add more alternative behaviors */
1087 if (opt
&INVALID_IGNORE
) {
1090 else if (opt
&INVALID_REPLACE
) {
1091 if (output_replacement_character(destination
, resize_destination
, ec
, &out_start
, out_pos
, &out_stop
) == 0)
1095 rb_raise(rb_eInvalidByteSequence
, "invalid byte sequence");
1097 if (ret
== econv_undefined_conversion
) {
1098 /* valid character in from encoding
1099 * but no related character(s) in to encoding */
1100 /* todo: add more alternative behaviors */
1101 if (opt
&UNDEF_IGNORE
) {
1104 else if (opt
&UNDEF_REPLACE
) {
1105 if (output_replacement_character(destination
, resize_destination
, ec
, &out_start
, out_pos
, &out_stop
) == 0)
1109 rb_raise(rb_eConversionUndefined
, "conversion undefined for byte sequence (maybe invalid byte sequence)");
1111 if (ret
== econv_destination_buffer_full
) {
1112 more_output_buffer(destination
, resize_destination
, max_output
, &out_start
, out_pos
, &out_stop
);
1120 /* sample transcode_loop implementation in byte-by-byte stream style */
1122 transcode_loop(const unsigned char **in_pos
, unsigned char **out_pos
,
1123 const unsigned char *in_stop
, unsigned char *out_stop
,
1125 unsigned char *(*resize_destination
)(VALUE
, int, int),
1126 const char *from_encoding
,
1127 const char *to_encoding
,
1131 rb_transcoding
*last_tc
;
1132 rb_econv_result_t ret
;
1133 unsigned char *out_start
= *out_pos
;
1134 const unsigned char *ptr
;
1137 ec
= rb_econv_open(from_encoding
, to_encoding
, 0);
1139 rb_raise(rb_eArgError
, "transcoding not supported (from %s to %s)", from_encoding
, to_encoding
);
1141 last_tc
= ec
->last_tc
;
1142 max_output
= ec
->elems
[ec
->num_trans
-1].tc
->transcoder
->max_output
;
1144 ret
= econv_source_buffer_empty
;
1146 while (ret
!= econv_finished
) {
1147 unsigned char input_byte
;
1148 const unsigned char *p
= &input_byte
;
1150 if (ret
== econv_source_buffer_empty
) {
1151 if (ptr
< in_stop
) {
1153 ret
= rb_econv_convert(ec
, &p
, p
+1, out_pos
, out_stop
, ECONV_PARTIAL_INPUT
);
1156 ret
= rb_econv_convert(ec
, NULL
, NULL
, out_pos
, out_stop
, 0);
1160 ret
= rb_econv_convert(ec
, NULL
, NULL
, out_pos
, out_stop
, ECONV_PARTIAL_INPUT
);
1162 if (&input_byte
!= p
)
1163 ptr
+= p
- &input_byte
;
1165 case econv_invalid_byte_sequence
:
1166 /* deal with invalid byte sequence */
1167 /* todo: add more alternative behaviors */
1168 if (opt
&INVALID_IGNORE
) {
1171 else if (opt
&INVALID_REPLACE
) {
1172 if (output_replacement_character(destination
, resize_destination
, ec
, &out_start
, out_pos
, &out_stop
) == 0)
1176 rb_raise(rb_eInvalidByteSequence
, "invalid byte sequence");
1179 case econv_undefined_conversion
:
1180 /* valid character in from encoding
1181 * but no related character(s) in to encoding */
1182 /* todo: add more alternative behaviors */
1183 if (opt
&UNDEF_IGNORE
) {
1186 else if (opt
&UNDEF_REPLACE
) {
1187 if (output_replacement_character(destination
, resize_destination
, ec
, &out_start
, out_pos
, &out_stop
) == 0)
1191 rb_raise(rb_eConversionUndefined
, "conversion undefined for byte sequence (maybe invalid byte sequence)");
1194 case econv_destination_buffer_full
:
1195 more_output_buffer(destination
, resize_destination
, max_output
, &out_start
, out_pos
, &out_stop
);
1198 case econv_source_buffer_empty
:
1201 case econv_finished
:
1213 * String-specific code
1216 static unsigned char *
1217 str_transcoding_resize(VALUE destination
, int len
, int new_len
)
1219 rb_str_resize(destination
, new_len
);
1220 return (unsigned char *)RSTRING_PTR(destination
);
1224 str_transcode(int argc
, VALUE
*argv
, VALUE
*self
)
1229 unsigned char *buf
, *bp
, *sp
;
1230 const unsigned char *fromp
;
1231 rb_encoding
*from_enc
, *to_enc
;
1232 const char *from_e
, *to_e
;
1233 int from_encidx
, to_encidx
;
1234 VALUE from_encval
, to_encval
;
1238 opt
= rb_check_convert_type(argv
[argc
-1], T_HASH
, "Hash", "to_hash");
1243 v
= rb_hash_aref(opt
, sym_invalid
);
1246 else if (v
==sym_ignore
) {
1247 options
|= INVALID_IGNORE
;
1249 else if (v
==sym_replace
) {
1250 options
|= INVALID_REPLACE
;
1251 v
= rb_hash_aref(opt
, sym_replace
);
1254 rb_raise(rb_eArgError
, "unknown value for invalid character option");
1256 v
= rb_hash_aref(opt
, sym_undef
);
1259 else if (v
==sym_ignore
) {
1260 options
|= UNDEF_IGNORE
;
1262 else if (v
==sym_replace
) {
1263 options
|= UNDEF_REPLACE
;
1266 rb_raise(rb_eArgError
, "unknown value for undefined character option");
1269 if (argc
< 1 || argc
> 2) {
1270 rb_raise(rb_eArgError
, "wrong number of arguments (%d for 1..2)", argc
);
1272 if ((to_encidx
= rb_to_encoding_index(to_encval
= argv
[0])) < 0) {
1275 to_e
= StringValueCStr(to_encval
);
1278 to_enc
= rb_enc_from_index(to_encidx
);
1279 to_e
= rb_enc_name(to_enc
);
1282 from_encidx
= rb_enc_get_index(str
);
1283 from_enc
= rb_enc_from_index(from_encidx
);
1284 from_e
= rb_enc_name(from_enc
);
1286 else if ((from_encidx
= rb_to_encoding_index(from_encval
= argv
[1])) < 0) {
1288 from_e
= StringValueCStr(from_encval
);
1291 from_enc
= rb_enc_from_index(from_encidx
);
1292 from_e
= rb_enc_name(from_enc
);
1295 if (from_enc
&& from_enc
== to_enc
) {
1298 if (from_enc
&& to_enc
&& rb_enc_asciicompat(from_enc
) && rb_enc_asciicompat(to_enc
)) {
1299 if (ENC_CODERANGE(str
) == ENC_CODERANGE_7BIT
) {
1303 if (encoding_equal(from_e
, to_e
)) {
1307 fromp
= sp
= (unsigned char *)RSTRING_PTR(str
);
1308 slen
= RSTRING_LEN(str
);
1309 blen
= slen
+ 30; /* len + margin */
1310 dest
= rb_str_tmp_new(blen
);
1311 bp
= (unsigned char *)RSTRING_PTR(dest
);
1313 transcode_loop(&fromp
, &bp
, (sp
+slen
), (bp
+blen
), dest
, str_transcoding_resize
, from_e
, to_e
, options
);
1314 if (fromp
!= sp
+slen
) {
1315 rb_raise(rb_eArgError
, "not fully converted, %"PRIdPTRDIFF
" bytes left", sp
+slen
-fromp
);
1317 buf
= (unsigned char *)RSTRING_PTR(dest
);
1319 rb_str_set_len(dest
, bp
- buf
);
1323 to_encidx
= rb_define_dummy_encoding(to_e
);
1331 str_encode_associate(VALUE str
, int encidx
)
1335 rb_enc_associate_index(str
, encidx
);
1337 /* a transcoded string is never broken. */
1338 if (rb_enc_asciicompat(rb_enc_from_index(encidx
))) {
1339 rb_str_coderange_scan_restartable(RSTRING_PTR(str
), RSTRING_END(str
), 0, &cr
);
1342 cr
= ENC_CODERANGE_VALID
;
1344 ENC_CODERANGE_SET(str
, cr
);
1350 * str.encode!(encoding [, options] ) => str
1351 * str.encode!(to_encoding, from_encoding [, options] ) => str
1353 * The first form transcodes the contents of <i>str</i> from
1354 * str.encoding to +encoding+.
1355 * The second form transcodes the contents of <i>str</i> from
1356 * from_encoding to to_encoding.
1357 * The options Hash gives details for conversion. See String#encode
1359 * Returns the string even if no changes were made.
1363 str_encode_bang(int argc
, VALUE
*argv
, VALUE str
)
1366 int encidx
= str_transcode(argc
, argv
, &newstr
);
1368 if (encidx
< 0) return str
;
1369 rb_str_shared_replace(str
, newstr
);
1370 return str_encode_associate(str
, encidx
);
1375 * str.encode(encoding [, options] ) => str
1376 * str.encode(to_encoding, from_encoding [, options] ) => str
1378 * The first form returns a copy of <i>str</i> transcoded
1379 * to encoding +encoding+.
1380 * The second form returns a copy of <i>str</i> transcoded
1381 * from from_encoding to to_encoding.
1382 * The options Hash gives details for conversion. Details
/*
 * Implementation of String#encode (registered with arity -1, so it
 * receives argc/argv and the receiver str).
 *
 * Hands the arguments to str_transcode, which reports an encoding index
 * and delivers its result through &newstr.  A negative index returns a
 * duplicate of str (rb_str_dup).  Otherwise newstr is given the class of
 * str and is passed, with the index, to str_encode_associate, whose result
 * is returned; str itself is not modified here.
 *
 * NOTE(review): the declaration (and any initialisation) of newstr is not
 * part of this listing.
 */
1387 str_encode(int argc
, VALUE
*argv
, VALUE str
)
1390 int encidx
= str_transcode(argc
, argv
, &newstr
);
1392 if (encidx
< 0) return rb_str_dup(str
);
1393 RBASIC(newstr
)->klass
= rb_obj_class(str
);
1394 return str_encode_associate(newstr
, encidx
);
/*
 * rb_str_transcode: C-level shortcut for str.encode(to).  Calls str_encode
 * with a one-element argument vector that holds only the target encoding
 * to, and returns whatever str_encode returns.
 */
1398 rb_str_transcode(VALUE str
, VALUE to
)
1400 return str_encode(1, &to
, str
);
/*
 * econv_free: free function for Encoding::Converter objects.  It is the
 * function handed to Data_Wrap_Struct in econv_s_allocate and the one
 * IS_ECONV compares against to recognise converter objects.
 *
 * NOTE(review): only the signature appears in this listing; the body is
 * not shown -- presumably it releases ec, confirm against the full file.
 */
1404 econv_free(rb_econv_t
*ec
)
/*
 * econv_s_allocate: allocator for Encoding::Converter (installed with
 * rb_define_alloc_func in Init_transcode).
 *
 * Wraps a NULL data pointer in a Data object of class klass, with no mark
 * function and econv_free as the free function.  The pointer stays NULL
 * until econv_init stores the opened converter in DATA_PTR(self).
 */
1410 econv_s_allocate(VALUE klass
)
1412 return Data_Wrap_Struct(klass
, NULL
, econv_free
, NULL
);
/*
 * make_encoding: turns the Ruby value encoding into an rb_encoding *.
 *
 * The index is first looked up with rb_to_encoding_index.  Two ways of
 * producing enc are visible: taking rb_enc_from_index(idx) directly, or
 * defining a dummy encoding named by StringValueCStr(encoding) and taking
 * the encoding for that new index.
 *
 * NOTE(review): the declaration of enc, the condition that selects between
 * the two paths and the return statement are not part of this listing.
 * Presumably the dummy encoding is the fallback for a negative idx --
 * confirm against the full file.
 */
1415 static rb_encoding
*
1416 make_encoding(VALUE encoding
)
1418 int idx
= rb_to_encoding_index(encoding
);
1421 enc
= rb_enc_from_index(idx
);
1423 idx
= rb_define_dummy_encoding(StringValueCStr(encoding
));
1424 enc
= rb_enc_from_index(idx
);
1431 * Encoding::Converter.new(source_encoding, destination_encoding)
1432 * Encoding::Converter.new(source_encoding, destination_encoding, flags)
1435 * Encoding::Converter::UNIVERSAL_NEWLINE_DECODER # convert CRLF and CR to LF at last
1436 * Encoding::Converter::CRLF_NEWLINE_ENCODER # convert LF to CRLF at first
1437 * Encoding::Converter::CR_NEWLINE_ENCODER # convert LF to CR at first
1439 * Encoding::Converter.new creates an instance of Encoding::Converter.
1441 * source_encoding and destination_encoding should be strings.
1442 * flags should be an integer.
1445 * # UTF-16BE to UTF-8
1446 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
1448 * # (1) convert UTF-16BE to UTF-8
1449 * # (2) convert CRLF and CR to LF
1450 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", Encoding::Converter::UNIVERSAL_NEWLINE_DECODER)
1452 * # (1) convert LF to CRLF
1453 * # (2) convert UTF-8 to UTF-16BE
1454 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", Encoding::Converter::CRLF_NEWLINE_ENCODER)
/*
 * econv_init: implementation of Encoding::Converter#initialize.
 *
 * Arguments (rb_scan_args "21": two required, one optional):
 *   source_encoding, destination_encoding -- resolved with make_encoding
 *   flags                                 -- converted with NUM2INT when given
 *
 * Raises TypeError ("already initialized") when DATA_PTR(self) is already
 * set.  Otherwise opens a converter with rb_econv_open using the names of
 * the two encodings and flags, records both encodings on the converter and
 * stores it in DATA_PTR(self).  An ArgumentError naming both encodings
 * reports an unsupported conversion.
 *
 * NOTE(review): this listing omits some lines of the function (the
 * declarations of ec and flags, the statement run when flags_v is nil, the
 * guard in front of the ArgumentError -- presumably a check that
 * rb_econv_open returned NULL -- and the return statement).
 */
1458 econv_init(int argc
, VALUE
*argv
, VALUE self
)
1460 VALUE source_encoding
, destination_encoding
, flags_v
;
1461 rb_encoding
*senc
, *denc
;
1465 rb_scan_args(argc
, argv
, "21", &source_encoding
, &destination_encoding
, &flags_v
);
1467 if (flags_v
== Qnil
)
1470 flags
= NUM2INT(flags_v
);
1472 senc
= make_encoding(source_encoding
);
1473 denc
= make_encoding(destination_encoding
);
/* refuse to initialise the same object twice */
1475 if (DATA_PTR(self
)) {
1476 rb_raise(rb_eTypeError
, "already initialized");
1479 ec
= rb_econv_open(senc
->name
, denc
->name
, flags
);
1481 rb_raise(rb_eArgError
, "encoding converter not supported (from %s to %s)", senc
->name
, denc
->name
);
/* remembered so that source_encoding / destination_encoding can report them */
1484 ec
->source_encoding
= senc
;
1485 ec
->destination_encoding
= denc
;
1487 DATA_PTR(self
) = ec
;
/*
 * econv_inspect: implementation of Encoding::Converter#inspect.
 *
 * Builds the result with rb_sprintf from the class name of self:
 *   "#<Class: uninitialized>"  or  "#<Class: FROM to TO>"
 * where TO is ec->last_tc->transcoder->to_encoding.
 *
 * The converter is read straight from DATA_PTR(self) rather than through
 * check_econv, which would raise for a NULL pointer.
 *
 * NOTE(review): the condition that selects the "uninitialized" form
 * (presumably a NULL ec) and the argument supplying FROM are not part of
 * this listing.
 */
1493 econv_inspect(VALUE self
)
1495 const char *cname
= rb_obj_classname(self
);
1496 rb_econv_t
*ec
= DATA_PTR(self
);
1499 return rb_sprintf("#<%s: uninitialized>", cname
);
1501 return rb_sprintf("#<%s: %s to %s>", cname
,
1503 ec
->last_tc
->transcoder
->to_encoding
);
/*
 * IS_ECONV(obj): true when the free function of the Data object obj is
 * econv_free, i.e. the object was wrapped the way econv_s_allocate does.
 * RDATA() is applied unconditionally, so the caller has to make sure obj
 * is a T_DATA object first (check_econv does this with Check_Type).
 */
1506 #define IS_ECONV(obj) (RDATA(obj)->dfree == (RUBY_DATA_FUNC)econv_free)
/*
 * check_econv: validates self and returns its converter (DATA_PTR(self)).
 *
 * Checks, in order:
 *   - self is a T_DATA object (Check_Type);
 *   - self satisfies IS_ECONV, otherwise TypeError
 *     "wrong argument type ... (expected Encoding::Converter)";
 *   - DATA_PTR(self) is not NULL, otherwise TypeError
 *     "uninitialized encoding converter".
 */
1509 check_econv(VALUE self
)
1511 Check_Type(self
, T_DATA
);
1512 if (!IS_ECONV(self
)) {
1513 rb_raise(rb_eTypeError
, "wrong argument type %s (expected Encoding::Converter)",
1514 rb_class2name(CLASS_OF(self
)));
1516 if (!DATA_PTR(self
)) {
1517 rb_raise(rb_eTypeError
, "uninitialized encoding converter");
1519 return DATA_PTR(self
);
1524 * source_encoding -> encoding
1526 * Returns the source encoding as an Encoding object.
/*
 * econv_source_encoding: implementation of
 * Encoding::Converter#source_encoding.
 *
 * Fetches the converter with check_econv (which raises for an invalid or
 * uninitialized object) and returns ec->source_encoding converted to a
 * Ruby object with rb_enc_from_encoding.
 *
 * NOTE(review): the statement executed when ec->source_encoding is NULL is
 * not part of this listing -- presumably it returns nil.
 */
1529 econv_source_encoding(VALUE self
)
1531 rb_econv_t
*ec
= check_econv(self
);
1532 if (!ec
->source_encoding
)
1534 return rb_enc_from_encoding(ec
->source_encoding
);
1539 * destination_encoding -> encoding
1541 * Returns the destination encoding as an Encoding object.
/*
 * econv_destination_encoding: implementation of
 * Encoding::Converter#destination_encoding.
 *
 * Fetches the converter with check_econv (which raises for an invalid or
 * uninitialized object) and returns ec->destination_encoding converted to
 * a Ruby object with rb_enc_from_encoding.
 *
 * NOTE(review): the statement executed when ec->destination_encoding is
 * NULL is not part of this listing -- presumably it returns nil.
 */
1544 econv_destination_encoding(VALUE self
)
1546 rb_econv_t
*ec
= check_econv(self
);
1547 if (!ec
->destination_encoding
)
1549 return rb_enc_from_encoding(ec
->destination_encoding
);
1554 * primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
1555 * primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, flags) -> symbol
1558 * Encoding::Converter::PARTIAL_INPUT # source buffer may be part of larger source
1559 * Encoding::Converter::OUTPUT_FOLLOWED_BY_INPUT # stop conversion after output before input
1562 * :invalid_byte_sequence
1563 * :undefined_conversion
1564 * :output_followed_by_input
1565 * :destination_buffer_full
1566 * :source_buffer_empty
1569 * primitive_convert converts source_buffer into destination_buffer.
1571 * source_buffer and destination_buffer should be a string.
1572 * destination_byteoffset should be an integer or nil.
1573 * destination_bytesize and flags should be an integer.
1575 * primitive_convert converts the content of source_buffer from the beginning
1576 * and stores the result in destination_buffer.
1578 * destination_byteoffset and destination_bytesize specify the region in which
1579 * the converted result is stored.
1580 * destination_byteoffset specifies the start position in destination_buffer in bytes.
1581 * If destination_byteoffset is nil,
1582 * destination_buffer.bytesize is used for appending the result.
1583 * destination_bytesize specifies maximum number of bytes.
1584 * After conversion, destination_buffer is resized to
1585 * destination_byteoffset + actually converted number of bytes.
1586 * Also destination_buffer's encoding is set to destination_encoding.
1588 * primitive_convert drops the first part of source_buffer.
1589 * The dropped part is either converted into destination_buffer or
1590 * buffered in the Encoding::Converter object.
1592 * primitive_convert stops conversion when one of the following conditions is met.
1593 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
1594 * - character not representable in output encoding (:undefined_conversion)
1595 * - after some output is generated, before input is done (:output_followed_by_input)
1596 * this occurs only when OUTPUT_FOLLOWED_BY_INPUT is specified.
1597 * - destination buffer is full (:destination_buffer_full)
1598 * - source buffer is empty (:source_buffer_empty)
1599 * this occurs only when PARTIAL_INPUT is specified.
1600 * - conversion is finished (:finished)
1603 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
1604 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
1605 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
1607 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
1608 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
1609 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
1610 * ret = ec.primitive_convert(src, dst="", nil, 1)
1611 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
1612 * ret = ec.primitive_convert(src, dst="", nil, 1)
1613 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
1614 * ret = ec.primitive_convert(src, dst="", nil, 1)
1615 * p [ret, src, dst] #=> [:finished, "", "i"]
/*
 * econv_primitive_convert: implementation of
 * Encoding::Converter#primitive_convert.
 *
 * Arguments (rb_scan_args "41": four required, one optional):
 *   input, output        -- source and destination strings
 *   output_byteoffset_v  -- start offset in output, or nil to append at
 *                           the current length of output
 *   output_bytesize_v    -- maximum number of bytes to produce
 *   flags_v              -- optional integer flags for rb_econv_convert
 *
 * Steps:
 *   1. Convert the numeric arguments and make output a modifiable String.
 *   2. Validate the destination region; ArgumentError is raised for a
 *      negative offset or size, an offset beyond the length of output, or
 *      an offset+size sum that overflows.
 *   3. Grow output with rb_str_resize when its capacity is smaller than
 *      offset+size.
 *   4. Run rb_econv_convert over [ip, is) into [op, os).
 *   5. Set the length of output to where the write pointer stopped, drop
 *      the consumed bytes from the front of input, and tag output with the
 *      destination encoding when the converter has one.
 *   6. Return the result code as a Symbol (the integer itself for an
 *      unknown code).
 *
 * NOTE(review): this listing omits some lines of the function (the
 * declaration of flags, the statement run when flags_v is nil, any
 * validation of input, and the switch header in front of the case labels).
 */
1619 econv_primitive_convert(int argc
, VALUE
*argv
, VALUE self
)
1621 VALUE input
, output
, output_byteoffset_v
, output_bytesize_v
, flags_v
;
1622 rb_econv_t
*ec
= check_econv(self
);
1623 rb_econv_result_t res
;
1624 const unsigned char *ip
, *is
;
1625 unsigned char *op
, *os
;
1626 long output_byteoffset
, output_bytesize
;
1627 unsigned long output_byteend
;
1630 rb_scan_args(argc
, argv
, "41", &input
, &output
, &output_byteoffset_v
, &output_bytesize_v
, &flags_v
);
/* a nil offset gets a temporary 0 here; it is replaced by the length of
 * output further down, once output has been checked to be a String */
1632 if (output_byteoffset_v
== Qnil
)
1633 output_byteoffset
= 0;
1635 output_byteoffset
= NUM2LONG(output_byteoffset_v
);
1637 output_bytesize
= NUM2LONG(output_bytesize_v
);
1639 if (flags_v
== Qnil
)
1642 flags
= NUM2INT(flags_v
);
1644 StringValue(output
);
1646 rb_str_modify(output
);
1648 if (output_byteoffset_v
== Qnil
)
1649 output_byteoffset
= RSTRING_LEN(output
);
1651 if (output_byteoffset
< 0)
1652 rb_raise(rb_eArgError
, "negative output_byteoffset");
1654 if (RSTRING_LEN(output
) < output_byteoffset
)
1655 rb_raise(rb_eArgError
, "output_byteoffset too big");
1657 if (output_bytesize
< 0)
1658 rb_raise(rb_eArgError
, "negative output_bytesize");
/* the end position is computed in unsigned long so that wrap-around can be
 * detected: a sum smaller than the offset has wrapped, and a sum above
 * LONG_MAX is rejected as well */
1660 output_byteend
= (unsigned long)output_byteoffset
+
1661 (unsigned long)output_bytesize
;
1663 if (output_byteend
< (unsigned long)output_byteoffset
||
1664 LONG_MAX
< output_byteend
)
1665 rb_raise(rb_eArgError
, "output_byteoffset+output_bytesize too big");
1667 if (rb_str_capacity(output
) < output_byteend
)
1668 rb_str_resize(output
, output_byteend
);
/* ip..is delimit the whole input; op..os delimit the destination region */
1670 ip
= (const unsigned char *)RSTRING_PTR(input
);
1671 is
= ip
+ RSTRING_LEN(input
);
1673 op
= (unsigned char *)RSTRING_PTR(output
) + output_byteoffset
;
1674 os
= op
+ output_bytesize
;
1676 res
= rb_econv_convert(ec
, &ip
, is
, &op
, os
, flags
);
/* op and ip were advanced by the conversion: truncate output to what was
 * written and remove what was consumed from the front of input */
1677 rb_str_set_len(output
, op
-(unsigned char *)RSTRING_PTR(output
));
1678 rb_str_drop_bytes(input
, ip
- (unsigned char *)RSTRING_PTR(input
));
1680 if (ec
->destination_encoding
) {
1681 rb_enc_associate(output
, ec
->destination_encoding
);
1685 case econv_invalid_byte_sequence
: return ID2SYM(rb_intern("invalid_byte_sequence"));
1686 case econv_undefined_conversion
: return ID2SYM(rb_intern("undefined_conversion"));
1687 case econv_destination_buffer_full
: return ID2SYM(rb_intern("destination_buffer_full"));
1688 case econv_source_buffer_empty
: return ID2SYM(rb_intern("source_buffer_empty"));
1689 case econv_finished
: return ID2SYM(rb_intern("finished"));
1690 case econv_output_followed_by_input
: return ID2SYM(rb_intern("output_followed_by_input"));
1691 default: return INT2NUM(res
); /* should not be reached */
/*
 * Init_transcode: sets up everything this file exposes to Ruby.
 *
 *   - defines the exception classes Encoding::ConversionUndefined and
 *     Encoding::InvalidByteSequence (both under rb_cEncoding, with
 *     rb_eStandardError as superclass);
 *   - creates the transcoder registry transcoder_table;
 *   - interns the option symbols :invalid, :undef, :ignore and :replace;
 *   - defines String#encode and String#encode! (arity -1: they receive
 *     argc/argv);
 *   - defines the class Encoding::Converter with its allocator, its methods
 *     and its integer flag constants.
 */
1696 Init_transcode(void)
1698 rb_eConversionUndefined
= rb_define_class_under(rb_cEncoding
, "ConversionUndefined", rb_eStandardError
);
1699 rb_eInvalidByteSequence
= rb_define_class_under(rb_cEncoding
, "InvalidByteSequence", rb_eStandardError
);
1701 transcoder_table
= st_init_strcasetable();
1703 sym_invalid
= ID2SYM(rb_intern("invalid"));
1704 sym_undef
= ID2SYM(rb_intern("undef"));
1705 sym_ignore
= ID2SYM(rb_intern("ignore"));
1706 sym_replace
= ID2SYM(rb_intern("replace"));
1708 rb_define_method(rb_cString
, "encode", str_encode
, -1);
1709 rb_define_method(rb_cString
, "encode!", str_encode_bang
, -1);
1711 rb_cEncodingConverter
= rb_define_class_under(rb_cEncoding
, "Converter", rb_cData
);
1712 rb_define_alloc_func(rb_cEncodingConverter
, econv_s_allocate
);
1713 rb_define_method(rb_cEncodingConverter
, "initialize", econv_init
, -1);
1714 rb_define_method(rb_cEncodingConverter
, "inspect", econv_inspect
, 0);
1715 rb_define_method(rb_cEncodingConverter
, "source_encoding", econv_source_encoding
, 0);
1716 rb_define_method(rb_cEncodingConverter
, "destination_encoding", econv_destination_encoding
, 0);
1717 rb_define_method(rb_cEncodingConverter
, "primitive_convert", econv_primitive_convert
, -1);
/* flag values accepted by Encoding::Converter.new and primitive_convert */
1718 rb_define_const(rb_cEncodingConverter
, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT
));
1719 rb_define_const(rb_cEncodingConverter
, "OUTPUT_FOLLOWED_BY_INPUT", INT2FIX(ECONV_OUTPUT_FOLLOWED_BY_INPUT
));
1720 rb_define_const(rb_cEncodingConverter
, "UNIVERSAL_NEWLINE_DECODER", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECODER
));
1721 rb_define_const(rb_cEncodingConverter
, "CRLF_NEWLINE_ENCODER", INT2FIX(ECONV_CRLF_NEWLINE_ENCODER
));
1722 rb_define_const(rb_cEncodingConverter
, "CR_NEWLINE_ENCODER", INT2FIX(ECONV_CR_NEWLINE_ENCODER
));