* transcode_data.h (rb_transcoder): add resetstate_func field for
[ruby-svn.git] / transcode.c
blob1802552a410fbc590611729c658b98df9224b246
1 /**********************************************************************
3 transcode.c -
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
8 Copyright (C) 2007 Martin Duerst
10 **********************************************************************/
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #define PType (int)
15 #include "transcode_data.h"
16 #include <ctype.h>
/* Symbols :invalid, :undef, :ignore and :replace; assigned in Init_transcode
 * and compared against the options hash in str_transcode. */
18 static VALUE sym_invalid, sym_undef, sym_ignore, sym_replace;
/* Option bits built by str_transcode and tested by transcode_loop:
 * what to do on an invalid input sequence ... */
19 #define INVALID_IGNORE 0x1
20 #define INVALID_REPLACE 0x2
/* ... and on a character with no defined conversion. */
21 #define UNDEF_IGNORE 0x10
22 #define UNDEF_REPLACE 0x20
/* Tells the conversion engine that more input may follow, so running out of
 * input suspends with transcode_ibuf_empty instead of finishing. */
23 #define PARTIAL_INPUT 0x100
26 * Dispatch data and logic
/* One known conversion step from encoding `from` to encoding `to`. */
29 typedef struct {
30 const char *from;
31 const char *to;
32 const char *lib; /* may be NULL, meaning there is no library to load */
33 const rb_transcoder *transcoder;
34 } transcoder_entry_t;
/* Two-level table: source encoding name -> (st_table mapping destination
 * encoding name -> transcoder_entry_t *). Created in Init_transcode. */
36 static st_table *transcoder_table;
38 static transcoder_entry_t *
39 make_transcoder_entry(const char *from, const char *to)
41 st_data_t val;
42 st_table *table2;
44 if (!st_lookup(transcoder_table, (st_data_t)from, &val)) {
45 val = (st_data_t)st_init_strcasetable();
46 st_add_direct(transcoder_table, (st_data_t)from, val);
48 table2 = (st_table *)val;
49 if (!st_lookup(table2, (st_data_t)to, &val)) {
50 transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
51 entry->from = from;
52 entry->to = to;
53 entry->lib = NULL;
54 entry->transcoder = NULL;
55 val = (st_data_t)entry;
56 st_add_direct(table2, (st_data_t)to, val);
58 return (transcoder_entry_t *)val;
61 static transcoder_entry_t *
62 get_transcoder_entry(const char *from, const char *to)
64 st_data_t val;
65 st_table *table2;
67 if (!st_lookup(transcoder_table, (st_data_t)from, &val)) {
68 return NULL;
70 table2 = (st_table *)val;
71 if (!st_lookup(table2, (st_data_t)to, &val)) {
72 return NULL;
74 return (transcoder_entry_t *)val;
77 void
78 rb_register_transcoder(const rb_transcoder *tr)
80 const char *const from_e = tr->from_encoding;
81 const char *const to_e = tr->to_encoding;
83 transcoder_entry_t *entry;
85 entry = make_transcoder_entry(from_e, to_e);
86 if (entry->transcoder) {
87 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
88 from_e, to_e);
91 entry->transcoder = tr;
94 static void
95 declare_transcoder(const char *to, const char *from, const char *lib)
97 transcoder_entry_t *entry;
99 entry = make_transcoder_entry(from, to);
100 entry->lib = lib;
/* Longest library name accepted by rb_declare_transcoder; also sizes the
 * path buffer in load_transcoder. */
103 #define MAX_TRANSCODER_LIBNAME_LEN 64
/* Prepended to the library name to form the path passed to rb_require. */
104 static const char transcoder_lib_prefix[] = "enc/trans/";
106 void
107 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
109 if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
110 rb_raise(rb_eArgError, "invalid library name - %s",
111 lib ? lib : "(null)");
113 declare_transcoder(enc1, enc2, lib);
114 declare_transcoder(enc2, enc1, lib);
/* True when two encoding names compare equal under STRCASECMP. */
117 #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
/* Singly linked queue node used by transcode_search_path; holds one
 * encoding name still to be expanded. */
119 typedef struct search_path_queue_tag {
120 struct search_path_queue_tag *next;
121 const char *enc;
122 } search_path_queue_t;
/* Working state of the search in transcode_search_path. */
124 typedef struct {
/* encoding name -> name of the encoding it was reached from (NULL for the start) */
125 st_table *visited;
/* head of the pending queue */
126 search_path_queue_t *queue;
/* address of the link where the next node gets appended */
127 search_path_queue_t **queue_last_ptr;
/* encoding currently being expanded; recorded as predecessor of new nodes */
128 const char *base_enc;
129 } search_path_bfs_t;
131 static int
132 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
134 const char *to = (const char *)key;
135 search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
136 search_path_queue_t *q;
138 if (st_lookup(bfs->visited, (st_data_t)to, &val)) {
139 return ST_CONTINUE;
142 q = ALLOC(search_path_queue_t);
143 q->enc = to;
144 q->next = NULL;
145 *bfs->queue_last_ptr = q;
146 bfs->queue_last_ptr = &q->next;
148 st_add_direct(bfs->visited, (st_data_t)to, (st_data_t)bfs->base_enc);
149 return ST_CONTINUE;
152 static int
153 transcode_search_path(const char *from, const char *to,
154 void (*callback)(const char *from, const char *to, int depth, void *arg),
155 void *arg)
157 search_path_bfs_t bfs;
158 search_path_queue_t *q;
159 st_data_t val;
160 st_table *table2;
161 int found;
163 q = ALLOC(search_path_queue_t);
164 q->enc = from;
165 q->next = NULL;
166 bfs.queue_last_ptr = &q->next;
167 bfs.queue = q;
169 bfs.visited = st_init_strcasetable();
170 st_add_direct(bfs.visited, (st_data_t)from, (st_data_t)NULL);
172 while (bfs.queue) {
173 q = bfs.queue;
174 bfs.queue = q->next;
175 if (!bfs.queue)
176 bfs.queue_last_ptr = &bfs.queue;
178 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
179 xfree(q);
180 continue;
182 table2 = (st_table *)val;
184 if (st_lookup(table2, (st_data_t)to, &val)) {
185 st_add_direct(bfs.visited, (st_data_t)to, (st_data_t)q->enc);
186 xfree(q);
187 found = 1;
188 goto cleanup;
191 bfs.base_enc = q->enc;
192 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
193 bfs.base_enc = NULL;
195 xfree(q);
197 found = 0;
199 cleanup:
200 while (bfs.queue) {
201 q = bfs.queue;
202 bfs.queue = q->next;
203 xfree(q);
206 if (found) {
207 const char *enc = to;
208 int depth = 0;
209 while (1) {
210 st_lookup(bfs.visited, (st_data_t)enc, &val);
211 if (!val)
212 break;
213 depth++;
214 enc = (const char *)val;
216 enc = to;
217 while (1) {
218 st_lookup(bfs.visited, (st_data_t)enc, &val);
219 if (!val)
220 break;
221 callback((const char *)val, enc, --depth, arg);
222 enc = (const char *)val;
226 st_free_table(bfs.visited);
228 return found;
231 static const rb_transcoder *
232 load_transcoder(transcoder_entry_t *entry)
234 if (entry->transcoder)
235 return entry->transcoder;
237 if (entry->lib) {
238 const char *lib = entry->lib;
239 int len = strlen(lib);
240 char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN];
242 entry->lib = NULL;
244 if (len > MAX_TRANSCODER_LIBNAME_LEN)
245 return NULL;
246 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
247 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
248 if (!rb_require(path))
249 return NULL;
252 if (entry->transcoder)
253 return entry->transcoder;
255 return NULL;
258 static const char*
259 get_replacement_character(rb_encoding *enc, int *len_ret)
261 static rb_encoding *utf16be_encoding, *utf16le_encoding;
262 static rb_encoding *utf32be_encoding, *utf32le_encoding;
263 if (!utf16be_encoding) {
264 utf16be_encoding = rb_enc_find("UTF-16BE");
265 utf16le_encoding = rb_enc_find("UTF-16LE");
266 utf32be_encoding = rb_enc_find("UTF-32BE");
267 utf32le_encoding = rb_enc_find("UTF-32LE");
269 if (rb_utf8_encoding() == enc) {
270 *len_ret = 3;
271 return "\xEF\xBF\xBD";
273 else if (utf16be_encoding == enc) {
274 *len_ret = 2;
275 return "\xFF\xFD";
277 else if (utf16le_encoding == enc) {
278 *len_ret = 2;
279 return "\xFD\xFF";
281 else if (utf32be_encoding == enc) {
282 *len_ret = 4;
283 return "\x00\x00\xFF\xFD";
285 else if (utf32le_encoding == enc) {
286 *len_ret = 4;
287 return "\xFD\xFF\x00\x00";
289 else {
290 *len_ret = 1;
291 return "?";
296 * Transcoding engine logic
299 static const unsigned char *
300 transcode_char_start(rb_transcoding *tc,
301 const unsigned char *in_start,
302 const unsigned char *inchar_start,
303 const unsigned char *in_p,
304 size_t *char_len_ptr)
306 const unsigned char *ptr;
307 if (inchar_start - in_start < tc->recognized_len) {
308 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
309 inchar_start, unsigned char, in_p - inchar_start);
310 ptr = TRANSCODING_READBUF(tc);
312 else {
313 ptr = inchar_start - tc->recognized_len;
315 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
316 return ptr;
/*
 * Resumable conversion step for a single transcoder.
 *
 * Consumes bytes from *in_pos (up to in_stop) and writes converted bytes
 * at *out_pos (up to out_stop), following the transcoder's lookup tables
 * one input byte at a time. Whenever it has to stop, the SUSPEND macro
 * stores the working state in `tc`, publishes the new positions through
 * in_pos/out_pos and returns an rb_trans_result_t; the next call jumps
 * back to the matching resume label through tc->resume_position.
 *
 * A NULL in_pos or out_pos is treated as an empty buffer. Running out of
 * input suspends with transcode_ibuf_empty only when `opt` contains
 * PARTIAL_INPUT; otherwise it ends the conversion.
 */
319 static rb_trans_result_t
320 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
321 const unsigned char *in_stop, unsigned char *out_stop,
322 rb_transcoding *tc,
323 const int opt)
326 const rb_transcoder *tr = tc->transcoder;
327 int unitlen = tr->input_unit_length;
328 int readagain_len = 0;
330 const unsigned char *inchar_start;
331 const unsigned char *in_p;
333 unsigned char *out_p;
334 const BYTE_LOOKUP *next_table;
335 VALUE next_info;
336 unsigned char next_byte;
338 unsigned char empty_buf;
339 unsigned char *empty_ptr = &empty_buf;
/* substitute an empty buffer for a missing input or output */
341 if (!in_pos) {
342 in_pos = (const unsigned char **)&empty_ptr;
343 in_stop = empty_ptr;
346 if (!out_pos) {
347 out_pos = &empty_ptr;
348 out_stop = empty_ptr;
351 in_p = inchar_start = *in_pos;
/* reload the lookup state saved by the previous suspension */
353 out_p = *out_pos;
354 next_table = tc->next_table;
355 next_info = tc->next_info;
356 next_byte = tc->next_byte;
/*
 * SUSPEND(ret, num): record resume position `num`, append the bytes read
 * since inchar_start to the read buffer, publish the in/out positions,
 * move any read-again bytes out of recognized_len, save the lookup state
 * and return `ret`. Execution continues at resume_label<num> on the next
 * call.
 */
358 #define SUSPEND(ret, num) \
359 do { \
360 tc->resume_position = (num); \
361 if (0 < in_p - inchar_start) \
362 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
363 inchar_start, unsigned char, in_p - inchar_start); \
364 *in_pos = in_p; \
365 *out_pos = out_p; \
366 tc->recognized_len += in_p - inchar_start; \
367 if (readagain_len) { \
368 tc->recognized_len -= readagain_len; \
369 tc->readagain_len = readagain_len; \
371 tc->next_table = next_table; \
372 tc->next_info = next_info; \
373 tc->next_byte = next_byte; \
374 return ret; \
375 resume_label ## num:; \
376 } while (0)
/* continue from wherever the previous call suspended (0 = fresh start) */
378 switch (tc->resume_position) {
379 case 0: break;
380 case 1: goto resume_label1;
381 case 2: goto resume_label2;
382 case 3: goto resume_label3;
383 case 4: goto resume_label4;
384 case 5: goto resume_label5;
385 case 6: goto resume_label6;
386 case 7: goto resume_label7;
387 case 8: goto resume_label8;
388 case 9: goto resume_label9;
389 case 10: goto resume_label10;
390 case 11: goto resume_label11;
391 case 12: goto resume_label12;
392 case 13: goto resume_label13;
393 case 14: goto resume_label14;
/* main loop: one iteration per input character */
396 while (1) {
397 if (in_stop <= in_p) {
398 if (!(opt & PARTIAL_INPUT))
399 break;
400 SUSPEND(transcode_ibuf_empty, 7);
401 continue;
/* start of a new character: restart at the root lookup table */
404 tc->recognized_len = 0;
405 inchar_start = in_p;
406 next_table = tr->conv_tree_start;
407 next_byte = (unsigned char)*in_p++;
408 follow_byte:
/* a byte outside the table's [base[0], base[1]] range is invalid */
409 if (next_byte < next_table->base[0] || next_table->base[1] < next_byte)
410 next_info = INVALID;
411 else {
412 unsigned int next_offset = next_table->base[2+next_byte-next_table->base[0]];
413 next_info = (VALUE)next_table->info[next_offset];
415 follow_info:
/* the low five bits of next_info select the action */
416 switch (next_info & 0x1F) {
417 case NOMAP: /* xxx: copy last byte only? */
418 while (out_stop - out_p < 1) { SUSPEND(transcode_obuf_full, 3); }
419 *out_p++ = next_byte;
420 continue;
/* next_info is a pointer to the next lookup table: read one more byte and descend */
421 case 0x00: case 0x04: case 0x08: case 0x0C:
422 case 0x10: case 0x14: case 0x18: case 0x1C:
423 while (in_p >= in_stop) {
424 if (!(opt & PARTIAL_INPUT))
425 goto invalid;
426 SUSPEND(transcode_ibuf_empty, 5);
428 next_byte = (unsigned char)*in_p++;
429 next_table = (const BYTE_LOOKUP *)next_info;
430 goto follow_byte;
431 case ZERObt: /* drop input */
432 continue;
/* ONEbt..FOURbt: the output bytes are packed into next_info itself */
433 case ONEbt:
434 while (out_stop - out_p < 1) { SUSPEND(transcode_obuf_full, 9); }
435 *out_p++ = getBT1(next_info);
436 continue;
437 case TWObt:
438 while (out_stop - out_p < 2) { SUSPEND(transcode_obuf_full, 10); }
439 *out_p++ = getBT1(next_info);
440 *out_p++ = getBT2(next_info);
441 continue;
442 case THREEbt:
443 while (out_stop - out_p < 3) { SUSPEND(transcode_obuf_full, 11); }
444 *out_p++ = getBT1(next_info);
445 *out_p++ = getBT2(next_info);
446 *out_p++ = getBT3(next_info);
447 continue;
448 case FOURbt:
449 while (out_stop - out_p < 4) { SUSPEND(transcode_obuf_full, 12); }
450 *out_p++ = getBT0(next_info);
451 *out_p++ = getBT1(next_info);
452 *out_p++ = getBT2(next_info);
453 *out_p++ = getBT3(next_info);
454 continue;
/* FUNii / FUNsi: a transcoder callback computes a new next_info, which is dispatched again */
455 case FUNii:
456 next_info = (VALUE)(*tr->func_ii)(tc, next_info);
457 goto follow_info;
458 case FUNsi:
460 const unsigned char *char_start;
461 size_t char_len;
462 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
463 next_info = (VALUE)(*tr->func_si)(tc, char_start, (size_t)char_len);
464 goto follow_info;
/* FUNio / FUNso: a transcoder callback writes output directly; room for max_output bytes is ensured first */
466 case FUNio:
467 while (out_stop - out_p < tr->max_output) { SUSPEND(transcode_obuf_full, 13); }
468 out_p += (VALUE)(*tr->func_io)(tc, next_info, out_p);
469 break;
470 case FUNso:
472 const unsigned char *char_start;
473 size_t char_len;
474 while (out_stop - out_p < tr->max_output) { SUSPEND(transcode_obuf_full, 14); }
475 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
476 out_p += (VALUE)(*tr->func_so)(tc, char_start, (size_t)char_len, out_p);
477 break;
/* INVALID: decide how many bytes the invalid sequence covers, in whole
 * input units of `unitlen` bytes; bytes beyond that are scheduled to be
 * read again via readagain_len. */
479 case INVALID:
480 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
481 while ((opt & PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
482 in_p = in_stop;
483 SUSPEND(transcode_ibuf_empty, 8);
485 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
486 in_p = in_stop;
488 else {
489 in_p = inchar_start + (unitlen - tc->recognized_len);
492 else {
493 int invalid_len; /* including the last byte which causes invalid */
494 int discard_len;
495 invalid_len = tc->recognized_len + (in_p - inchar_start);
496 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
497 readagain_len = invalid_len - discard_len;
499 goto invalid;
500 case UNDEF:
501 goto undef;
503 continue;
/* report the error; conversion continues with the next character when called again */
505 invalid:
506 SUSPEND(transcode_invalid_input, 1);
507 continue;
509 undef:
510 SUSPEND(transcode_undefined_conversion, 2);
511 continue;
514 /* cleanup */
/* input is exhausted: let the transcoder emit its trailing output, if it has a finish_func */
515 if (tr->finish_func) {
516 while (out_stop - out_p < tr->max_output) {
517 SUSPEND(transcode_obuf_full, 4);
519 out_p += tr->finish_func(tc, out_p);
/* once finished, every further call reports transcode_finished again */
521 while (1)
522 SUSPEND(transcode_finished, 6);
523 #undef SUSPEND
526 static rb_trans_result_t
527 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
528 const unsigned char *in_stop, unsigned char *out_stop,
529 rb_transcoding *tc,
530 const int opt)
532 if (tc->readagain_len) {
533 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
534 const unsigned char *readagain_pos = readagain_buf;
535 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
536 rb_trans_result_t res;
538 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
539 unsigned char, tc->readagain_len);
540 tc->readagain_len = 0;
541 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|PARTIAL_INPUT);
542 if (res != transcode_ibuf_empty) {
543 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
544 readagain_pos, unsigned char, readagain_stop - readagain_pos);
545 tc->readagain_len += readagain_stop - readagain_pos;
546 return res;
549 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
552 static rb_transcoding *
553 rb_transcoding_open(const char *from, const char *to, int flags)
555 rb_transcoding *tc;
556 const rb_transcoder *tr;
558 transcoder_entry_t *entry;
560 entry = get_transcoder_entry(from, to);
561 if (!entry)
562 return NULL;
564 tr = load_transcoder(entry);
565 if (!tr)
566 return NULL;
568 tc = ALLOC(rb_transcoding);
569 tc->transcoder = tr;
570 tc->flags = flags;
571 memset(tc->stateful, 0, sizeof(tc->stateful));
572 tc->resume_position = 0;
573 tc->recognized_len = 0;
574 tc->readagain_len = 0;
575 if (sizeof(tc->readbuf.ary) < tr->max_input) {
576 tc->readbuf.ptr = xmalloc(tr->max_input);
578 return tc;
581 static rb_trans_result_t
582 rb_transcoding_convert(rb_transcoding *tc,
583 const unsigned char **input_ptr, const unsigned char *input_stop,
584 unsigned char **output_ptr, unsigned char *output_stop,
585 int flags)
587 return transcode_restartable(
588 input_ptr, output_ptr,
589 input_stop, output_stop,
590 tc, flags);
593 static void
594 rb_transcoding_close(rb_transcoding *tc)
596 const rb_transcoder *tr = tc->transcoder;
597 if (sizeof(tc->readbuf.ary) < tr->max_input)
598 xfree(tc->readbuf.ptr);
599 xfree(tc);
602 static void
603 trans_open_i(const char *from, const char *to, int depth, void *arg)
605 rb_trans_t **tsp = (rb_trans_t **)arg;
606 rb_trans_t *ts;
607 int i;
609 if (!*tsp) {
610 ts = *tsp = ALLOC(rb_trans_t);
611 ts->num_trans = depth+1;
612 ts->elems = ALLOC_N(rb_trans_elem_t, ts->num_trans);
613 ts->num_finished = 0;
614 for (i = 0; i < ts->num_trans; i++) {
615 ts->elems[i].tc = NULL;
616 ts->elems[i].out_buf_start = NULL;
617 ts->elems[i].out_data_start = NULL;
618 ts->elems[i].out_data_end = NULL;
619 ts->elems[i].out_buf_end = NULL;
620 ts->elems[i].last_result = transcode_ibuf_empty;
623 else {
624 ts = *tsp;
627 ts->elems[depth].tc = rb_transcoding_open(from, to, 0);
628 if (depth < ts->num_trans-1) {
629 int bufsize = 4096;
630 unsigned char *p;
631 p = xmalloc(bufsize);
632 ts->elems[depth].out_buf_start = p;
633 ts->elems[depth].out_buf_end = p + bufsize;
634 ts->elems[depth].out_data_start = p;
635 ts->elems[depth].out_data_end = p;
639 static rb_trans_t *
640 rb_trans_open(const char *from, const char *to, int flags)
642 rb_trans_t *ts = NULL;
644 transcode_search_path(from, to, trans_open_i, (void *)&ts);
646 if (!ts)
647 return NULL;
649 return ts;
652 static int
653 trans_sweep(rb_trans_t *ts,
654 const unsigned char **input_ptr, const unsigned char *input_stop,
655 unsigned char **output_ptr, unsigned char *output_stop,
656 int flags,
657 int start)
659 int try;
660 int i, f;
662 const unsigned char **ipp, *is, *iold;
663 unsigned char **opp, *os, *oold;
664 rb_trans_result_t res;
666 try = 1;
667 while (try) {
668 try = 0;
669 for (i = start; i < ts->num_trans; i++) {
670 rb_trans_elem_t *te = &ts->elems[i];
672 if (i == 0) {
673 ipp = input_ptr;
674 is = input_stop;
676 else {
677 rb_trans_elem_t *prev_te = &ts->elems[i-1];
678 ipp = (const unsigned char **)&prev_te->out_data_start;
679 is = prev_te->out_data_end;
682 if (!te->out_buf_start) {
683 opp = output_ptr;
684 os = output_stop;
686 else {
687 if (te->out_buf_start != te->out_data_start) {
688 int len = te->out_data_end - te->out_data_start;
689 int off = te->out_data_start - te->out_buf_start;
690 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
691 te->out_data_start = te->out_buf_start;
692 te->out_data_end -= off;
694 opp = &te->out_data_end;
695 os = te->out_buf_end;
698 f = flags;
699 if (ts->num_finished != i)
700 f |= PARTIAL_INPUT;
701 iold = *ipp;
702 oold = *opp;
703 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
704 if (iold != *ipp || oold != *opp)
705 try = 1;
707 switch (res) {
708 case transcode_invalid_input:
709 case transcode_undefined_conversion:
710 return i;
712 case transcode_obuf_full:
713 case transcode_ibuf_empty:
714 break;
716 case transcode_finished:
717 ts->num_finished = i+1;
718 break;
722 return -1;
725 static rb_trans_result_t
726 rb_trans_conv(rb_trans_t *ts,
727 const unsigned char **input_ptr, const unsigned char *input_stop,
728 unsigned char **output_ptr, unsigned char *output_stop,
729 int flags)
731 int i;
732 int start, err_index, no_error;
734 unsigned char empty_buf;
735 unsigned char *empty_ptr = &empty_buf;
737 if (!input_ptr) {
738 input_ptr = (const unsigned char **)&empty_ptr;
739 input_stop = empty_ptr;
742 if (!output_ptr) {
743 output_ptr = &empty_ptr;
744 output_stop = empty_ptr;
747 no_error = 1;
748 err_index = -1;
749 for (i = ts->num_trans-1; 0 <= i; i--) {
750 if (ts->elems[i].last_result == transcode_invalid_input ||
751 ts->elems[i].last_result == transcode_undefined_conversion) {
752 if (no_error) {
753 /* last error */
754 no_error = 0;
756 else {
757 /* second last error */
758 err_index = i;
759 break;
764 do {
765 start = err_index + 1;
766 err_index = trans_sweep(ts, input_ptr, input_stop, output_ptr, output_stop, flags, start);
767 } while (err_index != -1 && err_index != ts->num_trans-1);
769 if (err_index == ts->num_trans-1)
770 return ts->elems[ts->num_trans-1].last_result;
771 else if (start == 0)
772 return ts->elems[ts->num_trans-1].last_result;
773 else
774 return ts->elems[start-1].last_result;
777 static void
778 rb_trans_close(rb_trans_t *ts)
780 int i;
782 for (i = 0; i < ts->num_trans; i++) {
783 rb_transcoding_close(ts->elems[i].tc);
784 if (ts->elems[i].out_buf_start)
785 xfree(ts->elems[i].out_buf_start);
788 xfree(ts->elems);
789 xfree(ts);
792 static void
793 more_output_buffer(
794 VALUE destination,
795 unsigned char *(*resize_destination)(VALUE, int, int),
796 rb_trans_t *ts,
797 unsigned char **out_start_ptr,
798 unsigned char **out_pos,
799 unsigned char **out_stop_ptr)
801 size_t len = (*out_pos - *out_start_ptr);
802 size_t new_len = (len + ts->elems[ts->num_trans-1].tc->transcoder->max_output) * 2;
803 *out_start_ptr = resize_destination(destination, len, new_len);
804 *out_pos = *out_start_ptr + len;
805 *out_stop_ptr = *out_start_ptr + new_len;
808 static void
809 output_replacement_character(
810 VALUE destination,
811 unsigned char *(*resize_destination)(VALUE, int, int),
812 rb_trans_t *ts,
813 unsigned char **out_start_ptr,
814 unsigned char **out_pos,
815 unsigned char **out_stop_ptr)
818 rb_transcoding *tc;
819 const rb_transcoder *tr;
820 int max_output;
821 rb_encoding *enc;
822 const char *replacement;
823 int len;
825 tc = ts->elems[ts->num_trans-1].tc;
826 tr = tc->transcoder;
827 max_output = tr->max_output;
828 enc = rb_enc_find(tr->to_encoding);
831 * Assumption for stateful encoding:
833 * - The replacement character can be output on resetted state and doesn't
834 * change the state.
835 * - it is acceptable that extra state changing sequence if the replacement
836 * character contains a state changing sequence.
838 * Currently the replacement character for stateful encoding such as
839 * ISO-2022-JP is "?" and it has no state changing sequence.
840 * So the extra state changing sequence don't occur.
842 * Thease assumption may be removed in future.
843 * It needs to scan the replacement character to check
844 * state changing sequences in the replacement character.
847 if (tr->resetstate_func) {
848 if (*out_stop_ptr - *out_pos < max_output)
849 more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
850 *out_pos += tr->resetstate_func(tc, *out_pos);
853 if (*out_stop_ptr - *out_pos < max_output)
854 more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
856 replacement = get_replacement_character(enc, &len);
858 memcpy(*out_pos, replacement, len);
860 *out_pos += len;
861 return;
864 #if 1
865 static void
866 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
867 const unsigned char *in_stop, unsigned char *out_stop,
868 VALUE destination,
869 unsigned char *(*resize_destination)(VALUE, int, int),
870 const char *from_encoding,
871 const char *to_encoding,
872 const int opt)
874 rb_trans_t *ts;
875 rb_trans_result_t ret;
876 unsigned char *out_start = *out_pos;
877 int max_output;
879 ts = rb_trans_open(from_encoding, to_encoding, 0);
880 if (!ts)
881 rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
883 max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output;
885 resume:
886 ret = rb_trans_conv(ts, in_pos, in_stop, out_pos, out_stop, opt);
887 if (ret == transcode_invalid_input) {
888 /* deal with invalid byte sequence */
889 /* todo: add more alternative behaviors */
890 if (opt&INVALID_IGNORE) {
891 goto resume;
893 else if (opt&INVALID_REPLACE) {
894 output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
895 goto resume;
897 rb_trans_close(ts);
898 rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
900 if (ret == transcode_undefined_conversion) {
901 /* valid character in from encoding
902 * but no related character(s) in to encoding */
903 /* todo: add more alternative behaviors */
904 if (opt&UNDEF_IGNORE) {
905 goto resume;
907 else if (opt&UNDEF_REPLACE) {
908 output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
909 goto resume;
911 rb_trans_close(ts);
912 rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)");
914 if (ret == transcode_obuf_full) {
915 more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
916 goto resume;
919 rb_trans_close(ts);
920 return;
922 #else
923 /* sample transcode_loop implementation in byte-by-byte stream style */
/*
 * Alternative to the transcode_loop above; it is in the #else branch of
 * "#if 1" and therefore not compiled. It feeds the chain one input byte
 * at a time with PARTIAL_INPUT and, once the input is used up, calls the
 * chain without PARTIAL_INPUT and without input until it reports
 * transcode_finished. Error and buffer-full handling mirror the other
 * implementation.
 */
924 static void
925 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
926 const unsigned char *in_stop, unsigned char *out_stop,
927 VALUE destination,
928 unsigned char *(*resize_destination)(VALUE, int, int),
929 const char *from_encoding,
930 const char *to_encoding,
931 const int opt)
933 rb_trans_t *ts;
934 rb_trans_result_t ret;
935 unsigned char *out_start = *out_pos;
936 const unsigned char *ptr;
937 int max_output;
939 ts = rb_trans_open(from_encoding, to_encoding, 0);
940 if (!ts)
941 rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
/* NOTE(review): max_output is assigned but not read in the visible body */
943 max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output;
945 ret = transcode_ibuf_empty;
946 ptr = *in_pos;
947 while (ret != transcode_finished) {
948 unsigned char input_byte;
949 const unsigned char *p = &input_byte;
/* only hand over a new byte when the chain asked for more input */
951 if (ret == transcode_ibuf_empty) {
952 if (ptr < in_stop) {
953 input_byte = *ptr;
954 ret = rb_trans_conv(ts, &p, p+1, out_pos, out_stop, PARTIAL_INPUT);
956 else {
957 ret = rb_trans_conv(ts, NULL, NULL, out_pos, out_stop, 0);
960 else {
961 ret = rb_trans_conv(ts, NULL, NULL, out_pos, out_stop, PARTIAL_INPUT);
/* advance by however much of the one-byte buffer was consumed */
963 if (&input_byte != p)
964 ptr += p - &input_byte;
965 switch (ret) {
966 case transcode_invalid_input:
967 /* deal with invalid byte sequence */
968 /* todo: add more alternative behaviors */
969 if (opt&INVALID_IGNORE) {
970 break;
972 else if (opt&INVALID_REPLACE) {
973 output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
974 break;
976 rb_trans_close(ts);
977 rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
978 break;
980 case transcode_undefined_conversion:
981 /* valid character in from encoding
982 * but no related character(s) in to encoding */
983 /* todo: add more alternative behaviors */
984 if (opt&UNDEF_IGNORE) {
985 break;
987 else if (opt&UNDEF_REPLACE) {
988 output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
989 break;
991 rb_trans_close(ts);
992 rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)");
993 break;
995 case transcode_obuf_full:
996 more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
997 break;
999 case transcode_ibuf_empty:
1000 break;
1002 case transcode_finished:
1003 break;
1006 rb_trans_close(ts);
/* report the whole input as consumed */
1007 *in_pos = in_stop;
1008 return;
1010 #endif
1014 * String-specific code
1017 static unsigned char *
1018 str_transcoding_resize(VALUE destination, int len, int new_len)
1020 rb_str_resize(destination, new_len);
1021 return (unsigned char *)RSTRING_PTR(destination);
1024 static int
1025 str_transcode(int argc, VALUE *argv, VALUE *self)
1027 VALUE dest;
1028 VALUE str = *self;
1029 long blen, slen;
1030 unsigned char *buf, *bp, *sp;
1031 const unsigned char *fromp;
1032 rb_encoding *from_enc, *to_enc;
1033 const char *from_e, *to_e;
1034 int from_encidx, to_encidx;
1035 VALUE from_encval, to_encval;
1036 VALUE opt;
1037 int options = 0;
1039 opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
1040 if (!NIL_P(opt)) {
1041 VALUE v;
1043 argc--;
1044 v = rb_hash_aref(opt, sym_invalid);
1045 if (NIL_P(v)) {
1047 else if (v==sym_ignore) {
1048 options |= INVALID_IGNORE;
1050 else if (v==sym_replace) {
1051 options |= INVALID_REPLACE;
1052 v = rb_hash_aref(opt, sym_replace);
1054 else {
1055 rb_raise(rb_eArgError, "unknown value for invalid: setting");
1057 v = rb_hash_aref(opt, sym_undef);
1058 if (NIL_P(v)) {
1060 else if (v==sym_ignore) {
1061 options |= UNDEF_IGNORE;
1063 else if (v==sym_replace) {
1064 options |= UNDEF_REPLACE;
1066 else {
1067 rb_raise(rb_eArgError, "unknown value for undef: setting");
1070 if (argc < 1 || argc > 2) {
1071 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
1073 if ((to_encidx = rb_to_encoding_index(to_encval = argv[0])) < 0) {
1074 to_enc = 0;
1075 to_encidx = 0;
1076 to_e = StringValueCStr(to_encval);
1078 else {
1079 to_enc = rb_enc_from_index(to_encidx);
1080 to_e = rb_enc_name(to_enc);
1082 if (argc==1) {
1083 from_encidx = rb_enc_get_index(str);
1084 from_enc = rb_enc_from_index(from_encidx);
1085 from_e = rb_enc_name(from_enc);
1087 else if ((from_encidx = rb_to_encoding_index(from_encval = argv[1])) < 0) {
1088 from_enc = 0;
1089 from_e = StringValueCStr(from_encval);
1091 else {
1092 from_enc = rb_enc_from_index(from_encidx);
1093 from_e = rb_enc_name(from_enc);
1096 if (from_enc && from_enc == to_enc) {
1097 return -1;
1099 if (from_enc && to_enc && rb_enc_asciicompat(from_enc) && rb_enc_asciicompat(to_enc)) {
1100 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
1101 return to_encidx;
1104 if (encoding_equal(from_e, to_e)) {
1105 return -1;
1108 fromp = sp = (unsigned char *)RSTRING_PTR(str);
1109 slen = RSTRING_LEN(str);
1110 blen = slen + 30; /* len + margin */
1111 dest = rb_str_tmp_new(blen);
1112 bp = (unsigned char *)RSTRING_PTR(dest);
1114 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, from_e, to_e, options);
1115 if (fromp != sp+slen) {
1116 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
1118 buf = (unsigned char *)RSTRING_PTR(dest);
1119 *bp = '\0';
1120 rb_str_set_len(dest, bp - buf);
1122 /* set encoding */
1123 if (!to_enc) {
1124 to_encidx = rb_define_dummy_encoding(to_e);
1126 *self = dest;
1128 return to_encidx;
1131 static inline VALUE
1132 str_encode_associate(VALUE str, int encidx)
1134 int cr = 0;
1136 rb_enc_associate_index(str, encidx);
1138 /* transcoded string never be broken. */
1139 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
1140 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
1142 else {
1143 cr = ENC_CODERANGE_VALID;
1145 ENC_CODERANGE_SET(str, cr);
1146 return str;
1150 * call-seq:
1151 * str.encode!(encoding [, options] ) => str
1152 * str.encode!(to_encoding, from_encoding [, options] ) => str
1154 * The first form transcodes the contents of <i>str</i> from
1155 * str.encoding to +encoding+.
1156 * The second form transcodes the contents of <i>str</i> from
1157 * from_encoding to to_encoding.
1158 * The options Hash gives details for conversion. See String#encode
1159 * for details.
1160 * Returns the string even if no changes were made.
1163 static VALUE
1164 str_encode_bang(int argc, VALUE *argv, VALUE str)
1166 VALUE newstr = str;
1167 int encidx = str_transcode(argc, argv, &newstr);
1169 if (encidx < 0) return str;
1170 rb_str_shared_replace(str, newstr);
1171 return str_encode_associate(str, encidx);
1175 * call-seq:
1176 * str.encode(encoding [, options] ) => str
1177 * str.encode(to_encoding, from_encoding [, options] ) => str
1179 * The first form returns a copy of <i>str</i> transcoded
1180 * to encoding +encoding+.
1181 * The second form returns a copy of <i>str</i> transcoded
1182 * from from_encoding to to_encoding.
1183 * The options Hash gives details for conversion. Details
1184 * to be added.
1187 static VALUE
1188 str_encode(int argc, VALUE *argv, VALUE str)
1190 VALUE newstr = str;
1191 int encidx = str_transcode(argc, argv, &newstr);
1193 if (encidx < 0) return rb_str_dup(str);
1194 RBASIC(newstr)->klass = rb_obj_class(str);
1195 return str_encode_associate(newstr, encidx);
1198 VALUE
1199 rb_str_transcode(VALUE str, VALUE to)
1201 return str_encode(1, &to, str);
/*
 * Initialization: creates the transcoder table, interns the option
 * symbols and defines String#encode and String#encode!.
 * NOTE(review): the end of this definition is not visible in this chunk;
 * only the statements shown here are described.
 */
1204 void
1205 Init_transcode(void)
1207 transcoder_table = st_init_strcasetable();
/* symbols compared against the options hash in str_transcode */
1209 sym_invalid = ID2SYM(rb_intern("invalid"));
1210 sym_undef = ID2SYM(rb_intern("undef"));
1211 sym_ignore = ID2SYM(rb_intern("ignore"));
1212 sym_replace = ID2SYM(rb_intern("replace"));
/* -1: the methods receive (argc, argv, self) */
1214 rb_define_method(rb_cString, "encode", str_encode, -1);
1215 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);