1 /**********************************************************************
6 created at: Thu May 24 17:23:27 JST 2007
8 Copyright (C) 2007 Yukihiro Matsumoto
10 **********************************************************************/
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
16 #ifdef HAVE_LANGINFO_H
19 #include "ruby/util.h"
21 static ID id_encoding
, id_base_encoding
;
23 static VALUE rb_encoding_list
;
25 struct rb_encoding_entry
{
32 struct rb_encoding_entry
*list
;
38 void rb_enc_init(void);
/*
 * Alias for ENCINDEX_BUILTIN_MAX.  Elsewhere in this file the value plus
 * one is handed to enc_table_expand() to request the initial table size.
 */
40 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
/*
 * True when rb_enc_mbmaxlen(enc) is zero.  Callers in this file use it to
 * decide whether enc_autoload() has to be run on the entry before use.
 */
42 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc))
44 static int load_encoding(const char *name
);
45 static VALUE
enc_base_encoding(VALUE self
);
53 enc_new(rb_encoding
*encoding
)
55 return Data_Wrap_Struct(rb_cEncoding
, enc_mark
, 0, encoding
);
59 rb_enc_from_encoding(rb_encoding
*encoding
)
64 if (!encoding
) return Qnil
;
65 idx
= ENC_TO_ENCINDEX(encoding
);
66 if (!(list
= rb_encoding_list
)) {
67 rb_bug("rb_enc_from_encoding(%d\"%s\"): no rb_encoding_list",
68 idx
, rb_enc_name(encoding
));
70 enc
= rb_ary_entry(list
, idx
);
72 rb_bug("rb_enc_from_encoding(%d\"%s\"): not created yet",
73 idx
, rb_enc_name(encoding
));
78 static int enc_autoload(rb_encoding
*);
81 check_encoding(rb_encoding
*enc
)
83 int index
= rb_enc_to_index(enc
);
84 if (rb_enc_from_index(index
) != enc
)
86 if (enc_autoload_p(enc
)) {
87 index
= enc_autoload(enc
);
93 enc_check_encoding(VALUE obj
)
95 if (SPECIAL_CONST_P(obj
) || BUILTIN_TYPE(obj
) != T_DATA
||
96 RDATA(obj
)->dmark
!= enc_mark
) {
99 return check_encoding(RDATA(obj
)->data
);
103 must_encoding(VALUE enc
)
105 int index
= enc_check_encoding(enc
);
107 rb_raise(rb_eTypeError
, "wrong argument type %s (expected Encoding)",
108 rb_obj_classname(enc
));
114 rb_to_encoding_index(VALUE enc
)
118 idx
= enc_check_encoding(enc
);
122 else if (NIL_P(enc
= rb_check_string_type(enc
))) {
125 if (!rb_enc_asciicompat(rb_enc_get(enc
))) {
128 return rb_enc_find_index(StringValueCStr(enc
));
132 to_encoding(VALUE enc
)
137 if (!rb_enc_asciicompat(rb_enc_get(enc
))) {
138 rb_raise(rb_eArgError
, "invalid name encoding (non ASCII)");
140 idx
= rb_enc_find_index(StringValueCStr(enc
));
142 rb_raise(rb_eArgError
, "unknown encoding name - %s", RSTRING_PTR(enc
));
144 return rb_enc_from_index(idx
);
148 rb_to_encoding(VALUE enc
)
150 if (enc_check_encoding(enc
) >= 0) return RDATA(enc
)->data
;
151 return to_encoding(enc
);
155 rb_gc_mark_encodings(void)
160 enc_table_expand(int newsize
)
162 struct rb_encoding_entry
*ent
;
165 if (enc_table
.size
>= newsize
) return newsize
;
166 newsize
= (newsize
+ 7) / 8 * 8;
167 ent
= realloc(enc_table
.list
, sizeof(*enc_table
.list
) * newsize
);
169 memset(ent
+ enc_table
.size
, 0, sizeof(*ent
)*(newsize
- enc_table
.size
));
170 enc_table
.list
= ent
;
171 enc_table
.size
= newsize
;
176 enc_register_at(int index
, const char *name
, rb_encoding
*encoding
)
178 struct rb_encoding_entry
*ent
= &enc_table
.list
[index
];
182 ent
->name
= name
= strdup(name
);
184 else if (STRCASECMP(name
, ent
->name
)) {
188 ent
->enc
= xmalloc(sizeof(rb_encoding
));
191 *ent
->enc
= *encoding
;
194 memset(ent
->enc
, 0, sizeof(*ent
->enc
));
197 encoding
->name
= name
;
198 encoding
->ruby_encoding_index
= index
;
199 st_insert(enc_table
.names
, (st_data_t
)name
, (st_data_t
)index
);
200 list
= rb_encoding_list
;
201 if (list
&& NIL_P(rb_ary_entry(list
, index
))) {
202 /* initialize encoding data */
203 rb_ary_store(list
, index
, enc_new(encoding
));
209 enc_register(const char *name
, rb_encoding
*encoding
)
211 int index
= enc_table
.count
;
213 if ((index
= enc_table_expand(index
+ 1)) < 0) return -1;
214 enc_table
.count
= index
;
215 return enc_register_at(index
- 1, name
, encoding
);
218 static void set_encoding_const(const char *, rb_encoding
*);
219 int rb_enc_registered(const char *name
);
222 rb_enc_register(const char *name
, rb_encoding
*encoding
)
224 int index
= rb_enc_registered(name
);
227 rb_encoding
*oldenc
= rb_enc_from_index(index
);
228 if (STRCASECMP(name
, rb_enc_name(oldenc
))) {
229 index
= enc_register(name
, encoding
);
231 else if (enc_autoload_p(oldenc
) || !ENC_DUMMY_P(oldenc
)) {
232 enc_register_at(index
, name
, encoding
);
235 rb_raise(rb_eArgError
, "encoding %s is already registered", name
);
239 index
= enc_register(name
, encoding
);
240 set_encoding_const(name
, rb_enc_from_index(index
));
246 rb_encdb_declare(const char *name
)
248 int idx
= rb_enc_registered(name
);
250 idx
= enc_register(name
, 0);
252 set_encoding_const(name
, rb_enc_from_index(idx
));
256 enc_check_duplication(const char *name
)
258 if (rb_enc_registered(name
) >= 0) {
259 rb_raise(rb_eArgError
, "encoding %s is already registered", name
);
264 set_base_encoding(int index
, rb_encoding
*base
)
266 rb_encoding
*enc
= enc_table
.list
[index
].enc
;
268 enc_table
.list
[index
].base
= base
;
269 if (rb_enc_dummy_p(base
)) ENC_SET_DUMMY(enc
);
274 rb_enc_replicate(const char *name
, rb_encoding
*encoding
)
278 enc_check_duplication(name
);
279 idx
= enc_register(name
, encoding
);
280 set_base_encoding(idx
, encoding
);
281 set_encoding_const(name
, rb_enc_from_index(idx
));
286 enc_replicate(int idx
, const char *name
, rb_encoding
*origenc
)
289 idx
= enc_register(name
, origenc
);
292 idx
= enc_register_at(idx
, name
, origenc
);
295 set_base_encoding(idx
, origenc
);
296 set_encoding_const(name
, rb_enc_from_index(idx
));
302 rb_encdb_replicate(const char *name
, const char *orig
)
304 int origidx
= rb_enc_registered(orig
);
305 int idx
= rb_enc_registered(name
);
308 origidx
= enc_register(orig
, 0);
310 return enc_replicate(idx
, name
, rb_enc_from_index(origidx
));
314 rb_define_dummy_encoding(const char *name
)
316 int index
= rb_enc_replicate(name
, rb_ascii8bit_encoding());
317 rb_encoding
*enc
= enc_table
.list
[index
].enc
;
324 rb_encdb_dummy(const char *name
)
326 int index
= enc_replicate(rb_enc_registered(name
), name
,
327 rb_ascii8bit_encoding());
328 rb_encoding
*enc
= enc_table
.list
[index
].enc
;
336 * enc.dummy? => true or false
338 * Returns true for dummy encodings.
339 * A dummy encoding is an encoding for which character handling is not properly
341 * It is used for stateful encodings.
343 * Encoding::ISO_2022_JP.dummy? #=> true
344 * Encoding::UTF_8.dummy? #=> false
348 enc_dummy_p(VALUE enc
)
350 return ENC_DUMMY_P(enc_table
.list
[must_encoding(enc
)].enc
) ? Qtrue
: Qfalse
;
354 enc_alias(const char *alias
, int idx
)
356 alias
= strdup(alias
);
357 st_insert(enc_table
.names
, (st_data_t
)alias
, (st_data_t
)idx
);
358 set_encoding_const(alias
, rb_enc_from_index(idx
));
363 rb_enc_alias(const char *alias
, const char *orig
)
367 enc_check_duplication(alias
);
368 if (!enc_table
.list
) {
371 if ((idx
= rb_enc_find_index(orig
)) < 0) {
374 return enc_alias(alias
, idx
);
378 rb_encdb_alias(const char *alias
, const char *orig
)
380 int idx
= rb_enc_registered(orig
);
383 idx
= enc_register(orig
, 0);
385 return enc_alias(alias
, idx
);
395 extern rb_encoding OnigEncodingUTF_8
;
396 extern rb_encoding OnigEncodingUS_ASCII
;
401 enc_table_expand(ENCODING_COUNT
+ 1);
402 if (!enc_table
.names
) {
403 enc_table
.names
= st_init_strcasetable();
405 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
408 ENC_REGISTER(US_ASCII
);
410 enc_table
.count
= ENCINDEX_BUILTIN_MAX
;
414 rb_enc_from_index(int index
)
416 if (!enc_table
.list
) {
419 if (index
< 0 || enc_table
.count
<= index
) {
422 return enc_table
.list
[index
].enc
;
426 rb_enc_registered(const char *name
)
430 if (!name
) return -1;
431 if (!enc_table
.list
) return -1;
432 if (st_lookup(enc_table
.names
, (st_data_t
)name
, &idx
)) {
439 require_enc(VALUE enclib
)
441 return rb_require_safe(enclib
, rb_safe_level());
445 load_encoding(const char *name
)
447 VALUE enclib
= rb_sprintf("enc/%s", name
);
448 VALUE verbose
= ruby_verbose
;
449 VALUE debug
= ruby_debug
;
451 char *s
= RSTRING_PTR(enclib
) + 4, *e
= RSTRING_END(enclib
);
455 if (!ISALNUM(*s
)) *s
= '_';
456 else if (ISUPPER(*s
)) *s
= TOLOWER(*s
);
460 ruby_verbose
= Qfalse
;
462 loaded
= rb_protect(require_enc
, enclib
, 0);
463 ruby_verbose
= verbose
;
465 rb_set_errinfo(Qnil
);
466 if (NIL_P(loaded
)) return -1;
467 if ((idx
= rb_enc_registered(name
)) < 0) return -1;
468 if (enc_autoload_p(enc_table
.list
[idx
].enc
)) return -1;
473 enc_autoload(rb_encoding
*enc
)
476 rb_encoding
*base
= enc_table
.list
[ENC_TO_ENCINDEX(enc
)].base
;
481 if (i
>= enc_table
.count
) return -1;
482 } while (enc_table
.list
[i
].enc
!= base
&& (++i
, 1));
483 if (enc_autoload_p(base
)) {
484 if (enc_autoload(base
) < 0) return -1;
486 i
= ENC_TO_ENCINDEX(enc
);
487 enc_register_at(i
, rb_enc_name(enc
), base
);
490 i
= load_encoding(rb_enc_name(enc
));
496 rb_enc_find_index(const char *name
)
498 int i
= rb_enc_registered(name
);
502 i
= load_encoding(name
);
504 else if (enc_autoload_p(enc
= rb_enc_from_index(i
))) {
505 if (enc_autoload(enc
) < 0) {
506 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
515 rb_enc_find(const char *name
)
517 int idx
= rb_enc_find_index(name
);
518 if (idx
< 0) idx
= 0;
519 return rb_enc_from_index(idx
);
523 enc_capable(VALUE obj
)
525 if (SPECIAL_CONST_P(obj
)) return Qfalse
;
526 switch (BUILTIN_TYPE(obj
)) {
532 if (RDATA(obj
)->dmark
== enc_mark
) return Qtrue
;
541 CONST_ID(id_encoding
, "encoding");
546 rb_enc_get_index(VALUE obj
)
550 i
= ENCODING_GET_INLINED(obj
);
551 if (i
== ENCODING_INLINE_MAX
) {
554 iv
= rb_ivar_get(obj
, rb_id_encoding());
561 rb_enc_set_index(VALUE obj
, int idx
)
563 if (idx
< ENCODING_INLINE_MAX
) {
564 ENCODING_SET_INLINED(obj
, idx
);
567 ENCODING_SET_INLINED(obj
, ENCODING_INLINE_MAX
);
568 rb_ivar_set(obj
, rb_id_encoding(), INT2NUM(idx
));
573 rb_enc_associate_index(VALUE obj
, int idx
)
575 /* enc_check_capable(obj);*/
576 if (rb_enc_get_index(obj
) == idx
)
578 if (!ENC_CODERANGE_ASCIIONLY(obj
) ||
579 !rb_enc_asciicompat(rb_enc_from_index(idx
))) {
580 ENC_CODERANGE_CLEAR(obj
);
582 rb_enc_set_index(obj
, idx
);
587 rb_enc_associate(VALUE obj
, rb_encoding
*enc
)
589 return rb_enc_associate_index(obj
, rb_enc_to_index(enc
));
593 rb_enc_get(VALUE obj
)
595 return rb_enc_from_index(rb_enc_get_index(obj
));
599 rb_enc_check(VALUE str1
, VALUE str2
)
601 rb_encoding
*enc
= rb_enc_compatible(str1
, str2
);
603 rb_raise(rb_eEncCompatError
, "incompatible character encodings: %s and %s",
604 rb_enc_name(rb_enc_get(str1
)),
605 rb_enc_name(rb_enc_get(str2
)));
610 rb_enc_compatible(VALUE str1
, VALUE str2
)
613 rb_encoding
*enc1
, *enc2
;
615 idx1
= rb_enc_get_index(str1
);
616 idx2
= rb_enc_get_index(str2
);
618 if (idx1
< 0 || idx2
< 0)
622 return rb_enc_from_index(idx1
);
624 enc1
= rb_enc_from_index(idx1
);
625 enc2
= rb_enc_from_index(idx2
);
627 if (TYPE(str2
) == T_STRING
&& RSTRING_LEN(str2
) == 0)
629 if (TYPE(str1
) == T_STRING
&& RSTRING_LEN(str1
) == 0)
631 if (!rb_enc_asciicompat(enc1
) || !rb_enc_asciicompat(enc2
)) {
635 if (BUILTIN_TYPE(str1
) != T_STRING
) {
643 if (BUILTIN_TYPE(str1
) == T_STRING
) {
646 cr1
= rb_enc_str_coderange(str1
);
647 if (BUILTIN_TYPE(str2
) == T_STRING
) {
648 cr2
= rb_enc_str_coderange(str2
);
650 /* may need to handle ENC_CODERANGE_BROKEN */
651 if (cr1
== ENC_CODERANGE_7BIT
) return enc2
;
652 if (cr2
== ENC_CODERANGE_7BIT
) return enc1
;
654 if (cr2
== ENC_CODERANGE_7BIT
) {
655 if (idx1
== 0) return enc2
;
659 if (cr1
== ENC_CODERANGE_7BIT
)
666 rb_enc_copy(VALUE obj1
, VALUE obj2
)
668 rb_enc_associate_index(obj1
, rb_enc_get_index(obj2
));
674 * obj.encoding => encoding
676 * Returns the Encoding object that represents the encoding of obj.
680 rb_obj_encoding(VALUE obj
)
682 rb_encoding
*enc
= rb_enc_get(obj
);
684 rb_raise(rb_eTypeError
, "unknown encoding");
686 return rb_enc_from_encoding(enc
);
690 rb_enc_mbclen(const char *p
, const char *e
, rb_encoding
*enc
)
692 int n
= ONIGENC_PRECISE_MBC_ENC_LEN(enc
, (UChar
*)p
, (UChar
*)e
);
693 if (MBCLEN_CHARFOUND_P(n
) && MBCLEN_CHARFOUND_LEN(n
) <= e
-p
)
694 return MBCLEN_CHARFOUND_LEN(n
);
696 int min
= rb_enc_mbminlen(enc
);
697 return min
<= e
-p
? min
: e
-p
;
702 rb_enc_precise_mbclen(const char *p
, const char *e
, rb_encoding
*enc
)
706 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
707 n
= ONIGENC_PRECISE_MBC_ENC_LEN(enc
, (UChar
*)p
, (UChar
*)e
);
709 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n
-(e
-p
));
714 rb_enc_ascget(const char *p
, const char *e
, int *len
, rb_encoding
*enc
)
719 if (rb_enc_asciicompat(enc
)) {
720 c
= (unsigned char)*p
;
726 l
= rb_enc_precise_mbclen(p
, e
, enc
);
727 if (!MBCLEN_CHARFOUND_P(l
))
729 c
= rb_enc_mbc_to_codepoint(p
, e
, enc
);
730 if (!rb_enc_isascii(c
, enc
))
737 rb_enc_codepoint(const char *p
, const char *e
, rb_encoding
*enc
)
741 rb_raise(rb_eArgError
, "empty string");
742 r
= rb_enc_precise_mbclen(p
, e
, enc
);
743 if (MBCLEN_CHARFOUND_P(r
))
744 return rb_enc_mbc_to_codepoint(p
, e
, enc
);
746 rb_raise(rb_eArgError
, "invalid byte sequence in %s", rb_enc_name(enc
));
750 rb_enc_codelen(int c
, rb_encoding
*enc
)
752 int n
= ONIGENC_CODE_TO_MBCLEN(enc
,c
);
754 rb_raise(rb_eArgError
, "invalid codepoint 0x%x in %s", c
, rb_enc_name(enc
));
760 rb_enc_toupper(int c
, rb_encoding
*enc
)
762 return (ONIGENC_IS_ASCII_CODE(c
)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c
):(c
));
766 rb_enc_tolower(int c
, rb_encoding
*enc
)
768 return (ONIGENC_IS_ASCII_CODE(c
)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c
):(c
));
773 * enc.inspect => string
775 * Returns a string which represents the encoding for programmers.
777 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
778 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
781 enc_inspect(VALUE self
)
783 VALUE str
= rb_sprintf("#<%s:%s%s>", rb_obj_classname(self
),
784 rb_enc_name((rb_encoding
*)DATA_PTR(self
)),
785 (enc_dummy_p(self
) ? " (dummy)" : ""));
786 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
794 * Returns the name of the encoding.
796 * Encoding::UTF_8.name => "UTF-8"
801 return rb_usascii_str_new2(rb_enc_name((rb_encoding
*)DATA_PTR(self
)));
805 enc_base_encoding(VALUE self
)
807 rb_encoding
*base
= enc_table
.list
[must_encoding(self
)].base
;
808 if (!base
) return Qnil
;
809 return ENC_FROM_ENCODING(base
);
814 * Encoding.list => [enc1, enc2, ...]
816 * Returns the list of loaded encodings.
819 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
820 * #<Encoding:ISO-2022-JP (dummy)>]
822 * Encoding.find("US-ASCII")
823 * => #<Encoding:US-ASCII>
826 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
827 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
831 enc_list(VALUE klass
)
833 VALUE ary
= rb_ary_new2(0);
834 rb_ary_replace(ary
, rb_encoding_list
);
840 * Encoding.find(string) => enc
841 * Encoding.find(symbol) => enc
843 * Searches for the encoding with the specified <i>name</i>.
844 * <i>name</i> should be a string or symbol.
846 * Encoding.find("US-ASCII") => #<Encoding:US-ASCII>
847 * Encoding.find(:Shift_JIS) => #<Encoding:Shift_JIS>
851 enc_find(VALUE klass
, VALUE enc
)
853 return rb_enc_from_encoding(to_encoding(enc
));
858 * Encoding.compatible?(str1, str2) => enc or nil
860 * Checks the compatibility of two strings.
861 * If they are compatible, meaning they can be concatenated,
862 * returns the encoding which the concatenated string will have.
863 * If they are not compatible, nil is returned.
865 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
866 * => #<Encoding:ISO-8859-1>
868 * Encoding.compatible?(
869 * "\xa1".force_encoding("iso-8859-1"),
870 * "\xa1\xa1".force_encoding("euc-jp"))
875 enc_compatible_p(VALUE klass
, VALUE str1
, VALUE str2
)
879 if (!enc_capable(str1
)) return Qnil
;
880 if (!enc_capable(str2
)) return Qnil
;
881 enc
= rb_enc_compatible(str1
, str2
);
882 if (!enc
) return Qnil
;
883 return rb_enc_from_encoding(enc
);
888 enc_dump(int argc
, VALUE
*argv
, VALUE self
)
890 rb_scan_args(argc
, argv
, "01", 0);
891 return enc_name(self
);
896 enc_load(VALUE klass
, VALUE str
)
898 return enc_find(klass
, str
);
902 rb_ascii8bit_encoding(void)
904 if (!enc_table
.list
) {
907 return enc_table
.list
[ENCINDEX_ASCII
].enc
;
911 rb_ascii8bit_encindex(void)
913 return ENCINDEX_ASCII
;
917 rb_utf8_encoding(void)
919 if (!enc_table
.list
) {
922 return enc_table
.list
[ENCINDEX_UTF_8
].enc
;
926 rb_utf8_encindex(void)
928 return ENCINDEX_UTF_8
;
932 rb_usascii_encoding(void)
934 if (!enc_table
.list
) {
937 return enc_table
.list
[ENCINDEX_US_ASCII
].enc
;
941 rb_usascii_encindex(void)
943 return ENCINDEX_US_ASCII
;
947 rb_locale_encoding(void)
949 VALUE charmap
= rb_locale_charmap(rb_cEncoding
);
953 return rb_usascii_encoding();
955 idx
= rb_enc_find_index(StringValueCStr(charmap
));
957 return rb_ascii8bit_encoding();
959 return rb_enc_from_index(idx
);
963 rb_filesystem_encoding(void)
965 static rb_encoding
*enc
;
968 enc
= rb_locale_encoding();
969 #elif defined __APPLE__
970 enc
= rb_enc_find("UTF8-MAC");
972 enc
= rb_locale_encoding();
978 static int default_external_index
;
981 rb_default_external_encoding(void)
983 return rb_enc_from_index(default_external_index
);
987 rb_enc_default_external(void)
989 return rb_enc_from_encoding(rb_default_external_encoding());
994 * Encoding.default_external => enc
996 * Returns default external encoding.
998 * It is initialized by the locale or -E option.
1001 get_default_external(VALUE klass
)
1003 return rb_enc_default_external();
1007 rb_enc_set_default_external(VALUE encoding
)
1009 default_external_index
= rb_enc_to_index(rb_to_encoding(encoding
));
1014 * Encoding.locale_charmap => string
1016 * Returns the locale charmap name.
1020 * Encoding.locale_charmap => "ANSI_X3.4-1968"
1022 * Encoding.locale_charmap => "EUC-JP"
1026 * Encoding.locale_charmap => "646"
1028 * Encoding.locale_charmap => "eucJP"
1032 rb_locale_charmap(VALUE klass
)
1034 #if defined NO_LOCALE_CHARMAP
1035 return rb_usascii_str_new2("ASCII-8BIT");
1036 #elif defined HAVE_LANGINFO_H
1038 codeset
= nl_langinfo(CODESET
);
1039 return rb_usascii_str_new2(codeset
);
1040 #elif defined _WIN32
1041 return rb_sprintf("CP%d", GetACP());
1048 set_encoding_const(const char *name
, rb_encoding
*enc
)
1050 VALUE encoding
= rb_enc_from_encoding(enc
);
1051 char *s
= (char *)name
;
1052 int haslower
= 0, hasupper
= 0, valid
= 0;
1054 if (ISDIGIT(*s
)) return;
1057 while (*++s
&& (ISALNUM(*s
) || *s
== '_')) {
1058 if (ISLOWER(*s
)) haslower
= 1;
1063 rb_define_const(rb_cEncoding
, name
, encoding
);
1065 if (!valid
|| haslower
) {
1066 int len
= strlen(name
) + 1;
1067 if (!haslower
|| !hasupper
) {
1069 if (ISLOWER(*s
)) haslower
= 1;
1070 if (ISUPPER(*s
)) hasupper
= 1;
1071 } while (*++s
&& (!haslower
|| !hasupper
));
1073 MEMCPY(s
= ALLOCA_N(char, len
), name
, char, len
);
1076 if (ISLOWER(*s
)) *s
= ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s
);
1078 if (!ISALNUM(*s
)) *s
= '_';
1081 rb_define_const(rb_cEncoding
, name
, encoding
);
1085 for (s
= (char *)name
; *s
; ++s
) {
1086 if (ISLOWER(*s
)) *s
= ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s
);
1088 rb_define_const(rb_cEncoding
, name
, encoding
);
1094 rb_enc_name_list_i(st_data_t name
, st_data_t idx
, st_data_t arg
)
1096 VALUE ary
= (VALUE
)arg
;
1097 VALUE str
= rb_usascii_str_new2((char *)name
);
1099 rb_ary_push(ary
, str
);
1105 * Encoding.name_list => ["enc1", "enc2", ...]
1107 * Returns the list of available encoding names.
1109 * Encoding.name_list
1110 * => ["US-ASCII", "ASCII-8BIT", "UTF-8",
1111 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
1113 * "BINARY", "CP932", "eucJP"]
1115 * This list doesn't include dummy encodings.
1120 rb_enc_name_list(VALUE klass
)
1122 VALUE ary
= rb_ary_new2(enc_table
.names
->num_entries
);
1123 st_foreach(enc_table
.names
, rb_enc_name_list_i
, (st_data_t
)ary
);
1128 rb_enc_aliases_enc_i(st_data_t name
, st_data_t orig
, st_data_t arg
)
1130 VALUE
*p
= (VALUE
*)arg
;
1131 VALUE aliases
= p
[0], ary
= p
[1];
1132 int idx
= (int)orig
;
1133 VALUE key
, str
= rb_ary_entry(ary
, idx
);
1136 rb_encoding
*enc
= rb_enc_from_index(idx
);
1138 if (STRCASECMP((char*)name
, rb_enc_name(enc
)) == 0) {
1141 str
= rb_usascii_str_new2(rb_enc_name(enc
));
1143 rb_ary_store(ary
, idx
, str
);
1145 key
= rb_usascii_str_new2((char *)name
);
1147 rb_hash_aset(aliases
, key
, str
);
1153 * Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
1155 * Returns a hash mapping each available encoding alias to its original encoding name.
1158 * => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
1159 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1164 rb_enc_aliases(VALUE klass
)
1167 aliases
[0] = rb_hash_new();
1168 aliases
[1] = rb_ary_new();
1169 st_foreach(enc_table
.names
, rb_enc_aliases_enc_i
, (st_data_t
)aliases
);
1180 id_base_encoding
= rb_intern("#base_encoding");
1182 rb_cEncoding
= rb_define_class("Encoding", rb_cObject
);
1183 rb_undef_alloc_func(rb_cEncoding
);
1184 rb_define_method(rb_cEncoding
, "to_s", enc_name
, 0);
1185 rb_define_method(rb_cEncoding
, "inspect", enc_inspect
, 0);
1186 rb_define_method(rb_cEncoding
, "name", enc_name
, 0);
1187 rb_define_method(rb_cEncoding
, "base_encoding", enc_base_encoding
, 0);
1188 rb_define_method(rb_cEncoding
, "dummy?", enc_dummy_p
, 0);
1189 rb_define_singleton_method(rb_cEncoding
, "list", enc_list
, 0);
1190 rb_define_singleton_method(rb_cEncoding
, "name_list", rb_enc_name_list
, 0);
1191 rb_define_singleton_method(rb_cEncoding
, "aliases", rb_enc_aliases
, 0);
1192 rb_define_singleton_method(rb_cEncoding
, "find", enc_find
, 1);
1193 rb_define_singleton_method(rb_cEncoding
, "compatible?", enc_compatible_p
, 2);
1195 rb_define_method(rb_cEncoding
, "_dump", enc_dump
, -1);
1196 rb_define_singleton_method(rb_cEncoding
, "_load", enc_load
, 1);
1198 rb_define_singleton_method(rb_cEncoding
, "default_external", get_default_external
, 0);
1199 rb_define_singleton_method(rb_cEncoding
, "locale_charmap", rb_locale_charmap
, 0);
1201 rb_gc_register_address(&rb_encoding_list
);
1202 list
= rb_ary_new2(enc_table
.count
);
1203 RBASIC(list
)->klass
= 0;
1204 rb_encoding_list
= list
;
1205 for (i
= 0; i
< enc_table
.count
; ++i
) {
1206 rb_ary_push(list
, enc_new(enc_table
.list
[i
].enc
));
1210 /* locale insensitive functions */
/*
 * ctype_test(c, ctype): nonzero only when rb_isascii(c) holds and
 * ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype) is true.  The argument c is
 * expanded twice, so avoid passing expressions with side effects.
 */
1212 #define ctype_test(c, ctype) \
1213 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
1215 int rb_isalnum(int c
) { return ctype_test(c
, ONIGENC_CTYPE_ALNUM
); }
1216 int rb_isalpha(int c
) { return ctype_test(c
, ONIGENC_CTYPE_ALPHA
); }
1217 int rb_isblank(int c
) { return ctype_test(c
, ONIGENC_CTYPE_BLANK
); }
1218 int rb_iscntrl(int c
) { return ctype_test(c
, ONIGENC_CTYPE_CNTRL
); }
1219 int rb_isdigit(int c
) { return ctype_test(c
, ONIGENC_CTYPE_DIGIT
); }
1220 int rb_isgraph(int c
) { return ctype_test(c
, ONIGENC_CTYPE_GRAPH
); }
1221 int rb_islower(int c
) { return ctype_test(c
, ONIGENC_CTYPE_LOWER
); }
1222 int rb_isprint(int c
) { return ctype_test(c
, ONIGENC_CTYPE_PRINT
); }
1223 int rb_ispunct(int c
) { return ctype_test(c
, ONIGENC_CTYPE_PUNCT
); }
1224 int rb_isspace(int c
) { return ctype_test(c
, ONIGENC_CTYPE_SPACE
); }
1225 int rb_isupper(int c
) { return ctype_test(c
, ONIGENC_CTYPE_UPPER
); }
1226 int rb_isxdigit(int c
) { return ctype_test(c
, ONIGENC_CTYPE_XDIGIT
); }
1231 return rb_isascii(c
) ? ONIGENC_ASCII_CODE_TO_LOWER_CASE(c
) : c
;
1237 return rb_isascii(c
) ? ONIGENC_ASCII_CODE_TO_UPPER_CASE(c
) : c
;