1 /**********************************************************************
6 created at: Thu May 24 17:23:27 JST 2007
8 Copyright (C) 2007 Yukihiro Matsumoto
10 **********************************************************************/
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
16 #ifdef HAVE_LANGINFO_H
20 static ID id_encoding
, id_base_encoding
;
22 static VALUE rb_encoding_list
;
24 struct rb_encoding_entry
{
31 struct rb_encoding_entry
*list
;
37 void rb_enc_init(void);
/* Alias for ENCINDEX_BUILTIN_MAX; this file passes ENCODING_COUNT + 1 as
 * the size argument of enc_table_expand(). */
39 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
/* Nonzero when rb_enc_mbmaxlen(enc) evaluates to zero.  Callers in this
 * file react to a nonzero result by running enc_autoload() on the entry. */
41 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc))
43 static int load_encoding(const char *name
);
44 static VALUE
enc_base_encoding(VALUE self
);
52 enc_new(rb_encoding
*encoding
)
54 return Data_Wrap_Struct(rb_cEncoding
, enc_mark
, 0, encoding
);
58 rb_enc_from_encoding(rb_encoding
*encoding
)
63 if (!encoding
) return Qnil
;
64 idx
= ENC_TO_ENCINDEX(encoding
);
65 if (!(list
= rb_encoding_list
)) {
66 rb_bug("rb_enc_from_encoding(%d\"%s\"): no rb_encoding_list",
67 idx
, rb_enc_name(encoding
));
69 enc
= rb_ary_entry(list
, idx
);
71 rb_bug("rb_enc_from_encoding(%d\"%s\"): not created yet",
72 idx
, rb_enc_name(encoding
));
77 static int enc_autoload(rb_encoding
*);
80 check_encoding(rb_encoding
*enc
)
82 int index
= rb_enc_to_index(enc
);
83 if (rb_enc_from_index(index
) != enc
)
85 if (enc_autoload_p(enc
)) {
86 index
= enc_autoload(enc
);
92 enc_check_encoding(VALUE obj
)
94 if (SPECIAL_CONST_P(obj
) || BUILTIN_TYPE(obj
) != T_DATA
||
95 RDATA(obj
)->dmark
!= enc_mark
) {
98 return check_encoding(RDATA(obj
)->data
);
102 must_encoding(VALUE enc
)
104 int index
= enc_check_encoding(enc
);
106 rb_raise(rb_eTypeError
, "wrong argument type %s (expected Encoding)",
107 rb_obj_classname(enc
));
113 rb_to_encoding_index(VALUE enc
)
117 idx
= enc_check_encoding(enc
);
121 else if (NIL_P(enc
= rb_check_string_type(enc
))) {
124 if (!rb_enc_asciicompat(rb_enc_get(enc
))) {
127 return rb_enc_find_index(StringValueCStr(enc
));
131 to_encoding(VALUE enc
)
136 if (!rb_enc_asciicompat(rb_enc_get(enc
))) {
137 rb_raise(rb_eArgError
, "invalid name encoding (non ASCII)");
139 idx
= rb_enc_find_index(StringValueCStr(enc
));
141 rb_raise(rb_eArgError
, "unknown encoding name - %s", RSTRING_PTR(enc
));
143 return rb_enc_from_index(idx
);
147 rb_to_encoding(VALUE enc
)
149 if (enc_check_encoding(enc
) >= 0) return RDATA(enc
)->data
;
150 return to_encoding(enc
);
154 rb_gc_mark_encodings(void)
159 enc_table_expand(int newsize
)
161 struct rb_encoding_entry
*ent
;
164 if (enc_table
.size
>= newsize
) return newsize
;
165 newsize
= (newsize
+ 7) / 8 * 8;
166 ent
= realloc(enc_table
.list
, sizeof(*enc_table
.list
) * newsize
);
168 memset(ent
+ enc_table
.size
, 0, sizeof(*ent
)*(newsize
- enc_table
.size
));
169 enc_table
.list
= ent
;
170 enc_table
.size
= newsize
;
175 enc_register_at(int index
, const char *name
, rb_encoding
*encoding
)
177 struct rb_encoding_entry
*ent
= &enc_table
.list
[index
];
181 ent
->name
= name
= strdup(name
);
183 else if (STRCASECMP(name
, ent
->name
)) {
187 ent
->enc
= xmalloc(sizeof(rb_encoding
));
190 *ent
->enc
= *encoding
;
193 memset(ent
->enc
, 0, sizeof(*ent
->enc
));
196 encoding
->name
= name
;
197 encoding
->ruby_encoding_index
= index
;
198 st_insert(enc_table
.names
, (st_data_t
)name
, (st_data_t
)index
);
199 list
= rb_encoding_list
;
200 if (list
&& NIL_P(rb_ary_entry(list
, index
))) {
201 /* initialize encoding data */
202 rb_ary_store(list
, index
, enc_new(encoding
));
208 enc_register(const char *name
, rb_encoding
*encoding
)
210 int index
= enc_table
.count
;
212 if ((index
= enc_table_expand(index
+ 1)) < 0) return -1;
213 enc_table
.count
= index
;
214 return enc_register_at(index
- 1, name
, encoding
);
217 static void set_encoding_const(const char *, rb_encoding
*);
218 int rb_enc_registered(const char *name
);
221 rb_enc_register(const char *name
, rb_encoding
*encoding
)
223 int index
= rb_enc_registered(name
);
226 rb_encoding
*oldenc
= rb_enc_from_index(index
);
227 if (STRCASECMP(name
, rb_enc_name(oldenc
))) {
228 index
= enc_register(name
, encoding
);
230 else if (enc_autoload_p(oldenc
) || !ENC_DUMMY_P(oldenc
)) {
231 enc_register_at(index
, name
, encoding
);
234 rb_raise(rb_eArgError
, "encoding %s is already registered", name
);
238 index
= enc_register(name
, encoding
);
239 set_encoding_const(name
, rb_enc_from_index(index
));
245 rb_encdb_declare(const char *name
)
247 int idx
= rb_enc_registered(name
);
249 idx
= enc_register(name
, 0);
251 set_encoding_const(name
, rb_enc_from_index(idx
));
255 enc_check_duplication(const char *name
)
257 if (rb_enc_registered(name
) >= 0) {
258 rb_raise(rb_eArgError
, "encoding %s is already registered", name
);
263 set_base_encoding(int index
, rb_encoding
*base
)
265 rb_encoding
*enc
= enc_table
.list
[index
].enc
;
267 enc_table
.list
[index
].base
= base
;
268 if (rb_enc_dummy_p(base
)) ENC_SET_DUMMY(enc
);
273 rb_enc_replicate(const char *name
, rb_encoding
*encoding
)
277 enc_check_duplication(name
);
278 idx
= enc_register(name
, encoding
);
279 set_base_encoding(idx
, encoding
);
280 set_encoding_const(name
, rb_enc_from_index(idx
));
285 enc_replicate(int idx
, const char *name
, rb_encoding
*origenc
)
288 idx
= enc_register(name
, origenc
);
291 idx
= enc_register_at(idx
, name
, origenc
);
294 set_base_encoding(idx
, origenc
);
295 set_encoding_const(name
, rb_enc_from_index(idx
));
301 rb_encdb_replicate(const char *name
, const char *orig
)
303 int origidx
= rb_enc_registered(orig
);
304 int idx
= rb_enc_registered(name
);
307 origidx
= enc_register(orig
, 0);
309 return enc_replicate(idx
, name
, rb_enc_from_index(origidx
));
313 rb_define_dummy_encoding(const char *name
)
315 int index
= rb_enc_replicate(name
, rb_ascii8bit_encoding());
316 rb_encoding
*enc
= enc_table
.list
[index
].enc
;
323 rb_encdb_dummy(const char *name
)
325 int index
= enc_replicate(rb_enc_registered(name
), name
,
326 rb_ascii8bit_encoding());
327 rb_encoding
*enc
= enc_table
.list
[index
].enc
;
335 * enc.dummy? => true or false
337 * Returns true for dummy encodings.
338 * A dummy encoding is an encoding for which character handling is not properly implemented.
340 * It is used for stateful encodings.
342 * Encoding::ISO_2022_JP.dummy? #=> true
343 * Encoding::UTF_8.dummy? #=> false
347 enc_dummy_p(VALUE enc
)
349 return ENC_DUMMY_P(enc_table
.list
[must_encoding(enc
)].enc
) ? Qtrue
: Qfalse
;
353 enc_alias(const char *alias
, int idx
)
355 alias
= strdup(alias
);
356 st_insert(enc_table
.names
, (st_data_t
)alias
, (st_data_t
)idx
);
357 set_encoding_const(alias
, rb_enc_from_index(idx
));
362 rb_enc_alias(const char *alias
, const char *orig
)
366 enc_check_duplication(alias
);
367 if (!enc_table
.list
) {
370 if ((idx
= rb_enc_find_index(orig
)) < 0) {
373 return enc_alias(alias
, idx
);
377 rb_encdb_alias(const char *alias
, const char *orig
)
379 int idx
= rb_enc_registered(orig
);
382 idx
= enc_register(orig
, 0);
384 return enc_alias(alias
, idx
);
394 extern rb_encoding OnigEncodingUTF_8
;
395 extern rb_encoding OnigEncodingUS_ASCII
;
400 enc_table_expand(ENCODING_COUNT
+ 1);
401 if (!enc_table
.names
) {
402 enc_table
.names
= st_init_strcasetable();
404 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
407 ENC_REGISTER(US_ASCII
);
409 enc_table
.count
= ENCINDEX_BUILTIN_MAX
;
413 rb_enc_from_index(int index
)
415 if (!enc_table
.list
) {
418 if (index
< 0 || enc_table
.count
<= index
) {
421 return enc_table
.list
[index
].enc
;
425 rb_enc_registered(const char *name
)
429 if (!name
) return -1;
430 if (!enc_table
.list
) return -1;
431 if (st_lookup(enc_table
.names
, (st_data_t
)name
, &idx
)) {
438 require_enc(VALUE enclib
)
440 return rb_require_safe(enclib
, rb_safe_level());
444 load_encoding(const char *name
)
446 VALUE enclib
= rb_sprintf("enc/%s", name
);
447 VALUE verbose
= ruby_verbose
;
448 VALUE debug
= ruby_debug
;
450 char *s
= RSTRING_PTR(enclib
) + 4, *e
= RSTRING_END(enclib
);
454 if (!ISALNUM(*s
)) *s
= '_';
455 else if (ISUPPER(*s
)) *s
= TOLOWER(*s
);
459 ruby_verbose
= Qfalse
;
461 loaded
= rb_protect(require_enc
, enclib
, 0);
462 ruby_verbose
= verbose
;
464 rb_set_errinfo(Qnil
);
465 if (NIL_P(loaded
)) return -1;
466 if ((idx
= rb_enc_registered(name
)) < 0) return -1;
467 if (enc_autoload_p(enc_table
.list
[idx
].enc
)) return -1;
472 enc_autoload(rb_encoding
*enc
)
475 rb_encoding
*base
= enc_table
.list
[ENC_TO_ENCINDEX(enc
)].base
;
480 if (i
>= enc_table
.count
) return -1;
481 } while (enc_table
.list
[i
].enc
!= base
&& (++i
, 1));
482 if (enc_autoload_p(base
)) {
483 if (enc_autoload(base
) < 0) return -1;
485 i
= ENC_TO_ENCINDEX(enc
);
486 enc_register_at(i
, rb_enc_name(enc
), base
);
489 i
= load_encoding(rb_enc_name(enc
));
495 rb_enc_find_index(const char *name
)
497 int i
= rb_enc_registered(name
);
501 i
= load_encoding(name
);
503 else if (enc_autoload_p(enc
= rb_enc_from_index(i
))) {
504 if (enc_autoload(enc
) < 0) {
505 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
514 rb_enc_find(const char *name
)
516 int idx
= rb_enc_find_index(name
);
517 if (idx
< 0) idx
= 0;
518 return rb_enc_from_index(idx
);
522 enc_capable(VALUE obj
)
524 if (SPECIAL_CONST_P(obj
)) return Qfalse
;
525 switch (BUILTIN_TYPE(obj
)) {
531 if (RDATA(obj
)->dmark
== enc_mark
) return Qtrue
;
540 CONST_ID(id_encoding
, "encoding");
545 rb_enc_get_index(VALUE obj
)
549 i
= ENCODING_GET_INLINED(obj
);
550 if (i
== ENCODING_INLINE_MAX
) {
553 iv
= rb_ivar_get(obj
, rb_id_encoding());
560 rb_enc_set_index(VALUE obj
, int idx
)
562 if (idx
< ENCODING_INLINE_MAX
) {
563 ENCODING_SET_INLINED(obj
, idx
);
566 ENCODING_SET_INLINED(obj
, ENCODING_INLINE_MAX
);
567 rb_ivar_set(obj
, rb_id_encoding(), INT2NUM(idx
));
572 rb_enc_associate_index(VALUE obj
, int idx
)
574 /* enc_check_capable(obj);*/
575 if (rb_enc_get_index(obj
) == idx
)
577 if (!ENC_CODERANGE_ASCIIONLY(obj
) ||
578 !rb_enc_asciicompat(rb_enc_from_index(idx
))) {
579 ENC_CODERANGE_CLEAR(obj
);
581 rb_enc_set_index(obj
, idx
);
586 rb_enc_associate(VALUE obj
, rb_encoding
*enc
)
588 return rb_enc_associate_index(obj
, rb_enc_to_index(enc
));
592 rb_enc_get(VALUE obj
)
594 return rb_enc_from_index(rb_enc_get_index(obj
));
598 rb_enc_check(VALUE str1
, VALUE str2
)
600 rb_encoding
*enc
= rb_enc_compatible(str1
, str2
);
602 rb_raise(rb_eArgError
, "character encodings differ: %s and %s",
603 rb_enc_name(rb_enc_get(str1
)),
604 rb_enc_name(rb_enc_get(str2
)));
609 rb_enc_compatible(VALUE str1
, VALUE str2
)
612 rb_encoding
*enc1
, *enc2
;
614 idx1
= rb_enc_get_index(str1
);
615 idx2
= rb_enc_get_index(str2
);
617 if (idx1
< 0 || idx2
< 0)
621 return rb_enc_from_index(idx1
);
623 enc1
= rb_enc_from_index(idx1
);
624 enc2
= rb_enc_from_index(idx2
);
626 if (TYPE(str2
) == T_STRING
&& RSTRING_LEN(str2
) == 0)
628 if (TYPE(str1
) == T_STRING
&& RSTRING_LEN(str1
) == 0)
630 if (!rb_enc_asciicompat(enc1
) || !rb_enc_asciicompat(enc2
)) {
634 if (BUILTIN_TYPE(str1
) != T_STRING
) {
642 if (BUILTIN_TYPE(str1
) == T_STRING
) {
645 cr1
= rb_enc_str_coderange(str1
);
646 if (BUILTIN_TYPE(str2
) == T_STRING
) {
647 cr2
= rb_enc_str_coderange(str2
);
649 /* may need to handle ENC_CODERANGE_BROKEN */
650 if (cr1
== ENC_CODERANGE_7BIT
) return enc2
;
651 if (cr2
== ENC_CODERANGE_7BIT
) return enc1
;
653 if (cr2
== ENC_CODERANGE_7BIT
) {
654 if (idx1
== 0) return enc2
;
658 if (cr1
== ENC_CODERANGE_7BIT
)
665 rb_enc_copy(VALUE obj1
, VALUE obj2
)
667 rb_enc_associate_index(obj1
, rb_enc_get_index(obj2
));
673 * obj.encoding => encoding
675 * Returns the Encoding object that represents the encoding of obj.
679 rb_obj_encoding(VALUE obj
)
681 rb_encoding
*enc
= rb_enc_get(obj
);
683 rb_raise(rb_eTypeError
, "unknown encoding");
685 return rb_enc_from_encoding(enc
);
689 rb_enc_mbclen(const char *p
, const char *e
, rb_encoding
*enc
)
691 int n
= ONIGENC_PRECISE_MBC_ENC_LEN(enc
, (UChar
*)p
, (UChar
*)e
);
692 if (MBCLEN_CHARFOUND_P(n
) && MBCLEN_CHARFOUND_LEN(n
) <= e
-p
)
693 return MBCLEN_CHARFOUND_LEN(n
);
695 int min
= rb_enc_mbminlen(enc
);
696 return min
<= e
-p
? min
: e
-p
;
701 rb_enc_precise_mbclen(const char *p
, const char *e
, rb_encoding
*enc
)
705 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
706 n
= ONIGENC_PRECISE_MBC_ENC_LEN(enc
, (UChar
*)p
, (UChar
*)e
);
708 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n
-(e
-p
));
713 rb_enc_ascget(const char *p
, const char *e
, int *len
, rb_encoding
*enc
)
718 if (rb_enc_asciicompat(enc
)) {
719 c
= (unsigned char)*p
;
725 l
= rb_enc_precise_mbclen(p
, e
, enc
);
726 if (!MBCLEN_CHARFOUND_P(l
))
728 c
= rb_enc_mbc_to_codepoint(p
, e
, enc
);
729 if (!rb_enc_isascii(c
, enc
))
736 rb_enc_codepoint(const char *p
, const char *e
, rb_encoding
*enc
)
740 rb_raise(rb_eArgError
, "empty string");
741 r
= rb_enc_precise_mbclen(p
, e
, enc
);
742 if (MBCLEN_CHARFOUND_P(r
))
743 return rb_enc_mbc_to_codepoint(p
, e
, enc
);
745 rb_raise(rb_eArgError
, "invalid mbstring sequence");
749 rb_enc_codelen(int c
, rb_encoding
*enc
)
751 int n
= ONIGENC_CODE_TO_MBCLEN(enc
,c
);
753 rb_raise(rb_eArgError
, "invalid codepoint 0x%x", c
);
759 rb_enc_toupper(int c
, rb_encoding
*enc
)
761 return (ONIGENC_IS_ASCII_CODE(c
)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c
):(c
));
765 rb_enc_tolower(int c
, rb_encoding
*enc
)
767 return (ONIGENC_IS_ASCII_CODE(c
)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c
):(c
));
772 * enc.inspect => string
774 * Returns a string which represents the encoding for programmers.
776 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
777 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
780 enc_inspect(VALUE self
)
782 VALUE str
= rb_sprintf("#<%s:%s%s>", rb_obj_classname(self
),
783 rb_enc_name((rb_encoding
*)DATA_PTR(self
)),
784 (enc_dummy_p(self
) ? " (dummy)" : ""));
785 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
793 * Returns the name of the encoding.
795 * Encoding::UTF_8.name => "UTF-8"
800 return rb_usascii_str_new2(rb_enc_name((rb_encoding
*)DATA_PTR(self
)));
804 enc_base_encoding(VALUE self
)
806 rb_encoding
*base
= enc_table
.list
[must_encoding(self
)].base
;
807 if (!base
) return Qnil
;
808 return ENC_FROM_ENCODING(base
);
813 * Encoding.list => [enc1, enc2, ...]
815 * Returns the list of loaded encodings.
818 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
819 * #<Encoding:ISO-2022-JP (dummy)>]
821 * Encoding.find("US-ASCII")
822 * => #<Encoding:US-ASCII>
825 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
826 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
830 enc_list(VALUE klass
)
832 VALUE ary
= rb_ary_new2(0);
833 rb_ary_replace(ary
, rb_encoding_list
);
839 * Encoding.find(string) => enc
840 * Encoding.find(symbol) => enc
842 * Searches for the encoding with the specified <i>name</i>.
843 * <i>name</i> should be a string or symbol.
845 * Encoding.find("US-ASCII") => #<Encoding:US-ASCII>
846 * Encoding.find(:Shift_JIS) => #<Encoding:Shift_JIS>
850 enc_find(VALUE klass
, VALUE enc
)
852 return rb_enc_from_encoding(to_encoding(enc
));
857 * Encoding.compatible?(str1, str2) => enc or nil
859 * Checks the compatibility of two strings.
860 * If they are compatible, i.e. they can be concatenated,
861 * returns the encoding that the concatenated string will have.
862 * If they are not compatible, nil is returned.
864 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
865 * => #<Encoding:ISO-8859-1>
867 * Encoding.compatible?(
868 * "\xa1".force_encoding("iso-8859-1"),
869 * "\xa1\xa1".force_encoding("euc-jp"))
874 enc_compatible_p(VALUE klass
, VALUE str1
, VALUE str2
)
878 if (!enc_capable(str1
)) return Qnil
;
879 if (!enc_capable(str2
)) return Qnil
;
880 enc
= rb_enc_compatible(str1
, str2
);
881 if (!enc
) return Qnil
;
882 return rb_enc_from_encoding(enc
);
887 enc_dump(int argc
, VALUE
*argv
, VALUE self
)
889 rb_scan_args(argc
, argv
, "01", 0);
890 return enc_name(self
);
895 enc_load(VALUE klass
, VALUE str
)
897 return enc_find(klass
, str
);
901 rb_ascii8bit_encoding(void)
903 if (!enc_table
.list
) {
906 return enc_table
.list
[ENCINDEX_ASCII
].enc
;
910 rb_ascii8bit_encindex(void)
912 return ENCINDEX_ASCII
;
916 rb_utf8_encoding(void)
918 if (!enc_table
.list
) {
921 return enc_table
.list
[ENCINDEX_UTF_8
].enc
;
925 rb_utf8_encindex(void)
927 return ENCINDEX_UTF_8
;
931 rb_usascii_encoding(void)
933 if (!enc_table
.list
) {
936 return enc_table
.list
[ENCINDEX_US_ASCII
].enc
;
940 rb_usascii_encindex(void)
942 return ENCINDEX_US_ASCII
;
946 rb_locale_encoding(void)
948 VALUE charmap
= rb_locale_charmap(rb_cEncoding
);
952 return rb_usascii_encoding();
954 idx
= rb_enc_find_index(StringValueCStr(charmap
));
956 return rb_ascii8bit_encoding();
958 return rb_enc_from_index(idx
);
962 rb_filesystem_encoding(void)
964 static rb_encoding
*enc
;
967 enc
= rb_locale_encoding();
968 #elif defined __APPLE__
969 enc
= rb_enc_find("UTF8-MAC");
971 enc
= rb_locale_encoding();
977 static int default_external_index
;
980 rb_default_external_encoding(void)
982 return rb_enc_from_index(default_external_index
);
986 rb_enc_default_external(void)
988 return rb_enc_from_encoding(rb_default_external_encoding());
993 * Encoding.default_external => enc
995 * Returns default external encoding.
997 * It is initialized by the locale or -E option.
1000 get_default_external(VALUE klass
)
1002 return rb_enc_default_external();
1006 rb_enc_set_default_external(VALUE encoding
)
1008 default_external_index
= rb_enc_to_index(rb_to_encoding(encoding
));
1013 * Encoding.locale_charmap => string
1015 * Returns the locale charmap name.
1019 * Encoding.locale_charmap => "ANSI_X3.4-1968"
1021 * Encoding.locale_charmap => "EUC-JP"
1025 * Encoding.locale_charmap => "646"
1027 * Encoding.locale_charmap => "eucJP"
1031 rb_locale_charmap(VALUE klass
)
1033 #if defined NO_LOCALE_CHARMAP
1034 return rb_usascii_str_new2("ASCII-8BIT");
1035 #elif defined HAVE_LANGINFO_H
1037 codeset
= nl_langinfo(CODESET
);
1038 return rb_usascii_str_new2(codeset
);
1039 #elif defined _WIN32
1040 return rb_sprintf("CP%d", GetACP());
1047 set_encoding_const(const char *name
, rb_encoding
*enc
)
1049 VALUE encoding
= rb_enc_from_encoding(enc
);
1050 char *s
= (char *)name
;
1051 int haslower
= 0, hasupper
= 0, valid
= 0;
1053 if (ISDIGIT(*s
)) return;
1056 while (*++s
&& (ISALNUM(*s
) || *s
== '_')) {
1057 if (ISLOWER(*s
)) haslower
= 1;
1062 rb_define_const(rb_cEncoding
, name
, encoding
);
1064 if (!valid
|| haslower
) {
1065 int len
= strlen(name
) + 1;
1066 if (!haslower
|| !hasupper
) {
1068 if (ISLOWER(*s
)) haslower
= 1;
1069 if (ISUPPER(*s
)) hasupper
= 1;
1070 } while (*++s
&& (!haslower
|| !hasupper
));
1072 MEMCPY(s
= ALLOCA_N(char, len
), name
, char, len
);
1075 if (ISLOWER(*s
)) *s
= ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s
);
1077 if (!ISALNUM(*s
)) *s
= '_';
1080 rb_define_const(rb_cEncoding
, name
, encoding
);
1084 for (s
= (char *)name
; *s
; ++s
) {
1085 if (ISLOWER(*s
)) *s
= ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s
);
1087 rb_define_const(rb_cEncoding
, name
, encoding
);
1093 rb_enc_name_list_i(st_data_t name
, st_data_t idx
, st_data_t arg
)
1095 VALUE ary
= (VALUE
)arg
;
1096 VALUE str
= rb_usascii_str_new2((char *)name
);
1098 rb_ary_push(ary
, str
);
1104 * Encoding.name_list => ["enc1", "enc2", ...]
1106 * Returns the list of available encoding names.
1108 * Encoding.name_list
1109 * => ["US-ASCII", "ASCII-8BIT", "UTF-8",
1110 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
1112 * "BINARY", "CP932", "eucJP"]
1114 * This list doesn't include dummy encodings.
1119 rb_enc_name_list(VALUE klass
)
1121 VALUE ary
= rb_ary_new2(enc_table
.names
->num_entries
);
1122 st_foreach(enc_table
.names
, rb_enc_name_list_i
, (st_data_t
)ary
);
1127 rb_enc_aliases_enc_i(st_data_t name
, st_data_t orig
, st_data_t arg
)
1129 VALUE
*p
= (VALUE
*)arg
;
1130 VALUE aliases
= p
[0], ary
= p
[1];
1131 int idx
= (int)orig
;
1132 VALUE key
, str
= rb_ary_entry(ary
, idx
);
1135 rb_encoding
*enc
= rb_enc_from_index(idx
);
1137 if (STRCASECMP((char*)name
, rb_enc_name(enc
)) == 0) {
1140 str
= rb_usascii_str_new2(rb_enc_name(enc
));
1142 rb_ary_store(ary
, idx
, str
);
1144 key
= rb_usascii_str_new2((char *)name
);
1146 rb_hash_aset(aliases
, key
, str
);
1152 * Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
1154 * Returns a hash that maps each available encoding alias to its original encoding name.
1157 * => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
1158 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1163 rb_enc_aliases(VALUE klass
)
1166 aliases
[0] = rb_hash_new();
1167 aliases
[1] = rb_ary_new();
1168 st_foreach(enc_table
.names
, rb_enc_aliases_enc_i
, (st_data_t
)aliases
);
1179 id_base_encoding
= rb_intern("#base_encoding");
1181 rb_cEncoding
= rb_define_class("Encoding", rb_cObject
);
1182 rb_undef_alloc_func(rb_cEncoding
);
1183 rb_define_method(rb_cEncoding
, "to_s", enc_name
, 0);
1184 rb_define_method(rb_cEncoding
, "inspect", enc_inspect
, 0);
1185 rb_define_method(rb_cEncoding
, "name", enc_name
, 0);
1186 rb_define_method(rb_cEncoding
, "base_encoding", enc_base_encoding
, 0);
1187 rb_define_method(rb_cEncoding
, "dummy?", enc_dummy_p
, 0);
1188 rb_define_singleton_method(rb_cEncoding
, "list", enc_list
, 0);
1189 rb_define_singleton_method(rb_cEncoding
, "name_list", rb_enc_name_list
, 0);
1190 rb_define_singleton_method(rb_cEncoding
, "aliases", rb_enc_aliases
, 0);
1191 rb_define_singleton_method(rb_cEncoding
, "find", enc_find
, 1);
1192 rb_define_singleton_method(rb_cEncoding
, "compatible?", enc_compatible_p
, 2);
1194 rb_define_method(rb_cEncoding
, "_dump", enc_dump
, -1);
1195 rb_define_singleton_method(rb_cEncoding
, "_load", enc_load
, 1);
1197 rb_define_singleton_method(rb_cEncoding
, "default_external", get_default_external
, 0);
1198 rb_define_singleton_method(rb_cEncoding
, "locale_charmap", rb_locale_charmap
, 0);
1200 rb_gc_register_address(&rb_encoding_list
);
1201 list
= rb_ary_new2(enc_table
.count
);
1202 RBASIC(list
)->klass
= 0;
1203 rb_encoding_list
= list
;
1204 for (i
= 0; i
< enc_table
.count
; ++i
) {
1205 rb_ary_push(list
, enc_new(enc_table
.list
[i
].enc
));
1209 /* locale insensitive functions */
/* Shared predicate for the rb_is*() functions below: true only when c is
 * accepted by rb_isascii() AND ONIGENC_IS_ASCII_CODE_CTYPE() reports it as
 * a member of the given ctype class.  The && short-circuits, so the class
 * lookup is not evaluated for values rb_isascii() rejects; the result is
 * always 0 or 1.  Note that c is expanded twice. */
1211 #define ctype_test(c, ctype) \
1212 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
1214 int rb_isalnum(int c
) { return ctype_test(c
, ONIGENC_CTYPE_ALNUM
); }
1215 int rb_isalpha(int c
) { return ctype_test(c
, ONIGENC_CTYPE_ALPHA
); }
1216 int rb_isblank(int c
) { return ctype_test(c
, ONIGENC_CTYPE_BLANK
); }
1217 int rb_iscntrl(int c
) { return ctype_test(c
, ONIGENC_CTYPE_CNTRL
); }
1218 int rb_isdigit(int c
) { return ctype_test(c
, ONIGENC_CTYPE_DIGIT
); }
1219 int rb_isgraph(int c
) { return ctype_test(c
, ONIGENC_CTYPE_GRAPH
); }
1220 int rb_islower(int c
) { return ctype_test(c
, ONIGENC_CTYPE_LOWER
); }
1221 int rb_isprint(int c
) { return ctype_test(c
, ONIGENC_CTYPE_PRINT
); }
1222 int rb_ispunct(int c
) { return ctype_test(c
, ONIGENC_CTYPE_PUNCT
); }
1223 int rb_isspace(int c
) { return ctype_test(c
, ONIGENC_CTYPE_SPACE
); }
1224 int rb_isupper(int c
) { return ctype_test(c
, ONIGENC_CTYPE_UPPER
); }
1225 int rb_isxdigit(int c
) { return ctype_test(c
, ONIGENC_CTYPE_XDIGIT
); }
1230 return rb_isascii(c
) ? ONIGENC_ASCII_CODE_TO_LOWER_CASE(c
) : c
;
1236 return rb_isascii(c
) ? ONIGENC_ASCII_CODE_TO_UPPER_CASE(c
) : c
;