1 /**********************************************************************
6 created at: Thu May 24 17:23:27 JST 2007
8 Copyright (C) 2007 Yukihiro Matsumoto
10 **********************************************************************/
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
16 #ifdef HAVE_LANGINFO_H
20 static ID id_encoding
, id_base_encoding
;
21 static VALUE rb_cEncoding
;
23 struct rb_encoding_entry
{
29 struct rb_encoding_entry
*list
;
35 void rb_enc_init(void);
37 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
39 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc))
41 #define ENC_UNINITIALIZED (&rb_cEncoding)
42 #define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding)
43 #define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data)
45 #define ENC_DUMMY_FLAG FL_USER2
46 #define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG)
47 #define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG)
49 static int load_encoding(const char *name
);
50 static VALUE
enc_base_encoding(VALUE self
);
58 enc_new(rb_encoding
*encoding
)
60 VALUE enc
= Data_Wrap_Struct(rb_cEncoding
, enc_mark
, 0, encoding
);
61 encoding
->auxiliary_data
= (void *)enc
;
66 rb_enc_from_encoding(rb_encoding
*encoding
)
68 if (!encoding
) return Qnil
;
69 if (enc_initialized_p(encoding
))
70 return ENC_FROM_ENCODING(encoding
);
71 return enc_new(encoding
);
75 enc_check_encoding(VALUE obj
)
80 if (SPECIAL_CONST_P(obj
) || BUILTIN_TYPE(obj
) != T_DATA
||
81 RDATA(obj
)->dmark
!= enc_mark
) {
84 enc
= (rb_encoding
*)RDATA(obj
)->data
;
85 index
= rb_enc_to_index(enc
);
86 if (rb_enc_from_index(index
) != enc
)
88 if (enc_autoload_p(enc
)) {
89 index
= rb_enc_find_index(enc
->name
);
95 rb_to_encoding_index(VALUE enc
)
99 idx
= enc_check_encoding(enc
);
103 else if (NIL_P(enc
= rb_check_string_type(enc
))) {
107 return rb_enc_find_index(StringValueCStr(enc
));
112 rb_to_encoding(VALUE enc
)
116 idx
= enc_check_encoding(enc
);
117 if (idx
>= 0) return RDATA(enc
)->data
;
118 if ((idx
= rb_enc_find_index(StringValueCStr(enc
))) < 0) {
119 rb_raise(rb_eArgError
, "unknown encoding name - %s", RSTRING_PTR(enc
));
121 return rb_enc_from_index(idx
);
125 rb_gc_mark_encodings(void)
128 for (i
= 0; i
< enc_table
.count
; ++i
) {
129 rb_encoding
*enc
= enc_table
.list
[i
].enc
;
130 if (enc
&& enc_initialized_p(enc
)) {
131 rb_gc_mark(ENC_FROM_ENCODING(enc
));
137 enc_table_expand(int newsize
)
139 struct rb_encoding_entry
*ent
;
142 if (enc_table
.size
>= newsize
) return newsize
;
143 newsize
= (newsize
+ 7) / 8 * 8;
144 ent
= realloc(enc_table
.list
, sizeof(*enc_table
.list
) * newsize
);
146 memset(ent
+ enc_table
.size
, 0, sizeof(*ent
)*(newsize
- enc_table
.size
));
147 enc_table
.list
= ent
;
148 enc_table
.size
= newsize
;
153 enc_register_at(int index
, const char *name
, rb_encoding
*encoding
)
155 struct rb_encoding_entry
*ent
= &enc_table
.list
[index
];
156 void *obj
= ENC_UNINITIALIZED
;
159 ent
->name
= name
= strdup(name
);
161 else if (STRCASECMP(name
, ent
->name
)) {
165 ent
->enc
= malloc(sizeof(rb_encoding
));
168 obj
= ent
->enc
->auxiliary_data
;
171 *ent
->enc
= *encoding
;
174 memset(ent
->enc
, 0, sizeof(*ent
->enc
));
177 encoding
->name
= name
;
178 encoding
->ruby_encoding_index
= index
;
179 st_insert(enc_table
.names
, (st_data_t
)name
, (st_data_t
)index
);
180 if (obj
!= ENC_UNINITIALIZED
) {
181 encoding
->auxiliary_data
= obj
;
183 else if (rb_cEncoding
) {
184 /* initialize encoding data */
188 encoding
->auxiliary_data
= ENC_UNINITIALIZED
;
194 enc_register(const char *name
, rb_encoding
*encoding
)
196 int index
= enc_table
.count
;
198 if ((index
= enc_table_expand(index
+ 1)) < 0) return -1;
199 enc_table
.count
= index
;
200 return enc_register_at(index
- 1, name
, encoding
);
203 static void set_encoding_const(const char *, rb_encoding
*);
204 int rb_enc_registered(const char *name
);
207 rb_enc_register(const char *name
, rb_encoding
*encoding
)
209 int index
= rb_enc_registered(name
);
212 rb_encoding
*oldenc
= rb_enc_from_index(index
);
213 if (STRCASECMP(name
, rb_enc_name(oldenc
))) {
214 index
= enc_register(name
, encoding
);
216 else if (!enc_autoload_p(oldenc
) ||
217 (enc_initialized_p(oldenc
) &&
218 !ENC_DUMMY_P(ENC_FROM_ENCODING(oldenc
)))) {
219 enc_register_at(index
, name
, encoding
);
222 rb_raise(rb_eArgError
, "encoding %s is already registered", name
);
226 index
= enc_register(name
, encoding
);
227 set_encoding_const(name
, rb_enc_from_index(index
));
233 rb_encdb_declare(const char *name
)
235 int idx
= rb_enc_registered(name
);
237 idx
= enc_register(name
, 0);
239 set_encoding_const(name
, rb_enc_from_index(idx
));
243 enc_check_duplication(const char *name
)
245 if (rb_enc_registered(name
) >= 0) {
246 rb_raise(rb_eArgError
, "encoding %s is already registered", name
);
251 set_base_encoding(int index
, rb_encoding
*base
)
253 VALUE enc
= rb_enc_from_encoding(enc_table
.list
[index
].enc
);
255 rb_ivar_set(enc
, id_base_encoding
, rb_enc_from_encoding(base
));
256 if (rb_enc_dummy_p(base
)) ENC_SET_DUMMY(enc
);
261 rb_enc_replicate(const char *name
, rb_encoding
*encoding
)
265 enc_check_duplication(name
);
266 idx
= enc_register(name
, encoding
);
267 set_base_encoding(idx
, encoding
);
268 set_encoding_const(name
, rb_enc_from_index(idx
));
273 enc_replicate(int idx
, const char *name
, rb_encoding
*origenc
)
276 idx
= enc_register(name
, origenc
);
279 idx
= enc_register_at(idx
, name
, origenc
);
282 set_base_encoding(idx
, origenc
);
283 set_encoding_const(name
, rb_enc_from_index(idx
));
289 rb_encdb_replicate(const char *name
, const char *orig
)
291 int origidx
= rb_enc_registered(orig
);
292 int idx
= rb_enc_registered(name
);
295 origidx
= enc_register(orig
, 0);
297 return enc_replicate(idx
, name
, rb_enc_from_index(origidx
));
301 rb_define_dummy_encoding(const char *name
)
303 int index
= rb_enc_replicate(name
, rb_ascii8bit_encoding());
304 VALUE enc
= rb_enc_from_encoding(enc_table
.list
[index
].enc
);
311 rb_encdb_dummy(const char *name
)
313 int index
= enc_replicate(rb_enc_registered(name
), name
,
314 rb_ascii8bit_encoding());
315 VALUE enc
= rb_enc_from_encoding(enc_table
.list
[index
].enc
);
322 rb_enc_dummy_p(rb_encoding
*enc
)
325 if (!enc_initialized_p(enc
)) return Qfalse
;
326 encoding
= rb_enc_from_encoding(enc
);
327 return ENC_DUMMY_P(encoding
);
332 * enc.dummy? => true or false
334 * Returns true for dummy encodings.
335 * A dummy encoding is an encoding for which character handling is not properly
337 * It is used for stateful encodings.
339 * Encoding::ISO_2022_JP.dummy? #=> true
340 * Encoding::UTF_8.dummy? #=> false
344 enc_dummy_p(VALUE enc
)
346 return rb_enc_dummy_p(rb_to_encoding(enc
)) ? Qtrue
: Qfalse
;
350 enc_alias(const char *alias
, int idx
)
352 alias
= strdup(alias
);
353 st_insert(enc_table
.names
, (st_data_t
)alias
, (st_data_t
)idx
);
354 set_encoding_const(alias
, rb_enc_from_index(idx
));
359 rb_enc_alias(const char *alias
, const char *orig
)
363 enc_check_duplication(alias
);
364 if (!enc_table
.list
) {
367 if ((idx
= rb_enc_find_index(orig
)) < 0) {
370 return enc_alias(alias
, idx
);
374 rb_encdb_alias(const char *alias
, const char *orig
)
376 int idx
= rb_enc_registered(orig
);
379 idx
= enc_register(orig
, 0);
381 return enc_alias(alias
, idx
);
391 extern rb_encoding OnigEncodingUTF_8
;
392 extern rb_encoding OnigEncodingUS_ASCII
;
397 enc_table_expand(ENCODING_COUNT
+ 1);
398 if (!enc_table
.names
) {
399 enc_table
.names
= st_init_strcasetable();
401 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
404 ENC_REGISTER(US_ASCII
);
406 enc_table
.count
= ENCINDEX_BUILTIN_MAX
;
410 rb_enc_from_index(int index
)
412 if (!enc_table
.list
) {
415 if (index
< 0 || enc_table
.count
<= index
) {
418 return enc_table
.list
[index
].enc
;
422 rb_enc_registered(const char *name
)
426 if (!name
) return -1;
427 if (!enc_table
.list
) return -1;
428 if (st_lookup(enc_table
.names
, (st_data_t
)name
, &idx
)) {
435 require_enc(VALUE enclib
)
437 return rb_require_safe(enclib
, rb_safe_level());
441 load_encoding(const char *name
)
443 VALUE enclib
= rb_sprintf("enc/%s", name
);
444 VALUE verbose
= ruby_verbose
;
445 VALUE debug
= ruby_debug
;
447 char *s
= RSTRING_PTR(enclib
) + 4, *e
= RSTRING_END(enclib
);
451 if (!ISALNUM(*s
)) *s
= '_';
452 else if (ISUPPER(*s
)) *s
= TOLOWER(*s
);
456 ruby_verbose
= Qfalse
;
458 loaded
= rb_protect(require_enc
, enclib
, 0);
459 ruby_verbose
= verbose
;
461 rb_set_errinfo(Qnil
);
462 if (NIL_P(loaded
)) return -1;
463 if ((idx
= rb_enc_registered(name
)) < 0) return -1;
464 if (enc_autoload_p(enc_table
.list
[idx
].enc
)) return -1;
469 rb_enc_find_index(const char *name
)
471 int i
= rb_enc_registered(name
), b
;
476 i
= load_encoding(name
);
478 else if (enc_autoload_p(enc
= rb_enc_from_index(i
))) {
479 if (enc_initialized_p(enc
) &&
480 (base
= enc_base_encoding(ENC_FROM_ENCODING(enc
)), !NIL_P(base
))) {
481 if ((b
= enc_check_encoding(base
)) < 0) {
484 enc_register_at(i
, rb_enc_name(enc
), rb_enc_from_index(b
));
487 i
= load_encoding(rb_enc_name(enc
));
490 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
500 rb_enc_find(const char *name
)
502 int idx
= rb_enc_find_index(name
);
503 if (idx
< 0) idx
= 0;
504 return rb_enc_from_index(idx
);
508 enc_capable(VALUE obj
)
510 if (SPECIAL_CONST_P(obj
)) return Qfalse
;
511 switch (BUILTIN_TYPE(obj
)) {
517 if (RDATA(obj
)->dmark
== enc_mark
) return Qtrue
;
524 enc_check_capable(VALUE x
)
526 if (!enc_capable(x
)) {
532 else if (FIXNUM_P(x
)) {
535 else if (SYMBOL_P(x
)) {
538 else if (rb_special_const_p(x
)) {
539 etype
= RSTRING_PTR(rb_obj_as_string(x
));
542 etype
= rb_obj_classname(x
);
544 rb_raise(rb_eTypeError
, "wrong argument type %s (not encode capable)", etype
);
552 id_encoding
= rb_intern("encoding");
558 rb_enc_internal_get_index(VALUE obj
)
562 i
= ENCODING_GET_INLINED(obj
);
563 if (i
== ENCODING_INLINE_MAX
) {
566 iv
= rb_ivar_get(obj
, rb_id_encoding());
573 rb_enc_internal_set_index(VALUE obj
, int idx
)
575 if (idx
< ENCODING_INLINE_MAX
) {
576 ENCODING_SET_INLINED(obj
, idx
);
579 ENCODING_SET_INLINED(obj
, ENCODING_INLINE_MAX
);
580 rb_ivar_set(obj
, rb_id_encoding(), INT2NUM(idx
));
585 rb_enc_associate_index(VALUE obj
, int idx
)
587 enc_check_capable(obj
);
588 if (rb_enc_internal_get_index(obj
) == idx
)
590 if (!ENC_CODERANGE_ASCIIONLY(obj
) ||
591 !rb_enc_asciicompat(rb_enc_from_index(idx
))) {
592 ENC_CODERANGE_CLEAR(obj
);
594 rb_enc_internal_set_index(obj
, idx
);
598 rb_enc_associate(VALUE obj
, rb_encoding
*enc
)
600 rb_enc_associate_index(obj
, rb_enc_to_index(enc
));
604 rb_enc_get_index(VALUE obj
)
606 if (!enc_capable(obj
)) return -1;
607 return rb_enc_internal_get_index(obj
);
611 rb_enc_get(VALUE obj
)
613 return rb_enc_from_index(rb_enc_get_index(obj
));
617 rb_enc_check(VALUE str1
, VALUE str2
)
619 rb_encoding
*enc
= rb_enc_compatible(str1
, str2
);
621 rb_raise(rb_eArgError
, "character encodings differ: %s and %s",
622 rb_enc_name(rb_enc_get(str1
)),
623 rb_enc_name(rb_enc_get(str2
)));
628 rb_enc_compatible(VALUE str1
, VALUE str2
)
631 rb_encoding
*enc1
, *enc2
;
633 idx1
= rb_enc_get_index(str1
);
634 idx2
= rb_enc_get_index(str2
);
636 if (idx1
< 0 || idx2
< 0)
640 return rb_enc_from_index(idx1
);
642 enc1
= rb_enc_from_index(idx1
);
643 enc2
= rb_enc_from_index(idx2
);
645 if (TYPE(str2
) == T_STRING
&& RSTRING_LEN(str2
) == 0)
647 if (TYPE(str1
) == T_STRING
&& RSTRING_LEN(str1
) == 0)
649 if (!rb_enc_asciicompat(enc1
) || !rb_enc_asciicompat(enc2
)) {
653 if (BUILTIN_TYPE(str1
) != T_STRING
) {
661 if (BUILTIN_TYPE(str1
) == T_STRING
) {
664 cr1
= rb_enc_str_coderange(str1
);
665 if (BUILTIN_TYPE(str2
) == T_STRING
) {
666 cr2
= rb_enc_str_coderange(str2
);
668 /* may need to handle ENC_CODERANGE_BROKEN */
669 if (cr1
== ENC_CODERANGE_7BIT
) return enc2
;
670 if (cr2
== ENC_CODERANGE_7BIT
) return enc1
;
672 if (cr2
== ENC_CODERANGE_7BIT
) {
673 if (idx1
== 0) return enc2
;
677 if (cr1
== ENC_CODERANGE_7BIT
)
684 rb_enc_copy(VALUE obj1
, VALUE obj2
)
686 rb_enc_associate_index(obj1
, rb_enc_get_index(obj2
));
692 * obj.encoding => encoding
694 * Returns the Encoding object that represents the encoding of obj.
698 rb_obj_encoding(VALUE obj
)
700 rb_encoding
*enc
= rb_enc_get(obj
);
702 rb_raise(rb_eTypeError
, "unknown encoding");
704 return rb_enc_from_encoding(enc
);
708 rb_enc_mbclen(const char *p
, const char *e
, rb_encoding
*enc
)
710 int n
= ONIGENC_PRECISE_MBC_ENC_LEN(enc
, (UChar
*)p
, (UChar
*)e
);
711 if (MBCLEN_CHARFOUND_P(n
) && MBCLEN_CHARFOUND_LEN(n
) <= e
-p
)
712 return MBCLEN_CHARFOUND_LEN(n
);
714 int min
= rb_enc_mbminlen(enc
);
715 return min
<= e
-p
? min
: e
-p
;
720 rb_enc_precise_mbclen(const char *p
, const char *e
, rb_encoding
*enc
)
724 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
725 n
= ONIGENC_PRECISE_MBC_ENC_LEN(enc
, (UChar
*)p
, (UChar
*)e
);
727 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n
-(e
-p
));
732 rb_enc_ascget(const char *p
, const char *e
, int *len
, rb_encoding
*enc
)
737 if (rb_enc_asciicompat(enc
)) {
738 c
= (unsigned char)*p
;
744 l
= rb_enc_precise_mbclen(p
, e
, enc
);
745 if (!MBCLEN_CHARFOUND_P(l
))
747 c
= rb_enc_mbc_to_codepoint(p
, e
, enc
);
748 if (!rb_enc_isascii(c
, enc
))
755 rb_enc_codepoint(const char *p
, const char *e
, rb_encoding
*enc
)
759 rb_raise(rb_eArgError
, "empty string");
760 r
= rb_enc_precise_mbclen(p
, e
, enc
);
761 if (MBCLEN_CHARFOUND_P(r
))
762 return rb_enc_mbc_to_codepoint(p
, e
, enc
);
764 rb_raise(rb_eArgError
, "invalid mbstring sequence");
768 rb_enc_codelen(int c
, rb_encoding
*enc
)
770 int n
= ONIGENC_CODE_TO_MBCLEN(enc
,c
);
772 rb_raise(rb_eArgError
, "invalid codepoint 0x%x", c
);
778 rb_enc_toupper(int c
, rb_encoding
*enc
)
780 return (ONIGENC_IS_ASCII_CODE(c
)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c
):(c
));
784 rb_enc_tolower(int c
, rb_encoding
*enc
)
786 return (ONIGENC_IS_ASCII_CODE(c
)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c
):(c
));
791 * enc.inspect => string
793 * Returns a string which represents the encoding for programmers.
795 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
796 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
799 enc_inspect(VALUE self
)
801 VALUE str
= rb_sprintf("#<%s:%s%s>", rb_obj_classname(self
),
802 rb_enc_name((rb_encoding
*)DATA_PTR(self
)),
803 (ENC_DUMMY_P(self
) ? " (dummy)" : ""));
804 ENCODING_CODERANGE_SET(str
, rb_usascii_encindex(), ENC_CODERANGE_7BIT
);
812 * Returns the name of the encoding.
814 * Encoding::UTF_8.name => "UTF-8"
819 return rb_usascii_str_new2(rb_enc_name((rb_encoding
*)DATA_PTR(self
)));
823 enc_base_encoding(VALUE self
)
825 return rb_attr_get(self
, id_base_encoding
);
830 * Encoding.list => [enc1, enc2, ...]
832 * Returns the list of loaded encodings.
835 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
836 * #<Encoding:ISO-2022-JP (dummy)>]
838 * Encoding.find("US-ASCII")
839 * => #<Encoding:US-ASCII>
842 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
843 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
847 enc_list(VALUE klass
)
849 VALUE ary
= rb_ary_new2(enc_table
.count
);
851 for (i
= 0; i
< enc_table
.count
; ++i
) {
852 rb_encoding
*enc
= enc_table
.list
[i
].enc
;
854 rb_ary_push(ary
, rb_enc_from_encoding(enc
));
862 * Encoding.find(string) => enc
863 * Encoding.find(symbol) => enc
865 * Searches for the encoding with the specified <i>name</i>.
866 * <i>name</i> should be a string or symbol.
868 * Encoding.find("US-ASCII") => #<Encoding:US-ASCII>
869 * Encoding.find(:Shift_JIS) => #<Encoding:Shift_JIS>
873 enc_find(VALUE klass
, VALUE enc
)
878 if (!rb_enc_asciicompat(rb_enc_get(enc
))) {
879 rb_raise(rb_eArgError
, "invalid name encoding (non ASCII)");
881 idx
= rb_enc_find_index(StringValueCStr(enc
));
883 rb_raise(rb_eArgError
, "unknown encoding name - %s", RSTRING_PTR(enc
));
885 return rb_enc_from_encoding(rb_enc_from_index(idx
));
890 * Encoding.compatible?(str1, str2) => enc or nil
892 * Checks the compatibility of two strings.
893 * If they are compatible, meaning they can be concatenated,
894 * returns the encoding that the concatenated string will have.
895 * If they are not compatible, nil is returned.
897 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
898 * => #<Encoding:ISO-8859-1>
900 * Encoding.compatible?(
901 * "\xa1".force_encoding("iso-8859-1"),
902 * "\xa1\xa1".force_encoding("euc-jp"))
907 enc_compatible_p(VALUE klass
, VALUE str1
, VALUE str2
)
909 rb_encoding
*enc
= rb_enc_compatible(str1
, str2
);
910 VALUE encoding
= Qnil
;
911 if (!enc
|| !(encoding
= rb_enc_from_encoding(enc
)))
918 enc_dump(int argc
, VALUE
*argv
, VALUE self
)
920 rb_scan_args(argc
, argv
, "01", 0);
921 return enc_name(self
);
926 enc_load(VALUE klass
, VALUE str
)
928 return enc_find(klass
, str
);
932 rb_ascii8bit_encoding(void)
934 if (!enc_table
.list
) {
937 return enc_table
.list
[0].enc
;
941 rb_utf8_encoding(void)
943 if (!enc_table
.list
) {
946 return enc_table
.list
[ENCINDEX_UTF_8
].enc
;
950 rb_usascii_encoding(void)
952 if (!enc_table
.list
) {
955 return enc_table
.list
[ENCINDEX_US_ASCII
].enc
;
959 rb_usascii_encindex(void)
961 return ENCINDEX_US_ASCII
;
965 rb_locale_encoding(void)
967 VALUE charmap
= rb_locale_charmap(rb_cEncoding
);
971 idx
= rb_enc_find_index("US-ASCII");
973 idx
= rb_enc_find_index(StringValueCStr(charmap
));
975 return rb_ascii8bit_encoding();
977 return rb_enc_from_index(idx
);
980 static int default_external_index
;
983 rb_default_external_encoding(void)
985 return rb_enc_from_index(default_external_index
);
989 rb_enc_default_external(void)
991 return rb_enc_from_encoding(rb_default_external_encoding());
996 * Encoding.default_external => enc
998 * Returns default external encoding.
1000 * It is initialized by the locale or -E option.
1003 get_default_external(VALUE klass
)
1005 return rb_enc_default_external();
1009 rb_enc_set_default_external(VALUE encoding
)
1011 default_external_index
= rb_enc_to_index(rb_to_encoding(encoding
));
1016 * Encoding.locale_charmap => string
1018 * Returns the locale charmap name.
1022 * Encoding.locale_charmap => "ANSI_X3.4-1968"
1024 * Encoding.locale_charmap => "EUC-JP"
1028 * Encoding.locale_charmap => "646"
1030 * Encoding.locale_charmap => "eucJP"
1034 rb_locale_charmap(VALUE klass
)
1036 #if defined NO_LOCALE_CHARMAP
1037 return rb_usascii_str_new2("ASCII-8BIT");
1038 #elif defined HAVE_LANGINFO_H
1040 codeset
= nl_langinfo(CODESET
);
1041 return rb_usascii_str_new2(codeset
);
1042 #elif defined _WIN32
1043 return rb_sprintf("CP%d", GetACP());
1050 set_encoding_const(const char *name
, rb_encoding
*enc
)
1052 VALUE encoding
= rb_enc_from_encoding(enc
);
1053 char *s
= (char *)name
;
1054 int haslower
= 0, hasupper
= 0, valid
= 0;
1056 if (ISDIGIT(*s
)) return;
1059 while (*++s
&& (ISALNUM(*s
) || *s
== '_')) {
1060 if (ISLOWER(*s
)) haslower
= 1;
1065 rb_define_const(rb_cEncoding
, name
, encoding
);
1067 if (!valid
|| haslower
) {
1068 int len
= strlen(name
) + 1;
1069 if (!haslower
|| !hasupper
) {
1071 if (ISLOWER(*s
)) haslower
= 1;
1072 if (ISUPPER(*s
)) hasupper
= 1;
1073 } while (*++s
&& (!haslower
|| !hasupper
));
1075 MEMCPY(s
= ALLOCA_N(char, len
), name
, char, len
);
1078 if (ISLOWER(*s
)) *s
= ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s
);
1080 if (!ISALNUM(*s
)) *s
= '_';
1083 rb_define_const(rb_cEncoding
, name
, encoding
);
1087 for (s
= (char *)name
; *s
; ++s
) {
1088 if (ISLOWER(*s
)) *s
= ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s
);
1090 rb_define_const(rb_cEncoding
, name
, encoding
);
1096 rb_enc_name_list_i(st_data_t name
, st_data_t idx
, st_data_t arg
)
1098 VALUE ary
= (VALUE
)arg
;
1099 VALUE str
= rb_usascii_str_new2((char *)name
);
1101 rb_ary_push(ary
, str
);
1107 * Encoding.name_list => ["enc1", "enc2", ...]
1109 * Returns the list of available encoding names.
1111 * Encoding.name_list
1112 * => ["US-ASCII", "ASCII-8BIT", "UTF-8",
1113 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
1115 * "BINARY", "CP932", "eucJP"]
1117 * This list doesn't include dummy encodings.
1122 rb_enc_name_list(VALUE klass
)
1124 VALUE ary
= rb_ary_new2(enc_table
.names
->num_entries
);
1125 st_foreach(enc_table
.names
, rb_enc_name_list_i
, (st_data_t
)ary
);
1130 rb_enc_aliases_enc_i(st_data_t name
, st_data_t orig
, st_data_t arg
)
1132 VALUE
*p
= (VALUE
*)arg
;
1133 VALUE aliases
= p
[0], ary
= p
[1];
1134 int idx
= (int)orig
;
1135 VALUE key
, str
= rb_ary_entry(ary
, idx
);
1138 rb_encoding
*enc
= rb_enc_from_index(idx
);
1140 if (STRCASECMP((char*)name
, rb_enc_name(enc
)) == 0) {
1143 str
= rb_usascii_str_new2(rb_enc_name(enc
));
1145 rb_ary_store(ary
, idx
, str
);
1147 key
= rb_usascii_str_new2((char *)name
);
1149 rb_hash_aset(aliases
, key
, str
);
1155 * Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
1157 * Returns a hash that maps each available encoding alias to its original encoding name.
1160 * => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1968"=>"US-ASCII",
1161 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1166 rb_enc_aliases(VALUE klass
)
1169 aliases
[0] = rb_hash_new();
1170 aliases
[1] = rb_ary_new();
1171 st_foreach(enc_table
.names
, rb_enc_aliases_enc_i
, (st_data_t
)aliases
);
1178 id_base_encoding
= rb_intern("#base_encoding");
1180 rb_cEncoding
= rb_define_class("Encoding", rb_cObject
);
1181 rb_undef_alloc_func(rb_cEncoding
);
1182 rb_define_method(rb_cEncoding
, "to_s", enc_name
, 0);
1183 rb_define_method(rb_cEncoding
, "inspect", enc_inspect
, 0);
1184 rb_define_method(rb_cEncoding
, "name", enc_name
, 0);
1185 rb_define_method(rb_cEncoding
, "base_encoding", enc_base_encoding
, 0);
1186 rb_define_method(rb_cEncoding
, "dummy?", enc_dummy_p
, 0);
1187 rb_define_singleton_method(rb_cEncoding
, "list", enc_list
, 0);
1188 rb_define_singleton_method(rb_cEncoding
, "name_list", rb_enc_name_list
, 0);
1189 rb_define_singleton_method(rb_cEncoding
, "aliases", rb_enc_aliases
, 0);
1190 rb_define_singleton_method(rb_cEncoding
, "find", enc_find
, 1);
1191 rb_define_singleton_method(rb_cEncoding
, "compatible?", enc_compatible_p
, 2);
1193 rb_define_method(rb_cEncoding
, "_dump", enc_dump
, -1);
1194 rb_define_singleton_method(rb_cEncoding
, "_load", enc_load
, 1);
1196 rb_define_singleton_method(rb_cEncoding
, "default_external", get_default_external
, 0);
1197 rb_define_singleton_method(rb_cEncoding
, "locale_charmap", rb_locale_charmap
, 0);
1200 /* locale insensitive functions */
1202 #define ctype_test(c, ctype) \
1203 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
1205 int rb_isalnum(int c
) { return ctype_test(c
, ONIGENC_CTYPE_ALNUM
); }
1206 int rb_isalpha(int c
) { return ctype_test(c
, ONIGENC_CTYPE_ALPHA
); }
1207 int rb_isblank(int c
) { return ctype_test(c
, ONIGENC_CTYPE_BLANK
); }
1208 int rb_iscntrl(int c
) { return ctype_test(c
, ONIGENC_CTYPE_CNTRL
); }
1209 int rb_isdigit(int c
) { return ctype_test(c
, ONIGENC_CTYPE_DIGIT
); }
1210 int rb_isgraph(int c
) { return ctype_test(c
, ONIGENC_CTYPE_GRAPH
); }
1211 int rb_islower(int c
) { return ctype_test(c
, ONIGENC_CTYPE_LOWER
); }
1212 int rb_isprint(int c
) { return ctype_test(c
, ONIGENC_CTYPE_PRINT
); }
1213 int rb_ispunct(int c
) { return ctype_test(c
, ONIGENC_CTYPE_PUNCT
); }
1214 int rb_isspace(int c
) { return ctype_test(c
, ONIGENC_CTYPE_SPACE
); }
1215 int rb_isupper(int c
) { return ctype_test(c
, ONIGENC_CTYPE_UPPER
); }
1216 int rb_isxdigit(int c
) { return ctype_test(c
, ONIGENC_CTYPE_XDIGIT
); }
1221 return rb_isascii(c
) ? ONIGENC_ASCII_CODE_TO_LOWER_CASE(c
) : c
;
1227 return rb_isascii(c
) ? ONIGENC_ASCII_CODE_TO_UPPER_CASE(c
) : c
;