* test/ruby/envutil.rb (assert_normal_exit): show pid when fail.
[ruby-svn.git] / enc / euc_jp.c
blob703e0e4f38ae7bdf03283c0a038c84d533040012
1 /**********************************************************************
2 euc_jp.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
30 #include "regint.h"
/*
 * Nonzero when byte c lies OUTSIDE the range 0xa1..0xfe.
 * The difference is narrowed to UChar before comparing, so (assuming UChar
 * is an unsigned 8-bit type) bytes below 0xa1 wrap around to a large value
 * and are caught by the same single comparison as bytes above 0xfe.
 */
#define eucjp_islead(c)    ((UChar )((c) - 0xa1) > (0xfe - 0xa1))
/*
 * Expected total byte length of a character, indexed by its first byte.
 * mbc_enc_len() uses it to report how many more bytes are needed when the
 * input ends in the middle of a character.
 */
static const int EncLen_EUCJP[] = {
  /* 0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 1 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 2 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 3 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 4 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 5 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 6 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 7 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
  /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* a */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* b */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* c */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* d */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* e */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
/*
 * Byte-level state machine used to validate one character.
 *
 *   S0  start state, indexed by the first byte
 *   S1  one more byte (0xa1..0xfe) completes the character
 *   S2  entered from S0 on 0x8f; a byte in 0xa1..0xfe moves on to S1
 *
 * Negative entries are terminal: ACCEPT ends a valid character on the
 * current byte, FAILURE marks the sequence as invalid.
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t;
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
  { /* S0   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
  { /* S1   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  },
  { /* S2   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
    /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  }
};
#undef A
#undef F
117 static int
118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
120 int firstbyte = *p++;
121 state_t s;
122 s = trans[0][firstbyte];
123 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
124 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
125 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
126 s = trans[s][*p++];
127 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
128 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
129 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
130 s = trans[s][*p++];
131 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
132 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
135 static OnigCodePoint
136 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
138 int c, i, len;
139 OnigCodePoint n;
141 len = enclen(enc, p, end);
142 n = (OnigCodePoint )*p++;
143 if (len == 1) return n;
145 for (i = 1; i < len; i++) {
146 if (p >= end) break;
147 c = *p++;
148 n <<= 8; n += c;
150 return n;
153 static int
154 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
156 if (ONIGENC_IS_CODE_ASCII(code)) return 1;
157 else if (code > 0xffffff) return 0;
158 else if ((code & 0xff0000) >= 0x800000) return 3;
159 else if ((code & 0xff00) >= 0x8000) return 2;
160 else
161 return ONIGERR_INVALID_CODE_POINT_VALUE;
#if 0
/*
 * Disabled helper: most significant non-zero byte among the low three
 * bytes of code, or code itself when bits 8..23 are all zero.
 */
static int
code_to_mbc_first(OnigCodePoint code)
{
  if ((code & 0xff0000) != 0)
    return (code >> 16) & 0xff;

  if ((code & 0xff00) != 0)
    return (code >> 8) & 0xff;

  return (int )code;
}
#endif
183 static int
184 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
186 UChar *p = buf;
188 if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
189 if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
190 *p++ = (UChar )(code & 0xff);
192 #if 1
193 if (enclen(enc, buf, p) != (p - buf))
194 return ONIGERR_INVALID_CODE_POINT_VALUE;
195 #endif
196 return p - buf;
199 static int
200 mbc_case_fold(OnigCaseFoldType flag,
201 const UChar** pp, const UChar* end, UChar* lower,
202 OnigEncoding enc)
204 int len;
205 const UChar* p = *pp;
207 if (ONIGENC_IS_MBC_ASCII(p)) {
208 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
209 (*pp)++;
210 return 1;
212 else {
213 int i;
215 len = enclen(enc, p, end);
216 for (i = 0; i < len; i++) {
217 *lower++ = *p++;
219 (*pp) += len;
220 return len; /* return byte length of converted char to lower */
224 static UChar*
225 left_adjust_char_head(const UChar* start, const UChar* s, OnigEncoding enc)
227 /* In this encoding
228 mb-trail bytes doesn't mix with single bytes.
230 const UChar *p;
231 int len;
233 if (s <= start) return (UChar* )s;
234 p = s;
236 while (!eucjp_islead(*p) && p > start) p--;
237 len = enclen(enc, p, s);
238 if (p + len > s) return (UChar* )p;
239 p += len;
240 return (UChar* )(p + ((s - p) & ~1));
243 static int
244 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
246 const UChar c = *s;
247 if (c <= 0x7e || c == 0x8e || c == 0x8f)
248 return TRUE;
249 else
250 return FALSE;
254 static int PropertyInited = 0;
255 static const OnigCodePoint** PropertyList;
256 static int PropertyListNum;
257 static int PropertyListSize;
258 static hash_table_type* PropertyNameTable;
260 static const OnigCodePoint CR_Hiragana[] = {
262 0xa4a1, 0xa4f3
263 }; /* CR_Hiragana */
265 static const OnigCodePoint CR_Katakana[] = {
267 0xa5a1, 0xa5f6,
268 0xaaa6, 0xaaaf,
269 0xaab1, 0xaadd
270 }; /* CR_Katakana */
272 static int
273 init_property_list(void)
275 int r;
277 PROPERTY_LIST_ADD_PROP("Hiragana", CR_Hiragana);
278 PROPERTY_LIST_ADD_PROP("Katakana", CR_Katakana);
279 PropertyInited = 1;
281 end:
282 return r;
285 static int
286 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
288 int ctype;
290 PROPERTY_LIST_INIT_CHECK;
292 if (onig_st_lookup_strend(PropertyNameTable, p, end, (void*)&ctype) == 0) {
293 return onigenc_minimum_property_name_to_ctype(enc, p, end);
296 return ctype;
299 static int
300 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
302 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
303 if (code < 128)
304 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
305 else {
306 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
307 return (code_to_mbclen(code, enc) > 1 ? TRUE : FALSE);
311 else {
312 PROPERTY_LIST_INIT_CHECK;
314 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
315 if (ctype >= (unsigned int )PropertyListNum)
316 return ONIGERR_TYPE_BUG;
318 return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
321 return FALSE;
324 static int
325 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
326 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
328 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
329 return ONIG_NO_SUPPORT_CONFIG;
331 else {
332 *sb_out = 0x80;
334 PROPERTY_LIST_INIT_CHECK;
336 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
337 if (ctype >= (OnigCtype )PropertyListNum)
338 return ONIGERR_TYPE_BUG;
340 *ranges = PropertyList[ctype];
341 return 0;
346 OnigEncodingDefine(euc_jp, EUC_JP) = {
347 mbc_enc_len,
348 "EUC-JP", /* name */
349 3, /* max enc length */
350 1, /* min enc length */
351 onigenc_is_mbc_newline_0x0a,
352 mbc_to_code,
353 code_to_mbclen,
354 code_to_mbc,
355 mbc_case_fold,
356 onigenc_ascii_apply_all_case_fold,
357 onigenc_ascii_get_case_fold_codes_by_str,
358 property_name_to_ctype,
359 is_code_ctype,
360 get_ctype_code_range,
361 left_adjust_char_head,
362 is_allowed_reverse_match,
366 * Name: EUC-JP
367 * MIBenum: 18
368 * Link: http://www.iana.org/assignments/character-sets
369 * Link: http://home.m05.itscom.net/numa/cde/sjis-euc/sjis-euc.html
371 ENC_ALIAS("eucJP", "EUC-JP"); /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */
374 * Name: eucJP-ms
375 * Link: http://home.m05.itscom.net/numa/cde/ucs-conv/ucs-conv.html
376 * Link: http://www2d.biglobe.ne.jp/~msyk/charcode/cp932/eucJP-ms.html
377 * Link: http://ja.wikipedia.org/wiki/EUC-JP
379 ENC_REPLICATE("eucJP-ms", "EUC-JP"); /* TOG/JVC CDE/Motif Technical WG */
380 ENC_ALIAS("euc-jp-ms", "EUC-JP");
383 * Name: CP51932
384 * Link: http://search.cpan.org/src/NARUSE/Encode-EUCJPMS-0.07/ucm/cp51932.ucm
385 * Link: http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
386 * Link: http://msyk.at.webry.info/200511/article_2.html
388 ENC_REPLICATE("CP51932", "EUC-JP");