1 /**********************************************************************
2 euc_jp.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/*
 * Nonzero when byte c lies outside the range 0xa1..0xfe.
 * Subtracting 0xa1 and narrowing to UChar makes values below 0xa1 wrap
 * around to large numbers, so a single comparison checks both bounds
 * (assumes UChar is an unsigned 8-bit type -- TODO confirm).
 */
33 #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
35 static const int EncLen_EUCJP
[] = {
36 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
37 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
38 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
39 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
40 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
43 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
44 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
45 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
46 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
47 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
48 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
49 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
50 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
51 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
/*
 * Scanner states used together with the trans table.
 * FAILURE and ACCEPT are the negative, terminal results: mbc_enc_len tests
 * `s < 0` and then compares against ACCEPT. S0 is 0, the row consulted for
 * the first byte. S1 and S2 take the values 1 and 2, which also appear as
 * entries in trans (presumably as next-row indices -- TODO confirm).
 */
54 typedef enum { FAILURE
= -2, ACCEPT
= -1, S0
= 0, S1
, S2
} state_t
;
57 static const signed char trans
[][0x100] = {
58 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
59 /* 0 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
60 /* 1 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
61 /* 2 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
62 /* 3 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
63 /* 4 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
64 /* 5 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
65 /* 6 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
66 /* 7 */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
67 /* 8 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, 1, 2,
68 /* 9 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
69 /* a */ F
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
70 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
71 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
72 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
73 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
74 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
76 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
77 /* 0 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
78 /* 1 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
79 /* 2 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
80 /* 3 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
81 /* 4 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
82 /* 5 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
83 /* 6 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
84 /* 7 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
85 /* 8 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
86 /* 9 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
87 /* a */ F
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
88 /* b */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
89 /* c */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
90 /* d */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
91 /* e */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
,
92 /* f */ A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, A
, F
94 { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
95 /* 0 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
96 /* 1 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
97 /* 2 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
98 /* 3 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
99 /* 4 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
100 /* 5 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
101 /* 6 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
102 /* 7 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
103 /* 8 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
104 /* 9 */ F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
,
105 /* a */ F
, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107 /* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109 /* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110 /* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
/*
 * Classify the character that starts at p; e marks the end of the input.
 *
 * The first byte is consumed and looked up in row 0 of trans. Whenever the
 * state is negative the scan is over: ACCEPT yields
 * ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) for the n bytes consumed so far,
 * any other negative state yields ONIGENC_CONSTRUCT_MBCLEN_INVALID().
 * If the input is exhausted (p == e) before a terminal state is reached,
 * ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE() is returned with the expected length
 * EncLen_EUCJP[firstbyte] minus the number of bytes already consumed.
 * After a third byte only ACCEPT counts as success.
 */
118 mbc_enc_len(const UChar
* p
, const UChar
* e
, OnigEncoding enc ARG_UNUSED
)
120 int firstbyte
= *p
++;
122 s
= trans
[0][firstbyte
];
123 if (s
< 0) return s
== ACCEPT
? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
124 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
125 if (p
== e
) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP
[firstbyte
]-1);
127 if (s
< 0) return s
== ACCEPT
? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
128 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
129 if (p
== e
) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP
[firstbyte
]-2);
131 return s
== ACCEPT
? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
132 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
136 mbc_to_code(const UChar
* p
, const UChar
* end
, OnigEncoding enc
)
141 len
= enclen(enc
, p
, end
);
142 n
= (OnigCodePoint
)*p
++;
143 if (len
== 1) return n
;
145 for (i
= 1; i
< len
; i
++) {
/*
 * Number of bytes needed to encode code.
 *   - 1 when ONIGENC_IS_CODE_ASCII(code) holds;
 *   - 0 when code is larger than 0xffffff;
 *   - 3 when the byte at bits 16..23 is >= 0x80;
 *   - 2 when the byte at bits 8..15 is >= 0x80;
 *   - ONIGERR_INVALID_CODE_POINT_VALUE for everything else.
 * Note that the oversized case returns 0 rather than the error code, so
 * callers cannot rely on a negative value alone to detect bad input.
 * enc is not used.
 */
154 code_to_mbclen(OnigCodePoint code
, OnigEncoding enc ARG_UNUSED
)
156 if (ONIGENC_IS_CODE_ASCII(code
)) return 1;
157 else if (code
> 0xffffff) return 0;
158 else if ((code
& 0xff0000) >= 0x800000) return 3;
159 else if ((code
& 0xff00) >= 0x8000) return 2;
161 return ONIGERR_INVALID_CODE_POINT_VALUE
;
166 code_to_mbc_first(OnigCodePoint code
)
170 if ((code
& 0xff0000) != 0) {
171 first
= (code
>> 16) & 0xff;
173 else if ((code
& 0xff00) != 0) {
174 first
= (code
>> 8) & 0xff;
184 code_to_mbc(OnigCodePoint code
, UChar
*buf
, OnigEncoding enc
)
188 if ((code
& 0xff0000) != 0) *p
++ = (UChar
)(((code
>> 16) & 0xff));
189 if ((code
& 0xff00) != 0) *p
++ = (UChar
)(((code
>> 8) & 0xff));
190 *p
++ = (UChar
)(code
& 0xff);
193 if (enclen(enc
, buf
, p
) != (p
- buf
))
194 return ONIGERR_INVALID_CODE_POINT_VALUE
;
200 mbc_case_fold(OnigCaseFoldType flag
,
201 const UChar
** pp
, const UChar
* end
, UChar
* lower
,
205 const UChar
* p
= *pp
;
207 if (ONIGENC_IS_MBC_ASCII(p
)) {
208 *lower
= ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p
);
215 len
= enclen(enc
, p
, end
);
216 for (i
= 0; i
< len
; i
++) {
220 return len
; /* return byte length of converted char to lower */
225 left_adjust_char_head(const UChar
* start
, const UChar
* s
, OnigEncoding enc
)
228 mb-trail bytes don't mix with single bytes.
233 if (s
<= start
) return (UChar
* )s
;
236 while (!eucjp_islead(*p
) && p
> start
) p
--;
237 len
= enclen(enc
, p
, s
);
238 if (p
+ len
> s
) return (UChar
* )p
;
240 return (UChar
* )(p
+ ((s
- p
) & ~1));
244 is_allowed_reverse_match(const UChar
* s
, const UChar
* end
, OnigEncoding enc ARG_UNUSED
)
247 if (c
<= 0x7e || c
== 0x8e || c
== 0x8f)
/*
 * File-local state for the encoding-specific character properties.
 *   PropertyList[i]   - code-range table for the ctype whose value is
 *                       ONIGENC_MAX_STD_CTYPE + 1 + i; read by
 *                       is_code_ctype and get_ctype_code_range.
 *   PropertyListNum   - upper bound used to validate i before indexing.
 *   PropertyNameTable - table searched by property name in
 *                       property_name_to_ctype.
 *   PropertyInited, PropertyListSize - not referenced directly by the
 *                       functions in this file; presumably maintained by
 *                       the PROPERTY_LIST_* macros -- TODO confirm.
 */
254 static int PropertyInited
= 0;
255 static const OnigCodePoint
** PropertyList
;
256 static int PropertyListNum
;
257 static int PropertyListSize
;
258 static hash_table_type
* PropertyNameTable
;
260 static const OnigCodePoint CR_Hiragana
[] = {
265 static const OnigCodePoint CR_Katakana
[] = {
273 init_property_list(void)
277 PROPERTY_LIST_ADD_PROP("Hiragana", CR_Hiragana
);
278 PROPERTY_LIST_ADD_PROP("Katakana", CR_Katakana
);
286 property_name_to_ctype(OnigEncoding enc
, UChar
* p
, UChar
* end
)
290 PROPERTY_LIST_INIT_CHECK
;
292 if (onig_st_lookup_strend(PropertyNameTable
, p
, end
, (void*)&ctype
) == 0) {
293 return onigenc_minimum_property_name_to_ctype(enc
, p
, end
);
300 is_code_ctype(OnigCodePoint code
, unsigned int ctype
, OnigEncoding enc ARG_UNUSED
)
302 if (ctype
<= ONIGENC_MAX_STD_CTYPE
) {
304 return ONIGENC_IS_ASCII_CODE_CTYPE(code
, ctype
);
306 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype
)) {
307 return (code_to_mbclen(code
, enc
) > 1 ? TRUE
: FALSE
);
312 PROPERTY_LIST_INIT_CHECK
;
314 ctype
-= (ONIGENC_MAX_STD_CTYPE
+ 1);
315 if (ctype
>= (unsigned int )PropertyListNum
)
316 return ONIGERR_TYPE_BUG
;
318 return onig_is_in_code_range((UChar
* )PropertyList
[ctype
], code
);
/*
 * Look up the code-range table for a property ctype.
 *
 * Standard ctypes (ctype <= ONIGENC_MAX_STD_CTYPE) are not served here and
 * produce ONIG_NO_SUPPORT_CONFIG. For any other ctype the property list is
 * initialised on demand (PROPERTY_LIST_INIT_CHECK), ctype is rebased by
 * ONIGENC_MAX_STD_CTYPE + 1 to form an index into PropertyList, and an
 * index at or beyond PropertyListNum is reported as ONIGERR_TYPE_BUG.
 * On success the selected table is stored through ranges.
 * enc is not used.
 */
325 get_ctype_code_range(OnigCtype ctype
, OnigCodePoint
* sb_out
,
326 const OnigCodePoint
* ranges
[], OnigEncoding enc ARG_UNUSED
)
328 if (ctype
<= ONIGENC_MAX_STD_CTYPE
) {
329 return ONIG_NO_SUPPORT_CONFIG
;
334 PROPERTY_LIST_INIT_CHECK
;
336 ctype
-= (ONIGENC_MAX_STD_CTYPE
+ 1);
337 if (ctype
>= (OnigCtype
)PropertyListNum
)
338 return ONIGERR_TYPE_BUG
;
340 *ranges
= PropertyList
[ctype
];
346 OnigEncodingDefine(euc_jp
, EUC_JP
) = {
349 3, /* max enc length */
350 1, /* min enc length */
351 onigenc_is_mbc_newline_0x0a
,
356 onigenc_ascii_apply_all_case_fold
,
357 onigenc_ascii_get_case_fold_codes_by_str
,
358 property_name_to_ctype
,
360 get_ctype_code_range
,
361 left_adjust_char_head
,
362 is_allowed_reverse_match
,
368 * Link: http://www.iana.org/assignments/character-sets
369 * Link: http://home.m05.itscom.net/numa/cde/sjis-euc/sjis-euc.html
371 ENC_ALIAS("eucJP", "EUC-JP"); /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */
375 * Link: http://home.m05.itscom.net/numa/cde/ucs-conv/ucs-conv.html
376 * Link: http://www2d.biglobe.ne.jp/~msyk/charcode/cp932/eucJP-ms.html
377 * Link: http://ja.wikipedia.org/wiki/EUC-JP
379 ENC_REPLICATE("eucJP-ms", "EUC-JP"); /* TOG/JVC CDE/Motif Technical WG */
380 ENC_ALIAS("euc-jp-ms", "EUC-JP");
384 * Link: http://search.cpan.org/src/NARUSE/Encode-EUCJPMS-0.07/ucm/cp51932.ucm
385 * Link: http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
386 * Link: http://msyk.at.webry.info/200511/article_2.html
388 ENC_REPLICATE("CP51932", "EUC-JP");