* transcode_data.h (rb_transcoder_stateful_type_t): defined.
[ruby-svn.git] / enc / utf_8.c
blobb8ee92feb4414eb1de77b19a396ba27b4c8e1143
/**********************************************************************
  utf_8.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "regenc.h"
/*
 * When USE_INVALID_CODE_SCHEME is defined, the bytes 0xfe and 0xff are
 * mapped to the two reserved code values below instead of being passed
 * through as ordinary code points (see mbc_to_code / code_to_mbc).
 */
#define USE_INVALID_CODE_SCHEME

#ifdef USE_INVALID_CODE_SCHEME
/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
#define INVALID_CODE_FE   0xfffffffe
#define INVALID_CODE_FF   0xffffffff
#define VALID_CODE_LIMIT  0x7fffffff
#endif

/*
 * Non-zero when byte `c` is NOT of the form 10xxxxxx, i.e. it is not a
 * UTF-8 continuation byte and can therefore start a character.
 */
#define utf8_islead(c)     ((UChar )((c) & 0xc0) != 0x80)
/*
 * Nominal sequence length, indexed by the first byte of a sequence.
 *
 *   0x00-0xbf -> 1   (ASCII, and stray continuation bytes)
 *   0xc0-0xdf -> 2
 *   0xe0-0xef -> 3
 *   0xf0-0xf4 -> 4
 *   0xf5-0xff -> 1
 *
 * mbc_enc_len() uses it only to report how many bytes are still missing
 * from a truncated sequence; validity is decided by the trans[] table.
 */
static const int EncLen_UTF8[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xf0 */ 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
/*
 * States of the UTF-8 validation automaton driven by trans[].
 *
 * The two negative values are terminal results; S0..S7 double as row
 * indices into trans[], so their numeric values (0..7) are significant.
 */
typedef enum {
  FAILURE = -2,   /* byte sequence is not valid */
  ACCEPT  = -1,   /* a complete character has been recognised */
  S0 = 0,         /* start state: classifies the first byte */
  S1 = 1,
  S2 = 2,
  S3 = 3,
  S4 = 4,
  S5 = 5,
  S6 = 6,
  S7 = 7
} state_t;
68 #define A ACCEPT
69 #define F FAILURE
70 static const signed char trans[][0x100] = {
71 { /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
72 /* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
73 /* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
74 /* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
75 /* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
76 /* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
77 /* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
78 /* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
79 /* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
80 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
81 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
82 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
83 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
84 /* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
85 /* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
86 /* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
87 /* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
89 { /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
90 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
91 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
92 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
93 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
94 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
95 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
96 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
97 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
98 /* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
99 /* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
100 /* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
101 /* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
102 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
103 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
104 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
105 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
107 { /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
109 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
110 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
111 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
112 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
113 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
114 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
115 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
116 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
117 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
118 /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
119 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
120 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
121 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
122 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
123 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
125 { /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */
126 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
127 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
128 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
129 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
130 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
131 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
132 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
133 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
134 /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
135 /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
136 /* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
137 /* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
138 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
139 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
140 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
141 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
143 { /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */
144 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
145 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
146 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
147 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
148 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
149 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
150 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
151 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
152 /* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
153 /* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
155 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
156 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
157 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
158 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
159 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
161 { /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */
162 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
163 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
164 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
165 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
166 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
167 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
168 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
169 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
170 /* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
171 /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
172 /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
173 /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
174 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
175 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
176 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
177 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
179 { /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */
180 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
181 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
182 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
183 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
184 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
185 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
186 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
187 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
188 /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
189 /* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
190 /* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
191 /* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
192 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
193 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
194 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
195 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
197 { /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */
198 /* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
199 /* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
200 /* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
201 /* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
202 /* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
203 /* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
204 /* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
205 /* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
206 /* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
207 /* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
208 /* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
209 /* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
210 /* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
211 /* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
212 /* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
213 /* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
216 #undef A
217 #undef F
219 static int
220 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
222 int firstbyte = *p++;
223 state_t s;
224 s = trans[0][firstbyte];
225 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
226 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
228 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
229 s = trans[s][*p++];
230 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
231 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
233 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
234 s = trans[s][*p++];
235 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
236 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
238 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
239 s = trans[s][*p++];
240 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
241 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
244 static int
245 is_mbc_newline(const UChar* p, const UChar* end, OnigEncoding enc)
247 if (p < end) {
248 if (*p == 0x0a) return 1;
250 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
251 #ifndef USE_CRNL_AS_LINE_TERMINATOR
252 if (*p == 0x0d) return 1;
253 #endif
254 if (p + 1 < end) {
255 if (*(p+1) == 0x85 && *p == 0xc2) /* U+0085 */
256 return 1;
257 if (p + 2 < end) {
258 if ((*(p+2) == 0xa8 || *(p+2) == 0xa9)
259 && *(p+1) == 0x80 && *p == 0xe2) /* U+2028, U+2029 */
260 return 1;
263 #endif
266 return 0;
269 static OnigCodePoint
270 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
272 int c, len;
273 OnigCodePoint n;
275 len = enclen(enc, p, end);
276 c = *p++;
277 if (len > 1) {
278 len--;
279 n = c & ((1 << (6 - len)) - 1);
280 while (len--) {
281 c = *p++;
282 n = (n << 6) | (c & ((1 << 6) - 1));
284 return n;
286 else {
287 #ifdef USE_INVALID_CODE_SCHEME
288 if (c > 0xfd) {
289 return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF);
291 #endif
292 return (OnigCodePoint )c;
296 static int
297 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
299 if ((code & 0xffffff80) == 0) return 1;
300 else if ((code & 0xfffff800) == 0) return 2;
301 else if ((code & 0xffff0000) == 0) return 3;
302 else if ((code & 0xffe00000) == 0) return 4;
303 else if ((code & 0xfc000000) == 0) return 5;
304 else if ((code & 0x80000000) == 0) return 6;
305 #ifdef USE_INVALID_CODE_SCHEME
306 else if (code == INVALID_CODE_FE) return 1;
307 else if (code == INVALID_CODE_FF) return 1;
308 #endif
309 else
310 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
313 static int
314 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
316 #define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80)
317 #define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80)
319 if ((code & 0xffffff80) == 0) {
320 *buf = (UChar )code;
321 return 1;
323 else {
324 UChar *p = buf;
326 if ((code & 0xfffff800) == 0) {
327 *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0);
329 else if ((code & 0xffff0000) == 0) {
330 *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0);
331 *p++ = UTF8_TRAILS(code, 6);
333 else if ((code & 0xffe00000) == 0) {
334 *p++ = (UChar )(((code>>18) & 0x07) | 0xf0);
335 *p++ = UTF8_TRAILS(code, 12);
336 *p++ = UTF8_TRAILS(code, 6);
338 else if ((code & 0xfc000000) == 0) {
339 *p++ = (UChar )(((code>>24) & 0x03) | 0xf8);
340 *p++ = UTF8_TRAILS(code, 18);
341 *p++ = UTF8_TRAILS(code, 12);
342 *p++ = UTF8_TRAILS(code, 6);
344 else if ((code & 0x80000000) == 0) {
345 *p++ = (UChar )(((code>>30) & 0x01) | 0xfc);
346 *p++ = UTF8_TRAILS(code, 24);
347 *p++ = UTF8_TRAILS(code, 18);
348 *p++ = UTF8_TRAILS(code, 12);
349 *p++ = UTF8_TRAILS(code, 6);
351 #ifdef USE_INVALID_CODE_SCHEME
352 else if (code == INVALID_CODE_FE) {
353 *p = 0xfe;
354 return 1;
356 else if (code == INVALID_CODE_FF) {
357 *p = 0xff;
358 return 1;
360 #endif
361 else {
362 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
365 *p++ = UTF8_TRAIL0(code);
366 return p - buf;
370 static int
371 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
372 const UChar* end, UChar* fold, OnigEncoding enc)
374 const UChar* p = *pp;
376 if (ONIGENC_IS_MBC_ASCII(p)) {
377 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
378 if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
379 if (*p == 0x49) {
380 *fold++ = 0xc4;
381 *fold = 0xb1;
382 (*pp)++;
383 return 2;
386 #endif
388 *fold = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
389 (*pp)++;
390 return 1; /* return byte length of converted char to lower */
392 else {
393 return onigenc_unicode_mbc_case_fold(enc, flag, pp, end, fold);
398 static int
399 get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out,
400 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
402 *sb_out = 0x80;
403 return onigenc_unicode_ctype_code_range(ctype, ranges);
407 static UChar*
408 left_adjust_char_head(const UChar* start, const UChar* s, OnigEncoding enc ARG_UNUSED)
410 const UChar *p;
412 if (s <= start) return (UChar* )s;
413 p = s;
415 while (!utf8_islead(*p) && p > start) p--;
416 return (UChar* )p;
419 static int
420 get_case_fold_codes_by_str(OnigCaseFoldType flag,
421 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[],
422 OnigEncoding enc)
424 return onigenc_unicode_get_case_fold_codes_by_str(enc, flag, p, end, items);
427 OnigEncodingDefine(utf_8, UTF_8) = {
428 mbc_enc_len,
429 "UTF-8", /* name */
430 6, /* max byte length */
431 1, /* min byte length */
432 is_mbc_newline,
433 mbc_to_code,
434 code_to_mbclen,
435 code_to_mbc,
436 mbc_case_fold,
437 onigenc_unicode_apply_all_case_fold,
438 get_case_fold_codes_by_str,
439 onigenc_unicode_property_name_to_ctype,
440 onigenc_unicode_is_code_ctype,
441 get_ctype_code_range,
442 left_adjust_char_head,
443 onigenc_always_true_is_allowed_reverse_match
445 ENC_ALIAS("CP65001", "UTF-8")
448 * Name: UTF8-MAC
449 * Link: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/BPFileSystem.html
450 * Link: http://developer.apple.com/qa/qa2001/qa1235.html
451 * Link: http://developer.apple.com/jp/qa/qa2001/qa1235.html
453 ENC_REPLICATE("UTF8-MAC", "UTF-8")
454 ENC_ALIAS("UTF-8-MAC", "UTF8-MAC")