add a test.
[ruby-svn.git] / regenc.c
blob5afa1807b9ebbb2dd78d8486a88edae1346e37f5
1 /**********************************************************************
2 regenc.c - Oniguruma (regular expression library)
3 **********************************************************************/
4 /*-
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
30 #include "regint.h"
/* Default character encoding for the library; returned by
 * onigenc_get_default_encoding() and replaced by
 * onigenc_set_default_encoding(). */
32 OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
/*
 * Encoding-layer initialization hook.
 * There is nothing to set up, so this always reports success (0).
 */
extern int
onigenc_init(void)
{
  return 0;
}
40 extern OnigEncoding
41 onigenc_get_default_encoding(void)
43 return OnigEncDefaultCharEncoding;
46 extern int
47 onigenc_set_default_encoding(OnigEncoding enc)
49 OnigEncDefaultCharEncoding = enc;
50 return 0;
53 extern int
54 onigenc_mbclen_approximate(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc)
56 int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e);
57 if (ONIGENC_MBCLEN_CHARFOUND_P(ret))
58 return ONIGENC_MBCLEN_CHARFOUND_LEN(ret);
59 else if (ONIGENC_MBCLEN_NEEDMORE_P(ret))
60 return e-p+ONIGENC_MBCLEN_NEEDMORE_LEN(ret);
61 return 1;
64 extern UChar*
65 onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
67 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
68 if (p < s) {
69 p += enclen(enc, p, s);
71 return p;
74 extern UChar*
75 onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
76 const UChar* start, const UChar* s, const UChar** prev)
78 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
80 if (p < s) {
81 if (prev) *prev = (const UChar* )p;
82 p += enclen(enc, p, s);
84 else {
85 if (prev) *prev = (const UChar* )NULL; /* Sorry */
87 return p;
90 extern UChar*
91 onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
93 if (s <= start)
94 return (UChar* )NULL;
96 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
99 extern UChar*
100 onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
102 while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
103 if (s <= start)
104 return (UChar* )NULL;
106 s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
108 return (UChar* )s;
111 extern UChar*
112 onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
114 UChar* q = (UChar* )p;
115 while (n-- > 0) {
116 q += ONIGENC_MBC_ENC_LEN(enc, q, end);
118 return (q <= end ? q : NULL);
121 extern int
122 onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
124 int n = 0;
125 UChar* q = (UChar* )p;
127 while (q < end) {
128 q += ONIGENC_MBC_ENC_LEN(enc, q, end);
129 n++;
131 return n;
134 extern int
135 onigenc_strlen_null(OnigEncoding enc, const UChar* s)
137 int n = 0;
138 UChar* p = (UChar* )s;
139 UChar* e = p + strlen((const char *)s);
141 while (1) {
142 if (*p == '\0') {
143 UChar* q;
144 int len = ONIGENC_MBC_MINLEN(enc);
146 if (len == 1) return n;
147 q = p + 1;
148 while (len > 1) {
149 if (*q != '\0') break;
150 q++;
151 len--;
153 if (len == 1) return n;
155 p += ONIGENC_MBC_ENC_LEN(enc, p, e);
156 n++;
160 extern int
161 onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
163 UChar* start = (UChar* )s;
164 UChar* p = (UChar* )s;
165 UChar* e = p + strlen((const char *)s);
167 while (1) {
168 if (*p == '\0') {
169 UChar* q;
170 int len = ONIGENC_MBC_MINLEN(enc);
172 if (len == 1) return (int )(p - start);
173 q = p + 1;
174 while (len > 1) {
175 if (*q != '\0') break;
176 q++;
177 len--;
179 if (len == 1) return (int )(p - start);
181 p += ONIGENC_MBC_ENC_LEN(enc, p, e);
/*
 * Byte-indexed ASCII lower-casing table (256 entries).
 * Entries \101-\132 ('A'-'Z') hold \141-\172 ('a'-'z'); every other entry,
 * including all values from \200 upward, maps to itself.
 */
185 const UChar OnigEncAsciiToLowerCaseTable[] = {
186 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
187 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
188 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
189 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
190 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
191 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
192 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
193 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
194 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
195 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
196 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
197 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
198 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
199 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
200 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
201 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
202 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
203 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
204 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
205 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
206 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
207 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
208 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
209 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
210 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
211 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
212 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
213 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
214 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
215 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
216 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
217 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
/*
 * Byte-indexed ASCII upper-casing table (256 entries), compiled only when
 * USE_UPPER_CASE_TABLE is defined.
 * Entries \141-\172 ('a'-'z') hold \101-\132 ('A'-'Z'); every other entry,
 * including all values from \200 upward, maps to itself.
 */
220 #ifdef USE_UPPER_CASE_TABLE
221 const UChar OnigEncAsciiToUpperCaseTable[256] = {
222 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
223 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
224 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
225 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
226 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
227 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
228 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
229 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
230 '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
231 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
232 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
233 '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
234 '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
235 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
236 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
237 '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
238 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
239 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
240 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
241 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
242 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
243 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
244 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
245 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
246 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
247 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
248 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
249 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
250 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
251 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
252 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
253 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
255 #endif
/*
 * Per-byte character-type bit masks, indexed by code value 0-255.
 * Entries 0-127 carry the classification bits for the ASCII characters;
 * entries 128-255 are all zero, i.e. non-ASCII bytes belong to no class.
 * NOTE(review): the meaning of the individual bits is defined outside this
 * file section (presumably the ONIGENC_CTYPE_* values) - confirm there
 * before editing any entry.
 */
257 const unsigned short OnigEncAsciiCtypeTable[256] = {
258 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
259 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
260 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
261 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
262 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
263 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
264 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
265 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
266 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
267 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
268 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
269 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
270 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
271 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
272 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
273 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
274 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
275 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
276 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
277 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
278 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
279 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
280 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
281 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
282 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
283 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
284 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
285 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
286 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
287 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
288 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
289 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
/*
 * Byte-indexed ISO-8859-1 lower-casing table (256 entries).
 * In addition to the ASCII mapping ('A'-'Z' -> 'a'-'z'), entries
 * \300-\336 hold \340-\376, except \327 which maps to itself.
 * \337 and every remaining entry also map to themselves.
 */
292 const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
293 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
294 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
295 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
296 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
297 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
298 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
299 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
300 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
301 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
302 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
303 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
304 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
305 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
306 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
307 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
308 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
309 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
310 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
311 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
312 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
313 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
314 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
315 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
316 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
317 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
318 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
319 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
320 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
321 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
322 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
323 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
324 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
/*
 * Byte-indexed ISO-8859-1 upper-casing table (256 entries), compiled only
 * when USE_UPPER_CASE_TABLE is defined.
 * In addition to the ASCII mapping ('a'-'z' -> 'A'-'Z'), entries
 * \340-\376 hold \300-\336, except \367 which maps to itself.
 * \377 and every remaining entry also map to themselves.
 */
327 #ifdef USE_UPPER_CASE_TABLE
328 const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
329 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
330 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
331 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
332 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
333 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
334 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
335 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
336 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
337 '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
338 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
339 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
340 '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
341 '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
342 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
343 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
344 '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
345 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
346 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
347 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
348 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
349 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
350 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
351 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
352 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
353 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
354 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
355 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
356 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
357 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
358 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
359 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
360 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
362 #endif
364 extern void
365 onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
367 /* nothing */
368 /* obsoleted. */
371 extern UChar*
372 onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
374 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
/*
 * Case pairs for the 26 ASCII letters: each entry is
 * { upper-case code (0x41-0x5a), lower-case code (0x61-0x7a) }.
 * onigenc_ascii_apply_all_case_fold() reads the two members as `from`
 * and `to` and reports every pair in both directions.
 */
377 const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
378 { 0x41, 0x61 },
379 { 0x42, 0x62 },
380 { 0x43, 0x63 },
381 { 0x44, 0x64 },
382 { 0x45, 0x65 },
383 { 0x46, 0x66 },
384 { 0x47, 0x67 },
385 { 0x48, 0x68 },
386 { 0x49, 0x69 },
387 { 0x4a, 0x6a },
388 { 0x4b, 0x6b },
389 { 0x4c, 0x6c },
390 { 0x4d, 0x6d },
391 { 0x4e, 0x6e },
392 { 0x4f, 0x6f },
393 { 0x50, 0x70 },
394 { 0x51, 0x71 },
395 { 0x52, 0x72 },
396 { 0x53, 0x73 },
397 { 0x54, 0x74 },
398 { 0x55, 0x75 },
399 { 0x56, 0x76 },
400 { 0x57, 0x77 },
401 { 0x58, 0x78 },
402 { 0x59, 0x79 },
403 { 0x5a, 0x7a }
406 extern int
407 onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
408 OnigApplyAllCaseFoldFunc f, void* arg,
409 OnigEncoding enc ARG_UNUSED)
411 OnigCodePoint code;
412 int i, r;
414 for (i = 0;
415 i < (int )(sizeof(OnigAsciiLowerMap)/sizeof(OnigPairCaseFoldCodes));
416 i++) {
417 code = OnigAsciiLowerMap[i].to;
418 r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
419 if (r != 0) return r;
421 code = OnigAsciiLowerMap[i].from;
422 r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
423 if (r != 0) return r;
426 return 0;
429 extern int
430 onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
431 const OnigUChar* p, const OnigUChar* end ARG_UNUSED, OnigCaseFoldCodeItem items[],
432 OnigEncoding enc ARG_UNUSED)
434 if (0x41 <= *p && *p <= 0x5a) {
435 items[0].byte_len = 1;
436 items[0].code_len = 1;
437 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
438 return 1;
440 else if (0x61 <= *p && *p <= 0x7a) {
441 items[0].byte_len = 1;
442 items[0].code_len = 1;
443 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
444 return 1;
446 else
447 return 0;
450 static int
451 ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
452 OnigApplyAllCaseFoldFunc f, void* arg)
454 OnigCodePoint ss[] = { 0x73, 0x73 };
456 return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
459 extern int
460 onigenc_apply_all_case_fold_with_map(int map_size,
461 const OnigPairCaseFoldCodes map[],
462 int ess_tsett_flag, OnigCaseFoldType flag,
463 OnigApplyAllCaseFoldFunc f, void* arg)
465 OnigCodePoint code;
466 int i, r;
468 r = onigenc_ascii_apply_all_case_fold(flag, f, arg, 0);
469 if (r != 0) return r;
471 for (i = 0; i < map_size; i++) {
472 code = map[i].to;
473 r = (*f)(map[i].from, &code, 1, arg);
474 if (r != 0) return r;
476 code = map[i].from;
477 r = (*f)(map[i].to, &code, 1, arg);
478 if (r != 0) return r;
481 if (ess_tsett_flag != 0)
482 return ss_apply_all_case_fold(flag, f, arg);
484 return 0;
487 extern int
488 onigenc_get_case_fold_codes_by_str_with_map(int map_size,
489 const OnigPairCaseFoldCodes map[],
490 int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
491 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
493 if (0x41 <= *p && *p <= 0x5a) {
494 items[0].byte_len = 1;
495 items[0].code_len = 1;
496 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
497 if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
498 && (*(p+1) == 0x53 || *(p+1) == 0x73)) {
499 /* SS */
500 items[1].byte_len = 2;
501 items[1].code_len = 1;
502 items[1].code[0] = (OnigCodePoint )0xdf;
503 return 2;
505 else
506 return 1;
508 else if (0x61 <= *p && *p <= 0x7a) {
509 items[0].byte_len = 1;
510 items[0].code_len = 1;
511 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
512 if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
513 && (*(p+1) == 0x73 || *(p+1) == 0x53)) {
514 /* ss */
515 items[1].byte_len = 2;
516 items[1].code_len = 1;
517 items[1].code[0] = (OnigCodePoint )0xdf;
518 return 2;
520 else
521 return 1;
523 else if (*p == 0xdf && ess_tsett_flag != 0) {
524 items[0].byte_len = 1;
525 items[0].code_len = 2;
526 items[0].code[0] = (OnigCodePoint )'s';
527 items[0].code[1] = (OnigCodePoint )'s';
529 items[1].byte_len = 1;
530 items[1].code_len = 2;
531 items[1].code[0] = (OnigCodePoint )'S';
532 items[1].code[1] = (OnigCodePoint )'S';
534 items[2].byte_len = 1;
535 items[2].code_len = 2;
536 items[2].code[0] = (OnigCodePoint )'s';
537 items[2].code[1] = (OnigCodePoint )'S';
539 items[3].byte_len = 1;
540 items[3].code_len = 2;
541 items[3].code[0] = (OnigCodePoint )'S';
542 items[3].code[1] = (OnigCodePoint )'s';
544 return 4;
546 else {
547 int i;
549 for (i = 0; i < map_size; i++) {
550 if (*p == map[i].from) {
551 items[0].byte_len = 1;
552 items[0].code_len = 1;
553 items[0].code[0] = map[i].to;
554 return 1;
556 else if (*p == map[i].to) {
557 items[0].byte_len = 1;
558 items[0].code_len = 1;
559 items[0].code[0] = map[i].from;
560 return 1;
565 return 0;
569 extern int
570 onigenc_not_support_get_ctype_code_range(OnigCtype ctype,
571 OnigCodePoint* sb_out, const OnigCodePoint* ranges[],
572 OnigEncoding enc)
574 return ONIG_NO_SUPPORT_CONFIG;
577 extern int
578 onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end, OnigEncoding enc ARG_UNUSED)
580 if (p < end) {
581 if (*p == 0x0a) return 1;
583 return 0;
586 /* for single byte encodings */
587 extern int
588 onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
589 const UChar*end, UChar* lower, OnigEncoding enc ARG_UNUSED)
591 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
593 (*p)++;
594 return 1; /* return byte length of converted char to lower */
/*
 * Disabled code (excluded from compilation by #if 0), kept for reference.
 * It advances *pp by one byte and reports whether the byte it skipped is
 * case-ambiguous according to ONIGENC_IS_ASCII_CODE_CASE_AMBIG.
 */
597 #if 0
598 extern int
599 onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag ARG_UNUSED,
600 const UChar** pp, const UChar* end ARG_UNUSED)
602 const UChar* p = *pp;
604 (*pp)++;
605 return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
607 #endif
609 extern int
610 onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED, const UChar* e ARG_UNUSED,
611 OnigEncoding enc ARG_UNUSED)
613 return 1;
616 extern OnigCodePoint
617 onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
618 OnigEncoding enc ARG_UNUSED)
620 return (OnigCodePoint )(*p);
623 extern int
624 onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
626 return 1;
629 extern int
630 onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
632 if (code > 0xff)
633 rb_raise(rb_eRangeError, "%"PRIdVALUE " out of char range", code);
634 *buf = (UChar )(code & 0xff);
635 return 1;
638 extern UChar*
639 onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED, const UChar* s,
640 OnigEncoding enc ARG_UNUSED)
642 return (UChar* )s;
645 extern int
646 onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED, const UChar* end ARG_UNUSED,
647 OnigEncoding enc ARG_UNUSED)
649 return TRUE;
652 extern int
653 onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED, const UChar* end ARG_UNUSED,
654 OnigEncoding enc ARG_UNUSED)
656 return FALSE;
659 extern int
660 onigenc_ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype,
661 OnigEncoding enc ARG_UNUSED)
663 if (code < 128)
664 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
665 else
666 return FALSE;
669 extern OnigCodePoint
670 onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
672 int c, i, len;
673 OnigCodePoint n;
675 len = enclen(enc, p, end);
676 n = (OnigCodePoint )(*p++);
677 if (len == 1) return n;
679 for (i = 1; i < len; i++) {
680 if (p >= end) break;
681 c = *p++;
682 n <<= 8; n += c;
684 return n;
687 extern int
688 onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
689 const UChar** pp, const UChar* end ARG_UNUSED,
690 UChar* lower)
692 int len;
693 const UChar *p = *pp;
695 if (ONIGENC_IS_MBC_ASCII(p)) {
696 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
697 (*pp)++;
698 return 1;
700 else {
701 int i;
703 len = enclen(enc, p, end);
704 for (i = 0; i < len; i++) {
705 *lower++ = *p++;
707 (*pp) += len;
708 return len; /* return byte length of converted to lower char */
/*
 * Disabled code (excluded from compilation by #if 0), kept for reference.
 * NOTE(review): the enclen(enc, p) call below passes two arguments, while
 * the live code in this file calls enclen with three (enc, p, end); this
 * block would need updating before it could be re-enabled.
 */
712 #if 0
713 extern int
714 onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
715 const UChar** pp ARG_UNUSED, const UChar* end ARG_UNUSED)
717 const UChar* p = *pp;
719 if (ONIGENC_IS_MBC_ASCII(p)) {
720 (*pp)++;
721 return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
724 (*pp) += enclen(enc, p);
725 return FALSE;
727 #endif
729 extern int
730 onigenc_mb2_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
732 if ((code & 0xff00) != 0) return 2;
733 else return 1;
736 extern int
737 onigenc_mb4_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
739 if ((code & 0xff000000) != 0) return 4;
740 else if ((code & 0xff0000) != 0) return 3;
741 else if ((code & 0xff00) != 0) return 2;
742 else return 1;
745 extern int
746 onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
748 UChar *p = buf;
750 if ((code & 0xff00) != 0) {
751 *p++ = (UChar )((code >> 8) & 0xff);
753 *p++ = (UChar )(code & 0xff);
755 #if 1
756 if (enclen(enc, buf, p) != (p - buf))
757 return ONIGERR_INVALID_CODE_POINT_VALUE;
758 #endif
759 return p - buf;
762 extern int
763 onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
765 UChar *p = buf;
767 if ((code & 0xff000000) != 0) {
768 *p++ = (UChar )((code >> 24) & 0xff);
770 if ((code & 0xff0000) != 0 || p != buf) {
771 *p++ = (UChar )((code >> 16) & 0xff);
773 if ((code & 0xff00) != 0 || p != buf) {
774 *p++ = (UChar )((code >> 8) & 0xff);
776 *p++ = (UChar )(code & 0xff);
778 #if 1
779 if (enclen(enc, buf, p) != (p - buf))
780 return ONIGERR_INVALID_CODE_POINT_VALUE;
781 #endif
782 return p - buf;
785 extern int
786 onigenc_minimum_property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
788 static const PosixBracketEntryType PBS[] = {
789 { (UChar* )"Alnum", ONIGENC_CTYPE_ALNUM, 5 },
790 { (UChar* )"Alpha", ONIGENC_CTYPE_ALPHA, 5 },
791 { (UChar* )"Blank", ONIGENC_CTYPE_BLANK, 5 },
792 { (UChar* )"Cntrl", ONIGENC_CTYPE_CNTRL, 5 },
793 { (UChar* )"Digit", ONIGENC_CTYPE_DIGIT, 5 },
794 { (UChar* )"Graph", ONIGENC_CTYPE_GRAPH, 5 },
795 { (UChar* )"Lower", ONIGENC_CTYPE_LOWER, 5 },
796 { (UChar* )"Print", ONIGENC_CTYPE_PRINT, 5 },
797 { (UChar* )"Punct", ONIGENC_CTYPE_PUNCT, 5 },
798 { (UChar* )"Space", ONIGENC_CTYPE_SPACE, 5 },
799 { (UChar* )"Upper", ONIGENC_CTYPE_UPPER, 5 },
800 { (UChar* )"XDigit", ONIGENC_CTYPE_XDIGIT, 6 },
801 { (UChar* )"ASCII", ONIGENC_CTYPE_ASCII, 5 },
802 { (UChar* )"Word", ONIGENC_CTYPE_WORD, 4 },
803 { (UChar* )NULL, -1, 0 }
806 const PosixBracketEntryType *pb;
807 int len;
809 len = onigenc_strlen(enc, p, end);
810 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
811 if (len == pb->len &&
812 onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
813 return pb->ctype;
816 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
819 extern int
820 onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
821 unsigned int ctype)
823 if (code < 128)
824 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
825 else {
826 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
827 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
831 return FALSE;
834 extern int
835 onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
836 unsigned int ctype)
838 if (code < 128)
839 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
840 else {
841 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
842 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
846 return FALSE;
849 extern int
850 onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
851 const UChar* sascii /* ascii */, int n)
853 int x, c;
855 while (n-- > 0) {
856 if (p >= end) return (int )(*sascii);
858 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
859 x = *sascii - c;
860 if (x) return x;
862 sascii++;
863 p += enclen(enc, p, end);
865 return 0;
868 /* Property management */
869 static int
870 resize_property_list(int new_size, const OnigCodePoint*** plist, int* psize)
872 int size;
873 const OnigCodePoint **list = *plist;
875 size = sizeof(OnigCodePoint*) * new_size;
876 if (IS_NULL(list)) {
877 list = (const OnigCodePoint** )xmalloc(size);
879 else {
880 list = (const OnigCodePoint** )xrealloc((void* )list, size);
883 if (IS_NULL(list)) return ONIGERR_MEMORY;
885 *plist = list;
886 *psize = new_size;
888 return 0;
891 extern int
892 onigenc_property_list_add_property(UChar* name, const OnigCodePoint* prop,
893 hash_table_type **table, const OnigCodePoint*** plist, int *pnum,
894 int *psize)
896 #define PROP_INIT_SIZE 16
898 int r;
900 if (*psize <= *pnum) {
901 int new_size = (*psize == 0 ? PROP_INIT_SIZE : *psize * 2);
902 r = resize_property_list(new_size, plist, psize);
903 if (r != 0) return r;
906 (*plist)[*pnum] = prop;
908 if (ONIG_IS_NULL(*table)) {
909 *table = onig_st_init_strend_table_with_size(PROP_INIT_SIZE);
910 if (ONIG_IS_NULL(*table)) return ONIGERR_MEMORY;
913 *pnum = *pnum + 1;
914 onig_st_insert_strend(*table, name, name + strlen((char* )name),
915 (hash_data_type )(*pnum + ONIGENC_MAX_STD_CTYPE));
916 return 0;
919 extern int
920 onigenc_property_list_init(int (*f)(void))
922 int r;
924 THREAD_ATOMIC_START;
926 r = f();
928 THREAD_ATOMIC_END;
929 return r;