1 /**********************************************************************
2 iso8859_1.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/*
 * Evaluates to 1 when the character-type table entry for `code` has the
 * bit selected by CTYPE_TO_BIT(ctype) set, and to 0 otherwise.
 * `code` is used directly as the table index.
 */
#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \
  (0 != (CTYPE_TO_BIT(ctype) & EncISO_8859_1_CtypeTable[code]))
/*
 * Character-type flag table: one unsigned short of flag bits per code
 * value, 256 entries, indexed directly by the code/byte value.
 * ENC_IS_ISO_8859_1_CTYPE() tests an entry against CTYPE_TO_BIT(ctype),
 * and is_mbc_ambiguous() masks an entry with
 * BIT_CTYPE_UPPER | BIT_CTYPE_LOWER.
 * Every row below holds 8 consecutive entries; the range comments give
 * the index span covered by each group of four rows.
 */
35 static const unsigned short EncISO_8859_1_CtypeTable
[256] = {
/* indices 0x00 - 0x1f */
36 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
37 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
38 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
39 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
/* indices 0x20 - 0x3f */
40 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
41 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
42 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
43 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
/* indices 0x40 - 0x5f */
44 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
45 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
46 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
47 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
/* indices 0x60 - 0x7f */
48 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
49 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
50 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
51 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
/* indices 0x80 - 0x9f */
52 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
53 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
54 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
55 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
/* indices 0xa0 - 0xbf */
56 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
57 0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0,
58 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0,
59 0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
/* indices 0xc0 - 0xdf (entry 0xd7 differs from its neighbours) */
60 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
61 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
62 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
63 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
/* indices 0xe0 - 0xff (entry 0xf7 differs from its neighbours) */
64 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
65 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
66 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
67 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
/*
 * Array of OnigPairCaseFoldCodes entries.  apply_all_case_fold() passes
 * this array and its element count to
 * onigenc_apply_all_case_fold_with_map().
 * NOTE(review): the initializer entries are not visible in this view;
 * presumably each entry pairs an upper-case code with its lower-case
 * counterpart -- confirm against the full file.
 */
70 static const OnigPairCaseFoldCodes CaseFoldMap
[] = {
/*
 * Thin wrapper around onigenc_apply_all_case_fold_with_map() for this
 * encoding's CaseFoldMap.
 *
 * flag, f, arg - received from the caller.
 *                NOTE(review): the trailing arguments of the call are not
 *                visible in this view; presumably these three are forwarded
 *                unchanged -- confirm.
 * enc          - unused (ARG_UNUSED).
 *
 * Returns whatever onigenc_apply_all_case_fold_with_map() returns.
 */
105 apply_all_case_fold(OnigCaseFoldType flag
,
106 OnigApplyAllCaseFoldFunc f
, void* arg
,
107 OnigEncoding enc ARG_UNUSED
)
/* First argument is the element count of CaseFoldMap, then the map
 * itself, then the literal 1. */
109 return onigenc_apply_all_case_fold_with_map(
110 sizeof(CaseFoldMap
)/sizeof(OnigPairCaseFoldCodes
), CaseFoldMap
, 1,
/*
 * Fills items[] with the case-fold alternatives for the input starting at
 * p.  The branch taken depends on the value of the byte *p.
 *
 * flag  - not referenced in the visible lines.
 * p     - current input position; *p is always read, *(p+1) only after
 *         `end > p + 1` has been checked.
 * end   - end of the input.
 * items - output array; each filled entry gets byte_len (input bytes
 *         covered), code_len (number of codes) and code[].
 * enc   - unused (ARG_UNUSED).
 *
 * NOTE(review): the return statements are not visible in this view;
 * presumably the function returns the number of items filled -- confirm.
 */
115 get_case_fold_codes_by_str(OnigCaseFoldType flag
,
116 const OnigUChar
* p
, const OnigUChar
* end
,
117 OnigCaseFoldCodeItem items
[],
118 OnigEncoding enc ARG_UNUSED
)
/* 0x41-0x5a ('A'-'Z'): the alternative is the byte + 0x20. */
120 if (0x41 <= *p
&& *p
<= 0x5a) {
121 items
[0].byte_len
= 1;
122 items
[0].code_len
= 1;
123 items
[0].code
[0] = (OnigCodePoint
)(*p
+ 0x20);
/* 'S' followed by 'S' or 's': add a second item that covers both input
 * bytes and maps them to the single code 0xdf. */
124 if (*p
== 0x53 && end
> p
+ 1
125 && (*(p
+1) == 0x53 || *(p
+1) == 0x73)) { /* SS */
126 items
[1].byte_len
= 2;
127 items
[1].code_len
= 1;
128 items
[1].code
[0] = (OnigCodePoint
)0xdf;
/* 0x61-0x7a ('a'-'z'): the alternative is the byte - 0x20. */
134 else if (0x61 <= *p
&& *p
<= 0x7a) {
135 items
[0].byte_len
= 1;
136 items
[0].code_len
= 1;
137 items
[0].code
[0] = (OnigCodePoint
)(*p
- 0x20);
/* 's' followed by 's' or 'S': same two-byte -> 0xdf item as above. */
138 if (*p
== 0x73 && end
> p
+ 1
139 && (*(p
+1) == 0x73 || *(p
+1) == 0x53)) { /* ss */
140 items
[1].byte_len
= 2;
141 items
[1].code_len
= 1;
142 items
[1].code
[0] = (OnigCodePoint
)0xdf;
/* 0xc0-0xcf: the alternative is the byte + 0x20. */
148 else if (0xc0 <= *p
&& *p
<= 0xcf) {
149 items
[0].byte_len
= 1;
150 items
[0].code_len
= 1;
151 items
[0].code
[0] = (OnigCodePoint
)(*p
+ 0x20);
/* 0xd0-0xdf.  The four items that follow each cover one input byte and
 * expand to two codes: "ss", "SS", "sS" and "Ss".
 * NOTE(review): the guard that selects this path is not visible in this
 * view; presumably `*p == 0xdf`, the byte the S/s branches above map
 * "ss" to -- confirm. */
154 else if (0xd0 <= *p
&& *p
<= 0xdf) {
156 items
[0].byte_len
= 1;
157 items
[0].code_len
= 2;
158 items
[0].code
[0] = (OnigCodePoint
)'s';
159 items
[0].code
[1] = (OnigCodePoint
)'s';
161 items
[1].byte_len
= 1;
162 items
[1].code_len
= 2;
163 items
[1].code
[0] = (OnigCodePoint
)'S';
164 items
[1].code
[1] = (OnigCodePoint
)'S';
166 items
[2].byte_len
= 1;
167 items
[2].code_len
= 2;
168 items
[2].code
[0] = (OnigCodePoint
)'s';
169 items
[2].code
[1] = (OnigCodePoint
)'S';
171 items
[3].byte_len
= 1;
172 items
[3].code_len
= 2;
173 items
[3].code
[0] = (OnigCodePoint
)'S';
174 items
[3].code
[1] = (OnigCodePoint
)'s';
/* Other bytes of this range, except 0xd7: the alternative is the
 * byte + 0x20.  0xd7 produces no item here. */
178 else if (*p
!= 0xd7) {
179 items
[0].byte_len
= 1;
180 items
[0].code_len
= 1;
181 items
[0].code
[0] = (OnigCodePoint
)(*p
+ 0x20);
/* 0xe0-0xef: the alternative is the byte - 0x20. */
185 else if (0xe0 <= *p
&& *p
<= 0xef) {
186 items
[0].byte_len
= 1;
187 items
[0].code_len
= 1;
188 items
[0].code
[0] = (OnigCodePoint
)(*p
- 0x20);
/* 0xf0-0xfe: the alternative is the byte - 0x20.
 * NOTE(review): a line between this condition and the assignments is not
 * visible; presumably it excludes 0xf7, mirroring the 0xd7 exclusion
 * above -- confirm. */
191 else if (0xf0 <= *p
&& *p
<= 0xfe) {
193 items
[0].byte_len
= 1;
194 items
[0].code_len
= 1;
195 items
[0].code
[0] = (OnigCodePoint
)(*p
- 0x20);
/*
 * Case-folds the byte at **pp into the buffer `lower`.
 *
 * flag  - case-fold options; only INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
 *         is tested in the visible lines.
 * pp    - in: address of the current input pointer.
 * end   - unused (ARG_UNUSED).
 * lower - out: receives the folded byte(s).
 * enc   - unused (ARG_UNUSED).
 *
 * NOTE(review): the body of the 0xdf branch, the updates of *pp and the
 * return statements are not visible in this view; presumably the function
 * advances *pp and returns the number of bytes written -- confirm.
 */
204 mbc_case_fold(OnigCaseFoldType flag
, const UChar
** pp
, const UChar
* end ARG_UNUSED
,
205 UChar
* lower
, OnigEncoding enc ARG_UNUSED
)
207 const UChar
* p
= *pp
;
/* 0xdf is handled separately, but only when multi-character folding is
 * enabled in flag. */
209 if (*p
== 0xdf && (flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) != 0) {
/* General case: one output byte obtained from the lower-case macro. */
216 *lower
= ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p
);
/*
 * Decides from the character-type table whether the byte at **pp is
 * case-ambiguous, yielding TRUE or FALSE.
 *
 * flag - case-fold options; only INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
 *        is tested in the visible lines.
 * pp   - in: address of the current input pointer.
 * end  - not referenced in the visible lines.
 *
 * The final return yields TRUE when the table entry has BIT_CTYPE_UPPER or
 * BIT_CTYPE_LOWER set.  Lower-case bytes get an extra range check first.
 *
 * NOTE(review): the declaration of v, the body of the 0xdf branch, the
 * updates of *pp and the results of the 0xaa-0xba check are not visible in
 * this view.
 */
223 is_mbc_ambiguous(OnigCaseFoldType flag
,
224 const UChar
** pp
, const UChar
* end
)
227 const UChar
* p
= *pp
;
/* 0xdf is handled separately, but only when multi-character folding is
 * enabled in flag. */
229 if (*p
== 0xdf && (flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) != 0) {
/* v keeps only the upper/lower bits of the table entry. */
235 v
= (EncISO_8859_1_CtypeTable
[*p
] & (BIT_CTYPE_UPPER
| BIT_CTYPE_LOWER
));
/* Test the LOWER bit with '&'.  The previous '|' made this condition true
 * for every byte (assuming BIT_CTYPE_LOWER is a non-zero mask), which left
 * the final return below unreachable. */
236 if ((v
& BIT_CTYPE_LOWER
) != 0) {
237 /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
238 if (*p
>= 0xaa && *p
<= 0xba)
/* Not lower case: ambiguous only if the UPPER bit is set. */
244 return (v
!= 0 ? TRUE
: FALSE
);
/*
 * Tests whether `code` belongs to character type `ctype` by looking it up
 * in the character-type table through ENC_IS_ISO_8859_1_CTYPE().
 *
 * code  - code point to classify; used as the table index.
 * ctype - character-type identifier, converted to a bit by the macro.
 * enc   - unused (ARG_UNUSED).
 *
 * NOTE(review): the table has 256 entries and the lines around the return
 * are not visible in this view; confirm that `code` is range-checked
 * (code < 256) before the lookup.
 */
249 is_code_ctype(OnigCodePoint code
, unsigned int ctype
, OnigEncoding enc ARG_UNUSED
)
252 return ENC_IS_ISO_8859_1_CTYPE(code
, ctype
);
/*
 * Encoding definition for ISO-8859-1, built with OnigEncodingDefine().
 * It gives the encoding name, a maximum and minimum encoded length of 1,
 * and the functions that implement the encoding.  Most entries are shared
 * onigenc_* helpers; get_case_fold_codes_by_str is specific to this file.
 *
 * NOTE(review): some initializer lines are not visible in this view: the
 * two entries before get_case_fold_codes_by_str and the one after
 * onigenc_minimum_property_name_to_ctype.  Presumably they are
 * mbc_case_fold, apply_all_case_fold and is_code_ctype -- confirm.
 */
257 OnigEncodingDefine(iso_8859_1
, ISO_8859_1
) = {
258 onigenc_single_byte_mbc_enc_len
,
259 "ISO-8859-1", /* name */
260 1, /* max enc length */
261 1, /* min enc length */
262 onigenc_is_mbc_newline_0x0a
,
263 onigenc_single_byte_mbc_to_code
,
264 onigenc_single_byte_code_to_mbclen
,
265 onigenc_single_byte_code_to_mbc
,
268 get_case_fold_codes_by_str
,
269 onigenc_minimum_property_name_to_ctype
,
271 onigenc_not_support_get_ctype_code_range
,
272 onigenc_single_byte_left_adjust_char_head
,
273 onigenc_always_true_is_allowed_reverse_match
/* ENC_ALIAS is defined outside this file; presumably this makes
 * "ISO8859-1" an alternate name for the "ISO-8859-1" encoding -- confirm. */
275 ENC_ALIAS("ISO8859-1", "ISO-8859-1")
280 * Link: http://www.iana.org/assignments/character-sets
281 * Link: http://www.microsoft.com/globaldev/reference/sbcs/1252.mspx
282 * Link: http://en.wikipedia.org/wiki/Windows-1252
/* ENC_REPLICATE and ENC_ALIAS are defined outside this file; presumably
 * "Windows-1252" is created as a copy of the "ISO-8859-1" encoding and
 * "CP1252" becomes an alternate name for it -- confirm. */
284 ENC_REPLICATE("Windows-1252", "ISO-8859-1")
285 ENC_ALIAS("CP1252", "Windows-1252")