* transcode_data.h (rb_transcoder_stateful_type_t): defined.
[ruby-svn.git] / enc / iso_8859_1.c
blobb73f8ca37954142daf99945e412db4628c435f6c
/**********************************************************************
  iso8859_1.c -  Oniguruma (regular expression library)
**********************************************************************/
/*-
 * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
30 #include "regenc.h"
/*
 * Evaluates to 1 when the EncISO_8859_1_CtypeTable entry for `code` has the
 * bit selected by CTYPE_TO_BIT(ctype) set, otherwise 0.
 * `code` is used directly as the table index, so callers must make sure it
 * is below 256 (the table holds 256 entries).
 */
#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \
  ((EncISO_8859_1_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
/*
 * Character-type bit masks for ISO-8859-1, one entry per byte value
 * (index == byte value).  Entries are tested against CTYPE_TO_BIT(ctype)
 * through ENC_IS_ISO_8859_1_CTYPE().  The trailing comment on each row gives
 * the byte value of the row's first entry.
 */
static const unsigned short EncISO_8859_1_CtypeTable[256] = {
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 0x00 */
  0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008, /* 0x08 */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 0x10 */
  0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, /* 0x18 */
  0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 0x20 */
  0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 0x28 */
  0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, /* 0x30 */
  0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, /* 0x38 */
  0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2, /* 0x40 */
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, /* 0x48 */
  0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, /* 0x50 */
  0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0, /* 0x58 */
  0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2, /* 0x60 */
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, /* 0x68 */
  0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, /* 0x70 */
  0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008, /* 0x78 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x80 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x88 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x90 */
  0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, /* 0x98 */
  0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, /* 0xa0 */
  0x00a0, 0x00a0, 0x30e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, /* 0xa8 */
  0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x30e2, 0x00a0, 0x01a0, /* 0xb0 */
  0x00a0, 0x10a0, 0x30e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, /* 0xb8 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* 0xc0 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, /* 0xc8 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0, /* 0xd0 */
  0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2, /* 0xd8 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* 0xe0 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, /* 0xe8 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0, /* 0xf0 */
  0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2  /* 0xf8 */
};
70 static const OnigPairCaseFoldCodes CaseFoldMap[] = {
71 { 0xc0, 0xe0 },
72 { 0xc1, 0xe1 },
73 { 0xc2, 0xe2 },
74 { 0xc3, 0xe3 },
75 { 0xc4, 0xe4 },
76 { 0xc5, 0xe5 },
77 { 0xc6, 0xe6 },
78 { 0xc7, 0xe7 },
79 { 0xc8, 0xe8 },
80 { 0xc9, 0xe9 },
81 { 0xca, 0xea },
82 { 0xcb, 0xeb },
83 { 0xcc, 0xec },
84 { 0xcd, 0xed },
85 { 0xce, 0xee },
86 { 0xcf, 0xef },
88 { 0xd0, 0xf0 },
89 { 0xd1, 0xf1 },
90 { 0xd2, 0xf2 },
91 { 0xd3, 0xf3 },
92 { 0xd4, 0xf4 },
93 { 0xd5, 0xf5 },
94 { 0xd6, 0xf6 },
95 { 0xd8, 0xf8 },
96 { 0xd9, 0xf9 },
97 { 0xda, 0xfa },
98 { 0xdb, 0xfb },
99 { 0xdc, 0xfc },
100 { 0xdd, 0xfd },
101 { 0xde, 0xfe }
104 static int
105 apply_all_case_fold(OnigCaseFoldType flag,
106 OnigApplyAllCaseFoldFunc f, void* arg,
107 OnigEncoding enc ARG_UNUSED)
109 return onigenc_apply_all_case_fold_with_map(
110 sizeof(CaseFoldMap)/sizeof(OnigPairCaseFoldCodes), CaseFoldMap, 1,
111 flag, f, arg);
114 static int
115 get_case_fold_codes_by_str(OnigCaseFoldType flag,
116 const OnigUChar* p, const OnigUChar* end,
117 OnigCaseFoldCodeItem items[],
118 OnigEncoding enc ARG_UNUSED)
120 if (0x41 <= *p && *p <= 0x5a) {
121 items[0].byte_len = 1;
122 items[0].code_len = 1;
123 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
124 if (*p == 0x53 && end > p + 1
125 && (*(p+1) == 0x53 || *(p+1) == 0x73)) { /* SS */
126 items[1].byte_len = 2;
127 items[1].code_len = 1;
128 items[1].code[0] = (OnigCodePoint )0xdf;
129 return 2;
131 else
132 return 1;
134 else if (0x61 <= *p && *p <= 0x7a) {
135 items[0].byte_len = 1;
136 items[0].code_len = 1;
137 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
138 if (*p == 0x73 && end > p + 1
139 && (*(p+1) == 0x73 || *(p+1) == 0x53)) { /* ss */
140 items[1].byte_len = 2;
141 items[1].code_len = 1;
142 items[1].code[0] = (OnigCodePoint )0xdf;
143 return 2;
145 else
146 return 1;
148 else if (0xc0 <= *p && *p <= 0xcf) {
149 items[0].byte_len = 1;
150 items[0].code_len = 1;
151 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
152 return 1;
154 else if (0xd0 <= *p && *p <= 0xdf) {
155 if (*p == 0xdf) {
156 items[0].byte_len = 1;
157 items[0].code_len = 2;
158 items[0].code[0] = (OnigCodePoint )'s';
159 items[0].code[1] = (OnigCodePoint )'s';
161 items[1].byte_len = 1;
162 items[1].code_len = 2;
163 items[1].code[0] = (OnigCodePoint )'S';
164 items[1].code[1] = (OnigCodePoint )'S';
166 items[2].byte_len = 1;
167 items[2].code_len = 2;
168 items[2].code[0] = (OnigCodePoint )'s';
169 items[2].code[1] = (OnigCodePoint )'S';
171 items[3].byte_len = 1;
172 items[3].code_len = 2;
173 items[3].code[0] = (OnigCodePoint )'S';
174 items[3].code[1] = (OnigCodePoint )'s';
176 return 4;
178 else if (*p != 0xd7) {
179 items[0].byte_len = 1;
180 items[0].code_len = 1;
181 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
182 return 1;
185 else if (0xe0 <= *p && *p <= 0xef) {
186 items[0].byte_len = 1;
187 items[0].code_len = 1;
188 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
189 return 1;
191 else if (0xf0 <= *p && *p <= 0xfe) {
192 if (*p != 0xf7) {
193 items[0].byte_len = 1;
194 items[0].code_len = 1;
195 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
196 return 1;
200 return 0;
203 static int
204 mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED,
205 UChar* lower, OnigEncoding enc ARG_UNUSED)
207 const UChar* p = *pp;
209 if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
210 *lower++ = 's';
211 *lower = 's';
212 (*pp)++;
213 return 2;
216 *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
217 (*pp)++;
218 return 1;
#if 0
/*
 * Disabled (compiled out).  Reports whether the byte at *pp has a case
 * counterpart, advancing *pp by one byte.
 *
 * Returns TRUE for 0xdf when INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR is set,
 * FALSE for lower-case bytes in 0xaa..0xba, TRUE for any other byte flagged
 * upper or lower in EncISO_8859_1_CtypeTable, and FALSE otherwise.
 */
static int
is_mbc_ambiguous(OnigCaseFoldType flag,
                 const UChar** pp, const UChar* end)
{
  int v;
  const UChar* p = *pp;

  if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
    (*pp)++;
    return TRUE;
  }

  (*pp)++;
  v = (EncISO_8859_1_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
  /* Was `(v | BIT_CTYPE_LOWER) != 0`, which holds for every byte as long as
   * BIT_CTYPE_LOWER is a nonzero mask and made the final return
   * unreachable; the lower-case test needs `&`. */
  if ((v & BIT_CTYPE_LOWER) != 0) {
    /* 0xdf, 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
    /* NOTE(review): the range test below covers 0xaa..0xba only, so 0xdf
     * still yields TRUE here when the multi-char flag is off -- confirm
     * the intended result before re-enabling this function. */
    if (*p >= 0xaa && *p <= 0xba)
      return FALSE;
    else
      return TRUE;
  }

  return (v != 0 ? TRUE : FALSE);
}
#endif
248 static int
249 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
251 if (code < 256)
252 return ENC_IS_ISO_8859_1_CTYPE(code, ctype);
253 else
254 return FALSE;
257 OnigEncodingDefine(iso_8859_1, ISO_8859_1) = {
258 onigenc_single_byte_mbc_enc_len,
259 "ISO-8859-1", /* name */
260 1, /* max enc length */
261 1, /* min enc length */
262 onigenc_is_mbc_newline_0x0a,
263 onigenc_single_byte_mbc_to_code,
264 onigenc_single_byte_code_to_mbclen,
265 onigenc_single_byte_code_to_mbc,
266 mbc_case_fold,
267 apply_all_case_fold,
268 get_case_fold_codes_by_str,
269 onigenc_minimum_property_name_to_ctype,
270 is_code_ctype,
271 onigenc_not_support_get_ctype_code_range,
272 onigenc_single_byte_left_adjust_char_head,
273 onigenc_always_true_is_allowed_reverse_match
275 ENC_ALIAS("ISO8859-1", "ISO-8859-1")
278 * Name: windows-1252
279 * MIBenum: 2252
280 * Link: http://www.iana.org/assignments/character-sets
281 * Link: http://www.microsoft.com/globaldev/reference/sbcs/1252.mspx
282 * Link: http://en.wikipedia.org/wiki/Windows-1252
284 ENC_REPLICATE("Windows-1252", "ISO-8859-1")
285 ENC_ALIAS("CP1252", "Windows-1252")