Prefer #include <...> for system headers.
[libiconv.git] / lib / iso2022_jpms.h
blob2c424054e628480531f4a1162fcfb882380d1c29
1 /*
2 * Copyright (C) 1999-2001, 2008, 2011-2012, 2016, 2018 Free Software Foundation, Inc.
3 * This file is part of the GNU LIBICONV Library.
5 * The GNU LIBICONV Library is free software; you can redistribute it
6 * and/or modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either version 2.1
8 * of the License, or (at your option) any later version.
10 * The GNU LIBICONV Library is distributed in the hope that it will be
11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17 * If not, see <https://www.gnu.org/licenses/>.
21 * ISO-2022-JP-MS
22 * alias CP50221
24 * This is an extension of ISO-2022-JP-1 with larger character sets.
25 * It uses ESC $ B and ESC $ ( D to denote *extensions* of JIS X 0208 and
26 * JIS X 0212, respectively. This violates the principles of ISO 2022,
27 * where
28 * 1. character sets to be used by ISO 2022 have to be registered at the
29 * ISO IR registry <https://www.itscj.ipsj.or.jp/ISO-IR/>,
30 * 2. different character sets are designated by different escape
31 * sequences.
32 * It's a typical instance of the "embrace and extend" strategy by Microsoft
33 * <https://en.wikipedia.org/wiki/Embrace,_extend_and_extinguish>.
37 * Windows has three encodings CP50220, CP50221, CP50222.
38 * The common parts are:
39 * - US-ASCII (0x00..0x7F)
40 * - JIS X 0208 extended by
41 * - one row (0x2D),
42 * - a private use area (rows 0x75..0x7E = U+E000..U+E3AB),
43 * enabled with ESC $ B, disabled with ESC ( B.
44 * - JIS X 0212 extended by
45 * - two rows (0x73..0x74),
46 * - a private use area (rows 0x75..0x7E = U+E3AC..U+E757),
47 * enabled with ESC $ ( D, disabled with ESC ( B.
48 * They differ in the handling of JIS X 0201 characters (halfwidth Katakana)
49 * in the conversion direction Unicode -> CP5022x:
50 * * CP50220 maps the halfwidth Katakana to fullwidth Katakana characters.
51 * * CP50221 contains the JIS X 0201 halfwidth Katakana characters,
52 * enabled with ESC ( I, disabled with ESC ( B.
53 * * CP50222 contains the JIS X 0201 halfwidth Katakana characters,
54 * enabled with ESC ( J 0x0E, disabled with ESC ( B.
55 * In the conversion direction CP5022x -> Unicode, all three operate the same:
56 * - ESC ( I is supported and understood.
57 * - ESC ( J 0x0E is not accepted. (Tested on Windows XP SP3.)
58 * Conclusion:
59 * - CP50222 should not be used, because the multibyte sequence that it
60 * produces cannot be parsed by any of the three encodings.
61 * - CP50221 is preferable to CP50220, because it can faithfully represent
62 * the halfwidth Katakana characters.
63 * We therefore implement CP50221. As an extension, in the mbtowc conversion
64 * direction, we support also ESC ( J 0x0E, just in case.
67 #include "cp50221_0208_ext.h"
68 #include "cp50221_0212_ext.h"
/* Control bytes that the decoder treats specially. */
70 #define ESC 0x1b /* starts a character set designation sequence */
71 #define SO 0x0e /* after Esc ( J: switch from JIS X 0201 Roman to Katakana */
72 #define SI 0x0f /* switch from JIS X 0201 Katakana back to Roman */
75 * The state can be one of the following values.
/* Shift states; the decoder keeps its state in conv->istate and the
   encoder in conv->ostate.  The trailing comment on each line names the
   escape sequence that selects the state. */
77 #define STATE_ASCII 0 /* Esc ( B */
78 #define STATE_JISX0201ROMAN 1 /* Esc ( J */ /* only in mbtowc direction */
79 #define STATE_JISX0201KATAKANA 2 /* Esc ( I */
80 #define STATE_JISX0208MS 3 /* Esc $ @ or Esc $ B */
81 #define STATE_JISX0212MS 4 /* Esc $ ( D */
/* Decodes one character from the ISO-2022-JP-MS byte sequence s[0..n-1]
   and stores it in *pwc.

   Escape sequences and SO/SI bytes in front of the character are consumed
   first; they only change the shift state, which starts out as
   conv->istate.  The character itself is then decoded according to the
   resulting state and occupies one byte (ASCII, JIS X 0201) or two bytes
   (JIS X 0208 / JIS X 0212 and their extensions).

   Returns
     - the total number of bytes consumed (escape/shift bytes plus the
       character bytes) on success,
     - RET_TOOFEW(count) when the input ends before a complete character,
       where count is the number of escape/shift bytes already consumed,
     - RET_SHIFT_ILSEQ(count) when an invalid sequence is met.
   In all three cases conv->istate is updated to the state reached so far.

   NOTE(review): the first byte is read before any length check, so this
   assumes n >= 1 on entry - confirm against the callers. */
83 static int
84 iso2022_jpms_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
86 state_t state = conv->istate;
87 int count = 0;
88 unsigned char c;
/* Skip over designation escape sequences and shift bytes. */
89 for (;;) {
90 c = *s;
91 if (c == ESC) {
/* The shortest escape sequences accepted here are 3 bytes long. */
92 if (n < count+3)
93 goto none;
/* Esc ( F : designation of a single-byte character set. */
94 if (s[1] == '(') {
95 if (s[2] == 'B') {
96 state = STATE_ASCII;
97 s += 3; count += 3;
98 if (n < count+1)
99 goto none;
100 continue;
102 if (s[2] == 'I') {
103 state = STATE_JISX0201KATAKANA;
104 s += 3; count += 3;
105 if (n < count+1)
106 goto none;
107 continue;
109 if (s[2] == 'J') {
110 state = STATE_JISX0201ROMAN;
111 s += 3; count += 3;
112 if (n < count+1)
113 goto none;
114 continue;
/* Any other final byte after Esc ( is rejected. */
116 goto ilseq;
/* Esc $ ... : designation of a double-byte character set. */
118 if (s[1] == '$') {
119 if (s[2] == '@' || s[2] == 'B') {
120 /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
121 state = STATE_JISX0208MS;
122 s += 3; count += 3;
123 if (n < count+1)
124 goto none;
125 continue;
/* Esc $ ( D is 4 bytes long, so one more byte is needed. */
127 if (s[2] == '(') {
128 if (n < count+4)
129 goto none;
130 if (s[3] == 'D') {
131 state = STATE_JISX0212MS;
132 s += 4; count += 4;
133 if (n < count+1)
134 goto none;
135 continue;
138 goto ilseq;
140 goto ilseq;
/* SO changes the state only when coming from JIS X 0201 Roman; in every
   other state the byte is consumed without effect. */
142 if (c == SO) {
143 if (state == STATE_JISX0201ROMAN)
144 state = STATE_JISX0201KATAKANA;
145 s += 1; count += 1;
146 if (n < count+1)
147 goto none;
148 continue;
/* SI changes the state only when coming from JIS X 0201 Katakana; in every
   other state the byte is consumed without effect. */
150 if (c == SI) {
151 if (state == STATE_JISX0201KATAKANA)
152 state = STATE_JISX0201ROMAN;
153 s += 1; count += 1;
154 if (n < count+1)
155 goto none;
156 continue;
/* c is the first byte of the character proper. */
158 break;
/* Decode the character according to the current shift state.  Every
   state accepts only 7-bit bytes. */
160 switch (state) {
161 case STATE_ASCII:
162 if (c < 0x80) {
163 int ret = ascii_mbtowc(conv,pwc,s,1);
164 if (ret == RET_ILSEQ)
165 goto ilseq;
166 if (ret != 1) abort();
167 conv->istate = state;
168 return count+1;
169 } else
170 goto ilseq;
171 case STATE_JISX0201ROMAN:
172 if (c < 0x80) {
173 int ret = jisx0201_mbtowc(conv,pwc,s,1);
174 if (ret == RET_ILSEQ)
175 goto ilseq;
176 if (ret != 1) abort();
177 conv->istate = state;
178 return count+1;
179 } else
180 goto ilseq;
181 case STATE_JISX0201KATAKANA:
182 if (c < 0x80) {
/* The Katakana arrive in 7-bit form; jisx0201_mbtowc is given the byte
   with 0x80 added. */
183 unsigned char buf = c+0x80;
184 int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
185 if (ret == RET_ILSEQ)
186 goto ilseq;
187 if (ret != 1) abort();
188 conv->istate = state;
189 return count+1;
190 } else
191 goto ilseq;
192 case STATE_JISX0208MS:
193 if (n < count+2)
194 goto none;
195 if (s[0] < 0x80 && s[1] < 0x80) {
196 int ret;
197 if (s[0] < 0x75) {
198 if (s[0] == 0x2d) {
199 /* Extension of JIS X 0208. */
200 if (s[1] >= 0x21 && s[1] <= 0x79) {
/* The extension table is addressed by a single byte: the column within
   row 0x2D, counted from 1. */
201 unsigned char i = (s[1] - 0x21) + 1;
202 ret = cp50221_0208_ext_mbtowc(conv,pwc,&i,1);
/* One table byte stands for two input bytes. */
203 if (ret == 1)
204 ret = 2;
205 } else
206 ret = RET_ILSEQ;
207 } else {
208 /* JIS X 0208. */
209 ret = jisx0208_mbtowc(conv,pwc,s,2);
211 } else {
212 /* Extension of JIS X 0208.
213 0x{75..7E}{21..7E} maps to U+E000..U+E3AB.
214 But some rows map to characters present in CP932. */
215 if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
/* 0xfffd marks "no entry"; in that case the private use code point is
   used instead. */
216 unsigned short wc = 0xfffd;
217 if (s[0] >= 0x79 && s[0] <= 0x7c)
218 wc = cp932ext_2uni_pageed[(s[0] - 0x79) * 94 + (s[1] - 0x21)];
219 if (wc == 0xfffd)
220 wc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe000;
221 *pwc = wc;
222 ret = 2;
223 } else
224 ret = RET_ILSEQ;
226 if (ret == RET_ILSEQ)
227 goto ilseq;
/* Any result other than 2 or RET_ILSEQ is treated as an internal error. */
228 if (ret != 2) abort();
229 conv->istate = state;
230 return count+2;
231 } else
232 goto ilseq;
233 case STATE_JISX0212MS:
234 if (n < count+2)
235 goto none;
236 if (s[0] < 0x80 && s[1] < 0x80) {
237 int ret;
238 if (s[0] < 0x73) {
239 /* JIS X 0212. */
240 ret = jisx0212_mbtowc(conv,pwc,s,2);
241 } else {
242 if (s[0] < 0x75) {
243 /* Extension of JIS X 0212. */
244 if (s[1] >= 0x21 && s[1] <= 0x7e) {
/* Rows 0x73..0x74 are flattened to a single 1-based index (at most 188,
   so it fits into one byte). */
245 unsigned char i = (s[0] - 0x73) * 94 + (s[1] - 0x21) + 1;
246 ret = cp50221_0212_ext_mbtowc(conv,pwc,&i,1);
247 if (ret == 1)
248 ret = 2;
249 } else
250 ret = RET_ILSEQ;
251 } else {
252 /* Extension of JIS X 0212.
253 0x{75..7E}{21..7E} maps to U+E3AC..U+E757. */
254 if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
255 *pwc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe3ac;
256 ret = 2;
257 } else
258 ret = RET_ILSEQ;
261 if (ret == RET_ILSEQ)
262 goto ilseq;
263 if (ret != 2) abort();
264 conv->istate = state;
265 return count+2;
266 } else
267 goto ilseq;
268 default: abort();
/* Incomplete input: remember the state reached so far and report how many
   escape/shift bytes were consumed. */
271 none:
272 conv->istate = state;
273 return RET_TOOFEW(count);
/* Invalid input: the state reached so far is remembered as well. */
275 ilseq:
276 conv->istate = state;
277 return RET_SHIFT_ILSEQ(count);
/* Encodes the character wc into the buffer r, which has room for n bytes.

   The character sets are tried in this order: ASCII, JIS X 0201 Katakana,
   JIS X 0208 with its extensions, JIS X 0212 with its extensions.  When
   the chosen set differs from the current output state (conv->ostate),
   the matching escape sequence is written in front of the character and
   conv->ostate is updated.

   Returns
     - the number of bytes written (escape sequence included) on success,
     - RET_TOOSMALL when n is too small; in that case nothing has been
       written and conv->ostate is unchanged,
     - RET_ILUNI when wc is not found in any of the character sets. */
280 static int
281 iso2022_jpms_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
283 state_t state = conv->ostate;
284 unsigned char buf[2];
285 int ret;
287 /* Try ASCII. */
288 ret = ascii_wctomb(conv,buf,wc,1);
289 if (ret != RET_ILUNI) {
290 if (ret != 1) abort();
291 if (buf[0] < 0x80) {
/* 1 byte for the character, plus 3 bytes for Esc ( B if needed. */
292 int count = (state == STATE_ASCII ? 1 : 4);
293 if (n < count)
294 return RET_TOOSMALL;
295 if (state != STATE_ASCII) {
296 r[0] = ESC;
297 r[1] = '(';
298 r[2] = 'B';
299 r += 3;
300 state = STATE_ASCII;
302 r[0] = buf[0];
303 conv->ostate = state;
304 return count;
308 /* Try JIS X 0201-1976 Katakana. */
309 ret = jisx0201_wctomb(conv,buf,wc,1);
310 if (ret != RET_ILUNI) {
311 if (ret != 1) abort();
/* Only results >= 0x80 are emitted here; smaller results fall through to
   the character sets tried below. */
312 if (buf[0] >= 0x80) {
/* 1 byte for the character, plus 3 bytes for Esc ( I if needed. */
313 int count = (state == STATE_JISX0201KATAKANA ? 1 : 4);
314 if (n < count)
315 return RET_TOOSMALL;
316 if (state != STATE_JISX0201KATAKANA) {
317 r[0] = ESC;
318 r[1] = '(';
319 r[2] = 'I';
320 r += 3;
321 state = STATE_JISX0201KATAKANA;
/* The Katakana are written in 7-bit form. */
323 r[0] = buf[0]-0x80;
324 conv->ostate = state;
325 return count;
329 /* Try JIS X 0208-1990, in place of JIS X 0208-1978 and JIS X 0208-1983,
330 and the extensions mentioned above. */
/* U+E000..U+E3AB are laid out over rows 0x75..0x7E, 94 columns per row. */
331 if (wc >= 0xe000 && wc < 0xe3ac) {
332 unsigned short i = wc - 0xe000;
333 buf[0] = (i / 94) + 0x75;
334 buf[1] = (i % 94) + 0x21;
335 ret = 2;
336 } else {
337 ret = jisx0208_wctomb(conv,buf,wc,2);
338 if (ret == RET_ILUNI) {
339 /* Extension of JIS X 0208. */
340 unsigned char i;
341 ret = cp50221_0208_ext_wctomb(conv,&i,wc,1);
342 if (ret == 1) {
/* i is a 1-based column within row 0x2D. */
343 buf[0] = 0x2d;
344 buf[1] = i-1 + 0x21;
345 ret = 2;
/* Three characters are mapped individually into rows 0x7a and 0x7c.
   NOTE(review): presumably these mirror entries of the CP932 extension
   table consulted in the mbtowc direction - confirm. */
346 } else if (wc == 0x663B) {
347 buf[0] = 0x7a;
348 buf[1] = 0x36;
349 ret = 2;
350 } else if (wc == 0xffe2) {
351 buf[0] = 0x7c;
352 buf[1] = 0x7b;
353 ret = 2;
354 } else if (wc == 0xffe4) {
355 buf[0] = 0x7c;
356 buf[1] = 0x7c;
357 ret = 2;
361 if (ret != RET_ILUNI) {
362 if (ret != 2) abort();
363 if (buf[0] < 0x80 && buf[1] < 0x80) {
/* 2 bytes for the character, plus 3 bytes for Esc $ B if needed. */
364 int count = (state == STATE_JISX0208MS ? 2 : 5);
365 if (n < count)
366 return RET_TOOSMALL;
367 if (state != STATE_JISX0208MS) {
368 r[0] = ESC;
369 r[1] = '$';
370 r[2] = 'B';
371 r += 3;
372 state = STATE_JISX0208MS;
374 r[0] = buf[0];
375 r[1] = buf[1];
376 conv->ostate = state;
377 return count;
381 /* Try JIS X 0212-1990 and the extensions mentioned above. */
/* U+E3AC..U+E757 are laid out over rows 0x75..0x7E, 94 columns per row. */
382 if (wc >= 0xe3ac && wc < 0xe758) {
383 unsigned short i = wc - 0xe3ac;
384 buf[0] = (i / 94) + 0x75;
385 buf[1] = (i % 94) + 0x21;
386 ret = 2;
387 } else {
388 ret = jisx0212_wctomb(conv,buf,wc,2);
389 if (ret == RET_ILUNI) {
390 /* Extension of JIS X 0212. */
391 unsigned char i;
392 ret = cp50221_0212_ext_wctomb(conv,&i,wc,1);
393 if (ret == 1) {
/* i is a 1-based index over rows 0x73..0x74; make it 0-based before
   splitting it into row and column. */
394 i -= 1;
395 buf[0] = (i / 94) + 0x73;
396 buf[1] = (i % 94) + 0x21;
397 ret = 2;
401 if (ret != RET_ILUNI) {
402 if (ret != 2) abort();
403 if (buf[0] < 0x80 && buf[1] < 0x80) {
/* 2 bytes for the character, plus 4 bytes for Esc $ ( D if needed. */
404 int count = (state == STATE_JISX0212MS ? 2 : 6);
405 if (n < count)
406 return RET_TOOSMALL;
407 if (state != STATE_JISX0212MS) {
408 r[0] = ESC;
409 r[1] = '$';
410 r[2] = '(';
411 r[3] = 'D';
412 r += 4;
413 state = STATE_JISX0212MS;
415 r[0] = buf[0];
416 r[1] = buf[1];
417 conv->ostate = state;
418 return count;
/* None of the character sets contains wc. */
422 return RET_ILUNI;
425 static int
426 iso2022_jpms_reset (conv_t conv, unsigned char *r, size_t n)
428 state_t state = conv->ostate;
429 if (state != STATE_ASCII) {
430 if (n < 3)
431 return RET_TOOSMALL;
432 r[0] = ESC;
433 r[1] = '(';
434 r[2] = 'B';
435 /* conv->ostate = 0; will be done by the caller */
436 return 3;
437 } else
438 return 0;
/* The STATE_* macros are only needed by the functions above and are
   removed again here.
   NOTE(review): presumably so that other converter headers can define
   their own state names - confirm. */
441 #undef STATE_JISX0212MS
442 #undef STATE_JISX0208MS
443 #undef STATE_JISX0201KATAKANA
444 #undef STATE_JISX0201ROMAN
445 #undef STATE_ASCII