Prefer #include <...> for system headers.
[libiconv.git] / lib / isoir165.h
blob28f47e3cfa06d60400daaaa7e7fc2b71b8388d9b
1 /*
2 * Copyright (C) 1999-2001, 2005, 2012, 2016 Free Software Foundation, Inc.
3 * This file is part of the GNU LIBICONV Library.
5 * The GNU LIBICONV Library is free software; you can redistribute it
6 * and/or modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either version 2.1
8 * of the License, or (at your option) any later version.
10 * The GNU LIBICONV Library is distributed in the hope that it will be
11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17 * If not, see <https://www.gnu.org/licenses/>.
21 * ISO-IR-165
25 * ISO-IR-165 is an extension of GB 2312, consisting of:
26 * 1. GB 6345.1-86 corrections:
27 * Two corrections to GB 2312, at 0x2367 and 0x6F71.
28 * 2. GB 6345.1-86 additions:
29 * - 6 new full-width pinyin characters in row 0x28.
30 * - ISO646-CN in row 0x2A.
31 * - 32 half-width pinyin characters in row 0x2B.
32 * 3. GB 8565.2-88 additions:
33 * - 50 characters in row 0x2D.
34 * - 92 characters in row 0x2E.
35 * - 93 characters in row 0x2F.
36 * - 470 characters in rows 0x7A-0x7E.
37 * 4. ISO-IR-165 additions:
38 * - 22 characters in row 0x26.
39 * - 94 characters in row 0x2C.
40 * - 44 new characters in row 0x2D.
41 * - 1 new character in row 0x2F.
43 * The conversion table was created from the following sources:
44 * Ad 1. The 0x2367 correction is already integrated in the unicode.org
45 * GB2312.TXT table. The 0x6F71 mapping is the same in the unicode.org
46 * GB2312.TXT and UNIHAN.TXT table and in Koichi Yasuoka's Uni2GB table,
47 * so we assume it's correct.
48 * The unicode.org UNIHAN.TXT table about GB 8565 is not usable: it has
49 * extraneous code points at rows 0x28, 0x2C, 0x2D. Note also that it does
50 * not list the 69 non-hanzi in row 0x2F. Moreover, it has the characters
51 * 0x2F7A-0x2F7D shifted down by one to 0x2F79-0x2F7C.
52 * Therefore we take the GB8565 and ISO-IR-165 data from Koichi Yasuoka's
53 * Uni2GB table.
54 * Ad 1. Yasuoka maps 0x2367 to U+0261 (small script g) and 0x2840 to U+FF47
55 * (full-width small normal g). While coherent with ISO-IR's 165.pdf,
56 * this disagrees with Ken Lunde's book: He says that ISO-IR-165
57 * includes the GB6345 correction, i.e. maps 0x2367 to U+FF47 or U+0067
58 * and _not_ to U+0261 (small script g).
59 * To overcome the confusion, we just map both 0x2367 and 0x2840 to
60 * U+FF47.
61 * Ad 2. Row 0x28: Add a mapping from 0x283F to U+01F9.
62 * Row 0x2A: Mapping is well-known, also present in Koichi Yasuoka's
63 * table.
64 * Row 0x2B: Typed in by hand from appendix E in Ken Lunde's book.
65 * When converting from Unicode to ISO-IR-165, prefer the half-width
66 * range 0x2B{21..40} to the full-width range 0x28{21..40}.
67 * Ad 3. Rows 0x2D, 0x2E: Both Koichi Yasuoka's Uni2GB table and the UNIHAN.TXT
68 * data for GB 8565 agree here.
69 * Row 0x2F: Taken from Koichi Yasuoka's Uni2GB table.
70 * Rows 0x7A-0x7E: Koichi Yasuoka's Uni2GB table and the UNIHAN.TXT
71 * data for GB 8565 agree here mostly. Differences:
72 * 0x7C38 -> U+6F26 or U+527A ? We choose U+6F26.
73 * 0x7C5A -> U+7A40 or U+6996 ? We choose U+6996.
74 * Ad 4. Row 0x26: Mapping unknown.
75 * Rows 0x2C, 0x2D: Both Koichi Yasuoka's Uni2GB table and the UNIHAN.TXT
76 * data for GB 8565 (!) agree here.
77 * Row 0x2F: Taken from Koichi Yasuoka's Uni2GB table.
80 #include "isoir165ext.h"
82 static int
83 isoir165_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
85 int ret;
87 /* Map full-width pinyin (row 0x28) like half-width pinyin (row 0x2B). */
88 if (s[0] == 0x28) {
89 if (n >= 2) {
90 unsigned char c2 = s[1];
91 if (c2 >= 0x21 && c2 <= 0x40) {
92 unsigned char buf[2];
93 buf[0] = 0x2b;
94 buf[1] = c2;
95 ret = isoir165ext_mbtowc(conv,pwc,buf,2);
96 if (ret != RET_ILSEQ)
97 return ret;
101 /* Try the GB2312 -> Unicode table. */
102 ret = gb2312_mbtowc(conv,pwc,s,n);
103 if (ret != RET_ILSEQ)
104 return ret;
105 /* Row 0x2A is GB_1988-80. */
106 if (s[0] == 0x2a) {
107 if (n >= 2) {
108 unsigned char c2 = s[1];
109 if (c2 >= 0x21 && c2 < 0x7f) {
110 ret = iso646_cn_mbtowc(conv,pwc,s+1,1);
111 if (ret != 1) abort();
112 return 2;
114 return RET_ILSEQ;
116 return RET_TOOFEW(0);
118 /* Try the ISO-IR-165 extensions -> Unicode table. */
119 ret = isoir165ext_mbtowc(conv,pwc,s,n);
120 return ret;
123 static int
124 isoir165_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
126 unsigned char buf[2];
127 int ret;
129 /* Try the Unicode -> GB2312 table. */
130 ret = gb2312_wctomb(conv,buf,wc,2);
131 if (ret != RET_ILUNI) {
132 if (ret != 2) abort();
133 if (!(buf[0] == 0x28 && buf[1] >= 0x21 && buf[1] <= 0x40)) {
134 if (n >= 2) {
135 r[0] = buf[0];
136 r[1] = buf[1];
137 return 2;
139 return RET_TOOSMALL;
142 /* Row 0x2A is GB_1988-80. */
143 ret = iso646_cn_wctomb(conv,buf,wc,1);
144 if (ret != RET_ILUNI) {
145 if (ret != 1) abort();
146 if (buf[0] >= 0x21 && buf[0] < 0x7f) {
147 if (n >= 2) {
148 r[0] = 0x2a;
149 r[1] = buf[0];
150 return 2;
152 return RET_TOOSMALL;
155 /* Try the Unicode -> ISO-IR-165 extensions table. */
156 ret = isoir165ext_wctomb(conv,r,wc,n);
157 return ret;