Allow overriding the newline conversion for EBCDIC encodings.
[libiconv.git] / lib / johab_hangul.h
blob1a9bc7340fd970e8241e209aa6ff22570fbf076d
1 /*
2 * Copyright (C) 1999-2001, 2016 Free Software Foundation, Inc.
3 * This file is part of the GNU LIBICONV Library.
5 * The GNU LIBICONV Library is free software; you can redistribute it
6 * and/or modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either version 2.1
8 * of the License, or (at your option) any later version.
10 * The GNU LIBICONV Library is distributed in the hope that it will be
11 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17 * If not, see <https://www.gnu.org/licenses/>.
/*
 * JOHAB Hangul
 *
 * Ken Lunde writes in his "CJKV Information Processing" book, p. 114:
 * "Hangul can be composed of two or three jamo (some jamo are considered
 * compound). Johab uses 19 initial jamo (consonants), 21 medial jamo (vowels)
 * and 27 final jamo (consonants; 28 when you include the "fill" character
 * for Hangul containing only two jamo). Multiplying these numbers results in
 * 11172."
 *
 * Structure of the Johab encoding (see p. 181-184):
 *   bit 15 = 1
 *   bit 14..10 = initial jamo, only 19+1 out of 32 possible values are used
 *   bit 9..5 = medial jamo, only 21+1 out of 32 possible values are used
 *   bit 4..0 = final jamo, only 27+1 out of 32 possible values are used
 *
 * Structure of the Unicode encoding:
 * grep '^0x\([8-C]...\|D[0-7]..\)' unicode.org-mappings/EASTASIA/KSC/JOHAB.TXT
 * You see that all characters there are marked "HANGUL LETTER" or "HANGUL
 * SYLLABLE". If you eliminate the "HANGUL LETTER"s, the table is sorted
 * in ascending order according to Johab encoding and according to the Unicode
 * encoding. Now look a little more carefully, and you see that the following
 * formula holds:
 *     unicode == 0xAC00
 *                + 21 * 28 * (jamo_initial_index[(johab >> 10) & 31] - 1)
 *                + 28 * (jamo_medial_index[(johab >> 5) & 31] - 1)
 *                + jamo_final_index[johab & 31]
 * where the index tables are defined as below.
 */
/* Tables mapping 5-bit groups to jamo letters. */
/* Note that Jamo XX = UHC 0xA4A0+XX = Unicode 0x3130+XX */
/* NONE marks a 5-bit value that is not assigned; FILL marks the "fill"
   value that stands for an absent jamo in that position. */
#define NONE 0xfd
#define FILL 0xff

/* Indexed by bits 14..10 of the Johab code: the initial jamo. */
static const unsigned char jamo_initial[32] = {
  NONE, FILL, 0x01, 0x02, 0x04, 0x07, 0x08, 0x09, /* 0x00-0x07 */
  0x11, 0x12, 0x13, 0x15, 0x16, 0x17, 0x18, 0x19, /* 0x08-0x0f */
  0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE, NONE, /* 0x10-0x17 */
  NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, /* 0x18-0x1f */
};

/* Indexed by bits 9..5 of the Johab code: the medial jamo. */
static const unsigned char jamo_medial[32] = {
  NONE, NONE, FILL, 0x1f, 0x20, 0x21, 0x22, 0x23, /* 0x00-0x07 */
  NONE, NONE, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, /* 0x08-0x0f */
  NONE, NONE, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x10-0x17 */
  NONE, NONE, 0x30, 0x31, 0x32, 0x33, NONE, NONE, /* 0x18-0x1f */
};

/* Indexed by bits 4..0 of the Johab code: the final jamo. */
static const unsigned char jamo_final[32] = {
  NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* 0x00-0x07 */
  0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
  0x10, 0x11, NONE, 0x12, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
  0x18, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE, /* 0x18-0x1f */
};

/* Same as jamo_final, except that it excludes characters already
   contained in jamo_initial. 11 characters instead of 27.
   The FILL slot is NONE here as well. */
static const unsigned char jamo_final_notinitial[32] = {
  NONE, NONE, NONE, NONE, 0x03, NONE, 0x05, 0x06, /* 0x00-0x07 */
  NONE, NONE, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
  0x10, NONE, NONE, NONE, 0x14, NONE, NONE, NONE, /* 0x10-0x17 */
  NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, /* 0x18-0x1f */
};
/* Tables mapping 5-bit groups to packed indices. */
/* "none" marks an unassigned 5-bit value; "fill" marks the fill value.
   Real jamo are numbered consecutively starting at 1. */
#define none (-1)
#define fill 0

/* Indexed by bits 14..10 of the Johab code; real values are 1..19. */
static const signed char jamo_initial_index[32] = {
  none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* 0x00-0x07 */
  0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, /* 0x08-0x0f */
  0x0f, 0x10, 0x11, 0x12, 0x13, none, none, none, /* 0x10-0x17 */
  none, none, none, none, none, none, none, none, /* 0x18-0x1f */
};

/* Indexed by bits 9..5 of the Johab code; real values are 1..21. */
static const signed char jamo_medial_index[32] = {
  none, none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, /* 0x00-0x07 */
  none, none, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, /* 0x08-0x0f */
  none, none, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, /* 0x10-0x17 */
  none, none, 0x12, 0x13, 0x14, 0x15, none, none, /* 0x18-0x1f */
};

/* Indexed by bits 4..0 of the Johab code; real values are 1..27. */
static const signed char jamo_final_index[32] = {
  none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, /* 0x00-0x07 */
  0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, /* 0x08-0x0f */
  0x0f, 0x10, none, 0x11, 0x12, 0x13, 0x14, 0x15, /* 0x10-0x17 */
  0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, none, none, /* 0x18-0x1f */
};
103 static int
104 johab_hangul_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
106 unsigned char c1 = s[0];
107 if ((c1 >= 0x84 && c1 <= 0xd3)) {
108 if (n >= 2) {
109 unsigned char c2 = s[1];
110 if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff)) {
111 unsigned int johab = (c1 << 8) | c2;
112 unsigned int bitspart1 = (johab >> 10) & 31;
113 unsigned int bitspart2 = (johab >> 5) & 31;
114 unsigned int bitspart3 = johab & 31;
115 int index1 = jamo_initial_index[bitspart1];
116 int index2 = jamo_medial_index[bitspart2];
117 int index3 = jamo_final_index[bitspart3];
118 /* Exclude "none" values. */
119 if (index1 >= 0 && index2 >= 0 && index3 >= 0) {
120 /* Deal with "fill" values in initial or medial position. */
121 if (index1 == fill) {
122 if (index2 == fill) {
123 unsigned char jamo3 = jamo_final_notinitial[bitspart3];
124 if (jamo3 != NONE) {
125 *pwc = (ucs4_t) 0x3130 + jamo3;
126 return 2;
128 } else if (index3 == fill) {
129 unsigned char jamo2 = jamo_medial[bitspart2];
130 if (jamo2 != NONE && jamo2 != FILL) {
131 *pwc = (ucs4_t) 0x3130 + jamo2;
132 return 2;
135 /* Syllables composed only of medial and final don't exist. */
136 } else if (index2 == fill) {
137 if (index3 == fill) {
138 unsigned char jamo1 = jamo_initial[bitspart1];
139 if (jamo1 != NONE && jamo1 != FILL) {
140 *pwc = (ucs4_t) 0x3130 + jamo1;
141 return 2;
144 /* Syllables composed only of initial and final don't exist. */
145 } else {
146 /* index1 and index2 are not fill, but index3 may be fill. */
147 /* Nothing more to exclude. All 11172 code points are valid. */
148 *pwc = 0xac00 + ((index1 - 1) * 21 + (index2 - 1)) * 28 + index3;
149 return 2;
153 return RET_ILSEQ;
155 return RET_TOOFEW(0);
157 return RET_ILSEQ;
/* 51 Jamo: 19 initial, 21 medial, 11 final not initial. */
/* Johab codes for U+3131..U+3163, indexed by (wc - 0x3131).  The row
   comments give the low byte of the Unicode code points in that row. */
static const unsigned short johab_hangul_page31[51] = {
          0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441, /*0x31-0x37*/
  0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, /*0x38-0x3f*/
  0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441, /*0x40-0x47*/
  0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461, /*0x48-0x4f*/
  0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1, /*0x50-0x57*/
  0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1, /*0x58-0x5f*/
  0x8741, 0x8761, 0x8781, 0x87a1,                                 /*0x60-0x63*/
};
/* Tables mapping packed indices to 5-bit groups. */
/* The initial and medial tables are indexed from 0 (packed index minus 1);
   the final table is indexed by the packed index itself, so its entry 0
   is the 5-bit fill value 0x01. */

/* index1+1 = jamo_initial_index[bitspart1]  <==>
   bitspart1 = jamo_initial_index_inverse[index1] */
static const char jamo_initial_index_inverse[19] = {
              0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14,
};

/* index2+1 = jamo_medial_index[bitspart2]  <==>
   bitspart2 = jamo_medial_index_inverse[index2] */
static const char jamo_medial_index_inverse[21] = {
                    0x03, 0x04, 0x05, 0x06, 0x07,
              0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
              0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
              0x1a, 0x1b, 0x1c, 0x1d,
};

/* index3 = jamo_final_index[bitspart3]  <==>
   bitspart3 = jamo_final_index_inverse[index3] */
static const char jamo_final_index_inverse[28] = {
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11,       0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
};
196 static int
197 johab_hangul_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
199 if (n >= 2) {
200 if (wc >= 0x3131 && wc < 0x3164) {
201 unsigned short c = johab_hangul_page31[wc-0x3131];
202 r[0] = (c >> 8); r[1] = (c & 0xff);
203 return 2;
204 } else if (wc >= 0xac00 && wc < 0xd7a4) {
205 unsigned int index1;
206 unsigned int index2;
207 unsigned int index3;
208 unsigned short c;
209 unsigned int tmp = wc - 0xac00;
210 index3 = tmp % 28; tmp = tmp / 28;
211 index2 = tmp % 21; tmp = tmp / 21;
212 index1 = tmp;
213 c = (((((1 << 5)
214 | jamo_initial_index_inverse[index1]) << 5)
215 | jamo_medial_index_inverse[index2]) << 5)
216 | jamo_final_index_inverse[index3];
217 r[0] = (c >> 8); r[1] = (c & 0xff);
218 return 2;
220 return RET_ILUNI;
222 return RET_TOOSMALL;
226 * Decomposition of JOHAB Hangul in one to three Johab Jamo elements.
229 /* Decompose wc into r[0..2], and return the number of resulting Jamo elements.
230 Return RET_ILUNI if decomposition is not possible. */
232 static int johab_hangul_decompose (conv_t conv, ucs4_t* r, ucs4_t wc)
234 unsigned char buf[2];
235 int ret = johab_hangul_wctomb(conv,buf,wc,2);
236 if (ret != RET_ILUNI) {
237 unsigned int hangul = (buf[0] << 8) | buf[1];
238 unsigned char jamo1 = jamo_initial[(hangul >> 10) & 31];
239 unsigned char jamo2 = jamo_medial[(hangul >> 5) & 31];
240 unsigned char jamo3 = jamo_final[hangul & 31];
241 if ((hangul >> 15) != 1) abort();
242 if (jamo1 != NONE && jamo2 != NONE && jamo3 != NONE) {
243 /* They are not all three == FILL because that would correspond to
244 johab = 0x8441, which doesn't exist. */
245 ucs4_t* p = r;
246 if (jamo1 != FILL)
247 *p++ = 0x3130 + jamo1;
248 if (jamo2 != FILL)
249 *p++ = 0x3130 + jamo2;
250 if (jamo3 != FILL)
251 *p++ = 0x3130 + jamo3;
252 return p-r;
255 return RET_ILUNI;
258 #undef fill
259 #undef none
260 #undef FILL
261 #undef NONE