1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * This file is part of OpenOffice.org.
11 * OpenOffice.org is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License version 3
13 * only, as published by the Free Software Foundation.
15 * OpenOffice.org is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License version 3 for more details
19 * (a copy is included in the LICENSE file that accompanied this code).
21 * You should have received a copy of the GNU Lesser General Public License
22 * version 3 along with OpenOffice.org. If not, see
23 * <http://www.openoffice.org/license.html>
24 * for a copy of the LGPLv3 License.
26 ************************************************************************/
28 #include "sal/config.h"
30 #include "convertsimple.hxx"
31 #include "tenchelp.hxx"
33 /* ======================================================================= */
35 /* This file contains the tables for all 1-byte charsets for the */
36 /* following scripts: Asian (Thai, Vietnamese) and other exotic scripts */
38 /* ======================================================================= */
41 /* Windows/Dos Standard CharSet for Thai */
42 /* 1-Byte, 0x00-0x7F ASCII without exception */
43 /* Convert-Tables: mappings/vendors/micsft/pc/cp874.txt from 04/15/98 Version 2.00 */
44 /* Last-Changes from us: */
46 /* ----------------------------------------------------------------------- */
/* MS874 byte -> Unicode table.  Entry i holds the 16-bit Unicode value for
   byte (MS874UNI_START + i); each pair of rows covers one 16-byte range, as
   the row labels show.  A 0 entry is a byte without a mapping in this table
   (e.g. 0xDB-0xDE).  The table stops at 0xFB, so 0xFC-0xFF are not covered. */
48 #define MS874UNI_START 0x80
49 #define MS874UNI_END 0xFB
50 static sal_uInt16 const aImplMS874ToUniTab[MS874UNI_END - MS874UNI_START + 1] =
54 0x20AC, 0, 0, 0, 0, 0x2026, 0, 0, /* 0x80 */
55 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */
56 0, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, /* 0x90 */
57 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */
58 0x00A0, 0x0E01, 0x0E02, 0x0E03, 0x0E04, 0x0E05, 0x0E06, 0x0E07, /* 0xA0 */
59 0x0E08, 0x0E09, 0x0E0A, 0x0E0B, 0x0E0C, 0x0E0D, 0x0E0E, 0x0E0F, /* 0xA0 */
60 0x0E10, 0x0E11, 0x0E12, 0x0E13, 0x0E14, 0x0E15, 0x0E16, 0x0E17, /* 0xB0 */
61 0x0E18, 0x0E19, 0x0E1A, 0x0E1B, 0x0E1C, 0x0E1D, 0x0E1E, 0x0E1F, /* 0xB0 */
62 0x0E20, 0x0E21, 0x0E22, 0x0E23, 0x0E24, 0x0E25, 0x0E26, 0x0E27, /* 0xC0 */
63 0x0E28, 0x0E29, 0x0E2A, 0x0E2B, 0x0E2C, 0x0E2D, 0x0E2E, 0x0E2F, /* 0xC0 */
64 0x0E30, 0x0E31, 0x0E32, 0x0E33, 0x0E34, 0x0E35, 0x0E36, 0x0E37, /* 0xD0 */
65 0x0E38, 0x0E39, 0x0E3A, 0, 0, 0, 0, 0x0E3F, /* 0xD0 */
66 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0x0E45, 0x0E46, 0x0E47, /* 0xE0 */
67 0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F, /* 0xE0 */
68 0x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57, /* 0xF0 */
69 0x0E58, 0x0E59, 0x0E5A, 0x0E5B /* 0xF0 */
72 /* ----------------------------------------------------------------------- */
/* Unicode -> MS874 byte table for the Thai block.  Entry i holds the byte
   for code point (MS874CHAR_START + i).  Because the range starts at U+0E01,
   the first row has only seven entries.  A 0 entry is a code point without a
   byte in this table (U+0E3B..U+0E3E). */
74 #define MS874CHAR_START 0x0E01
75 #define MS874CHAR_END 0x0E5B
76 static sal_uChar const aImplMS874ToCharTab[MS874CHAR_END - MS874CHAR_START + 1] =
80 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, /* 0x0E00 */
81 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, /* 0x0E00 */
82 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, /* 0x0E10 */
83 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, /* 0x0E10 */
84 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, /* 0x0E20 */
85 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, /* 0x0E20 */
86 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, /* 0x0E30 */
87 0xD8, 0xD9, 0xDA, 0, 0, 0, 0, 0xDF, /* 0x0E30 */
88 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, /* 0x0E40 */
89 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, /* 0x0E40 */
90 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, /* 0x0E50 */
91 0xF8, 0xF9, 0xFA, 0xFB /* 0x0E50 */
94 /* ----------------------------------------------------------------------- */
/* Extended Unicode -> MS874 table.  The count of 10 equals the number of
   values in aImplMS874ToUniTab that fall outside
   MS874CHAR_START..MS874CHAR_END (U+00A0 and the U+20xx punctuation).
   NOTE(review): presumably exactly those ten characters are listed here --
   confirm against the entries. */
96 #define MS874TOCHARTABEX_COUNT 10
97 static ImplUniCharTabData const aImplMS874ToCharTabEx[MS874TOCHARTABEX_COUNT] =
111 /* ----------------------------------------------------------------------- */
/* Conversion descriptor for MS874: bundles the MS874 table ranges, the
   extended table and its entry count.
   NOTE(review): the NOTABUNI and NOTABCHAR ranges presumably mean "no second
   table" -- confirm against the ImplByteConvertData declaration. */
113 static ImplByteConvertData const aImplMS874ByteCvtData =
117 MS874UNI_START, MS874UNI_END,
118 NOTABUNI_START, NOTABUNI_END,
121 aImplMS874ToCharTabEx,
122 MS874CHAR_START, MS874CHAR_END,
123 NOTABCHAR_START, NOTABCHAR_END,
124 MS874TOCHARTABEX_COUNT
127 /* ----------------------------------------------------------------------- */
/* Text-encoding record for MS874: pairs aImplMS874ByteCvtData with the
   sal::detail::textenc convertCharToUnicode / convertUnicodeToChar routines.
   The "iso8859-1" string is flagged as unverified by the TODO beside it. */
129 static ImplTextEncodingData const aImplMS874TextEncodingData
130 = { { &aImplMS874ByteCvtData,
131 sal::detail::textenc::convertCharToUnicode,
132 sal::detail::textenc::convertUnicodeToChar,
143 "iso8859-1", /* TODO! correct? */
145 RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME };
146 /* WIN/DOS/OS2, SCRIPT_THAI, pc code page 874, mac encoding 21 */
148 /* ======================================================================= */
152 * A good source of information is <http://www.inet.co.th/cyberclub/trin/
153 * thairef/index.html> as of 18 Mar 2002.
155 * Single byte encoding, from which MS874 is derived (although it is the other
156 * way around in this implementation):
158 * 0x00--9F map to U+0000--009F
159 * 0xA0 is questionable (unassigned or U+00A0 NO BREAK SPACE), to ease
160 * implementation, it maps to U+00A0
161 * 0xA1--DA map to U+0E01--0E3A (TIS 620, same for MS874)
162 * 0xDB--DE are unassigned (TIS 620, same for MS874)
163 * 0xDF--FB map to U+0E3F--0E5B (TIS 620, same for MS874)
164 * 0xFC--FF are unassigned (TIS 620, same for MS874)
/* Conversion descriptor for TIS 620.  It reuses the MS874 tables: the
   byte -> Unicode pointer is offset (0xA0 - MS874UNI_START) entries into
   aImplMS874ToUniTab, i.e. it starts at the entry for byte 0xA0.
   NOTE(review): the aImpl8090Same tables and SAME8090 ranges presumably
   provide the identity mapping for 0x80-0x9F -- confirm against their
   definitions. */
167 static ImplByteConvertData const aImplTis620ByteCvtData =
169 aImplMS874ToUniTab + (0xA0 - MS874UNI_START),
170 aImpl8090SameToUniTab,
172 SAME8090UNI_START, SAME8090UNI_END,
174 aImpl8090SameToCharTab,
175 aImplMS874ToCharTabEx,
176 MS874CHAR_START, MS874CHAR_END,
177 SAME8090CHAR_START, SAME8090CHAR_END,
/* Text-encoding record for TIS 620: pairs aImplTis620ByteCvtData with the
   sal::detail::textenc convertCharToUnicode / convertUnicodeToChar routines.
   The "iso8859-1" string is flagged as unverified by the TODO beside it. */
181 static ImplTextEncodingData const aImplTis620TextEncodingData
182 = { { &aImplTis620ByteCvtData,
183 sal::detail::textenc::convertCharToUnicode,
184 sal::detail::textenc::convertUnicodeToChar,
195 "iso8859-1", /* TODO! correct? */
197 RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME };
199 /* ======================================================================= */
202 /* Windows Standard CharSet for Vietnamese */
203 /* 1-Byte, 0x00-0x7F ASCII without exception */
204 /* Convert-Tables: mappings/vendors/micsft/windows/cp1258.txt from 04/15/98 Version 2.01 */
205 /* Last-Changes from us: */
207 /* ----------------------------------------------------------------------- */
/* MS1258 byte -> Unicode table.  Entry i holds the 16-bit Unicode value for
   byte (MS1258UNI_START + i); a 0 entry is a byte without a mapping in this
   table.  Bytes 0xCC, 0xD2, 0xDE, 0xEC and 0xF2 hold combining marks
   (U+0300, U+0309, U+0303, U+0301, U+0323). */
209 #define MS1258UNI_START 0x80
210 #define MS1258UNI_END 0xFF
211 static sal_uInt16 const aImplMS1258ToUniTab[MS1258UNI_END - MS1258UNI_START + 1] =
213 /* 0 1 2 3 4 5 6 7 */
214 /* 8 9 A B C D E F */
215 0x20AC, 0, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, /* 0x80 */
216 0x02C6, 0x2030, 0, 0x2039, 0x0152, 0, 0, 0, /* 0x80 */
217 0, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, /* 0x90 */
218 0x02DC, 0x2122, 0, 0x203A, 0x0153, 0, 0, 0x0178, /* 0x90 */
219 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, /* 0xA0 */
220 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, /* 0xA0 */
221 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, /* 0xB0 */
222 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, /* 0xB0 */
223 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x00C5, 0x00C6, 0x00C7, /* 0xC0 */
224 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x0300, 0x00CD, 0x00CE, 0x00CF, /* 0xC0 */
225 0x0110, 0x00D1, 0x0309, 0x00D3, 0x00D4, 0x01A0, 0x00D6, 0x00D7, /* 0xD0 */
226 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x01AF, 0x0303, 0x00DF, /* 0xD0 */
227 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x00E5, 0x00E6, 0x00E7, /* 0xE0 */
228 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x0301, 0x00ED, 0x00EE, 0x00EF, /* 0xE0 */
229 0x0111, 0x00F1, 0x0323, 0x00F3, 0x00F4, 0x01A1, 0x00F6, 0x00F7, /* 0xF0 */
230 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x01B0, 0x20AB, 0x00FF /* 0xF0 */
233 /* ----------------------------------------------------------------------- */
/* Unicode -> MS1258 byte table for U+00A0..U+00FF.  Entry i holds the byte
   for code point (MS1258CHAR_START + i).  A 0 entry is a code point without
   a byte in this table: these are the positions where aImplMS1258ToUniTab
   assigns the byte to a different character (e.g. byte 0xC3 is U+0102, so
   U+00C3 has no entry here). */
235 #define MS1258CHAR_START 0x00A0
236 #define MS1258CHAR_END 0x00FF
237 static sal_uChar const aImplMS1258ToCharTab[MS1258CHAR_END - MS1258CHAR_START + 1] =
239 /* 0 1 2 3 4 5 6 7 */
240 /* 8 9 A B C D E F */
241 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, /* 0x00A0 */
242 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, /* 0x00A0 */
243 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, /* 0x00B0 */
244 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, /* 0x00B0 */
245 0xC0, 0xC1, 0xC2, 0, 0xC4, 0xC5, 0xC6, 0xC7, /* 0x00C0 */
246 0xC8, 0xC9, 0xCA, 0xCB, 0, 0xCD, 0xCE, 0xCF, /* 0x00C0 */
247 0, 0xD1, 0, 0xD3, 0xD4, 0, 0xD6, 0xD7, /* 0x00D0 */
248 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0, 0, 0xDF, /* 0x00D0 */
249 0xE0, 0xE1, 0xE2, 0, 0xE4, 0xE5, 0xE6, 0xE7, /* 0x00E0 */
250 0xE8, 0xE9, 0xEA, 0xEB, 0, 0xED, 0xEE, 0xEF, /* 0x00E0 */
251 0, 0xF1, 0, 0xF3, 0xF4, 0, 0xF6, 0xF7, /* 0x00F0 */
252 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0, 0, 0xFF /* 0x00F0 */
255 /* ----------------------------------------------------------------------- */
257 /* The list of Vietnamese-relevant precomposed Unicode characters that map to
258 MS1258 base + combining modifier are derived from the information present at
259 <http://vietunicode.sourceforge.net/charset/>: */
260 #define MS1258TOCHARTABEX_COUNT 141
261 static ImplUniCharTabData const aImplMS1258ToCharTabEx[MS1258TOCHARTABEX_COUNT] =
263 { 0x00C3, 0x41, 0xDE },
264 { 0x00CC, 0x49, 0xCC },
265 { 0x00D2, 0x4F, 0xCC },
266 { 0x00D5, 0x4F, 0xDE },
267 { 0x00DD, 0x59, 0xEC },
268 { 0x00E3, 0x61, 0xDE },
269 { 0x00EC, 0x69, 0xCC },
270 { 0x00F2, 0x6F, 0xCC },
271 { 0x00F5, 0x6F, 0xDE },
272 { 0x00FD, 0x79, 0xCC },
277 { 0x0128, 0x49, 0xDE },
278 { 0x0129, 0x69, 0xDE },
281 { 0x0168, 0x55, 0xDE },
282 { 0x0169, 0x75, 0xDE },
296 { 0x1EA0, 0x41, 0xF2 },
297 { 0x1EA1, 0x61, 0xF2 },
298 { 0x1EA2, 0x41, 0xD2 },
299 { 0x1EA3, 0x61, 0xD2 },
300 { 0x1EA4, 0xC2, 0xEC },
301 { 0x1EA5, 0xE2, 0xEC },
302 { 0x1EA6, 0xC2, 0xCC },
303 { 0x1EA7, 0xE2, 0xCC },
304 { 0x1EA8, 0xC2, 0xD2 },
305 { 0x1EA9, 0xE2, 0xD2 },
306 { 0x1EAA, 0xC2, 0xDE },
307 { 0x1EAB, 0xE2, 0xDE },
308 { 0x1EAC, 0xC2, 0xF2 },
309 { 0x1EAD, 0xE2, 0xF2 },
310 { 0x1EAE, 0xC3, 0xEC },
311 { 0x1EAF, 0xE3, 0xEC },
312 { 0x1EB0, 0xC3, 0xCC },
313 { 0x1EB1, 0xE3, 0xCC },
314 { 0x1EB2, 0xC3, 0xD2 },
315 { 0x1EB3, 0xE3, 0xD2 },
316 { 0x1EB4, 0xC3, 0xDE },
317 { 0x1EB5, 0xE3, 0xDE },
318 { 0x1EB6, 0xC3, 0xF2 },
319 { 0x1EB7, 0xE3, 0xF2 },
320 { 0x1EB8, 0x45, 0xF2 },
321 { 0x1EB9, 0x65, 0xF2 },
322 { 0x1EBA, 0x45, 0xD2 },
323 { 0x1EBB, 0x65, 0xD2 },
324 { 0x1EBC, 0x45, 0xDE },
325 { 0x1EBD, 0x65, 0xDE },
326 { 0x1EBE, 0xCA, 0xEC },
327 { 0x1EBF, 0xEA, 0xEC },
328 { 0x1EC0, 0xCA, 0xCC },
329 { 0x1EC1, 0xEA, 0xCC },
330 { 0x1EC2, 0xCA, 0xD2 },
331 { 0x1EC3, 0xEA, 0xD2 },
332 { 0x1EC4, 0xCA, 0xDE },
333 { 0x1EC5, 0xEA, 0xDE },
334 { 0x1EC6, 0xCA, 0xF2 },
335 { 0x1EC7, 0xEA, 0xF2 },
336 { 0x1EC8, 0x49, 0xD2 },
337 { 0x1EC9, 0x69, 0xD2 },
338 { 0x1ECA, 0x49, 0xF2 },
339 { 0x1ECB, 0x69, 0xF2 },
340 { 0x1ECC, 0x4F, 0xF2 },
341 { 0x1ECD, 0x6F, 0xF2 },
342 { 0x1ECE, 0x4F, 0xD2 },
343 { 0x1ECF, 0x6F, 0xD2 },
344 { 0x1ED0, 0xD4, 0xEC },
345 { 0x1ED1, 0xF4, 0xEC },
346 { 0x1ED2, 0xD4, 0xCC },
347 { 0x1ED3, 0xF4, 0xCC },
348 { 0x1ED4, 0xD4, 0xD2 },
349 { 0x1ED5, 0xF4, 0xD2 },
350 { 0x1ED6, 0xD4, 0xDE },
351 { 0x1ED7, 0xF4, 0xDE },
352 { 0x1ED8, 0xD4, 0xF2 },
353 { 0x1ED9, 0xF4, 0xF2 },
354 { 0x1EDA, 0xD5, 0xEC },
355 { 0x1EDB, 0xF5, 0xEC },
356 { 0x1EDC, 0xD5, 0xCC },
357 { 0x1EDD, 0xF5, 0xCC },
358 { 0x1EDE, 0xD5, 0xD2 },
359 { 0x1EDF, 0xF5, 0xD2 },
360 { 0x1EE0, 0xD5, 0xDE },
361 { 0x1EE1, 0xF5, 0xDE },
362 { 0x1EE2, 0xD5, 0xF2 },
363 { 0x1EE3, 0xF5, 0xF2 },
364 { 0x1EE4, 0x55, 0xF2 },
365 { 0x1EE5, 0x75, 0xF2 },
366 { 0x1EE6, 0x55, 0xD2 },
367 { 0x1EE7, 0x75, 0xD2 },
368 { 0x1EE8, 0xDD, 0xEC },
369 { 0x1EE9, 0xFD, 0xEC },
370 { 0x1EEA, 0xDD, 0xCC },
371 { 0x1EEB, 0xFD, 0xCC },
372 { 0x1EEC, 0xDD, 0xD2 },
373 { 0x1EED, 0xFD, 0xD2 },
374 { 0x1EEE, 0xDD, 0xDE },
375 { 0x1EEF, 0xFD, 0xDE },
376 { 0x1EF0, 0xDD, 0xF2 },
377 { 0x1EF1, 0xFD, 0xF2 },
378 { 0x1EF2, 0x59, 0xCC },
379 { 0x1EF3, 0x79, 0xCC },
380 { 0x1EF4, 0x59, 0xF2 },
381 { 0x1EF5, 0x79, 0xF2 },
382 { 0x1EF6, 0x59, 0xD2 },
383 { 0x1EF7, 0x79, 0xD2 },
384 { 0x1EF8, 0x59, 0xDE },
385 { 0x1EF9, 0x79, 0xDE },
406 /* ----------------------------------------------------------------------- */
/* Conversion descriptor for MS1258: bundles the MS1258 table ranges, the
   Unicode -> byte table, the extended table and its entry count.
   NOTE(review): the NOTABUNI and NOTABCHAR ranges presumably mean "no second
   table" -- confirm against the ImplByteConvertData declaration. */
408 static ImplByteConvertData const aImplMS1258ByteCvtData =
412 MS1258UNI_START, MS1258UNI_END,
413 NOTABUNI_START, NOTABUNI_END,
414 aImplMS1258ToCharTab,
416 aImplMS1258ToCharTabEx,
417 MS1258CHAR_START, MS1258CHAR_END,
418 NOTABCHAR_START, NOTABCHAR_END,
419 MS1258TOCHARTABEX_COUNT
422 /* ----------------------------------------------------------------------- */
/* Text-encoding record for MS1258: pairs aImplMS1258ByteCvtData with the
   sal::detail::textenc convertCharToUnicode / convertUnicodeToChar routines.
   The "iso8859-1" string is flagged as unverified by the TODO beside it. */
424 static ImplTextEncodingData const aImplMS1258TextEncodingData
425 = { { &aImplMS1258ByteCvtData,
426 sal::detail::textenc::convertCharToUnicode,
427 sal::detail::textenc::convertUnicodeToChar,
438 "iso8859-1", /* TODO! correct? */
440 RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME };
441 /* WIN, SCRIPT_VIETNAMESE, mac encoding 30 */