1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 /* TODO! This file should not be called textenc.c, because it is not the
21 implementation of rtl/textenc.h. Rather, it should be called
22 gettextencodingdata.c. */
24 #include "sal/config.h"
29 #include "boost/noncopyable.hpp"
30 #include "osl/diagnose.h"
31 #include "osl/module.hxx"
32 #include "rtl/instance.hxx"
33 #include "rtl/textenc.h"
34 #include "rtl/ustring.h"
35 #include "rtl/ustring.hxx"
36 #include "sal/macros.h"
37 #include "sal/types.h"
39 #include "convertsimple.hxx"
40 #include "gettextencodingdata.hxx"
41 #include "tcvtbyte.hxx"
42 #include "tcvtutf8.hxx"
43 #include "tenchelp.hxx"
/* Deliberately inverted (start > end) bounds: no byte value can fall into
   such a range.  Presumably used to mark a byte-to-Unicode table slot as
   "no table" -- confirm against ImplByteConvertData users. */
#define NOTABUNI_START 0xFF
#define NOTABUNI_END 0x00

/* Same inverted-range convention for the Unicode-to-byte direction. */
#define NOTABCHAR_START 0xFFFF
#define NOTABCHAR_END 0x0000
51 #define SAME8090UNI_START 0x80
52 #define SAME8090UNI_END 0x9F
53 static sal_uInt16
const aImpl8090SameToUniTab
[SAME8090UNI_END
56 = { 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 */
57 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
58 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 */
59 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F };
61 #define SAME8090CHAR_START 0x0080
62 #define SAME8090CHAR_END 0x009F
63 static sal_uChar
const aImpl8090SameToCharTab
[SAME8090CHAR_END
66 = { 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x0080 */
67 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
68 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x0090 */
69 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F };
71 #define SAMEA0FFCHAR_START 0x00A0
72 #define SAMEA0FFCHAR_END 0x00FF
73 static sal_uChar
const aImplA0FFSameToCharTab
[SAMEA0FFCHAR_END
76 = { 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, /* 0x00A0 */
77 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
78 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, /* 0x00B0 */
79 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
80 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, /* 0x00C0 */
81 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
82 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, /* 0x00D0 */
83 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
84 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, /* 0x00E0 */
85 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
86 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, /* 0x00F0 */
87 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF };
89 static sal_uInt16
const aImplDoubleByteIdentifierTab
[1] = { 0 };
91 /* ======================================================================= */
94 /* Windows Standard CharSet (ANSI) for Western Script */
/* 1-Byte, 0x00-0x7F ASCII without exception */
96 /* Convert-Tables: mappings/vendors/micsft/windows/cp1252.txt from 04/15/98 Version 2.01 */
97 /* Last-Changes from us: */
99 /* ----------------------------------------------------------------------- */
101 #define MS1252UNI_START 0x80
102 #define MS1252UNI_END 0xFF
103 static sal_uInt16
const aImplMS1252ToUniTab
[MS1252UNI_END
- MS1252UNI_START
+ 1] =
105 /* 0 1 2 3 4 5 6 7 */
106 /* 8 9 A B C D E F */
107 0x20AC, 0, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, /* 0x80 */
108 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0, 0x017D, 0, /* 0x80 */
109 0, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, /* 0x90 */
110 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0, 0x017E, 0x0178, /* 0x90 */
111 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, /* 0xA0 */
112 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, /* 0xA0 */
113 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, /* 0xB0 */
114 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, /* 0xB0 */
115 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, /* 0xC0 */
116 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, /* 0xC0 */
117 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, /* 0xD0 */
118 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, /* 0xD0 */
119 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, /* 0xE0 */
120 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, /* 0xE0 */
121 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, /* 0xF0 */
122 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF /* 0xF0 */
125 /* ----------------------------------------------------------------------- */
127 #define MS1252TOCHARTABEX_COUNT 27
128 static ImplUniCharTabData
const aImplMS1252ToCharTabEx
[MS1252TOCHARTABEX_COUNT
] =
159 /* ----------------------------------------------------------------------- */
161 static ImplByteConvertData
const aImplMS1252ByteCvtData
=
165 MS1252UNI_START
, MS1252UNI_END
,
166 NOTABUNI_START
, NOTABUNI_END
,
167 aImplA0FFSameToCharTab
,
169 aImplMS1252ToCharTabEx
,
170 SAMEA0FFCHAR_START
, SAMEA0FFCHAR_END
,
171 NOTABCHAR_START
, NOTABCHAR_END
,
172 MS1252TOCHARTABEX_COUNT
175 /* ----------------------------------------------------------------------- */
177 static ImplTextEncodingData
const aImplMS1252TextEncodingData
178 = { { &aImplMS1252ByteCvtData
,
179 sal::detail::textenc::convertCharToUnicode
,
180 sal::detail::textenc::convertUnicodeToChar
,
193 RTL_TEXTENCODING_INFO_ASCII
| RTL_TEXTENCODING_INFO_MIME
};
194 /* WIN, SCRIPT_LATIN, pc code page 850 */
196 /* ======================================================================= */
199 /* Unix Standard CharSet (Latin1) for Western Script */
/* 1-Byte, 0x00-0x7F ASCII without exception, 0x80-0x9F control characters as in Unicode */
201 /* Convert-Tables: mappings/iso8859/8859-1.txt from 07/27/99 Version 1.0 (based on Unicode 3.0) */
202 /* Last-Changes from us: */
204 #define ISO88591UNI_START 0xA0
205 #define ISO88591UNI_END 0xFF
206 static sal_uInt16
const aImplISO88591ToUniTab
[ISO88591UNI_END
- ISO88591UNI_START
+ 1] =
208 /* 0 1 2 3 4 5 6 7 */
209 /* 8 9 A B C D E F */
210 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, /* 0xA0 */
211 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, /* 0xA0 */
212 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, /* 0xB0 */
213 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, /* 0xB0 */
214 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, /* 0xC0 */
215 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, /* 0xC0 */
216 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, /* 0xD0 */
217 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, /* 0xD0 */
218 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, /* 0xE0 */
219 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, /* 0xE0 */
220 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, /* 0xF0 */
221 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF /* 0xF0 */
224 /* ----------------------------------------------------------------------- */
226 static ImplByteConvertData
const aImplISO88591ByteCvtData
=
228 aImplISO88591ToUniTab
,
229 aImpl8090SameToUniTab
,
230 ISO88591UNI_START
, ISO88591UNI_END
,
231 SAME8090UNI_START
, SAME8090UNI_END
,
232 aImplA0FFSameToCharTab
,
233 aImpl8090SameToCharTab
,
235 SAMEA0FFCHAR_START
, SAMEA0FFCHAR_END
,
236 SAME8090CHAR_START
, SAME8090CHAR_END
,
240 /* ----------------------------------------------------------------------- */
242 static ImplTextEncodingData
const aImplISO88591TextEncodingData
243 = { { &aImplISO88591ByteCvtData
,
244 sal::detail::textenc::convertCharToUnicode
,
245 sal::detail::textenc::convertUnicodeToChar
,
258 RTL_TEXTENCODING_INFO_ASCII
| RTL_TEXTENCODING_INFO_MIME
};
259 /* SCRIPT_LATIN, pc code page 850 */
261 /* ======================================================================= */
/* 1-Byte, 0x00-0x7F ASCII without exception */
/* For the import we use ISO-8859-1 with the MS extension (MS-1252), because */
/* when the 8th bit is set, the character is most likely an ISO-8859-1 */
/* character. For the export, all characters greater than 127 are not */
/* converted and are replaced by the replacement character. */
270 /* Last-Changes from us: */
272 /* ----------------------------------------------------------------------- */
274 static ImplByteConvertData
const aImplUSASCIIByteCvtData
=
278 MS1252UNI_START
, MS1252UNI_END
,
279 NOTABUNI_START
, NOTABUNI_END
,
283 NOTABCHAR_START
, NOTABCHAR_END
,
284 NOTABCHAR_START
, NOTABCHAR_END
,
288 /* ----------------------------------------------------------------------- */
290 static ImplTextEncodingData
const aImplUSASCIITextEncodingData
291 = { { &aImplUSASCIIByteCvtData
,
292 sal::detail::textenc::convertCharToUnicode
,
293 sal::detail::textenc::convertUnicodeToChar
,
306 RTL_TEXTENCODING_INFO_ASCII
307 | RTL_TEXTENCODING_INFO_7BIT
308 | RTL_TEXTENCODING_INFO_MIME
};
309 /* SCRIPT_LATIN, pc code page 437 */
311 static ImplTextEncodingData
const aImplUTF8TextEncodingData
313 &ImplConvertUtf8ToUnicode
,
314 &ImplConvertUnicodeToUtf8
,
315 &ImplCreateUtf8ToUnicodeContext
,
316 &ImplDestroyUtf8ToUnicodeContext
,
317 &ImplResetUtf8ToUnicodeContext
,
318 &ImplCreateUnicodeToUtf8Context
,
319 &ImplDestroyUnicodeToUtf8Context
,
320 &ImplResetUnicodeToUtf8Context
},
327 RTL_TEXTENCODING_INFO_ASCII
328 | RTL_TEXTENCODING_INFO_UNICODE
329 | RTL_TEXTENCODING_INFO_MULTIBYTE
330 | RTL_TEXTENCODING_INFO_MIME
};
331 /* SCRIPT_UNICODE, pc code page 850 */
/* Marker object for the Java UTF-8 converter.  Its value is irrelevant;
   only the fact that its address is != NULL is used to distinguish
   RTL_TEXTENCODING_JAVA_UTF8 from RTL_TEXTENCODING_UTF8. */
static char aImplJavaUtf8TextConverterTag;
338 static ImplTextEncodingData
const aImplJavaUtf8TextEncodingData
339 = { { &aImplJavaUtf8TextConverterTag
,
340 &ImplConvertUtf8ToUnicode
,
341 &ImplConvertUnicodeToUtf8
,
342 &ImplCreateUtf8ToUnicodeContext
,
343 &ImplDestroyUtf8ToUnicodeContext
,
344 &ImplResetUtf8ToUnicodeContext
,
345 &ImplCreateUnicodeToUtf8Context
,
346 &ImplDestroyUnicodeToUtf8Context
,
347 &ImplResetUnicodeToUtf8Context
},
354 RTL_TEXTENCODING_INFO_UNICODE
| RTL_TEXTENCODING_INFO_MULTIBYTE
};
358 #if defined DISABLE_DYNLOADING || defined ANDROID
360 extern "C" ImplTextEncodingData
const * sal_getFullTextEncodingData(
361 rtl_TextEncoding
); // from tables.cxx in sal_textenc library
363 class FullTextEncodingData
: private boost::noncopyable
{
365 ImplTextEncodingData
const * get(rtl_TextEncoding encoding
) {
366 return sal_getFullTextEncodingData(encoding
);
374 typedef ImplTextEncodingData
const * TextEncodingFunction(rtl_TextEncoding
);
376 void SAL_CALL
thisModule() {}
380 class FullTextEncodingData
: private boost::noncopyable
{
382 FullTextEncodingData() {
383 if (!module_
.loadRelative(&thisModule
, SAL_MODULENAME("sal_textenclo")))
385 SAL_WARN( "sal.textenc", "Loading sal_textenc library failed" );
388 function_
= reinterpret_cast< TextEncodingFunction
* >(
389 module_
.getFunctionSymbol("sal_getFullTextEncodingData"));
390 if (function_
== 0) {
391 SAL_WARN( "sal.textenc", "Obtaining sal_getFullTextEncodingData function from sal_textenc"
397 ImplTextEncodingData
const * get(rtl_TextEncoding encoding
) {
398 return (*function_
)(encoding
);
403 TextEncodingFunction
* function_
;
408 struct FullTextEncodingDataSingleton
:
409 public rtl::Static
< FullTextEncodingData
, FullTextEncodingDataSingleton
>
414 ImplTextEncodingData
const *
415 Impl_getTextEncodingData(rtl_TextEncoding nEncoding
)
419 case RTL_TEXTENCODING_ASCII_US
:
420 return &aImplUSASCIITextEncodingData
; break;
421 case RTL_TEXTENCODING_MS_1252
:
422 return &aImplMS1252TextEncodingData
; break;
423 case RTL_TEXTENCODING_UTF8
:
424 return &aImplUTF8TextEncodingData
; break;
425 case RTL_TEXTENCODING_JAVA_UTF8
:
426 return &aImplJavaUtf8TextEncodingData
; break;
427 case RTL_TEXTENCODING_ISO_8859_1
:
428 return &aImplISO88591TextEncodingData
; break;
430 return FullTextEncodingDataSingleton::get().get(nEncoding
);
434 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */