1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include "converter.hxx"
11 #include "convertisciidevangari.hxx"
12 #include "convertsinglebytetobmpunicode.hxx"
13 #include <rtl/textcvt.h>
15 using namespace sal::detail::textenc
;
16 using namespace rtl::textenc
;
// Conversion state for the ISCII Devanagari -> Unicode direction.
// NOTE(review): this listing omits some original lines (brace-only lines, the
// constructor body and possibly further members) — consult the full file.
18 struct IsciiDevanagariToUnicode
// Previous source byte seen by convert(); convert() compares it with 0xE8 to
// recognise two-byte sequences and stores either the current byte or 0 in it.
20 sal_uInt8 m_cPrevChar
;
21 IsciiDevanagariToUnicode()
// Converts nSrcBytes bytes from pSrcBuf into at most nDestChars units in
// pDestBuf; returns the number of units written and reports the number of
// consumed bytes through pSrcCvtBytes.
29 sal_Size
convert(char const* pSrcBuf
, sal_Size nSrcBytes
,
30 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
31 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
);
// Conversion state for the Unicode -> ISCII Devanagari direction.
// NOTE(review): this listing omits some original lines (brace-only lines, the
// constructor body and possibly further members) — consult the full file.
34 struct UnicodeToIsciiDevanagari
// Compared with 0x094D in convert() to detect the 094D 200C / 094D 200D pairs.
36 sal_Unicode m_cPrevChar
;
// Pending high surrogate: convert() loads it at entry and stores it back
// before returning.
37 sal_Unicode m_cHighSurrogate
;
38 UnicodeToIsciiDevanagari()
// Converts nSrcChars UTF-16 units from pSrcBuf into at most nDestBytes bytes
// in pDestBuf; returns the number of bytes written and reports the number of
// consumed units through pSrcCvtChars.
48 sal_Size
convert(sal_Unicode
const* pSrcBuf
, sal_Size nSrcChars
,
49 char* pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
50 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
);
53 static const sal_Unicode IsciiDevanagariMap
[256] =
55 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
56 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
57 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
58 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
59 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
60 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
61 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
62 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
63 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
64 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
65 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
66 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
67 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
68 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
69 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
70 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
71 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
72 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
73 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
74 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
75 0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908,
76 0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912,
77 0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919,
78 0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921,
79 0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929,
80 0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930,
81 0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938,
82 0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
83 0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,
84 0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
85 0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
86 0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF
// Converts ISCII Devanagari bytes to UTF-16.
//
// Reads up to nSrcBytes bytes from pSrcBuf and writes at most nDestChars
// units to pDestBuf. Returns the number of units written
// (pDestBufPtr - pDestBuf) and stores the number of consumed source bytes
// through pSrcCvtBytes. m_cPrevChar carries the previously converted byte so
// that sequences starting with 0xE8 can be recognised.
//
// NOTE(review): this listing omits some original lines (brace-only lines and
// several statements, including the body of the "nNext == 0xE9" branch), so
// only the visible statements are documented here.
89 sal_Size
IsciiDevanagariToUnicode::convert(
90 char const* pSrcBuf
, sal_Size nSrcBytes
,
91 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
92 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
95 sal_Size nConverted
= 0;
96 sal_Unicode
* pDestBufPtr
= pDestBuf
;
97 sal_Unicode
* pDestBufEnd
= pDestBuf
+ nDestChars
;
99 while (nConverted
< nSrcBytes
)
// Destination exhausted: flag it in nInfo.
101 if (pDestBufPtr
== pDestBufEnd
)
103 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
107 sal_Unicode cChar
= sal_Unicode();
108 sal_uInt8 nIn
= static_cast<sal_uInt8
>(pSrcBuf
[nConverted
]);
// Look ahead one byte only while that byte is still inside the source
// buffer. The previous guard (nConverted < nSrcBytes + 1) was always true
// inside this loop, so the final iteration read pSrcBuf[nSrcBytes], one byte
// past the end of the input.
109 sal_uInt8 nNext
= nConverted
+ 1 < nSrcBytes
? static_cast<sal_uInt8
>(pSrcBuf
[nConverted
+1]) : 0;
111 bool bDouble
= false;
112 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
113 //halant + nukta E8 E9 -> halant + ZWJ 094D 200D
114 if (m_cPrevChar
== 0xE8 && nIn
== 0xE8)
119 else if (m_cPrevChar
== 0xE8 && nIn
== 0xE9)
// Current byte is followed by 0xE9 (branch body not visible in this listing).
124 else if (nNext
== 0xE9)
// Table lookup of the current byte.
183 cChar
= IsciiDevanagariMap
[nIn
];
// 0xFFFF is the table's marker for "no mapping".
185 bool bUndefined
= cChar
== 0xffff;
189 BadInputConversionAction eAction
= handleBadInputTextToUnicodeConversion(
190 bUndefined
, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
192 if (eAction
== BAD_INPUT_CONTINUE
)
194 if (eAction
== BAD_INPUT_STOP
)
196 else if (eAction
== BAD_INPUT_NO_OUTPUT
)
198 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
206 *pDestBufPtr
++ = cChar
;
// Remember the byte only when bNormal is set; otherwise reset the state to 0.
207 m_cPrevChar
= bNormal
? nIn
: 0;
213 *pSrcCvtBytes
= nConverted
;
215 return pDestBufPtr
- pDestBuf
;
218 BmpUnicodeToSingleByteRange
const unicodeToISCIIEncoding
[] =
220 { 0x0000, 0x007F - 0x0000, 0x00 }, { 0x0901, 0x0903 - 0x0901, 0xA1 },
221 { 0x0905, 0x090B - 0x0905, 0xA4 }, { 0x090D, 0x090D - 0x090D, 0xAE },
222 { 0x090E, 0x0910 - 0x090E, 0xAB }, { 0x0911, 0x0911 - 0x0911, 0xB2 },
223 { 0x0912, 0x0914 - 0x0912, 0xAF }, { 0x0915, 0x092F - 0x0915, 0xB3 },
224 { 0x0930, 0x0939 - 0x0930, 0xCF }, { 0x093C, 0x093C - 0x093C, 0xE9 },
225 { 0x093E, 0x0943 - 0x093E, 0xDA }, { 0x0945, 0x0945 - 0x0945, 0xE3 },
226 { 0x0946, 0x0948 - 0x0946, 0xE0 }, { 0x0949, 0x0949 - 0x0949, 0xE7 },
227 { 0x094A, 0x094C - 0x094A, 0xE4 }, { 0x094D, 0x094D - 0x094D, 0xE8 },
228 { 0x095F, 0x095F - 0x095F, 0xCE }, { 0x0964, 0x0964 - 0x0964, 0xEA },
229 { 0x0966, 0x096F - 0x0966, 0xF1 }
// Converts UTF-16 to ISCII Devanagari bytes.
//
// Reads up to nSrcChars units from pSrcBuf and writes at most nDestBytes
// bytes to pDestBuf. Returns the number of bytes written
// (pDestBufPtr - pDestBuf) and stores the number of consumed source units
// through pSrcCvtChars. A pending high surrogate is loaded from
// m_cHighSurrogate at entry and stored back before returning.
//
// NOTE(review): this listing omits some original lines (brace-only lines,
// case labels and several control statements), so only the visible
// statements are documented here.
232 sal_Size
UnicodeToIsciiDevanagari::convert(sal_Unicode
const* pSrcBuf
, sal_Size nSrcChars
,
233 char* pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
234 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
236 size_t entries
= SAL_N_ELEMENTS(unicodeToISCIIEncoding
);
237 BmpUnicodeToSingleByteRange
const * ranges
= unicodeToISCIIEncoding
;
239 sal_Unicode cHighSurrogate
= m_cHighSurrogate
;
240 sal_uInt32 nInfo
= 0;
241 sal_Size nConverted
= 0;
242 sal_Char
* pDestBufPtr
= pDestBuf
;
243 sal_Char
* pDestBufEnd
= pDestBuf
+ nDestBytes
;
244 for (; nConverted
< nSrcChars
; ++nConverted
)
246 bool bUndefined
= true;
247 sal_uInt32 c
= *pSrcBuf
++;
248 sal_Char cSpecialChar
= 0;
// Surrogate handling: remember a high surrogate, or combine a stored one with
// the low surrogate that follows it.
249 if (cHighSurrogate
== 0)
251 if (ImplIsHighSurrogate(c
))
253 cHighSurrogate
= static_cast< sal_Unicode
>(c
);
257 else if (ImplIsLowSurrogate(c
))
259 c
= ImplCombineSurrogates(cHighSurrogate
, c
);
266 if (ImplIsLowSurrogate(c
) || ImplIsNoncharacter(c
))
272 // halant + ZWNJ (094D 200C) -> halant + halant (E8 E8)
273 // halant + ZWJ (094D 200D) -> halant + nukta (E8 E9)
// Only the second byte is chosen here; m_cPrevChar == 0x094D means the
// halant was the previous character.
274 if (m_cPrevChar
== 0x094D && c
== 0x200C)
275 cSpecialChar
= '\xE8';
276 else if (m_cPrevChar
== 0x094D && c
== 0x200D)
277 cSpecialChar
= '\xE9';
// One destination byte is needed for cSpecialChar.
280 if (pDestBufEnd
- pDestBufPtr
< 1)
282 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
285 *pDestBufPtr
++ = cSpecialChar
;
// Lead bytes of the two-byte sequences written further down as
// cSpecialChar followed by 0xE9.
// NOTE(review): the case labels that select each of these bytes are not
// visible in this listing.
292 cSpecialChar
= '\xA1';
295 cSpecialChar
= '\xA6';
298 cSpecialChar
= '\xA7';
301 cSpecialChar
= '\xAA';
304 cSpecialChar
= '\xB3';
307 cSpecialChar
= '\xB4';
310 cSpecialChar
= '\xB5';
313 cSpecialChar
= '\xBA';
316 cSpecialChar
= '\xBF';
319 cSpecialChar
= '\xC0';
322 cSpecialChar
= '\xC9';
325 cSpecialChar
= '\xDB';
328 cSpecialChar
= '\xDC';
331 cSpecialChar
= '\xDF';
334 cSpecialChar
= '\xEA';
// Two destination bytes are needed: the lead byte plus 0xE9.
341 if (pDestBufEnd
- pDestBufPtr
< 2)
343 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
346 *pDestBufPtr
++ = cSpecialChar
;
347 *pDestBufPtr
++ = '\xE9';
352 // Linearly searching through the ranges is probably fastest, assuming
353 // that most converted characters belong to the ASCII subset:
354 for (size_t i
= 0; i
< entries
; ++i
)
356 if (c
< ranges
[i
].unicode
)
// c lies inside [unicode, unicode + range] of this entry.
360 else if (c
<= sal::static_int_cast
< sal_uInt32
>(
361 ranges
[i
].unicode
+ ranges
[i
].range
))
363 if (pDestBufEnd
- pDestBufPtr
< 1)
// Output byte = first byte of the run plus the offset of c within the run.
367 *pDestBufPtr
++ = static_cast< sal_Char
>(
368 ranges
[i
].byte
+ (c
- ranges
[i
].unicode
));
// Bad-input handling for the current character c.
378 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
379 bUndefined
, c
, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, 0,
382 case sal::detail::textenc::BAD_INPUT_STOP
:
386 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
390 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
396 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
// After the loop: a high surrogate is still pending.
400 if (cHighSurrogate
!= 0
402 & (RTL_UNICODETOTEXT_INFO_ERROR
403 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
406 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
408 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
412 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
413 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, 0,
416 case sal::detail::textenc::BAD_INPUT_STOP
:
417 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
421 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
422 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
// Persist the surrogate state for the next call.
427 m_cHighSurrogate
= cHighSurrogate
;
431 *pSrcCvtChars
= nConverted
;
433 return pDestBufPtr
- pDestBuf
;
436 sal_Size
ImplConvertIsciiDevanagariToUnicode(void const*,
437 void* pContext
, char const* pSrcBuf
, sal_Size nSrcBytes
,
438 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
439 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
441 IsciiDevanagariToUnicode
*pCtx
=
442 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
443 return pCtx
->convert(pSrcBuf
, nSrcBytes
, pDestBuf
, nDestChars
, nFlags
,
444 pInfo
, pSrcCvtBytes
);
447 sal_Size
ImplConvertUnicodeToIsciiDevanagari(void const*,
448 void * pContext
, sal_Unicode
const * pSrcBuf
, sal_Size nSrcChars
,
449 char * pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
450 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
452 UnicodeToIsciiDevanagari
*pCtx
=
453 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
454 return pCtx
->convert(pSrcBuf
, nSrcChars
,
455 pDestBuf
, nDestBytes
, nFlags
, pInfo
, pSrcCvtChars
);
458 void *ImplCreateIsciiDevanagariToUnicodeContext()
460 return new IsciiDevanagariToUnicode
;
// Tear-down hook for an ISCII Devanagari -> Unicode context: recovers the
// typed pointer from the untyped pContext.
// NOTE(review): the statement that uses pCtx is not visible in this listing —
// presumably it deletes the context; confirm against the full file.
463 void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext
)
465 IsciiDevanagariToUnicode
*pCtx
=
466 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
// Reset hook for an ISCII Devanagari -> Unicode context: recovers the typed
// pointer from the untyped pContext.
// NOTE(review): the statement that uses pCtx is not visible in this listing —
// presumably it clears the stored conversion state; confirm against the full file.
470 void ImplResetIsciiDevanagariToUnicodeContext(void * pContext
)
472 IsciiDevanagariToUnicode
*pCtx
=
473 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
477 void *ImplCreateUnicodeToIsciiDevanagariContext()
479 return new UnicodeToIsciiDevanagari
;
// Reset hook for a Unicode -> ISCII Devanagari context: recovers the typed
// pointer from the untyped pContext.
// NOTE(review): the statement that uses pCtx is not visible in this listing —
// presumably it clears the stored conversion state; confirm against the full file.
482 void ImplResetUnicodeToIsciiDevanagariContext(void * pContext
)
484 UnicodeToIsciiDevanagari
*pCtx
=
485 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
// Tear-down hook for a Unicode -> ISCII Devanagari context: recovers the
// typed pointer from the untyped pContext.
// NOTE(review): the statement that uses pCtx is not visible in this listing —
// presumably it deletes the context; confirm against the full file.
489 void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext
)
491 UnicodeToIsciiDevanagari
*pCtx
=
492 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
496 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */