1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include "converter.hxx"
11 #include "unichars.hxx"
12 #include "convertisciidevangari.hxx"
13 #include "convertsinglebytetobmpunicode.hxx"
14 #include <rtl/textcvt.h>
16 using namespace sal::detail::textenc
;
17 using namespace rtl::textenc
;
19 struct IsciiDevanagariToUnicode
21 sal_uInt8 m_cPrevChar
;
22 IsciiDevanagariToUnicode()
30 sal_Size
convert(char const* pSrcBuf
, sal_Size nSrcBytes
,
31 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
32 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
);
35 struct UnicodeToIsciiDevanagari
37 sal_Unicode m_cPrevChar
;
38 sal_Unicode m_cHighSurrogate
;
39 UnicodeToIsciiDevanagari()
49 sal_Size
convert(sal_Unicode
const* pSrcBuf
, sal_Size nSrcChars
,
50 char* pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
51 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
);
// Lookup table indexed by an ISCII byte value (0x00-0xFF) giving the Unicode
// code unit for that byte.  The first 128 entries map each byte to itself.
// 0xFFFF is the "no mapping" sentinel: IsciiDevanagariToUnicode::convert
// compares the looked-up value against 0xffff to flag undefined input.
54 static const sal_Unicode IsciiDevanagariMap
[256] =
56 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
57 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
58 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
59 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
60 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
61 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
62 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
63 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
64 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
65 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
66 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
67 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
68 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
69 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
70 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
71 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
72 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
73 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
74 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
75 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
76 0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908,
77 0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912,
78 0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919,
79 0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921,
80 0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929,
81 0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930,
82 0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938,
83 0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
84 0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,
85 0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
86 0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
87 0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF
90 sal_Size
IsciiDevanagariToUnicode::convert(
91 char const* pSrcBuf
, sal_Size nSrcBytes
,
92 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
93 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
96 sal_Size nConverted
= 0;
97 sal_Unicode
* pDestBufPtr
= pDestBuf
;
98 sal_Unicode
* pDestBufEnd
= pDestBuf
+ nDestChars
;
100 while (nConverted
< nSrcBytes
)
102 if (pDestBufPtr
== pDestBufEnd
)
104 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
108 sal_Unicode cChar
= sal_Unicode();
109 sal_uInt8 nIn
= static_cast<sal_uInt8
>(pSrcBuf
[nConverted
]);
// Peek at the following byte only when one exists; on the last byte of the
// input nNext is 0.  The previous guard (nConverted < nSrcBytes + 1) was true
// on every iteration of the enclosing while (nConverted < nSrcBytes) loop, so
// pSrcBuf[nSrcBytes] was read one element past the end of the source buffer.
110 sal_uInt8 nNext
= nConverted
+ 1 < nSrcBytes
? static_cast<sal_uInt8
>(pSrcBuf
[nConverted
+1]) : 0;
112 bool bDouble
= false;
113 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
114 //halant + nukta E8 E9 -> halant + ZWJ 094D 200D
115 if (m_cPrevChar
== 0xE8 && nIn
== 0xE8)
120 else if (m_cPrevChar
== 0xE8 && nIn
== 0xE9)
125 else if (nNext
== 0xE9)
184 cChar
= IsciiDevanagariMap
[nIn
];
186 bool bUndefined
= cChar
== 0xffff;
190 BadInputConversionAction eAction
= handleBadInputTextToUnicodeConversion(
191 bUndefined
, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
193 if (eAction
== BAD_INPUT_CONTINUE
)
195 if (eAction
== BAD_INPUT_STOP
)
197 else if (eAction
== BAD_INPUT_NO_OUTPUT
)
199 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
207 *pDestBufPtr
++ = cChar
;
208 m_cPrevChar
= bNormal
? nIn
: 0;
214 *pSrcCvtBytes
= nConverted
;
216 return pDestBufPtr
- pDestBuf
;
// Unicode -> ISCII range table used by UnicodeToIsciiDevanagari::convert.
// Each initializer reads as { first Unicode code point, span (last - first),
// ISCII byte of the first code point } -- presumably matching the .unicode,
// .range and .byte members the converter accesses (struct declared elsewhere;
// confirm field order).  For a code point c inside a range the converter
// emits byte + (c - unicode).  Entries appear in ascending Unicode order.
219 BmpUnicodeToSingleByteRange
const unicodeToISCIIEncoding
[] =
221 { 0x0000, 0x007F - 0x0000, 0x00 }, { 0x0901, 0x0903 - 0x0901, 0xA1 },
222 { 0x0905, 0x090B - 0x0905, 0xA4 }, { 0x090D, 0x090D - 0x090D, 0xAE },
223 { 0x090E, 0x0910 - 0x090E, 0xAB }, { 0x0911, 0x0911 - 0x0911, 0xB2 },
224 { 0x0912, 0x0914 - 0x0912, 0xAF }, { 0x0915, 0x092F - 0x0915, 0xB3 },
225 { 0x0930, 0x0939 - 0x0930, 0xCF }, { 0x093C, 0x093C - 0x093C, 0xE9 },
226 { 0x093E, 0x0943 - 0x093E, 0xDA }, { 0x0945, 0x0945 - 0x0945, 0xE3 },
227 { 0x0946, 0x0948 - 0x0946, 0xE0 }, { 0x0949, 0x0949 - 0x0949, 0xE7 },
228 { 0x094A, 0x094C - 0x094A, 0xE4 }, { 0x094D, 0x094D - 0x094D, 0xE8 },
229 { 0x095F, 0x095F - 0x095F, 0xCE }, { 0x0964, 0x0964 - 0x0964, 0xEA },
230 { 0x0966, 0x096F - 0x0966, 0xF1 }
233 sal_Size
UnicodeToIsciiDevanagari::convert(sal_Unicode
const* pSrcBuf
, sal_Size nSrcChars
,
234 char* pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
235 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
237 size_t entries
= SAL_N_ELEMENTS(unicodeToISCIIEncoding
);
238 BmpUnicodeToSingleByteRange
const * ranges
= unicodeToISCIIEncoding
;
240 sal_Unicode cHighSurrogate
= m_cHighSurrogate
;
241 sal_uInt32 nInfo
= 0;
242 sal_Size nConverted
= 0;
243 sal_Char
* pDestBufPtr
= pDestBuf
;
244 sal_Char
* pDestBufEnd
= pDestBuf
+ nDestBytes
;
245 for (; nConverted
< nSrcChars
; ++nConverted
)
247 bool bUndefined
= true;
248 sal_uInt32 c
= *pSrcBuf
++;
249 sal_Char cSpecialChar
= 0;
250 if (cHighSurrogate
== 0)
252 if (ImplIsHighSurrogate(c
))
254 cHighSurrogate
= static_cast< sal_Unicode
>(c
);
258 else if (ImplIsLowSurrogate(c
))
260 c
= ImplCombineSurrogates(cHighSurrogate
, c
);
267 if (ImplIsLowSurrogate(c
) || ImplIsNoncharacter(c
))
273 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
274 //halant + nukta E8 E9 -> halant + ZWJ 094D 200D
275 if (m_cPrevChar
== 0x094D && c
== 0x200C)
276 cSpecialChar
= '\xE8';
277 else if (m_cPrevChar
== 0x094D && c
== 0x200D)
278 cSpecialChar
= '\xE9';
281 if (pDestBufEnd
- pDestBufPtr
< 1)
283 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
286 *pDestBufPtr
++ = cSpecialChar
;
293 cSpecialChar
= '\xA1';
296 cSpecialChar
= '\xA6';
299 cSpecialChar
= '\xA7';
302 cSpecialChar
= '\xAA';
305 cSpecialChar
= '\xB3';
308 cSpecialChar
= '\xB4';
311 cSpecialChar
= '\xB5';
314 cSpecialChar
= '\xBA';
317 cSpecialChar
= '\xBF';
320 cSpecialChar
= '\xC0';
323 cSpecialChar
= '\xC9';
326 cSpecialChar
= '\xDB';
329 cSpecialChar
= '\xDC';
332 cSpecialChar
= '\xDF';
335 cSpecialChar
= '\xEA';
342 if (pDestBufEnd
- pDestBufPtr
< 2)
344 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
347 *pDestBufPtr
++ = cSpecialChar
;
348 *pDestBufPtr
++ = '\xE9';
353 // Linearly searching through the ranges is probably fastest, assuming
354 // that most converted characters belong to the ASCII subset:
355 for (size_t i
= 0; i
< entries
; ++i
)
357 if (c
< ranges
[i
].unicode
)
361 else if (c
<= sal::static_int_cast
< sal_uInt32
>(
362 ranges
[i
].unicode
+ ranges
[i
].range
))
364 if (pDestBufEnd
- pDestBufPtr
< 1)
368 *pDestBufPtr
++ = static_cast< sal_Char
>(
369 ranges
[i
].byte
+ (c
- ranges
[i
].unicode
));
379 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
380 bUndefined
, c
, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, nullptr,
383 case sal::detail::textenc::BAD_INPUT_STOP
:
387 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
391 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
397 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
401 if (cHighSurrogate
!= 0
403 & (RTL_UNICODETOTEXT_INFO_ERROR
404 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
407 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
409 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
413 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
414 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, nullptr,
417 case sal::detail::textenc::BAD_INPUT_STOP
:
418 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
422 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
423 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
428 m_cHighSurrogate
= cHighSurrogate
;
432 *pSrcCvtChars
= nConverted
;
434 return pDestBufPtr
- pDestBuf
;
// Free-function entry point for ISCII Devanagari -> Unicode conversion.
// Casts the opaque pContext to IsciiDevanagariToUnicode* and forwards all
// remaining arguments unchanged to its convert() member, returning convert()'s
// result.  The leading unnamed void const* parameter is not used here.
437 sal_Size
ImplConvertIsciiDevanagariToUnicode(void const*,
438 void* pContext
, char const* pSrcBuf
, sal_Size nSrcBytes
,
439 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
440 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
442 IsciiDevanagariToUnicode
*pCtx
=
443 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
444 return pCtx
->convert(pSrcBuf
, nSrcBytes
, pDestBuf
, nDestChars
, nFlags
,
445 pInfo
, pSrcCvtBytes
);
// Free-function entry point for Unicode -> ISCII Devanagari conversion.
// Casts the opaque pContext to UnicodeToIsciiDevanagari* and forwards all
// remaining arguments unchanged to its convert() member, returning convert()'s
// result.  The leading unnamed void const* parameter is not used here.
448 sal_Size
ImplConvertUnicodeToIsciiDevanagari(void const*,
449 void * pContext
, sal_Unicode
const * pSrcBuf
, sal_Size nSrcChars
,
450 char * pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
451 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
453 UnicodeToIsciiDevanagari
*pCtx
=
454 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
455 return pCtx
->convert(pSrcBuf
, nSrcChars
,
456 pDestBuf
, nDestBytes
, nFlags
, pInfo
, pSrcCvtChars
);
// Allocates a new IsciiDevanagariToUnicode state object with new and returns
// it as an opaque void*.  NOTE(review): the caller presumably releases it via
// ImplDestroyIsciiDevanagariToUnicodeContext -- confirm against that function.
459 void *ImplCreateIsciiDevanagariToUnicodeContext()
461 return new IsciiDevanagariToUnicode
;
464 void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext
)
466 IsciiDevanagariToUnicode
*pCtx
=
467 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
471 void ImplResetIsciiDevanagariToUnicodeContext(void * pContext
)
473 IsciiDevanagariToUnicode
*pCtx
=
474 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
// Allocates a new UnicodeToIsciiDevanagari state object with new and returns
// it as an opaque void*.  NOTE(review): the caller presumably releases it via
// ImplDestroyUnicodeToIsciiDevanagariContext -- confirm against that function.
478 void *ImplCreateUnicodeToIsciiDevanagariContext()
480 return new UnicodeToIsciiDevanagari
;
483 void ImplResetUnicodeToIsciiDevanagariContext(void * pContext
)
485 UnicodeToIsciiDevanagari
*pCtx
=
486 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
490 void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext
)
492 UnicodeToIsciiDevanagari
*pCtx
=
493 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
497 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */