1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include <sal/config.h>
14 #include "converter.hxx"
15 #include "convertisciidevangari.hxx"
16 #include "convertsinglebytetobmpunicode.hxx"
18 #include <rtl/character.hxx>
19 #include <rtl/textcvt.h>
21 using namespace sal::detail::textenc
;
22 using namespace rtl::textenc
;
// Conversion state for the ISCII Devanagari -> Unicode direction.
26 struct IsciiDevanagariToUnicode
// Source byte remembered from the previous step of convert(); convert()
// compares it with 0xE8 (halant) to recognise the two-byte sequences
// E8 E8 and E8 E9, and stores either the current byte or 0 back into it.
28 sal_uInt8 m_cPrevChar
;
29 IsciiDevanagariToUnicode()
// Converts nSrcBytes bytes from pSrcBuf into at most nDestChars UTF-16 code
// units at pDestBuf.  Returns the number of code units written
// (pDestBufPtr - pDestBuf) and stores the number of consumed source bytes in
// *pSrcCvtBytes.  nFlags is forwarded to the bad-input handler and checked
// for RTL_TEXTTOUNICODE_FLAGS_FLUSH.
// NOTE(review): pInfo presumably receives the accumulated
// RTL_TEXTTOUNICODE_INFO_* bits -- confirm against the definition.
37 sal_Size
convert(char const* pSrcBuf
, sal_Size nSrcBytes
,
38 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
39 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
);
// Conversion state for the Unicode -> ISCII Devanagari direction.
42 struct UnicodeToIsciiDevanagari
// Previously seen character; convert() tests it against U+094D (virama) when
// the current character is U+200C or U+200D, emitting 0xE8 or 0xE9.
44 sal_Unicode m_cPrevChar
;
// High surrogate still waiting for its low surrogate.  convert() loads it
// into a local on entry and writes the local back before returning, so a
// surrogate pair may be split across two calls.  0 means "none pending".
45 sal_Unicode m_cHighSurrogate
;
46 UnicodeToIsciiDevanagari()
// Converts nSrcChars UTF-16 code units from pSrcBuf into at most nDestBytes
// bytes at pDestBuf.  Returns the number of bytes written
// (pDestBufPtr - pDestBuf) and stores the number of consumed source code
// units in *pSrcCvtChars.  nFlags is forwarded to the bad-input handler and
// checked for RTL_UNICODETOTEXT_FLAGS_FLUSH.
// NOTE(review): pInfo presumably receives the accumulated
// RTL_UNICODETOTEXT_INFO_* bits -- confirm against the definition.
56 sal_Size
convert(sal_Unicode
const* pSrcBuf
, sal_Size nSrcChars
,
57 char* pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
58 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
);
// Byte -> UTF-16 lookup table for ISCII Devanagari, indexed by the source
// byte value (IsciiDevanagariToUnicode::convert reads IsciiDevanagariMap[nIn]).
//  - 0x00..0x7F map to the identical code points U+0000..U+007F.
//  - 0x80..0xA0 have no mapping.
//  - 0xA1..0xFA map into the Devanagari block (U+0901..U+096F), except for
//    the unmapped bytes 0xD9, 0xEB..0xEF and 0xF0.
//  - 0xFB..0xFF have no mapping.
// An entry of 0xFFFF marks a byte without a mapping; convert() treats such a
// lookup result as undefined input (bUndefined = cChar == 0xffff).
63 const sal_Unicode IsciiDevanagariMap
[256] =
65 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
66 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
67 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
68 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
69 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
70 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
71 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
72 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
73 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
74 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
75 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
76 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
77 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
78 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
79 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
80 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
81 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
82 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
83 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
84 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
85 0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908,
86 0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912,
87 0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919,
88 0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921,
89 0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929,
90 0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930,
91 0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938,
92 0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
93 0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,
94 0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
95 0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
96 0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF
99 sal_Size
IsciiDevanagariToUnicode::convert(
100 char const* pSrcBuf
, sal_Size nSrcBytes
,
101 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
102 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
104 sal_uInt32 nInfo
= 0;
105 sal_Size nConverted
= 0;
106 sal_Unicode
* pDestBufPtr
= pDestBuf
;
107 sal_Unicode
* pDestBufEnd
= pDestBuf
+ nDestChars
;
108 sal_Size startOfCurrentChar
= 0;
110 while (nConverted
< nSrcBytes
)
112 if (pDestBufPtr
== pDestBufEnd
)
114 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
118 sal_Unicode cChar
= sal_Unicode();
119 sal_uInt8 nIn
= static_cast<sal_uInt8
>(pSrcBuf
[nConverted
]);
120 sal_uInt8 nNext
= nConverted
< nSrcBytes
+ 1 ? static_cast<sal_uInt8
>(pSrcBuf
[nConverted
+1]) : 0;
122 bool bDouble
= false;
123 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
124 //halant + nukta E8 E9 -> halant + ZWJ 094D 200D
125 if (m_cPrevChar
== 0xE8 && nIn
== 0xE8)
130 else if (m_cPrevChar
== 0xE8 && nIn
== 0xE9)
135 else if (nNext
== 0xE9)
198 cChar
= IsciiDevanagariMap
[nIn
];
200 bool bUndefined
= cChar
== 0xffff;
204 BadInputConversionAction eAction
= handleBadInputTextToUnicodeConversion(
205 bUndefined
, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
207 if (eAction
== BAD_INPUT_CONTINUE
) {
208 startOfCurrentChar
= nConverted
;
211 if (eAction
== BAD_INPUT_STOP
) {
212 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) != 0) {
213 nConverted
= startOfCurrentChar
;
217 assert(eAction
== BAD_INPUT_NO_OUTPUT
);
218 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
222 *pDestBufPtr
++ = cChar
;
223 m_cPrevChar
= bNormal
? nIn
: 0;
224 startOfCurrentChar
= nConverted
;
230 *pSrcCvtBytes
= nConverted
;
232 return pDestBufPtr
- pDestBuf
;
// Unicode -> ISCII Devanagari range table used by
// UnicodeToIsciiDevanagari::convert.  Each entry is
//   { first code point, (last - first) of the run, ISCII byte of the first }
// and a code point c inside a run is encoded as byte + (c - unicode).
// Entries are listed in ascending code-point order; the lookup loop compares
// c against ranges[i].unicode and ranges[i].unicode + ranges[i].range.
// The runs are the inverse of the defined entries of IsciiDevanagariMap
// (ASCII maps to itself, Devanagari code points map to 0xA1..0xFA).
235 BmpUnicodeToSingleByteRange
const unicodeToISCIIEncoding
[] =
237 { 0x0000, 0x007F - 0x0000, 0x00 }, { 0x0901, 0x0903 - 0x0901, 0xA1 },
238 { 0x0905, 0x090B - 0x0905, 0xA4 }, { 0x090D, 0x090D - 0x090D, 0xAE },
239 { 0x090E, 0x0910 - 0x090E, 0xAB }, { 0x0911, 0x0911 - 0x0911, 0xB2 },
240 { 0x0912, 0x0914 - 0x0912, 0xAF }, { 0x0915, 0x092F - 0x0915, 0xB3 },
241 { 0x0930, 0x0939 - 0x0930, 0xCF }, { 0x093C, 0x093C - 0x093C, 0xE9 },
242 { 0x093E, 0x0943 - 0x093E, 0xDA }, { 0x0945, 0x0945 - 0x0945, 0xE3 },
243 { 0x0946, 0x0948 - 0x0946, 0xE0 }, { 0x0949, 0x0949 - 0x0949, 0xE7 },
244 { 0x094A, 0x094C - 0x094A, 0xE4 }, { 0x094D, 0x094D - 0x094D, 0xE8 },
245 { 0x095F, 0x095F - 0x095F, 0xCE }, { 0x0964, 0x0964 - 0x0964, 0xEA },
246 { 0x0966, 0x096F - 0x0966, 0xF1 }
249 sal_Size
UnicodeToIsciiDevanagari::convert(sal_Unicode
const* pSrcBuf
, sal_Size nSrcChars
,
250 char* pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
251 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
253 size_t const entries
= SAL_N_ELEMENTS(unicodeToISCIIEncoding
);
254 BmpUnicodeToSingleByteRange
const * ranges
= unicodeToISCIIEncoding
;
256 sal_Unicode cHighSurrogate
= m_cHighSurrogate
;
257 sal_uInt32 nInfo
= 0;
258 sal_Size nConverted
= 0;
259 char* pDestBufPtr
= pDestBuf
;
260 char* pDestBufEnd
= pDestBuf
+ nDestBytes
;
261 for (; nConverted
< nSrcChars
; ++nConverted
)
263 bool bUndefined
= true;
264 sal_uInt32 c
= *pSrcBuf
++;
265 char cSpecialChar
= 0;
266 if (cHighSurrogate
== 0)
268 if (rtl::isHighSurrogate(c
))
270 cHighSurrogate
= static_cast< sal_Unicode
>(c
);
273 else if (rtl::isLowSurrogate(c
))
279 else if (rtl::isLowSurrogate(c
))
281 c
= rtl::combineSurrogates(cHighSurrogate
, c
);
288 assert(rtl::isUnicodeScalarValue(c
));
290 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
291 //halant + nukta E8 E9 -> halant + ZWJ 094D 200D
292 if (m_cPrevChar
== 0x094D && c
== 0x200C)
293 cSpecialChar
= '\xE8';
294 else if (m_cPrevChar
== 0x094D && c
== 0x200D)
295 cSpecialChar
= '\xE9';
298 if (pDestBufEnd
- pDestBufPtr
< 1)
300 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
303 *pDestBufPtr
++ = cSpecialChar
;
310 cSpecialChar
= '\xA1';
313 cSpecialChar
= '\xA6';
316 cSpecialChar
= '\xA7';
319 cSpecialChar
= '\xAA';
322 cSpecialChar
= '\xB3';
325 cSpecialChar
= '\xB4';
328 cSpecialChar
= '\xB5';
331 cSpecialChar
= '\xBA';
334 cSpecialChar
= '\xBF';
337 cSpecialChar
= '\xC0';
340 cSpecialChar
= '\xC9';
343 cSpecialChar
= '\xDB';
346 cSpecialChar
= '\xDC';
349 cSpecialChar
= '\xDF';
352 cSpecialChar
= '\xEA';
359 if (pDestBufEnd
- pDestBufPtr
< 2)
361 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
364 *pDestBufPtr
++ = cSpecialChar
;
365 *pDestBufPtr
++ = '\xE9';
370 // Linearly searching through the ranges is probably fastest, assuming
371 // that most converted characters belong to the ASCII subset:
372 for (size_t i
= 0; i
< entries
; ++i
)
374 if (c
< ranges
[i
].unicode
)
378 if (c
<= sal::static_int_cast
< sal_uInt32
>(
379 ranges
[i
].unicode
+ ranges
[i
].range
))
381 if (pDestBufEnd
- pDestBufPtr
< 1)
385 *pDestBufPtr
++ = static_cast< char >(
386 ranges
[i
].byte
+ (c
- ranges
[i
].unicode
));
396 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
397 bUndefined
, c
, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, nullptr,
400 case sal::detail::textenc::BAD_INPUT_STOP
:
404 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
408 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
414 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
418 if (cHighSurrogate
!= 0
420 & (RTL_UNICODETOTEXT_INFO_ERROR
421 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
424 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
426 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
430 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
431 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, nullptr,
434 case sal::detail::textenc::BAD_INPUT_STOP
:
435 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
439 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
440 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
445 m_cHighSurrogate
= cHighSurrogate
;
449 *pSrcCvtChars
= nConverted
;
451 return pDestBufPtr
- pDestBuf
;
// Free-function entry point for ISCII Devanagari -> Unicode conversion.
// The first (unnamed) parameter is ignored.  pContext is cast to
// IsciiDevanagariToUnicode* and dereferenced without a null check, so it must
// point to a live IsciiDevanagariToUnicode.  All remaining arguments are
// passed unchanged to IsciiDevanagariToUnicode::convert, whose return value
// is returned.
454 sal_Size
ImplConvertIsciiDevanagariToUnicode(void const*,
455 void* pContext
, char const* pSrcBuf
, sal_Size nSrcBytes
,
456 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
457 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
459 IsciiDevanagariToUnicode
*pCtx
=
460 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
461 return pCtx
->convert(pSrcBuf
, nSrcBytes
, pDestBuf
, nDestChars
, nFlags
,
462 pInfo
, pSrcCvtBytes
);
// Free-function entry point for Unicode -> ISCII Devanagari conversion.
// The first (unnamed) parameter is ignored.  pContext is cast to
// UnicodeToIsciiDevanagari* and dereferenced without a null check, so it must
// point to a live UnicodeToIsciiDevanagari.  All remaining arguments are
// passed unchanged to UnicodeToIsciiDevanagari::convert, whose return value
// is returned.
465 sal_Size
ImplConvertUnicodeToIsciiDevanagari(void const*,
466 void * pContext
, sal_Unicode
const * pSrcBuf
, sal_Size nSrcChars
,
467 char * pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
468 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
470 UnicodeToIsciiDevanagari
*pCtx
=
471 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
472 return pCtx
->convert(pSrcBuf
, nSrcChars
,
473 pDestBuf
, nDestBytes
, nFlags
, pInfo
, pSrcCvtChars
);
// Allocates a fresh IsciiDevanagariToUnicode state object with `new` and
// returns it as an opaque void*.  Nothing in this function releases the
// object.
476 void *ImplCreateIsciiDevanagariToUnicodeContext()
478 return new IsciiDevanagariToUnicode
;
481 void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext
)
483 IsciiDevanagariToUnicode
*pCtx
=
484 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
488 void ImplResetIsciiDevanagariToUnicodeContext(void * pContext
)
490 IsciiDevanagariToUnicode
*pCtx
=
491 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
// Allocates a fresh UnicodeToIsciiDevanagari state object with `new` and
// returns it as an opaque void*.  Nothing in this function releases the
// object.
495 void *ImplCreateUnicodeToIsciiDevanagariContext()
497 return new UnicodeToIsciiDevanagari
;
500 void ImplResetUnicodeToIsciiDevanagariContext(void * pContext
)
502 UnicodeToIsciiDevanagari
*pCtx
=
503 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
507 void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext
)
509 UnicodeToIsciiDevanagari
*pCtx
=
510 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
514 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */