1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include <sal/config.h>
14 #include "converter.hxx"
15 #include "unichars.hxx"
16 #include "convertisciidevangari.hxx"
17 #include "convertsinglebytetobmpunicode.hxx"
19 #include <rtl/character.hxx>
20 #include <rtl/textcvt.h>
22 using namespace sal::detail::textenc
;
23 using namespace rtl::textenc
;
// Conversion state for the ISCII Devanagari -> Unicode direction. Instances are
// created by ImplCreateIsciiDevanagariToUnicodeContext() and handed around as an
// opaque void* context.
27 struct IsciiDevanagariToUnicode
// Input byte remembered from the previous conversion step. convert() compares it
// with 0xE8 (halant) to recognise the two-byte halant sequences, and stores 0
// instead of the byte whenever the step was not a "normal" one.
29 sal_uInt8 m_cPrevChar
;
30 IsciiDevanagariToUnicode()
// Converts up to nSrcBytes ISCII bytes from pSrcBuf into at most nDestChars
// UTF-16 code units at pDestBuf. Returns the number of code units written and
// stores the number of source bytes consumed in *pSrcCvtBytes.
// nFlags is forwarded to the bad-input handler.
// NOTE(review): pInfo presumably receives the accumulated
// RTL_TEXTTOUNICODE_INFO_* bits - confirm against the definition.
38 sal_Size
convert(char const* pSrcBuf
, sal_Size nSrcBytes
,
39 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
40 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
);
// Conversion state for the Unicode -> ISCII Devanagari direction. Instances are
// created by ImplCreateUnicodeToIsciiDevanagariContext() and handed around as an
// opaque void* context.
43 struct UnicodeToIsciiDevanagari
// Previously seen UTF-16 unit; convert() compares it with U+094D (halant) to
// decide whether a following U+200C / U+200D maps to a single ISCII byte.
45 sal_Unicode m_cPrevChar
;
// High surrogate left pending at the end of one convert() call; convert() loads
// it on entry and writes the current pending value back before returning.
46 sal_Unicode m_cHighSurrogate
;
47 UnicodeToIsciiDevanagari()
// Converts up to nSrcChars UTF-16 code units from pSrcBuf into at most
// nDestBytes ISCII bytes at pDestBuf. Returns the number of bytes written and
// stores the number of source units consumed in *pSrcCvtChars.
// nFlags is forwarded to the bad-input handler.
// NOTE(review): pInfo presumably receives the accumulated
// RTL_UNICODETOTEXT_INFO_* bits - confirm against the definition.
57 sal_Size
convert(sal_Unicode
const* pSrcBuf
, sal_Size nSrcChars
,
58 char* pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
59 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
);
// Lookup table from a single ISCII byte (used as the index) to a UTF-16 code
// unit. Bytes 0x00-0x7F map to the identical code point; 0xFFFF marks a byte
// without a mapping (IsciiDevanagariToUnicode::convert treats that value as
// undefined input).
64 const sal_Unicode IsciiDevanagariMap
[256] =
66 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
67 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
68 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
69 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
70 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
71 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
72 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
73 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
74 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
75 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
76 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
77 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
78 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
79 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
80 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
81 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
82 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
83 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
84 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
85 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
86 0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908,
87 0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912,
88 0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919,
89 0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921,
90 0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929,
91 0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930,
92 0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938,
93 0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
94 0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,
95 0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
96 0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
97 0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF
// Converts ISCII Devanagari bytes to UTF-16.
//   pSrcBuf / nSrcBytes   : input bytes and their count.
//   pDestBuf / nDestChars : output buffer and its capacity in sal_Unicode units.
//   nFlags                : RTL_TEXTTOUNICODE_FLAGS_* bits; forwarded to the
//                           bad-input handler, and FLUSH makes a stop rewind to
//                           the start of the current character.
//   pSrcCvtBytes          : receives the number of source bytes consumed.
//   pInfo                 : NOTE(review): presumably receives the accumulated
//                           RTL_TEXTTOUNICODE_INFO_* bits - confirm.
// Returns the number of sal_Unicode units written to pDestBuf.
100 sal_Size
IsciiDevanagariToUnicode::convert(
101 char const* pSrcBuf
, sal_Size nSrcBytes
,
102 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
103 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
105 sal_uInt32 nInfo
= 0;
106 sal_Size nConverted
= 0;
107 sal_Unicode
* pDestBufPtr
= pDestBuf
;
108 sal_Unicode
* pDestBufEnd
= pDestBuf
+ nDestChars
;
// Source offset at which the character currently being decoded began; used to
// rewind nConverted when conversion stops on bad input with FLUSH set.
109 sal_Size startOfCurrentChar
= 0;
111 while (nConverted
< nSrcBytes
)
// No room left for even one more output unit.
113 if (pDestBufPtr
== pDestBufEnd
)
115 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
119 sal_Unicode cChar
= sal_Unicode();
120 sal_uInt8 nIn
= static_cast<sal_uInt8
>(pSrcBuf
[nConverted
]);
// Look ahead one byte, but only when that byte lies inside the source buffer.
// The loop guarantees nConverted < nSrcBytes, so the guard has to test
// nConverted + 1; otherwise the last input byte would trigger a read of
// pSrcBuf[nSrcBytes], one past the end.
121 sal_uInt8 nNext
= nConverted
+ 1 < nSrcBytes
? static_cast<sal_uInt8
>(pSrcBuf
[nConverted
+1]) : 0;
123 bool bDouble
= false;
124 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
125 //halant + nukta  E8 E9 -> halant + ZWJ  094D 200D
126 if (m_cPrevChar
== 0xE8 && nIn
== 0xE8)
131 else if (m_cPrevChar
== 0xE8 && nIn
== 0xE9)
// A following 0xE9 (nukta) byte may combine with the current byte.
136 else if (nNext
== 0xE9)
// Plain single-byte lookup.
199 cChar
= IsciiDevanagariMap
[nIn
];
// 0xFFFF is the table's marker for "no mapping".
201 bool bUndefined
= cChar
== 0xffff;
205 BadInputConversionAction eAction
= handleBadInputTextToUnicodeConversion(
206 bUndefined
, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
208 if (eAction
== BAD_INPUT_CONTINUE
) {
209 startOfCurrentChar
= nConverted
;
212 if (eAction
== BAD_INPUT_STOP
) {
// With FLUSH set, report only the bytes before the offending character as consumed.
213 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) != 0) {
214 nConverted
= startOfCurrentChar
;
218 assert(eAction
== BAD_INPUT_NO_OUTPUT
);
219 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
223 *pDestBufPtr
++ = cChar
;
// Remember the byte only after a normal step, so that the halant checks above
// do not fire on the output of a special sequence.
224 m_cPrevChar
= bNormal
? nIn
: 0;
225 startOfCurrentChar
= nConverted
;
231 *pSrcCvtBytes
= nConverted
;
233 return pDestBufPtr
- pDestBuf
;
// Ranges used by UnicodeToIsciiDevanagari::convert for the Unicode -> ISCII
// direction. Each entry is { first code point, last - first, ISCII byte of the
// first code point }; convert() reads the three fields as .unicode, .range and
// .byte. Entries are listed in ascending code point order.
236 BmpUnicodeToSingleByteRange
const unicodeToISCIIEncoding
[] =
238 { 0x0000, 0x007F - 0x0000, 0x00 }, { 0x0901, 0x0903 - 0x0901, 0xA1 },
239 { 0x0905, 0x090B - 0x0905, 0xA4 }, { 0x090D, 0x090D - 0x090D, 0xAE },
240 { 0x090E, 0x0910 - 0x090E, 0xAB }, { 0x0911, 0x0911 - 0x0911, 0xB2 },
241 { 0x0912, 0x0914 - 0x0912, 0xAF }, { 0x0915, 0x092F - 0x0915, 0xB3 },
242 { 0x0930, 0x0939 - 0x0930, 0xCF }, { 0x093C, 0x093C - 0x093C, 0xE9 },
243 { 0x093E, 0x0943 - 0x093E, 0xDA }, { 0x0945, 0x0945 - 0x0945, 0xE3 },
244 { 0x0946, 0x0948 - 0x0946, 0xE0 }, { 0x0949, 0x0949 - 0x0949, 0xE7 },
245 { 0x094A, 0x094C - 0x094A, 0xE4 }, { 0x094D, 0x094D - 0x094D, 0xE8 },
246 { 0x095F, 0x095F - 0x095F, 0xCE }, { 0x0964, 0x0964 - 0x0964, 0xEA },
247 { 0x0966, 0x096F - 0x0966, 0xF1 }
// Converts UTF-16 code units to ISCII Devanagari bytes.
//   pSrcBuf / nSrcChars   : input code units and their count.
//   pDestBuf / nDestBytes : output buffer and its capacity in bytes.
//   nFlags                : RTL_UNICODETOTEXT_FLAGS_* bits; forwarded to the
//                           bad-input handler.
//   pSrcCvtChars          : receives the number of source units consumed.
//   pInfo                 : NOTE(review): presumably receives the accumulated
//                           RTL_UNICODETOTEXT_INFO_* bits - confirm.
// Returns the number of bytes written to pDestBuf.
250 sal_Size
UnicodeToIsciiDevanagari::convert(sal_Unicode
const* pSrcBuf
, sal_Size nSrcChars
,
251 char* pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
252 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
254 size_t const entries
= SAL_N_ELEMENTS(unicodeToISCIIEncoding
);
255 BmpUnicodeToSingleByteRange
const * ranges
= unicodeToISCIIEncoding
;
// Work on a local copy of the pending high surrogate; it is written back to the
// member before returning.
257 sal_Unicode cHighSurrogate
= m_cHighSurrogate
;
258 sal_uInt32 nInfo
= 0;
259 sal_Size nConverted
= 0;
260 char* pDestBufPtr
= pDestBuf
;
261 char* pDestBufEnd
= pDestBuf
+ nDestBytes
;
262 for (; nConverted
< nSrcChars
; ++nConverted
)
264 bool bUndefined
= true;
265 sal_uInt32 c
= *pSrcBuf
++;
// ISCII byte chosen by the special-case handling below; 0 means none chosen yet.
266 char cSpecialChar
= 0;
// Surrogate handling: with no high surrogate pending, a high surrogate is
// stored for the next unit; with one pending, a low surrogate is combined with
// it into a single code point.
267 if (cHighSurrogate
== 0)
269 if (rtl::isHighSurrogate(c
))
271 cHighSurrogate
= static_cast< sal_Unicode
>(c
);
274 else if (rtl::isLowSurrogate(c
))
280 else if (rtl::isLowSurrogate(c
))
282 c
= rtl::combineSurrogates(cHighSurrogate
, c
);
289 assert(rtl::isUnicodeScalarValue(c
));
291 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
292 //halant + nukta  E8 E9 -> halant + ZWJ  094D 200D
293 if (m_cPrevChar
== 0x094D && c
== 0x200C)
294 cSpecialChar
= '\xE8';
295 else if (m_cPrevChar
== 0x094D && c
== 0x200D)
296 cSpecialChar
= '\xE9';
// Single-byte special output needs one free byte in the destination.
299 if (pDestBufEnd
- pDestBufPtr
< 1)
301 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
304 *pDestBufPtr
++ = cSpecialChar
;
// Base bytes for the two-byte outputs emitted further down (base byte followed
// by 0xE9).
311 cSpecialChar
= '\xA1';
314 cSpecialChar
= '\xA6';
317 cSpecialChar
= '\xA7';
320 cSpecialChar
= '\xAA';
323 cSpecialChar
= '\xB3';
326 cSpecialChar
= '\xB4';
329 cSpecialChar
= '\xB5';
332 cSpecialChar
= '\xBA';
335 cSpecialChar
= '\xBF';
338 cSpecialChar
= '\xC0';
341 cSpecialChar
= '\xC9';
344 cSpecialChar
= '\xDB';
347 cSpecialChar
= '\xDC';
350 cSpecialChar
= '\xDF';
353 cSpecialChar
= '\xEA';
// Two-byte special output needs two free bytes in the destination.
360 if (pDestBufEnd
- pDestBufPtr
< 2)
362 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
365 *pDestBufPtr
++ = cSpecialChar
;
366 *pDestBufPtr
++ = '\xE9';
371 // Linearly searching through the ranges is probably fastest, assuming
372 // that most converted characters belong to the ASCII subset:
373 for (size_t i
= 0; i
< entries
; ++i
)
// c lies below the start of this range.
375 if (c
< ranges
[i
].unicode
)
// c lies inside [unicode, unicode + range]: emit the range's first byte plus
// the offset of c within the range.
379 if (c
<= sal::static_int_cast
< sal_uInt32
>(
380 ranges
[i
].unicode
+ ranges
[i
].range
))
382 if (pDestBufEnd
- pDestBufPtr
< 1)
386 *pDestBufPtr
++ = static_cast< char >(
387 ranges
[i
].byte
+ (c
- ranges
[i
].unicode
));
// Bad-input path: let the shared handler decide how to proceed for c.
397 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
398 bUndefined
, c
, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, nullptr,
401 case sal::detail::textenc::BAD_INPUT_STOP
:
405 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
409 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
415 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
// Handling of a high surrogate that is still pending after the loop.
419 if (cHighSurrogate
!= 0
421 & (RTL_UNICODETOTEXT_INFO_ERROR
422 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
425 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
427 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
431 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
432 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, nullptr,
435 case sal::detail::textenc::BAD_INPUT_STOP
:
436 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
440 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
441 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
// Persist the pending high surrogate (if any) for the next call.
446 m_cHighSurrogate
= cHighSurrogate
;
450 *pSrcCvtChars
= nConverted
;
452 return pDestBufPtr
- pDestBuf
;
// Free-function entry point for ISCII Devanagari -> Unicode conversion.
// pContext is cast to IsciiDevanagariToUnicode and every other argument is
// forwarded unchanged to its convert() member, whose result is returned. The
// first (unnamed) parameter is not used.
455 sal_Size
ImplConvertIsciiDevanagariToUnicode(void const*,
456 void* pContext
, char const* pSrcBuf
, sal_Size nSrcBytes
,
457 sal_Unicode
* pDestBuf
, sal_Size nDestChars
, sal_uInt32 nFlags
,
458 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
460 IsciiDevanagariToUnicode
*pCtx
=
461 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
462 return pCtx
->convert(pSrcBuf
, nSrcBytes
, pDestBuf
, nDestChars
, nFlags
,
463 pInfo
, pSrcCvtBytes
);
// Free-function entry point for Unicode -> ISCII Devanagari conversion.
// pContext is cast to UnicodeToIsciiDevanagari and every other argument is
// forwarded unchanged to its convert() member, whose result is returned. The
// first (unnamed) parameter is not used.
466 sal_Size
ImplConvertUnicodeToIsciiDevanagari(void const*,
467 void * pContext
, sal_Unicode
const * pSrcBuf
, sal_Size nSrcChars
,
468 char * pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
469 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
471 UnicodeToIsciiDevanagari
*pCtx
=
472 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
473 return pCtx
->convert(pSrcBuf
, nSrcChars
,
474 pDestBuf
, nDestBytes
, nFlags
, pInfo
, pSrcCvtChars
);
// Allocates a new IsciiDevanagariToUnicode state object and returns it as an
// opaque void* context.
477 void *ImplCreateIsciiDevanagariToUnicodeContext()
479 return new IsciiDevanagariToUnicode
;
// Counterpart of ImplCreateIsciiDevanagariToUnicodeContext(): recovers the typed
// state object from the opaque context pointer.
// NOTE(review): presumably deletes the object - confirm.
482 void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext
)
484 IsciiDevanagariToUnicode
*pCtx
=
485 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
// Recovers the typed IsciiDevanagariToUnicode state object from the opaque
// context pointer.
// NOTE(review): presumably resets its conversion state (m_cPrevChar) - confirm.
489 void ImplResetIsciiDevanagariToUnicodeContext(void * pContext
)
491 IsciiDevanagariToUnicode
*pCtx
=
492 static_cast<IsciiDevanagariToUnicode
*>(pContext
);
// Allocates a new UnicodeToIsciiDevanagari state object and returns it as an
// opaque void* context.
496 void *ImplCreateUnicodeToIsciiDevanagariContext()
498 return new UnicodeToIsciiDevanagari
;
// Recovers the typed UnicodeToIsciiDevanagari state object from the opaque
// context pointer.
// NOTE(review): presumably resets its conversion state (m_cPrevChar,
// m_cHighSurrogate) - confirm.
501 void ImplResetUnicodeToIsciiDevanagariContext(void * pContext
)
503 UnicodeToIsciiDevanagari
*pCtx
=
504 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
// Counterpart of ImplCreateUnicodeToIsciiDevanagariContext(): recovers the typed
// state object from the opaque context pointer.
// NOTE(review): presumably deletes the object - confirm.
508 void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext
)
510 UnicodeToIsciiDevanagari
*pCtx
=
511 static_cast<UnicodeToIsciiDevanagari
*>(pContext
);
515 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */