Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / sal / textenc / convertisciidevangari.cxx
blob75fcadcf7fe4342bcea9a8ddcddd36b4bdac809c
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #include <sal/config.h>
12 #include <cassert>
14 #include "converter.hxx"
15 #include "unichars.hxx"
16 #include "convertisciidevangari.hxx"
17 #include "convertsinglebytetobmpunicode.hxx"
19 #include <rtl/character.hxx>
20 #include <rtl/textcvt.h>
22 using namespace sal::detail::textenc;
23 using namespace rtl::textenc;
25 namespace {
27 struct IsciiDevanagariToUnicode
29 sal_uInt8 m_cPrevChar;
30 IsciiDevanagariToUnicode()
31 : m_cPrevChar(0)
34 void reset()
36 m_cPrevChar = 0;
38 sal_Size convert(char const* pSrcBuf, sal_Size nSrcBytes,
39 sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
40 sal_uInt32* pInfo, sal_Size* pSrcCvtBytes);
43 struct UnicodeToIsciiDevanagari
45 sal_Unicode m_cPrevChar;
46 sal_Unicode m_cHighSurrogate;
47 UnicodeToIsciiDevanagari()
48 : m_cPrevChar(0)
49 , m_cHighSurrogate(0)
52 void reset()
54 m_cPrevChar = 0;
55 m_cHighSurrogate = 0;
57 sal_Size convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
58 char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
59 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars);
64 const sal_Unicode IsciiDevanagariMap[256] =
66 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
67 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
68 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
69 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
70 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
71 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
72 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
73 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
74 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
75 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
76 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
77 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
78 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
79 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
80 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
81 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
82 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
83 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
84 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
85 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
86 0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908,
87 0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912,
88 0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919,
89 0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921,
90 0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929,
91 0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930,
92 0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938,
93 0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
94 0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,
95 0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
96 0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
97 0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF
100 sal_Size IsciiDevanagariToUnicode::convert(
101 char const* pSrcBuf, sal_Size nSrcBytes,
102 sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
103 sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
105 sal_uInt32 nInfo = 0;
106 sal_Size nConverted = 0;
107 sal_Unicode* pDestBufPtr = pDestBuf;
108 sal_Unicode* pDestBufEnd = pDestBuf + nDestChars;
109 sal_Size startOfCurrentChar = 0;
111 while (nConverted < nSrcBytes)
113 if (pDestBufPtr == pDestBufEnd)
115 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
116 break;
119 sal_Unicode cChar = sal_Unicode();
120 sal_uInt8 nIn = static_cast<sal_uInt8>(pSrcBuf[nConverted]);
121 sal_uInt8 nNext = nConverted < nSrcBytes + 1 ? static_cast<sal_uInt8>(pSrcBuf[nConverted+1]) : 0;
122 bool bNormal = true;
123 bool bDouble = false;
124 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
125 //halant + nukta E8 E9 halant + ZWJ 094D 200D
126 if (m_cPrevChar == 0xE8 && nIn == 0xE8)
128 cChar = 0x200C;
129 bNormal = false;
131 else if (m_cPrevChar == 0xE8 && nIn == 0xE9)
133 cChar = 0x200D;
134 bNormal = false;
136 else if (nNext == 0xE9)
138 bNormal = false;
139 bDouble = true;
140 switch(nIn)
142 case 0xA1:
143 cChar = 0x0950;
144 break;
145 case 0xA6:
146 cChar = 0x090C;
147 break;
148 case 0xA7:
149 cChar = 0x0961;
150 break;
151 case 0xAA:
152 cChar = 0x0960;
153 break;
154 case 0xB3:
155 cChar = 0x0958;
156 break;
157 case 0xB4:
158 cChar = 0x0959;
159 break;
160 case 0xB5:
161 cChar = 0x095A;
162 break;
163 case 0xBA:
164 cChar = 0x095B;
165 break;
166 case 0xBF:
167 cChar = 0x095C;
168 break;
169 case 0xC0:
170 cChar = 0x095D;
171 break;
172 case 0xC9:
173 cChar = 0x095E;
174 break;
175 case 0xDB:
176 cChar = 0x0962;
177 break;
178 case 0xDC:
179 cChar = 0x0963;
180 break;
181 case 0xDF:
182 cChar = 0x0944;
183 break;
184 case 0xEA:
185 cChar = 0x093D;
186 break;
187 default:
188 bNormal = true;
189 bDouble = false;
190 break;
194 ++nConverted;
195 if (bDouble)
196 ++nConverted;
198 if (bNormal)
199 cChar = IsciiDevanagariMap[nIn];
201 bool bUndefined = cChar == 0xffff;
203 if (bUndefined)
205 BadInputConversionAction eAction = handleBadInputTextToUnicodeConversion(
206 bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
207 &nInfo);
208 if (eAction == BAD_INPUT_CONTINUE) {
209 startOfCurrentChar = nConverted;
210 continue;
212 if (eAction == BAD_INPUT_STOP) {
213 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
214 nConverted = startOfCurrentChar;
216 break;
218 assert(eAction == BAD_INPUT_NO_OUTPUT);
219 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
220 break;
223 *pDestBufPtr++ = cChar;
224 m_cPrevChar = bNormal ? nIn : 0;
225 startOfCurrentChar = nConverted;
228 if (pInfo)
229 *pInfo = nInfo;
230 if (pSrcCvtBytes)
231 *pSrcCvtBytes = nConverted;
233 return pDestBufPtr - pDestBuf;
236 BmpUnicodeToSingleByteRange const unicodeToISCIIEncoding[] =
238 { 0x0000, 0x007F - 0x0000, 0x00 }, { 0x0901, 0x0903 - 0x0901, 0xA1 },
239 { 0x0905, 0x090B - 0x0905, 0xA4 }, { 0x090D, 0x090D - 0x090D, 0xAE },
240 { 0x090E, 0x0910 - 0x090E, 0xAB }, { 0x0911, 0x0911 - 0x0911, 0xB2 },
241 { 0x0912, 0x0914 - 0x0912, 0xAF }, { 0x0915, 0x092F - 0x0915, 0xB3 },
242 { 0x0930, 0x0939 - 0x0930, 0xCF }, { 0x093C, 0x093C - 0x093C, 0xE9 },
243 { 0x093E, 0x0943 - 0x093E, 0xDA }, { 0x0945, 0x0945 - 0x0945, 0xE3 },
244 { 0x0946, 0x0948 - 0x0946, 0xE0 }, { 0x0949, 0x0949 - 0x0949, 0xE7 },
245 { 0x094A, 0x094C - 0x094A, 0xE4 }, { 0x094D, 0x094D - 0x094D, 0xE8 },
246 { 0x095F, 0x095F - 0x095F, 0xCE }, { 0x0964, 0x0964 - 0x0964, 0xEA },
247 { 0x0966, 0x096F - 0x0966, 0xF1 }
250 sal_Size UnicodeToIsciiDevanagari::convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
251 char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
252 sal_uInt32 * pInfo, sal_Size* pSrcCvtChars)
254 size_t const entries = SAL_N_ELEMENTS(unicodeToISCIIEncoding);
255 BmpUnicodeToSingleByteRange const * ranges = unicodeToISCIIEncoding;
257 sal_Unicode cHighSurrogate = m_cHighSurrogate;
258 sal_uInt32 nInfo = 0;
259 sal_Size nConverted = 0;
260 char* pDestBufPtr = pDestBuf;
261 char* pDestBufEnd = pDestBuf + nDestBytes;
262 for (; nConverted < nSrcChars; ++nConverted)
264 bool bUndefined = true;
265 sal_uInt32 c = *pSrcBuf++;
266 char cSpecialChar = 0;
267 if (cHighSurrogate == 0)
269 if (rtl::isHighSurrogate(c))
271 cHighSurrogate = static_cast< sal_Unicode >(c);
272 continue;
274 else if (rtl::isLowSurrogate(c))
276 bUndefined = false;
277 goto bad_input;
280 else if (rtl::isLowSurrogate(c))
282 c = rtl::combineSurrogates(cHighSurrogate, c);
284 else
286 bUndefined = false;
287 goto bad_input;
289 assert(rtl::isUnicodeScalarValue(c));
291 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
292 //halant + nukta E8 E9 halant + ZWJ 094D 200D
293 if (m_cPrevChar == 0x094D && c == 0x200C)
294 cSpecialChar = '\xE8';
295 else if (m_cPrevChar == 0x094D && c == 0x200D)
296 cSpecialChar = '\xE9';
297 if (cSpecialChar)
299 if (pDestBufEnd - pDestBufPtr < 1)
301 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
302 break;
304 *pDestBufPtr++ = cSpecialChar;
305 m_cPrevChar = 0;
306 goto done;
308 switch (c)
310 case 0x0950:
311 cSpecialChar = '\xA1';
312 break;
313 case 0x090C:
314 cSpecialChar = '\xA6';
315 break;
316 case 0x0961:
317 cSpecialChar = '\xA7';
318 break;
319 case 0x0960:
320 cSpecialChar = '\xAA';
321 break;
322 case 0x0958:
323 cSpecialChar = '\xB3';
324 break;
325 case 0x0959:
326 cSpecialChar = '\xB4';
327 break;
328 case 0x095A:
329 cSpecialChar = '\xB5';
330 break;
331 case 0x095B:
332 cSpecialChar = '\xBA';
333 break;
334 case 0x095C:
335 cSpecialChar = '\xBF';
336 break;
337 case 0x095D:
338 cSpecialChar = '\xC0';
339 break;
340 case 0x095E:
341 cSpecialChar = '\xC9';
342 break;
343 case 0x0962:
344 cSpecialChar = '\xDB';
345 break;
346 case 0x0963:
347 cSpecialChar = '\xDC';
348 break;
349 case 0x0944:
350 cSpecialChar = '\xDF';
351 break;
352 case 0x093D:
353 cSpecialChar = '\xEA';
354 break;
355 default:
356 break;
358 if (cSpecialChar)
360 if (pDestBufEnd - pDestBufPtr < 2)
362 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
363 break;
365 *pDestBufPtr++ = cSpecialChar;
366 *pDestBufPtr++ = '\xE9';
367 m_cPrevChar = 0;
368 goto done;
371 // Linearly searching through the ranges if probably fastest, assuming
372 // that most converted characters belong to the ASCII subset:
373 for (size_t i = 0; i < entries; ++i)
375 if (c < ranges[i].unicode)
377 break;
379 if (c <= sal::static_int_cast< sal_uInt32 >(
380 ranges[i].unicode + ranges[i].range))
382 if (pDestBufEnd - pDestBufPtr < 1)
384 goto no_output;
386 *pDestBufPtr++ = static_cast< char >(
387 ranges[i].byte + (c - ranges[i].unicode));
388 m_cPrevChar = c;
389 goto done;
392 goto bad_input;
393 done:
394 cHighSurrogate = 0;
395 continue;
396 bad_input:
397 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
398 bUndefined, c, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
399 0, nullptr))
401 case sal::detail::textenc::BAD_INPUT_STOP:
402 cHighSurrogate = 0;
403 break;
405 case sal::detail::textenc::BAD_INPUT_CONTINUE:
406 cHighSurrogate = 0;
407 continue;
409 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
410 goto no_output;
412 break;
413 no_output:
414 --pSrcBuf;
415 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
416 break;
419 if (cHighSurrogate != 0
420 && ((nInfo
421 & (RTL_UNICODETOTEXT_INFO_ERROR
422 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
423 == 0))
425 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
427 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
429 else
431 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
432 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
433 0, nullptr))
435 case sal::detail::textenc::BAD_INPUT_STOP:
436 case sal::detail::textenc::BAD_INPUT_CONTINUE:
437 cHighSurrogate = 0;
438 break;
440 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
441 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
442 break;
446 m_cHighSurrogate = cHighSurrogate;
447 if (pInfo)
448 *pInfo = nInfo;
449 if (pSrcCvtChars)
450 *pSrcCvtChars = nConverted;
452 return pDestBufPtr - pDestBuf;
455 sal_Size ImplConvertIsciiDevanagariToUnicode(void const*,
456 void* pContext, char const* pSrcBuf, sal_Size nSrcBytes,
457 sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
458 sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
460 IsciiDevanagariToUnicode *pCtx =
461 static_cast<IsciiDevanagariToUnicode*>(pContext);
462 return pCtx->convert(pSrcBuf, nSrcBytes, pDestBuf, nDestChars, nFlags,
463 pInfo, pSrcCvtBytes);
466 sal_Size ImplConvertUnicodeToIsciiDevanagari(void const*,
467 void * pContext, sal_Unicode const * pSrcBuf, sal_Size nSrcChars,
468 char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
469 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
471 UnicodeToIsciiDevanagari *pCtx =
472 static_cast<UnicodeToIsciiDevanagari*>(pContext);
473 return pCtx->convert(pSrcBuf, nSrcChars,
474 pDestBuf, nDestBytes, nFlags, pInfo, pSrcCvtChars);
477 void *ImplCreateIsciiDevanagariToUnicodeContext()
479 return new IsciiDevanagariToUnicode;
482 void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext)
484 IsciiDevanagariToUnicode *pCtx =
485 static_cast<IsciiDevanagariToUnicode*>(pContext);
486 delete pCtx;
489 void ImplResetIsciiDevanagariToUnicodeContext(void * pContext)
491 IsciiDevanagariToUnicode *pCtx =
492 static_cast<IsciiDevanagariToUnicode*>(pContext);
493 pCtx->reset();
496 void *ImplCreateUnicodeToIsciiDevanagariContext()
498 return new UnicodeToIsciiDevanagari;
501 void ImplResetUnicodeToIsciiDevanagariContext(void * pContext)
503 UnicodeToIsciiDevanagari *pCtx =
504 static_cast<UnicodeToIsciiDevanagari*>(pContext);
505 pCtx->reset();
508 void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext)
510 UnicodeToIsciiDevanagari *pCtx =
511 static_cast<UnicodeToIsciiDevanagari*>(pContext);
512 delete pCtx;
515 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */