Avoid potential negative array index access to cached text.
[LibreOffice.git] / sal / textenc / convertisciidevangari.cxx
blobcf39721446eb951180fbb18ea09c17d760fdd982
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #include <sal/config.h>
12 #include <cassert>
14 #include "converter.hxx"
15 #include "convertisciidevangari.hxx"
16 #include "convertsinglebytetobmpunicode.hxx"
18 #include <rtl/character.hxx>
19 #include <rtl/textcvt.h>
21 using namespace sal::detail::textenc;
22 using namespace rtl::textenc;
24 namespace {
26 struct IsciiDevanagariToUnicode
28 sal_uInt8 m_cPrevChar;
29 IsciiDevanagariToUnicode()
30 : m_cPrevChar(0)
33 void reset()
35 m_cPrevChar = 0;
37 sal_Size convert(char const* pSrcBuf, sal_Size nSrcBytes,
38 sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
39 sal_uInt32* pInfo, sal_Size* pSrcCvtBytes);
42 struct UnicodeToIsciiDevanagari
44 sal_Unicode m_cPrevChar;
45 sal_Unicode m_cHighSurrogate;
46 UnicodeToIsciiDevanagari()
47 : m_cPrevChar(0)
48 , m_cHighSurrogate(0)
51 void reset()
53 m_cPrevChar = 0;
54 m_cHighSurrogate = 0;
56 sal_Size convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
57 char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
58 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars);
63 const sal_Unicode IsciiDevanagariMap[256] =
65 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
66 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
67 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
68 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
69 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
70 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
71 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
72 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
73 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
74 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
75 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
76 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
77 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
78 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
79 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
80 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
81 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
82 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
83 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
84 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
85 0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908,
86 0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912,
87 0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919,
88 0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921,
89 0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929,
90 0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930,
91 0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938,
92 0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
93 0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,
94 0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
95 0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
96 0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF
99 sal_Size IsciiDevanagariToUnicode::convert(
100 char const* pSrcBuf, sal_Size nSrcBytes,
101 sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
102 sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
104 sal_uInt32 nInfo = 0;
105 sal_Size nConverted = 0;
106 sal_Unicode* pDestBufPtr = pDestBuf;
107 sal_Unicode* pDestBufEnd = pDestBuf + nDestChars;
108 sal_Size startOfCurrentChar = 0;
110 while (nConverted < nSrcBytes)
112 if (pDestBufPtr == pDestBufEnd)
114 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
115 break;
118 sal_Unicode cChar = sal_Unicode();
119 sal_uInt8 nIn = static_cast<sal_uInt8>(pSrcBuf[nConverted]);
120 sal_uInt8 nNext = nConverted < nSrcBytes + 1 ? static_cast<sal_uInt8>(pSrcBuf[nConverted+1]) : 0;
121 bool bNormal = true;
122 bool bDouble = false;
123 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
124 //halant + nukta E8 E9 halant + ZWJ 094D 200D
125 if (m_cPrevChar == 0xE8 && nIn == 0xE8)
127 cChar = 0x200C;
128 bNormal = false;
130 else if (m_cPrevChar == 0xE8 && nIn == 0xE9)
132 cChar = 0x200D;
133 bNormal = false;
135 else if (nNext == 0xE9)
137 bNormal = false;
138 bDouble = true;
139 switch(nIn)
141 case 0xA1:
142 cChar = 0x0950;
143 break;
144 case 0xA6:
145 cChar = 0x090C;
146 break;
147 case 0xA7:
148 cChar = 0x0961;
149 break;
150 case 0xAA:
151 cChar = 0x0960;
152 break;
153 case 0xB3:
154 cChar = 0x0958;
155 break;
156 case 0xB4:
157 cChar = 0x0959;
158 break;
159 case 0xB5:
160 cChar = 0x095A;
161 break;
162 case 0xBA:
163 cChar = 0x095B;
164 break;
165 case 0xBF:
166 cChar = 0x095C;
167 break;
168 case 0xC0:
169 cChar = 0x095D;
170 break;
171 case 0xC9:
172 cChar = 0x095E;
173 break;
174 case 0xDB:
175 cChar = 0x0962;
176 break;
177 case 0xDC:
178 cChar = 0x0963;
179 break;
180 case 0xDF:
181 cChar = 0x0944;
182 break;
183 case 0xEA:
184 cChar = 0x093D;
185 break;
186 default:
187 bNormal = true;
188 bDouble = false;
189 break;
193 ++nConverted;
194 if (bDouble)
195 ++nConverted;
197 if (bNormal)
198 cChar = IsciiDevanagariMap[nIn];
200 bool bUndefined = cChar == 0xffff;
202 if (bUndefined)
204 BadInputConversionAction eAction = handleBadInputTextToUnicodeConversion(
205 bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
206 &nInfo);
207 if (eAction == BAD_INPUT_CONTINUE) {
208 startOfCurrentChar = nConverted;
209 continue;
211 if (eAction == BAD_INPUT_STOP) {
212 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
213 nConverted = startOfCurrentChar;
215 break;
217 assert(eAction == BAD_INPUT_NO_OUTPUT);
218 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
219 break;
222 *pDestBufPtr++ = cChar;
223 m_cPrevChar = bNormal ? nIn : 0;
224 startOfCurrentChar = nConverted;
227 if (pInfo)
228 *pInfo = nInfo;
229 if (pSrcCvtBytes)
230 *pSrcCvtBytes = nConverted;
232 return pDestBufPtr - pDestBuf;
235 BmpUnicodeToSingleByteRange const unicodeToISCIIEncoding[] =
237 { 0x0000, 0x007F - 0x0000, 0x00 }, { 0x0901, 0x0903 - 0x0901, 0xA1 },
238 { 0x0905, 0x090B - 0x0905, 0xA4 }, { 0x090D, 0x090D - 0x090D, 0xAE },
239 { 0x090E, 0x0910 - 0x090E, 0xAB }, { 0x0911, 0x0911 - 0x0911, 0xB2 },
240 { 0x0912, 0x0914 - 0x0912, 0xAF }, { 0x0915, 0x092F - 0x0915, 0xB3 },
241 { 0x0930, 0x0939 - 0x0930, 0xCF }, { 0x093C, 0x093C - 0x093C, 0xE9 },
242 { 0x093E, 0x0943 - 0x093E, 0xDA }, { 0x0945, 0x0945 - 0x0945, 0xE3 },
243 { 0x0946, 0x0948 - 0x0946, 0xE0 }, { 0x0949, 0x0949 - 0x0949, 0xE7 },
244 { 0x094A, 0x094C - 0x094A, 0xE4 }, { 0x094D, 0x094D - 0x094D, 0xE8 },
245 { 0x095F, 0x095F - 0x095F, 0xCE }, { 0x0964, 0x0964 - 0x0964, 0xEA },
246 { 0x0966, 0x096F - 0x0966, 0xF1 }
249 sal_Size UnicodeToIsciiDevanagari::convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
250 char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
251 sal_uInt32 * pInfo, sal_Size* pSrcCvtChars)
253 size_t const entries = SAL_N_ELEMENTS(unicodeToISCIIEncoding);
254 BmpUnicodeToSingleByteRange const * ranges = unicodeToISCIIEncoding;
256 sal_Unicode cHighSurrogate = m_cHighSurrogate;
257 sal_uInt32 nInfo = 0;
258 sal_Size nConverted = 0;
259 char* pDestBufPtr = pDestBuf;
260 char* pDestBufEnd = pDestBuf + nDestBytes;
261 for (; nConverted < nSrcChars; ++nConverted)
263 bool bUndefined = true;
264 sal_uInt32 c = *pSrcBuf++;
265 char cSpecialChar = 0;
266 if (cHighSurrogate == 0)
268 if (rtl::isHighSurrogate(c))
270 cHighSurrogate = static_cast< sal_Unicode >(c);
271 continue;
273 else if (rtl::isLowSurrogate(c))
275 bUndefined = false;
276 goto bad_input;
279 else if (rtl::isLowSurrogate(c))
281 c = rtl::combineSurrogates(cHighSurrogate, c);
283 else
285 bUndefined = false;
286 goto bad_input;
288 assert(rtl::isUnicodeScalarValue(c));
290 //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
291 //halant + nukta E8 E9 halant + ZWJ 094D 200D
292 if (m_cPrevChar == 0x094D && c == 0x200C)
293 cSpecialChar = '\xE8';
294 else if (m_cPrevChar == 0x094D && c == 0x200D)
295 cSpecialChar = '\xE9';
296 if (cSpecialChar)
298 if (pDestBufEnd - pDestBufPtr < 1)
300 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
301 break;
303 *pDestBufPtr++ = cSpecialChar;
304 m_cPrevChar = 0;
305 goto done;
307 switch (c)
309 case 0x0950:
310 cSpecialChar = '\xA1';
311 break;
312 case 0x090C:
313 cSpecialChar = '\xA6';
314 break;
315 case 0x0961:
316 cSpecialChar = '\xA7';
317 break;
318 case 0x0960:
319 cSpecialChar = '\xAA';
320 break;
321 case 0x0958:
322 cSpecialChar = '\xB3';
323 break;
324 case 0x0959:
325 cSpecialChar = '\xB4';
326 break;
327 case 0x095A:
328 cSpecialChar = '\xB5';
329 break;
330 case 0x095B:
331 cSpecialChar = '\xBA';
332 break;
333 case 0x095C:
334 cSpecialChar = '\xBF';
335 break;
336 case 0x095D:
337 cSpecialChar = '\xC0';
338 break;
339 case 0x095E:
340 cSpecialChar = '\xC9';
341 break;
342 case 0x0962:
343 cSpecialChar = '\xDB';
344 break;
345 case 0x0963:
346 cSpecialChar = '\xDC';
347 break;
348 case 0x0944:
349 cSpecialChar = '\xDF';
350 break;
351 case 0x093D:
352 cSpecialChar = '\xEA';
353 break;
354 default:
355 break;
357 if (cSpecialChar)
359 if (pDestBufEnd - pDestBufPtr < 2)
361 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
362 break;
364 *pDestBufPtr++ = cSpecialChar;
365 *pDestBufPtr++ = '\xE9';
366 m_cPrevChar = 0;
367 goto done;
370 // Linearly searching through the ranges if probably fastest, assuming
371 // that most converted characters belong to the ASCII subset:
372 for (size_t i = 0; i < entries; ++i)
374 if (c < ranges[i].unicode)
376 break;
378 if (c <= sal::static_int_cast< sal_uInt32 >(
379 ranges[i].unicode + ranges[i].range))
381 if (pDestBufEnd - pDestBufPtr < 1)
383 goto no_output;
385 *pDestBufPtr++ = static_cast< char >(
386 ranges[i].byte + (c - ranges[i].unicode));
387 m_cPrevChar = c;
388 goto done;
391 goto bad_input;
392 done:
393 cHighSurrogate = 0;
394 continue;
395 bad_input:
396 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
397 bUndefined, c, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
398 0, nullptr))
400 case sal::detail::textenc::BAD_INPUT_STOP:
401 cHighSurrogate = 0;
402 break;
404 case sal::detail::textenc::BAD_INPUT_CONTINUE:
405 cHighSurrogate = 0;
406 continue;
408 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
409 goto no_output;
411 break;
412 no_output:
413 --pSrcBuf;
414 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
415 break;
418 if (cHighSurrogate != 0
419 && ((nInfo
420 & (RTL_UNICODETOTEXT_INFO_ERROR
421 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
422 == 0))
424 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
426 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
428 else
430 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
431 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
432 0, nullptr))
434 case sal::detail::textenc::BAD_INPUT_STOP:
435 case sal::detail::textenc::BAD_INPUT_CONTINUE:
436 cHighSurrogate = 0;
437 break;
439 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
440 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
441 break;
445 m_cHighSurrogate = cHighSurrogate;
446 if (pInfo)
447 *pInfo = nInfo;
448 if (pSrcCvtChars)
449 *pSrcCvtChars = nConverted;
451 return pDestBufPtr - pDestBuf;
454 sal_Size ImplConvertIsciiDevanagariToUnicode(void const*,
455 void* pContext, char const* pSrcBuf, sal_Size nSrcBytes,
456 sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
457 sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
459 IsciiDevanagariToUnicode *pCtx =
460 static_cast<IsciiDevanagariToUnicode*>(pContext);
461 return pCtx->convert(pSrcBuf, nSrcBytes, pDestBuf, nDestChars, nFlags,
462 pInfo, pSrcCvtBytes);
465 sal_Size ImplConvertUnicodeToIsciiDevanagari(void const*,
466 void * pContext, sal_Unicode const * pSrcBuf, sal_Size nSrcChars,
467 char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
468 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
470 UnicodeToIsciiDevanagari *pCtx =
471 static_cast<UnicodeToIsciiDevanagari*>(pContext);
472 return pCtx->convert(pSrcBuf, nSrcChars,
473 pDestBuf, nDestBytes, nFlags, pInfo, pSrcCvtChars);
476 void *ImplCreateIsciiDevanagariToUnicodeContext()
478 return new IsciiDevanagariToUnicode;
481 void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext)
483 IsciiDevanagariToUnicode *pCtx =
484 static_cast<IsciiDevanagariToUnicode*>(pContext);
485 delete pCtx;
488 void ImplResetIsciiDevanagariToUnicodeContext(void * pContext)
490 IsciiDevanagariToUnicode *pCtx =
491 static_cast<IsciiDevanagariToUnicode*>(pContext);
492 pCtx->reset();
495 void *ImplCreateUnicodeToIsciiDevanagariContext()
497 return new UnicodeToIsciiDevanagari;
500 void ImplResetUnicodeToIsciiDevanagariContext(void * pContext)
502 UnicodeToIsciiDevanagari *pCtx =
503 static_cast<UnicodeToIsciiDevanagari*>(pContext);
504 pCtx->reset();
507 void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext)
509 UnicodeToIsciiDevanagari *pCtx =
510 static_cast<UnicodeToIsciiDevanagari*>(pContext);
511 delete pCtx;
514 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */