// Version 6.4.0.3, tag libreoffice-6.4.0.3
// [LibreOffice.git] / sal / textenc / convertiso2022kr.cxx
// blob 5c7971ba5af6103edd9cf1bf736d5e5cb4ec99a7
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
#include <sal/config.h>

#include <cassert>

#include <rtl/character.hxx>
#include <rtl/textcvt.h>
#include <sal/types.h>

#include "converter.hxx"
#include "convertiso2022kr.hxx"
#include "tenchelp.hxx"
#include "unichars.hxx"
33 namespace {
35 enum ImplIso2022KrToUnicodeState // order is important:
37 IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII,
38 IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001,
39 IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2,
40 IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC,
41 IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR,
42 IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN
45 struct ImplIso2022KrToUnicodeContext
47 ImplIso2022KrToUnicodeState m_eState;
48 sal_uInt32 m_nRow;
51 enum ImplUnicodeToIso2022KrSet
53 IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE,
54 IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII,
55 IMPL_UNICODE_TO_ISO_2022_KR_SET_1001
58 struct ImplUnicodeToIso2022KrContext
60 sal_Unicode m_nHighSurrogate;
61 ImplUnicodeToIso2022KrSet m_eSet;
66 void * ImplCreateIso2022KrToUnicodeContext()
68 ImplIso2022KrToUnicodeContext * pContext =
69 new ImplIso2022KrToUnicodeContext;
70 pContext->m_eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
71 return pContext;
74 void ImplResetIso2022KrToUnicodeContext(void * pContext)
76 if (pContext)
77 static_cast< ImplIso2022KrToUnicodeContext * >(pContext)->m_eState
78 = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
81 void ImplDestroyIso2022KrToUnicodeContext(void * pContext)
83 delete static_cast< ImplIso2022KrToUnicodeContext * >(pContext);
86 sal_Size ImplConvertIso2022KrToUnicode(void const * pData,
87 void * pContext,
88 char const * pSrcBuf,
89 sal_Size nSrcBytes,
90 sal_Unicode * pDestBuf,
91 sal_Size nDestChars,
92 sal_uInt32 nFlags,
93 sal_uInt32 * pInfo,
94 sal_Size * pSrcCvtBytes)
96 ImplDBCSToUniLeadTab const * pKsX1001Data
97 = static_cast< ImplIso2022KrConverterData const * >(pData)->
98 m_pKsX1001ToUnicodeData;
99 ImplIso2022KrToUnicodeState eState
100 = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
101 sal_uInt32 nRow = 0;
102 sal_uInt32 nInfo = 0;
103 sal_Size nConverted = 0;
104 sal_Unicode * pDestBufPtr = pDestBuf;
105 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
106 sal_Size startOfCurrentChar = 0;
108 if (pContext)
110 eState = static_cast< ImplIso2022KrToUnicodeContext * >(pContext)->m_eState;
111 nRow = static_cast< ImplIso2022KrToUnicodeContext * >(pContext)->m_nRow;
114 for (; nConverted < nSrcBytes; ++nConverted)
116 bool bUndefined = true;
117 sal_uInt32 nChar = *reinterpret_cast<unsigned char const *>(pSrcBuf++);
118 switch (eState)
120 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII:
121 if (nChar == 0x0E) // SO
122 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001;
123 else if (nChar == 0x1B) // ESC
124 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC;
125 else if (nChar < 0x80)
126 if (pDestBufPtr != pDestBufEnd) {
127 *pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
128 startOfCurrentChar = nConverted + 1;
129 } else
130 goto no_output;
131 else
133 bUndefined = false;
134 goto bad_input;
136 break;
138 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001:
139 if (nChar == 0x0F) // SI
140 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
141 else if (nChar >= 0x21 && nChar <= 0x7E)
143 nRow = nChar + 0x80;
144 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2;
146 else
148 bUndefined = false;
149 goto bad_input;
151 break;
153 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001_2:
154 if (nChar >= 0x21 && nChar <= 0x7E)
156 sal_uInt16 nUnicode = 0;
157 sal_uInt32 nFirst = pKsX1001Data[nRow].mnTrailStart;
158 nChar += 0x80;
159 if (nChar >= nFirst && nChar <= pKsX1001Data[nRow].mnTrailEnd)
160 nUnicode = pKsX1001Data[nRow].
161 mpToUniTrailTab[nChar - nFirst];
162 if (nUnicode != 0)
163 if (pDestBufPtr != pDestBufEnd)
165 *pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode);
166 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001;
167 startOfCurrentChar = nConverted + 1;
169 else
170 goto no_output;
171 else
172 goto bad_input;
174 else
176 bUndefined = false;
177 goto bad_input;
179 break;
181 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC:
182 if (nChar == 0x24) // $
183 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR;
184 else
186 bUndefined = false;
187 goto bad_input;
189 break;
191 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR:
192 if (nChar == 0x29) // )
193 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN;
194 else
196 bUndefined = false;
197 goto bad_input;
199 break;
201 case IMPL_ISO_2022_KR_TO_UNICODE_STATE_ESC_DOLLAR_RPAREN:
202 if (nChar == 0x43) // C
203 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
204 else
206 bUndefined = false;
207 goto bad_input;
209 break;
211 continue;
213 bad_input:
214 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
215 bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
216 &nInfo))
218 case sal::detail::textenc::BAD_INPUT_STOP:
219 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
220 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
221 ++nConverted;
222 } else {
223 nConverted = startOfCurrentChar;
225 break;
227 case sal::detail::textenc::BAD_INPUT_CONTINUE:
228 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
229 startOfCurrentChar = nConverted + 1;
230 continue;
232 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
233 goto no_output;
235 break;
237 no_output:
238 --pSrcBuf;
239 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
240 break;
243 if (eState > IMPL_ISO_2022_KR_TO_UNICODE_STATE_1001
244 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
245 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
246 == 0)
248 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
249 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
250 else
251 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
252 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
253 &nInfo))
255 case sal::detail::textenc::BAD_INPUT_STOP:
256 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
257 nConverted = startOfCurrentChar;
259 [[fallthrough]];
260 case sal::detail::textenc::BAD_INPUT_CONTINUE:
261 eState = IMPL_ISO_2022_KR_TO_UNICODE_STATE_ASCII;
262 break;
264 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
265 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
266 break;
270 if (pContext)
272 static_cast< ImplIso2022KrToUnicodeContext * >(pContext)->m_eState = eState;
273 static_cast< ImplIso2022KrToUnicodeContext * >(pContext)->m_nRow = nRow;
275 if (pInfo)
276 *pInfo = nInfo;
277 if (pSrcCvtBytes)
278 *pSrcCvtBytes = nConverted;
280 return pDestBufPtr - pDestBuf;
283 void * ImplCreateUnicodeToIso2022KrContext()
285 ImplUnicodeToIso2022KrContext * pContext =
286 new ImplUnicodeToIso2022KrContext;
287 pContext->m_nHighSurrogate = 0;
288 pContext->m_eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
289 return pContext;
292 void ImplResetUnicodeToIso2022KrContext(void * pContext)
294 if (pContext)
296 static_cast< ImplUnicodeToIso2022KrContext * >(pContext)->m_nHighSurrogate = 0;
297 static_cast< ImplUnicodeToIso2022KrContext * >(pContext)->m_eSet
298 = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
302 void ImplDestroyUnicodeToIso2022KrContext(void * pContext)
304 delete static_cast< ImplUnicodeToIso2022KrContext * >(pContext);
307 sal_Size ImplConvertUnicodeToIso2022Kr(void const * pData,
308 void * pContext,
309 sal_Unicode const * pSrcBuf,
310 sal_Size nSrcChars,
311 char * pDestBuf,
312 sal_Size nDestBytes,
313 sal_uInt32 nFlags,
314 sal_uInt32 * pInfo,
315 sal_Size * pSrcCvtChars)
317 ImplUniToDBCSHighTab const * pKsX1001Data
318 = static_cast< ImplIso2022KrConverterData const * >(pData)->
319 m_pUnicodeToKsX1001Data;
320 sal_Unicode nHighSurrogate = 0;
321 ImplUnicodeToIso2022KrSet eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE;
322 sal_uInt32 nInfo = 0;
323 sal_Size nConverted = 0;
324 char * pDestBufPtr = pDestBuf;
325 char * pDestBufEnd = pDestBuf + nDestBytes;
326 bool bWritten;
328 if (pContext)
330 nHighSurrogate
331 = static_cast< ImplUnicodeToIso2022KrContext * >(pContext)->m_nHighSurrogate;
332 eSet = static_cast< ImplUnicodeToIso2022KrContext * >(pContext)->m_eSet;
335 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_NONE)
337 if (pDestBufEnd - pDestBufPtr >= 4)
339 *pDestBufPtr++ = 0x1B; // ESC
340 *pDestBufPtr++ = 0x24; // $
341 *pDestBufPtr++ = 0x29; // )
342 *pDestBufPtr++ = 0x43; // C
343 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
345 else
346 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
349 if ((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0)
350 for (; nConverted < nSrcChars; ++nConverted)
352 bool bUndefined = true;
353 sal_uInt32 nChar = *pSrcBuf++;
354 if (nHighSurrogate == 0)
356 if (ImplIsHighSurrogate(nChar))
358 nHighSurrogate = static_cast<sal_Unicode>(nChar);
359 continue;
361 else if (ImplIsLowSurrogate(nChar))
363 bUndefined = false;
364 goto bad_input;
367 else if (ImplIsLowSurrogate(nChar))
368 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
369 else
371 bUndefined = false;
372 goto bad_input;
375 assert(rtl::isUnicodeScalarValue(nChar));
377 if (nChar == 0x0A || nChar == 0x0D) // LF, CR
379 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001)
381 if (pDestBufPtr != pDestBufEnd)
383 *pDestBufPtr++ = 0x0F; // SI
384 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
386 else
387 goto no_output;
389 if (pDestBufPtr != pDestBufEnd)
390 *pDestBufPtr++ = static_cast< char >(nChar);
391 else
392 goto no_output;
394 else if (nChar == 0x0E || nChar == 0x0F || nChar == 0x1B)
395 goto bad_input;
396 else if (nChar < 0x80)
398 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001)
400 if (pDestBufPtr != pDestBufEnd)
402 *pDestBufPtr++ = 0x0F; // SI
403 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
405 else
406 goto no_output;
408 if (pDestBufPtr != pDestBufEnd)
409 *pDestBufPtr++ = static_cast< char >(nChar);
410 else
411 goto no_output;
413 else
415 sal_uInt16 nBytes = 0;
416 sal_uInt32 nIndex1 = nChar >> 8;
417 if (nIndex1 < 0x100)
419 sal_uInt32 nIndex2 = nChar & 0xFF;
420 sal_uInt32 nFirst = pKsX1001Data[nIndex1].mnLowStart;
421 if (nIndex2 >= nFirst
422 && nIndex2 <= pKsX1001Data[nIndex1].mnLowEnd)
423 nBytes = pKsX1001Data[nIndex1].
424 mpToUniTrailTab[nIndex2 - nFirst];
426 if (nBytes != 0)
428 if (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII)
430 if (pDestBufPtr != pDestBufEnd)
432 *pDestBufPtr++ = 0x0E; // SO
433 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_1001;
435 else
436 goto no_output;
438 if (pDestBufEnd - pDestBufPtr >= 2)
440 *pDestBufPtr++ = static_cast< char >((nBytes >> 8) & 0x7F);
441 *pDestBufPtr++ = static_cast< char >(nBytes & 0x7F);
443 else
444 goto no_output;
446 else
447 goto bad_input;
449 nHighSurrogate = 0;
450 continue;
452 bad_input:
453 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
454 bUndefined, nChar, nFlags, &pDestBufPtr, pDestBufEnd,
455 &nInfo, "\x0F" /* SI */,
456 eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII ? 0 : 1,
457 &bWritten))
459 case sal::detail::textenc::BAD_INPUT_STOP:
460 nHighSurrogate = 0;
461 break;
463 case sal::detail::textenc::BAD_INPUT_CONTINUE:
464 if (bWritten)
465 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
466 nHighSurrogate = 0;
467 continue;
469 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
470 goto no_output;
472 break;
474 no_output:
475 --pSrcBuf;
476 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
477 break;
480 if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
481 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
482 == 0)
484 bool bFlush = true;
485 if (nHighSurrogate != 0)
487 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
488 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
489 else
490 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
491 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
492 "\x0F" /* SI */,
493 (eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII
494 ? 0 : 1),
495 &bWritten))
497 case sal::detail::textenc::BAD_INPUT_STOP:
498 nHighSurrogate = 0;
499 bFlush = false;
500 break;
502 case sal::detail::textenc::BAD_INPUT_CONTINUE:
503 if (bWritten)
504 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
505 nHighSurrogate = 0;
506 break;
508 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
509 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
510 break;
513 if (bFlush
514 && eSet == IMPL_UNICODE_TO_ISO_2022_KR_SET_1001
515 && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
517 if (pDestBufPtr != pDestBufEnd)
519 *pDestBufPtr++ = 0x0F; // SI
520 eSet = IMPL_UNICODE_TO_ISO_2022_KR_SET_ASCII;
522 else
523 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
527 if (pContext)
529 static_cast< ImplUnicodeToIso2022KrContext * >(pContext)->m_nHighSurrogate
530 = nHighSurrogate;
531 static_cast< ImplUnicodeToIso2022KrContext * >(pContext)->m_eSet = eSet;
533 if (pInfo)
534 *pInfo = nInfo;
535 if (pSrcCvtChars)
536 *pSrcCvtChars = nConverted;
538 return pDestBufPtr - pDestBuf;
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */