Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / sal / textenc / convertiso2022jp.cxx
blob935bc8515d77b0d91a8f4a04d5083baeea19a48d
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <cassert>
24 #include <rtl/character.hxx>
25 #include <rtl/textcvt.h>
26 #include <sal/types.h>
28 #include "converter.hxx"
29 #include "convertiso2022jp.hxx"
30 #include "tenchelp.hxx"
31 #include "unichars.hxx"
33 namespace {
// Decoder states of the ISO-2022-JP to Unicode converter.
//
// Order is important: the converter treats every state that compares greater
// than IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208 as "input ended in the middle of
// a character or escape sequence".
enum ImplIso2022JpToUnicodeState // order is important:
{
    // Single-byte ASCII (selected by ESC ( B); also the initial state.
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII,
    // Single-byte JIS-Roman (selected by ESC ( J).
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN,
    // JIS X 0208 (selected by ESC $ @ or ESC $ B), waiting for a first byte.
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208,
    // JIS X 0208, first byte seen, waiting for the second byte.
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2,
    // ESC seen.
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC,
    // ESC ( seen.
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN,
    // ESC $ seen.
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR
};
46 struct ImplIso2022JpToUnicodeContext
48 ImplIso2022JpToUnicodeState m_eState;
49 sal_uInt32 m_nRow;
52 struct ImplUnicodeToIso2022JpContext
54 sal_Unicode m_nHighSurrogate;
55 bool m_b0208;
60 void * ImplCreateIso2022JpToUnicodeContext()
62 ImplIso2022JpToUnicodeContext * pContext =
63 new ImplIso2022JpToUnicodeContext;
64 pContext->m_eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
65 return pContext;
68 void ImplResetIso2022JpToUnicodeContext(void * pContext)
70 if (pContext)
71 static_cast< ImplIso2022JpToUnicodeContext * >(pContext)->m_eState
72 = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
75 void ImplDestroyIso2022JpToUnicodeContext(void * pContext)
77 delete static_cast< ImplIso2022JpToUnicodeContext * >(pContext);
// Converts a run of ISO-2022-JP bytes to UTF-16 code units.
//
// NOTE(review): every line below still carries what looks like its line number
// from the original file, and the gaps in that numbering suggest the
// brace-only and blank lines were lost in extraction; restore them from the
// upstream file before compiling.
//
// pData        cast to ImplIso2022JpConverterData; only its
//              m_pJisX0208ToUnicodeData table is read here.
// pContext     optional ImplIso2022JpToUnicodeContext; when non-null the
//              decoder state and pending row byte are loaded from it on entry
//              and stored back on exit.
// pSrcBuf      source bytes; nSrcBytes of them are available.
// pDestBuf     destination buffer with room for nDestChars code units.
// nFlags       RTL_TEXTTOUNICODE_FLAGS_* bits; handed to the bad-input helper,
//              and RTL_TEXTTOUNICODE_FLAGS_FLUSH is also tested directly.
// pInfo        optional; receives the accumulated RTL_TEXTTOUNICODE_INFO_* bits.
// pSrcCvtBytes optional; receives the number of source bytes consumed.
// Returns the number of code units written to pDestBuf.
80 sal_Size ImplConvertIso2022JpToUnicode(void const * pData,
81 void * pContext,
82 char const * pSrcBuf,
83 sal_Size nSrcBytes,
84 sal_Unicode * pDestBuf,
85 sal_Size nDestChars,
86 sal_uInt32 nFlags,
87 sal_uInt32 * pInfo,
88 sal_Size * pSrcCvtBytes)
90 ImplDBCSToUniLeadTab const * pJisX0208Data
91 = static_cast< ImplIso2022JpConverterData const * >(pData)->
92 m_pJisX0208ToUnicodeData;
// Without a context every call starts in the ASCII state.
93 ImplIso2022JpToUnicodeState eState
94 = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
95 sal_uInt32 nRow = 0;
96 sal_uInt32 nInfo = 0;
97 sal_Size nConverted = 0;
98 sal_Unicode * pDestBufPtr = pDestBuf;
99 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
// Source index just past the last character that was emitted (or skipped on
// BAD_INPUT_CONTINUE). Escape sequences do not advance it, so it is where
// nConverted is rewound to when a flushing conversion stops on bad input.
100 sal_Size startOfCurrentChar = 0;
102 if (pContext)
104 eState = static_cast< ImplIso2022JpToUnicodeContext * >(pContext)->m_eState;
105 nRow = static_cast< ImplIso2022JpToUnicodeContext * >(pContext)->m_nRow;
// One source byte per iteration. Normal paths end in `continue`; the two
// labels further down are reached only via goto.
108 for (; nConverted < nSrcBytes; ++nConverted)
// Handed to the bad-input helper. It stays true only where a well-formed
// JIS X 0208 pair has no table entry, and is cleared for bytes that are not
// acceptable in the current state.
110 bool bUndefined = true;
111 sal_uInt32 nChar = *reinterpret_cast<unsigned char const *>(pSrcBuf++);
112 switch (eState)
// ASCII: bytes below 0x80 are copied through unchanged.
114 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII:
115 if (nChar == 0x1B) // ESC
116 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
117 else if (nChar < 0x80)
118 if (pDestBufPtr != pDestBufEnd) {
119 *pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
120 startOfCurrentChar = nConverted + 1;
121 } else
122 goto no_output;
123 else
125 bUndefined = false;
126 goto bad_input;
128 break;
// JIS-Roman: like ASCII, except that 0x5C and 0x7E map to U+00A5 and U+00AF.
130 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN:
131 if (nChar == 0x1B) // ESC
132 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
133 else if (nChar < 0x80)
134 if (pDestBufPtr != pDestBufEnd)
136 switch (nChar)
138 case 0x5C: // REVERSE SOLIDUS (\)
139 nChar = 0xA5; // YEN SIGN
140 break;
142 case 0x7E: // ~
143 nChar = 0xAF; // MACRON
144 break;
146 *pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
147 startOfCurrentChar = nConverted + 1;
149 else
150 goto no_output;
151 else
153 bUndefined = false;
154 goto bad_input;
156 break;
// JIS X 0208, first byte: remember it as the row and wait for the second byte.
158 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208:
159 if (nChar == 0x1B) // ESC
160 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
161 else if (nChar >= 0x21 && nChar <= 0x7E)
163 nRow = nChar;
164 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2;
166 else
168 bUndefined = false;
169 goto bad_input;
171 break;
// JIS X 0208, second byte: look the pair up in the table entry for nRow.
// A result of 0 (second byte outside the entry's trail range, or a zero
// table slot) is reported as bad input with bUndefined still true.
173 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2:
174 if (nChar >= 0x21 && nChar <= 0x7E)
176 sal_uInt16 nUnicode = 0;
177 sal_uInt32 nFirst = pJisX0208Data[nRow].mnTrailStart;
178 if (nChar >= nFirst
179 && nChar <= pJisX0208Data[nRow].mnTrailEnd)
180 nUnicode = pJisX0208Data[nRow].
181 mpToUniTrailTab[nChar - nFirst];
182 if (nUnicode != 0)
183 if (pDestBufPtr != pDestBufEnd)
185 *pDestBufPtr++ = static_cast<sal_Unicode>(nUnicode);
186 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208;
187 startOfCurrentChar = nConverted + 1;
189 else
190 goto no_output;
191 else
192 goto bad_input;
194 else
196 bUndefined = false;
197 goto bad_input;
199 break;
// After ESC: only '$' and '(' continue an escape sequence.
201 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC:
202 switch (nChar)
204 case 0x24: // $
205 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR;
206 break;
208 case 0x28: // (
209 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN;
210 break;
212 default:
213 bUndefined = false;
214 goto bad_input;
216 break;
// After ESC (: 'B' selects ASCII, 'J' selects JIS-Roman.
218 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN:
219 switch (nChar)
221 case 0x42: // B
222 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
223 break;
225 case 0x4A: // J
226 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN;
227 break;
229 default:
230 bUndefined = false;
231 goto bad_input;
233 break;
// After ESC $: both '@' and 'B' select JIS X 0208.
235 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR:
236 switch (nChar)
238 case 0x40: // @
239 case 0x42: // B
240 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208;
241 break;
243 default:
244 bUndefined = false;
245 goto bad_input;
247 break;
249 continue;
// Bad input: the helper decides, based on nFlags, whether to stop, to carry on
// (possibly after writing a replacement), or to report a full destination.
251 bad_input:
252 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
253 bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
254 &nInfo))
// Stop: fall back to ASCII. Without FLUSH the offending byte is counted as
// consumed; with FLUSH the count is rewound to startOfCurrentChar.
256 case sal::detail::textenc::BAD_INPUT_STOP:
257 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
258 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
259 ++nConverted;
260 } else {
261 nConverted = startOfCurrentChar;
263 break;
// Continue: resume in ASCII with the next byte.
265 case sal::detail::textenc::BAD_INPUT_CONTINUE:
266 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
267 startOfCurrentChar = nConverted + 1;
268 continue;
270 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
271 goto no_output;
273 break;
// Destination full: un-read the current byte and leave the loop; nConverted
// has not been incremented for this byte.
275 no_output:
276 --pSrcBuf;
277 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
278 break;
// The source ended inside a two-byte character or an escape sequence (this
// relies on the enumerator order). Without FLUSH, ask for more input; with
// FLUSH, treat the truncated sequence as bad input.
281 if (eState > IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208
282 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
283 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
284 == 0)
286 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
287 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
288 else
289 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
290 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
291 &nInfo))
293 case sal::detail::textenc::BAD_INPUT_STOP:
294 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
295 nConverted = startOfCurrentChar;
297 [[fallthrough]];
298 case sal::detail::textenc::BAD_INPUT_CONTINUE:
299 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
300 break;
302 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
303 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
304 break;
// Store the decoder state back so that the next call can resume.
308 if (pContext)
310 static_cast< ImplIso2022JpToUnicodeContext * >(pContext)->m_eState = eState;
311 static_cast< ImplIso2022JpToUnicodeContext * >(pContext)->m_nRow = nRow;
313 if (pInfo)
314 *pInfo = nInfo;
315 if (pSrcCvtBytes)
316 *pSrcCvtBytes = nConverted;
318 return pDestBufPtr - pDestBuf;
321 void * ImplCreateUnicodeToIso2022JpContext()
323 ImplUnicodeToIso2022JpContext * pContext =
324 new ImplUnicodeToIso2022JpContext;
325 pContext->m_nHighSurrogate = 0;
326 pContext->m_b0208 = false;
327 return pContext;
330 void ImplResetUnicodeToIso2022JpContext(void * pContext)
332 if (pContext)
334 static_cast< ImplUnicodeToIso2022JpContext * >(pContext)->m_nHighSurrogate = 0;
335 static_cast< ImplUnicodeToIso2022JpContext * >(pContext)->m_b0208 = false;
339 void ImplDestroyUnicodeToIso2022JpContext(void * pContext)
341 delete static_cast< ImplUnicodeToIso2022JpContext * >(pContext);
// Converts a run of UTF-16 code units to ISO-2022-JP bytes, using only the
// ASCII (ESC ( B) and JIS X 0208 (ESC $ B) character sets.
//
// NOTE(review): every line below still carries what looks like its line number
// from the original file, and the gaps in that numbering suggest the
// brace-only and blank lines were lost in extraction; restore them from the
// upstream file before compiling.
//
// pData        cast to ImplIso2022JpConverterData; only its
//              m_pUnicodeToJisX0208Data table is read here.
// pContext     optional ImplUnicodeToIso2022JpContext; when non-null the
//              pending high surrogate and the "currently in JIS X 0208" flag
//              are loaded from it on entry and stored back on exit.
// pSrcBuf      source code units; nSrcChars of them are available.
// pDestBuf     destination buffer with room for nDestBytes bytes.
// nFlags       RTL_UNICODETOTEXT_FLAGS_* bits; handed to the bad-input helper,
//              and RTL_UNICODETOTEXT_FLAGS_FLUSH is also tested directly.
// pInfo        optional; receives the accumulated RTL_UNICODETOTEXT_INFO_* bits.
// pSrcCvtChars optional; receives the number of source code units consumed.
// Returns the number of bytes written to pDestBuf.
344 sal_Size ImplConvertUnicodeToIso2022Jp(void const * pData,
345 void * pContext,
346 sal_Unicode const * pSrcBuf,
347 sal_Size nSrcChars,
348 char * pDestBuf,
349 sal_Size nDestBytes,
350 sal_uInt32 nFlags,
351 sal_uInt32 * pInfo,
352 sal_Size * pSrcCvtChars)
354 ImplUniToDBCSHighTab const * pJisX0208Data
355 = static_cast< ImplIso2022JpConverterData const * >(pData)->
356 m_pUnicodeToJisX0208Data;
357 sal_Unicode nHighSurrogate = 0;
// True while the output is switched to JIS X 0208.
358 bool b0208 = false;
359 sal_uInt32 nInfo = 0;
360 sal_Size nConverted = 0;
361 char * pDestBufPtr = pDestBuf;
362 char * pDestBufEnd = pDestBuf + nDestBytes;
// Out-parameter of the bad-input helper; see the use after bad_input below.
363 bool bWritten;
365 if (pContext)
367 nHighSurrogate
368 = static_cast< ImplUnicodeToIso2022JpContext * >(pContext)->m_nHighSurrogate;
369 b0208 = static_cast< ImplUnicodeToIso2022JpContext * >(pContext)->m_b0208;
// One source code unit per iteration. Normal paths end in `continue`; the two
// labels further down are reached only via goto.
372 for (; nConverted < nSrcChars; ++nConverted)
// Handed to the bad-input helper; cleared for unpaired surrogates, left true
// for ESC and for characters without a JIS X 0208 mapping.
374 bool bUndefined = true;
375 sal_uInt32 nChar = *pSrcBuf++;
// Surrogate handling: a high surrogate is held back until the next code unit;
// a low surrogate without one, or a non-low-surrogate after one, is bad input.
376 if (nHighSurrogate == 0)
378 if (rtl::isHighSurrogate(nChar))
380 nHighSurrogate = static_cast<sal_Unicode>(nChar);
381 continue;
383 else if (rtl::isLowSurrogate(nChar))
385 bUndefined = false;
386 goto bad_input;
389 else if (rtl::isLowSurrogate(nChar))
390 nChar = rtl::combineSurrogates(nHighSurrogate, nChar);
391 else
393 bUndefined = false;
394 goto bad_input;
397 assert(rtl::isUnicodeScalarValue(nChar));
// LF and CR: switch back to ASCII first if needed, then copy the byte.
399 if (nChar == 0x0A || nChar == 0x0D) // LF, CR
401 if (b0208)
403 if (pDestBufEnd - pDestBufPtr >= 3)
405 *pDestBufPtr++ = 0x1B; // ESC
406 *pDestBufPtr++ = 0x28; // (
407 *pDestBufPtr++ = 0x42; // B
408 b0208 = false;
410 else
411 goto no_output;
413 if (pDestBufPtr != pDestBufEnd)
414 *pDestBufPtr++ = static_cast< char >(nChar);
415 else
416 goto no_output;
// A literal ESC in the input is rejected rather than written to the output.
418 else if (nChar == 0x1B)
419 goto bad_input;
// Other characters below 0x80: same ASCII path as LF/CR above.
420 else if (nChar < 0x80)
422 if (b0208)
424 if (pDestBufEnd - pDestBufPtr >= 3)
426 *pDestBufPtr++ = 0x1B; // ESC
427 *pDestBufPtr++ = 0x28; // (
428 *pDestBufPtr++ = 0x42; // B
429 b0208 = false;
431 else
432 goto no_output;
434 if (pDestBufPtr != pDestBufEnd)
435 *pDestBufPtr++ = static_cast< char >(nChar);
436 else
437 goto no_output;
// Everything else: look up a two-byte JIS X 0208 code. Only characters whose
// high part (nChar >> 8) is below 0x100 are looked up; nBytes == 0 means
// "no mapping".
439 else
441 sal_uInt16 nBytes = 0;
442 sal_uInt32 nIndex1 = nChar >> 8;
443 if (nIndex1 < 0x100)
445 sal_uInt32 nIndex2 = nChar & 0xFF;
446 sal_uInt32 nFirst = pJisX0208Data[nIndex1].mnLowStart;
447 if (nIndex2 >= nFirst
448 && nIndex2 <= pJisX0208Data[nIndex1].mnLowEnd)
450 nBytes = pJisX0208Data[nIndex1].
451 mpToUniTrailTab[nIndex2 - nFirst];
452 if (nBytes == 0)
453 // For some reason, the tables in tcvtjp4.tab do not
454 // include these two conversions:
455 switch (nChar)
457 case 0xA5: // YEN SIGN
458 nBytes = 0x216F;
459 break;
461 case 0xAF: // MACRON
462 nBytes = 0x2131;
463 break;
// Mapped: switch to JIS X 0208 first if needed, then write the two bytes.
467 if (nBytes != 0)
469 if (!b0208)
471 if (pDestBufEnd - pDestBufPtr >= 3)
473 *pDestBufPtr++ = 0x1B; // ESC
474 *pDestBufPtr++ = 0x24; // $
475 *pDestBufPtr++ = 0x42; // B
476 b0208 = true;
478 else
479 goto no_output;
481 if (pDestBufEnd - pDestBufPtr >= 2)
483 *pDestBufPtr++ = static_cast< char >(nBytes >> 8);
484 *pDestBufPtr++ = static_cast< char >(nBytes & 0xFF);
486 else
487 goto no_output;
489 else
490 goto bad_input;
// Character fully written: any held-back high surrogate has been consumed.
492 nHighSurrogate = 0;
493 continue;
// Bad input. The helper is given the ESC ( B sequence and its length (3 only
// while in JIS X 0208) -- presumably so it can switch to ASCII before writing
// a replacement; bWritten then reports whether it did. Confirm against the
// helper's definition.
495 bad_input:
496 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
497 bUndefined, nChar, nFlags, &pDestBufPtr, pDestBufEnd,
498 &nInfo, "\x1B(B", b0208 ? 3 : 0, &bWritten))
500 case sal::detail::textenc::BAD_INPUT_STOP:
501 nHighSurrogate = 0;
502 break;
504 case sal::detail::textenc::BAD_INPUT_CONTINUE:
505 if (bWritten)
506 b0208 = false;
507 nHighSurrogate = 0;
508 continue;
510 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
511 goto no_output;
513 break;
// Destination full: un-read the current code unit and leave the loop;
// nConverted has not been incremented for it.
515 no_output:
516 --pSrcBuf;
517 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
518 break;
// End-of-input handling; skipped when an error or a full destination has
// already been recorded.
521 if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
522 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
523 == 0)
// Whether the final ESC ( B below may still be written; only the
// BAD_INPUT_STOP case clears it.
525 bool bFlush = true;
// A high surrogate is still waiting for its partner.
// NOTE(review): with FLUSH set this reports SRCBUFFERTOSMALL and only without
// FLUSH calls the bad-input helper; the to-Unicode converter in this file
// tests the flag the other way round (== 0) at the corresponding place.
// Confirm that this polarity is intended.
526 if (nHighSurrogate != 0)
528 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
529 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
530 else
531 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
532 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
533 "\x1B(B", b0208 ? 3 : 0, &bWritten))
535 case sal::detail::textenc::BAD_INPUT_STOP:
536 nHighSurrogate = 0;
537 bFlush = false;
538 break;
540 case sal::detail::textenc::BAD_INPUT_CONTINUE:
541 if (bWritten)
542 b0208 = false;
543 nHighSurrogate = 0;
544 break;
546 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
547 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
548 break;
// On FLUSH, return the output to ASCII if it is still in JIS X 0208.
551 if (bFlush
552 && b0208
553 && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
555 if (pDestBufEnd - pDestBufPtr >= 3)
557 *pDestBufPtr++ = 0x1B; // ESC
558 *pDestBufPtr++ = 0x28; // (
559 *pDestBufPtr++ = 0x42; // B
560 b0208 = false;
562 else
563 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
// Store the encoder state back so that the next call can resume.
567 if (pContext)
569 static_cast< ImplUnicodeToIso2022JpContext * >(pContext)->m_nHighSurrogate
570 = nHighSurrogate;
571 static_cast< ImplUnicodeToIso2022JpContext * >(pContext)->m_b0208 = b0208;
573 if (pInfo)
574 *pInfo = nInfo;
575 if (pSrcCvtChars)
576 *pSrcCvtChars = nConverted;
578 return pDestBufPtr - pDestBuf;
581 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */