Version 6.1.0.2, tag libreoffice-6.1.0.2
[LibreOffice.git] / sal / textenc / convertgb18030.cxx
blob87e814674bb78c3f255b77a861a00fbb9bb6c2cd
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <rtl/textcvt.h>
23 #include <sal/types.h>
25 #include "context.hxx"
26 #include "converter.hxx"
27 #include "convertgb18030.hxx"
28 #include "tenchelp.hxx"
29 #include "unichars.hxx"
31 namespace {
33 enum ImplGb18030ToUnicodeState
35 IMPL_GB_18030_TO_UNICODE_STATE_0,
36 IMPL_GB_18030_TO_UNICODE_STATE_1,
37 IMPL_GB_18030_TO_UNICODE_STATE_2,
38 IMPL_GB_18030_TO_UNICODE_STATE_3
41 struct ImplGb18030ToUnicodeContext
43 ImplGb18030ToUnicodeState m_eState;
44 sal_uInt32 m_nCode;
49 void * ImplCreateGb18030ToUnicodeContext()
51 ImplGb18030ToUnicodeContext * pContext = new ImplGb18030ToUnicodeContext;
52 pContext->m_eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
53 return pContext;
56 void ImplResetGb18030ToUnicodeContext(void * pContext)
58 if (pContext)
59 static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_eState
60 = IMPL_GB_18030_TO_UNICODE_STATE_0;
63 void ImplDestroyGb18030ToUnicodeContext(void * pContext)
65 delete static_cast< ImplGb18030ToUnicodeContext * >(pContext);
68 sal_Size ImplConvertGb18030ToUnicode(void const * pData,
69 void * pContext,
70 char const * pSrcBuf,
71 sal_Size nSrcBytes,
72 sal_Unicode * pDestBuf,
73 sal_Size nDestChars,
74 sal_uInt32 nFlags,
75 sal_uInt32 * pInfo,
76 sal_Size * pSrcCvtBytes)
78 sal_Unicode const * pGb18030Data
79 = static_cast< ImplGb18030ConverterData const * >(pData)->m_pGb18030ToUnicodeData;
80 ImplGb180302000ToUnicodeRange const * pGb18030Ranges
81 = static_cast< ImplGb18030ConverterData const * >(pData)->
82 m_pGb18030ToUnicodeRanges;
83 ImplGb18030ToUnicodeState eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
84 sal_uInt32 nCode = 0;
85 sal_uInt32 nInfo = 0;
86 sal_Size nConverted = 0;
87 sal_Unicode * pDestBufPtr = pDestBuf;
88 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
90 if (pContext)
92 eState = static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_eState;
93 nCode = static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_nCode;
96 for (; nConverted < nSrcBytes; ++nConverted)
98 bool bUndefined = true;
99 sal_uInt32 nChar = *reinterpret_cast<unsigned char const *>(pSrcBuf++);
100 switch (eState)
102 case IMPL_GB_18030_TO_UNICODE_STATE_0:
103 if (nChar < 0x80)
104 if (pDestBufPtr != pDestBufEnd)
105 *pDestBufPtr++ = static_cast<sal_Unicode>(nChar);
106 else
107 goto no_output;
108 else if (nChar == 0x80)
109 goto bad_input;
110 else if (nChar <= 0xFE)
112 nCode = nChar - 0x81;
113 eState = IMPL_GB_18030_TO_UNICODE_STATE_1;
115 else
117 bUndefined = false;
118 goto bad_input;
120 break;
122 case IMPL_GB_18030_TO_UNICODE_STATE_1:
123 if (nChar >= 0x30 && nChar <= 0x39)
125 nCode = nCode * 10 + (nChar - 0x30);
126 eState = IMPL_GB_18030_TO_UNICODE_STATE_2;
128 else if ((nChar >= 0x40 && nChar <= 0x7E)
129 || (nChar >= 0x80 && nChar <= 0xFE))
131 nCode = nCode * 190 + (nChar <= 0x7E ? nChar - 0x40 :
132 nChar - 0x80 + 63);
133 if (pDestBufPtr != pDestBufEnd)
134 *pDestBufPtr++ = pGb18030Data[nCode];
135 else
136 goto no_output;
137 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
139 else
141 bUndefined = false;
142 goto bad_input;
144 break;
146 case IMPL_GB_18030_TO_UNICODE_STATE_2:
147 if (nChar >= 0x81 && nChar <= 0xFE)
149 nCode = nCode * 126 + (nChar - 0x81);
150 eState = IMPL_GB_18030_TO_UNICODE_STATE_3;
152 else
154 bUndefined = false;
155 goto bad_input;
157 break;
159 case IMPL_GB_18030_TO_UNICODE_STATE_3:
160 if (nChar >= 0x30 && nChar <= 0x39)
162 nCode = nCode * 10 + (nChar - 0x30);
164 // 90 30 81 30 to E3 32 9A 35 maps to U+10000 to U+10FFFF:
165 if (nCode >= 189000 && nCode <= 1237575)
166 if (pDestBufEnd - pDestBufPtr >= 2)
168 nCode -= 189000 - 0x10000;
169 *pDestBufPtr++
170 = static_cast<sal_Unicode>(ImplGetHighSurrogate(nCode));
171 *pDestBufPtr++
172 = static_cast<sal_Unicode>(ImplGetLowSurrogate(nCode));
174 else
175 goto no_output;
176 else
178 ImplGb180302000ToUnicodeRange const * pRange
179 = pGb18030Ranges;
180 sal_uInt32 nFirstNonRange = 0;
181 for (;;)
183 if (pRange->m_nNonRangeDataIndex == -1)
184 goto bad_input;
185 else if (nCode < pRange->m_nFirstLinear)
187 if (pDestBufPtr != pDestBufEnd)
188 *pDestBufPtr++
189 = pGb18030Data[
190 pRange->m_nNonRangeDataIndex
191 + (nCode - nFirstNonRange)];
192 else
193 goto no_output;
194 break;
196 else if (nCode < pRange->m_nPastLinear)
198 if (pDestBufPtr != pDestBufEnd)
199 *pDestBufPtr++
200 = static_cast<sal_Unicode>(pRange->m_nFirstUnicode
201 + (nCode
202 - pRange->
203 m_nFirstLinear));
204 else
205 goto no_output;
206 break;
208 nFirstNonRange = (pRange++)->m_nPastLinear;
211 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
213 else
215 bUndefined = false;
216 goto bad_input;
218 break;
220 continue;
222 bad_input:
223 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
224 bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
225 &nInfo))
227 case sal::detail::textenc::BAD_INPUT_STOP:
228 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
229 break;
231 case sal::detail::textenc::BAD_INPUT_CONTINUE:
232 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
233 continue;
235 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
236 goto no_output;
238 break;
240 no_output:
241 --pSrcBuf;
242 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
243 break;
246 if (eState != IMPL_GB_18030_TO_UNICODE_STATE_0
247 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
248 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
249 == 0)
251 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
252 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
253 else
254 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
255 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
256 &nInfo))
258 case sal::detail::textenc::BAD_INPUT_STOP:
259 case sal::detail::textenc::BAD_INPUT_CONTINUE:
260 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
261 break;
263 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
264 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
265 break;
269 if (pContext)
271 static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_eState = eState;
272 static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_nCode = nCode;
274 if (pInfo)
275 *pInfo = nInfo;
276 if (pSrcCvtBytes)
277 *pSrcCvtBytes = nConverted;
279 return pDestBufPtr - pDestBuf;
282 sal_Size ImplConvertUnicodeToGb18030(void const * pData,
283 void * pContext,
284 sal_Unicode const * pSrcBuf,
285 sal_Size nSrcChars,
286 char * pDestBuf,
287 sal_Size nDestBytes,
288 sal_uInt32 nFlags,
289 sal_uInt32 * pInfo,
290 sal_Size * pSrcCvtChars)
292 sal_uInt32 const * pGb18030Data
293 = static_cast< ImplGb18030ConverterData const * >(pData)->
294 m_pUnicodeToGb18030Data;
295 ImplUnicodeToGb180302000Range const * pGb18030Ranges
296 = static_cast< ImplGb18030ConverterData const * >(pData)->
297 m_pUnicodeToGb18030Ranges;
298 sal_Unicode nHighSurrogate = 0;
299 sal_uInt32 nInfo = 0;
300 sal_Size nConverted = 0;
301 char * pDestBufPtr = pDestBuf;
302 char * pDestBufEnd = pDestBuf + nDestBytes;
304 if (pContext)
305 nHighSurrogate
306 = static_cast<ImplUnicodeToTextContext *>(pContext)->m_nHighSurrogate;
308 for (; nConverted < nSrcChars; ++nConverted)
310 bool bUndefined = true;
311 sal_uInt32 nChar = *pSrcBuf++;
312 if (nHighSurrogate == 0)
314 if (ImplIsHighSurrogate(nChar))
316 nHighSurrogate = static_cast<sal_Unicode>(nChar);
317 continue;
320 else if (ImplIsLowSurrogate(nChar))
321 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
322 else
324 bUndefined = false;
325 goto bad_input;
328 if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
330 bUndefined = false;
331 goto bad_input;
334 if (nChar < 0x80)
335 if (pDestBufPtr != pDestBufEnd)
336 *pDestBufPtr++ = static_cast< char >(nChar);
337 else
338 goto no_output;
339 else if (nChar < 0x10000)
341 ImplUnicodeToGb180302000Range const * pRange = pGb18030Ranges;
342 sal_Unicode nFirstNonRange = 0x80;
343 for (;;)
345 if (nChar < pRange->m_nFirstUnicode)
347 sal_uInt32 nCode
348 = pGb18030Data[pRange->m_nNonRangeDataIndex
349 + (nChar - nFirstNonRange)];
350 if (pDestBufEnd - pDestBufPtr
351 >= (nCode <= 0xFFFF ? 2 : 4))
353 if (nCode > 0xFFFF)
355 *pDestBufPtr++ = static_cast< char >(nCode >> 24);
356 *pDestBufPtr++ = static_cast< char >(nCode >> 16 & 0xFF);
358 *pDestBufPtr++ = static_cast< char >(nCode >> 8 & 0xFF);
359 *pDestBufPtr++ = static_cast< char >(nCode & 0xFF);
361 else
362 goto no_output;
363 break;
365 if (nChar <= pRange->m_nLastUnicode)
367 if (pDestBufEnd - pDestBufPtr >= 4)
369 sal_uInt32 nCode
370 = pRange->m_nFirstLinear
371 + (nChar - pRange->m_nFirstUnicode);
372 *pDestBufPtr++ = static_cast< char >(nCode / 12600 + 0x81);
373 *pDestBufPtr++
374 = static_cast< char >(nCode / 1260 % 10 + 0x30);
375 *pDestBufPtr++ = static_cast< char >(nCode / 10 % 126 + 0x81);
376 *pDestBufPtr++ = static_cast< char >(nCode % 10 + 0x30);
378 else
379 goto no_output;
380 break;
382 nFirstNonRange
383 = static_cast<sal_Unicode>((pRange++)->m_nLastUnicode + 1);
386 else
387 if (pDestBufEnd - pDestBufPtr >= 4)
389 sal_uInt32 nCode = nChar - 0x10000;
390 *pDestBufPtr++ = static_cast< char >(nCode / 12600 + 0x90);
391 *pDestBufPtr++ = static_cast< char >(nCode / 1260 % 10 + 0x30);
392 *pDestBufPtr++ = static_cast< char >(nCode / 10 % 126 + 0x81);
393 *pDestBufPtr++ = static_cast< char >(nCode % 10 + 0x30);
395 else
396 goto no_output;
397 nHighSurrogate = 0;
398 continue;
400 bad_input:
401 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
402 bUndefined, nChar, nFlags, &pDestBufPtr, pDestBufEnd,
403 &nInfo, nullptr, 0, nullptr))
405 case sal::detail::textenc::BAD_INPUT_STOP:
406 nHighSurrogate = 0;
407 break;
409 case sal::detail::textenc::BAD_INPUT_CONTINUE:
410 nHighSurrogate = 0;
411 continue;
413 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
414 goto no_output;
416 break;
418 no_output:
419 --pSrcBuf;
420 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
421 break;
424 if (nHighSurrogate != 0
425 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
426 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
427 == 0)
429 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
430 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
431 else
432 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
433 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
434 nullptr, 0, nullptr))
436 case sal::detail::textenc::BAD_INPUT_STOP:
437 case sal::detail::textenc::BAD_INPUT_CONTINUE:
438 nHighSurrogate = 0;
439 break;
441 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
442 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
443 break;
447 if (pContext)
448 static_cast<ImplUnicodeToTextContext *>(pContext)->m_nHighSurrogate
449 = nHighSurrogate;
450 if (pInfo)
451 *pInfo = nInfo;
452 if (pSrcCvtChars)
453 *pSrcCvtChars = nConverted;
455 return pDestBufPtr - pDestBuf;
458 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */