bump product version to 5.0.4.1
[LibreOffice.git] / sal / textenc / convertgb18030.cxx
blob87ede87a0830b62b150ba554b90cdaf296167c3f
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include "sal/config.h"
22 #include "rtl/textcvt.h"
23 #include "sal/types.h"
25 #include "context.hxx"
26 #include "converter.hxx"
27 #include "convertgb18030.hxx"
28 #include "tenchelp.hxx"
29 #include "unichars.hxx"
31 namespace {
33 enum ImplGb18030ToUnicodeState
35 IMPL_GB_18030_TO_UNICODE_STATE_0,
36 IMPL_GB_18030_TO_UNICODE_STATE_1,
37 IMPL_GB_18030_TO_UNICODE_STATE_2,
38 IMPL_GB_18030_TO_UNICODE_STATE_3
41 struct ImplGb18030ToUnicodeContext
43 ImplGb18030ToUnicodeState m_eState;
44 sal_uInt32 m_nCode;
49 void * ImplCreateGb18030ToUnicodeContext()
51 ImplGb18030ToUnicodeContext * pContext = new ImplGb18030ToUnicodeContext;
52 pContext->m_eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
53 return pContext;
56 void ImplResetGb18030ToUnicodeContext(void * pContext)
58 if (pContext)
59 static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_eState
60 = IMPL_GB_18030_TO_UNICODE_STATE_0;
63 void ImplDestroyGb18030ToUnicodeContext(void * pContext)
65 delete static_cast< ImplGb18030ToUnicodeContext * >(pContext);
68 sal_Size ImplConvertGb18030ToUnicode(void const * pData,
69 void * pContext,
70 char const * pSrcBuf,
71 sal_Size nSrcBytes,
72 sal_Unicode * pDestBuf,
73 sal_Size nDestChars,
74 sal_uInt32 nFlags,
75 sal_uInt32 * pInfo,
76 sal_Size * pSrcCvtBytes)
78 sal_Unicode const * pGb18030Data
79 = static_cast< ImplGb18030ConverterData const * >(pData)->m_pGb18030ToUnicodeData;
80 ImplGb180302000ToUnicodeRange const * pGb18030Ranges
81 = static_cast< ImplGb18030ConverterData const * >(pData)->
82 m_pGb18030ToUnicodeRanges;
83 ImplGb18030ToUnicodeState eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
84 sal_uInt32 nCode = 0;
85 sal_uInt32 nInfo = 0;
86 sal_Size nConverted = 0;
87 sal_Unicode * pDestBufPtr = pDestBuf;
88 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
90 if (pContext)
92 eState = static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_eState;
93 nCode = static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_nCode;
96 for (; nConverted < nSrcBytes; ++nConverted)
98 bool bUndefined = true;
99 sal_uInt32 nChar = *reinterpret_cast<unsigned char const *>(pSrcBuf++);
100 switch (eState)
102 case IMPL_GB_18030_TO_UNICODE_STATE_0:
103 if (nChar < 0x80)
104 if (pDestBufPtr != pDestBufEnd)
105 *pDestBufPtr++ = (sal_Unicode) nChar;
106 else
107 goto no_output;
108 else if (nChar == 0x80)
109 goto bad_input;
110 else if (nChar <= 0xFE)
112 nCode = nChar - 0x81;
113 eState = IMPL_GB_18030_TO_UNICODE_STATE_1;
115 else
117 bUndefined = false;
118 goto bad_input;
120 break;
122 case IMPL_GB_18030_TO_UNICODE_STATE_1:
123 if (nChar >= 0x30 && nChar <= 0x39)
125 nCode = nCode * 10 + (nChar - 0x30);
126 eState = IMPL_GB_18030_TO_UNICODE_STATE_2;
128 else if ((nChar >= 0x40 && nChar <= 0x7E)
129 || (nChar >= 0x80 && nChar <= 0xFE))
131 nCode = nCode * 190 + (nChar <= 0x7E ? nChar - 0x40 :
132 nChar - 0x80 + 63);
133 if (pDestBufPtr != pDestBufEnd)
134 *pDestBufPtr++ = pGb18030Data[nCode];
135 else
136 goto no_output;
137 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
139 else
141 bUndefined = false;
142 goto bad_input;
144 break;
146 case IMPL_GB_18030_TO_UNICODE_STATE_2:
147 if (nChar >= 0x81 && nChar <= 0xFE)
149 nCode = nCode * 126 + (nChar - 0x81);
150 eState = IMPL_GB_18030_TO_UNICODE_STATE_3;
152 else
154 bUndefined = false;
155 goto bad_input;
157 break;
159 case IMPL_GB_18030_TO_UNICODE_STATE_3:
160 if (nChar >= 0x30 && nChar <= 0x39)
162 nCode = nCode * 10 + (nChar - 0x30);
164 // 90 30 81 30 to E3 32 9A 35 maps to U+10000 to U+10FFFF:
165 if (nCode >= 189000 && nCode <= 1237575)
166 if (pDestBufEnd - pDestBufPtr >= 2)
168 nCode -= 189000 - 0x10000;
169 *pDestBufPtr++
170 = (sal_Unicode) ImplGetHighSurrogate(nCode);
171 *pDestBufPtr++
172 = (sal_Unicode) ImplGetLowSurrogate(nCode);
174 else
175 goto no_output;
176 else
178 ImplGb180302000ToUnicodeRange const * pRange
179 = pGb18030Ranges;
180 sal_uInt32 nFirstNonRange = 0;
181 for (;;)
183 if (pRange->m_nNonRangeDataIndex == -1)
184 goto bad_input;
185 else if (nCode < pRange->m_nFirstLinear)
187 if (pDestBufPtr != pDestBufEnd)
188 *pDestBufPtr++
189 = pGb18030Data[
190 pRange->m_nNonRangeDataIndex
191 + (nCode - nFirstNonRange)];
192 else
193 goto no_output;
194 break;
196 else if (nCode < pRange->m_nPastLinear)
198 if (pDestBufPtr != pDestBufEnd)
199 *pDestBufPtr++
200 = (sal_Unicode)
201 (pRange->m_nFirstUnicode
202 + (nCode
203 - pRange->
204 m_nFirstLinear));
205 else
206 goto no_output;
207 break;
209 nFirstNonRange = (pRange++)->m_nPastLinear;
212 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
214 else
216 bUndefined = false;
217 goto bad_input;
219 break;
221 continue;
223 bad_input:
224 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
225 bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
226 &nInfo))
228 case sal::detail::textenc::BAD_INPUT_STOP:
229 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
230 break;
232 case sal::detail::textenc::BAD_INPUT_CONTINUE:
233 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
234 continue;
236 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
237 goto no_output;
239 break;
241 no_output:
242 --pSrcBuf;
243 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
244 break;
247 if (eState != IMPL_GB_18030_TO_UNICODE_STATE_0
248 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
249 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
250 == 0)
252 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
253 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
254 else
255 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
256 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
257 &nInfo))
259 case sal::detail::textenc::BAD_INPUT_STOP:
260 case sal::detail::textenc::BAD_INPUT_CONTINUE:
261 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
262 break;
264 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
265 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
266 break;
270 if (pContext)
272 static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_eState = eState;
273 static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_nCode = nCode;
275 if (pInfo)
276 *pInfo = nInfo;
277 if (pSrcCvtBytes)
278 *pSrcCvtBytes = nConverted;
280 return pDestBufPtr - pDestBuf;
283 sal_Size ImplConvertUnicodeToGb18030(void const * pData,
284 void * pContext,
285 sal_Unicode const * pSrcBuf,
286 sal_Size nSrcChars,
287 char * pDestBuf,
288 sal_Size nDestBytes,
289 sal_uInt32 nFlags,
290 sal_uInt32 * pInfo,
291 sal_Size * pSrcCvtChars)
293 sal_uInt32 const * pGb18030Data
294 = static_cast< ImplGb18030ConverterData const * >(pData)->
295 m_pUnicodeToGb18030Data;
296 ImplUnicodeToGb180302000Range const * pGb18030Ranges
297 = static_cast< ImplGb18030ConverterData const * >(pData)->
298 m_pUnicodeToGb18030Ranges;
299 sal_Unicode nHighSurrogate = 0;
300 sal_uInt32 nInfo = 0;
301 sal_Size nConverted = 0;
302 char * pDestBufPtr = pDestBuf;
303 char * pDestBufEnd = pDestBuf + nDestBytes;
305 if (pContext)
306 nHighSurrogate
307 = static_cast<ImplUnicodeToTextContext *>(pContext)->m_nHighSurrogate;
309 for (; nConverted < nSrcChars; ++nConverted)
311 bool bUndefined = true;
312 sal_uInt32 nChar = *pSrcBuf++;
313 if (nHighSurrogate == 0)
315 if (ImplIsHighSurrogate(nChar))
317 nHighSurrogate = (sal_Unicode) nChar;
318 continue;
321 else if (ImplIsLowSurrogate(nChar))
322 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
323 else
325 bUndefined = false;
326 goto bad_input;
329 if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
331 bUndefined = false;
332 goto bad_input;
335 if (nChar < 0x80)
336 if (pDestBufPtr != pDestBufEnd)
337 *pDestBufPtr++ = static_cast< char >(nChar);
338 else
339 goto no_output;
340 else if (nChar < 0x10000)
342 ImplUnicodeToGb180302000Range const * pRange = pGb18030Ranges;
343 sal_Unicode nFirstNonRange = 0x80;
344 for (;;)
346 if (nChar < pRange->m_nFirstUnicode)
348 sal_uInt32 nCode
349 = pGb18030Data[pRange->m_nNonRangeDataIndex
350 + (nChar - nFirstNonRange)];
351 if (pDestBufEnd - pDestBufPtr
352 >= (nCode <= 0xFFFF ? 2 : 4))
354 if (nCode > 0xFFFF)
356 *pDestBufPtr++ = static_cast< char >(nCode >> 24);
357 *pDestBufPtr++ = static_cast< char >(nCode >> 16 & 0xFF);
359 *pDestBufPtr++ = static_cast< char >(nCode >> 8 & 0xFF);
360 *pDestBufPtr++ = static_cast< char >(nCode & 0xFF);
362 else
363 goto no_output;
364 break;
366 else if (nChar <= pRange->m_nLastUnicode)
368 if (pDestBufEnd - pDestBufPtr >= 4)
370 sal_uInt32 nCode
371 = pRange->m_nFirstLinear
372 + (nChar - pRange->m_nFirstUnicode);
373 *pDestBufPtr++ = static_cast< char >(nCode / 12600 + 0x81);
374 *pDestBufPtr++
375 = static_cast< char >(nCode / 1260 % 10 + 0x30);
376 *pDestBufPtr++ = static_cast< char >(nCode / 10 % 126 + 0x81);
377 *pDestBufPtr++ = static_cast< char >(nCode % 10 + 0x30);
379 else
380 goto no_output;
381 break;
383 nFirstNonRange
384 = (sal_Unicode) ((pRange++)->m_nLastUnicode + 1);
387 else
388 if (pDestBufEnd - pDestBufPtr >= 4)
390 sal_uInt32 nCode = nChar - 0x10000;
391 *pDestBufPtr++ = static_cast< char >(nCode / 12600 + 0x90);
392 *pDestBufPtr++ = static_cast< char >(nCode / 1260 % 10 + 0x30);
393 *pDestBufPtr++ = static_cast< char >(nCode / 10 % 126 + 0x81);
394 *pDestBufPtr++ = static_cast< char >(nCode % 10 + 0x30);
396 else
397 goto no_output;
398 nHighSurrogate = 0;
399 continue;
401 bad_input:
402 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
403 bUndefined, nChar, nFlags, &pDestBufPtr, pDestBufEnd,
404 &nInfo, NULL, 0, NULL))
406 case sal::detail::textenc::BAD_INPUT_STOP:
407 nHighSurrogate = 0;
408 break;
410 case sal::detail::textenc::BAD_INPUT_CONTINUE:
411 nHighSurrogate = 0;
412 continue;
414 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
415 goto no_output;
417 break;
419 no_output:
420 --pSrcBuf;
421 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
422 break;
425 if (nHighSurrogate != 0
426 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
427 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
428 == 0)
430 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
431 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
432 else
433 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
434 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
435 NULL, 0, NULL))
437 case sal::detail::textenc::BAD_INPUT_STOP:
438 case sal::detail::textenc::BAD_INPUT_CONTINUE:
439 nHighSurrogate = 0;
440 break;
442 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
443 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
444 break;
448 if (pContext)
449 static_cast<ImplUnicodeToTextContext *>(pContext)->m_nHighSurrogate
450 = nHighSurrogate;
451 if (pInfo)
452 *pInfo = nInfo;
453 if (pSrcCvtChars)
454 *pSrcCvtChars = nConverted;
456 return pDestBufPtr - pDestBuf;
459 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */