Bump for 3.6-28
[LibreOffice.git] / sal / textenc / convertgb18030.cxx
blob48d742fe427833812c2e2eb2b5b14825c1bd02d8
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*************************************************************************
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * Copyright 2000, 2010 Oracle and/or its affiliates.
8 * OpenOffice.org - a multi-platform office productivity suite
10 * This file is part of OpenOffice.org.
12 * OpenOffice.org is free software: you can redistribute it and/or modify
13 * it under the terms of the GNU Lesser General Public License version 3
14 * only, as published by the Free Software Foundation.
16 * OpenOffice.org is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser General Public License version 3 for more details
20 * (a copy is included in the LICENSE file that accompanied this code).
22 * You should have received a copy of the GNU Lesser General Public License
23 * version 3 along with OpenOffice.org. If not, see
24 * <http://www.openoffice.org/license.html>
25 * for a copy of the LGPLv3 License.
27 ************************************************************************/
29 #include "sal/config.h"
31 #include "rtl/textcvt.h"
32 #include "sal/types.h"
34 #include "context.hxx"
35 #include "converter.hxx"
36 #include "convertgb18030.hxx"
37 #include "tenchelp.hxx"
38 #include "unichars.hxx"
40 namespace {
42 enum ImplGb18030ToUnicodeState
44 IMPL_GB_18030_TO_UNICODE_STATE_0,
45 IMPL_GB_18030_TO_UNICODE_STATE_1,
46 IMPL_GB_18030_TO_UNICODE_STATE_2,
47 IMPL_GB_18030_TO_UNICODE_STATE_3
50 struct ImplGb18030ToUnicodeContext
52 ImplGb18030ToUnicodeState m_eState;
53 sal_uInt32 m_nCode;
58 void * ImplCreateGb18030ToUnicodeContext()
60 ImplGb18030ToUnicodeContext * pContext = new ImplGb18030ToUnicodeContext;
61 pContext->m_eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
62 return pContext;
65 void ImplResetGb18030ToUnicodeContext(void * pContext)
67 if (pContext)
68 static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_eState
69 = IMPL_GB_18030_TO_UNICODE_STATE_0;
72 void ImplDestroyGb18030ToUnicodeContext(void * pContext)
74 delete static_cast< ImplGb18030ToUnicodeContext * >(pContext);
77 sal_Size ImplConvertGb18030ToUnicode(void const * pData,
78 void * pContext,
79 char const * pSrcBuf,
80 sal_Size nSrcBytes,
81 sal_Unicode * pDestBuf,
82 sal_Size nDestChars,
83 sal_uInt32 nFlags,
84 sal_uInt32 * pInfo,
85 sal_Size * pSrcCvtBytes)
87 sal_Unicode const * pGb18030Data
88 = static_cast< ImplGb18030ConverterData const * >(pData)->m_pGb18030ToUnicodeData;
89 ImplGb180302000ToUnicodeRange const * pGb18030Ranges
90 = static_cast< ImplGb18030ConverterData const * >(pData)->
91 m_pGb18030ToUnicodeRanges;
92 ImplGb18030ToUnicodeState eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
93 sal_uInt32 nCode = 0;
94 sal_uInt32 nInfo = 0;
95 sal_Size nConverted = 0;
96 sal_Unicode * pDestBufPtr = pDestBuf;
97 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
99 if (pContext)
101 eState = static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_eState;
102 nCode = static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_nCode;
105 for (; nConverted < nSrcBytes; ++nConverted)
107 bool bUndefined = true;
108 sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
109 switch (eState)
111 case IMPL_GB_18030_TO_UNICODE_STATE_0:
112 if (nChar < 0x80)
113 if (pDestBufPtr != pDestBufEnd)
114 *pDestBufPtr++ = (sal_Unicode) nChar;
115 else
116 goto no_output;
117 else if (nChar == 0x80)
118 goto bad_input;
119 else if (nChar <= 0xFE)
121 nCode = nChar - 0x81;
122 eState = IMPL_GB_18030_TO_UNICODE_STATE_1;
124 else
126 bUndefined = false;
127 goto bad_input;
129 break;
131 case IMPL_GB_18030_TO_UNICODE_STATE_1:
132 if (nChar >= 0x30 && nChar <= 0x39)
134 nCode = nCode * 10 + (nChar - 0x30);
135 eState = IMPL_GB_18030_TO_UNICODE_STATE_2;
137 else if ((nChar >= 0x40 && nChar <= 0x7E)
138 || (nChar >= 0x80 && nChar <= 0xFE))
140 nCode = nCode * 190 + (nChar <= 0x7E ? nChar - 0x40 :
141 nChar - 0x80 + 63);
142 if (pDestBufPtr != pDestBufEnd)
143 *pDestBufPtr++ = pGb18030Data[nCode];
144 else
145 goto no_output;
146 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
148 else
150 bUndefined = false;
151 goto bad_input;
153 break;
155 case IMPL_GB_18030_TO_UNICODE_STATE_2:
156 if (nChar >= 0x81 && nChar <= 0xFE)
158 nCode = nCode * 126 + (nChar - 0x81);
159 eState = IMPL_GB_18030_TO_UNICODE_STATE_3;
161 else
163 bUndefined = false;
164 goto bad_input;
166 break;
168 case IMPL_GB_18030_TO_UNICODE_STATE_3:
169 if (nChar >= 0x30 && nChar <= 0x39)
171 nCode = nCode * 10 + (nChar - 0x30);
173 // 90 30 81 30 to E3 32 9A 35 maps to U+10000 to U+10FFFF:
174 if (nCode >= 189000 && nCode <= 1237575)
175 if (pDestBufEnd - pDestBufPtr >= 2)
177 nCode -= 189000 - 0x10000;
178 *pDestBufPtr++
179 = (sal_Unicode) ImplGetHighSurrogate(nCode);
180 *pDestBufPtr++
181 = (sal_Unicode) ImplGetLowSurrogate(nCode);
183 else
184 goto no_output;
185 else
187 ImplGb180302000ToUnicodeRange const * pRange
188 = pGb18030Ranges;
189 sal_uInt32 nFirstNonRange = 0;
190 for (;;)
192 if (pRange->m_nNonRangeDataIndex == -1)
193 goto bad_input;
194 else if (nCode < pRange->m_nFirstLinear)
196 if (pDestBufPtr != pDestBufEnd)
197 *pDestBufPtr++
198 = pGb18030Data[
199 pRange->m_nNonRangeDataIndex
200 + (nCode - nFirstNonRange)];
201 else
202 goto no_output;
203 break;
205 else if (nCode < pRange->m_nPastLinear)
207 if (pDestBufPtr != pDestBufEnd)
208 *pDestBufPtr++
209 = (sal_Unicode)
210 (pRange->m_nFirstUnicode
211 + (nCode
212 - pRange->
213 m_nFirstLinear));
214 else
215 goto no_output;
216 break;
218 nFirstNonRange = (pRange++)->m_nPastLinear;
221 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
223 else
225 bUndefined = false;
226 goto bad_input;
228 break;
230 continue;
232 bad_input:
233 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
234 bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
235 &nInfo))
237 case sal::detail::textenc::BAD_INPUT_STOP:
238 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
239 break;
241 case sal::detail::textenc::BAD_INPUT_CONTINUE:
242 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
243 continue;
245 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
246 goto no_output;
248 break;
250 no_output:
251 --pSrcBuf;
252 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
253 break;
256 if (eState != IMPL_GB_18030_TO_UNICODE_STATE_0
257 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
258 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
259 == 0)
261 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
262 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
263 else
264 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
265 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
266 &nInfo))
268 case sal::detail::textenc::BAD_INPUT_STOP:
269 case sal::detail::textenc::BAD_INPUT_CONTINUE:
270 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
271 break;
273 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
274 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
275 break;
279 if (pContext)
281 static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_eState = eState;
282 static_cast< ImplGb18030ToUnicodeContext * >(pContext)->m_nCode = nCode;
284 if (pInfo)
285 *pInfo = nInfo;
286 if (pSrcCvtBytes)
287 *pSrcCvtBytes = nConverted;
289 return pDestBufPtr - pDestBuf;
292 sal_Size ImplConvertUnicodeToGb18030(void const * pData,
293 void * pContext,
294 sal_Unicode const * pSrcBuf,
295 sal_Size nSrcChars,
296 char * pDestBuf,
297 sal_Size nDestBytes,
298 sal_uInt32 nFlags,
299 sal_uInt32 * pInfo,
300 sal_Size * pSrcCvtChars)
302 sal_uInt32 const * pGb18030Data
303 = static_cast< ImplGb18030ConverterData const * >(pData)->
304 m_pUnicodeToGb18030Data;
305 ImplUnicodeToGb180302000Range const * pGb18030Ranges
306 = static_cast< ImplGb18030ConverterData const * >(pData)->
307 m_pUnicodeToGb18030Ranges;
308 sal_Unicode nHighSurrogate = 0;
309 sal_uInt32 nInfo = 0;
310 sal_Size nConverted = 0;
311 char * pDestBufPtr = pDestBuf;
312 char * pDestBufEnd = pDestBuf + nDestBytes;
314 if (pContext)
315 nHighSurrogate
316 = ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate;
318 for (; nConverted < nSrcChars; ++nConverted)
320 bool bUndefined = true;
321 sal_uInt32 nChar = *pSrcBuf++;
322 if (nHighSurrogate == 0)
324 if (ImplIsHighSurrogate(nChar))
326 nHighSurrogate = (sal_Unicode) nChar;
327 continue;
330 else if (ImplIsLowSurrogate(nChar))
331 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
332 else
334 bUndefined = false;
335 goto bad_input;
338 if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
340 bUndefined = false;
341 goto bad_input;
344 if (nChar < 0x80)
345 if (pDestBufPtr != pDestBufEnd)
346 *pDestBufPtr++ = static_cast< char >(nChar);
347 else
348 goto no_output;
349 else if (nChar < 0x10000)
351 ImplUnicodeToGb180302000Range const * pRange = pGb18030Ranges;
352 sal_Unicode nFirstNonRange = 0x80;
353 for (;;)
355 if (nChar < pRange->m_nFirstUnicode)
357 sal_uInt32 nCode
358 = pGb18030Data[pRange->m_nNonRangeDataIndex
359 + (nChar - nFirstNonRange)];
360 if (pDestBufEnd - pDestBufPtr
361 >= (nCode <= 0xFFFF ? 2 : 4))
363 if (nCode > 0xFFFF)
365 *pDestBufPtr++ = static_cast< char >(nCode >> 24);
366 *pDestBufPtr++ = static_cast< char >(nCode >> 16 & 0xFF);
368 *pDestBufPtr++ = static_cast< char >(nCode >> 8 & 0xFF);
369 *pDestBufPtr++ = static_cast< char >(nCode & 0xFF);
371 else
372 goto no_output;
373 break;
375 else if (nChar <= pRange->m_nLastUnicode)
377 if (pDestBufEnd - pDestBufPtr >= 4)
379 sal_uInt32 nCode
380 = pRange->m_nFirstLinear
381 + (nChar - pRange->m_nFirstUnicode);
382 *pDestBufPtr++ = static_cast< char >(nCode / 12600 + 0x81);
383 *pDestBufPtr++
384 = static_cast< char >(nCode / 1260 % 10 + 0x30);
385 *pDestBufPtr++ = static_cast< char >(nCode / 10 % 126 + 0x81);
386 *pDestBufPtr++ = static_cast< char >(nCode % 10 + 0x30);
388 else
389 goto no_output;
390 break;
392 nFirstNonRange
393 = (sal_Unicode) ((pRange++)->m_nLastUnicode + 1);
396 else
397 if (pDestBufEnd - pDestBufPtr >= 4)
399 sal_uInt32 nCode = nChar - 0x10000;
400 *pDestBufPtr++ = static_cast< char >(nCode / 12600 + 0x90);
401 *pDestBufPtr++ = static_cast< char >(nCode / 1260 % 10 + 0x30);
402 *pDestBufPtr++ = static_cast< char >(nCode / 10 % 126 + 0x81);
403 *pDestBufPtr++ = static_cast< char >(nCode % 10 + 0x30);
405 else
406 goto no_output;
407 nHighSurrogate = 0;
408 continue;
410 bad_input:
411 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
412 bUndefined, nChar, nFlags, &pDestBufPtr, pDestBufEnd,
413 &nInfo, NULL, 0, NULL))
415 case sal::detail::textenc::BAD_INPUT_STOP:
416 nHighSurrogate = 0;
417 break;
419 case sal::detail::textenc::BAD_INPUT_CONTINUE:
420 nHighSurrogate = 0;
421 continue;
423 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
424 goto no_output;
426 break;
428 no_output:
429 --pSrcBuf;
430 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
431 break;
434 if (nHighSurrogate != 0
435 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
436 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
437 == 0)
439 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
440 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
441 else
442 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
443 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
444 NULL, 0, NULL))
446 case sal::detail::textenc::BAD_INPUT_STOP:
447 case sal::detail::textenc::BAD_INPUT_CONTINUE:
448 nHighSurrogate = 0;
449 break;
451 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
452 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
453 break;
457 if (pContext)
458 ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate
459 = nHighSurrogate;
460 if (pInfo)
461 *pInfo = nInfo;
462 if (pSrcCvtChars)
463 *pSrcCvtChars = nConverted;
465 return pDestBufPtr - pDestBuf;
468 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */