Update ooo320-m1
[ooovba.git] / sal / textenc / convertgb18030.c
blobc29534d74efc005e01f2b90e7ae7dd06a186df13
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: convertgb18030.c,v $
10 * $Revision: 1.9 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 #include "convertgb18030.h"
32 #include "context.h"
33 #include "converter.h"
34 #include "tenchelp.h"
35 #include "unichars.h"
36 #include "rtl/alloc.h"
37 #include "rtl/textcvt.h"
38 #include "sal/types.h"
/* Decoder position inside a (possibly multi-byte) GB 18030 sequence. */
typedef enum
{
    /* No sequence in progress; the next byte starts a new character. */
    IMPL_GB_18030_TO_UNICODE_STATE_0,
    /* A lead byte 0x81..0xFE has been consumed. */
    IMPL_GB_18030_TO_UNICODE_STATE_1,
    /* Lead byte plus a digit byte 0x30..0x39 consumed (four-byte form). */
    IMPL_GB_18030_TO_UNICODE_STATE_2,
    /* Three bytes of a four-byte sequence consumed; one digit byte left. */
    IMPL_GB_18030_TO_UNICODE_STATE_3
} ImplGb18030ToUnicodeState;
48 typedef struct
50 ImplGb18030ToUnicodeState m_eState;
51 sal_uInt32 m_nCode;
52 } ImplGb18030ToUnicodeContext;
54 void * ImplCreateGb18030ToUnicodeContext(void)
56 void * pContext
57 = rtl_allocateMemory(sizeof (ImplGb18030ToUnicodeContext));
58 ((ImplGb18030ToUnicodeContext *) pContext)->m_eState
59 = IMPL_GB_18030_TO_UNICODE_STATE_0;
60 return pContext;
63 void ImplResetGb18030ToUnicodeContext(void * pContext)
65 if (pContext)
66 ((ImplGb18030ToUnicodeContext *) pContext)->m_eState
67 = IMPL_GB_18030_TO_UNICODE_STATE_0;
70 sal_Size ImplConvertGb18030ToUnicode(ImplTextConverterData const * pData,
71 void * pContext,
72 sal_Char const * pSrcBuf,
73 sal_Size nSrcBytes,
74 sal_Unicode * pDestBuf,
75 sal_Size nDestChars,
76 sal_uInt32 nFlags,
77 sal_uInt32 * pInfo,
78 sal_Size * pSrcCvtBytes)
80 sal_Unicode const * pGb18030Data
81 = ((ImplGb18030ConverterData const *) pData)->m_pGb18030ToUnicodeData;
82 ImplGb180302000ToUnicodeRange const * pGb18030Ranges
83 = ((ImplGb18030ConverterData const *) pData)->
84 m_pGb18030ToUnicodeRanges;
85 ImplGb18030ToUnicodeState eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
86 sal_uInt32 nCode = 0;
87 sal_uInt32 nInfo = 0;
88 sal_Size nConverted = 0;
89 sal_Unicode * pDestBufPtr = pDestBuf;
90 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
92 if (pContext)
94 eState = ((ImplGb18030ToUnicodeContext *) pContext)->m_eState;
95 nCode = ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode;
98 for (; nConverted < nSrcBytes; ++nConverted)
100 sal_Bool bUndefined = sal_True;
101 sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
102 switch (eState)
104 case IMPL_GB_18030_TO_UNICODE_STATE_0:
105 if (nChar < 0x80)
106 if (pDestBufPtr != pDestBufEnd)
107 *pDestBufPtr++ = (sal_Unicode) nChar;
108 else
109 goto no_output;
110 else if (nChar == 0x80)
111 goto bad_input;
112 else if (nChar <= 0xFE)
114 nCode = nChar - 0x81;
115 eState = IMPL_GB_18030_TO_UNICODE_STATE_1;
117 else
119 bUndefined = sal_False;
120 goto bad_input;
122 break;
124 case IMPL_GB_18030_TO_UNICODE_STATE_1:
125 if (nChar >= 0x30 && nChar <= 0x39)
127 nCode = nCode * 10 + (nChar - 0x30);
128 eState = IMPL_GB_18030_TO_UNICODE_STATE_2;
130 else if ((nChar >= 0x40 && nChar <= 0x7E)
131 || (nChar >= 0x80 && nChar <= 0xFE))
133 nCode = nCode * 190 + (nChar <= 0x7E ? nChar - 0x40 :
134 nChar - 0x80 + 63);
135 if (pDestBufPtr != pDestBufEnd)
136 *pDestBufPtr++ = pGb18030Data[nCode];
137 else
138 goto no_output;
139 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
141 else
143 bUndefined = sal_False;
144 goto bad_input;
146 break;
148 case IMPL_GB_18030_TO_UNICODE_STATE_2:
149 if (nChar >= 0x81 && nChar <= 0xFE)
151 nCode = nCode * 126 + (nChar - 0x81);
152 eState = IMPL_GB_18030_TO_UNICODE_STATE_3;
154 else
156 bUndefined = sal_False;
157 goto bad_input;
159 break;
161 case IMPL_GB_18030_TO_UNICODE_STATE_3:
162 if (nChar >= 0x30 && nChar <= 0x39)
164 nCode = nCode * 10 + (nChar - 0x30);
166 /* 90 30 81 30 to E3 32 9A 35 maps to U+10000 to U+10FFFF: */
167 if (nCode >= 189000 && nCode <= 1237575)
168 if (pDestBufEnd - pDestBufPtr >= 2)
170 nCode -= 189000 - 0x10000;
171 *pDestBufPtr++
172 = (sal_Unicode) ImplGetHighSurrogate(nCode);
173 *pDestBufPtr++
174 = (sal_Unicode) ImplGetLowSurrogate(nCode);
176 else
177 goto no_output;
178 else
180 ImplGb180302000ToUnicodeRange const * pRange
181 = pGb18030Ranges;
182 sal_uInt32 nFirstNonRange = 0;
183 for (;;)
185 if (pRange->m_nNonRangeDataIndex == -1)
186 goto bad_input;
187 else if (nCode < pRange->m_nFirstLinear)
189 if (pDestBufPtr != pDestBufEnd)
190 *pDestBufPtr++
191 = pGb18030Data[
192 pRange->m_nNonRangeDataIndex
193 + (nCode - nFirstNonRange)];
194 else
195 goto no_output;
196 break;
198 else if (nCode < pRange->m_nPastLinear)
200 if (pDestBufPtr != pDestBufEnd)
201 *pDestBufPtr++
202 = (sal_Unicode)
203 (pRange->m_nFirstUnicode
204 + (nCode
205 - pRange->
206 m_nFirstLinear));
207 else
208 goto no_output;
209 break;
211 nFirstNonRange = (pRange++)->m_nPastLinear;
214 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
216 else
218 bUndefined = sal_False;
219 goto bad_input;
221 break;
223 continue;
225 bad_input:
226 switch (ImplHandleBadInputTextToUnicodeConversion(
227 bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
228 &nInfo))
230 case IMPL_BAD_INPUT_STOP:
231 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
232 break;
234 case IMPL_BAD_INPUT_CONTINUE:
235 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
236 continue;
238 case IMPL_BAD_INPUT_NO_OUTPUT:
239 goto no_output;
241 break;
243 no_output:
244 --pSrcBuf;
245 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
246 break;
249 if (eState != IMPL_GB_18030_TO_UNICODE_STATE_0
250 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
251 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
252 == 0)
254 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
255 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
256 else
257 switch (ImplHandleBadInputTextToUnicodeConversion(
258 sal_False, sal_True, 0, nFlags, &pDestBufPtr,
259 pDestBufEnd, &nInfo))
261 case IMPL_BAD_INPUT_STOP:
262 case IMPL_BAD_INPUT_CONTINUE:
263 eState = IMPL_GB_18030_TO_UNICODE_STATE_0;
264 break;
266 case IMPL_BAD_INPUT_NO_OUTPUT:
267 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
268 break;
272 if (pContext)
274 ((ImplGb18030ToUnicodeContext *) pContext)->m_eState = eState;
275 ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode = nCode;
277 if (pInfo)
278 *pInfo = nInfo;
279 if (pSrcCvtBytes)
280 *pSrcCvtBytes = nConverted;
282 return pDestBufPtr - pDestBuf;
285 sal_Size ImplConvertUnicodeToGb18030(ImplTextConverterData const * pData,
286 void * pContext,
287 sal_Unicode const * pSrcBuf,
288 sal_Size nSrcChars,
289 sal_Char * pDestBuf,
290 sal_Size nDestBytes,
291 sal_uInt32 nFlags,
292 sal_uInt32 * pInfo,
293 sal_Size * pSrcCvtChars)
295 sal_uInt32 const * pGb18030Data
296 = ((ImplGb18030ConverterData const *) pData)->
297 m_pUnicodeToGb18030Data;
298 ImplUnicodeToGb180302000Range const * pGb18030Ranges
299 = ((ImplGb18030ConverterData const *) pData)->
300 m_pUnicodeToGb18030Ranges;
301 sal_Unicode nHighSurrogate = 0;
302 sal_uInt32 nInfo = 0;
303 sal_Size nConverted = 0;
304 sal_Char * pDestBufPtr = pDestBuf;
305 sal_Char * pDestBufEnd = pDestBuf + nDestBytes;
307 if (pContext)
308 nHighSurrogate
309 = ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate;
311 for (; nConverted < nSrcChars; ++nConverted)
313 sal_Bool bUndefined = sal_True;
314 sal_uInt32 nChar = *pSrcBuf++;
315 if (nHighSurrogate == 0)
317 if (ImplIsHighSurrogate(nChar))
319 nHighSurrogate = (sal_Unicode) nChar;
320 continue;
323 else if (ImplIsLowSurrogate(nChar))
324 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
325 else
327 bUndefined = sal_False;
328 goto bad_input;
331 if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
333 bUndefined = sal_False;
334 goto bad_input;
337 if (nChar < 0x80)
338 if (pDestBufPtr != pDestBufEnd)
339 *pDestBufPtr++ = (sal_Char) nChar;
340 else
341 goto no_output;
342 else if (nChar < 0x10000)
344 ImplUnicodeToGb180302000Range const * pRange = pGb18030Ranges;
345 sal_Unicode nFirstNonRange = 0x80;
346 for (;;)
348 if (nChar < pRange->m_nFirstUnicode)
350 sal_uInt32 nCode
351 = pGb18030Data[pRange->m_nNonRangeDataIndex
352 + (nChar - nFirstNonRange)];
353 if (pDestBufEnd - pDestBufPtr
354 >= (nCode <= 0xFFFF ? 2 : 4))
356 if (nCode > 0xFFFF)
358 *pDestBufPtr++ = (sal_Char) (nCode >> 24);
359 *pDestBufPtr++ = (sal_Char) (nCode >> 16 & 0xFF);
361 *pDestBufPtr++ = (sal_Char) (nCode >> 8 & 0xFF);
362 *pDestBufPtr++ = (sal_Char) (nCode & 0xFF);
364 else
365 goto no_output;
366 break;
368 else if (nChar <= pRange->m_nLastUnicode)
370 if (pDestBufEnd - pDestBufPtr >= 4)
372 sal_uInt32 nCode
373 = pRange->m_nFirstLinear
374 + (nChar - pRange->m_nFirstUnicode);
375 *pDestBufPtr++ = (sal_Char) (nCode / 12600 + 0x81);
376 *pDestBufPtr++
377 = (sal_Char) (nCode / 1260 % 10 + 0x30);
378 *pDestBufPtr++ = (sal_Char) (nCode / 10 % 126 + 0x81);
379 *pDestBufPtr++ = (sal_Char) (nCode % 10 + 0x30);
381 else
382 goto no_output;
383 break;
385 nFirstNonRange
386 = (sal_Unicode) ((pRange++)->m_nLastUnicode + 1);
389 else
390 if (pDestBufEnd - pDestBufPtr >= 4)
392 sal_uInt32 nCode = nChar - 0x10000;
393 *pDestBufPtr++ = (sal_Char) (nCode / 12600 + 0x90);
394 *pDestBufPtr++ = (sal_Char) (nCode / 1260 % 10 + 0x30);
395 *pDestBufPtr++ = (sal_Char) (nCode / 10 % 126 + 0x81);
396 *pDestBufPtr++ = (sal_Char) (nCode % 10 + 0x30);
398 else
399 goto no_output;
400 nHighSurrogate = 0;
401 continue;
403 bad_input:
404 switch (ImplHandleBadInputUnicodeToTextConversion(bUndefined,
405 nChar,
406 nFlags,
407 &pDestBufPtr,
408 pDestBufEnd,
409 &nInfo,
410 NULL,
412 NULL))
414 case IMPL_BAD_INPUT_STOP:
415 nHighSurrogate = 0;
416 break;
418 case IMPL_BAD_INPUT_CONTINUE:
419 nHighSurrogate = 0;
420 continue;
422 case IMPL_BAD_INPUT_NO_OUTPUT:
423 goto no_output;
425 break;
427 no_output:
428 --pSrcBuf;
429 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
430 break;
433 if (nHighSurrogate != 0
434 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
435 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
436 == 0)
438 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
439 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
440 else
441 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False,
443 nFlags,
444 &pDestBufPtr,
445 pDestBufEnd,
446 &nInfo,
447 NULL,
449 NULL))
451 case IMPL_BAD_INPUT_STOP:
452 case IMPL_BAD_INPUT_CONTINUE:
453 nHighSurrogate = 0;
454 break;
456 case IMPL_BAD_INPUT_NO_OUTPUT:
457 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
458 break;
462 if (pContext)
463 ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate
464 = nHighSurrogate;
465 if (pInfo)
466 *pInfo = nInfo;
467 if (pSrcCvtChars)
468 *pSrcCvtChars = nConverted;
470 return pDestBufPtr - pDestBuf;