Update ooo320-m1
[ooovba.git] / sal / textenc / tcvtutf8.c
blobc5587a15df9c104e3df87bc3b775c5f218541b0f
/*************************************************************************
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * Copyright 2008 by Sun Microsystems, Inc.
 *
 * OpenOffice.org - a multi-platform office productivity suite
 *
 * $RCSfile: tcvtutf8.c,v $
 * $Revision: 1.10 $
 *
 * This file is part of OpenOffice.org.
 *
 * OpenOffice.org is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 3
 * only, as published by the Free Software Foundation.
 *
 * OpenOffice.org is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License version 3 for more details
 * (a copy is included in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU Lesser General Public License
 * version 3 along with OpenOffice.org. If not, see
 * <http://www.openoffice.org/license.html>
 * for a copy of the LGPLv3 License.
 *
 ************************************************************************/
31 #include "sal/types.h"
32 #include "rtl/alloc.h"
33 #include "rtl/textcvt.h"
35 #include "converter.h"
36 #include "tenchelp.h"
37 #include "unichars.h"
39 struct ImplUtf8ToUnicodeContext
41 sal_uInt32 nUtf32;
42 int nShift;
43 sal_Bool bCheckBom;
46 struct ImplUnicodeToUtf8Context
48 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
51 void * ImplCreateUtf8ToUnicodeContext(void)
53 void * p = rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext));
54 ImplResetUtf8ToUnicodeContext(p);
55 return p;
58 void ImplResetUtf8ToUnicodeContext(void * pContext)
60 if (pContext != NULL)
62 ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = -1;
63 ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = sal_True;
67 sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData,
68 void * pContext, sal_Char const * pSrcBuf,
69 sal_Size nSrcBytes, sal_Unicode * pDestBuf,
70 sal_Size nDestChars, sal_uInt32 nFlags,
71 sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
74 This function is very liberal with the UTF-8 input. Accepted are:
75 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
76 - surrogates (e.g., ED A0 80 to represent U+D800)
77 - encodings with up to six bytes (everything outside the range
78 U+0000..10FFFF is considered "undefined")
79 The first two of these points allow this routine to translate from both
80 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
83 int bJavaUtf8 = pData != NULL;
84 sal_uInt32 nUtf32 = 0;
85 int nShift = -1;
86 sal_Bool bCheckBom = sal_True;
87 sal_uInt32 nInfo = 0;
88 sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf;
89 sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
90 sal_Unicode * pDestBufPtr = pDestBuf;
91 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
93 if (pContext != NULL)
95 nUtf32 = ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32;
96 nShift = ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift;
97 bCheckBom = ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom;
100 while (pSrcBufPtr < pSrcBufEnd)
102 sal_Bool bUndefined = sal_False;
103 int bConsume = sal_True;
104 sal_uInt32 nChar = *pSrcBufPtr++;
105 if (nShift < 0)
106 if (nChar <= 0x7F)
108 nUtf32 = nChar;
109 goto transform;
111 else if (nChar <= 0xBF)
112 goto bad_input;
113 else if (nChar <= 0xDF)
115 nUtf32 = (nChar & 0x1F) << 6;
116 nShift = 0;
118 else if (nChar <= 0xEF)
120 nUtf32 = (nChar & 0x0F) << 12;
121 nShift = 6;
123 else if (nChar <= 0xF7)
125 nUtf32 = (nChar & 0x07) << 18;
126 nShift = 12;
128 else if (nChar <= 0xFB)
130 nUtf32 = (nChar & 0x03) << 24;
131 nShift = 18;
133 else if (nChar <= 0xFD)
135 nUtf32 = (nChar & 0x01) << 30;
136 nShift = 24;
138 else
139 goto bad_input;
140 else if ((nChar & 0xC0) == 0x80)
142 nUtf32 |= (nChar & 0x3F) << nShift;
143 if (nShift == 0)
144 goto transform;
145 else
146 nShift -= 6;
148 else
151 This byte is preceeded by a broken UTF-8 sequence; if this byte
152 is neither in the range [0x80..0xBF] nor in the range
153 [0xFE..0xFF], assume that this byte does not belong to that
154 broken sequence, but instead starts a new, legal UTF-8 sequence:
156 bConsume = nChar >= 0xFE;
157 goto bad_input;
159 continue;
161 transform:
162 if (!bCheckBom || nUtf32 != 0xFEFF
163 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
164 || bJavaUtf8)
166 if (nUtf32 <= 0xFFFF)
167 if (pDestBufPtr != pDestBufEnd)
168 *pDestBufPtr++ = (sal_Unicode) nUtf32;
169 else
170 goto no_output;
171 else if (nUtf32 <= 0x10FFFF)
172 if (pDestBufEnd - pDestBufPtr >= 2)
174 *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
175 *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
177 else
178 goto no_output;
179 else
181 bUndefined = sal_True;
182 goto bad_input;
185 nShift = -1;
186 bCheckBom = sal_False;
187 continue;
189 bad_input:
190 switch (ImplHandleBadInputTextToUnicodeConversion(
191 bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
192 &nInfo))
194 case IMPL_BAD_INPUT_STOP:
195 nShift = -1;
196 bCheckBom = sal_False;
197 if (!bConsume)
198 --pSrcBufPtr;
199 break;
201 case IMPL_BAD_INPUT_CONTINUE:
202 nShift = -1;
203 bCheckBom = sal_False;
204 if (!bConsume)
205 --pSrcBufPtr;
206 continue;
208 case IMPL_BAD_INPUT_NO_OUTPUT:
209 goto no_output;
211 break;
213 no_output:
214 --pSrcBufPtr;
215 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
216 break;
219 if (nShift >= 0
220 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
221 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
222 == 0)
224 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
225 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
226 else
227 switch (ImplHandleBadInputTextToUnicodeConversion(
228 sal_False, sal_True, 0, nFlags, &pDestBufPtr,
229 pDestBufEnd, &nInfo))
231 case IMPL_BAD_INPUT_STOP:
232 case IMPL_BAD_INPUT_CONTINUE:
233 nShift = -1;
234 bCheckBom = sal_False;
235 break;
237 case IMPL_BAD_INPUT_NO_OUTPUT:
238 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
239 break;
243 if (pContext != NULL)
245 ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32 = nUtf32;
246 ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = nShift;
247 ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = bCheckBom;
249 if (pInfo != NULL)
250 *pInfo = nInfo;
251 if (pSrcCvtBytes != NULL)
252 *pSrcCvtBytes = (sal_Char const *) pSrcBufPtr - pSrcBuf;
253 return pDestBufPtr - pDestBuf;
256 void * ImplCreateUnicodeToUtf8Context(void)
258 void * p = rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context));
259 ImplResetUnicodeToUtf8Context(p);
260 return p;
263 void ImplResetUnicodeToUtf8Context(void * pContext)
265 if (pContext != NULL)
266 ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = 0xFFFF;
269 sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData,
270 void * pContext, sal_Unicode const * pSrcBuf,
271 sal_Size nSrcChars, sal_Char * pDestBuf,
272 sal_Size nDestBytes, sal_uInt32 nFlags,
273 sal_uInt32 * pInfo, sal_Size* pSrcCvtChars)
275 int bJavaUtf8 = pData != NULL;
276 sal_Unicode nHighSurrogate = 0xFFFF;
277 sal_uInt32 nInfo = 0;
278 sal_Unicode const * pSrcBufPtr = pSrcBuf;
279 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
280 sal_Char * pDestBufPtr = pDestBuf;
281 sal_Char * pDestBufEnd = pDestBufPtr + nDestBytes;
283 if (pContext != NULL)
284 nHighSurrogate
285 = ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate;
287 if (nHighSurrogate == 0xFFFF)
289 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
290 && !bJavaUtf8)
292 if (pDestBufEnd - pDestBufPtr >= 3)
294 /* Write BOM (U+FEFF) as UTF-8: */
295 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xEF;
296 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBB;
297 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBF;
299 else
301 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
302 goto done;
305 nHighSurrogate = 0;
308 while (pSrcBufPtr < pSrcBufEnd)
310 sal_uInt32 nChar = *pSrcBufPtr++;
311 if (nHighSurrogate == 0)
313 if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
315 nHighSurrogate = (sal_Unicode) nChar;
316 continue;
319 else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
320 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
321 else
322 goto bad_input;
324 if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
325 || ImplIsNoncharacter(nChar))
326 goto bad_input;
328 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
329 if (pDestBufPtr != pDestBufEnd)
330 *pDestBufPtr++ = (sal_Char) nChar;
331 else
332 goto no_output;
333 else if (nChar <= 0x7FF)
334 if (pDestBufEnd - pDestBufPtr >= 2)
336 *pDestBufPtr++ = (sal_Char) (0xC0 | (nChar >> 6));
337 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
339 else
340 goto no_output;
341 else if (nChar <= 0xFFFF)
342 if (pDestBufEnd - pDestBufPtr >= 3)
344 *pDestBufPtr++ = (sal_Char) (0xE0 | (nChar >> 12));
345 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
346 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
348 else
349 goto no_output;
350 else if (pDestBufEnd - pDestBufPtr >= 4)
352 *pDestBufPtr++ = (sal_Char) (0xF0 | (nChar >> 18));
353 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 12) & 0x3F));
354 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
355 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
357 else
358 goto no_output;
359 nHighSurrogate = 0;
360 continue;
362 bad_input:
363 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags,
364 &pDestBufPtr,
365 pDestBufEnd, &nInfo,
366 NULL, 0, NULL))
368 case IMPL_BAD_INPUT_STOP:
369 nHighSurrogate = 0;
370 break;
372 case IMPL_BAD_INPUT_CONTINUE:
373 nHighSurrogate = 0;
374 continue;
376 case IMPL_BAD_INPUT_NO_OUTPUT:
377 goto no_output;
379 break;
381 no_output:
382 --pSrcBufPtr;
383 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
384 break;
387 if (nHighSurrogate != 0
388 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
389 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
390 == 0)
392 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
393 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
394 else
395 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0,
396 nFlags,
397 &pDestBufPtr,
398 pDestBufEnd,
399 &nInfo, NULL, 0,
400 NULL))
402 case IMPL_BAD_INPUT_STOP:
403 case IMPL_BAD_INPUT_CONTINUE:
404 nHighSurrogate = 0;
405 break;
407 case IMPL_BAD_INPUT_NO_OUTPUT:
408 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
409 break;
413 done:
414 if (pContext != NULL)
415 ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate
416 = nHighSurrogate;
417 if (pInfo != NULL)
418 *pInfo = nInfo;
419 if (pSrcCvtChars != NULL)
420 *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
421 return pDestBufPtr - pDestBuf;