merged tag ooo/DEV300_m102
[LibreOffice.git] / sal / textenc / tcvtutf8.c
blobcc5141f2c3e2c9ed8d3b16e5c07a904d9fe48586
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * This file is part of OpenOffice.org.
11 * OpenOffice.org is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License version 3
13 * only, as published by the Free Software Foundation.
15 * OpenOffice.org is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License version 3 for more details
19 * (a copy is included in the LICENSE file that accompanied this code).
21 * You should have received a copy of the GNU Lesser General Public License
22 * version 3 along with OpenOffice.org. If not, see
23 * <http://www.openoffice.org/license.html>
24 * for a copy of the LGPLv3 License.
26 ************************************************************************/
28 #include "sal/types.h"
29 #include "rtl/alloc.h"
30 #include "rtl/textcvt.h"
32 #include "converter.h"
33 #include "tenchelp.h"
34 #include "unichars.h"
36 struct ImplUtf8ToUnicodeContext
38 sal_uInt32 nUtf32;
39 int nShift;
40 sal_Bool bCheckBom;
43 struct ImplUnicodeToUtf8Context
45 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
48 void * ImplCreateUtf8ToUnicodeContext(void)
50 void * p = rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext));
51 ImplResetUtf8ToUnicodeContext(p);
52 return p;
55 void ImplResetUtf8ToUnicodeContext(void * pContext)
57 if (pContext != NULL)
59 ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = -1;
60 ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = sal_True;
64 sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData,
65 void * pContext, sal_Char const * pSrcBuf,
66 sal_Size nSrcBytes, sal_Unicode * pDestBuf,
67 sal_Size nDestChars, sal_uInt32 nFlags,
68 sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
71 This function is very liberal with the UTF-8 input. Accepted are:
72 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
73 - surrogates (e.g., ED A0 80 to represent U+D800)
74 - encodings with up to six bytes (everything outside the range
75 U+0000..10FFFF is considered "undefined")
76 The first two of these points allow this routine to translate from both
77 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
80 int bJavaUtf8 = pData != NULL;
81 sal_uInt32 nUtf32 = 0;
82 int nShift = -1;
83 sal_Bool bCheckBom = sal_True;
84 sal_uInt32 nInfo = 0;
85 sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf;
86 sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
87 sal_Unicode * pDestBufPtr = pDestBuf;
88 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
90 if (pContext != NULL)
92 nUtf32 = ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32;
93 nShift = ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift;
94 bCheckBom = ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom;
97 while (pSrcBufPtr < pSrcBufEnd)
99 sal_Bool bUndefined = sal_False;
100 int bConsume = sal_True;
101 sal_uInt32 nChar = *pSrcBufPtr++;
102 if (nShift < 0)
103 if (nChar <= 0x7F)
105 nUtf32 = nChar;
106 goto transform;
108 else if (nChar <= 0xBF)
109 goto bad_input;
110 else if (nChar <= 0xDF)
112 nUtf32 = (nChar & 0x1F) << 6;
113 nShift = 0;
115 else if (nChar <= 0xEF)
117 nUtf32 = (nChar & 0x0F) << 12;
118 nShift = 6;
120 else if (nChar <= 0xF7)
122 nUtf32 = (nChar & 0x07) << 18;
123 nShift = 12;
125 else if (nChar <= 0xFB)
127 nUtf32 = (nChar & 0x03) << 24;
128 nShift = 18;
130 else if (nChar <= 0xFD)
132 nUtf32 = (nChar & 0x01) << 30;
133 nShift = 24;
135 else
136 goto bad_input;
137 else if ((nChar & 0xC0) == 0x80)
139 nUtf32 |= (nChar & 0x3F) << nShift;
140 if (nShift == 0)
141 goto transform;
142 else
143 nShift -= 6;
145 else
148 This byte is preceeded by a broken UTF-8 sequence; if this byte
149 is neither in the range [0x80..0xBF] nor in the range
150 [0xFE..0xFF], assume that this byte does not belong to that
151 broken sequence, but instead starts a new, legal UTF-8 sequence:
153 bConsume = nChar >= 0xFE;
154 goto bad_input;
156 continue;
158 transform:
159 if (!bCheckBom || nUtf32 != 0xFEFF
160 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
161 || bJavaUtf8)
163 if (nUtf32 <= 0xFFFF)
164 if (pDestBufPtr != pDestBufEnd)
165 *pDestBufPtr++ = (sal_Unicode) nUtf32;
166 else
167 goto no_output;
168 else if (nUtf32 <= 0x10FFFF)
169 if (pDestBufEnd - pDestBufPtr >= 2)
171 *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
172 *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
174 else
175 goto no_output;
176 else
178 bUndefined = sal_True;
179 goto bad_input;
182 nShift = -1;
183 bCheckBom = sal_False;
184 continue;
186 bad_input:
187 switch (ImplHandleBadInputTextToUnicodeConversion(
188 bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
189 &nInfo))
191 case IMPL_BAD_INPUT_STOP:
192 nShift = -1;
193 bCheckBom = sal_False;
194 if (!bConsume)
195 --pSrcBufPtr;
196 break;
198 case IMPL_BAD_INPUT_CONTINUE:
199 nShift = -1;
200 bCheckBom = sal_False;
201 if (!bConsume)
202 --pSrcBufPtr;
203 continue;
205 case IMPL_BAD_INPUT_NO_OUTPUT:
206 goto no_output;
208 break;
210 no_output:
211 --pSrcBufPtr;
212 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
213 break;
216 if (nShift >= 0
217 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
218 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
219 == 0)
221 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
222 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
223 else
224 switch (ImplHandleBadInputTextToUnicodeConversion(
225 sal_False, sal_True, 0, nFlags, &pDestBufPtr,
226 pDestBufEnd, &nInfo))
228 case IMPL_BAD_INPUT_STOP:
229 case IMPL_BAD_INPUT_CONTINUE:
230 nShift = -1;
231 bCheckBom = sal_False;
232 break;
234 case IMPL_BAD_INPUT_NO_OUTPUT:
235 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
236 break;
240 if (pContext != NULL)
242 ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32 = nUtf32;
243 ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = nShift;
244 ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = bCheckBom;
246 if (pInfo != NULL)
247 *pInfo = nInfo;
248 if (pSrcCvtBytes != NULL)
249 *pSrcCvtBytes = (sal_Char const *) pSrcBufPtr - pSrcBuf;
250 return pDestBufPtr - pDestBuf;
253 void * ImplCreateUnicodeToUtf8Context(void)
255 void * p = rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context));
256 ImplResetUnicodeToUtf8Context(p);
257 return p;
260 void ImplResetUnicodeToUtf8Context(void * pContext)
262 if (pContext != NULL)
263 ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = 0xFFFF;
266 sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData,
267 void * pContext, sal_Unicode const * pSrcBuf,
268 sal_Size nSrcChars, sal_Char * pDestBuf,
269 sal_Size nDestBytes, sal_uInt32 nFlags,
270 sal_uInt32 * pInfo, sal_Size* pSrcCvtChars)
272 int bJavaUtf8 = pData != NULL;
273 sal_Unicode nHighSurrogate = 0xFFFF;
274 sal_uInt32 nInfo = 0;
275 sal_Unicode const * pSrcBufPtr = pSrcBuf;
276 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
277 sal_Char * pDestBufPtr = pDestBuf;
278 sal_Char * pDestBufEnd = pDestBufPtr + nDestBytes;
280 if (pContext != NULL)
281 nHighSurrogate
282 = ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate;
284 if (nHighSurrogate == 0xFFFF)
286 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
287 && !bJavaUtf8)
289 if (pDestBufEnd - pDestBufPtr >= 3)
291 /* Write BOM (U+FEFF) as UTF-8: */
292 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xEF;
293 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBB;
294 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBF;
296 else
298 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
299 goto done;
302 nHighSurrogate = 0;
305 while (pSrcBufPtr < pSrcBufEnd)
307 sal_uInt32 nChar = *pSrcBufPtr++;
308 if (nHighSurrogate == 0)
310 if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
312 nHighSurrogate = (sal_Unicode) nChar;
313 continue;
316 else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
317 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
318 else
319 goto bad_input;
321 if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
322 || ImplIsNoncharacter(nChar))
323 goto bad_input;
325 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
326 if (pDestBufPtr != pDestBufEnd)
327 *pDestBufPtr++ = (sal_Char) nChar;
328 else
329 goto no_output;
330 else if (nChar <= 0x7FF)
331 if (pDestBufEnd - pDestBufPtr >= 2)
333 *pDestBufPtr++ = (sal_Char) (0xC0 | (nChar >> 6));
334 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
336 else
337 goto no_output;
338 else if (nChar <= 0xFFFF)
339 if (pDestBufEnd - pDestBufPtr >= 3)
341 *pDestBufPtr++ = (sal_Char) (0xE0 | (nChar >> 12));
342 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
343 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
345 else
346 goto no_output;
347 else if (pDestBufEnd - pDestBufPtr >= 4)
349 *pDestBufPtr++ = (sal_Char) (0xF0 | (nChar >> 18));
350 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 12) & 0x3F));
351 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
352 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
354 else
355 goto no_output;
356 nHighSurrogate = 0;
357 continue;
359 bad_input:
360 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags,
361 &pDestBufPtr,
362 pDestBufEnd, &nInfo,
363 NULL, 0, NULL))
365 case IMPL_BAD_INPUT_STOP:
366 nHighSurrogate = 0;
367 break;
369 case IMPL_BAD_INPUT_CONTINUE:
370 nHighSurrogate = 0;
371 continue;
373 case IMPL_BAD_INPUT_NO_OUTPUT:
374 goto no_output;
376 break;
378 no_output:
379 --pSrcBufPtr;
380 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
381 break;
384 if (nHighSurrogate != 0
385 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
386 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
387 == 0)
389 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
390 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
391 else
392 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0,
393 nFlags,
394 &pDestBufPtr,
395 pDestBufEnd,
396 &nInfo, NULL, 0,
397 NULL))
399 case IMPL_BAD_INPUT_STOP:
400 case IMPL_BAD_INPUT_CONTINUE:
401 nHighSurrogate = 0;
402 break;
404 case IMPL_BAD_INPUT_NO_OUTPUT:
405 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
406 break;
410 done:
411 if (pContext != NULL)
412 ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate
413 = nHighSurrogate;
414 if (pInfo != NULL)
415 *pInfo = nInfo;
416 if (pSrcCvtChars != NULL)
417 *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
418 return pDestBufPtr - pDestBuf;