Bump for 3.6-28
[LibreOffice.git] / sal / textenc / tcvtutf8.cxx
blobefa0e8dc0097d28212d813dc86449ad071446aaa
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*************************************************************************
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * Copyright 2000, 2010 Oracle and/or its affiliates.
8 * OpenOffice.org - a multi-platform office productivity suite
10 * This file is part of OpenOffice.org.
12 * OpenOffice.org is free software: you can redistribute it and/or modify
13 * it under the terms of the GNU Lesser General Public License version 3
14 * only, as published by the Free Software Foundation.
16 * OpenOffice.org is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser General Public License version 3 for more details
20 * (a copy is included in the LICENSE file that accompanied this code).
22 * You should have received a copy of the GNU Lesser General Public License
23 * version 3 along with OpenOffice.org. If not, see
24 * <http://www.openoffice.org/license.html>
25 * for a copy of the LGPLv3 License.
27 ************************************************************************/
29 #include "sal/config.h"
31 #include "sal/types.h"
32 #include "rtl/textcvt.h"
34 #include "converter.hxx"
35 #include "tcvtutf8.hxx"
36 #include "tenchelp.hxx"
37 #include "unichars.hxx"
39 struct ImplUtf8ToUnicodeContext
41 sal_uInt32 nUtf32;
42 int nShift;
43 bool bCheckBom;
46 struct ImplUnicodeToUtf8Context
48 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
51 void * ImplCreateUtf8ToUnicodeContext()
53 ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
54 ImplResetUtf8ToUnicodeContext(p);
55 return p;
58 void ImplResetUtf8ToUnicodeContext(void * pContext)
60 if (pContext != NULL)
62 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
63 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
67 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
69 delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
72 sal_Size ImplConvertUtf8ToUnicode(
73 void const * pData, void * pContext, char const * pSrcBuf,
74 sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
75 sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
78 This function is very liberal with the UTF-8 input. Accepted are:
79 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
80 - surrogates (e.g., ED A0 80 to represent U+D800)
81 - encodings with up to six bytes (everything outside the range
82 U+0000..10FFFF is considered "undefined")
83 The first two of these points allow this routine to translate from both
84 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
87 int bJavaUtf8 = pData != NULL;
88 sal_uInt32 nUtf32 = 0;
89 int nShift = -1;
90 bool bCheckBom = true;
91 sal_uInt32 nInfo = 0;
92 sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf;
93 sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
94 sal_Unicode * pDestBufPtr = pDestBuf;
95 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
97 if (pContext != NULL)
99 nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
100 nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
101 bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
104 while (pSrcBufPtr < pSrcBufEnd)
106 bool bUndefined = false;
107 int bConsume = true;
108 sal_uInt32 nChar = *pSrcBufPtr++;
109 if (nShift < 0)
110 if (nChar <= 0x7F)
112 nUtf32 = nChar;
113 goto transform;
115 else if (nChar <= 0xBF)
116 goto bad_input;
117 else if (nChar <= 0xDF)
119 nUtf32 = (nChar & 0x1F) << 6;
120 nShift = 0;
122 else if (nChar <= 0xEF)
124 nUtf32 = (nChar & 0x0F) << 12;
125 nShift = 6;
127 else if (nChar <= 0xF7)
129 nUtf32 = (nChar & 0x07) << 18;
130 nShift = 12;
132 else if (nChar <= 0xFB)
134 nUtf32 = (nChar & 0x03) << 24;
135 nShift = 18;
137 else if (nChar <= 0xFD)
139 nUtf32 = (nChar & 0x01) << 30;
140 nShift = 24;
142 else
143 goto bad_input;
144 else if ((nChar & 0xC0) == 0x80)
146 nUtf32 |= (nChar & 0x3F) << nShift;
147 if (nShift == 0)
148 goto transform;
149 else
150 nShift -= 6;
152 else
155 This byte is preceeded by a broken UTF-8 sequence; if this byte
156 is neither in the range [0x80..0xBF] nor in the range
157 [0xFE..0xFF], assume that this byte does not belong to that
158 broken sequence, but instead starts a new, legal UTF-8 sequence:
160 bConsume = nChar >= 0xFE;
161 goto bad_input;
163 continue;
165 transform:
166 if (!bCheckBom || nUtf32 != 0xFEFF
167 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
168 || bJavaUtf8)
170 if (nUtf32 <= 0xFFFF)
171 if (pDestBufPtr != pDestBufEnd)
172 *pDestBufPtr++ = (sal_Unicode) nUtf32;
173 else
174 goto no_output;
175 else if (nUtf32 <= 0x10FFFF)
176 if (pDestBufEnd - pDestBufPtr >= 2)
178 *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
179 *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
181 else
182 goto no_output;
183 else
185 bUndefined = true;
186 goto bad_input;
189 nShift = -1;
190 bCheckBom = false;
191 continue;
193 bad_input:
194 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
195 bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
196 &nInfo))
198 case sal::detail::textenc::BAD_INPUT_STOP:
199 nShift = -1;
200 bCheckBom = false;
201 if (!bConsume)
202 --pSrcBufPtr;
203 break;
205 case sal::detail::textenc::BAD_INPUT_CONTINUE:
206 nShift = -1;
207 bCheckBom = false;
208 if (!bConsume)
209 --pSrcBufPtr;
210 continue;
212 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
213 goto no_output;
215 break;
217 no_output:
218 --pSrcBufPtr;
219 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
220 break;
223 if (nShift >= 0
224 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
225 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
226 == 0)
228 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
229 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
230 else
231 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
232 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
233 &nInfo))
235 case sal::detail::textenc::BAD_INPUT_STOP:
236 case sal::detail::textenc::BAD_INPUT_CONTINUE:
237 nShift = -1;
238 bCheckBom = false;
239 break;
241 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
242 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
243 break;
247 if (pContext != NULL)
249 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
250 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
251 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
253 if (pInfo != NULL)
254 *pInfo = nInfo;
255 if (pSrcCvtBytes != NULL)
256 *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
257 return pDestBufPtr - pDestBuf;
260 void * ImplCreateUnicodeToUtf8Context()
262 ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
263 ImplResetUnicodeToUtf8Context(p);
264 return p;
267 void ImplResetUnicodeToUtf8Context(void * pContext)
269 if (pContext != NULL)
270 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
273 void ImplDestroyUnicodeToUtf8Context(void * pContext)
275 delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
278 sal_Size ImplConvertUnicodeToUtf8(
279 void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
280 sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
281 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
283 int bJavaUtf8 = pData != NULL;
284 sal_Unicode nHighSurrogate = 0xFFFF;
285 sal_uInt32 nInfo = 0;
286 sal_Unicode const * pSrcBufPtr = pSrcBuf;
287 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
288 char * pDestBufPtr = pDestBuf;
289 char * pDestBufEnd = pDestBufPtr + nDestBytes;
291 if (pContext != NULL)
292 nHighSurrogate
293 = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
295 if (nHighSurrogate == 0xFFFF)
297 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
298 && !bJavaUtf8)
300 if (pDestBufEnd - pDestBufPtr >= 3)
302 /* Write BOM (U+FEFF) as UTF-8: */
303 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
304 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
305 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
307 else
309 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
310 goto done;
313 nHighSurrogate = 0;
316 while (pSrcBufPtr < pSrcBufEnd)
318 sal_uInt32 nChar = *pSrcBufPtr++;
319 if (nHighSurrogate == 0)
321 if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
323 nHighSurrogate = (sal_Unicode) nChar;
324 continue;
327 else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
328 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
329 else
330 goto bad_input;
332 if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
333 || ImplIsNoncharacter(nChar))
334 goto bad_input;
336 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
337 if (pDestBufPtr != pDestBufEnd)
338 *pDestBufPtr++ = static_cast< char >(nChar);
339 else
340 goto no_output;
341 else if (nChar <= 0x7FF)
342 if (pDestBufEnd - pDestBufPtr >= 2)
344 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
345 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
347 else
348 goto no_output;
349 else if (nChar <= 0xFFFF)
350 if (pDestBufEnd - pDestBufPtr >= 3)
352 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
353 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
354 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
356 else
357 goto no_output;
358 else if (pDestBufEnd - pDestBufPtr >= 4)
360 *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
361 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
362 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
363 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
365 else
366 goto no_output;
367 nHighSurrogate = 0;
368 continue;
370 bad_input:
371 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
372 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
373 0, NULL))
375 case sal::detail::textenc::BAD_INPUT_STOP:
376 nHighSurrogate = 0;
377 break;
379 case sal::detail::textenc::BAD_INPUT_CONTINUE:
380 nHighSurrogate = 0;
381 continue;
383 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
384 goto no_output;
386 break;
388 no_output:
389 --pSrcBufPtr;
390 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
391 break;
394 if (nHighSurrogate != 0
395 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
396 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
397 == 0)
399 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
400 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
401 else
402 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
403 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
404 NULL, 0, NULL))
406 case sal::detail::textenc::BAD_INPUT_STOP:
407 case sal::detail::textenc::BAD_INPUT_CONTINUE:
408 nHighSurrogate = 0;
409 break;
411 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
412 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
413 break;
417 done:
418 if (pContext != NULL)
419 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
420 = nHighSurrogate;
421 if (pInfo != NULL)
422 *pInfo = nInfo;
423 if (pSrcCvtChars != NULL)
424 *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
425 return pDestBufPtr - pDestBuf;
428 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */