build fix: no comphelper/profilezone.hxx in this branch
[LibreOffice.git] / sal / textenc / tcvtutf8.cxx
blobf9c9879c54a3726c98c6f7310f892a4e10978e37
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include "sal/config.h"
22 #include "sal/types.h"
23 #include "rtl/textcvt.h"
25 #include "converter.hxx"
26 #include "tcvtutf8.hxx"
27 #include "tenchelp.hxx"
28 #include "unichars.hxx"
30 struct ImplUtf8ToUnicodeContext
32 sal_uInt32 nUtf32;
33 int nShift;
34 bool bCheckBom;
37 struct ImplUnicodeToUtf8Context
39 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
42 void * ImplCreateUtf8ToUnicodeContext()
44 ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
45 ImplResetUtf8ToUnicodeContext(p);
46 return p;
49 void ImplResetUtf8ToUnicodeContext(void * pContext)
51 if (pContext != nullptr)
53 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
54 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
58 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
60 delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
63 sal_Size ImplConvertUtf8ToUnicode(
64 void const * pData, void * pContext, char const * pSrcBuf,
65 sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
66 sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
69 This function is very liberal with the UTF-8 input. Accepted are:
70 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
71 - surrogates (e.g., ED A0 80 to represent U+D800)
72 - encodings with up to six bytes (everything outside the range
73 U+0000..10FFFF is considered "undefined")
74 The first two of these points allow this routine to translate from both
75 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
78 bool bJavaUtf8 = pData != nullptr;
79 sal_uInt32 nUtf32 = 0;
80 int nShift = -1;
81 bool bCheckBom = true;
82 sal_uInt32 nInfo = 0;
83 unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
84 unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
85 sal_Unicode * pDestBufPtr = pDestBuf;
86 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
88 if (pContext != nullptr)
90 nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
91 nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
92 bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
95 while (pSrcBufPtr < pSrcBufEnd)
97 bool bUndefined = false;
98 bool bConsume = true;
99 sal_uInt32 nChar = *pSrcBufPtr++;
100 if (nShift < 0)
101 if (nChar <= 0x7F)
103 nUtf32 = nChar;
104 goto transform;
106 else if (nChar <= 0xBF)
107 goto bad_input;
108 else if (nChar <= 0xDF)
110 nUtf32 = (nChar & 0x1F) << 6;
111 nShift = 0;
113 else if (nChar <= 0xEF)
115 nUtf32 = (nChar & 0x0F) << 12;
116 nShift = 6;
118 else if (nChar <= 0xF7)
120 nUtf32 = (nChar & 0x07) << 18;
121 nShift = 12;
123 else if (nChar <= 0xFB)
125 nUtf32 = (nChar & 0x03) << 24;
126 nShift = 18;
128 else if (nChar <= 0xFD)
130 nUtf32 = (nChar & 0x01) << 30;
131 nShift = 24;
133 else
134 goto bad_input;
135 else if ((nChar & 0xC0) == 0x80)
137 nUtf32 |= (nChar & 0x3F) << nShift;
138 if (nShift == 0)
139 goto transform;
140 else
141 nShift -= 6;
143 else
146 This byte is preceded by a broken UTF-8 sequence; if this byte
147 is neither in the range [0x80..0xBF] nor in the range
148 [0xFE..0xFF], assume that this byte does not belong to that
149 broken sequence, but instead starts a new, legal UTF-8 sequence:
151 bConsume = nChar >= 0xFE;
152 goto bad_input;
154 continue;
156 transform:
157 if (!bCheckBom || nUtf32 != 0xFEFF
158 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
159 || bJavaUtf8)
161 if (nUtf32 <= 0xFFFF)
162 if (pDestBufPtr != pDestBufEnd)
163 *pDestBufPtr++ = (sal_Unicode) nUtf32;
164 else
165 goto no_output;
166 else if (rtl::isUnicodeCodePoint(nUtf32))
167 if (pDestBufEnd - pDestBufPtr >= 2)
169 *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
170 *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
172 else
173 goto no_output;
174 else
176 bUndefined = true;
177 goto bad_input;
180 nShift = -1;
181 bCheckBom = false;
182 continue;
184 bad_input:
185 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
186 bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
187 &nInfo))
189 case sal::detail::textenc::BAD_INPUT_STOP:
190 nShift = -1;
191 bCheckBom = false;
192 if (!bConsume)
193 --pSrcBufPtr;
194 break;
196 case sal::detail::textenc::BAD_INPUT_CONTINUE:
197 nShift = -1;
198 bCheckBom = false;
199 if (!bConsume)
200 --pSrcBufPtr;
201 continue;
203 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
204 goto no_output;
206 break;
208 no_output:
209 --pSrcBufPtr;
210 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
211 break;
214 if (nShift >= 0
215 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
216 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
217 == 0)
219 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
220 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
221 else
222 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
223 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
224 &nInfo))
226 case sal::detail::textenc::BAD_INPUT_STOP:
227 case sal::detail::textenc::BAD_INPUT_CONTINUE:
228 nShift = -1;
229 bCheckBom = false;
230 break;
232 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
233 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
234 break;
238 if (pContext != nullptr)
240 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
241 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
242 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
244 if (pInfo != nullptr)
245 *pInfo = nInfo;
246 if (pSrcCvtBytes != nullptr)
247 *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
248 return pDestBufPtr - pDestBuf;
251 void * ImplCreateUnicodeToUtf8Context()
253 ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
254 ImplResetUnicodeToUtf8Context(p);
255 return p;
258 void ImplResetUnicodeToUtf8Context(void * pContext)
260 if (pContext != nullptr)
261 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
264 void ImplDestroyUnicodeToUtf8Context(void * pContext)
266 delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
269 sal_Size ImplConvertUnicodeToUtf8(
270 void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
271 sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
272 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
274 bool bJavaUtf8 = pData != nullptr;
275 sal_Unicode nHighSurrogate = 0xFFFF;
276 sal_uInt32 nInfo = 0;
277 sal_Unicode const * pSrcBufPtr = pSrcBuf;
278 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
279 char * pDestBufPtr = pDestBuf;
280 char * pDestBufEnd = pDestBufPtr + nDestBytes;
282 if (pContext != nullptr)
283 nHighSurrogate
284 = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
286 if (nHighSurrogate == 0xFFFF)
288 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
289 && !bJavaUtf8)
291 if (pDestBufEnd - pDestBufPtr >= 3)
293 /* Write BOM (U+FEFF) as UTF-8: */
294 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
295 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
296 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
298 else
300 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
301 goto done;
304 nHighSurrogate = 0;
307 while (pSrcBufPtr < pSrcBufEnd)
309 sal_uInt32 nChar = *pSrcBufPtr++;
310 if (nHighSurrogate == 0)
312 if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
314 nHighSurrogate = (sal_Unicode) nChar;
315 continue;
318 else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
319 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
320 else
321 goto bad_input;
323 if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
324 || ImplIsNoncharacter(nChar))
325 goto bad_input;
327 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
328 if (pDestBufPtr != pDestBufEnd)
329 *pDestBufPtr++ = static_cast< char >(nChar);
330 else
331 goto no_output;
332 else if (nChar <= 0x7FF)
333 if (pDestBufEnd - pDestBufPtr >= 2)
335 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
336 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
338 else
339 goto no_output;
340 else if (nChar <= 0xFFFF)
341 if (pDestBufEnd - pDestBufPtr >= 3)
343 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
344 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
345 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
347 else
348 goto no_output;
349 else if (pDestBufEnd - pDestBufPtr >= 4)
351 *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
352 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
353 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
354 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
356 else
357 goto no_output;
358 nHighSurrogate = 0;
359 continue;
361 bad_input:
362 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
363 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
364 0, nullptr))
366 case sal::detail::textenc::BAD_INPUT_STOP:
367 nHighSurrogate = 0;
368 break;
370 case sal::detail::textenc::BAD_INPUT_CONTINUE:
371 nHighSurrogate = 0;
372 continue;
374 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
375 goto no_output;
377 break;
379 no_output:
380 --pSrcBufPtr;
381 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
382 break;
385 if (nHighSurrogate != 0
386 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
387 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
388 == 0)
390 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
391 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
392 else
393 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
394 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
395 nullptr, 0, nullptr))
397 case sal::detail::textenc::BAD_INPUT_STOP:
398 case sal::detail::textenc::BAD_INPUT_CONTINUE:
399 nHighSurrogate = 0;
400 break;
402 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
403 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
404 break;
408 done:
409 if (pContext != nullptr)
410 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
411 = nHighSurrogate;
412 if (pInfo != nullptr)
413 *pInfo = nInfo;
414 if (pSrcCvtChars != nullptr)
415 *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
416 return pDestBufPtr - pDestBuf;
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */