android: Update app-specific/MIME type icons
[LibreOffice.git] / sal / textenc / tcvtutf8.cxx
blobf210b654d57feec22720166a44b90adfaaedbe08
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include <sal/config.h>
22 #include <cassert>
24 #include <sal/types.h>
25 #include <rtl/character.hxx>
26 #include <rtl/textcvt.h>
28 #include "converter.hxx"
29 #include "tcvtutf8.hxx"
30 #include "tenchelp.hxx"
31 #include "unichars.hxx"
33 namespace {
35 struct ImplUtf8ToUnicodeContext
37 sal_uInt32 nUtf32;
38 int nBytes;
39 int nShift;
40 bool bCheckBom;
43 struct ImplUnicodeToUtf8Context
45 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
50 void * ImplCreateUtf8ToUnicodeContext()
52 ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
53 ImplResetUtf8ToUnicodeContext(p);
54 return p;
57 void ImplResetUtf8ToUnicodeContext(void * pContext)
59 if (pContext != nullptr)
61 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
62 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
66 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
68 delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
71 sal_Size ImplConvertUtf8ToUnicode(
72 void const * pData, void * pContext, char const * pSrcBuf,
73 sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
74 sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
76 bool bJavaUtf8 = pData != nullptr;
77 sal_uInt32 nUtf32 = 0;
78 int nBytes = int();
79 int nShift = -1;
80 bool bCheckBom = true;
81 sal_uInt32 nInfo = 0;
82 unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
83 unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
84 sal_Unicode * pDestBufPtr = pDestBuf;
85 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
86 unsigned char const * startOfCurrentChar = pSrcBufPtr;
88 if (pContext != nullptr)
90 nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
91 nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
92 nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
93 bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
96 while (pSrcBufPtr < pSrcBufEnd)
98 bool bConsume = true;
99 sal_uInt32 nChar = *pSrcBufPtr++;
100 if (nShift < 0)
101 // Allow (illegal) 5 and 6 byte sequences, so they are read as a
102 // single individual bad character:
103 if (nChar <= 0x7F)
105 nUtf32 = nChar;
106 nBytes = 1;
107 goto transform;
109 else if (nChar <= 0xBF)
110 goto bad_input;
111 else if (nChar <= 0xDF)
113 nUtf32 = (nChar & 0x1F) << 6;
114 nBytes = 2;
115 nShift = 0;
117 else if (nChar <= 0xEF)
119 nUtf32 = (nChar & 0x0F) << 12;
120 nBytes = 3;
121 nShift = 6;
123 else if (nChar <= 0xF7)
125 nUtf32 = (nChar & 0x07) << 18;
126 nBytes = 4;
127 nShift = 12;
129 else if (nChar <= 0xFB)
131 nUtf32 = (nChar & 0x03) << 24;
132 nBytes = 5;
133 nShift = 18;
135 else if (nChar <= 0xFD)
137 nUtf32 = (nChar & 0x01) << 30;
138 nBytes = 6;
139 nShift = 24;
141 else
142 goto bad_input;
143 else if ((nChar & 0xC0) == 0x80)
145 nUtf32 |= (nChar & 0x3F) << nShift;
146 if (nShift == 0)
147 goto transform;
148 else
149 nShift -= 6;
151 else
154 This byte is preceded by a broken UTF-8 sequence; if this byte
155 is neither in the range [0x80..0xBF] nor in the range
156 [0xFE..0xFF], assume that this byte does not belong to that
157 broken sequence, but instead starts a new, legal UTF-8 sequence:
159 bConsume = nChar >= 0xFE;
160 goto bad_input;
162 continue;
164 transform:
165 if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
166 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
167 || bJavaUtf8)
169 switch (nBytes) {
170 case 1:
171 if (bJavaUtf8 && nUtf32 == 0) {
172 goto bad_input;
174 break;
175 case 2:
176 if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
177 goto bad_input;
179 break;
180 case 3:
181 if (nUtf32 < 0x800 || (!bJavaUtf8 && rtl::isSurrogate(nUtf32)))
183 goto bad_input;
185 break;
186 case 4:
187 if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
188 || bJavaUtf8)
190 goto bad_input;
192 break;
193 default:
194 goto bad_input;
196 if (nUtf32 <= 0xFFFF)
197 if (pDestBufPtr != pDestBufEnd)
198 *pDestBufPtr++ = static_cast<sal_Unicode>(nUtf32);
199 else
200 goto no_output;
201 else if (pDestBufEnd - pDestBufPtr >= 2)
202 pDestBufPtr += rtl::splitSurrogates(nUtf32, pDestBufPtr);
203 else
204 goto no_output;
206 nShift = -1;
207 bCheckBom = false;
208 startOfCurrentChar = pSrcBufPtr;
209 continue;
211 bad_input:
212 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
213 false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
214 &nInfo))
216 case sal::detail::textenc::BAD_INPUT_STOP:
217 nShift = -1;
218 bCheckBom = false;
219 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
220 if (!bConsume)
221 --pSrcBufPtr;
222 } else {
223 pSrcBufPtr = startOfCurrentChar;
225 break;
227 case sal::detail::textenc::BAD_INPUT_CONTINUE:
228 nShift = -1;
229 bCheckBom = false;
230 if (!bConsume)
231 --pSrcBufPtr;
232 startOfCurrentChar = pSrcBufPtr;
233 continue;
235 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
236 goto no_output;
238 break;
240 no_output:
241 --pSrcBufPtr;
242 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
243 break;
246 if (nShift >= 0
247 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
248 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
249 == 0)
251 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
252 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
253 else
254 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
255 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
256 &nInfo))
258 case sal::detail::textenc::BAD_INPUT_STOP:
259 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
260 pSrcBufPtr = startOfCurrentChar;
262 [[fallthrough]];
263 case sal::detail::textenc::BAD_INPUT_CONTINUE:
264 nShift = -1;
265 bCheckBom = false;
266 break;
268 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
269 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
270 break;
274 if (pContext != nullptr)
276 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
277 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
278 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
279 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
281 if (pInfo != nullptr)
282 *pInfo = nInfo;
283 if (pSrcCvtBytes != nullptr)
284 *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
285 return pDestBufPtr - pDestBuf;
288 void * ImplCreateUnicodeToUtf8Context()
290 ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
291 ImplResetUnicodeToUtf8Context(p);
292 return p;
295 void ImplResetUnicodeToUtf8Context(void * pContext)
297 if (pContext != nullptr)
298 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
301 void ImplDestroyUnicodeToUtf8Context(void * pContext)
303 delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
306 sal_Size ImplConvertUnicodeToUtf8(
307 void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
308 sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
309 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
311 bool bJavaUtf8 = pData != nullptr;
312 sal_Unicode nHighSurrogate = 0xFFFF;
313 sal_uInt32 nInfo = 0;
314 sal_Unicode const * pSrcBufPtr = pSrcBuf;
315 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
316 char * pDestBufPtr = pDestBuf;
317 char * pDestBufEnd = pDestBufPtr + nDestBytes;
319 if (pContext != nullptr)
320 nHighSurrogate
321 = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
323 if (nHighSurrogate == 0xFFFF)
325 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
326 && !bJavaUtf8)
328 if (pDestBufEnd - pDestBufPtr >= 3)
330 /* Write BOM (U+FEFF) as UTF-8: */
331 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
332 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
333 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
335 else
337 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
338 goto done;
341 nHighSurrogate = 0;
344 while (pSrcBufPtr < pSrcBufEnd)
346 sal_uInt32 nChar = *pSrcBufPtr++;
347 if (nHighSurrogate == 0)
349 if (rtl::isHighSurrogate(nChar) && !bJavaUtf8)
351 nHighSurrogate = static_cast<sal_Unicode>(nChar);
352 continue;
354 else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
356 goto bad_input;
359 else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
360 nChar = rtl::combineSurrogates(nHighSurrogate, nChar);
361 else
362 goto bad_input;
364 assert(bJavaUtf8 ? nChar <= 0xFFFF : rtl::isUnicodeScalarValue(nChar));
366 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
367 if (pDestBufPtr != pDestBufEnd)
368 *pDestBufPtr++ = static_cast< char >(nChar);
369 else
370 goto no_output;
371 else if (nChar <= 0x7FF)
372 if (pDestBufEnd - pDestBufPtr >= 2)
374 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
375 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
377 else
378 goto no_output;
379 else if (nChar <= 0xFFFF)
380 if (pDestBufEnd - pDestBufPtr >= 3)
382 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
383 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
384 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
386 else
387 goto no_output;
388 else if (pDestBufEnd - pDestBufPtr >= 4)
390 *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
391 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
392 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
393 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
395 else
396 goto no_output;
397 nHighSurrogate = 0;
398 continue;
400 bad_input:
401 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
402 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
403 0, nullptr))
405 case sal::detail::textenc::BAD_INPUT_STOP:
406 nHighSurrogate = 0;
407 break;
409 case sal::detail::textenc::BAD_INPUT_CONTINUE:
410 nHighSurrogate = 0;
411 continue;
413 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
414 goto no_output;
416 break;
418 no_output:
419 --pSrcBufPtr;
420 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
421 break;
424 if (nHighSurrogate != 0
425 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
426 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
427 == 0)
429 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
430 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
431 else
432 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
433 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
434 nullptr, 0, nullptr))
436 case sal::detail::textenc::BAD_INPUT_STOP:
437 case sal::detail::textenc::BAD_INPUT_CONTINUE:
438 nHighSurrogate = 0;
439 break;
441 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
442 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
443 break;
447 done:
448 if (pContext != nullptr)
449 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
450 = nHighSurrogate;
451 if (pInfo != nullptr)
452 *pInfo = nInfo;
453 if (pSrcCvtChars != nullptr)
454 *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
455 return pDestBufPtr - pDestBuf;
458 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */