Version 6.4.0.3, tag libreoffice-6.4.0.3
[LibreOffice.git] / sal / textenc / tcvtutf8.cxx
blob950d810e8b85be054bb0f061d04953392ee3493c
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <cassert>
24 #include <sal/types.h>
25 #include <rtl/character.hxx>
26 #include <rtl/textcvt.h>
28 #include "converter.hxx"
29 #include "tcvtutf8.hxx"
30 #include "tenchelp.hxx"
31 #include "unichars.hxx"
33 struct ImplUtf8ToUnicodeContext
35 sal_uInt32 nUtf32;
36 int nBytes;
37 int nShift;
38 bool bCheckBom;
41 struct ImplUnicodeToUtf8Context
43 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
46 void * ImplCreateUtf8ToUnicodeContext()
48 ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
49 ImplResetUtf8ToUnicodeContext(p);
50 return p;
53 void ImplResetUtf8ToUnicodeContext(void * pContext)
55 if (pContext != nullptr)
57 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
58 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
62 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
64 delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
// Decodes UTF-8 bytes from pSrcBuf into UTF-16 code units in pDestBuf.
//
// pData         used only as a flag: non-null selects the "Java UTF-8"
//               variant (bJavaUtf8), see the validation switch below.
// pContext      optional ImplUtf8ToUnicodeContext; when non-null the decoder
//               state is loaded from it on entry and stored back on exit, so
//               a sequence split across two calls can be completed.
// pSrcBuf       input bytes; nSrcBytes is their count.
// pDestBuf      output buffer; nDestChars is its capacity in sal_Unicode.
// nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits; this function itself looks at
//               GLOBAL_SIGNATURE and FLUSH and forwards the whole value to
//               handleBadInputTextToUnicodeConversion.
// pInfo         optional; receives the accumulated RTL_TEXTTOUNICODE_INFO_*
//               bits.
// pSrcCvtBytes  optional; receives how far pSrcBufPtr advanced past pSrcBuf.
// Returns the number of sal_Unicode units written to pDestBuf.
67 sal_Size ImplConvertUtf8ToUnicode(
68 void const * pData, void * pContext, char const * pSrcBuf,
69 sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
70 sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
72 bool bJavaUtf8 = pData != nullptr;
// Decoder state; these defaults apply when no context is supplied.
//   nUtf32    code point bits collected so far
//   nBytes    sequence length announced by the most recent lead byte
//   nShift    shift for the next continuation byte; < 0 means that a lead
//             byte is expected
//   bCheckBom true until the first character or bad input has been handled
73 sal_uInt32 nUtf32 = 0;
74 int nBytes = int();
75 int nShift = -1;
76 bool bCheckBom = true;
77 sal_uInt32 nInfo = 0;
78 unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
79 unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
80 sal_Unicode * pDestBufPtr = pDestBuf;
81 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
// Where the sequence currently being decoded began within this buffer. If a
// sequence is continued from an earlier call this is the buffer start.
82 unsigned char const * startOfCurrentChar = pSrcBufPtr;
// Resume from the state left behind by the previous call, if any.
84 if (pContext != nullptr)
86 nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
87 nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
88 nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
89 bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
// Main loop: one input byte per iteration.
92 while (pSrcBufPtr < pSrcBufEnd)
// bConsume tells the bad-input path whether the byte just read belongs to
// the broken sequence (true) or has to be read again (false).
94 bool bConsume = true;
95 sal_uInt32 nChar = *pSrcBufPtr++;
// No sequence in progress: classify the byte as a lead byte.
96 if (nShift < 0)
97 // Allow (illegal) 5 and 6 byte sequences, so they are read as a
98 // single individual bad character:
// 0x00..0x7F: a complete one-byte character.
99 if (nChar <= 0x7F)
101 nUtf32 = nChar;
102 nBytes = 1;
103 goto transform;
// 0x80..0xBF: a continuation byte where a lead byte was expected.
105 else if (nChar <= 0xBF)
106 goto bad_input;
// 0xC0..0xDF: lead byte of a two-byte sequence (one continuation follows).
107 else if (nChar <= 0xDF)
109 nUtf32 = (nChar & 0x1F) << 6;
110 nBytes = 2;
111 nShift = 0;
// 0xE0..0xEF: lead byte of a three-byte sequence.
113 else if (nChar <= 0xEF)
115 nUtf32 = (nChar & 0x0F) << 12;
116 nBytes = 3;
117 nShift = 6;
// 0xF0..0xF7: lead byte of a four-byte sequence.
119 else if (nChar <= 0xF7)
121 nUtf32 = (nChar & 0x07) << 18;
122 nBytes = 4;
123 nShift = 12;
// 0xF8..0xFB: five-byte form; decoded as a unit, rejected in transform.
125 else if (nChar <= 0xFB)
127 nUtf32 = (nChar & 0x03) << 24;
128 nBytes = 5;
129 nShift = 18;
// 0xFC..0xFD: six-byte form; decoded as a unit, rejected in transform.
131 else if (nChar <= 0xFD)
133 nUtf32 = (nChar & 0x01) << 30;
134 nBytes = 6;
135 nShift = 24;
// 0xFE..0xFF can never start a sequence.
137 else
138 goto bad_input;
// Sequence in progress and the byte is a continuation byte: merge its six
// payload bits. nShift == 0 marks the last byte of the sequence.
139 else if ((nChar & 0xC0) == 0x80)
141 nUtf32 |= (nChar & 0x3F) << nShift;
142 if (nShift == 0)
143 goto transform;
144 else
145 nShift -= 6;
// Sequence in progress but the byte is not a continuation byte. The next
// four lines are the text of an explanatory comment (apparently its
// delimiter lines are not part of this listing).
147 else
150 This byte is preceded by a broken UTF-8 sequence; if this byte
151 is neither in the range [0x80..0xBF] nor in the range
152 [0xFE..0xFF], assume that this byte does not belong to that
153 broken sequence, but instead starts a new, legal UTF-8 sequence:
155 bConsume = nChar >= 0xFE;
156 goto bad_input;
// A lead or intermediate byte was stored; fetch the next byte.
158 continue;
// transform: a complete sequence is in nUtf32 / nBytes.
// The condition below is false only for a U+FEFF that is the first
// character, was encoded in three bytes, with GLOBAL_SIGNATURE requested and
// not in Java mode. Such a BOM is dropped without output; everything else is
// validated and written.
160 transform:
161 if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
162 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
163 || bJavaUtf8)
// Reject encodings that are longer than necessary or otherwise not allowed.
165 switch (nBytes) {
166 case 1:
// Java mode does not accept a plain zero byte.
167 if (bJavaUtf8 && nUtf32 == 0) {
168 goto bad_input;
170 break;
171 case 2:
// Values below 0x80 are overlong here, except that Java mode accepts the
// two-byte encoding of U+0000.
172 if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
173 goto bad_input;
175 break;
176 case 3:
// Overlong, or a surrogate code point (surrogates are accepted in Java mode).
177 if (nUtf32 < 0x800 || (!bJavaUtf8 && rtl::isSurrogate(nUtf32)))
179 goto bad_input;
181 break;
182 case 4:
// Overlong, not accepted by rtl::isUnicodeCodePoint, or Java mode (which
// accepts no four-byte sequences at all).
183 if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
184 || bJavaUtf8)
186 goto bad_input;
188 break;
// Five- and six-byte sequences are always bad input.
189 default:
190 goto bad_input;
// Write one unit for values up to 0xFFFF, otherwise a surrogate pair; if the
// destination lacks room nothing is written and no_output takes over.
192 if (nUtf32 <= 0xFFFF)
193 if (pDestBufPtr != pDestBufEnd)
194 *pDestBufPtr++ = static_cast<sal_Unicode>(nUtf32);
195 else
196 goto no_output;
197 else if (pDestBufEnd - pDestBufPtr >= 2)
199 *pDestBufPtr++ = static_cast<sal_Unicode>(ImplGetHighSurrogate(nUtf32));
200 *pDestBufPtr++ = static_cast<sal_Unicode>(ImplGetLowSurrogate(nUtf32));
202 else
203 goto no_output;
// Character finished (written or dropped as BOM): expect a lead byte next
// and stop looking for a BOM.
205 nShift = -1;
206 bCheckBom = false;
207 startOfCurrentChar = pSrcBufPtr;
208 continue;
// bad_input: let the shared helper decide what to do according to nFlags.
// Its second argument is true unless the offending sequence was announced
// as a single byte.
210 bad_input:
211 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
212 false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
213 &nInfo))
// STOP: reset the state and leave the loop. Without FLUSH only an
// unconsumed byte is handed back; with FLUSH the source position is rewound
// to the start of the broken sequence.
215 case sal::detail::textenc::BAD_INPUT_STOP:
216 nShift = -1;
217 bCheckBom = false;
218 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
219 if (!bConsume)
220 --pSrcBufPtr;
221 } else {
222 pSrcBufPtr = startOfCurrentChar;
224 break;
// CONTINUE: reset the state, hand back an unconsumed byte so that it is read
// again as a possible lead byte, and go on with the next byte.
226 case sal::detail::textenc::BAD_INPUT_CONTINUE:
227 nShift = -1;
228 bCheckBom = false;
229 if (!bConsume)
230 --pSrcBufPtr;
231 startOfCurrentChar = pSrcBufPtr;
232 continue;
// NO_OUTPUT: treated like a full destination buffer.
234 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
235 goto no_output;
// Reached after BAD_INPUT_STOP: leave the while loop.
237 break;
// no_output: step back over the byte just read, report the full destination
// buffer and leave the loop. nShift is left as it is.
239 no_output:
240 --pSrcBufPtr;
241 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
242 break;
// After the loop: a sequence is still incomplete and neither an error nor a
// full destination buffer has been reported.
245 if (nShift >= 0
246 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
247 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
248 == 0)
// Without FLUSH: report that more input is needed; nShift stays >= 0 and is
// stored in the context below.
250 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
251 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
// With FLUSH: the truncated sequence is bad input.
252 else
253 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
254 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
255 &nInfo))
257 case sal::detail::textenc::BAD_INPUT_STOP:
// This branch is only reached with FLUSH set, so the test below always
// holds here and the source position is rewound.
258 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
259 pSrcBufPtr = startOfCurrentChar;
261 [[fallthrough]];
262 case sal::detail::textenc::BAD_INPUT_CONTINUE:
263 nShift = -1;
264 bCheckBom = false;
265 break;
267 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
268 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
269 break;
// Store the state for the next call and report the results.
273 if (pContext != nullptr)
275 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
276 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
277 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
278 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
280 if (pInfo != nullptr)
281 *pInfo = nInfo;
282 if (pSrcCvtBytes != nullptr)
283 *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
284 return pDestBufPtr - pDestBuf;
287 void * ImplCreateUnicodeToUtf8Context()
289 ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
290 ImplResetUnicodeToUtf8Context(p);
291 return p;
294 void ImplResetUnicodeToUtf8Context(void * pContext)
296 if (pContext != nullptr)
297 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
300 void ImplDestroyUnicodeToUtf8Context(void * pContext)
302 delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
// Encodes UTF-16 code units from pSrcBuf as UTF-8 bytes in pDestBuf.
//
// pData         used only as a flag: non-null selects the "Java UTF-8"
//               variant (bJavaUtf8): no BOM is written, surrogates are not
//               combined but encoded one by one, and U+0000 is written as a
//               two-byte sequence.
// pContext      optional ImplUnicodeToUtf8Context; when non-null the pending
//               high surrogate / BOM marker is loaded from it on entry and
//               stored back on exit.
// pSrcBuf       input code units; nSrcChars is their count.
// pDestBuf      output buffer; nDestBytes is its capacity in bytes.
// nFlags        RTL_UNICODETOTEXT_FLAGS_* bits; this function itself looks at
//               GLOBAL_SIGNATURE and FLUSH and forwards the whole value to
//               handleBadInputUnicodeToTextConversion.
// pInfo         optional; receives the accumulated RTL_UNICODETOTEXT_INFO_*
//               bits.
// pSrcCvtChars  optional; receives how far pSrcBufPtr advanced past pSrcBuf.
// Returns the number of bytes written to pDestBuf.
305 sal_Size ImplConvertUnicodeToUtf8(
306 void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
307 sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
308 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
310 bool bJavaUtf8 = pData != nullptr;
// nHighSurrogate: 0xFFFF = nothing converted yet (BOM decision pending),
// 0 = no surrogate pending, anything else = high surrogate waiting for its
// low surrogate.
311 sal_Unicode nHighSurrogate = 0xFFFF;
312 sal_uInt32 nInfo = 0;
313 sal_Unicode const * pSrcBufPtr = pSrcBuf;
314 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
315 char * pDestBufPtr = pDestBuf;
316 char * pDestBufEnd = pDestBufPtr + nDestBytes;
// Resume from the state left behind by the previous call, if any.
318 if (pContext != nullptr)
319 nHighSurrogate
320 = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
// Start of the stream: write the BOM if GLOBAL_SIGNATURE is requested and
// this is not Java mode.
322 if (nHighSurrogate == 0xFFFF)
324 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
325 && !bJavaUtf8)
327 if (pDestBufEnd - pDestBufPtr >= 3)
329 /* Write BOM (U+FEFF) as UTF-8: */
330 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
331 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
332 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
// No room for the BOM: report it and skip the conversion. nHighSurrogate
// is still 0xFFFF at this point, so that value is what gets stored in the
// context.
334 else
336 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
337 goto done;
// The start-of-stream marker is cleared from here on.
340 nHighSurrogate = 0;
// Main loop: one UTF-16 code unit per iteration.
343 while (pSrcBufPtr < pSrcBufEnd)
345 sal_uInt32 nChar = *pSrcBufPtr++;
// No surrogate pending.
346 if (nHighSurrogate == 0)
// Outside Java mode a high surrogate is remembered until the next unit.
348 if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
350 nHighSurrogate = static_cast<sal_Unicode>(nChar);
351 continue;
// Outside Java mode a low surrogate without a preceding high one is bad.
353 else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
355 goto bad_input;
// High surrogate pending: the unit has to be a low surrogate, and the pair
// is combined into one code point.
358 else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
359 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
// High surrogate pending but the unit is something else. The unit was
// already consumed above and is not examined again after bad_input.
360 else
361 goto bad_input;
363 assert(bJavaUtf8 ? nChar <= 0xFFFF : rtl::isUnicodeScalarValue(nChar));
// Emit one to four bytes depending on the magnitude of nChar. Each branch
// first checks for room and goes to no_output without writing if there is
// not enough.
// One byte; in Java mode the value 0 is excluded here and falls through to
// the two-byte form.
365 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
366 if (pDestBufPtr != pDestBufEnd)
367 *pDestBufPtr++ = static_cast< char >(nChar);
368 else
369 goto no_output;
// Two bytes.
370 else if (nChar <= 0x7FF)
371 if (pDestBufEnd - pDestBufPtr >= 2)
373 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
374 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
376 else
377 goto no_output;
// Three bytes.
378 else if (nChar <= 0xFFFF)
379 if (pDestBufEnd - pDestBufPtr >= 3)
381 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
382 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
383 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
385 else
386 goto no_output;
// Four bytes.
387 else if (pDestBufEnd - pDestBufPtr >= 4)
389 *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
390 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
391 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
392 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
394 else
395 goto no_output;
// Character written: no surrogate is pending any more.
396 nHighSurrogate = 0;
397 continue;
// bad_input: let the shared helper decide what to do according to nFlags.
399 bad_input:
400 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
401 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
402 0, nullptr))
// STOP: forget the pending surrogate and leave the loop.
404 case sal::detail::textenc::BAD_INPUT_STOP:
405 nHighSurrogate = 0;
406 break;
// CONTINUE: forget the pending surrogate and go on with the next unit.
408 case sal::detail::textenc::BAD_INPUT_CONTINUE:
409 nHighSurrogate = 0;
410 continue;
// NO_OUTPUT: treated like a full destination buffer.
412 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
413 goto no_output;
// Reached after BAD_INPUT_STOP: leave the while loop.
415 break;
// no_output: step back over the unit just read, report the full destination
// buffer and leave the loop. nHighSurrogate is left as it is.
417 no_output:
418 --pSrcBufPtr;
419 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
420 break;
// After the loop: nHighSurrogate is still non-zero and neither an error nor
// a full destination buffer has been reported.
423 if (nHighSurrogate != 0
424 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
425 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
426 == 0)
// With FLUSH set the source is reported as too small; without FLUSH the
// dangling surrogate goes through the bad-input helper.
// NOTE(review): this is the opposite sense of the corresponding FLUSH test
// at the end of ImplConvertUtf8ToUnicode - confirm that it is intended
// before relying on it or changing it.
428 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
429 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
430 else
431 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
432 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
433 nullptr, 0, nullptr))
435 case sal::detail::textenc::BAD_INPUT_STOP:
436 case sal::detail::textenc::BAD_INPUT_CONTINUE:
437 nHighSurrogate = 0;
438 break;
440 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
441 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
442 break;
// Store the state for the next call and report the results.
446 done:
447 if (pContext != nullptr)
448 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
449 = nHighSurrogate;
450 if (pInfo != nullptr)
451 *pInfo = nInfo;
452 if (pSrcCvtChars != nullptr)
453 *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
454 return pDestBufPtr - pDestBuf;
457 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */