lok: vcl: fix multiple floatwin removal case more robustly.
[LibreOffice.git] / sal / textenc / tcvtutf8.cxx
blob72b336b9ded4ac6ab68d1b05f494a62a5b0edb62
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include <sal/config.h>
22 #include <sal/types.h>
23 #include <rtl/textcvt.h>
25 #include "converter.hxx"
26 #include "tcvtutf8.hxx"
27 #include "tenchelp.hxx"
28 #include "unichars.hxx"
// Decoder state that ImplConvertUtf8ToUnicode loads on entry and stores back
// on exit, so that a multi-byte sequence may be split across calls.
30 struct ImplUtf8ToUnicodeContext
// Bits of the code point assembled so far from the current sequence.
32 sal_uInt32 nUtf32;
// Sequence length (1..6) implied by the most recent lead byte.
33 int nBytes;
// Bit position at which the next continuation byte's six bits are merged;
// negative when no sequence is in progress.
34 int nShift;
// True until the first character or bad input has been handled; it is one of
// the conditions of the leading-BOM test in the converter.
35 bool bCheckBom;
// Encoder state that ImplConvertUnicodeToUtf8 loads on entry and stores back
// on exit.
38 struct ImplUnicodeToUtf8Context
// 0xFFFF is the reset value (the converter then considers writing a BOM),
// 0 means no surrogate is pending, any other value is a high surrogate that
// is waiting for its low surrogate.
40 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
43 void * ImplCreateUtf8ToUnicodeContext()
45 ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
46 ImplResetUtf8ToUnicodeContext(p);
47 return p;
50 void ImplResetUtf8ToUnicodeContext(void * pContext)
52 if (pContext != nullptr)
54 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
55 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
59 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
61 delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
// Converts a run of UTF-8 bytes into UTF-16 code units.
//
// pData        - only compared against nullptr: a non-null value selects the
//                "Java UTF-8" rules (bJavaUtf8).
// pContext     - optional ImplUtf8ToUnicodeContext; when non-null the decoder
//                state is loaded from it on entry and written back on exit.
// pSrcBuf      - first of nSrcBytes input bytes.
// pDestBuf     - receives at most nDestChars sal_Unicode units.
// nFlags       - RTL_TEXTTOUNICODE_FLAGS_* bits; GLOBAL_SIGNATURE and FLUSH
//                are tested here, and the whole value is also handed to
//                handleBadInputTextToUnicodeConversion.
// pInfo        - if non-null, receives the accumulated info bits.
// pSrcCvtBytes - if non-null, receives the number of source bytes consumed.
// Returns the number of sal_Unicode units written to pDestBuf.
//
// NOTE(review): the brace-only lines of this function are not present in this
// listing, so the block extents implied by the comments below should be
// confirmed against the pristine file.
64 sal_Size ImplConvertUtf8ToUnicode(
65 void const * pData, void * pContext, char const * pSrcBuf,
66 sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
67 sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
// Java-flavoured rules are requested simply by passing any non-null pData.
69 bool bJavaUtf8 = pData != nullptr;
70 sal_uInt32 nUtf32 = 0;
71 int nBytes = int();
// A negative nShift means that no multi-byte sequence is in progress.
72 int nShift = -1;
73 bool bCheckBom = true;
74 sal_uInt32 nInfo = 0;
75 unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
76 unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
77 sal_Unicode * pDestBufPtr = pDestBuf;
78 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
// Pick up the state a previous call left behind, if a context was supplied.
80 if (pContext != nullptr)
82 nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
83 nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
84 nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
85 bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
88 while (pSrcBufPtr < pSrcBufEnd)
// bConsume records whether the byte read below stays consumed if it turns out
// to be bad input.
90 bool bConsume = true;
91 sal_uInt32 nChar = *pSrcBufPtr++;
// No sequence in progress: the byte is either ASCII or has to be a lead byte.
92 if (nShift < 0)
93 // Allow (illegal) 5 and 6 byte sequences, so they are read as a
94 // single individual bad character:
95 if (nChar <= 0x7F)
97 nUtf32 = nChar;
98 nBytes = 1;
99 goto transform;
// 0x80..0xBF: a continuation byte without a lead byte in front of it.
101 else if (nChar <= 0xBF)
102 goto bad_input;
// For each lead byte: keep its payload bits at their final position and set
// nShift to the position of the next-to-last six-bit group.
103 else if (nChar <= 0xDF)
105 nUtf32 = (nChar & 0x1F) << 6;
106 nBytes = 2;
107 nShift = 0;
109 else if (nChar <= 0xEF)
111 nUtf32 = (nChar & 0x0F) << 12;
112 nBytes = 3;
113 nShift = 6;
115 else if (nChar <= 0xF7)
117 nUtf32 = (nChar & 0x07) << 18;
118 nBytes = 4;
119 nShift = 12;
121 else if (nChar <= 0xFB)
123 nUtf32 = (nChar & 0x03) << 24;
124 nBytes = 5;
125 nShift = 18;
127 else if (nChar <= 0xFD)
129 nUtf32 = (nChar & 0x01) << 30;
130 nBytes = 6;
131 nShift = 24;
// Remaining values 0xFE and 0xFF are not accepted as lead bytes.
133 else
134 goto bad_input;
// Sequence in progress and the byte has the form 10xxxxxx: merge its six
// payload bits; nShift == 0 identifies the final byte of the sequence.
135 else if ((nChar & 0xC0) == 0x80)
137 nUtf32 |= (nChar & 0x3F) << nShift;
138 if (nShift == 0)
139 goto transform;
140 else
141 nShift -= 6;
143 else
146 This byte is preceded by a broken UTF-8 sequence; if this byte
147 is neither in the range [0x80..0xBF] nor in the range
148 [0xFE..0xFF], assume that this byte does not belong to that
149 broken sequence, but instead starts a new, legal UTF-8 sequence:
151 bConsume = nChar >= 0xFE;
152 goto bad_input;
// Sequence still incomplete: fetch the next byte.
154 continue;
// A complete sequence has been assembled in nUtf32.
156 transform:
// A three-byte U+FEFF seen while bCheckBom is still set, with
// GLOBAL_SIGNATURE requested and Java rules off, fails this condition; every
// other character satisfies it.
157 if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
158 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
159 || bJavaUtf8)
// Reject encodings whose length does not fit the decoded value.
161 switch (nBytes) {
162 case 1:
// Java rules do not accept a plain zero byte.
163 if (bJavaUtf8 && nUtf32 == 0) {
164 goto bad_input;
166 break;
167 case 2:
// Overlong two-byte form; Java rules exempt the two-byte encoding of zero.
168 if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
169 goto bad_input;
171 break;
172 case 3:
// Overlong three-byte form, or (outside Java rules) an encoded surrogate.
173 if (nUtf32 < 0x800 || (!bJavaUtf8 && rtl::isSurrogate(nUtf32)))
175 goto bad_input;
177 break;
178 case 4:
// Overlong four-byte form, a value rtl::isUnicodeCodePoint rejects, or any
// four-byte sequence at all under Java rules.
179 if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
180 || bJavaUtf8)
182 goto bad_input;
184 break;
// Five- and six-byte sequences are always bad input.
185 default:
186 goto bad_input;
// Values up to 0xFFFF take one UTF-16 unit, larger ones a surrogate pair; if
// the destination cannot hold the result, nothing is written.
188 if (nUtf32 <= 0xFFFF)
189 if (pDestBufPtr != pDestBufEnd)
190 *pDestBufPtr++ = static_cast<sal_Unicode>(nUtf32);
191 else
192 goto no_output;
193 else if (pDestBufEnd - pDestBufPtr >= 2)
195 *pDestBufPtr++ = static_cast<sal_Unicode>(ImplGetHighSurrogate(nUtf32));
196 *pDestBufPtr++ = static_cast<sal_Unicode>(ImplGetLowSurrogate(nUtf32));
198 else
199 goto no_output;
// Character handled: back to the idle state, and stop looking for a BOM.
201 nShift = -1;
202 bCheckBom = false;
203 continue;
// The helper's result selects one of the three outcomes below.
205 bad_input:
206 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
207 false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
208 &nInfo))
210 case sal::detail::textenc::BAD_INPUT_STOP:
211 nShift = -1;
212 bCheckBom = false;
// Give the byte back when it was judged to start a new sequence.
213 if (!bConsume)
214 --pSrcBufPtr;
215 break;
217 case sal::detail::textenc::BAD_INPUT_CONTINUE:
218 nShift = -1;
219 bCheckBom = false;
220 if (!bConsume)
221 --pSrcBufPtr;
222 continue;
224 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
225 goto no_output;
227 break;
229 no_output:
// Un-read the current byte so that it is not counted in *pSrcCvtBytes.
230 --pSrcBufPtr;
231 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
232 break;
// The input ended inside a sequence and neither an error nor a full
// destination buffer has been flagged.
235 if (nShift >= 0
236 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
237 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
238 == 0)
// Without FLUSH the caller is told that more input is needed; with FLUSH the
// dangling sequence is handed to the bad-input helper.
240 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
241 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
242 else
243 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
244 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
245 &nInfo))
247 case sal::detail::textenc::BAD_INPUT_STOP:
248 case sal::detail::textenc::BAD_INPUT_CONTINUE:
249 nShift = -1;
250 bCheckBom = false;
251 break;
253 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
254 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
255 break;
// Store the decoder state for the next call.
259 if (pContext != nullptr)
261 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
262 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
263 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
264 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
266 if (pInfo != nullptr)
267 *pInfo = nInfo;
268 if (pSrcCvtBytes != nullptr)
269 *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
270 return pDestBufPtr - pDestBuf;
273 void * ImplCreateUnicodeToUtf8Context()
275 ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
276 ImplResetUnicodeToUtf8Context(p);
277 return p;
280 void ImplResetUnicodeToUtf8Context(void * pContext)
282 if (pContext != nullptr)
283 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
286 void ImplDestroyUnicodeToUtf8Context(void * pContext)
288 delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
// Converts a run of UTF-16 code units into UTF-8 bytes.
//
// pData        - only compared against nullptr: a non-null value selects the
//                "Java UTF-8" rules (bJavaUtf8), under which surrogates are
//                not paired up, U+0000 is written as two bytes and no BOM is
//                written.
// pContext     - optional ImplUnicodeToUtf8Context; when non-null the stored
//                surrogate slot is loaded on entry and written back on exit.
// pSrcBuf      - first of nSrcChars input code units.
// pDestBuf     - receives at most nDestBytes bytes.
// nFlags       - RTL_UNICODETOTEXT_FLAGS_* bits; GLOBAL_SIGNATURE and FLUSH
//                are tested here, and the whole value is also handed to
//                handleBadInputUnicodeToTextConversion.
// pInfo        - if non-null, receives the accumulated info bits.
// pSrcCvtChars - if non-null, receives the number of source units consumed.
// Returns the number of bytes written to pDestBuf.
//
// NOTE(review): the brace-only lines of this function are not present in this
// listing, so the block extents implied by the comments below should be
// confirmed against the pristine file.
291 sal_Size ImplConvertUnicodeToUtf8(
292 void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
293 sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
294 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
296 bool bJavaUtf8 = pData != nullptr;
// 0xFFFF is the "nothing converted yet" marker, also used when no context is
// supplied.
297 sal_Unicode nHighSurrogate = 0xFFFF;
298 sal_uInt32 nInfo = 0;
299 sal_Unicode const * pSrcBufPtr = pSrcBuf;
300 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
301 char * pDestBufPtr = pDestBuf;
302 char * pDestBufEnd = pDestBufPtr + nDestBytes;
304 if (pContext != nullptr)
305 nHighSurrogate
306 = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
// First use of this state: emit the BOM if it was requested (never under Java
// rules), then switch to the normal "no pending surrogate" value 0.
308 if (nHighSurrogate == 0xFFFF)
310 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
311 && !bJavaUtf8)
313 if (pDestBufEnd - pDestBufPtr >= 3)
315 /* Write BOM (U+FEFF) as UTF-8: */
316 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
317 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
318 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
320 else
// No room for the BOM: report it and leave with the 0xFFFF marker still in
// place.
322 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
323 goto done;
326 nHighSurrogate = 0;
329 while (pSrcBufPtr < pSrcBufEnd)
331 sal_uInt32 nChar = *pSrcBufPtr++;
// No surrogate pending: outside Java rules a high surrogate is remembered and
// the loop moves on to the unit that should complete it.
332 if (nHighSurrogate == 0)
334 if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
336 nHighSurrogate = static_cast<sal_Unicode>(nChar);
337 continue;
// A high surrogate is pending: only a low surrogate may follow it.
340 else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
341 nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
342 else
343 goto bad_input;
// A low surrogate that is still unpaired at this point (outside Java rules),
// or anything ImplIsNoncharacter flags, is bad input.
345 if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
346 || ImplIsNoncharacter(nChar))
347 goto bad_input;
// One-byte form; under Java rules zero is excluded here and therefore takes
// the two-byte branch below.
349 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
350 if (pDestBufPtr != pDestBufEnd)
351 *pDestBufPtr++ = static_cast< char >(nChar);
352 else
353 goto no_output;
// Two-byte form.
354 else if (nChar <= 0x7FF)
355 if (pDestBufEnd - pDestBufPtr >= 2)
357 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
358 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
360 else
361 goto no_output;
// Three-byte form.
362 else if (nChar <= 0xFFFF)
363 if (pDestBufEnd - pDestBufPtr >= 3)
365 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
366 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
367 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
369 else
370 goto no_output;
// Four-byte form for everything above 0xFFFF.
371 else if (pDestBufEnd - pDestBufPtr >= 4)
373 *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
374 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
375 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
376 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
378 else
379 goto no_output;
// Character written: no surrogate is pending any more.
380 nHighSurrogate = 0;
381 continue;
// The helper's result selects one of the three outcomes below.
383 bad_input:
384 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
385 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
386 0, nullptr))
388 case sal::detail::textenc::BAD_INPUT_STOP:
389 nHighSurrogate = 0;
390 break;
392 case sal::detail::textenc::BAD_INPUT_CONTINUE:
393 nHighSurrogate = 0;
394 continue;
396 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
397 goto no_output;
399 break;
401 no_output:
// Un-read the current unit so that it is not counted in *pSrcCvtChars.
402 --pSrcBufPtr;
403 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
404 break;
// The input ended while a high surrogate was still pending and neither an
// error nor a full destination buffer has been flagged.
407 if (nHighSurrogate != 0
408 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
409 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
410 == 0)
// NOTE(review): FLUSH is tested with "!= 0" here, whereas the corresponding
// test in ImplConvertUtf8ToUnicode in this file uses "== 0"; as written, a
// pending surrogate is reported as "source too small" when FLUSH IS set and
// goes to the bad-input helper when it is not. Confirm which is intended
// before changing it.
412 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
413 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
414 else
415 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
416 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
417 nullptr, 0, nullptr))
419 case sal::detail::textenc::BAD_INPUT_STOP:
420 case sal::detail::textenc::BAD_INPUT_CONTINUE:
421 nHighSurrogate = 0;
422 break;
424 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
425 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
426 break;
// Common exit: store the surrogate slot and report the results.
430 done:
431 if (pContext != nullptr)
432 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
433 = nHighSurrogate;
434 if (pInfo != nullptr)
435 *pInfo = nInfo;
436 if (pSrcCvtChars != nullptr)
437 *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
438 return pDestBufPtr - pDestBuf;
441 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */