Fix GNU C++ version check
[LibreOffice.git] / sal / textenc / tcvtutf8.cxx
blob2d73a96a3c50c17247da9b77165598e933d23a21
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include <sal/config.h>
22 #include <cassert>
24 #include <sal/types.h>
25 #include <rtl/character.hxx>
26 #include <rtl/textcvt.h>
28 #include "converter.hxx"
29 #include "tcvtutf8.hxx"
31 namespace {
33 struct ImplUtf8ToUnicodeContext
35 sal_uInt32 nUtf32;
36 int nBytes;
37 int nShift;
38 bool bCheckBom;
41 struct ImplUnicodeToUtf8Context
43 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
48 void * ImplCreateUtf8ToUnicodeContext()
50 ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
51 ImplResetUtf8ToUnicodeContext(p);
52 return p;
55 void ImplResetUtf8ToUnicodeContext(void * pContext)
57 if (pContext != nullptr)
59 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = 1;
60 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
61 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
65 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
67 delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
70 sal_Size ImplConvertUtf8ToUnicode(
71 void const * pData, void * pContext, char const * pSrcBuf,
72 sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
73 sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
75 bool bJavaUtf8 = pData != nullptr;
76 sal_uInt32 nUtf32 = 0;
77 int nBytes = 1;
78 int nShift = -1;
79 bool bCheckBom = true;
80 sal_uInt32 nInfo = 0;
81 unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
82 unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
83 sal_Unicode * pDestBufPtr = pDestBuf;
84 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
85 unsigned char const * startOfCurrentChar = pSrcBufPtr;
87 if (pContext != nullptr)
89 nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
90 nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
91 nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
92 bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
95 while (pSrcBufPtr < pSrcBufEnd)
97 bool bConsume = true;
98 sal_uInt32 nChar = *pSrcBufPtr++;
99 if (nShift < 0)
100 // Allow (illegal) 5 and 6 byte sequences, so they are read as a
101 // single individual bad character:
102 if (nChar <= 0x7F)
104 nUtf32 = nChar;
105 nBytes = 1;
106 goto transform;
108 else if (nChar <= 0xBF)
109 goto bad_input;
110 else if (nChar <= 0xDF)
112 nUtf32 = (nChar & 0x1F) << 6;
113 nBytes = 2;
114 nShift = 0;
116 else if (nChar <= 0xEF)
118 nUtf32 = (nChar & 0x0F) << 12;
119 nBytes = 3;
120 nShift = 6;
122 else if (nChar <= 0xF7)
124 nUtf32 = (nChar & 0x07) << 18;
125 nBytes = 4;
126 nShift = 12;
128 else if (nChar <= 0xFB)
130 nUtf32 = (nChar & 0x03) << 24;
131 nBytes = 5;
132 nShift = 18;
134 else if (nChar <= 0xFD)
136 nUtf32 = (nChar & 0x01) << 30;
137 nBytes = 6;
138 nShift = 24;
140 else
141 goto bad_input;
142 else if ((nChar & 0xC0) == 0x80)
144 nUtf32 |= (nChar & 0x3F) << nShift;
145 if (nShift == 0)
146 goto transform;
147 else
148 nShift -= 6;
150 else
153 This byte is preceded by a broken UTF-8 sequence; if this byte
154 is neither in the range [0x80..0xBF] nor in the range
155 [0xFE..0xFF], assume that this byte does not belong to that
156 broken sequence, but instead starts a new, legal UTF-8 sequence:
158 bConsume = nChar >= 0xFE;
159 goto bad_input;
161 continue;
163 transform:
164 if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
165 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
166 || bJavaUtf8)
168 switch (nBytes) {
169 case 1:
170 if (bJavaUtf8 && nUtf32 == 0) {
171 goto bad_input;
173 break;
174 case 2:
175 if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
176 goto bad_input;
178 break;
179 case 3:
180 if (nUtf32 < 0x800 || (!bJavaUtf8 && rtl::isSurrogate(nUtf32)))
182 goto bad_input;
184 break;
185 case 4:
186 if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
187 || bJavaUtf8)
189 goto bad_input;
191 break;
192 default:
193 goto bad_input;
195 if (nUtf32 <= 0xFFFF)
196 if (pDestBufPtr != pDestBufEnd)
197 *pDestBufPtr++ = static_cast<sal_Unicode>(nUtf32);
198 else
199 goto no_output;
200 else if (pDestBufEnd - pDestBufPtr >= 2)
201 pDestBufPtr += rtl::splitSurrogates(nUtf32, pDestBufPtr);
202 else
203 goto no_output;
205 nShift = -1;
206 bCheckBom = false;
207 startOfCurrentChar = pSrcBufPtr;
208 continue;
210 bad_input:
211 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
212 false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
213 &nInfo))
215 case sal::detail::textenc::BAD_INPUT_STOP:
216 nShift = -1;
217 bCheckBom = false;
218 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
219 if (!bConsume)
220 --pSrcBufPtr;
221 } else {
222 pSrcBufPtr = startOfCurrentChar;
224 break;
226 case sal::detail::textenc::BAD_INPUT_CONTINUE:
227 nShift = -1;
228 bCheckBom = false;
229 if (!bConsume)
230 --pSrcBufPtr;
231 startOfCurrentChar = pSrcBufPtr;
232 continue;
234 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
235 goto no_output;
237 break;
239 no_output:
240 --pSrcBufPtr;
241 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
242 break;
245 if (nShift >= 0
246 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
247 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
248 == 0)
250 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
251 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
252 else
253 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
254 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
255 &nInfo))
257 case sal::detail::textenc::BAD_INPUT_STOP:
258 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
259 pSrcBufPtr = startOfCurrentChar;
261 [[fallthrough]];
262 case sal::detail::textenc::BAD_INPUT_CONTINUE:
263 nShift = -1;
264 bCheckBom = false;
265 break;
267 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
268 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
269 break;
273 if (pContext != nullptr)
275 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
276 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
277 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
278 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
280 if (pInfo != nullptr)
281 *pInfo = nInfo;
282 if (pSrcCvtBytes != nullptr)
283 *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
284 return pDestBufPtr - pDestBuf;
287 void * ImplCreateUnicodeToUtf8Context()
289 ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
290 ImplResetUnicodeToUtf8Context(p);
291 return p;
294 void ImplResetUnicodeToUtf8Context(void * pContext)
296 if (pContext != nullptr)
297 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
300 void ImplDestroyUnicodeToUtf8Context(void * pContext)
302 delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
305 sal_Size ImplConvertUnicodeToUtf8(
306 void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
307 sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
308 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
310 bool bJavaUtf8 = pData != nullptr;
311 sal_Unicode nHighSurrogate = 0xFFFF;
312 sal_uInt32 nInfo = 0;
313 sal_Unicode const * pSrcBufPtr = pSrcBuf;
314 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
315 char * pDestBufPtr = pDestBuf;
316 char * pDestBufEnd = pDestBufPtr + nDestBytes;
318 if (pContext != nullptr)
319 nHighSurrogate
320 = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
322 if (nHighSurrogate == 0xFFFF)
324 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
325 && !bJavaUtf8)
327 if (pDestBufEnd - pDestBufPtr >= 3)
329 /* Write BOM (U+FEFF) as UTF-8: */
330 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
331 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
332 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
334 else
336 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
337 goto done;
340 nHighSurrogate = 0;
343 while (pSrcBufPtr < pSrcBufEnd)
345 sal_uInt32 nChar = *pSrcBufPtr++;
346 if (nHighSurrogate == 0)
348 if (rtl::isHighSurrogate(nChar) && !bJavaUtf8)
350 nHighSurrogate = static_cast<sal_Unicode>(nChar);
351 continue;
353 else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
355 goto bad_input;
358 else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
359 nChar = rtl::combineSurrogates(nHighSurrogate, nChar);
360 else
361 goto bad_input;
363 assert(bJavaUtf8 ? nChar <= 0xFFFF : rtl::isUnicodeScalarValue(nChar));
365 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
366 if (pDestBufPtr != pDestBufEnd)
367 *pDestBufPtr++ = static_cast< char >(nChar);
368 else
369 goto no_output;
370 else if (nChar <= 0x7FF)
371 if (pDestBufEnd - pDestBufPtr >= 2)
373 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
374 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
376 else
377 goto no_output;
378 else if (nChar <= 0xFFFF)
379 if (pDestBufEnd - pDestBufPtr >= 3)
381 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
382 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
383 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
385 else
386 goto no_output;
387 else if (pDestBufEnd - pDestBufPtr >= 4)
389 *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
390 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
391 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
392 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
394 else
395 goto no_output;
396 nHighSurrogate = 0;
397 continue;
399 bad_input:
400 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
401 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
402 0, nullptr))
404 case sal::detail::textenc::BAD_INPUT_STOP:
405 nHighSurrogate = 0;
406 break;
408 case sal::detail::textenc::BAD_INPUT_CONTINUE:
409 nHighSurrogate = 0;
410 continue;
412 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
413 goto no_output;
415 break;
417 no_output:
418 --pSrcBufPtr;
419 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
420 break;
423 if (nHighSurrogate != 0
424 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
425 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
426 == 0)
428 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
429 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
430 else
431 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
432 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
433 nullptr, 0, nullptr))
435 case sal::detail::textenc::BAD_INPUT_STOP:
436 case sal::detail::textenc::BAD_INPUT_CONTINUE:
437 nHighSurrogate = 0;
438 break;
440 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
441 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
442 break;
446 done:
447 if (pContext != nullptr)
448 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
449 = nHighSurrogate;
450 if (pInfo != nullptr)
451 *pInfo = nInfo;
452 if (pSrcCvtChars != nullptr)
453 *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
454 return pDestBufPtr - pDestBuf;
457 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */