Avoid potential negative array index access to cached text.
[LibreOffice.git] / sal / textenc / tcvtutf8.cxx
blob456d77e2f51c8e188b06edbde8ba3c79da07466a
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <cassert>
24 #include <sal/types.h>
25 #include <rtl/character.hxx>
26 #include <rtl/textcvt.h>
28 #include "converter.hxx"
29 #include "tcvtutf8.hxx"
31 namespace {
33 struct ImplUtf8ToUnicodeContext
35 sal_uInt32 nUtf32;
36 int nBytes;
37 int nShift;
38 bool bCheckBom;
41 struct ImplUnicodeToUtf8Context
43 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
48 void * ImplCreateUtf8ToUnicodeContext()
50 ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
51 ImplResetUtf8ToUnicodeContext(p);
52 return p;
55 void ImplResetUtf8ToUnicodeContext(void * pContext)
57 if (pContext != nullptr)
59 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
60 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
64 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
66 delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
// Decodes UTF-8 bytes from pSrcBuf into UTF-16 code units in pDestBuf.
//
// pData        non-null selects the "Java UTF-8" variant (bJavaUtf8); see the
//              validation switch below for what differs in that mode.
// pContext     optional ImplUtf8ToUnicodeContext; when non-null the decoder
//              state (nUtf32, nBytes, nShift, bCheckBom) is loaded from it on
//              entry and written back on exit.
// pSrcBuf / nSrcBytes    input bytes.
// pDestBuf / nDestChars  output buffer, counted in sal_Unicode units.
// nFlags       RTL_TEXTTOUNICODE_FLAGS_* bits; this function tests
//              GLOBAL_SIGNATURE and FLUSH itself and forwards nFlags to
//              handleBadInputTextToUnicodeConversion.
// pInfo        if non-null, receives the accumulated RTL_TEXTTOUNICODE_INFO_* bits.
// pSrcCvtBytes if non-null, receives the number of source bytes consumed.
// Returns the number of sal_Unicode units written to pDestBuf.
69 sal_Size ImplConvertUtf8ToUnicode(
70 void const * pData, void * pContext, char const * pSrcBuf,
71 sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
72 sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
74 bool bJavaUtf8 = pData != nullptr;
// Start state used when no context is supplied.
75 sal_uInt32 nUtf32 = 0;
76 int nBytes = int();
77 int nShift = -1;
78 bool bCheckBom = true;
79 sal_uInt32 nInfo = 0;
80 unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
81 unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
82 sal_Unicode * pDestBufPtr = pDestBuf;
83 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
// Rewind target for the FLUSH + BAD_INPUT_STOP case. It starts at the
// beginning of this call's buffer, even if a sequence carried over in the
// context began in an earlier call.
84 unsigned char const * startOfCurrentChar = pSrcBufPtr;
// Resume from the state saved by a previous call.
86 if (pContext != nullptr)
88 nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
89 nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
90 nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
91 bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
94 while (pSrcBufPtr < pSrcBufEnd)
// bConsume: whether the current byte counts as part of a bad sequence.
96 bool bConsume = true;
97 sal_uInt32 nChar = *pSrcBufPtr++;
// nShift < 0: no sequence in progress, so this byte is a lead byte. The lead
// byte fixes nBytes, seeds nUtf32 with its payload bits and sets nShift to the
// shift of the first continuation byte; nShift reaching 0 marks the last one.
98 if (nShift < 0)
99 // Allow (illegal) 5 and 6 byte sequences, so they are read as a
100 // single individual bad character:
101 if (nChar <= 0x7F)
103 nUtf32 = nChar;
104 nBytes = 1;
105 goto transform;
// 0x80..0xBF: a continuation byte where a lead byte was expected.
107 else if (nChar <= 0xBF)
108 goto bad_input;
109 else if (nChar <= 0xDF)
111 nUtf32 = (nChar & 0x1F) << 6;
112 nBytes = 2;
113 nShift = 0;
115 else if (nChar <= 0xEF)
117 nUtf32 = (nChar & 0x0F) << 12;
118 nBytes = 3;
119 nShift = 6;
121 else if (nChar <= 0xF7)
123 nUtf32 = (nChar & 0x07) << 18;
124 nBytes = 4;
125 nShift = 12;
127 else if (nChar <= 0xFB)
129 nUtf32 = (nChar & 0x03) << 24;
130 nBytes = 5;
131 nShift = 18;
133 else if (nChar <= 0xFD)
135 nUtf32 = (nChar & 0x01) << 30;
136 nBytes = 6;
137 nShift = 24;
// 0xFE and 0xFF are rejected as lead bytes.
139 else
140 goto bad_input;
// A sequence is in progress and this is a continuation byte (10xxxxxx):
// merge its six payload bits at the current shift.
141 else if ((nChar & 0xC0) == 0x80)
143 nUtf32 |= (nChar & 0x3F) << nShift;
144 if (nShift == 0)
145 goto transform;
146 else
147 nShift -= 6;
149 else
152 This byte is preceded by a broken UTF-8 sequence; if this byte
153 is neither in the range [0x80..0xBF] nor in the range
154 [0xFE..0xFF], assume that this byte does not belong to that
155 broken sequence, but instead starts a new, legal UTF-8 sequence:
157 bConsume = nChar >= 0xFE;
158 goto bad_input;
// Sequence still incomplete: fetch the next byte.
160 continue;
// A complete sequence has been collected in nUtf32 / nBytes.
162 transform:
// Validation and output are skipped only for a leading BOM: the BOM check is
// still pending, the value is U+FEFF encoded in three bytes, the caller set
// GLOBAL_SIGNATURE, and this is not Java mode. Such a BOM is consumed without
// producing output.
163 if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
164 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
165 || bJavaUtf8)
// Reject over-long encodings and out-of-range values, per sequence length.
167 switch (nBytes) {
168 case 1:
// Java mode rejects a one-byte zero.
169 if (bJavaUtf8 && nUtf32 == 0) {
170 goto bad_input;
172 break;
173 case 2:
// Values below 0x80 are over-long; Java mode exempts the two-byte zero.
174 if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
175 goto bad_input;
177 break;
178 case 3:
// Values below 0x800 are over-long; surrogates are rejected outside Java mode.
179 if (nUtf32 < 0x800 || (!bJavaUtf8 && rtl::isSurrogate(nUtf32)))
181 goto bad_input;
183 break;
184 case 4:
// Java mode rejects every four-byte sequence.
185 if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
186 || bJavaUtf8)
188 goto bad_input;
190 break;
// Five- and six-byte sequences are always bad input.
191 default:
192 goto bad_input;
// Values up to 0xFFFF take one output unit; larger ones need room for the
// two units written by rtl::splitSurrogates.
194 if (nUtf32 <= 0xFFFF)
195 if (pDestBufPtr != pDestBufEnd)
196 *pDestBufPtr++ = static_cast<sal_Unicode>(nUtf32);
197 else
198 goto no_output;
199 else if (pDestBufEnd - pDestBufPtr >= 2)
200 pDestBufPtr += rtl::splitSurrogates(nUtf32, pDestBufPtr);
201 else
202 goto no_output;
// Character done (or BOM skipped): return to the "expect lead byte" state.
204 nShift = -1;
205 bCheckBom = false;
206 startOfCurrentChar = pSrcBufPtr;
207 continue;
// The shared handler decides, based on nFlags, whether to stop, continue, or
// report that the destination is full.
209 bad_input:
210 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
211 false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
212 &nInfo))
214 case sal::detail::textenc::BAD_INPUT_STOP:
215 nShift = -1;
216 bCheckBom = false;
// Without FLUSH only a byte that did not belong to the bad sequence is given
// back; with FLUSH the source position is rewound to startOfCurrentChar.
217 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
218 if (!bConsume)
219 --pSrcBufPtr;
220 } else {
221 pSrcBufPtr = startOfCurrentChar;
223 break;
225 case sal::detail::textenc::BAD_INPUT_CONTINUE:
226 nShift = -1;
227 bCheckBom = false;
// Give back a byte that starts a new sequence so the next iteration decodes it.
228 if (!bConsume)
229 --pSrcBufPtr;
230 startOfCurrentChar = pSrcBufPtr;
231 continue;
233 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
234 goto no_output;
// Reached after BAD_INPUT_STOP: leave the while loop.
236 break;
// Destination full: un-consume the last byte and stop. nShift is not reset on
// this path, so when it is reached from transform a following call re-reads
// that byte as the final continuation byte and ORs the same bits into nUtf32.
238 no_output:
239 --pSrcBufPtr;
240 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
241 break;
// Input ended inside a multi-byte sequence and nothing else went wrong.
244 if (nShift >= 0
245 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
246 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
247 == 0)
// Without FLUSH more input may follow, so only report SRCBUFFERTOOSMALL and
// keep the partial state; with FLUSH the truncated sequence is bad input.
249 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
250 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
251 else
252 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
253 false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
254 &nInfo))
256 case sal::detail::textenc::BAD_INPUT_STOP:
// This test is always true here: this is the else branch of the FLUSH == 0
// check above.
257 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
258 pSrcBufPtr = startOfCurrentChar;
260 [[fallthrough]];
261 case sal::detail::textenc::BAD_INPUT_CONTINUE:
262 nShift = -1;
263 bCheckBom = false;
264 break;
266 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
267 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
268 break;
// Save the decoder state for the next call.
272 if (pContext != nullptr)
274 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
275 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
276 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
277 static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
279 if (pInfo != nullptr)
280 *pInfo = nInfo;
281 if (pSrcCvtBytes != nullptr)
282 *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
283 return pDestBufPtr - pDestBuf;
286 void * ImplCreateUnicodeToUtf8Context()
288 ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
289 ImplResetUnicodeToUtf8Context(p);
290 return p;
293 void ImplResetUnicodeToUtf8Context(void * pContext)
295 if (pContext != nullptr)
296 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
299 void ImplDestroyUnicodeToUtf8Context(void * pContext)
301 delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
// Encodes UTF-16 code units from pSrcBuf as UTF-8 bytes in pDestBuf.
//
// pData        non-null selects the "Java UTF-8" variant (bJavaUtf8): no BOM
//              is written, surrogates are not combined, and U+0000 is written
//              as a two-byte sequence.
// pContext     optional ImplUnicodeToUtf8Context; when non-null the pending
//              nHighSurrogate value is loaded on entry and stored on exit.
// pSrcBuf / nSrcChars    input, counted in sal_Unicode units.
// pDestBuf / nDestBytes  output buffer, counted in bytes.
// nFlags       RTL_UNICODETOTEXT_FLAGS_* bits; this function tests
//              GLOBAL_SIGNATURE and FLUSH itself and forwards nFlags to
//              handleBadInputUnicodeToTextConversion.
// pInfo        if non-null, receives the accumulated RTL_UNICODETOTEXT_INFO_* bits.
// pSrcCvtChars if non-null, receives the number of source units consumed.
// Returns the number of bytes written to pDestBuf.
304 sal_Size ImplConvertUnicodeToUtf8(
305 void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
306 sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
307 sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
309 bool bJavaUtf8 = pData != nullptr;
// nHighSurrogate: 0xFFFF = BOM decision still pending, 0 = nothing pending,
// any other value = a high surrogate waiting for its low surrogate.
310 sal_Unicode nHighSurrogate = 0xFFFF;
311 sal_uInt32 nInfo = 0;
312 sal_Unicode const * pSrcBufPtr = pSrcBuf;
313 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
314 char * pDestBufPtr = pDestBuf;
315 char * pDestBufEnd = pDestBufPtr + nDestBytes;
317 if (pContext != nullptr)
318 nHighSurrogate
319 = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
// Start of the stream: emit a BOM if the caller asked for one.
321 if (nHighSurrogate == 0xFFFF)
323 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
324 && !bJavaUtf8)
326 if (pDestBufEnd - pDestBufPtr >= 3)
328 /* Write BOM (U+FEFF) as UTF-8: */
329 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
330 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
331 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
333 else
// No room for the BOM: leave nHighSurrogate at 0xFFFF, so the value stored
// back into the context still marks the BOM as pending.
335 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
336 goto done;
339 nHighSurrogate = 0;
342 while (pSrcBufPtr < pSrcBufEnd)
344 sal_uInt32 nChar = *pSrcBufPtr++;
// No surrogate pending: outside Java mode a high surrogate is remembered and
// a lone low surrogate is bad input. In Java mode surrogates fall through and
// are encoded as ordinary 16-bit values.
345 if (nHighSurrogate == 0)
347 if (rtl::isHighSurrogate(nChar) && !bJavaUtf8)
349 nHighSurrogate = static_cast<sal_Unicode>(nChar);
350 continue;
352 else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
354 goto bad_input;
// A high surrogate is pending: only a low surrogate may follow it.
357 else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
358 nChar = rtl::combineSurrogates(nHighSurrogate, nChar);
359 else
360 goto bad_input;
362 assert(bJavaUtf8 ? nChar <= 0xFFFF : rtl::isUnicodeScalarValue(nChar));
// Choose the encoded length from the value. In Java mode zero fails the
// one-byte test and is written by the two-byte branch instead.
364 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
365 if (pDestBufPtr != pDestBufEnd)
366 *pDestBufPtr++ = static_cast< char >(nChar);
367 else
368 goto no_output;
369 else if (nChar <= 0x7FF)
370 if (pDestBufEnd - pDestBufPtr >= 2)
372 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
373 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
375 else
376 goto no_output;
377 else if (nChar <= 0xFFFF)
378 if (pDestBufEnd - pDestBufPtr >= 3)
380 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
381 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
382 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
384 else
385 goto no_output;
386 else if (pDestBufEnd - pDestBufPtr >= 4)
388 *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
389 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
390 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
391 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
393 else
394 goto no_output;
// Character written: clear any pending surrogate.
395 nHighSurrogate = 0;
396 continue;
// The shared handler decides, based on nFlags, whether to stop, continue, or
// report that the destination is full.
398 bad_input:
399 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
400 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
401 0, nullptr))
// On STOP the source pointer is not rewound: the offending unit stays consumed.
403 case sal::detail::textenc::BAD_INPUT_STOP:
404 nHighSurrogate = 0;
405 break;
407 case sal::detail::textenc::BAD_INPUT_CONTINUE:
408 nHighSurrogate = 0;
409 continue;
411 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
412 goto no_output;
// Reached after BAD_INPUT_STOP: leave the while loop.
414 break;
// Destination full: un-consume the current unit and stop. nHighSurrogate is
// not cleared on this path, so for a surrogate pair only the low surrogate is
// given back and the high surrogate stays pending in the saved state.
416 no_output:
417 --pSrcBufPtr;
418 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
419 break;
// Input ended with a high surrogate still pending and nothing else went wrong.
422 if (nHighSurrogate != 0
423 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
424 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
425 == 0)
// NOTE(review): SRCBUFFERTOSMALL is reported here when FLUSH is set, whereas
// ImplConvertUtf8ToUnicode reports its equivalent when FLUSH is clear. The
// polarity looks inverted; confirm the intent before changing it.
427 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
428 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
429 else
430 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
431 false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
432 nullptr, 0, nullptr))
434 case sal::detail::textenc::BAD_INPUT_STOP:
435 case sal::detail::textenc::BAD_INPUT_CONTINUE:
436 nHighSurrogate = 0;
437 break;
439 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
440 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
441 break;
// Common exit: save the state and report results through the out-parameters.
445 done:
446 if (pContext != nullptr)
447 static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
448 = nHighSurrogate;
449 if (pInfo != nullptr)
450 *pInfo = nInfo;
451 if (pSrcCvtChars != nullptr)
452 *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
453 return pDestBufPtr - pDestBuf;
456 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */