1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
24 #include <sal/types.h>
25 #include <rtl/character.hxx>
26 #include <rtl/textcvt.h>
28 #include "converter.hxx"
29 #include "tcvtutf8.hxx"
// Decoder state for the UTF-8 -> Unicode direction, handed around as an opaque
// pContext pointer. Code in this file accesses the fields nUtf32, nBytes, nShift
// and bCheckBom; the member declarations themselves are not visible in this view.
33 struct ImplUtf8ToUnicodeContext
41 struct ImplUnicodeToUtf8Context
43 sal_Unicode nHighSurrogate
; /* 0xFFFF: write BOM */
// Allocates a new ImplUtf8ToUnicodeContext with `new` and passes it to
// ImplResetUtf8ToUnicodeContext so that it starts out in the reset state.
// NOTE(review): the statement that hands the pointer back to the caller is
// elided in this view - presumably `p` is returned as the opaque void *.
48 void * ImplCreateUtf8ToUnicodeContext()
50 ImplUtf8ToUnicodeContext
* p
= new ImplUtf8ToUnicodeContext
;
51 ImplResetUtf8ToUnicodeContext(p
);
55 void ImplResetUtf8ToUnicodeContext(void * pContext
)
57 if (pContext
!= nullptr)
59 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nBytes
= 1;
60 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nShift
= -1;
61 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->bCheckBom
= true;
65 void ImplDestroyUtf8ToUnicodeContext(void * pContext
)
67 delete static_cast< ImplUtf8ToUnicodeContext
* >(pContext
);
/*
 * Decode a buffer of UTF-8 bytes into sal_Unicode (UTF-16) code units.
 *
 * pData         a non-null value switches on the bJavaUtf8 variant of the
 *               validity checks
 * pContext      optional ImplUtf8ToUnicodeContext; when non-null, nUtf32,
 *               nBytes, nShift and bCheckBom are loaded from it on entry and
 *               written back before returning
 * pSrcBuf       first source byte
 * nSrcBytes     number of source bytes
 * pDestBuf      first destination code unit
 * nDestChars    capacity of the destination, in sal_Unicode units
 * nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits (GLOBAL_SIGNATURE and FLUSH are
 *               tested here; the value is also passed to the bad-input handler)
 * pInfo         optional; NOTE(review): the store through it is elided in this
 *               view - presumably it receives the accumulated nInfo bits
 * pSrcCvtBytes  optional; receives the number of source bytes consumed
 *
 * Returns the number of sal_Unicode units written to pDestBuf.
 */
70 sal_Size
ImplConvertUtf8ToUnicode(
71 void const * pData
, void * pContext
, char const * pSrcBuf
,
72 sal_Size nSrcBytes
, sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
73 sal_uInt32 nFlags
, sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
75 bool bJavaUtf8
= pData
!= nullptr;
76 sal_uInt32 nUtf32
= 0;
79 bool bCheckBom
= true;
// The source is walked as unsigned bytes so that the range comparisons below
// behave the same regardless of the signedness of plain char.
81 unsigned char const * pSrcBufPtr
= reinterpret_cast<unsigned char const *>(pSrcBuf
);
82 unsigned char const * pSrcBufEnd
= pSrcBufPtr
+ nSrcBytes
;
83 sal_Unicode
* pDestBufPtr
= pDestBuf
;
84 sal_Unicode
* pDestBufEnd
= pDestBufPtr
+ nDestChars
;
// Remembers where the character currently being decoded began, so that the
// read position can be moved back to it (see the BAD_INPUT_STOP cases).
85 unsigned char const * startOfCurrentChar
= pSrcBufPtr
;
// Pick up the decoder state a previous call stored in the context.
87 if (pContext
!= nullptr)
89 nUtf32
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nUtf32
;
90 nBytes
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nBytes
;
91 nShift
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nShift
;
92 bCheckBom
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->bCheckBom
;
// Main loop: one source byte is fetched per iteration.
95 while (pSrcBufPtr
< pSrcBufEnd
)
98 sal_uInt32 nChar
= *pSrcBufPtr
++;
100 // Allow (illegal) 5 and 6 byte sequences, so they are read as a
101 // single individual bad character:
// Classification of the byte by value range. For each lead-byte range the
// payload bits of the lead byte are masked out and shifted to the top of nUtf32.
// (The first branch of this chain and most branch bodies are elided in this view.)
108 else if (nChar
<= 0xBF)
110 else if (nChar
<= 0xDF)
112 nUtf32
= (nChar
& 0x1F) << 6;
116 else if (nChar
<= 0xEF)
118 nUtf32
= (nChar
& 0x0F) << 12;
122 else if (nChar
<= 0xF7)
124 nUtf32
= (nChar
& 0x07) << 18;
128 else if (nChar
<= 0xFB)
130 nUtf32
= (nChar
& 0x03) << 24;
134 else if (nChar
<= 0xFD)
136 nUtf32
= (nChar
& 0x01) << 30;
// Continuation byte (bit pattern 10xxxxxx): merge its six payload bits into
// nUtf32 at the current shift position.
142 else if ((nChar
& 0xC0) == 0x80)
144 nUtf32
|= (nChar
& 0x3F) << nShift
;
153 This byte is preceded by a broken UTF-8 sequence; if this byte
154 is neither in the range [0x80..0xBF] nor in the range
155 [0xFE..0xFF], assume that this byte does not belong to that
156 broken sequence, but instead starts a new, legal UTF-8 sequence:
158 bConsume
= nChar
>= 0xFE;
// Signature test: the condition holds unless bCheckBom is still set, the decoded
// value is U+FEFF from a 3-byte sequence, and the caller passed GLOBAL_SIGNATURE.
// (The remainder of the condition and the guarded statements are elided here.)
164 if (!bCheckBom
|| nUtf32
!= 0xFEFF || nBytes
!= 3
165 || (nFlags
& RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
) == 0
// Checks on the decoded value. The bJavaUtf8 variant treats U+0000 and
// surrogate values differently from the standard one. NOTE(review): the
// branch bodies are elided in this view; confirm what each check triggers.
170 if (bJavaUtf8
&& nUtf32
== 0) {
175 if (nUtf32
< 0x80 && !(bJavaUtf8
&& nUtf32
== 0)) {
180 if (nUtf32
< 0x800 || (!bJavaUtf8
&& rtl::isSurrogate(nUtf32
)))
186 if (nUtf32
< 0x10000 || !rtl::isUnicodeCodePoint(nUtf32
)
// Output: values up to 0xFFFF take one code unit; anything larger goes through
// rtl::splitSurrogates and needs room for two code units.
195 if (nUtf32
<= 0xFFFF)
196 if (pDestBufPtr
!= pDestBufEnd
)
197 *pDestBufPtr
++ = static_cast<sal_Unicode
>(nUtf32
);
200 else if (pDestBufEnd
- pDestBufPtr
>= 2)
201 pDestBufPtr
+= rtl::splitSurrogates(nUtf32
, pDestBufPtr
);
// The next character starts at the current read position.
207 startOfCurrentChar
= pSrcBufPtr
;
// Malformed input: the shared handler decides, based on nFlags, which of the
// BAD_INPUT_* outcomes applies.
211 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
212 false, nBytes
!= 1, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
215 case sal::detail::textenc::BAD_INPUT_STOP
:
218 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0) {
// Move the read position back to the start of the offending character.
222 pSrcBufPtr
= startOfCurrentChar
;
226 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
231 startOfCurrentChar
= pSrcBufPtr
;
234 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
241 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
// Handling after the loop. NOTE(review): the beginning and end of this
// condition are elided in this view; the visible part masks nInfo with the
// ERROR and DESTBUFFERTOOSMALL bits.
246 && (nInfo
& (RTL_TEXTTOUNICODE_INFO_ERROR
247 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
))
// Without FLUSH the caller is told that more source bytes are needed.
250 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0)
251 nInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
;
253 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
254 false, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
257 case sal::detail::textenc::BAD_INPUT_STOP
:
258 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) != 0) {
259 pSrcBufPtr
= startOfCurrentChar
;
262 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
267 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
268 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
// Store the decoder state back into the context for the next call.
273 if (pContext
!= nullptr)
275 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nUtf32
= nUtf32
;
276 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nBytes
= nBytes
;
277 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nShift
= nShift
;
278 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->bCheckBom
= bCheckBom
;
280 if (pInfo
!= nullptr)
// Number of source bytes consumed, measured from the start of pSrcBuf.
282 if (pSrcCvtBytes
!= nullptr)
283 *pSrcCvtBytes
= reinterpret_cast< char const * >(pSrcBufPtr
) - pSrcBuf
;
// Number of code units written to the destination.
284 return pDestBufPtr
- pDestBuf
;
// Allocates a new ImplUnicodeToUtf8Context with `new` and passes it to
// ImplResetUnicodeToUtf8Context so that it starts out in the reset state.
// NOTE(review): the statement that hands the pointer back to the caller is
// elided in this view - presumably `p` is returned as the opaque void *.
287 void * ImplCreateUnicodeToUtf8Context()
289 ImplUnicodeToUtf8Context
* p
= new ImplUnicodeToUtf8Context
;
290 ImplResetUnicodeToUtf8Context(p
);
294 void ImplResetUnicodeToUtf8Context(void * pContext
)
296 if (pContext
!= nullptr)
297 static_cast< ImplUnicodeToUtf8Context
* >(pContext
)->nHighSurrogate
= 0xFFFF;
300 void ImplDestroyUnicodeToUtf8Context(void * pContext
)
302 delete static_cast< ImplUnicodeToUtf8Context
* >(pContext
);
/*
 * Encode a buffer of sal_Unicode (UTF-16) code units as UTF-8 bytes.
 *
 * pData         a non-null value switches on the bJavaUtf8 variant (no
 *               surrogate pairing; U+0000 is not written as a single byte)
 * pContext      optional ImplUnicodeToUtf8Context; its nHighSurrogate field
 *               carries state between calls
 * pSrcBuf       first source code unit
 * nSrcChars     number of source code units
 * pDestBuf      first destination byte
 * nDestBytes    capacity of the destination, in bytes
 * nFlags        RTL_UNICODETOTEXT_FLAGS_* bits (GLOBAL_SIGNATURE and FLUSH are
 *               tested here; the value is also passed to the bad-input handler)
 * pInfo         optional; NOTE(review): the store through it is elided in this
 *               view - presumably it receives the accumulated nInfo bits
 * pSrcCvtChars  optional; receives the number of source code units consumed
 *
 * Returns the number of bytes written to pDestBuf.
 */
305 sal_Size
ImplConvertUnicodeToUtf8(
306 void const * pData
, void * pContext
, sal_Unicode
const * pSrcBuf
,
307 sal_Size nSrcChars
, char * pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
308 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
310 bool bJavaUtf8
= pData
!= nullptr;
// 0xFFFF is the marker for "signature (BOM) handling still pending".
311 sal_Unicode nHighSurrogate
= 0xFFFF;
312 sal_uInt32 nInfo
= 0;
313 sal_Unicode
const * pSrcBufPtr
= pSrcBuf
;
314 sal_Unicode
const * pSrcBufEnd
= pSrcBufPtr
+ nSrcChars
;
315 char * pDestBufPtr
= pDestBuf
;
316 char * pDestBufEnd
= pDestBufPtr
+ nDestBytes
;
// Pick up the state a previous call stored in the context (the left-hand side
// of this assignment is elided in this view).
318 if (pContext
!= nullptr)
320 = static_cast< ImplUnicodeToUtf8Context
* >(pContext
)->nHighSurrogate
;
// Still in the initial state: emit the three-byte signature if the caller
// asked for it and it fits, otherwise flag the destination as too small.
322 if (nHighSurrogate
== 0xFFFF)
324 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE
) != 0
327 if (pDestBufEnd
- pDestBufPtr
>= 3)
329 /* Write BOM (U+FEFF) as UTF-8: */
330 *pDestBufPtr
++ = static_cast< char >(static_cast< unsigned char >(0xEF));
331 *pDestBufPtr
++ = static_cast< char >(static_cast< unsigned char >(0xBB));
332 *pDestBufPtr
++ = static_cast< char >(static_cast< unsigned char >(0xBF));
336 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
// Main loop: one source code unit is fetched per iteration.
343 while (pSrcBufPtr
< pSrcBufEnd
)
345 sal_uInt32 nChar
= *pSrcBufPtr
++;
// nHighSurrogate == 0 means no high surrogate is pending. Surrogates get
// special treatment only outside the bJavaUtf8 variant.
346 if (nHighSurrogate
== 0)
348 if (rtl::isHighSurrogate(nChar
) && !bJavaUtf8
)
// Remember the high surrogate until its partner arrives.
350 nHighSurrogate
= static_cast<sal_Unicode
>(nChar
);
353 else if (rtl::isLowSurrogate(nChar
) && !bJavaUtf8
)
// A high surrogate is pending and this unit is its low partner: combine the pair.
358 else if (rtl::isLowSurrogate(nChar
) && !bJavaUtf8
)
359 nChar
= rtl::combineSurrogates(nHighSurrogate
, nChar
);
363 assert(bJavaUtf8
? nChar
<= 0xFFFF : rtl::isUnicodeScalarValue(nChar
));
// Emit 1, 2, 3 or 4 bytes depending on the magnitude of nChar; each branch
// first checks that the destination has enough room. In the bJavaUtf8
// variant U+0000 fails the first test and falls through to the 2-byte branch.
365 if (nChar
<= 0x7F && (!bJavaUtf8
|| nChar
!= 0))
366 if (pDestBufPtr
!= pDestBufEnd
)
367 *pDestBufPtr
++ = static_cast< char >(nChar
);
370 else if (nChar
<= 0x7FF)
371 if (pDestBufEnd
- pDestBufPtr
>= 2)
373 *pDestBufPtr
++ = static_cast< char >(0xC0 | (nChar
>> 6));
374 *pDestBufPtr
++ = static_cast< char >(0x80 | (nChar
& 0x3F));
378 else if (nChar
<= 0xFFFF)
379 if (pDestBufEnd
- pDestBufPtr
>= 3)
381 *pDestBufPtr
++ = static_cast< char >(0xE0 | (nChar
>> 12));
382 *pDestBufPtr
++ = static_cast< char >(0x80 | ((nChar
>> 6) & 0x3F));
383 *pDestBufPtr
++ = static_cast< char >(0x80 | (nChar
& 0x3F));
387 else if (pDestBufEnd
- pDestBufPtr
>= 4)
389 *pDestBufPtr
++ = static_cast< char >(0xF0 | (nChar
>> 18));
390 *pDestBufPtr
++ = static_cast< char >(0x80 | ((nChar
>> 12) & 0x3F));
391 *pDestBufPtr
++ = static_cast< char >(0x80 | ((nChar
>> 6) & 0x3F));
392 *pDestBufPtr
++ = static_cast< char >(0x80 | (nChar
& 0x3F));
// Invalid input: the shared handler decides, based on nFlags, which of the
// BAD_INPUT_* outcomes applies.
400 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
401 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, nullptr,
404 case sal::detail::textenc::BAD_INPUT_STOP
:
408 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
412 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
419 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
// Handling after the loop for a still-pending nHighSurrogate. NOTE(review):
// the end of this condition is elided in this view; the visible part masks
// nInfo with the ERROR and DESTBUFFERTOSMALL bits.
423 if (nHighSurrogate
!= 0
424 && (nInfo
& (RTL_UNICODETOTEXT_INFO_ERROR
425 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
// NOTE(review): SRCBUFFERTOSMALL is set here when FLUSH *is* set, whereas
// ImplConvertUtf8ToUnicode sets its SRCBUFFERTOOSMALL bit when FLUSH is *not*
// set - confirm that this asymmetry is intended.
428 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
429 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
431 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
432 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
,
433 nullptr, 0, nullptr))
435 case sal::detail::textenc::BAD_INPUT_STOP
:
436 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
440 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
441 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
// Store the state back into the context for the next call (the right-hand
// side of this assignment is elided in this view).
447 if (pContext
!= nullptr)
448 static_cast< ImplUnicodeToUtf8Context
* >(pContext
)->nHighSurrogate
450 if (pInfo
!= nullptr)
// Number of source code units consumed, measured from the start of pSrcBuf.
452 if (pSrcCvtChars
!= nullptr)
453 *pSrcCvtChars
= pSrcBufPtr
- pSrcBuf
;
// Number of bytes written to the destination.
454 return pDestBufPtr
- pDestBuf
;
457 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */