/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

20 #include "sal/config.h"
22 #include "sal/types.h"
23 #include "rtl/textcvt.h"
25 #include "converter.hxx"
26 #include "tcvtutf8.hxx"
27 #include "tenchelp.hxx"
28 #include "unichars.hxx"
30 struct ImplUtf8ToUnicodeContext
37 struct ImplUnicodeToUtf8Context
39 sal_Unicode nHighSurrogate
; /* 0xFFFF: write BOM */
42 void * ImplCreateUtf8ToUnicodeContext()
44 ImplUtf8ToUnicodeContext
* p
= new ImplUtf8ToUnicodeContext
;
45 ImplResetUtf8ToUnicodeContext(p
);
49 void ImplResetUtf8ToUnicodeContext(void * pContext
)
51 if (pContext
!= nullptr)
53 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nShift
= -1;
54 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->bCheckBom
= true;
58 void ImplDestroyUtf8ToUnicodeContext(void * pContext
)
60 delete static_cast< ImplUtf8ToUnicodeContext
* >(pContext
);
63 sal_Size
ImplConvertUtf8ToUnicode(
64 void const * pData
, void * pContext
, char const * pSrcBuf
,
65 sal_Size nSrcBytes
, sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
66 sal_uInt32 nFlags
, sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
69 This function is very liberal with the UTF-8 input. Accepted are:
70 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
71 - surrogates (e.g., ED A0 80 to represent U+D800)
72 - encodings with up to six bytes (everything outside the range
73 U+0000..10FFFF is considered "undefined")
74 The first two of these points allow this routine to translate from both
75 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
78 bool bJavaUtf8
= pData
!= nullptr;
79 sal_uInt32 nUtf32
= 0;
81 bool bCheckBom
= true;
83 unsigned char const * pSrcBufPtr
= reinterpret_cast<unsigned char const *>(pSrcBuf
);
84 unsigned char const * pSrcBufEnd
= pSrcBufPtr
+ nSrcBytes
;
85 sal_Unicode
* pDestBufPtr
= pDestBuf
;
86 sal_Unicode
* pDestBufEnd
= pDestBufPtr
+ nDestChars
;
88 if (pContext
!= nullptr)
90 nUtf32
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nUtf32
;
91 nShift
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nShift
;
92 bCheckBom
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->bCheckBom
;
95 while (pSrcBufPtr
< pSrcBufEnd
)
97 bool bUndefined
= false;
99 sal_uInt32 nChar
= *pSrcBufPtr
++;
106 else if (nChar
<= 0xBF)
108 else if (nChar
<= 0xDF)
110 nUtf32
= (nChar
& 0x1F) << 6;
113 else if (nChar
<= 0xEF)
115 nUtf32
= (nChar
& 0x0F) << 12;
118 else if (nChar
<= 0xF7)
120 nUtf32
= (nChar
& 0x07) << 18;
123 else if (nChar
<= 0xFB)
125 nUtf32
= (nChar
& 0x03) << 24;
128 else if (nChar
<= 0xFD)
130 nUtf32
= (nChar
& 0x01) << 30;
135 else if ((nChar
& 0xC0) == 0x80)
137 nUtf32
|= (nChar
& 0x3F) << nShift
;
146 This byte is preceded by a broken UTF-8 sequence; if this byte
147 is neither in the range [0x80..0xBF] nor in the range
148 [0xFE..0xFF], assume that this byte does not belong to that
149 broken sequence, but instead starts a new, legal UTF-8 sequence:
151 bConsume
= nChar
>= 0xFE;
157 if (!bCheckBom
|| nUtf32
!= 0xFEFF
158 || (nFlags
& RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
) == 0
161 if (nUtf32
<= 0xFFFF)
162 if (pDestBufPtr
!= pDestBufEnd
)
163 *pDestBufPtr
++ = (sal_Unicode
) nUtf32
;
166 else if (rtl::isUnicodeCodePoint(nUtf32
))
167 if (pDestBufEnd
- pDestBufPtr
>= 2)
169 *pDestBufPtr
++ = (sal_Unicode
) ImplGetHighSurrogate(nUtf32
);
170 *pDestBufPtr
++ = (sal_Unicode
) ImplGetLowSurrogate(nUtf32
);
185 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
186 bUndefined
, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
189 case sal::detail::textenc::BAD_INPUT_STOP
:
196 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
203 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
210 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
215 && (nInfo
& (RTL_TEXTTOUNICODE_INFO_ERROR
216 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
))
219 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0)
220 nInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
222 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
223 false, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
226 case sal::detail::textenc::BAD_INPUT_STOP
:
227 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
232 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
233 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
238 if (pContext
!= nullptr)
240 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nUtf32
= nUtf32
;
241 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nShift
= nShift
;
242 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->bCheckBom
= bCheckBom
;
244 if (pInfo
!= nullptr)
246 if (pSrcCvtBytes
!= nullptr)
247 *pSrcCvtBytes
= reinterpret_cast< char const * >(pSrcBufPtr
) - pSrcBuf
;
248 return pDestBufPtr
- pDestBuf
;
251 void * ImplCreateUnicodeToUtf8Context()
253 ImplUnicodeToUtf8Context
* p
= new ImplUnicodeToUtf8Context
;
254 ImplResetUnicodeToUtf8Context(p
);
258 void ImplResetUnicodeToUtf8Context(void * pContext
)
260 if (pContext
!= nullptr)
261 static_cast< ImplUnicodeToUtf8Context
* >(pContext
)->nHighSurrogate
= 0xFFFF;
264 void ImplDestroyUnicodeToUtf8Context(void * pContext
)
266 delete static_cast< ImplUnicodeToUtf8Context
* >(pContext
);
269 sal_Size
ImplConvertUnicodeToUtf8(
270 void const * pData
, void * pContext
, sal_Unicode
const * pSrcBuf
,
271 sal_Size nSrcChars
, char * pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
272 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
274 bool bJavaUtf8
= pData
!= nullptr;
275 sal_Unicode nHighSurrogate
= 0xFFFF;
276 sal_uInt32 nInfo
= 0;
277 sal_Unicode
const * pSrcBufPtr
= pSrcBuf
;
278 sal_Unicode
const * pSrcBufEnd
= pSrcBufPtr
+ nSrcChars
;
279 char * pDestBufPtr
= pDestBuf
;
280 char * pDestBufEnd
= pDestBufPtr
+ nDestBytes
;
282 if (pContext
!= nullptr)
284 = static_cast< ImplUnicodeToUtf8Context
* >(pContext
)->nHighSurrogate
;
286 if (nHighSurrogate
== 0xFFFF)
288 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE
) != 0
291 if (pDestBufEnd
- pDestBufPtr
>= 3)
293 /* Write BOM (U+FEFF) as UTF-8: */
294 *pDestBufPtr
++ = static_cast< char >(static_cast< unsigned char >(0xEF));
295 *pDestBufPtr
++ = static_cast< char >(static_cast< unsigned char >(0xBB));
296 *pDestBufPtr
++ = static_cast< char >(static_cast< unsigned char >(0xBF));
300 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
307 while (pSrcBufPtr
< pSrcBufEnd
)
309 sal_uInt32 nChar
= *pSrcBufPtr
++;
310 if (nHighSurrogate
== 0)
312 if (ImplIsHighSurrogate(nChar
) && !bJavaUtf8
)
314 nHighSurrogate
= (sal_Unicode
) nChar
;
318 else if (ImplIsLowSurrogate(nChar
) && !bJavaUtf8
)
319 nChar
= ImplCombineSurrogates(nHighSurrogate
, nChar
);
323 if ((ImplIsLowSurrogate(nChar
) && !bJavaUtf8
)
324 || ImplIsNoncharacter(nChar
))
327 if (nChar
<= 0x7F && (!bJavaUtf8
|| nChar
!= 0))
328 if (pDestBufPtr
!= pDestBufEnd
)
329 *pDestBufPtr
++ = static_cast< char >(nChar
);
332 else if (nChar
<= 0x7FF)
333 if (pDestBufEnd
- pDestBufPtr
>= 2)
335 *pDestBufPtr
++ = static_cast< char >(0xC0 | (nChar
>> 6));
336 *pDestBufPtr
++ = static_cast< char >(0x80 | (nChar
& 0x3F));
340 else if (nChar
<= 0xFFFF)
341 if (pDestBufEnd
- pDestBufPtr
>= 3)
343 *pDestBufPtr
++ = static_cast< char >(0xE0 | (nChar
>> 12));
344 *pDestBufPtr
++ = static_cast< char >(0x80 | ((nChar
>> 6) & 0x3F));
345 *pDestBufPtr
++ = static_cast< char >(0x80 | (nChar
& 0x3F));
349 else if (pDestBufEnd
- pDestBufPtr
>= 4)
351 *pDestBufPtr
++ = static_cast< char >(0xF0 | (nChar
>> 18));
352 *pDestBufPtr
++ = static_cast< char >(0x80 | ((nChar
>> 12) & 0x3F));
353 *pDestBufPtr
++ = static_cast< char >(0x80 | ((nChar
>> 6) & 0x3F));
354 *pDestBufPtr
++ = static_cast< char >(0x80 | (nChar
& 0x3F));
362 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
363 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, nullptr,
366 case sal::detail::textenc::BAD_INPUT_STOP
:
370 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
374 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
381 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
385 if (nHighSurrogate
!= 0
386 && (nInfo
& (RTL_UNICODETOTEXT_INFO_ERROR
387 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
390 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
391 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
393 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
394 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
,
395 nullptr, 0, nullptr))
397 case sal::detail::textenc::BAD_INPUT_STOP
:
398 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
402 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
403 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
409 if (pContext
!= nullptr)
410 static_cast< ImplUnicodeToUtf8Context
* >(pContext
)->nHighSurrogate
412 if (pInfo
!= nullptr)
414 if (pSrcCvtChars
!= nullptr)
415 *pSrcCvtChars
= pSrcBufPtr
- pSrcBuf
;
416 return pDestBufPtr
- pDestBuf
;
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */