1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*************************************************************************
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * Copyright 2000, 2010 Oracle and/or its affiliates.
8 * OpenOffice.org - a multi-platform office productivity suite
10 * This file is part of OpenOffice.org.
12 * OpenOffice.org is free software: you can redistribute it and/or modify
13 * it under the terms of the GNU Lesser General Public License version 3
14 * only, as published by the Free Software Foundation.
16 * OpenOffice.org is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser General Public License version 3 for more details
20 * (a copy is included in the LICENSE file that accompanied this code).
22 * You should have received a copy of the GNU Lesser General Public License
23 * version 3 along with OpenOffice.org. If not, see
24 * <http://www.openoffice.org/license.html>
25 * for a copy of the LGPLv3 License.
27 ************************************************************************/
29 #include "sal/config.h"
31 #include "sal/types.h"
32 #include "rtl/textcvt.h"
34 #include "converter.hxx"
35 #include "tcvtutf8.hxx"
36 #include "tenchelp.hxx"
37 #include "unichars.hxx"
39 struct ImplUtf8ToUnicodeContext
46 struct ImplUnicodeToUtf8Context
48 sal_Unicode nHighSurrogate
; /* 0xFFFF: write BOM */
51 void * ImplCreateUtf8ToUnicodeContext()
53 ImplUtf8ToUnicodeContext
* p
= new ImplUtf8ToUnicodeContext
;
54 ImplResetUtf8ToUnicodeContext(p
);
58 void ImplResetUtf8ToUnicodeContext(void * pContext
)
62 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nShift
= -1;
63 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->bCheckBom
= true;
67 void ImplDestroyUtf8ToUnicodeContext(void * pContext
)
69 delete static_cast< ImplUtf8ToUnicodeContext
* >(pContext
);
72 sal_Size
ImplConvertUtf8ToUnicode(
73 void const * pData
, void * pContext
, char const * pSrcBuf
,
74 sal_Size nSrcBytes
, sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
75 sal_uInt32 nFlags
, sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
78 This function is very liberal with the UTF-8 input. Accepted are:
79 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
80 - surrogates (e.g., ED A0 80 to represent U+D800)
81 - encodings with up to six bytes (everything outside the range
82 U+0000..10FFFF is considered "undefined")
83 The first two of these points allow this routine to translate from both
84 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
87 int bJavaUtf8
= pData
!= NULL
;
88 sal_uInt32 nUtf32
= 0;
90 bool bCheckBom
= true;
92 sal_uChar
const * pSrcBufPtr
= (sal_uChar
const *) pSrcBuf
;
93 sal_uChar
const * pSrcBufEnd
= pSrcBufPtr
+ nSrcBytes
;
94 sal_Unicode
* pDestBufPtr
= pDestBuf
;
95 sal_Unicode
* pDestBufEnd
= pDestBufPtr
+ nDestChars
;
99 nUtf32
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nUtf32
;
100 nShift
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nShift
;
101 bCheckBom
= static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->bCheckBom
;
104 while (pSrcBufPtr
< pSrcBufEnd
)
106 bool bUndefined
= false;
108 sal_uInt32 nChar
= *pSrcBufPtr
++;
115 else if (nChar
<= 0xBF)
117 else if (nChar
<= 0xDF)
119 nUtf32
= (nChar
& 0x1F) << 6;
122 else if (nChar
<= 0xEF)
124 nUtf32
= (nChar
& 0x0F) << 12;
127 else if (nChar
<= 0xF7)
129 nUtf32
= (nChar
& 0x07) << 18;
132 else if (nChar
<= 0xFB)
134 nUtf32
= (nChar
& 0x03) << 24;
137 else if (nChar
<= 0xFD)
139 nUtf32
= (nChar
& 0x01) << 30;
144 else if ((nChar
& 0xC0) == 0x80)
146 nUtf32
|= (nChar
& 0x3F) << nShift
;
155 This byte is preceded by a broken UTF-8 sequence; if this byte
156 is neither in the range [0x80..0xBF] nor in the range
157 [0xFE..0xFF], assume that this byte does not belong to that
158 broken sequence, but instead starts a new, legal UTF-8 sequence:
160 bConsume
= nChar
>= 0xFE;
166 if (!bCheckBom
|| nUtf32
!= 0xFEFF
167 || (nFlags
& RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
) == 0
170 if (nUtf32
<= 0xFFFF)
171 if (pDestBufPtr
!= pDestBufEnd
)
172 *pDestBufPtr
++ = (sal_Unicode
) nUtf32
;
175 else if (nUtf32
<= 0x10FFFF)
176 if (pDestBufEnd
- pDestBufPtr
>= 2)
178 *pDestBufPtr
++ = (sal_Unicode
) ImplGetHighSurrogate(nUtf32
);
179 *pDestBufPtr
++ = (sal_Unicode
) ImplGetLowSurrogate(nUtf32
);
194 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
195 bUndefined
, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
198 case sal::detail::textenc::BAD_INPUT_STOP
:
205 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
212 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
219 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
224 && (nInfo
& (RTL_TEXTTOUNICODE_INFO_ERROR
225 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
))
228 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0)
229 nInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
231 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
232 false, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
235 case sal::detail::textenc::BAD_INPUT_STOP
:
236 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
241 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
242 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
247 if (pContext
!= NULL
)
249 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nUtf32
= nUtf32
;
250 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->nShift
= nShift
;
251 static_cast< ImplUtf8ToUnicodeContext
* >(pContext
)->bCheckBom
= bCheckBom
;
255 if (pSrcCvtBytes
!= NULL
)
256 *pSrcCvtBytes
= reinterpret_cast< char const * >(pSrcBufPtr
) - pSrcBuf
;
257 return pDestBufPtr
- pDestBuf
;
260 void * ImplCreateUnicodeToUtf8Context()
262 ImplUnicodeToUtf8Context
* p
= new ImplUnicodeToUtf8Context
;
263 ImplResetUnicodeToUtf8Context(p
);
267 void ImplResetUnicodeToUtf8Context(void * pContext
)
269 if (pContext
!= NULL
)
270 static_cast< ImplUnicodeToUtf8Context
* >(pContext
)->nHighSurrogate
= 0xFFFF;
273 void ImplDestroyUnicodeToUtf8Context(void * pContext
)
275 delete static_cast< ImplUnicodeToUtf8Context
* >(pContext
);
278 sal_Size
ImplConvertUnicodeToUtf8(
279 void const * pData
, void * pContext
, sal_Unicode
const * pSrcBuf
,
280 sal_Size nSrcChars
, char * pDestBuf
, sal_Size nDestBytes
, sal_uInt32 nFlags
,
281 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
283 int bJavaUtf8
= pData
!= NULL
;
284 sal_Unicode nHighSurrogate
= 0xFFFF;
285 sal_uInt32 nInfo
= 0;
286 sal_Unicode
const * pSrcBufPtr
= pSrcBuf
;
287 sal_Unicode
const * pSrcBufEnd
= pSrcBufPtr
+ nSrcChars
;
288 char * pDestBufPtr
= pDestBuf
;
289 char * pDestBufEnd
= pDestBufPtr
+ nDestBytes
;
291 if (pContext
!= NULL
)
293 = static_cast< ImplUnicodeToUtf8Context
* >(pContext
)->nHighSurrogate
;
295 if (nHighSurrogate
== 0xFFFF)
297 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE
) != 0
300 if (pDestBufEnd
- pDestBufPtr
>= 3)
302 /* Write BOM (U+FEFF) as UTF-8: */
303 *pDestBufPtr
++ = static_cast< char >(static_cast< unsigned char >(0xEF));
304 *pDestBufPtr
++ = static_cast< char >(static_cast< unsigned char >(0xBB));
305 *pDestBufPtr
++ = static_cast< char >(static_cast< unsigned char >(0xBF));
309 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
316 while (pSrcBufPtr
< pSrcBufEnd
)
318 sal_uInt32 nChar
= *pSrcBufPtr
++;
319 if (nHighSurrogate
== 0)
321 if (ImplIsHighSurrogate(nChar
) && !bJavaUtf8
)
323 nHighSurrogate
= (sal_Unicode
) nChar
;
327 else if (ImplIsLowSurrogate(nChar
) && !bJavaUtf8
)
328 nChar
= ImplCombineSurrogates(nHighSurrogate
, nChar
);
332 if ((ImplIsLowSurrogate(nChar
) && !bJavaUtf8
)
333 || ImplIsNoncharacter(nChar
))
336 if (nChar
<= 0x7F && (!bJavaUtf8
|| nChar
!= 0))
337 if (pDestBufPtr
!= pDestBufEnd
)
338 *pDestBufPtr
++ = static_cast< char >(nChar
);
341 else if (nChar
<= 0x7FF)
342 if (pDestBufEnd
- pDestBufPtr
>= 2)
344 *pDestBufPtr
++ = static_cast< char >(0xC0 | (nChar
>> 6));
345 *pDestBufPtr
++ = static_cast< char >(0x80 | (nChar
& 0x3F));
349 else if (nChar
<= 0xFFFF)
350 if (pDestBufEnd
- pDestBufPtr
>= 3)
352 *pDestBufPtr
++ = static_cast< char >(0xE0 | (nChar
>> 12));
353 *pDestBufPtr
++ = static_cast< char >(0x80 | ((nChar
>> 6) & 0x3F));
354 *pDestBufPtr
++ = static_cast< char >(0x80 | (nChar
& 0x3F));
358 else if (pDestBufEnd
- pDestBufPtr
>= 4)
360 *pDestBufPtr
++ = static_cast< char >(0xF0 | (nChar
>> 18));
361 *pDestBufPtr
++ = static_cast< char >(0x80 | ((nChar
>> 12) & 0x3F));
362 *pDestBufPtr
++ = static_cast< char >(0x80 | ((nChar
>> 6) & 0x3F));
363 *pDestBufPtr
++ = static_cast< char >(0x80 | (nChar
& 0x3F));
371 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
372 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
, NULL
,
375 case sal::detail::textenc::BAD_INPUT_STOP
:
379 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
383 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
390 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
394 if (nHighSurrogate
!= 0
395 && (nInfo
& (RTL_UNICODETOTEXT_INFO_ERROR
396 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
399 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
400 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
402 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
403 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
,
406 case sal::detail::textenc::BAD_INPUT_STOP
:
407 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
411 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
412 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
418 if (pContext
!= NULL
)
419 static_cast< ImplUnicodeToUtf8Context
* >(pContext
)->nHighSurrogate
423 if (pSrcCvtChars
!= NULL
)
424 *pSrcCvtChars
= pSrcBufPtr
- pSrcBuf
;
425 return pDestBufPtr
- pDestBuf
;
428 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */