1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: tcvtutf8.c,v $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 #include "sal/types.h"
32 #include "rtl/alloc.h"
33 #include "rtl/textcvt.h"
35 #include "converter.h"
/*
 * State carried between calls of the UTF-8 to Unicode converter.
 * The converter and reset functions in this file access the members
 * nUtf32, nShift and bCheckBom through this type.
 */
39 struct ImplUtf8ToUnicodeContext
/*
 * State carried between calls of the Unicode to UTF-8 converter.
 * nHighSurrogate is reset to 0xFFFF by ImplResetUnicodeToUtf8Context; the
 * converter compares it against 0xFFFF (BOM still to be written) and
 * against 0 (presumably: no high surrogate pending - confirm).
 */
46 struct ImplUnicodeToUtf8Context
48 sal_Unicode nHighSurrogate
; /* 0xFFFF: write BOM */
/*
 * Allocates a struct ImplUtf8ToUnicodeContext with rtl_allocateMemory and
 * hands it to ImplResetUtf8ToUnicodeContext for initialization.
 * NOTE(review): no check of the allocation result is visible before the
 * reset call - confirm that the reset function tolerates a NULL argument.
 */
51 void * ImplCreateUtf8ToUnicodeContext(void)
53 void * p
= rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext
));
54 ImplResetUtf8ToUnicodeContext(p
);
/*
 * Puts a UTF-8 to Unicode context back into its initial state.
 * pContext is treated as a struct ImplUtf8ToUnicodeContext.
 */
58 void ImplResetUtf8ToUnicodeContext(void * pContext
)
/* nShift = -1: presumably means no multi-byte sequence in progress - confirm */
62 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nShift
= -1;
/* Re-arm the byte order mark check for the next conversion. */
63 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->bCheckBom
= sal_True
;
/*
 * Converts UTF-8 input to sal_Unicode output.
 *
 * pData        - only compared against NULL here; a non-NULL value sets
 *                bJavaUtf8.
 * pContext     - struct ImplUtf8ToUnicodeContext carrying nUtf32, nShift and
 *                bCheckBom across calls; read before the loop and, when
 *                non-NULL, written back before returning.
 * pSrcBuf      - input bytes; nSrcBytes gives their count.
 * pDestBuf     - output buffer with room for nDestChars sal_Unicode values.
 * nFlags       - RTL_TEXTTOUNICODE_FLAGS_* bits; GLOBAL_SIGNATURE and FLUSH
 *                are tested directly, and the whole value is forwarded to
 *                ImplHandleBadInputTextToUnicodeConversion.
 * pInfo        - NOTE(review): status bits are accumulated in a local nInfo;
 *                presumably stored through pInfo before returning - confirm.
 * pSrcCvtBytes - when non-NULL, receives the number of source bytes consumed.
 *
 * Returns the number of sal_Unicode values written to pDestBuf.
 */
67 sal_Size
ImplConvertUtf8ToUnicode(ImplTextConverterData
const * pData
,
68 void * pContext
, sal_Char
const * pSrcBuf
,
69 sal_Size nSrcBytes
, sal_Unicode
* pDestBuf
,
70 sal_Size nDestChars
, sal_uInt32 nFlags
,
71 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
74 This function is very liberal with the UTF-8 input. Accepted are:
75 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
76 - surrogates (e.g., ED A0 80 to represent U+D800)
77 - encodings with up to six bytes (everything outside the range
78 U+0000..10FFFF is considered "undefined")
79 The first two of these points allow this routine to translate from both
80 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
/* Java-style UTF-8 mode is selected purely by pData being non-NULL. */
83 int bJavaUtf8
= pData
!= NULL
;
/* Code point being assembled from the current multi-byte sequence. */
84 sal_uInt32 nUtf32
= 0;
86 sal_Bool bCheckBom
= sal_True
;
/* Read the input as unsigned bytes so comparisons such as <= 0xBF work. */
88 sal_uChar
const * pSrcBufPtr
= (sal_uChar
const *) pSrcBuf
;
89 sal_uChar
const * pSrcBufEnd
= pSrcBufPtr
+ nSrcBytes
;
90 sal_Unicode
* pDestBufPtr
= pDestBuf
;
91 sal_Unicode
* pDestBufEnd
= pDestBufPtr
+ nDestChars
;
/* Resume from the state stored in the context by an earlier call. */
95 nUtf32
= ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nUtf32
;
96 nShift
= ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nShift
;
97 bCheckBom
= ((struct ImplUtf8ToUnicodeContext
*) pContext
)->bCheckBom
;
/* Main loop: one input byte per iteration. */
100 while (pSrcBufPtr
< pSrcBufEnd
)
102 sal_Bool bUndefined
= sal_False
;
103 int bConsume
= sal_True
;
104 sal_uInt32 nChar
= *pSrcBufPtr
++;
/*
 * Classification of the byte by value range.  For the lead-byte ranges the
 * payload bits are masked out and shifted to the top of the code point:
 *   <= 0xDF: 5 bits << 6     <= 0xEF: 4 bits << 12    <= 0xF7: 3 bits << 18
 *   <= 0xFB: 2 bits << 24    <= 0xFD: 1 bit  << 30
 */
111 else if (nChar
<= 0xBF)
113 else if (nChar
<= 0xDF)
115 nUtf32
= (nChar
& 0x1F) << 6;
118 else if (nChar
<= 0xEF)
120 nUtf32
= (nChar
& 0x0F) << 12;
123 else if (nChar
<= 0xF7)
125 nUtf32
= (nChar
& 0x07) << 18;
128 else if (nChar
<= 0xFB)
130 nUtf32
= (nChar
& 0x03) << 24;
133 else if (nChar
<= 0xFD)
135 nUtf32
= (nChar
& 0x01) << 30;
/* Byte of the form 10xxxxxx: merge its six payload bits at bit nShift. */
140 else if ((nChar
& 0xC0) == 0x80)
142 nUtf32
|= (nChar
& 0x3F) << nShift
;
151 This byte is preceeded by a broken UTF-8 sequence; if this byte
152 is neither in the range [0x80..0xBF] nor in the range
153 [0xFE..0xFF], assume that this byte does not belong to that
154 broken sequence, but instead starts a new, legal UTF-8 sequence:
/*
 * Bytes 0x80..0xBF were taken by the previous branch, so bConsume stays
 * true here only for 0xFE and 0xFF.
 */
156 bConsume
= nChar
>= 0xFE;
/*
 * The assembled code point is written out unless BOM checking is still
 * active, the value is U+FEFF and GLOBAL_SIGNATURE is set.
 * NOTE(review): the condition appears to continue on a following line -
 * confirm the remaining operand(s).
 */
162 if (!bCheckBom
|| nUtf32
!= 0xFEFF
163 || (nFlags
& RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
) == 0
/* Values up to 0xFFFF need one output unit. */
166 if (nUtf32
<= 0xFFFF)
167 if (pDestBufPtr
!= pDestBufEnd
)
168 *pDestBufPtr
++ = (sal_Unicode
) nUtf32
;
/* Values up to 0x10FFFF need a surrogate pair, i.e. two free output units. */
171 else if (nUtf32
<= 0x10FFFF)
172 if (pDestBufEnd
- pDestBufPtr
>= 2)
174 *pDestBufPtr
++ = (sal_Unicode
) ImplGetHighSurrogate(nUtf32
);
175 *pDestBufPtr
++ = (sal_Unicode
) ImplGetLowSurrogate(nUtf32
);
/* Anything above 0x10FFFF is reported as undefined. */
181 bUndefined
= sal_True
;
/* A BOM is only looked for at the very start of the input. */
186 bCheckBom
= sal_False
;
/* Bad or undefined input: let the shared handler decide how to proceed. */
190 switch (ImplHandleBadInputTextToUnicodeConversion(
191 bUndefined
, sal_True
, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
194 case IMPL_BAD_INPUT_STOP
:
196 bCheckBom
= sal_False
;
201 case IMPL_BAD_INPUT_CONTINUE
:
203 bCheckBom
= sal_False
;
208 case IMPL_BAD_INPUT_NO_OUTPUT
:
/* Record that the destination buffer could not take the output. */
215 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/*
 * Handling after the loop; the visible part of the condition inspects the
 * ERROR and DESTBUFFERTOSMALL bits of nInfo.
 * NOTE(review): presumably this deals with a sequence left incomplete at
 * the end of the input - confirm against the full condition.
 */
220 && (nInfo
& (RTL_TEXTTOUNICODE_INFO_ERROR
221 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
))
/* Without FLUSH the caller is told that more source bytes are needed. */
224 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0)
225 nInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
227 switch (ImplHandleBadInputTextToUnicodeConversion(
228 sal_False
, sal_True
, 0, nFlags
, &pDestBufPtr
,
229 pDestBufEnd
, &nInfo
))
231 case IMPL_BAD_INPUT_STOP
:
232 case IMPL_BAD_INPUT_CONTINUE
:
234 bCheckBom
= sal_False
;
237 case IMPL_BAD_INPUT_NO_OUTPUT
:
238 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* Save the decoder state so a later call can continue where this one ended. */
243 if (pContext
!= NULL
)
245 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nUtf32
= nUtf32
;
246 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nShift
= nShift
;
247 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->bCheckBom
= bCheckBom
;
/* Report consumed input (optional) and return the count of units written. */
251 if (pSrcCvtBytes
!= NULL
)
252 *pSrcCvtBytes
= (sal_Char
const *) pSrcBufPtr
- pSrcBuf
;
253 return pDestBufPtr
- pDestBuf
;
/*
 * Allocates a struct ImplUnicodeToUtf8Context with rtl_allocateMemory and
 * hands it to ImplResetUnicodeToUtf8Context, which skips a NULL pointer.
 */
256 void * ImplCreateUnicodeToUtf8Context(void)
258 void * p
= rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context
));
259 ImplResetUnicodeToUtf8Context(p
);
/*
 * Puts a Unicode to UTF-8 context back into its initial state.
 * A NULL pContext is accepted and ignored.
 */
263 void ImplResetUnicodeToUtf8Context(void * pContext
)
265 if (pContext
!= NULL
)
/* 0xFFFF is the marker value the converter tests before writing a BOM. */
266 ((struct ImplUnicodeToUtf8Context
*) pContext
)->nHighSurrogate
= 0xFFFF;
/*
 * Converts sal_Unicode input to UTF-8 output.
 *
 * pData        - only compared against NULL here; a non-NULL value sets
 *                bJavaUtf8, which switches off the surrogate tests below and
 *                keeps U+0000 out of the one-byte branch.
 * pContext     - struct ImplUnicodeToUtf8Context; when non-NULL its
 *                nHighSurrogate member is read at the start and assigned
 *                again before returning.
 * pSrcBuf      - input units; nSrcChars gives their count.
 * pDestBuf     - output buffer with room for nDestBytes bytes.
 * nFlags       - RTL_UNICODETOTEXT_FLAGS_* bits; GLOBAL_SIGNATURE and FLUSH
 *                are tested directly, and the whole value is forwarded to
 *                ImplHandleBadInputUnicodeToTextConversion.
 * pInfo        - NOTE(review): status bits are accumulated in a local nInfo;
 *                presumably stored through pInfo before returning - confirm.
 * pSrcCvtChars - when non-NULL, receives the number of source units consumed.
 *
 * Returns the number of bytes written to pDestBuf.
 */
269 sal_Size
ImplConvertUnicodeToUtf8(ImplTextConverterData
const * pData
,
270 void * pContext
, sal_Unicode
const * pSrcBuf
,
271 sal_Size nSrcChars
, sal_Char
* pDestBuf
,
272 sal_Size nDestBytes
, sal_uInt32 nFlags
,
273 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
/* Java-style UTF-8 mode is selected purely by pData being non-NULL. */
275 int bJavaUtf8
= pData
!= NULL
;
/* Without a context every call starts in the 0xFFFF (BOM pending) state. */
276 sal_Unicode nHighSurrogate
= 0xFFFF;
277 sal_uInt32 nInfo
= 0;
278 sal_Unicode
const * pSrcBufPtr
= pSrcBuf
;
279 sal_Unicode
const * pSrcBufEnd
= pSrcBufPtr
+ nSrcChars
;
280 sal_Char
* pDestBufPtr
= pDestBuf
;
281 sal_Char
* pDestBufEnd
= pDestBufPtr
+ nDestBytes
;
/* Resume from the state stored in the context by an earlier call. */
283 if (pContext
!= NULL
)
285 = ((struct ImplUnicodeToUtf8Context
*) pContext
)->nHighSurrogate
;
/* Start-of-stream state: a BOM is written only if GLOBAL_SIGNATURE is set. */
287 if (nHighSurrogate
== 0xFFFF)
289 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE
) != 0
/* The BOM takes three bytes; otherwise flag the buffer as too small. */
292 if (pDestBufEnd
- pDestBufPtr
>= 3)
294 /* Write BOM (U+FEFF) as UTF-8: */
295 *pDestBufPtr
++ = (sal_Char
) (unsigned char) 0xEF;
296 *pDestBufPtr
++ = (sal_Char
) (unsigned char) 0xBB;
297 *pDestBufPtr
++ = (sal_Char
) (unsigned char) 0xBF;
301 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
/* Main loop: one input unit per iteration. */
308 while (pSrcBufPtr
< pSrcBufEnd
)
310 sal_uInt32 nChar
= *pSrcBufPtr
++;
/* nHighSurrogate == 0: presumably no high surrogate is pending - confirm. */
311 if (nHighSurrogate
== 0)
/* Outside Java mode a high surrogate is remembered, not encoded on its own. */
313 if (ImplIsHighSurrogate(nChar
) && !bJavaUtf8
)
315 nHighSurrogate
= (sal_Unicode
) nChar
;
/* A low surrogate is combined with the remembered high surrogate. */
319 else if (ImplIsLowSurrogate(nChar
) && !bJavaUtf8
)
320 nChar
= ImplCombineSurrogates(nHighSurrogate
, nChar
);
/*
 * Low surrogates (outside Java mode) and noncharacters are singled out
 * here; presumably they are routed to the bad-input handling - confirm.
 */
324 if ((ImplIsLowSurrogate(nChar
) && !bJavaUtf8
)
325 || ImplIsNoncharacter(nChar
))
/* One byte for up to 0x7F; in Java mode U+0000 is excluded from this branch. */
328 if (nChar
<= 0x7F && (!bJavaUtf8
|| nChar
!= 0))
329 if (pDestBufPtr
!= pDestBufEnd
)
330 *pDestBufPtr
++ = (sal_Char
) nChar
;
/* Two bytes: 110xxxxx 10xxxxxx. */
333 else if (nChar
<= 0x7FF)
334 if (pDestBufEnd
- pDestBufPtr
>= 2)
336 *pDestBufPtr
++ = (sal_Char
) (0xC0 | (nChar
>> 6));
337 *pDestBufPtr
++ = (sal_Char
) (0x80 | (nChar
& 0x3F));
/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. */
341 else if (nChar
<= 0xFFFF)
342 if (pDestBufEnd
- pDestBufPtr
>= 3)
344 *pDestBufPtr
++ = (sal_Char
) (0xE0 | (nChar
>> 12));
345 *pDestBufPtr
++ = (sal_Char
) (0x80 | ((nChar
>> 6) & 0x3F));
346 *pDestBufPtr
++ = (sal_Char
) (0x80 | (nChar
& 0x3F));
/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
350 else if (pDestBufEnd
- pDestBufPtr
>= 4)
352 *pDestBufPtr
++ = (sal_Char
) (0xF0 | (nChar
>> 18));
353 *pDestBufPtr
++ = (sal_Char
) (0x80 | ((nChar
>> 12) & 0x3F));
354 *pDestBufPtr
++ = (sal_Char
) (0x80 | ((nChar
>> 6) & 0x3F));
355 *pDestBufPtr
++ = (sal_Char
) (0x80 | (nChar
& 0x3F));
/* Bad input: let the shared handler decide how to proceed. */
363 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False
, 0, nFlags
,
368 case IMPL_BAD_INPUT_STOP
:
372 case IMPL_BAD_INPUT_CONTINUE
:
376 case IMPL_BAD_INPUT_NO_OUTPUT
:
/* Record that the destination buffer could not take the output. */
383 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
/*
 * Handling after the loop for a non-zero nHighSurrogate; the visible part
 * of the condition also inspects the ERROR and DESTBUFFERTOSMALL bits.
 */
387 if (nHighSurrogate
!= 0
388 && (nInfo
& (RTL_UNICODETOTEXT_INFO_ERROR
389 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
/*
 * NOTE(review): ImplConvertUtf8ToUnicode sets its SRCBUFFERTOSMALL bit when
 * FLUSH is NOT set (== 0); this test uses != 0 instead - confirm which
 * polarity is intended.
 */
392 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
393 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
395 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False
, 0,
402 case IMPL_BAD_INPUT_STOP
:
403 case IMPL_BAD_INPUT_CONTINUE
:
407 case IMPL_BAD_INPUT_NO_OUTPUT
:
408 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
/* Save the encoder state so a later call can continue where this one ended. */
414 if (pContext
!= NULL
)
415 ((struct ImplUnicodeToUtf8Context
*) pContext
)->nHighSurrogate
/* Report consumed input (optional) and return the count of bytes written. */
419 if (pSrcCvtChars
!= NULL
)
420 *pSrcCvtChars
= pSrcBufPtr
- pSrcBuf
;
421 return pDestBufPtr
- pDestBuf
;