1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * This file is part of OpenOffice.org.
11 * OpenOffice.org is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License version 3
13 * only, as published by the Free Software Foundation.
15 * OpenOffice.org is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License version 3 for more details
19 * (a copy is included in the LICENSE file that accompanied this code).
21 * You should have received a copy of the GNU Lesser General Public License
22 * version 3 along with OpenOffice.org. If not, see
23 * <http://www.openoffice.org/license.html>
24 * for a copy of the LGPLv3 License.
26 ************************************************************************/
28 #include "sal/types.h"
29 #include "rtl/alloc.h"
30 #include "rtl/textcvt.h"
32 #include "converter.h"
36 struct ImplUtf8ToUnicodeContext
43 struct ImplUnicodeToUtf8Context
45 sal_Unicode nHighSurrogate
; /* 0xFFFF: write BOM */
48 void * ImplCreateUtf8ToUnicodeContext(void)
50 void * p
= rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext
));
51 ImplResetUtf8ToUnicodeContext(p
);
55 void ImplResetUtf8ToUnicodeContext(void * pContext
)
59 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nShift
= -1;
60 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->bCheckBom
= sal_True
;
/*
 * ImplConvertUtf8ToUnicode: decode the UTF-8 bytes in pSrcBuf into UTF-16
 * code units in pDestBuf.
 *
 * pData          a non-NULL value selects the Java UTF-8 flavour (bJavaUtf8).
 * pContext       optional struct ImplUtf8ToUnicodeContext.  nUtf32, nShift
 *                and bCheckBom are loaded from it at the start and written
 *                back at the end when it is non-NULL.
 * pSrcBuf        input bytes; nSrcBytes is their count.
 * pDestBuf       output buffer; nDestChars is its capacity in sal_Unicode
 *                units.
 * nFlags         RTL_TEXTTOUNICODE_FLAGS_* bits.  GLOBAL_SIGNATURE and FLUSH
 *                are tested directly; the whole word is also passed to the
 *                bad-input handler.
 * pInfo          NOTE(review): no store through pInfo is visible in this
 *                listing - presumably it receives nInfo; confirm.
 * pSrcCvtBytes   if non-NULL, receives the number of source bytes consumed.
 *
 * Returns the number of sal_Unicode units written (pDestBufPtr - pDestBuf).
 *
 * NOTE(review): this copy of the function carries stray line-number prefixes
 * and is missing lines (braces, some declarations, several branch bodies).
 * The comments below describe only what the remaining lines show.
 */
64 sal_Size
ImplConvertUtf8ToUnicode(ImplTextConverterData
const * pData
,
65 void * pContext
, sal_Char
const * pSrcBuf
,
66 sal_Size nSrcBytes
, sal_Unicode
* pDestBuf
,
67 sal_Size nDestChars
, sal_uInt32 nFlags
,
68 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtBytes
)
71 This function is very liberal with the UTF-8 input. Accepted are:
72 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
73 - surrogates (e.g., ED A0 80 to represent U+D800)
74 - encodings with up to six bytes (everything outside the range
75 U+0000..10FFFF is considered "undefined")
76 The first two of these points allow this routine to translate from both
77 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
/* bJavaUtf8 is set exactly when the caller supplied converter data. */
80 int bJavaUtf8
= pData
!= NULL
;
/* Defaults used when no saved state is available. */
81 sal_uInt32 nUtf32
= 0;
83 sal_Bool bCheckBom
= sal_True
;
/* The source is walked as sal_uChar so that byte values can be compared
   against the 0x80..0xFF ranges used below. */
85 sal_uChar
const * pSrcBufPtr
= (sal_uChar
const *) pSrcBuf
;
86 sal_uChar
const * pSrcBufEnd
= pSrcBufPtr
+ nSrcBytes
;
87 sal_Unicode
* pDestBufPtr
= pDestBuf
;
88 sal_Unicode
* pDestBufEnd
= pDestBufPtr
+ nDestChars
;
/* Resume from the state a previous call left in the context.
   NOTE(review): the guarding test on pContext is not visible here. */
92 nUtf32
= ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nUtf32
;
93 nShift
= ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nShift
;
94 bCheckBom
= ((struct ImplUtf8ToUnicodeContext
*) pContext
)->bCheckBom
;
/* Main loop: one source byte is fetched per iteration. */
97 while (pSrcBufPtr
< pSrcBufEnd
)
99 sal_Bool bUndefined
= sal_False
;
100 int bConsume
= sal_True
;
101 sal_uInt32 nChar
= *pSrcBufPtr
++;
/*
 * Lead-byte dispatch.  The mask keeps the payload bits of the lead byte and
 * the shift places them at the top of the value being assembled in nUtf32:
 *   <= 0xDF   5 bits << 6    (two-byte form)
 *   <= 0xEF   4 bits << 12   (three-byte form)
 *   <= 0xF7   3 bits << 18   (four-byte form)
 *   <= 0xFB   2 bits << 24   (five-byte form)
 *   <= 0xFD   1 bit  << 30   (six-byte form)
 * The branch taken for <= 0xBF and the one for 0xFE/0xFF are not visible in
 * this listing.
 */
108 else if (nChar
<= 0xBF)
110 else if (nChar
<= 0xDF)
112 nUtf32
= (nChar
& 0x1F) << 6;
115 else if (nChar
<= 0xEF)
117 nUtf32
= (nChar
& 0x0F) << 12;
120 else if (nChar
<= 0xF7)
122 nUtf32
= (nChar
& 0x07) << 18;
125 else if (nChar
<= 0xFB)
127 nUtf32
= (nChar
& 0x03) << 24;
130 else if (nChar
<= 0xFD)
132 nUtf32
= (nChar
& 0x01) << 30;
/* Continuation byte (top two bits are 10): merge its low six bits into
   nUtf32 at the current shift. */
137 else if ((nChar
& 0xC0) == 0x80)
139 nUtf32
|= (nChar
& 0x3F) << nShift
;
148 This byte is preceeded by a broken UTF-8 sequence; if this byte
149 is neither in the range [0x80..0xBF] nor in the range
150 [0xFE..0xFF], assume that this byte does not belong to that
151 broken sequence, but instead starts a new, legal UTF-8 sequence:
/* bConsume stays set only for 0xFE/0xFF on this path. */
153 bConsume
= nChar
>= 0xFE;
/*
 * nUtf32 is written to the output unless it is U+FEFF seen while bCheckBom is
 * still set and the caller asked for signature handling.
 * NOTE(review): the tail of this condition is not visible here.
 */
159 if (!bCheckBom
|| nUtf32
!= 0xFEFF
160 || (nFlags
& RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
) == 0
/* Up to U+FFFF: a single UTF-16 unit, provided there is room for it. */
163 if (nUtf32
<= 0xFFFF)
164 if (pDestBufPtr
!= pDestBufEnd
)
165 *pDestBufPtr
++ = (sal_Unicode
) nUtf32
;
/* Up to U+10FFFF: a surrogate pair, which needs two free units. */
168 else if (nUtf32
<= 0x10FFFF)
169 if (pDestBufEnd
- pDestBufPtr
>= 2)
171 *pDestBufPtr
++ = (sal_Unicode
) ImplGetHighSurrogate(nUtf32
);
172 *pDestBufPtr
++ = (sal_Unicode
) ImplGetLowSurrogate(nUtf32
);
/* Anything larger is flagged as undefined for the bad-input handler. */
178 bUndefined
= sal_True
;
/* Once a character has been handled, U+FEFF is no longer checked as a
   signature. */
183 bCheckBom
= sal_False
;
/* The reaction to malformed or undefined input is delegated to
   ImplHandleBadInputTextToUnicodeConversion; its result selects one of the
   cases below. */
187 switch (ImplHandleBadInputTextToUnicodeConversion(
188 bUndefined
, sal_True
, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
191 case IMPL_BAD_INPUT_STOP
:
193 bCheckBom
= sal_False
;
198 case IMPL_BAD_INPUT_CONTINUE
:
200 bCheckBom
= sal_False
;
205 case IMPL_BAD_INPUT_NO_OUTPUT
:
/* Record that the destination buffer could not take the output. */
212 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* After the loop: this part only applies when neither an error nor a
   too-small destination has been recorded (head of the condition is not
   visible here). */
217 && (nInfo
& (RTL_TEXTTOUNICODE_INFO_ERROR
218 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
))
/* Without FLUSH the source is reported as too small; the bad-input handler
   call that follows is presumably the alternative branch (the connecting
   line is not visible here). */
221 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0)
222 nInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
224 switch (ImplHandleBadInputTextToUnicodeConversion(
225 sal_False
, sal_True
, 0, nFlags
, &pDestBufPtr
,
226 pDestBufEnd
, &nInfo
))
228 case IMPL_BAD_INPUT_STOP
:
229 case IMPL_BAD_INPUT_CONTINUE
:
231 bCheckBom
= sal_False
;
234 case IMPL_BAD_INPUT_NO_OUTPUT
:
235 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* Save the decoder state for the next call. */
240 if (pContext
!= NULL
)
242 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nUtf32
= nUtf32
;
243 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->nShift
= nShift
;
244 ((struct ImplUtf8ToUnicodeContext
*) pContext
)->bCheckBom
= bCheckBom
;
/* Report how many source bytes were consumed. */
248 if (pSrcCvtBytes
!= NULL
)
249 *pSrcCvtBytes
= (sal_Char
const *) pSrcBufPtr
- pSrcBuf
;
/* Number of sal_Unicode units produced. */
250 return pDestBufPtr
- pDestBuf
;
253 void * ImplCreateUnicodeToUtf8Context(void)
255 void * p
= rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context
));
256 ImplResetUnicodeToUtf8Context(p
);
260 void ImplResetUnicodeToUtf8Context(void * pContext
)
262 if (pContext
!= NULL
)
263 ((struct ImplUnicodeToUtf8Context
*) pContext
)->nHighSurrogate
= 0xFFFF;
/*
 * ImplConvertUnicodeToUtf8: encode the UTF-16 code units in pSrcBuf as UTF-8
 * bytes in pDestBuf.
 *
 * pData          a non-NULL value selects the Java UTF-8 flavour (bJavaUtf8).
 * pContext       optional struct ImplUnicodeToUtf8Context; its nHighSurrogate
 *                is read at the start and written back at the end when the
 *                pointer is non-NULL.
 * pSrcBuf        input code units; nSrcChars is their count.
 * pDestBuf       output buffer; nDestBytes is its capacity in bytes.
 * nFlags         RTL_UNICODETOTEXT_FLAGS_* bits.  GLOBAL_SIGNATURE and FLUSH
 *                are tested directly; the whole word is also passed to the
 *                bad-input handler.
 * pInfo          NOTE(review): no store through pInfo is visible in this
 *                listing - presumably it receives nInfo; confirm.
 * pSrcCvtChars   if non-NULL, receives the number of source units consumed.
 *
 * Returns the number of bytes written (pDestBufPtr - pDestBuf).
 *
 * NOTE(review): this copy of the function carries stray line-number prefixes
 * and is missing lines (braces, several branch bodies, parts of argument
 * lists).  The comments below describe only what the remaining lines show.
 */
266 sal_Size
ImplConvertUnicodeToUtf8(ImplTextConverterData
const * pData
,
267 void * pContext
, sal_Unicode
const * pSrcBuf
,
268 sal_Size nSrcChars
, sal_Char
* pDestBuf
,
269 sal_Size nDestBytes
, sal_uInt32 nFlags
,
270 sal_uInt32
* pInfo
, sal_Size
* pSrcCvtChars
)
/* bJavaUtf8 is set exactly when the caller supplied converter data. */
272 int bJavaUtf8
= pData
!= NULL
;
/* Without a context every call starts in the 0xFFFF state. */
273 sal_Unicode nHighSurrogate
= 0xFFFF;
274 sal_uInt32 nInfo
= 0;
275 sal_Unicode
const * pSrcBufPtr
= pSrcBuf
;
276 sal_Unicode
const * pSrcBufEnd
= pSrcBufPtr
+ nSrcChars
;
277 sal_Char
* pDestBufPtr
= pDestBuf
;
278 sal_Char
* pDestBufEnd
= pDestBufPtr
+ nDestBytes
;
/* Pick up the state left by a previous call (the assignment target is on a
   line that is not visible here). */
280 if (pContext
!= NULL
)
282 = ((struct ImplUnicodeToUtf8Context
*) pContext
)->nHighSurrogate
;
/* 0xFFFF is the "write BOM" state: this is where a signature is emitted when
   the caller asked for one. */
284 if (nHighSurrogate
== 0xFFFF)
286 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE
) != 0
/* The signature takes three bytes. */
289 if (pDestBufEnd
- pDestBufPtr
>= 3)
291 /* Write BOM (U+FEFF) as UTF-8: */
292 *pDestBufPtr
++ = (sal_Char
) (unsigned char) 0xEF;
293 *pDestBufPtr
++ = (sal_Char
) (unsigned char) 0xBB;
294 *pDestBufPtr
++ = (sal_Char
) (unsigned char) 0xBF;
/* Record that the destination buffer is too small. */
298 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
/* Main loop: one UTF-16 unit is fetched per iteration. */
305 while (pSrcBufPtr
< pSrcBufEnd
)
307 sal_uInt32 nChar
= *pSrcBufPtr
++;
/* No high surrogate pending: outside Java mode a high surrogate is stored
   instead of being encoded. */
308 if (nHighSurrogate
== 0)
310 if (ImplIsHighSurrogate(nChar
) && !bJavaUtf8
)
312 nHighSurrogate
= (sal_Unicode
) nChar
;
/* A high surrogate is pending and a low one arrives: combine the pair into
   one value. */
316 else if (ImplIsLowSurrogate(nChar
) && !bJavaUtf8
)
317 nChar
= ImplCombineSurrogates(nHighSurrogate
, nChar
);
/* Low surrogates (outside Java mode) and noncharacters are singled out here;
   the action taken is on a line that is not visible. */
321 if ((ImplIsLowSurrogate(nChar
) && !bJavaUtf8
)
322 || ImplIsNoncharacter(nChar
))
/* One byte for values up to 0x7F.  In Java mode the value 0 is excluded here
   and therefore reaches the two-byte branch below. */
325 if (nChar
<= 0x7F && (!bJavaUtf8
|| nChar
!= 0))
326 if (pDestBufPtr
!= pDestBufEnd
)
327 *pDestBufPtr
++ = (sal_Char
) nChar
;
/* Two bytes: 110xxxxx 10xxxxxx. */
330 else if (nChar
<= 0x7FF)
331 if (pDestBufEnd
- pDestBufPtr
>= 2)
333 *pDestBufPtr
++ = (sal_Char
) (0xC0 | (nChar
>> 6));
334 *pDestBufPtr
++ = (sal_Char
) (0x80 | (nChar
& 0x3F));
/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. */
338 else if (nChar
<= 0xFFFF)
339 if (pDestBufEnd
- pDestBufPtr
>= 3)
341 *pDestBufPtr
++ = (sal_Char
) (0xE0 | (nChar
>> 12));
342 *pDestBufPtr
++ = (sal_Char
) (0x80 | ((nChar
>> 6) & 0x3F));
343 *pDestBufPtr
++ = (sal_Char
) (0x80 | (nChar
& 0x3F));
/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
347 else if (pDestBufEnd
- pDestBufPtr
>= 4)
349 *pDestBufPtr
++ = (sal_Char
) (0xF0 | (nChar
>> 18));
350 *pDestBufPtr
++ = (sal_Char
) (0x80 | ((nChar
>> 12) & 0x3F));
351 *pDestBufPtr
++ = (sal_Char
) (0x80 | ((nChar
>> 6) & 0x3F));
352 *pDestBufPtr
++ = (sal_Char
) (0x80 | (nChar
& 0x3F));
/* The reaction to bad input is delegated to
   ImplHandleBadInputUnicodeToTextConversion; its result selects one of the
   cases below. */
360 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False
, 0, nFlags
,
365 case IMPL_BAD_INPUT_STOP
:
369 case IMPL_BAD_INPUT_CONTINUE
:
373 case IMPL_BAD_INPUT_NO_OUTPUT
:
/* Record that the destination buffer could not take the output. */
380 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
/* After the loop: nHighSurrogate is still non-zero, and neither an error nor
   a too-small destination has been recorded. */
384 if (nHighSurrogate
!= 0
385 && (nInfo
& (RTL_UNICODETOTEXT_INFO_ERROR
386 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
/*
 * NOTE(review): SRCBUFFERTOSMALL is set here when FLUSH *is* set, whereas
 * ImplConvertUtf8ToUnicode sets its SRCBUFFERTOSMALL bit when FLUSH is *not*
 * set.  Confirm against the rtl/textcvt.h contract which sense is intended
 * before relying on it.
 */
389 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
390 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
392 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False
, 0,
399 case IMPL_BAD_INPUT_STOP
:
400 case IMPL_BAD_INPUT_CONTINUE
:
404 case IMPL_BAD_INPUT_NO_OUTPUT
:
405 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
/* Save the encoder state for the next call (the assigned value is on a line
   that is not visible here). */
411 if (pContext
!= NULL
)
412 ((struct ImplUnicodeToUtf8Context
*) pContext
)->nHighSurrogate
/* Report how many source units were consumed. */
416 if (pSrcCvtChars
!= NULL
)
417 *pSrcCvtChars
= pSrcBufPtr
- pSrcBuf
;
/* Number of bytes produced. */
418 return pDestBufPtr
- pDestBuf
;