1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*************************************************************************
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * Copyright 2000, 2010 Oracle and/or its affiliates.
8 * OpenOffice.org - a multi-platform office productivity suite
10 * This file is part of OpenOffice.org.
12 * OpenOffice.org is free software: you can redistribute it and/or modify
13 * it under the terms of the GNU Lesser General Public License version 3
14 * only, as published by the Free Software Foundation.
16 * OpenOffice.org is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser General Public License version 3 for more details
20 * (a copy is included in the LICENSE file that accompanied this code).
22 * You should have received a copy of the GNU Lesser General Public License
23 * version 3 along with OpenOffice.org. If not, see
24 * <http://www.openoffice.org/license.html>
25 * for a copy of the LGPLv3 License.
27 ************************************************************************/
29 #include "sal/config.h"
31 #include "rtl/textcvt.h"
33 #include "handleundefinedunicodetotextchar.hxx"
34 #include "tenchelp.hxx"
35 #include "unichars.hxx"
37 /* ======================================================================= */
39 /* DBCS to Unicode conversion routines use a lead table for the first byte, */
40 /* where we determine the trail table or, for single-byte chars, the Unicode */
41 /* value. We have a separate table for every lead byte, because we can */
42 /* then share many tables between different charset encodings. */
44 /* ======================================================================= */
/*
 * ImplDBCSToUnicode: convert a double-byte (DBCS) encoded byte sequence to
 * Unicode using the lead/trail tables reachable from pData.
 *
 *   pData                 conversion data; cast to const ImplDBCSConvertData*
 *   (unnamed void*)       unused
 *   pSrcBuf, nSrcBytes    source bytes and their count
 *   pDestBuf, nDestChars  destination buffer and its capacity in sal_Unicode
 *   nFlags                RTL_TEXTTOUNICODE_FLAGS_* bits; the UNDEFINED,
 *                         INVALID and MBUNDEFINED masks are tested below
 *   pInfo                 RTL_TEXTTOUNICODE_INFO_* status bits are OR-ed in
 *   pSrcCvtBytes          receives nSrcBytes minus the unread remainder
 *
 * Returns nDestChars minus the room left in the destination buffer, i.e.
 * how far pDestBuf has advanced.
 *
 * NOTE(review): this listing appears to be missing lines (braces, local
 * declarations, some statements); the comments describe only what is
 * visible here.
 */
46 sal_Size
ImplDBCSToUnicode( const void* pData
, SAL_UNUSED_PARAMETER
void*,
47 const char* pSrcBuf
, sal_Size nSrcBytes
,
48 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
49 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
50 sal_Size
* pSrcCvtBytes
)
55 const ImplDBCSToUniLeadTab
* pLeadEntry
;
56 const ImplDBCSConvertData
* pConvertData
= (const ImplDBCSConvertData
*)pData
;
57 const ImplDBCSToUniLeadTab
* pLeadTab
= pConvertData
->mpToUniLeadTab
;
58 sal_Unicode
* pEndDestBuf
;
59 const char* pEndSrcBuf
;
/* Compute one-past-the-end pointers for both buffers. */
62 pEndDestBuf
= pDestBuf
+nDestChars
;
63 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
64 while ( pSrcBuf
< pEndSrcBuf
)
66 cLead
= (sal_uChar
)*pSrcBuf
;
68 /* get entry for the lead byte */
69 pLeadEntry
= pLeadTab
+cLead
;
71 /* SingleByte char? */
72 if (pLeadEntry
->mpToUniTrailTab
== NULL
73 || cLead
< pConvertData
->mnLeadStart
74 || cLead
> pConvertData
->mnLeadEnd
)
/* Single-byte case: the lead entry itself carries the Unicode value. */
76 cConv
= pLeadEntry
->mnUniChar
;
/* A zero mapping for a non-zero byte is reported as undefined. */
77 if ( !cConv
&& (cLead
!= 0) )
79 *pInfo
|= RTL_TEXTTOUNICODE_INFO_UNDEFINED
;
80 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
)
82 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
85 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE
)
91 cConv
= ImplGetUndefinedUnicodeChar(cLead
, nFlags
);
96 /* Source buffer too small */
97 if ( pSrcBuf
+1 == pEndSrcBuf
)
99 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
/* Double-byte case: look the trail byte up in the lead entry's trail table. */
104 cTrail
= (sal_uChar
)*pSrcBuf
;
105 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
106 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
/* Scan the EUDC range table (mnEUDCCount entries) for a range that */
/* contains cLead and cTrail; up to three trail ranges are tested. */
114 const ImplDBCSEUDCData
* pEUDCTab
= pConvertData
->mpEUDCTab
;
115 for ( i
= 0; i
< pConvertData
->mnEUDCCount
; i
++ )
117 if ( (cLead
>= pEUDCTab
->mnLeadStart
) &&
118 (cLead
<= pEUDCTab
->mnLeadEnd
) )
120 if ( (cTrail
>= pEUDCTab
->mnTrail1Start
) &&
121 (cTrail
<= pEUDCTab
->mnTrail1End
) )
123 cConv
= pEUDCTab
->mnUniStart
+
124 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
125 (cTrail
-pEUDCTab
->mnTrail1Start
);
/* Number of trail bytes covered by the first trail range. */
130 sal_uInt16 nTrailCount
= pEUDCTab
->mnTrail1End
-pEUDCTab
->mnTrail1Start
+1;
131 if ( (pEUDCTab
->mnTrailCount
>= 2) &&
132 (cTrail
>= pEUDCTab
->mnTrail2Start
) &&
133 (cTrail
<= pEUDCTab
->mnTrail2End
) )
135 cConv
= pEUDCTab
->mnUniStart
+
136 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
138 (cTrail
-pEUDCTab
->mnTrail2Start
);
143 nTrailCount
= pEUDCTab
->mnTrail2End
-pEUDCTab
->mnTrail2Start
+1;
144 if ( (pEUDCTab
->mnTrailCount
>= 3) &&
145 (cTrail
>= pEUDCTab
->mnTrail3Start
) &&
146 (cTrail
<= pEUDCTab
->mnTrail3End
) )
148 cConv
= pEUDCTab
->mnUniStart
+
149 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
151 (cTrail
-pEUDCTab
->mnTrail3Start
);
163 /* We compare the full range of the trail we defined, */
164 /* which can often be greater than the limit. We do this */
165 /* so that extensions that don't consider encodings */
166 /* correctly treat double-byte characters as a single */
167 /* character as much as possible. */
169 if (cLead
< pConvertData
->mnLeadStart
170 || cLead
> pConvertData
->mnLeadEnd
171 || cTrail
< pConvertData
->mnTrailStart
172 || cTrail
> pConvertData
->mnTrailEnd
)
/* Lead or trail byte outside the encoding's overall range: invalid input. */
174 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
175 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
177 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
180 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
186 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Undefined multi-byte character (presumably the in-range case; the */
/* branch structure is not visible here - TODO confirm). */
190 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
191 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
193 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
196 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
202 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* No room left in the destination buffer. */
208 if ( pDestBuf
== pEndDestBuf
)
210 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* Report the consumed source bytes and return the destination advance. */
219 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
220 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
223 /* ----------------------------------------------------------------------- */
/*
 * ImplUnicodeToDBCS: convert Unicode characters to a double-byte (DBCS)
 * encoding using the high-byte table reachable from pData.
 *
 *   pData                 conversion data; cast to const ImplDBCSConvertData*
 *   (unnamed void*)       unused
 *   pSrcBuf, nSrcChars    source characters and their count
 *   pDestBuf, nDestBytes  destination buffer and its capacity in bytes
 *   nFlags                RTL_UNICODETOTEXT_FLAGS_* bits; PRIVATE_MAPTO0,
 *                         UNDEFINED_REPLACE and UNDEFINED_REPLACESTR are
 *                         tested below
 *   pInfo                 RTL_UNICODETOTEXT_INFO_* status bits are OR-ed in
 *   pSrcCvtChars          receives nSrcChars minus the unread remainder
 *
 * Returns nDestBytes minus the room left in the destination buffer, i.e.
 * how far pDestBuf has advanced.
 *
 * NOTE(review): this listing appears to be missing lines (braces, local
 * declarations, some statements); the comments describe only what is
 * visible here.
 */
225 sal_Size
ImplUnicodeToDBCS( const void* pData
, SAL_UNUSED_PARAMETER
void*,
226 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
227 char* pDestBuf
, sal_Size nDestBytes
,
228 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
229 sal_Size
* pSrcCvtChars
)
235 const ImplUniToDBCSHighTab
* pHighEntry
;
236 const ImplDBCSConvertData
* pConvertData
= (const ImplDBCSConvertData
*)pData
;
237 const ImplUniToDBCSHighTab
* pHighTab
= pConvertData
->mpToDBCSHighTab
;
239 const sal_Unicode
* pEndSrcBuf
;
242 pConvertData
->mnLeadStart
!= 0 || pConvertData
->mnLeadEnd
!= 0xFF;
243 /* this statement has the effect that this extra check is only done for
244 EUC-KR, which uses the MS-949 tables, but does not support the full
248 pEndDestBuf
= pDestBuf
+nDestBytes
;
249 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
250 while ( pSrcBuf
< pEndSrcBuf
)
253 nHighChar
= (sal_uChar
)((c
>> 8) & 0xFF);
254 nLowChar
= (sal_uChar
)(c
& 0xFF);
256 /* get entry for the high byte */
257 pHighEntry
= pHighTab
+nHighChar
;
259 /* is low byte in the table range */
260 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
/* Table hit: fetch the target code for this low byte. */
262 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* With bCheckRange set, test whether a code above 0x7F has a lead or */
/* trail byte outside the encoding's lead/trail limits. */
263 if (bCheckRange
&& cConv
> 0x7F
264 && ((cConv
>> 8) < pConvertData
->mnLeadStart
265 || (cConv
>> 8) > pConvertData
->mnLeadEnd
266 || (cConv
& 0xFF) < pConvertData
->mnTrailStart
267 || (cConv
& 0xFF) > pConvertData
->mnTrailEnd
))
/* No table mapping for a non-zero character: try the EUDC ranges. */
273 if (cConv
== 0 && c
!= 0)
275 /* Map to EUDC ranges: */
276 ImplDBCSEUDCData
const * pEUDCTab
= pConvertData
->mpEUDCTab
;
278 for (i
= 0; i
< pConvertData
->mnEUDCCount
; ++i
)
280 if (c
>= pEUDCTab
->mnUniStart
&& c
<= pEUDCTab
->mnUniEnd
)
/* Offset of c within this EUDC range; the division and remainder by */
/* mnTrailRangeCount below split it into lead and trail offsets. */
282 sal_uInt32 nIndex
= c
- pEUDCTab
->mnUniStart
;
284 = nIndex
/ pEUDCTab
->mnTrailRangeCount
;
286 = nIndex
% pEUDCTab
->mnTrailRangeCount
;
289 ((pEUDCTab
->mnLeadStart
+ nLeadOff
) << 8);
291 = pEUDCTab
->mnTrail1End
- pEUDCTab
->mnTrail1Start
+ 1;
292 if (nTrailOff
< nSize
)
294 cConv
|= pEUDCTab
->mnTrail1Start
+ nTrailOff
;
299 = pEUDCTab
->mnTrail2End
- pEUDCTab
->mnTrail2Start
+ 1;
300 if (nTrailOff
< nSize
)
302 cConv
|= pEUDCTab
->mnTrail2Start
+ nTrailOff
;
306 cConv
|= pEUDCTab
->mnTrail3Start
+ nTrailOff
;
313 * SB: Not sure why this is in here. Plus, it does not work as
314 * intended when (c & 0xFF) == 0, because the next !cConv check
315 * will then think c has not yet been converted...
317 if (c
>= RTL_TEXTCVT_BYTE_PRIVATE_START
318 && c
<= RTL_TEXTCVT_BYTE_PRIVATE_END
)
320 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0
)
321 cConv
= static_cast< char >(static_cast< unsigned char >(c
& 0xFF));
327 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
332 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
337 /* Handle undefined and surrogate characters */
338 /* (all surrogate characters are undefined) */
339 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
340 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
, nFlags
,
/* Codes without a high byte are written as a single byte. */
348 if ( !(cConv
& 0xFF00) )
350 if ( pDestBuf
== pEndDestBuf
)
352 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
356 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* Two-byte code: two free bytes are required; the high byte is */
/* written before the low byte. */
361 if ( pDestBuf
+1 >= pEndDestBuf
)
363 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
367 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
369 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* Report the consumed source characters and return the destination advance. */
376 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
377 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
380 /* ======================================================================= */
/* Offsets of EUC-JP lead and trail bytes: ImplEUCJPToUnicode treats bytes */
/* below these values as out of range and subtracts them from the lead and */
/* trail byte before the table lookup. */
382 #define JIS_EUC_LEAD_OFF 0x80
383 #define JIS_EUC_TRAIL_OFF 0x80
385 /* ----------------------------------------------------------------------- */
/*
 * ImplEUCJPToUnicode: convert an EUC-JP encoded byte sequence to Unicode.
 *
 *   pData                 conversion data; cast to const ImplEUCJPConvertData*
 *                         (supplies the JIS 0208 and JIS 0212 lead tables)
 *   (unnamed void*)       unused
 *   pSrcBuf, nSrcBytes    source bytes and their count
 *   pDestBuf, nDestChars  destination buffer and its capacity in sal_Unicode
 *   nFlags                RTL_TEXTTOUNICODE_FLAGS_* bits; the INVALID and
 *                         MBUNDEFINED masks are tested below
 *   pInfo                 RTL_TEXTTOUNICODE_INFO_* status bits are OR-ed in
 *   pSrcCvtBytes          receives nSrcBytes minus the unread remainder
 *
 * Returns nDestChars minus the room left in the destination buffer, i.e.
 * how far pDestBuf has advanced.
 *
 * NOTE(review): this listing appears to be missing lines (braces, local
 * declarations, some statements); the comments describe only what is
 * visible here.
 */
387 sal_Size
ImplEUCJPToUnicode( const void* pData
,
388 SAL_UNUSED_PARAMETER
void*,
389 const char* pSrcBuf
, sal_Size nSrcBytes
,
390 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
391 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
392 sal_Size
* pSrcCvtBytes
)
395 sal_uChar cLead
= '\0';
396 sal_uChar cTrail
= '\0';
398 const ImplDBCSToUniLeadTab
* pLeadEntry
;
399 const ImplDBCSToUniLeadTab
* pLeadTab
;
400 const ImplEUCJPConvertData
* pConvertData
= (const ImplEUCJPConvertData
*)pData
;
401 sal_Unicode
* pEndDestBuf
;
402 const char* pEndSrcBuf
;
/* Compute one-past-the-end pointers for both buffers. */
405 pEndDestBuf
= pDestBuf
+nDestChars
;
406 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
407 while ( pSrcBuf
< pEndSrcBuf
)
409 c
= (sal_uChar
)*pSrcBuf
;
416 /* SS2 - Half-width katakana */
420 /* Source buffer too small */
421 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
/* Only the SRCBUFFERTOSMALL bit is set here (no ERROR bit). */
423 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
/* Byte following SS2: 0xA1-0xDF maps linearly onto 0xFF61 onward. */
428 c
= (sal_uChar
)*pSrcBuf
;
429 if ( (c
>= 0xA1) && (c
<= 0xDF) )
430 cConv
= 0xFF61+(c
-0xA1);
440 /* SS3 - JIS 0212-1990 */
441 /* 8F + A1-FE + A1-FE */
444 /* Source buffer too small */
445 if (pEndSrcBuf
- pSrcBuf
< 3)
447 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
/* Read lead and trail byte and select the JIS 0212 lead table. */
452 cLead
= (sal_uChar
)*pSrcBuf
;
454 cTrail
= (sal_uChar
)*pSrcBuf
;
455 pLeadTab
= pConvertData
->mpJIS0212ToUniLeadTab
;
457 /* CodeSet 2 JIS 0208-1997 */
461 /* Source buffer too small */
462 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
464 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
470 cTrail
= (sal_uChar
)*pSrcBuf
;
471 pLeadTab
= pConvertData
->mpJIS0208ToUniLeadTab
;
474 /* Undefined Range */
475 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* Strip the EUC offsets from both bytes before indexing the lead table. */
479 cLead
-= JIS_EUC_LEAD_OFF
;
480 cTrail
-= JIS_EUC_TRAIL_OFF
;
481 pLeadEntry
= pLeadTab
+cLead
;
482 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
483 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
491 /* We compare the full range of the trail we defined, */
492 /* which can often be greater than the limit. We do this */
493 /* so that extensions that don't consider encodings */
494 /* correctly treat double-byte characters as a single */
495 /* character as much as possible. */
497 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* Lead or trail byte below the EUC offset: invalid input. */
499 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
500 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
502 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
505 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
511 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Undefined multi-byte character (presumably the in-range case; the */
/* branch structure is not visible here - TODO confirm). */
515 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
516 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
518 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
521 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
527 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* No room left in the destination buffer. */
532 if ( pDestBuf
== pEndDestBuf
)
534 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* Report the consumed source bytes and return the destination advance. */
543 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
544 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
547 /* ----------------------------------------------------------------------- */
/*
 * ImplUnicodeToEUCJP: convert Unicode characters to EUC-JP.
 *
 *   pData                 conversion data; cast to const ImplEUCJPConvertData*
 *                         (supplies the Unicode-to-JIS 0208 and -JIS 0212
 *                         high-byte tables)
 *   (unnamed void*)       unused
 *   pSrcBuf, nSrcChars    source characters and their count
 *   pDestBuf, nDestBytes  destination buffer and its capacity in bytes
 *   nFlags                RTL_UNICODETOTEXT_FLAGS_* bits; UNDEFINED_REPLACE
 *                         and UNDEFINED_REPLACESTR are tested below
 *   pInfo                 RTL_UNICODETOTEXT_INFO_* status bits are OR-ed in
 *   pSrcCvtChars          receives nSrcChars minus the unread remainder
 *
 * Returns nDestBytes minus the room left in the destination buffer, i.e.
 * how far pDestBuf has advanced.
 *
 * NOTE(review): this listing appears to be missing lines (braces, local
 * declarations, some statements); the comments describe only what is
 * visible here.
 */
549 sal_Size
ImplUnicodeToEUCJP( const void* pData
,
550 SAL_UNUSED_PARAMETER
void*,
551 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
552 char* pDestBuf
, sal_Size nDestBytes
,
553 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
554 sal_Size
* pSrcCvtChars
)
560 const ImplUniToDBCSHighTab
* pHighEntry
;
561 const ImplUniToDBCSHighTab
* pHighTab
;
562 const ImplEUCJPConvertData
* pConvertData
= (const ImplEUCJPConvertData
*)pData
;
564 const sal_Unicode
* pEndSrcBuf
;
/* Compute one-past-the-end pointers for both buffers. */
567 pEndDestBuf
= pDestBuf
+nDestBytes
;
568 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
569 while ( pSrcBuf
< pEndSrcBuf
)
576 /* Half-width katakana */
577 else if ( (c
>= 0xFF61) && (c
<= 0xFF9F) )
/* Two-byte code: 0x8E in the high byte, 0xA1 plus the offset in the low byte. */
578 cConv
= 0x8E00+0xA1+(c
-0xFF61);
/* Split the character into high and low byte for the table lookups. */
581 nHighChar
= (sal_uChar
)((c
>> 8) & 0xFF);
582 nLowChar
= (sal_uChar
)(c
& 0xFF);
/* Look the character up in the JIS 0208 table. */
585 pHighTab
= pConvertData
->mpUniToJIS0208HighTab
;
586 pHighEntry
= pHighTab
+nHighChar
;
587 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
589 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* Look the character up in the JIS 0212 table (presumably only when */
/* the JIS 0208 lookup found nothing - TODO confirm). */
599 pHighTab
= pConvertData
->mpUniToJIS0212HighTab
;
600 pHighEntry
= pHighTab
+nHighChar
;
601 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
603 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
610 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
615 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
620 /* Handle undefined and surrogate characters */
621 /* (all surrogate characters are undefined) */
622 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
623 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
,
/* One, two or three bytes are written, depending on which of the */
/* upper bytes of cConv are set. */
633 if ( !(cConv
& 0xFFFF00) )
635 if ( pDestBuf
== pEndDestBuf
)
637 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
641 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
645 else if ( !(cConv
& 0xFF0000) )
647 if ( pDestBuf
+1 >= pEndDestBuf
)
649 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
653 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
655 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* Three-byte code: three free bytes are required; bytes are written */
/* from most to least significant. */
660 if ( pDestBuf
+2 >= pEndDestBuf
)
662 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
666 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 16) & 0xFF));
668 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
670 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* Report the consumed source characters and return the destination advance. */
677 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
678 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
681 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */