1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: tcvtmb.c,v $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
33 #include "rtl/textcvt.h"
35 /* ======================================================================= */
37 /* DBCS to Unicode conversion routines use a lead table for the first byte, */
38 /* where we determine the trail table or, for single byte chars, the Unicode */
39 /* value. We have a separate table for every lead byte, because we can */
40 /* then share many tables between different charset encodings. */
42 /* ======================================================================= */
/* ImplDBCSToUnicode: converts a double-byte (DBCS) encoded byte buffer to */
/* Unicode using the per-lead-byte tables and the EUDC ranges held in pData. */
/*   pData         converter tables; accessed as an ImplDBCSConvertData */
/*   pContext      unused */
/*   pSrcBuf       source bytes, nSrcBytes long */
/*   pDestBuf      destination buffer with room for nDestChars characters */
/*   nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits selecting how undefined, */
/*                 invalid and undefined multi-byte input is treated */
/*   pInfo         RTL_TEXTTOUNICODE_INFO_* bits are OR-ed into *pInfo */
/*   pSrcCvtBytes  receives nSrcBytes minus the source bytes left unread */
/* Returns nDestChars minus the destination space left unused. */
/* NOTE(review): braces, the declarations of cLead/cTrail/cConv/i and the */
/* statements that advance pSrcBuf/pDestBuf are not visible in this listing; */
/* confirm against the complete file. */
44 sal_Size
ImplDBCSToUnicode( const ImplTextConverterData
* pData
, void* pContext
,
45 const sal_Char
* pSrcBuf
, sal_Size nSrcBytes
,
46 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
47 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
48 sal_Size
* pSrcCvtBytes
)
/* Table entry of the current lead byte, the converter data and its lead */
/* table, and the one-past-the-end pointers of both buffers. */
53 const ImplDBCSToUniLeadTab
* pLeadEntry
;
54 const ImplDBCSConvertData
* pConvertData
= (const ImplDBCSConvertData
*)pData
;
55 const ImplDBCSToUniLeadTab
* pLeadTab
= pConvertData
->mpToUniLeadTab
;
56 sal_Unicode
* pEndDestBuf
;
57 const sal_Char
* pEndSrcBuf
;
59 (void) pContext
; /* unused */
/* Remember where both buffers end, then process the source until it is */
/* exhausted. */
62 pEndDestBuf
= pDestBuf
+nDestChars
;
63 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
64 while ( pSrcBuf
< pEndSrcBuf
)
/* Read the current byte as an unsigned value so it can index the table. */
66 cLead
= (sal_uChar
)*pSrcBuf
;
68 /* get entry for the lead byte */
69 pLeadEntry
= pLeadTab
+cLead
;
71 /* Single byte char? Yes if the entry has no trail table or the byte */
/* lies outside the converter's lead byte range. */
72 if (pLeadEntry
->mpToUniTrailTab
== NULL
73 || cLead
< pConvertData
->mnLeadStart
74 || cLead
> pConvertData
->mnLeadEnd
)
/* Take the direct mapping; a zero result for a non-zero byte means the */
/* byte has no mapping and is reported as undefined. */
76 cConv
= pLeadEntry
->mnUniChar
;
77 if ( !cConv
&& (cLead
!= 0) )
79 *pInfo
|= RTL_TEXTTOUNICODE_INFO_UNDEFINED
;
80 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
)
82 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
85 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE
)
/* Substitute character chosen by ImplGetUndefinedUnicodeChar from the */
/* byte and the flags. */
91 cConv
= ImplGetUndefinedUnicodeChar(cLead
, nFlags
);
96 /* Source buffer too small: the lead byte is the last byte available */
97 if ( pSrcBuf
+1 == pEndSrcBuf
)
99 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
/* Read the trail byte and, if it lies inside the trail range of this */
/* lead entry, look the character up in the entry's trail table. */
104 cTrail
= (sal_uChar
)*pSrcBuf
;
105 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
106 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
/* Scan the mnEUDCCount EUDC range entries starting at mpEUDCTab. */
114 const ImplDBCSEUDCData
* pEUDCTab
= pConvertData
->mpEUDCTab
;
115 for ( i
= 0; i
< pConvertData
->mnEUDCCount
; i
++ )
/* Lead byte inside the lead range of this EUDC entry? */
117 if ( (cLead
>= pEUDCTab
->mnLeadStart
) &&
118 (cLead
<= pEUDCTab
->mnLeadEnd
) )
120 sal_uInt16 nTrailCount
= 0;
/* Trail byte in the first trail sub-range: the result is mnUniStart plus */
/* (lead offset * mnTrailRangeCount) plus the trail offset. */
121 if ( (cTrail
>= pEUDCTab
->mnTrail1Start
) &&
122 (cTrail
<= pEUDCTab
->mnTrail1End
) )
124 cConv
= pEUDCTab
->mnUniStart
+
125 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
126 (cTrail
-pEUDCTab
->mnTrail1Start
);
/* Number of trail bytes covered by the first sub-range. */
131 nTrailCount
= pEUDCTab
->mnTrail1End
-pEUDCTab
->mnTrail1Start
+1;
/* Second trail sub-range, only considered if the entry has at least two. */
132 if ( (pEUDCTab
->mnTrailCount
>= 2) &&
133 (cTrail
>= pEUDCTab
->mnTrail2Start
) &&
134 (cTrail
<= pEUDCTab
->mnTrail2End
) )
136 cConv
= pEUDCTab
->mnUniStart
+
137 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
139 (cTrail
-pEUDCTab
->mnTrail2Start
);
/* NOTE(review): nTrailCount is overwritten here with the size of the */
/* second sub-range rather than accumulated, and no statement using it is */
/* visible in this listing - confirm that the offset applied for the */
/* third sub-range accounts for both preceding sub-ranges. */
144 nTrailCount
= pEUDCTab
->mnTrail2End
-pEUDCTab
->mnTrail2Start
+1;
/* Third trail sub-range, only considered if the entry has at least three. */
145 if ( (pEUDCTab
->mnTrailCount
>= 3) &&
146 (cTrail
>= pEUDCTab
->mnTrail3Start
) &&
147 (cTrail
<= pEUDCTab
->mnTrail3End
) )
149 cConv
= pEUDCTab
->mnUniStart
+
150 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
152 (cTrail
-pEUDCTab
->mnTrail3Start
);
164 /* We compare against the complete trail range that we */
165 /* define, which can normally be larger than the range that */
166 /* is actually mapped. We do this so that extensions of */
167 /* encodings we do not take into account are still handled */
168 /* as correctly as possible, i.e. so that such double byte */
169 /* characters are also treated as one single character. */
171 if (cLead
< pConvertData
->mnLeadStart
172 || cLead
> pConvertData
->mnLeadEnd
173 || cTrail
< pConvertData
->mnTrailStart
174 || cTrail
> pConvertData
->mnTrailEnd
)
/* Lead or trail byte outside the converter's overall ranges: report the */
/* input as invalid; the INVALID flags select error or ignore, and the */
/* replacement character is assigned below. */
176 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
177 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
179 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
182 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
188 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Report the byte pair as an undefined multi-byte character; the */
/* MBUNDEFINED flags select error or ignore, and the replacement */
/* character is assigned below. */
192 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
193 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
195 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
198 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
204 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Destination buffer exhausted: report error and "buffer too small". */
210 if ( pDestBuf
== pEndDestBuf
)
212 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* Report how many source bytes were consumed and return how many */
/* destination characters were produced. */
221 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
222 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
225 /* ----------------------------------------------------------------------- */
/* ImplUnicodeToDBCS: converts Unicode characters to a double-byte (DBCS) */
/* encoding using the per-high-byte tables and the EUDC ranges held in pData. */
/*   pData         converter tables; accessed as an ImplDBCSConvertData */
/*   pContext      unused */
/*   pSrcBuf       source characters, nSrcChars long */
/*   pDestBuf      destination buffer with room for nDestBytes bytes */
/*   nFlags        RTL_UNICODETOTEXT_FLAGS_* bits */
/*   pInfo         RTL_UNICODETOTEXT_INFO_* bits are OR-ed into *pInfo */
/*   pSrcCvtChars  receives nSrcChars minus the source characters left unread */
/* Returns nDestBytes minus the destination space left unused. */
/* NOTE(review): braces, several local declarations and the statements that */
/* advance pSrcBuf/pDestBuf are not visible in this listing; confirm against */
/* the complete file. */
227 sal_Size
ImplUnicodeToDBCS( const ImplTextConverterData
* pData
, void* pContext
,
228 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
229 sal_Char
* pDestBuf
, sal_Size nDestBytes
,
230 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
231 sal_Size
* pSrcCvtChars
)
/* Table entry of the current high byte, the converter data and its high */
/* byte table, and the one-past-the-end pointers of both buffers. */
237 const ImplUniToDBCSHighTab
* pHighEntry
;
238 const ImplDBCSConvertData
* pConvertData
= (const ImplDBCSConvertData
*)pData
;
239 const ImplUniToDBCSHighTab
* pHighTab
= pConvertData
->mpToDBCSHighTab
;
240 sal_Char
* pEndDestBuf
;
241 const sal_Unicode
* pEndSrcBuf
;
/* The extra range check below is enabled only when the converter's lead */
/* byte range is narrower than the full 0x00..0xFF. */
243 sal_Bool bCheckRange
= (pConvertData
->mnLeadStart
!= 0
244 || pConvertData
->mnLeadEnd
!= 0xFF);
245 /* this statement has the effect that this extra check is only done for
246 EUC-KR, which uses the MS-949 tables, but does not support the full
249 (void) pContext
; /* unused */
/* Remember where both buffers end, then process the source until it is */
/* exhausted. */
252 pEndDestBuf
= pDestBuf
+nDestBytes
;
253 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
254 while ( pSrcBuf
< pEndSrcBuf
)
/* Split the character c into its high and low byte. */
257 nHighChar
= (sal_uChar
)((c
>> 8) & 0xFF);
258 nLowChar
= (sal_uChar
)(c
& 0xFF);
260 /* get entry for the high byte */
261 pHighEntry
= pHighTab
+nHighChar
;
263 /* is the low byte inside the range covered by this entry's table? */
264 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
266 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* Extra range check (see bCheckRange): detects a table result above 0x7F */
/* whose high byte lies outside mnLeadStart..mnLeadEnd or whose low byte */
/* lies outside mnTrailStart..mnTrailEnd. */
267 if (bCheckRange
&& cConv
> 0x7F
268 && ((cConv
>> 8) < pConvertData
->mnLeadStart
269 || (cConv
>> 8) > pConvertData
->mnLeadEnd
270 || (cConv
& 0xFF) < pConvertData
->mnTrailStart
271 || (cConv
& 0xFF) > pConvertData
->mnTrailEnd
))
/* No result so far for a non-zero character: try the EUDC ranges. */
277 if (cConv
== 0 && c
!= 0)
279 /* Map to EUDC ranges: */
280 ImplDBCSEUDCData
const * pEUDCTab
= pConvertData
->mpEUDCTab
;
282 for (i
= 0; i
< pConvertData
->mnEUDCCount
; ++i
)
/* Character inside the Unicode range of this EUDC entry? */
284 if (c
>= pEUDCTab
->mnUniStart
&& c
<= pEUDCTab
->mnUniEnd
)
/* Offset of the character inside the entry's Unicode range. */
286 sal_uInt32 nIndex
= c
- pEUDCTab
->mnUniStart
;
/* Quotient and remainder of the offset by mnTrailRangeCount; presumably */
/* the lead and trail offsets (nLeadOff / nTrailOff) used below - the */
/* assignment targets are not visible in this listing. */
288 = nIndex
/ pEUDCTab
->mnTrailRangeCount
;
290 = nIndex
% pEUDCTab
->mnTrailRangeCount
;
/* Lead byte (range start plus lead offset) placed in the high byte. */
293 ((pEUDCTab
->mnLeadStart
+ nLeadOff
) << 8);
/* Size of the first trail sub-range; a trail offset below it falls into */
/* that sub-range. */
295 = pEUDCTab
->mnTrail1End
- pEUDCTab
->mnTrail1Start
+ 1;
296 if (nTrailOff
< nSize
)
298 cConv
|= pEUDCTab
->mnTrail1Start
+ nTrailOff
;
/* Same test against the size of the second trail sub-range. */
303 = pEUDCTab
->mnTrail2End
- pEUDCTab
->mnTrail2Start
+ 1;
304 if (nTrailOff
< nSize
)
306 cConv
|= pEUDCTab
->mnTrail2Start
+ nTrailOff
;
/* Trail byte taken from the third trail sub-range. */
310 cConv
|= pEUDCTab
->mnTrail3Start
+ nTrailOff
;
317 * SB: Not sure why this is in here. Plus, it does not work as
318 * intended when (c & 0xFF) == 0, because the next !cConv check
319 * will then think c has not yet been converted...
/* Characters in RTL_TEXTCVT_BYTE_PRIVATE_START..END are mapped to their */
/* low byte when RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0 is set. */
321 if (c
>= RTL_TEXTCVT_BYTE_PRIVATE_START
322 && c
<= RTL_TEXTCVT_BYTE_PRIVATE_END
)
324 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0
)
325 cConv
= (sal_Char
)(sal_uChar
)(c
& 0xFF);
/* Tests of the replacement flags; the actions taken are not visible in */
/* this listing. */
331 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
336 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
341 /* Handle undefined and surrogate characters */
342 /* (all surrogate characters are undefined) */
343 if (ImplHandleUndefinedUnicodeToTextChar(pData
,
/* A result without high byte is written as one byte, anything else as */
/* two bytes (high byte first). */
356 if ( !(cConv
& 0xFF00) )
/* No room left for the single byte. */
358 if ( pDestBuf
== pEndDestBuf
)
360 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
364 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* Fewer than two bytes of room left for the double byte character. */
369 if ( pDestBuf
+1 >= pEndDestBuf
)
371 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
375 *pDestBuf
= (sal_Char
)(sal_uChar
)((cConv
>> 8) & 0xFF);
377 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* Report how many source characters were consumed and return how many */
/* destination bytes were produced. */
384 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
385 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
388 /* ======================================================================= */
/* Offsets of EUC-JP lead and trail bytes relative to the JIS table indices: */
/* ImplEUCJPToUnicode checks lead/trail bytes against these values and */
/* subtracts them before indexing the JIS lead tables. */
390 #define JIS_EUC_LEAD_OFF 0x80
391 #define JIS_EUC_TRAIL_OFF 0x80
393 /* ----------------------------------------------------------------------- */
/* ImplEUCJPToUnicode: converts an EUC-JP encoded byte buffer to Unicode. */
/* Bytes handled under "SS2" map to half-width katakana; the other */
/* multi-byte forms are looked up in the JIS 0212 or JIS 0208 lead tables */
/* held in pData. */
/*   pData         converter tables; accessed as an ImplEUCJPConvertData */
/*   pSrcBuf       source bytes, nSrcBytes long */
/*   pDestBuf      destination buffer with room for nDestChars characters */
/*   nFlags        RTL_TEXTTOUNICODE_FLAGS_* bits selecting how invalid and */
/*                 undefined multi-byte input is treated */
/*   pInfo         RTL_TEXTTOUNICODE_INFO_* bits are OR-ed into *pInfo */
/*   pSrcCvtBytes  receives nSrcBytes minus the source bytes left unread */
/* Returns nDestChars minus the destination space left unused. */
/* NOTE(review): pContext is referenced below but its parameter declaration, */
/* the braces, the declarations of c/cConv and the statements that advance */
/* pSrcBuf/pDestBuf are not visible in this listing; confirm against the */
/* complete file. */
395 sal_Size
ImplEUCJPToUnicode( const ImplTextConverterData
* pData
,
397 const sal_Char
* pSrcBuf
, sal_Size nSrcBytes
,
398 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
399 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
400 sal_Size
* pSrcCvtBytes
)
/* Lead and trail byte of the current multi-byte character, zero until a */
/* multi-byte form has been read. */
403 sal_uChar cLead
= '\0';
404 sal_uChar cTrail
= '\0';
/* Table entry of the current lead byte, the lead table selected for the */
/* current character, the converter data and the one-past-the-end pointers */
/* of both buffers. */
406 const ImplDBCSToUniLeadTab
* pLeadEntry
;
407 const ImplDBCSToUniLeadTab
* pLeadTab
;
408 const ImplEUCJPConvertData
* pConvertData
= (const ImplEUCJPConvertData
*)pData
;
409 sal_Unicode
* pEndDestBuf
;
410 const sal_Char
* pEndSrcBuf
;
412 (void) pContext
; /* unused */
/* Remember where both buffers end, then process the source until it is */
/* exhausted. */
415 pEndDestBuf
= pDestBuf
+nDestChars
;
416 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
417 while ( pSrcBuf
< pEndSrcBuf
)
/* Read the current byte as an unsigned value. */
419 c
= (sal_uChar
)*pSrcBuf
;
426 /* SS2 - Half-width katakana */
430 /* Source buffer too small: the SS2 byte is the last byte available */
431 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
433 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
/* Bytes 0xA1..0xDF map linearly onto the characters starting at 0xFF61. */
438 c
= (sal_uChar
)*pSrcBuf
;
439 if ( (c
>= 0xA1) && (c
<= 0xDF) )
440 cConv
= 0xFF61+(c
-0xA1);
450 /* SS3 - JIS 0212-1990 */
451 /* 8F + A1-FE + A1-FE */
454 /* Source buffer too small: fewer than three bytes are left */
455 if (pEndSrcBuf
- pSrcBuf
< 3)
457 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
/* Read lead and trail byte and select the JIS 0212 lead table. */
462 cLead
= (sal_uChar
)*pSrcBuf
;
464 cTrail
= (sal_uChar
)*pSrcBuf
;
465 pLeadTab
= pConvertData
->mpJIS0212ToUniLeadTab
;
467 /* CodeSet 2 JIS 0208-1997 */
/* NOTE(review): EUC-JP documentation usually calls JIS X 0208 code set 1; */
/* confirm the label above. */
471 /* Source buffer too small: the lead byte is the last byte available */
472 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
474 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
/* Read the trail byte and select the JIS 0208 lead table. */
480 cTrail
= (sal_uChar
)*pSrcBuf
;
481 pLeadTab
= pConvertData
->mpJIS0208ToUniLeadTab
;
484 /* Undefined Range: lead or trail byte below the EUC offset */
485 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* Remove the EUC offsets so that the bytes index the JIS tables, then */
/* look the character up if the trail byte is inside the entry's range. */
489 cLead
-= JIS_EUC_LEAD_OFF
;
490 cTrail
-= JIS_EUC_TRAIL_OFF
;
491 pLeadEntry
= pLeadTab
+cLead
;
492 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
493 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
501 /* We compare against the complete trail range that we */
502 /* define, which can normally be larger than the range that */
503 /* is actually mapped. We do this so that extensions of */
504 /* encodings we do not take into account are still handled */
505 /* as correctly as possible, i.e. so that such double byte */
506 /* characters are also treated as one single character. */
508 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* Report the input as invalid; the INVALID flags select error or ignore, */
/* and the replacement character is assigned below. */
510 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
511 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
513 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
516 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
522 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Report the input as an undefined multi-byte character; the MBUNDEFINED */
/* flags select error or ignore, and the replacement character is */
/* assigned below. */
526 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
527 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
529 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
532 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
538 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Destination buffer exhausted: report error and "buffer too small". */
543 if ( pDestBuf
== pEndDestBuf
)
545 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* Report how many source bytes were consumed and return how many */
/* destination characters were produced. */
554 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
555 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
558 /* ----------------------------------------------------------------------- */
/* ImplUnicodeToEUCJP: converts Unicode characters to EUC-JP. Half-width */
/* katakana are mapped arithmetically; other characters are looked up in */
/* the Unicode-to-JIS 0208 and Unicode-to-JIS 0212 tables held in pData. */
/* The result is written as one, two or three bytes. */
/*   pData         converter tables; accessed as an ImplEUCJPConvertData */
/*   pSrcBuf       source characters, nSrcChars long */
/*   pDestBuf      destination buffer with room for nDestBytes bytes */
/*   nFlags        RTL_UNICODETOTEXT_FLAGS_* bits */
/*   pInfo         RTL_UNICODETOTEXT_INFO_* bits are OR-ed into *pInfo */
/*   pSrcCvtChars  receives nSrcChars minus the source characters left unread */
/* Returns nDestBytes minus the destination space left unused. */
/* NOTE(review): pContext is referenced below but its parameter declaration, */
/* the braces, several local declarations and the statements that advance */
/* pSrcBuf/pDestBuf are not visible in this listing; confirm against the */
/* complete file. */
560 sal_Size
ImplUnicodeToEUCJP( const ImplTextConverterData
* pData
,
562 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
563 sal_Char
* pDestBuf
, sal_Size nDestBytes
,
564 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
565 sal_Size
* pSrcCvtChars
)
/* Table entry of the current high byte, the table selected for the current */
/* lookup, the converter data and the one-past-the-end pointers of both */
/* buffers. */
571 const ImplUniToDBCSHighTab
* pHighEntry
;
572 const ImplUniToDBCSHighTab
* pHighTab
;
573 const ImplEUCJPConvertData
* pConvertData
= (const ImplEUCJPConvertData
*)pData
;
574 sal_Char
* pEndDestBuf
;
575 const sal_Unicode
* pEndSrcBuf
;
577 (void) pContext
; /* unused */
/* Remember where both buffers end, then process the source until it is */
/* exhausted. */
580 pEndDestBuf
= pDestBuf
+nDestBytes
;
581 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
582 while ( pSrcBuf
< pEndSrcBuf
)
589 /* Half-width katakana: 0xFF61..0xFF9F become a two byte code with 0x8E */
/* as high byte and 0xA1 plus the offset as low byte */
590 else if ( (c
>= 0xFF61) && (c
<= 0xFF9F) )
591 cConv
= 0x8E00+0xA1+(c
-0xFF61);
/* Split the character c into its high and low byte. */
594 nHighChar
= (sal_uChar
)((c
>> 8) & 0xFF);
595 nLowChar
= (sal_uChar
)(c
& 0xFF);
/* Lookup in the Unicode-to-JIS 0208 table: the entry of the high byte is */
/* used if the low byte lies inside the range it covers. */
598 pHighTab
= pConvertData
->mpUniToJIS0208HighTab
;
599 pHighEntry
= pHighTab
+nHighChar
;
600 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
602 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* Same lookup in the Unicode-to-JIS 0212 table. */
612 pHighTab
= pConvertData
->mpUniToJIS0212HighTab
;
613 pHighEntry
= pHighTab
+nHighChar
;
614 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
616 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* Tests of the replacement flags; the actions taken are not visible in */
/* this listing. */
623 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
628 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
633 /* Handle undefined and surrogate characters */
634 /* (all surrogate characters are undefined) */
635 if (ImplHandleUndefinedUnicodeToTextChar(pData
,
/* Only the lowest byte is set: write one byte. */
650 if ( !(cConv
& 0xFFFF00) )
/* No room left for the single byte. */
652 if ( pDestBuf
== pEndDestBuf
)
654 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
658 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* Third byte (bits 16..23) is zero: write two bytes, high byte first. */
662 else if ( !(cConv
& 0xFF0000) )
/* Fewer than two bytes of room left. */
664 if ( pDestBuf
+1 >= pEndDestBuf
)
666 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
670 *pDestBuf
= (sal_Char
)(sal_uChar
)((cConv
>> 8) & 0xFF);
672 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* Otherwise write three bytes, most significant first; fewer than three */
/* bytes of room left is reported as "buffer too small". */
677 if ( pDestBuf
+2 >= pEndDestBuf
)
679 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
683 *pDestBuf
= (sal_Char
)(sal_uChar
)((cConv
>> 16) & 0xFF);
685 *pDestBuf
= (sal_Char
)(sal_uChar
)((cConv
>> 8) & 0xFF);
687 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* Report how many source characters were consumed and return how many */
/* destination bytes were produced. */
694 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
695 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));