1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * This file is part of OpenOffice.org.
11 * OpenOffice.org is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License version 3
13 * only, as published by the Free Software Foundation.
15 * OpenOffice.org is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License version 3 for more details
19 * (a copy is included in the LICENSE file that accompanied this code).
21 * You should have received a copy of the GNU Lesser General Public License
22 * version 3 along with OpenOffice.org. If not, see
23 * <http://www.openoffice.org/license.html>
24 * for a copy of the LGPLv3 License.
26 ************************************************************************/
30 #include "rtl/textcvt.h"
32 /* ======================================================================= */
34 /* The DBCS to Unicode conversion routines use a lead table for the first */
35 /* byte, from which we determine the trail table or, for single-byte chars, */
36 /* the Unicode value. Every lead byte has its own table, because we can */
37 /* then share many tables between different charset encodings. */
39 /* ======================================================================= */
/* ImplDBCSToUnicode: table-driven conversion of double-byte character set   */
/* text to Unicode.                                                          */
/*                                                                           */
/*   pData        - converter data, used as ImplDBCSConvertData              */
/*   pContext     - unused                                                   */
/*   pSrcBuf      - source bytes, nSrcBytes long                             */
/*   pDestBuf     - destination buffer with room for nDestChars characters   */
/*   nFlags       - RTL_TEXTTOUNICODE_FLAGS_* selecting how undefined,       */
/*                  invalid and undefined multi-byte input is treated        */
/*   pInfo        - receives RTL_TEXTTOUNICODE_INFO_* bits (UNDEFINED,       */
/*                  INVALID, MBUNDEFINED, ERROR, SRCBUFFERTOSMALL,           */
/*                  DESTBUFFERTOSMALL), which are OR-ed into *pInfo          */
/*   pSrcCvtBytes - receives nSrcBytes minus the bytes left before           */
/*                  pEndSrcBuf                                               */
/*                                                                           */
/* Returns nDestChars minus the room left before pEndDestBuf, presumably the */
/* number of characters written.                                             */
/* NOTE(review): the declarations of cLead, cTrail, cConv and i, the block   */
/* braces and the statements that advance the buffers are not visible in     */
/* this listing - confirm against the complete file.                         */
41 sal_Size
ImplDBCSToUnicode( const ImplTextConverterData
* pData
, void* pContext
,
42 const sal_Char
* pSrcBuf
, sal_Size nSrcBytes
,
43 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
44 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
45 sal_Size
* pSrcCvtBytes
)
50 const ImplDBCSToUniLeadTab
* pLeadEntry
;
51 const ImplDBCSConvertData
* pConvertData
= (const ImplDBCSConvertData
*)pData
;
52 const ImplDBCSToUniLeadTab
* pLeadTab
= pConvertData
->mpToUniLeadTab
;
53 sal_Unicode
* pEndDestBuf
;
54 const sal_Char
* pEndSrcBuf
;
56 (void) pContext
; /* unused */
/* One-past-the-end pointers of both buffers. */
59 pEndDestBuf
= pDestBuf
+nDestChars
;
60 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
/* Process the source until pSrcBuf reaches pEndSrcBuf. */
61 while ( pSrcBuf
< pEndSrcBuf
)
63 cLead
= (sal_uChar
)*pSrcBuf
;
65 /* Get the lead table entry for the lead byte */
66 pLeadEntry
= pLeadTab
+cLead
;
68 /* Single-byte char? (no trail table, or lead byte outside the lead range) */
69 if (pLeadEntry
->mpToUniTrailTab
== NULL
70 || cLead
< pConvertData
->mnLeadStart
71 || cLead
> pConvertData
->mnLeadEnd
)
73 cConv
= pLeadEntry
->mnUniChar
;
/* A zero mapping for a non-zero byte means the byte is undefined. */
74 if ( !cConv
&& (cLead
!= 0) )
76 *pInfo
|= RTL_TEXTTOUNICODE_INFO_UNDEFINED
;
77 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
)
79 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
82 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE
)
88 cConv
= ImplGetUndefinedUnicodeChar(cLead
, nFlags
);
93 /* Source buffer too small: the lead byte is the last byte of the source */
94 if ( pSrcBuf
+1 == pEndSrcBuf
)
96 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
/* Read the trail byte and map it through the entry's trail table when it */
/* lies within [mnTrailStart, mnTrailEnd]. */
101 cTrail
= (sal_uChar
)*pSrcBuf
;
102 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
103 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
/* Search the mnEUDCCount entries of mpEUDCTab for a range that contains */
/* this lead/trail pair. */
111 const ImplDBCSEUDCData
* pEUDCTab
= pConvertData
->mpEUDCTab
;
112 for ( i
= 0; i
< pConvertData
->mnEUDCCount
; i
++ )
114 if ( (cLead
>= pEUDCTab
->mnLeadStart
) &&
115 (cLead
<= pEUDCTab
->mnLeadEnd
) )
117 sal_uInt16 nTrailCount
= 0;
118 if ( (cTrail
>= pEUDCTab
->mnTrail1Start
) &&
119 (cTrail
<= pEUDCTab
->mnTrail1End
) )
/* Unicode value = range start + lead offset * trail range count */
/* + trail offset within the first trail range. */
121 cConv
= pEUDCTab
->mnUniStart
+
122 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
123 (cTrail
-pEUDCTab
->mnTrail1Start
);
/* Size of the first trail range. */
128 nTrailCount
= pEUDCTab
->mnTrail1End
-pEUDCTab
->mnTrail1Start
+1;
/* Second trail range, only present when mnTrailCount >= 2. */
129 if ( (pEUDCTab
->mnTrailCount
>= 2) &&
130 (cTrail
>= pEUDCTab
->mnTrail2Start
) &&
131 (cTrail
<= pEUDCTab
->mnTrail2End
) )
133 cConv
= pEUDCTab
->mnUniStart
+
134 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
136 (cTrail
-pEUDCTab
->mnTrail2Start
);
/* Size of the second trail range. */
141 nTrailCount
= pEUDCTab
->mnTrail2End
-pEUDCTab
->mnTrail2Start
+1;
/* Third trail range, only present when mnTrailCount >= 3. */
142 if ( (pEUDCTab
->mnTrailCount
>= 3) &&
143 (cTrail
>= pEUDCTab
->mnTrail3Start
) &&
144 (cTrail
<= pEUDCTab
->mnTrail3End
) )
146 cConv
= pEUDCTab
->mnUniStart
+
147 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
149 (cTrail
-pEUDCTab
->mnTrail3Start
);
161 /* We compare against the complete trail range that we */
162 /* define, which may normally be larger than the range that */
163 /* is actually defined. We do this so that extensions of */
164 /* encodings that we do not take into account are handled */
165 /* as correctly as possible, i.e. so that double-byte */
166 /* characters are still treated as one single character */
168 if (cLead
< pConvertData
->mnLeadStart
169 || cLead
> pConvertData
->mnLeadEnd
170 || cTrail
< pConvertData
->mnTrailStart
171 || cTrail
> pConvertData
->mnTrailEnd
)
/* Lead or trail byte outside the overall range of the encoding: invalid. */
173 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
174 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
176 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
179 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
185 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Flag an undefined multi-byte character. */
189 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
190 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
192 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
195 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
201 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Flag an error when no room is left in the destination buffer. */
207 if ( pDestBuf
== pEndDestBuf
)
209 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* Report how far pSrcBuf got and return how far pDestBuf got. */
218 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
219 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
222 /* ----------------------------------------------------------------------- */
/* ImplUnicodeToDBCS: table-driven conversion of Unicode text to a           */
/* double-byte character set.                                                */
/*                                                                           */
/*   pData        - converter data, used as ImplDBCSConvertData              */
/*   pContext     - unused                                                   */
/*   pSrcBuf      - source characters, nSrcChars long                        */
/*   pDestBuf     - destination buffer with room for nDestBytes bytes        */
/*   nFlags       - RTL_UNICODETOTEXT_FLAGS_* (PRIVATE_MAPTO0,               */
/*                  UNDEFINED_REPLACE and UNDEFINED_REPLACESTR are tested)   */
/*   pInfo        - receives RTL_UNICODETOTEXT_INFO_ERROR and                */
/*                  RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL, OR-ed in       */
/*   pSrcCvtChars - receives nSrcChars minus the characters left before      */
/*                  pEndSrcBuf                                               */
/*                                                                           */
/* Returns nDestBytes minus the room left before pEndDestBuf, presumably the */
/* number of bytes written.                                                  */
/* NOTE(review): several declarations, the block braces and a number of      */
/* statements are not visible in this listing - confirm against the          */
/* complete file.                                                            */
224 sal_Size
ImplUnicodeToDBCS( const ImplTextConverterData
* pData
, void* pContext
,
225 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
226 sal_Char
* pDestBuf
, sal_Size nDestBytes
,
227 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
228 sal_Size
* pSrcCvtChars
)
234 const ImplUniToDBCSHighTab
* pHighEntry
;
235 const ImplDBCSConvertData
* pConvertData
= (const ImplDBCSConvertData
*)pData
;
236 const ImplUniToDBCSHighTab
* pHighTab
= pConvertData
->mpToDBCSHighTab
;
237 sal_Char
* pEndDestBuf
;
238 const sal_Unicode
* pEndSrcBuf
;
/* True unless the lead range of the encoding is the full 0x00..0xFF. */
240 sal_Bool bCheckRange
= (pConvertData
->mnLeadStart
!= 0
241 || pConvertData
->mnLeadEnd
!= 0xFF);
242 /* this statement has the effect that this extra check is only done for
243 EUC-KR, which uses the MS-949 tables, but does not support the full
246 (void) pContext
; /* unused */
/* One-past-the-end pointers of both buffers. */
249 pEndDestBuf
= pDestBuf
+nDestBytes
;
250 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
/* Process the source until pSrcBuf reaches pEndSrcBuf. */
251 while ( pSrcBuf
< pEndSrcBuf
)
/* Split c into its high and low byte. */
254 nHighChar
= (sal_uChar
)((c
>> 8) & 0xFF);
255 nLowChar
= (sal_uChar
)(c
& 0xFF);
257 /* Get the high table entry for the high byte */
258 pHighEntry
= pHighTab
+nHighChar
;
260 /* Is the low byte within the table range of this entry? */
261 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
263 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* With bCheckRange set, test whether a result above 0x7F has a lead or */
/* trail byte outside the ranges supported by this encoding. */
264 if (bCheckRange
&& cConv
> 0x7F
265 && ((cConv
>> 8) < pConvertData
->mnLeadStart
266 || (cConv
>> 8) > pConvertData
->mnLeadEnd
267 || (cConv
& 0xFF) < pConvertData
->mnTrailStart
268 || (cConv
& 0xFF) > pConvertData
->mnTrailEnd
))
/* No table mapping for a non-zero character. */
274 if (cConv
== 0 && c
!= 0)
276 /* Map to EUDC ranges: */
277 ImplDBCSEUDCData
const * pEUDCTab
= pConvertData
->mpEUDCTab
;
279 for (i
= 0; i
< pConvertData
->mnEUDCCount
; ++i
)
281 if (c
>= pEUDCTab
->mnUniStart
&& c
<= pEUDCTab
->mnUniEnd
)
/* Offset of c within this EUDC Unicode range; its quotient and */
/* remainder by mnTrailRangeCount are computed next. */
283 sal_uInt32 nIndex
= c
- pEUDCTab
->mnUniStart
;
285 = nIndex
/ pEUDCTab
->mnTrailRangeCount
;
287 = nIndex
% pEUDCTab
->mnTrailRangeCount
;
290 ((pEUDCTab
->mnLeadStart
+ nLeadOff
) << 8);
/* Size of the first trail range; smaller trail offsets fall into it. */
292 = pEUDCTab
->mnTrail1End
- pEUDCTab
->mnTrail1Start
+ 1;
293 if (nTrailOff
< nSize
)
295 cConv
|= pEUDCTab
->mnTrail1Start
+ nTrailOff
;
/* Size of the second trail range. */
300 = pEUDCTab
->mnTrail2End
- pEUDCTab
->mnTrail2Start
+ 1;
301 if (nTrailOff
< nSize
)
303 cConv
|= pEUDCTab
->mnTrail2Start
+ nTrailOff
;
307 cConv
|= pEUDCTab
->mnTrail3Start
+ nTrailOff
;
314 * SB: Not sure why this is in here. Plus, it does not work as
315 * intended when (c & 0xFF) == 0, because the next !cConv check
316 * will then think c has not yet been converted...
318 if (c
>= RTL_TEXTCVT_BYTE_PRIVATE_START
319 && c
<= RTL_TEXTCVT_BYTE_PRIVATE_END
)
/* With PRIVATE_MAPTO0 set, the low byte of c is used as the result. */
321 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0
)
322 cConv
= (sal_Char
)(sal_uChar
)(c
& 0xFF);
328 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
333 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
338 /* Handle undefined and surrogate characters */
339 /* (all surrogate characters are undefined) */
340 if (ImplHandleUndefinedUnicodeToTextChar(pData
,
/* A result without a high byte is written as one single byte. */
353 if ( !(cConv
& 0xFF00) )
355 if ( pDestBuf
== pEndDestBuf
)
357 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
361 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* Double-byte result: two free bytes are required; the high byte of */
/* cConv is stored before the low byte. */
366 if ( pDestBuf
+1 >= pEndDestBuf
)
368 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
372 *pDestBuf
= (sal_Char
)(sal_uChar
)((cConv
>> 8) & 0xFF);
374 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* Report how far pSrcBuf got and return how far pDestBuf got. */
381 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
382 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
385 /* ======================================================================= */
/* Offsets that ImplEUCJPToUnicode subtracts from the lead and trail byte    */
/* before using them with the JIS lead tables; bytes below these offsets     */
/* are treated there as an undefined range.                                  */
387 #define JIS_EUC_LEAD_OFF 0x80
388 #define JIS_EUC_TRAIL_OFF 0x80
390 /* ----------------------------------------------------------------------- */
/* ImplEUCJPToUnicode: conversion of EUC-JP encoded text to Unicode.         */
/*                                                                           */
/*   pData        - converter data, used as ImplEUCJPConvertData             */
/*   pSrcBuf      - source bytes, nSrcBytes long                             */
/*   pDestBuf     - destination buffer with room for nDestChars characters   */
/*   nFlags       - RTL_TEXTTOUNICODE_FLAGS_* selecting how invalid and      */
/*                  undefined multi-byte input is treated                    */
/*   pInfo        - receives RTL_TEXTTOUNICODE_INFO_* bits (INVALID,         */
/*                  MBUNDEFINED, ERROR, SRCBUFFERTOSMALL,                    */
/*                  DESTBUFFERTOSMALL), which are OR-ed into *pInfo          */
/*   pSrcCvtBytes - receives nSrcBytes minus the bytes left before           */
/*                  pEndSrcBuf                                               */
/*                                                                           */
/* Returns nDestChars minus the room left before pEndDestBuf, presumably the */
/* number of characters written.                                             */
/* NOTE(review): pContext is referenced in the body but is not among the     */
/* parameters visible in this listing; the declarations of c and cConv, the  */
/* block braces and the tests selecting the SS2/SS3 branches are not visible */
/* either - confirm against the complete file.                               */
392 sal_Size
ImplEUCJPToUnicode( const ImplTextConverterData
* pData
,
394 const sal_Char
* pSrcBuf
, sal_Size nSrcBytes
,
395 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
396 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
397 sal_Size
* pSrcCvtBytes
)
400 sal_uChar cLead
= '\0';
401 sal_uChar cTrail
= '\0';
403 const ImplDBCSToUniLeadTab
* pLeadEntry
;
404 const ImplDBCSToUniLeadTab
* pLeadTab
;
405 const ImplEUCJPConvertData
* pConvertData
= (const ImplEUCJPConvertData
*)pData
;
406 sal_Unicode
* pEndDestBuf
;
407 const sal_Char
* pEndSrcBuf
;
409 (void) pContext
; /* unused */
/* One-past-the-end pointers of both buffers. */
412 pEndDestBuf
= pDestBuf
+nDestChars
;
413 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
/* Process the source until pSrcBuf reaches pEndSrcBuf. */
414 while ( pSrcBuf
< pEndSrcBuf
)
416 c
= (sal_uChar
)*pSrcBuf
;
423 /* SS2 - Half-width katakana */
427 /* Source buffer too small: no byte follows the current one */
428 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
430 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
435 c
= (sal_uChar
)*pSrcBuf
;
/* Bytes 0xA1..0xDF map linearly onto U+FF61..U+FF9F. */
436 if ( (c
>= 0xA1) && (c
<= 0xDF) )
437 cConv
= 0xFF61+(c
-0xA1);
447 /* SS3 - JIS 0212-1990 */
448 /* 8F + A1-FE + A1-FE */
451 /* Source buffer too small: fewer than three bytes are left */
452 if (pEndSrcBuf
- pSrcBuf
< 3)
454 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
459 cLead
= (sal_uChar
)*pSrcBuf
;
461 cTrail
= (sal_uChar
)*pSrcBuf
;
462 pLeadTab
= pConvertData
->mpJIS0212ToUniLeadTab
;
464 /* CodeSet 2 JIS 0208-1997 */
468 /* Source buffer too small: no byte follows the current one */
469 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
471 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
477 cTrail
= (sal_uChar
)*pSrcBuf
;
478 pLeadTab
= pConvertData
->mpJIS0208ToUniLeadTab
;
481 /* Undefined Range */
482 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* Remove the EUC offsets, then look the pair up in the selected lead table. */
486 cLead
-= JIS_EUC_LEAD_OFF
;
487 cTrail
-= JIS_EUC_TRAIL_OFF
;
488 pLeadEntry
= pLeadTab
+cLead
;
489 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
490 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
498 /* We compare against the complete trail range that we */
499 /* define, which may normally be larger than the range that */
500 /* is actually defined. We do this so that extensions of */
501 /* encodings that we do not take into account are handled */
502 /* as correctly as possible, i.e. so that double-byte */
503 /* characters are still treated as one single character */
505 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* Lead or trail byte below the EUC offsets: invalid input. */
507 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
508 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
510 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
513 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
519 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Flag an undefined multi-byte character. */
523 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
524 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
526 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
529 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
535 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Flag an error when no room is left in the destination buffer. */
540 if ( pDestBuf
== pEndDestBuf
)
542 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* Report how far pSrcBuf got and return how far pDestBuf got. */
551 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
552 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
555 /* ----------------------------------------------------------------------- */
/* ImplUnicodeToEUCJP: conversion of Unicode text to EUC-JP.                 */
/*                                                                           */
/*   pData        - converter data, used as ImplEUCJPConvertData             */
/*   pSrcBuf      - source characters, nSrcChars long                        */
/*   pDestBuf     - destination buffer with room for nDestBytes bytes        */
/*   nFlags       - RTL_UNICODETOTEXT_FLAGS_* (UNDEFINED_REPLACE and         */
/*                  UNDEFINED_REPLACESTR are tested)                         */
/*   pInfo        - receives RTL_UNICODETOTEXT_INFO_ERROR and                */
/*                  RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL, OR-ed in       */
/*   pSrcCvtChars - receives nSrcChars minus the characters left before      */
/*                  pEndSrcBuf                                               */
/*                                                                           */
/* Returns nDestBytes minus the room left before pEndDestBuf, presumably the */
/* number of bytes written.                                                  */
/* NOTE(review): pContext is referenced in the body but is not among the     */
/* parameters visible in this listing; several declarations, the block       */
/* braces and a number of statements are not visible either - confirm        */
/* against the complete file.                                                */
557 sal_Size
ImplUnicodeToEUCJP( const ImplTextConverterData
* pData
,
559 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
560 sal_Char
* pDestBuf
, sal_Size nDestBytes
,
561 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
562 sal_Size
* pSrcCvtChars
)
568 const ImplUniToDBCSHighTab
* pHighEntry
;
569 const ImplUniToDBCSHighTab
* pHighTab
;
570 const ImplEUCJPConvertData
* pConvertData
= (const ImplEUCJPConvertData
*)pData
;
571 sal_Char
* pEndDestBuf
;
572 const sal_Unicode
* pEndSrcBuf
;
574 (void) pContext
; /* unused */
/* One-past-the-end pointers of both buffers. */
577 pEndDestBuf
= pDestBuf
+nDestBytes
;
578 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
/* Process the source until pSrcBuf reaches pEndSrcBuf. */
579 while ( pSrcBuf
< pEndSrcBuf
)
586 /* Half-width katakana: U+FF61..U+FF9F become 0x8E followed by 0xA1..0xDF */
587 else if ( (c
>= 0xFF61) && (c
<= 0xFF9F) )
588 cConv
= 0x8E00+0xA1+(c
-0xFF61);
/* Split c into its high and low byte. */
591 nHighChar
= (sal_uChar
)((c
>> 8) & 0xFF);
592 nLowChar
= (sal_uChar
)(c
& 0xFF);
/* Look c up in the Unicode to JIS 0208 table. */
595 pHighTab
= pConvertData
->mpUniToJIS0208HighTab
;
596 pHighEntry
= pHighTab
+nHighChar
;
597 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
599 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* Look c up in the Unicode to JIS 0212 table. */
609 pHighTab
= pConvertData
->mpUniToJIS0212HighTab
;
610 pHighEntry
= pHighTab
+nHighChar
;
611 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
613 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
620 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
625 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
630 /* Handle undefined and surrogate characters */
631 /* (all surrogate characters are undefined) */
632 if (ImplHandleUndefinedUnicodeToTextChar(pData
,
/* A result that fits in the lowest byte is written as one single byte. */
647 if ( !(cConv
& 0xFFFF00) )
649 if ( pDestBuf
== pEndDestBuf
)
651 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
655 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* A result without bits 16..23 is written as two bytes, high byte first. */
659 else if ( !(cConv
& 0xFF0000) )
661 if ( pDestBuf
+1 >= pEndDestBuf
)
663 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
667 *pDestBuf
= (sal_Char
)(sal_uChar
)((cConv
>> 8) & 0xFF);
669 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* Three-byte result: three free bytes are required; the bytes of cConv */
/* are stored from bits 16..23 down to bits 0..7. */
674 if ( pDestBuf
+2 >= pEndDestBuf
)
676 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
680 *pDestBuf
= (sal_Char
)(sal_uChar
)((cConv
>> 16) & 0xFF);
682 *pDestBuf
= (sal_Char
)(sal_uChar
)((cConv
>> 8) & 0xFF);
684 *pDestBuf
= (sal_Char
)(sal_uChar
)(cConv
& 0xFF);
/* Report how far pSrcBuf got and return how far pDestBuf got. */
691 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
692 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));