1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "sal/config.h"
22 #include "rtl/textcvt.h"
24 #include "handleundefinedunicodetotextchar.hxx"
25 #include "tenchelp.hxx"
26 #include "unichars.hxx"
28 /* ======================================================================= */
30 /* DBCS to Unicode conversion routines use a lead table for the first byte, */
31 /* from which we determine the trail table or, for single-byte chars, the   */
32 /* Unicode value. We have a separate table for every lead byte, because we */
33 /* can then share many tables between different charset encodings.         */
35 /* ======================================================================= */
/* Converts a double-byte (DBCS) encoded byte buffer to Unicode using the   */
/* lead/trail tables reachable from pData.                                  */
/*                                                                          */
/* pData        - cast to const ImplDBCSConvertData*; supplies the lead     */
/*                table, the lead/trail ranges and the EUDC table           */
/* (unnamed)    - unused second pointer parameter                           */
/* pSrcBuf      - source bytes, nSrcBytes long                              */
/* pDestBuf     - destination buffer with room for nDestChars sal_Unicode   */
/* nFlags       - tested against the RTL_TEXTTOUNICODE_FLAGS_UNDEFINED,     */
/*                _INVALID and _MBUNDEFINED masks to pick ERROR or IGNORE   */
/* pInfo        - RTL_TEXTTOUNICODE_INFO_* bits are OR-ed into *pInfo       */
/* pSrcCvtBytes - receives how far pSrcBuf advanced from its start value    */
/* Returns how far pDestBuf advanced from its start value.                  */
/*                                                                          */
/* NOTE(review): this listing appears to be missing lines (braces, the      */
/* declarations of cLead/cTrail/cConv/i, pointer increments, loop control); */
/* confirm every comment here against the complete file.                    */
37 sal_Size
ImplDBCSToUnicode( const void* pData
, SAL_UNUSED_PARAMETER
void*,
38 const char* pSrcBuf
, sal_Size nSrcBytes
,
39 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
40 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
41 sal_Size
* pSrcCvtBytes
)
46 const ImplDBCSToUniLeadTab
* pLeadEntry
;
47 const ImplDBCSConvertData
* pConvertData
= (const ImplDBCSConvertData
*)pData
;
48 const ImplDBCSToUniLeadTab
* pLeadTab
= pConvertData
->mpToUniLeadTab
;
49 sal_Unicode
* pEndDestBuf
;
50 const char* pEndSrcBuf
;
/* one-past-the-end pointers of the destination and source buffers */
53 pEndDestBuf
= pDestBuf
+nDestChars
;
54 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
55 while ( pSrcBuf
< pEndSrcBuf
)
57 cLead
= (sal_uChar
)*pSrcBuf
;
59 /* get entry for the lead byte */
60 pLeadEntry
= pLeadTab
+cLead
;
62 /* SingleByte char? */
63 if (pLeadEntry
->mpToUniTrailTab
== NULL
64 || cLead
< pConvertData
->mnLeadStart
65 || cLead
> pConvertData
->mnLeadEnd
)
67 cConv
= pLeadEntry
->mnUniChar
;
/* a zero mapping for a non-zero byte is reported as undefined */
68 if ( !cConv
&& (cLead
!= 0) )
70 *pInfo
|= RTL_TEXTTOUNICODE_INFO_UNDEFINED
;
71 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
)
73 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
76 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE
)
82 cConv
= ImplGetUndefinedUnicodeChar(cLead
, nFlags
);
87 /* Source buffer too small */
88 if ( pSrcBuf
+1 == pEndSrcBuf
)
90 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
/* double-byte char: look the trail byte up in the lead entry's trail table */
95 cTrail
= (sal_uChar
)*pSrcBuf
;
96 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
97 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
/* walk the EUDC ranges; a hit is computed from mnUniStart plus lead and */
/* trail offsets, with up to three trail sub-ranges per entry            */
105 const ImplDBCSEUDCData
* pEUDCTab
= pConvertData
->mpEUDCTab
;
106 for ( i
= 0; i
< pConvertData
->mnEUDCCount
; i
++ )
108 if ( (cLead
>= pEUDCTab
->mnLeadStart
) &&
109 (cLead
<= pEUDCTab
->mnLeadEnd
) )
111 if ( (cTrail
>= pEUDCTab
->mnTrail1Start
) &&
112 (cTrail
<= pEUDCTab
->mnTrail1End
) )
114 cConv
= pEUDCTab
->mnUniStart
+
115 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
116 (cTrail
-pEUDCTab
->mnTrail1Start
);
121 sal_uInt16 nTrailCount
= pEUDCTab
->mnTrail1End
-pEUDCTab
->mnTrail1Start
+1;
122 if ( (pEUDCTab
->mnTrailCount
>= 2) &&
123 (cTrail
>= pEUDCTab
->mnTrail2Start
) &&
124 (cTrail
<= pEUDCTab
->mnTrail2End
) )
126 cConv
= pEUDCTab
->mnUniStart
+
127 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
129 (cTrail
-pEUDCTab
->mnTrail2Start
);
134 nTrailCount
= pEUDCTab
->mnTrail2End
-pEUDCTab
->mnTrail2Start
+1;
135 if ( (pEUDCTab
->mnTrailCount
>= 3) &&
136 (cTrail
>= pEUDCTab
->mnTrail3Start
) &&
137 (cTrail
<= pEUDCTab
->mnTrail3End
) )
139 cConv
= pEUDCTab
->mnUniStart
+
140 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
142 (cTrail
-pEUDCTab
->mnTrail3Start
);
154 /* We compare the full range of the trail we defined, */
155 /* which can often be greater than the limit. We do this */
156 /* so that extensions that don't consider encodings */
157 /* correctly treat double-byte characters as a single */
158 /* character as much as possible. */
160 if (cLead
< pConvertData
->mnLeadStart
161 || cLead
> pConvertData
->mnLeadEnd
162 || cTrail
< pConvertData
->mnTrailStart
163 || cTrail
> pConvertData
->mnTrailEnd
)
165 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
166 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
168 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
171 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
177 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
181 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
182 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
184 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
187 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
193 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* destination full: flag the error together with DESTBUFFERTOSMALL */
199 if ( pDestBuf
== pEndDestBuf
)
201 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* report the consumed source bytes and return the advance of pDestBuf */
210 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
211 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
214 /* ----------------------------------------------------------------------- */
/* Converts a buffer of sal_Unicode values to a double-byte (DBCS) byte     */
/* encoding using the high-byte tables reachable from pData.                */
/*                                                                          */
/* pData        - cast to const ImplDBCSConvertData*; supplies the          */
/*                Unicode-to-DBCS high table, lead/trail ranges, EUDC table */
/* (unnamed)    - unused second pointer parameter                           */
/* pSrcBuf      - source characters, nSrcChars long                         */
/* pDestBuf     - destination buffer with room for nDestBytes bytes         */
/* nFlags       - RTL_UNICODETOTEXT_FLAGS_* bits (PRIVATE_MAPTO0,           */
/*                UNDEFINED_REPLACE, UNDEFINED_REPLACESTR are tested here)  */
/* pInfo        - RTL_UNICODETOTEXT_INFO_* bits are OR-ed into *pInfo       */
/* pSrcCvtChars - receives how far pSrcBuf advanced from its start value    */
/* Returns how far pDestBuf advanced from its start value.                  */
/*                                                                          */
/* NOTE(review): this listing appears to be missing lines (braces, several  */
/* local declarations, pointer increments, loop control); confirm every     */
/* comment here against the complete file.                                  */
216 sal_Size
ImplUnicodeToDBCS( const void* pData
, SAL_UNUSED_PARAMETER
void*,
217 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
218 char* pDestBuf
, sal_Size nDestBytes
,
219 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
220 sal_Size
* pSrcCvtChars
)
226 const ImplUniToDBCSHighTab
* pHighEntry
;
227 const ImplDBCSConvertData
* pConvertData
= (const ImplDBCSConvertData
*)pData
;
228 const ImplUniToDBCSHighTab
* pHighTab
= pConvertData
->mpToDBCSHighTab
;
230 const sal_Unicode
* pEndSrcBuf
;
/* NOTE(review): the declaration that the next expression initializes is   */
/* not visible here; presumably it is bCheckRange, which is tested further */
/* down. The explanatory comment that follows it is also cut off before    */
/* its closing marker in this listing - confirm against the full file.     */
233 pConvertData
->mnLeadStart
!= 0 || pConvertData
->mnLeadEnd
!= 0xFF;
234 /* this statement has the effect that this extra check is only done for
235 EUC-KR, which uses the MS-949 tables, but does not support the full
239 pEndDestBuf
= pDestBuf
+nDestBytes
;
240 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
241 while ( pSrcBuf
< pEndSrcBuf
)
244 nHighChar
= (sal_uChar
)((c
>> 8) & 0xFF);
245 nLowChar
= (sal_uChar
)(c
& 0xFF);
247 /* get entry for the high byte */
248 pHighEntry
= pHighTab
+nHighChar
;
250 /* is low byte in the table range */
251 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
253 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* with bCheckRange set, detect results above 0x7F whose high or low byte */
/* lies outside the lead/trail range of pConvertData (the action taken is */
/* not visible in this listing)                                           */
254 if (bCheckRange
&& cConv
> 0x7F
255 && ((cConv
>> 8) < pConvertData
->mnLeadStart
256 || (cConv
>> 8) > pConvertData
->mnLeadEnd
257 || (cConv
& 0xFF) < pConvertData
->mnTrailStart
258 || (cConv
& 0xFF) > pConvertData
->mnTrailEnd
))
/* no mapping found for a non-zero character: try the fallbacks below */
264 if (cConv
== 0 && c
!= 0)
266 /* Map to EUDC ranges: */
267 ImplDBCSEUDCData
const * pEUDCTab
= pConvertData
->mpEUDCTab
;
269 for (i
= 0; i
< pConvertData
->mnEUDCCount
; ++i
)
271 if (c
>= pEUDCTab
->mnUniStart
&& c
<= pEUDCTab
->mnUniEnd
)
/* split the offset into the EUDC Unicode range by mnTrailRangeCount;     */
/* presumably the quotient is nLeadOff and the remainder nTrailOff - the  */
/* left-hand sides are not visible here, confirm                          */
273 sal_uInt32 nIndex
= c
- pEUDCTab
->mnUniStart
;
275 = nIndex
/ pEUDCTab
->mnTrailRangeCount
;
277 = nIndex
% pEUDCTab
->mnTrailRangeCount
;
280 ((pEUDCTab
->mnLeadStart
+ nLeadOff
) << 8);
282 = pEUDCTab
->mnTrail1End
- pEUDCTab
->mnTrail1Start
+ 1;
283 if (nTrailOff
< nSize
)
285 cConv
|= pEUDCTab
->mnTrail1Start
+ nTrailOff
;
290 = pEUDCTab
->mnTrail2End
- pEUDCTab
->mnTrail2Start
+ 1;
291 if (nTrailOff
< nSize
)
293 cConv
|= pEUDCTab
->mnTrail2Start
+ nTrailOff
;
297 cConv
|= pEUDCTab
->mnTrail3Start
+ nTrailOff
;
304 * SB: Not sure why this is in here. Plus, it does not work as
305 * intended when (c & 0xFF) == 0, because the next !cConv check
306 * will then think c has not yet been converted...
308 if (c
>= RTL_TEXTCVT_BYTE_PRIVATE_START
309 && c
<= RTL_TEXTCVT_BYTE_PRIVATE_END
)
311 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0
)
312 cConv
= static_cast< char >(static_cast< unsigned char >(c
& 0xFF));
318 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
323 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
328 /* Handle undefined and surrogates characters */
329 /* (all surrogates characters are undefined) */
330 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
331 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
, nFlags
,
/* a result with an empty high byte takes the one-byte output path; the */
/* two-byte path below writes the high byte first, then the low byte    */
339 if ( !(cConv
& 0xFF00) )
341 if ( pDestBuf
== pEndDestBuf
)
343 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
347 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
352 if ( pDestBuf
+1 >= pEndDestBuf
)
354 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
358 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
360 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* report the consumed source chars and return the advance of pDestBuf */
367 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
368 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
371 /* ======================================================================= */
/* Offsets used by ImplEUCJPToUnicode: lead and trail bytes are compared  */
/* against them (the "Undefined Range" check) and the offsets are then    */
/* subtracted from both bytes before the lead-table lookup.               */
373 #define JIS_EUC_LEAD_OFF 0x80
374 #define JIS_EUC_TRAIL_OFF 0x80
376 /* ----------------------------------------------------------------------- */
/* Converts an EUC-JP encoded byte buffer to Unicode. The visible code      */
/* handles an SS2 branch (half-width katakana), an SS3 branch that uses the */
/* JIS 0212 lead table, and a branch that uses the JIS 0208 lead table.     */
/*                                                                          */
/* pData        - cast to const ImplEUCJPConvertData*; supplies the         */
/*                JIS 0208 and JIS 0212 lead tables                         */
/* (unnamed)    - unused second pointer parameter                           */
/* pSrcBuf      - source bytes, nSrcBytes long                              */
/* pDestBuf     - destination buffer with room for nDestChars sal_Unicode   */
/* nFlags       - tested against the RTL_TEXTTOUNICODE_FLAGS_INVALID and    */
/*                _MBUNDEFINED masks to pick ERROR or IGNORE handling       */
/* pInfo        - RTL_TEXTTOUNICODE_INFO_* bits are OR-ed into *pInfo       */
/* pSrcCvtBytes - receives how far pSrcBuf advanced from its start value    */
/* Returns how far pDestBuf advanced from its start value.                  */
/*                                                                          */
/* NOTE(review): this listing appears to be missing lines (braces, the      */
/* branch conditions, pointer increments, loop control); confirm every      */
/* comment here against the complete file.                                  */
378 sal_Size
ImplEUCJPToUnicode( const void* pData
,
379 SAL_UNUSED_PARAMETER
void*,
380 const char* pSrcBuf
, sal_Size nSrcBytes
,
381 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
382 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
383 sal_Size
* pSrcCvtBytes
)
386 sal_uChar cLead
= '\0';
387 sal_uChar cTrail
= '\0';
389 const ImplDBCSToUniLeadTab
* pLeadEntry
;
390 const ImplDBCSToUniLeadTab
* pLeadTab
;
391 const ImplEUCJPConvertData
* pConvertData
= (const ImplEUCJPConvertData
*)pData
;
392 sal_Unicode
* pEndDestBuf
;
393 const char* pEndSrcBuf
;
/* one-past-the-end pointers of the destination and source buffers */
396 pEndDestBuf
= pDestBuf
+nDestChars
;
397 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
398 while ( pSrcBuf
< pEndSrcBuf
)
400 c
= (sal_uChar
)*pSrcBuf
;
407 /* SS2 - Half-width katakana */
411 /* Source buffer too small */
412 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
414 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
419 c
= (sal_uChar
)*pSrcBuf
;
/* bytes 0xA1-0xDF map linearly onto U+FF61 and following */
420 if ( (c
>= 0xA1) && (c
<= 0xDF) )
421 cConv
= 0xFF61+(c
-0xA1);
431 /* SS3 - JIS 0212-1990 */
432 /* 8F + A1-FE + A1-FE */
435 /* Source buffer too small */
436 if (pEndSrcBuf
- pSrcBuf
< 3)
438 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
443 cLead
= (sal_uChar
)*pSrcBuf
;
445 cTrail
= (sal_uChar
)*pSrcBuf
;
446 pLeadTab
= pConvertData
->mpJIS0212ToUniLeadTab
;
448 /* CodeSet 2 JIS 0208-1997 */
452 /* Source buffer too small */
453 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
455 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
461 cTrail
= (sal_uChar
)*pSrcBuf
;
462 pLeadTab
= pConvertData
->mpJIS0208ToUniLeadTab
;
465 /* Undefined Range */
466 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* strip the EUC offsets so both bytes index the selected JIS lead table */
470 cLead
-= JIS_EUC_LEAD_OFF
;
471 cTrail
-= JIS_EUC_TRAIL_OFF
;
472 pLeadEntry
= pLeadTab
+cLead
;
473 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
474 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
482 /* We compare the full range of the trail we defined, */
483 /* which can often be greater than the limit. We do this */
484 /* so that extensions that don't consider encodings */
485 /* correctly treat double-byte characters as a single */
486 /* character as much as possible. */
488 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
490 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
491 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
493 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
496 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
502 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
506 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
507 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
509 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
512 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
518 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* destination full: flag the error together with DESTBUFFERTOSMALL */
523 if ( pDestBuf
== pEndDestBuf
)
525 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* report the consumed source bytes and return the advance of pDestBuf */
534 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
535 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
538 /* ----------------------------------------------------------------------- */
/* Converts a buffer of sal_Unicode values to EUC-JP bytes. The visible     */
/* code maps half-width katakana arithmetically and otherwise looks the     */
/* character up in the JIS 0208 and JIS 0212 high tables from pData.        */
/*                                                                          */
/* pData        - cast to const ImplEUCJPConvertData*; supplies the         */
/*                Unicode-to-JIS 0208 and Unicode-to-JIS 0212 high tables   */
/* (unnamed)    - unused second pointer parameter                           */
/* pSrcBuf      - source characters, nSrcChars long                         */
/* pDestBuf     - destination buffer with room for nDestBytes bytes         */
/* nFlags       - RTL_UNICODETOTEXT_FLAGS_* bits (UNDEFINED_REPLACE and     */
/*                UNDEFINED_REPLACESTR are tested here)                     */
/* pInfo        - RTL_UNICODETOTEXT_INFO_* bits are OR-ed into *pInfo       */
/* pSrcCvtChars - receives how far pSrcBuf advanced from its start value    */
/* Returns how far pDestBuf advanced from its start value.                  */
/*                                                                          */
/* NOTE(review): this listing appears to be missing lines (braces, several  */
/* local declarations, pointer increments, loop control); confirm every     */
/* comment here against the complete file.                                  */
540 sal_Size
ImplUnicodeToEUCJP( const void* pData
,
541 SAL_UNUSED_PARAMETER
void*,
542 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
543 char* pDestBuf
, sal_Size nDestBytes
,
544 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
545 sal_Size
* pSrcCvtChars
)
551 const ImplUniToDBCSHighTab
* pHighEntry
;
552 const ImplUniToDBCSHighTab
* pHighTab
;
553 const ImplEUCJPConvertData
* pConvertData
= (const ImplEUCJPConvertData
*)pData
;
555 const sal_Unicode
* pEndSrcBuf
;
/* one-past-the-end pointers of the destination and source buffers */
558 pEndDestBuf
= pDestBuf
+nDestBytes
;
559 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
560 while ( pSrcBuf
< pEndSrcBuf
)
567 /* Half-width katakana */
/* U+FF61..U+FF9F become 0x8E in the high byte and 0xA1 + offset in the low byte */
568 else if ( (c
>= 0xFF61) && (c
<= 0xFF9F) )
569 cConv
= 0x8E00+0xA1+(c
-0xFF61);
/* split the character into high and low byte for the table lookups */
572 nHighChar
= (sal_uChar
)((c
>> 8) & 0xFF);
573 nLowChar
= (sal_uChar
)(c
& 0xFF);
/* lookup in the Unicode-to-JIS 0208 table */
576 pHighTab
= pConvertData
->mpUniToJIS0208HighTab
;
577 pHighEntry
= pHighTab
+nHighChar
;
578 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
580 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* lookup in the Unicode-to-JIS 0212 table */
590 pHighTab
= pConvertData
->mpUniToJIS0212HighTab
;
591 pHighEntry
= pHighTab
+nHighChar
;
592 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
594 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
601 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
606 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
611 /* Handle undefined and surrogates characters */
612 /* (all surrogates characters are undefined) */
613 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
614 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
,
/* cConv may occupy one, two or three bytes; the paths below test which */
/* of its upper bytes are set and write the most significant byte first */
624 if ( !(cConv
& 0xFFFF00) )
626 if ( pDestBuf
== pEndDestBuf
)
628 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
632 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
636 else if ( !(cConv
& 0xFF0000) )
638 if ( pDestBuf
+1 >= pEndDestBuf
)
640 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
644 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
646 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
651 if ( pDestBuf
+2 >= pEndDestBuf
)
653 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
657 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 16) & 0xFF));
659 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
661 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* report the consumed source chars and return the advance of pDestBuf */
668 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
669 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
672 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */