1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <rtl/textcvt.h>
24 #include "handleundefinedunicodetotextchar.hxx"
25 #include "tenchelp.hxx"
26 #include "unichars.hxx"
28 /* The DBCS to Unicode conversion routines use a lead table for the first */
29 /* byte, which gives either the trail table or, for single byte chars, the */
30 /* unicode value. There is a separate table for every lead byte, because */
31 /* we can then share many tables between different charset encodings. */
/* ImplDBCSToUnicode: convert a double-byte (DBCS) encoded byte buffer to    */
/* UTF-16 code units using the tables in an ImplDBCSConvertData.             */
/*                                                                           */
/*   pData        - cast to const ImplDBCSConvertData*; supplies the lead    */
/*                  table, the lead/trail limits and the EUDC ranges         */
/*   (void*)      - unused context parameter                                 */
/*   pSrcBuf      - source bytes, nSrcBytes long                             */
/*   pDestBuf     - destination buffer with room for nDestChars units        */
/*   nFlags       - RTL_TEXTTOUNICODE_FLAGS_* selecting the reaction to      */
/*                  undefined, multi-byte undefined and invalid input        */
/*   pInfo        - RTL_TEXTTOUNICODE_INFO_* bits are or-ed in here          */
/*   pSrcCvtBytes - receives nSrcBytes minus the bytes left before the end   */
/*                  of the source buffer                                     */
/*                                                                           */
/* Returns nDestChars minus the space left between pDestBuf and the end of   */
/* the destination buffer.                                                   */
/*                                                                           */
/* NOTE(review): the embedded line numbers in this listing skip (e.g. 37 to  */
/* 41) and cConv, cTrail and i are used without a visible declaration, so    */
/* braces, declarations and some statements appear to be missing here.       */
/* Restore them from the upstream file before building.                      */
33 sal_Size
ImplDBCSToUnicode( const void* pData
, SAL_UNUSED_PARAMETER
void*,
34 const char* pSrcBuf
, sal_Size nSrcBytes
,
35 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
36 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
37 sal_Size
* pSrcCvtBytes
)
41 const ImplDBCSToUniLeadTab
* pLeadEntry
;
42 const ImplDBCSConvertData
* pConvertData
= static_cast<const ImplDBCSConvertData
*>(pData
);
43 const ImplDBCSToUniLeadTab
* pLeadTab
= pConvertData
->mpToUniLeadTab
;
44 sal_Unicode
* pEndDestBuf
;
45 const char* pEndSrcBuf
;
/* One-past-the-end pointers of the destination and source buffers */
48 pEndDestBuf
= pDestBuf
+nDestChars
;
49 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
50 while ( pSrcBuf
< pEndSrcBuf
)
52 unsigned char cLead
= static_cast<unsigned char>(*pSrcBuf
);
54 /* get entry for the lead byte */
55 pLeadEntry
= pLeadTab
+cLead
;
/* A byte counts as single byte when its lead entry has no trail table or */
/* when it lies outside the lead range of this encoding */
57 /* SingleByte char? */
58 if (pLeadEntry
->mpToUniTrailTab
== nullptr
59 || cLead
< pConvertData
->mnLeadStart
60 || cLead
> pConvertData
->mnLeadEnd
)
62 cConv
= pLeadEntry
->mnUniChar
;
/* A zero result for a non-NUL byte means there is no mapping */
63 if ( !cConv
&& (cLead
!= 0) )
65 *pInfo
|= RTL_TEXTTOUNICODE_INFO_UNDEFINED
;
66 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
)
68 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
71 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE
)
76 cConv
= ImplGetUndefinedUnicodeChar(cLead
, nFlags
);
81 /* Source buffer too small */
82 if ( pSrcBuf
+1 == pEndSrcBuf
)
/* Only the lead byte is left; this is flagged unless FLUSH is set */
84 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0 )
86 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
;
94 cTrail
= static_cast<unsigned char>(*pSrcBuf
);
/* Trail byte inside the range covered by this lead entry: table lookup */
95 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
96 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
/* Check the EUDC table entries for one whose lead range contains cLead; */
/* each entry has up to three trail ranges mapped from mnUniStart upwards */
104 const ImplDBCSEUDCData
* pEUDCTab
= pConvertData
->mpEUDCTab
;
105 for ( i
= 0; i
< pConvertData
->mnEUDCCount
; i
++ )
107 if ( (cLead
>= pEUDCTab
->mnLeadStart
) &&
108 (cLead
<= pEUDCTab
->mnLeadEnd
) )
110 if ( (cTrail
>= pEUDCTab
->mnTrail1Start
) &&
111 (cTrail
<= pEUDCTab
->mnTrail1End
) )
113 cConv
= pEUDCTab
->mnUniStart
+
114 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
115 (cTrail
-pEUDCTab
->mnTrail1Start
);
/* Number of trail bytes covered by the first trail range */
118 sal_uInt16 nTrailCount
= pEUDCTab
->mnTrail1End
-pEUDCTab
->mnTrail1Start
+1;
119 if ( (pEUDCTab
->mnTrailCount
>= 2) &&
120 (cTrail
>= pEUDCTab
->mnTrail2Start
) &&
121 (cTrail
<= pEUDCTab
->mnTrail2End
) )
123 cConv
= pEUDCTab
->mnUniStart
+
124 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
126 (cTrail
-pEUDCTab
->mnTrail2Start
);
/* Number of trail bytes covered by the second trail range */
129 nTrailCount
= pEUDCTab
->mnTrail2End
-pEUDCTab
->mnTrail2Start
+1;
130 if ( (pEUDCTab
->mnTrailCount
>= 3) &&
131 (cTrail
>= pEUDCTab
->mnTrail3Start
) &&
132 (cTrail
<= pEUDCTab
->mnTrail3End
) )
134 cConv
= pEUDCTab
->mnUniStart
+
135 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
137 (cTrail
-pEUDCTab
->mnTrail3Start
);
147 /* We compare the full range of the trail we defined, */
148 /* which can often be greater than the limit. We do this */
149 /* so that extensions that don't consider encodings */
150 /* correctly treat double-byte characters as a single */
151 /* character as much as possible. */
153 if (cLead
< pConvertData
->mnLeadStart
154 || cLead
> pConvertData
->mnLeadEnd
155 || cTrail
< pConvertData
->mnTrailStart
156 || cTrail
> pConvertData
->mnTrailEnd
)
/* Lead or trail byte outside the overall limits: invalid sequence */
158 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
159 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
161 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
164 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
169 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Reported as an undefined multi-byte character (presumably the branch */
/* for bytes inside the limits; the else is not visible in this listing) */
176 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
177 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
179 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
182 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
187 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* No room left in the destination buffer */
191 if ( pDestBuf
== pEndDestBuf
)
193 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
/* Report the consumed source bytes and the used destination space */
202 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
203 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
/* ImplUnicodeToDBCS: convert UTF-16 code units to a double-byte (DBCS)      */
/* encoding using the tables in an ImplDBCSConvertData.                      */
/*                                                                           */
/*   pData        - cast to const ImplDBCSConvertData*; supplies the high    */
/*                  byte table, the lead/trail limits and the EUDC ranges    */
/*   (void*)      - unused context parameter                                 */
/*   pSrcBuf      - source code units, nSrcChars long                        */
/*   pDestBuf     - destination buffer with room for nDestBytes bytes        */
/*   nFlags       - RTL_UNICODETOTEXT_FLAGS_* bits tested below              */
/*   pInfo        - RTL_UNICODETOTEXT_INFO_* bits are or-ed in here          */
/*   pSrcCvtChars - receives nSrcChars minus the units left before the end   */
/*                  of the source buffer                                     */
/*                                                                           */
/* Returns nDestBytes minus the space left between pDestBuf and the end of   */
/* the destination buffer.                                                   */
/*                                                                           */
/* NOTE(review): this listing has lost lines. The declarations of c, cConv,  */
/* i, bCheckRange, pEndDestBuf, nLeadOff, nTrailOff and nSize are not        */
/* visible, the comment opened at embedded line 222 is never closed, and     */
/* the comment at embedded lines 291 to 293 has lost its opener. Restore     */
/* the function from the upstream file before building.                      */
206 sal_Size
ImplUnicodeToDBCS( const void* pData
, SAL_UNUSED_PARAMETER
void*,
207 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
208 char* pDestBuf
, sal_Size nDestBytes
,
209 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
210 sal_Size
* pSrcCvtChars
)
214 const ImplUniToDBCSHighTab
* pHighEntry
;
215 const ImplDBCSConvertData
* pConvertData
= static_cast<const ImplDBCSConvertData
*>(pData
);
216 const ImplUniToDBCSHighTab
* pHighTab
= pConvertData
->mpToDBCSHighTab
;
218 const sal_Unicode
* pEndSrcBuf
;
/* True unless the lead range is the full 0x00 to 0xFF; presumably the */
/* initializer of bCheckRange, whose declaration line is not visible */
221 pConvertData
->mnLeadStart
!= 0 || pConvertData
->mnLeadEnd
!= 0xFF;
222 /* this statement has the effect that this extra check is only done for
223 EUC-KR, which uses the MS-949 tables, but does not support the full
227 pEndDestBuf
= pDestBuf
+nDestBytes
;
228 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
229 while ( pSrcBuf
< pEndSrcBuf
)
232 unsigned char nHighChar
= static_cast<unsigned char>((c
>> 8) & 0xFF);
233 unsigned char nLowChar
= static_cast<unsigned char>(c
& 0xFF);
235 /* get entry for the high byte */
236 pHighEntry
= pHighTab
+nHighChar
;
238 /* is low byte in the table range */
239 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
/* The table member is named mpToUniTrailTab even though this is the */
/* Unicode to DBCS direction */
241 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* With bCheckRange set, a result above 0x7F is tested against the */
/* lead and trail limits of this encoding */
242 if (bCheckRange
&& cConv
> 0x7F
243 && ((cConv
>> 8) < pConvertData
->mnLeadStart
244 || (cConv
>> 8) > pConvertData
->mnLeadEnd
245 || (cConv
& 0xFF) < pConvertData
->mnTrailStart
246 || (cConv
& 0xFF) > pConvertData
->mnTrailEnd
))
/* A zero result for a non-NUL character means no table mapping so far */
252 if (cConv
== 0 && c
!= 0)
254 /* Map to EUDC ranges: */
255 ImplDBCSEUDCData
const * pEUDCTab
= pConvertData
->mpEUDCTab
;
257 for (i
= 0; i
< pConvertData
->mnEUDCCount
; ++i
)
259 if (c
>= pEUDCTab
->mnUniStart
&& c
<= pEUDCTab
->mnUniEnd
)
/* Offset of c inside this EUDC range; the quotient and remainder by */
/* mnTrailRangeCount are presumably stored in nLeadOff and nTrailOff */
261 sal_uInt32 nIndex
= c
- pEUDCTab
->mnUniStart
;
263 = nIndex
/ pEUDCTab
->mnTrailRangeCount
;
265 = nIndex
% pEUDCTab
->mnTrailRangeCount
;
/* Lead byte goes into the high byte of the result */
267 cConv
= static_cast<sal_uInt16
>((pEUDCTab
->mnLeadStart
+ nLeadOff
) << 8);
/* Pick the trail range that the trail offset falls into */
269 = pEUDCTab
->mnTrail1End
- pEUDCTab
->mnTrail1Start
+ 1;
270 if (nTrailOff
< nSize
)
272 cConv
|= pEUDCTab
->mnTrail1Start
+ nTrailOff
;
277 = pEUDCTab
->mnTrail2End
- pEUDCTab
->mnTrail2Start
+ 1;
278 if (nTrailOff
< nSize
)
280 cConv
|= pEUDCTab
->mnTrail2Start
+ nTrailOff
;
284 cConv
|= pEUDCTab
->mnTrail3Start
+ nTrailOff
;
291 * SB: Not sure why this is in here. Plus, it does not work as
292 * intended when (c & 0xFF) == 0, because the next !cConv check
293 * will then think c has not yet been converted...
295 if (c
>= RTL_TEXTCVT_BYTE_PRIVATE_START
296 && c
<= RTL_TEXTCVT_BYTE_PRIVATE_END
)
/* With PRIVATE_MAPTO0 set, the low byte of c is used as the result */
298 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0
)
299 cConv
= static_cast< char >(static_cast< unsigned char >(c
& 0xFF));
305 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
310 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
315 /* Handle undefined and surrogate characters */
316 /* (all surrogate characters are undefined) */
317 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
318 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
, nFlags
,
/* High byte clear: the result is written as a single byte */
325 if ( !(cConv
& 0xFF00) )
327 if ( pDestBuf
== pEndDestBuf
)
329 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
333 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* Two byte result: two free destination bytes are needed */
338 if ( pDestBuf
+1 >= pEndDestBuf
)
340 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
344 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
346 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* Report the consumed source units and the used destination space */
353 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
354 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
/* Offsets that ImplEUCJPToUnicode tests the lead and trail bytes against */
/* and then subtracts from them before indexing the JIS tables. */
357 #define JIS_EUC_LEAD_OFF 0x80
358 #define JIS_EUC_TRAIL_OFF 0x80
/* ImplEUCJPToUnicode: convert an EUC-JP encoded byte buffer to UTF-16 code  */
/* units using the JIS 0208 and JIS 0212 lead tables of an                   */
/* ImplEUCJPConvertData.                                                     */
/*                                                                           */
/*   pData        - cast to const ImplEUCJPConvertData*                      */
/*   (void*)      - unused context parameter                                 */
/*   pSrcBuf      - source bytes, nSrcBytes long                             */
/*   pDestBuf     - destination buffer with room for nDestChars units        */
/*   nFlags       - RTL_TEXTTOUNICODE_FLAGS_* selecting the reaction to      */
/*                  invalid and undefined multi-byte input                   */
/*   pInfo        - RTL_TEXTTOUNICODE_INFO_* bits are or-ed in here          */
/*   pSrcCvtBytes - receives nSrcBytes minus the bytes left before the end   */
/*                  of the source buffer                                     */
/*                                                                           */
/* Returns nDestChars minus the space left between pDestBuf and the end of   */
/* the destination buffer.                                                   */
/*                                                                           */
/* NOTE(review): this listing has lost lines. The declaration of cConv, the  */
/* tests that select the SS2 / SS3 / two byte branches and all braces are    */
/* not visible. Restore the function from the upstream file before building. */
360 sal_Size
ImplEUCJPToUnicode( const void* pData
,
361 SAL_UNUSED_PARAMETER
void*,
362 const char* pSrcBuf
, sal_Size nSrcBytes
,
363 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
364 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
365 sal_Size
* pSrcCvtBytes
)
367 unsigned char cLead
= '\0';
368 unsigned char cTrail
= '\0';
370 const ImplDBCSToUniLeadTab
* pLeadEntry
;
371 const ImplDBCSToUniLeadTab
* pLeadTab
;
372 const ImplEUCJPConvertData
* pConvertData
= static_cast<const ImplEUCJPConvertData
*>(pData
);
373 sal_Unicode
* pEndDestBuf
;
374 const char* pEndSrcBuf
;
/* One-past-the-end pointers of the destination and source buffers */
377 pEndDestBuf
= pDestBuf
+nDestChars
;
378 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
379 while ( pSrcBuf
< pEndSrcBuf
)
381 unsigned char c
= static_cast<unsigned char>(*pSrcBuf
);
388 /* SS2 - Half-width katakana */
392 /* Source buffer too small */
393 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
395 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
;
400 c
= static_cast<unsigned char>(*pSrcBuf
);
/* Bytes 0xA1 to 0xDF map linearly onto U+FF61 upwards */
401 if ( (c
>= 0xA1) && (c
<= 0xDF) )
402 cConv
= 0xFF61+(c
-0xA1);
412 /* SS3 - JIS 0212-1990 */
413 /* 8F + A1-FE + A1-FE */
416 /* Source buffer too small */
417 if (pEndSrcBuf
- pSrcBuf
< 3)
419 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
;
424 cLead
= static_cast<unsigned char>(*pSrcBuf
);
426 cTrail
= static_cast<unsigned char>(*pSrcBuf
);
427 pLeadTab
= pConvertData
->mpJIS0212ToUniLeadTab
;
429 /* CodeSet 2 JIS 0208-1997 */
433 /* Source buffer too small */
434 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
436 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
;
442 cTrail
= static_cast<unsigned char>(*pSrcBuf
);
443 pLeadTab
= pConvertData
->mpJIS0208ToUniLeadTab
;
446 /* Undefined Range */
447 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* Remove the EUC offsets, then look the pair up in the selected table */
451 cLead
-= JIS_EUC_LEAD_OFF
;
452 cTrail
-= JIS_EUC_TRAIL_OFF
;
453 pLeadEntry
= pLeadTab
+cLead
;
454 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
455 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
463 /* We compare the full range of the trail we defined, */
464 /* which can often be greater than the limit. We do this */
465 /* so that extensions that don't consider encodings */
466 /* correctly treat double-byte characters as a single */
467 /* character as much as possible. */
469 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* Lead or trail byte below the EUC offset: invalid sequence */
471 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
472 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
474 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
477 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
482 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* Reported as an undefined multi-byte character (presumably the else */
/* branch; the else is not visible in this listing) */
486 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
487 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
489 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
492 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
497 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* No room left in the destination buffer */
502 if ( pDestBuf
== pEndDestBuf
)
504 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
/* Report the consumed source bytes and the used destination space */
513 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
514 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
/* ImplUnicodeToEUCJP: convert UTF-16 code units to EUC-JP using the JIS     */
/* 0208 and JIS 0212 high byte tables of an ImplEUCJPConvertData.            */
/*                                                                           */
/*   pData        - cast to const ImplEUCJPConvertData*                      */
/*   (void*)      - unused context parameter                                 */
/*   pSrcBuf      - source code units, nSrcChars long                        */
/*   pDestBuf     - destination buffer with room for nDestBytes bytes        */
/*   nFlags       - RTL_UNICODETOTEXT_FLAGS_* bits tested below              */
/*   pInfo        - RTL_UNICODETOTEXT_INFO_* bits are or-ed in here          */
/*   pSrcCvtChars - receives nSrcChars minus the units left before the end   */
/*                  of the source buffer                                     */
/*                                                                           */
/* The converted value cConv is written as one, two or three bytes           */
/* depending on which of its upper bytes are zero.                           */
/*                                                                           */
/* Returns nDestBytes minus the space left between pDestBuf and the end of   */
/* the destination buffer.                                                   */
/*                                                                           */
/* NOTE(review): this listing has lost lines. The declarations of c, cConv   */
/* and pEndDestBuf, the first branch before the half-width katakana else-if  */
/* and all braces are not visible. Restore the function from the upstream    */
/* file before building.                                                     */
517 sal_Size
ImplUnicodeToEUCJP( const void* pData
,
518 SAL_UNUSED_PARAMETER
void*,
519 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
520 char* pDestBuf
, sal_Size nDestBytes
,
521 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
522 sal_Size
* pSrcCvtChars
)
526 unsigned char nHighChar
;
527 unsigned char nLowChar
;
528 const ImplUniToDBCSHighTab
* pHighEntry
;
529 const ImplUniToDBCSHighTab
* pHighTab
;
530 const ImplEUCJPConvertData
* pConvertData
= static_cast<const ImplEUCJPConvertData
*>(pData
);
532 const sal_Unicode
* pEndSrcBuf
;
/* One-past-the-end pointers of the destination and source buffers */
535 pEndDestBuf
= pDestBuf
+nDestBytes
;
536 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
537 while ( pSrcBuf
< pEndSrcBuf
)
/* U+FF61 to U+FF9F become the two byte value 0x8E followed by 0xA1 upwards */
544 /* Half-width katakana */
545 else if ( (c
>= 0xFF61) && (c
<= 0xFF9F) )
546 cConv
= 0x8E00+0xA1+(c
-0xFF61);
549 nHighChar
= static_cast<unsigned char>((c
>> 8) & 0xFF);
550 nLowChar
= static_cast<unsigned char>(c
& 0xFF);
/* Lookup in the JIS 0208 table, indexed by high byte then low byte */
553 pHighTab
= pConvertData
->mpUniToJIS0208HighTab
;
554 pHighEntry
= pHighTab
+nHighChar
;
555 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
557 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* Lookup in the JIS 0212 table, indexed the same way */
567 pHighTab
= pConvertData
->mpUniToJIS0212HighTab
;
568 pHighEntry
= pHighTab
+nHighChar
;
569 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
571 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
578 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
583 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
588 /* Handle undefined and surrogate characters */
589 /* (all surrogate characters are undefined) */
590 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
591 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
,
/* Upper two bytes clear: the result is written as a single byte */
600 if ( !(cConv
& 0xFFFF00) )
602 if ( pDestBuf
== pEndDestBuf
)
604 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
608 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* Top byte clear: two byte result, two free destination bytes needed */
612 else if ( !(cConv
& 0xFF0000) )
614 if ( pDestBuf
+1 >= pEndDestBuf
)
616 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
620 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
622 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* Three byte result: three free destination bytes needed */
627 if ( pDestBuf
+2 >= pEndDestBuf
)
629 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
633 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 16) & 0xFF));
635 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
637 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* Report the consumed source units and the used destination space */
644 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
645 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
648 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */