1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "sal/config.h"
22 #include "rtl/textcvt.h"
24 #include "handleundefinedunicodetotextchar.hxx"
25 #include "tenchelp.hxx"
26 #include "unichars.hxx"
28 /* ======================================================================= */
30 /* The DBCS to Unicode conversion routines use a lead table for the first */
31 /* byte, from which we determine the trail table or, for single-byte chars, */
32 /* the Unicode value. We have a separate table for every lead byte, because */
33 /* we can then share many tables between different charset encodings. */
35 /* ======================================================================= */
/*
 * Converts a double-byte character set (DBCS) byte string to sal_Unicode.
 *
 * pData        - converter tables; cast to const ImplDBCSConvertData*.
 * (void*)      - unnamed, unused parameter.
 * pSrcBuf      - source bytes; nSrcBytes gives their count.
 * pDestBuf     - destination buffer; nDestChars gives its capacity.
 * nFlags       - RTL_TEXTTOUNICODE_FLAGS_* bits; the UNDEFINED, INVALID and
 *                MBUNDEFINED masks are compared against their _ERROR and
 *                _IGNORE values, and FLUSH is tested for a truncated input.
 * pInfo        - receives RTL_TEXTTOUNICODE_INFO_* status bits (OR-ed in).
 * pSrcCvtBytes - receives nSrcBytes minus the bytes left before pEndSrcBuf.
 *
 * Returns nDestChars minus the space left before pEndDestBuf.
 *
 * The lead byte indexes mpToUniLeadTab. A lead entry without a trail table,
 * or a lead byte outside [mnLeadStart, mnLeadEnd], is treated as a
 * single-byte char and mapped through mnUniChar. Otherwise the next byte is
 * looked up in the entry's trail table and, failing that, in the EUDC ranges
 * of the converter data.
 *
 * NOTE(review): this copy of the function shows no braces and lacks some
 * declarations/statements (e.g. the declaration of cConv); verify the exact
 * control flow against the complete source.
 */
37 sal_Size
ImplDBCSToUnicode( const void* pData
, SAL_UNUSED_PARAMETER
void*,
38 const char* pSrcBuf
, sal_Size nSrcBytes
,
39 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
40 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
41 sal_Size
* pSrcCvtBytes
)
45 const ImplDBCSToUniLeadTab
* pLeadEntry
;
46 const ImplDBCSConvertData
* pConvertData
= static_cast<const ImplDBCSConvertData
*>(pData
);
47 const ImplDBCSToUniLeadTab
* pLeadTab
= pConvertData
->mpToUniLeadTab
;
48 sal_Unicode
* pEndDestBuf
;
49 const char* pEndSrcBuf
;
/* one-past-the-end pointers of the destination and source buffers */
52 pEndDestBuf
= pDestBuf
+nDestChars
;
53 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
54 while ( pSrcBuf
< pEndSrcBuf
)
56 unsigned char cLead
= (unsigned char)*pSrcBuf
;
58 /* get entry for the lead byte */
59 pLeadEntry
= pLeadTab
+cLead
;
61 /* SingleByte char? */
62 if (pLeadEntry
->mpToUniTrailTab
== nullptr
63 || cLead
< pConvertData
->mnLeadStart
64 || cLead
> pConvertData
->mnLeadEnd
)
66 cConv
= pLeadEntry
->mnUniChar
;
/* a zero mapping for a non-zero byte means the byte is undefined */
67 if ( !cConv
&& (cLead
!= 0) )
69 *pInfo
|= RTL_TEXTTOUNICODE_INFO_UNDEFINED
;
70 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
)
72 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
75 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE
)
81 cConv
= ImplGetUndefinedUnicodeChar(cLead
, nFlags
);
86 /* Source buffer too small */
87 if ( pSrcBuf
+1 == pEndSrcBuf
)
/* only the lead byte is left; an error unless the caller is flushing */
89 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0 )
91 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
99 cTrail
= (unsigned char)*pSrcBuf
;
/* regular mapping: trail byte indexes the lead entry's trail table */
100 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
101 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
/* EUDC ranges: find a range containing the lead byte, then test up to */
/* three trail byte ranges of that entry */
109 const ImplDBCSEUDCData
* pEUDCTab
= pConvertData
->mpEUDCTab
;
110 for ( i
= 0; i
< pConvertData
->mnEUDCCount
; i
++ )
112 if ( (cLead
>= pEUDCTab
->mnLeadStart
) &&
113 (cLead
<= pEUDCTab
->mnLeadEnd
) )
115 if ( (cTrail
>= pEUDCTab
->mnTrail1Start
) &&
116 (cTrail
<= pEUDCTab
->mnTrail1End
) )
118 cConv
= pEUDCTab
->mnUniStart
+
119 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
120 (cTrail
-pEUDCTab
->mnTrail1Start
);
/* number of trail bytes in the first trail range */
125 sal_uInt16 nTrailCount
= pEUDCTab
->mnTrail1End
-pEUDCTab
->mnTrail1Start
+1;
126 if ( (pEUDCTab
->mnTrailCount
>= 2) &&
127 (cTrail
>= pEUDCTab
->mnTrail2Start
) &&
128 (cTrail
<= pEUDCTab
->mnTrail2End
) )
130 cConv
= pEUDCTab
->mnUniStart
+
131 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
133 (cTrail
-pEUDCTab
->mnTrail2Start
);
/* number of trail bytes in the second trail range */
138 nTrailCount
= pEUDCTab
->mnTrail2End
-pEUDCTab
->mnTrail2Start
+1;
139 if ( (pEUDCTab
->mnTrailCount
>= 3) &&
140 (cTrail
>= pEUDCTab
->mnTrail3Start
) &&
141 (cTrail
<= pEUDCTab
->mnTrail3End
) )
143 cConv
= pEUDCTab
->mnUniStart
+
144 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
146 (cTrail
-pEUDCTab
->mnTrail3Start
);
158 /* We compare the full range of the trail we defined, */
159 /* which can often be greater than the limit. We do this */
160 /* so that extensions that don't consider encodings */
161 /* correctly treat double-byte characters as a single */
162 /* character as much as possible. */
164 if (cLead
< pConvertData
->mnLeadStart
165 || cLead
> pConvertData
->mnLeadEnd
166 || cTrail
< pConvertData
->mnTrailStart
167 || cTrail
> pConvertData
->mnTrailEnd
)
169 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
170 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
172 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
175 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
181 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* flag the multi-byte sequence as undefined */
188 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
189 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
191 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
194 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
200 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* destination buffer exhausted */
204 if ( pDestBuf
== pEndDestBuf
)
206 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* source bytes consumed = total minus what is left before pEndSrcBuf */
215 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
216 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
219 /* ----------------------------------------------------------------------- */
/*
 * Converts sal_Unicode characters to a double-byte character set (DBCS)
 * byte string.
 *
 * pData        - converter tables; cast to const ImplDBCSConvertData*.
 * (void*)      - unnamed, unused parameter.
 * pSrcBuf      - source characters; nSrcChars gives their count.
 * pDestBuf     - destination byte buffer; nDestBytes gives its capacity.
 * nFlags       - RTL_UNICODETOTEXT_FLAGS_* bits (PRIVATE_MAPTO0,
 *                UNDEFINED_REPLACE and UNDEFINED_REPLACESTR are tested here;
 *                nFlags is also passed to handleUndefinedUnicodeToTextChar).
 * pInfo        - receives RTL_UNICODETOTEXT_INFO_* status bits (OR-ed in).
 * pSrcCvtChars - receives nSrcChars minus the chars left before pEndSrcBuf.
 *
 * Returns nDestBytes minus the space left before pEndDestBuf.
 *
 * Each character c is split into a high and a low byte. The high byte
 * selects an entry of mpToDBCSHighTab; if the low byte lies within
 * [mnLowStart, mnLowEnd] the entry's table yields the converted value.
 * Characters inside an EUDC Unicode range are mapped arithmetically to a
 * lead/trail byte pair. A result with no bits in 0xFF00 is written as one
 * byte, any other result as two bytes (high byte first).
 *
 * NOTE(review): this copy of the function shows no braces and lacks some
 * declarations/statements (e.g. of cConv, c and bCheckRange), and two
 * comments in the body have lost their delimiters; verify the exact control
 * flow against the complete source.
 */
221 sal_Size
ImplUnicodeToDBCS( const void* pData
, SAL_UNUSED_PARAMETER
void*,
222 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
223 char* pDestBuf
, sal_Size nDestBytes
,
224 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
225 sal_Size
* pSrcCvtChars
)
229 const ImplUniToDBCSHighTab
* pHighEntry
;
230 const ImplDBCSConvertData
* pConvertData
= static_cast<const ImplDBCSConvertData
*>(pData
);
231 const ImplUniToDBCSHighTab
* pHighTab
= pConvertData
->mpToDBCSHighTab
;
233 const sal_Unicode
* pEndSrcBuf
;
236 pConvertData
->mnLeadStart
!= 0 || pConvertData
->mnLeadEnd
!= 0xFF;
237 /* this statement has the effect that this extra check is only done for
238 EUC-KR, which uses the MS-949 tables, but does not support the full
242 pEndDestBuf
= pDestBuf
+nDestBytes
;
243 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
244 while ( pSrcBuf
< pEndSrcBuf
)
247 unsigned char nHighChar
= (unsigned char)((c
>> 8) & 0xFF);
248 unsigned char nLowChar
= (unsigned char)(c
& 0xFF);
250 /* get entry for the high byte */
251 pHighEntry
= pHighTab
+nHighChar
;
253 /* is low byte in the table range */
254 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
256 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* with range checking on, test whether a multi-byte result (> 0x7F) */
/* has a lead or trail byte outside the converter's declared ranges */
257 if (bCheckRange
&& cConv
> 0x7F
258 && ((cConv
>> 8) < pConvertData
->mnLeadStart
259 || (cConv
>> 8) > pConvertData
->mnLeadEnd
260 || (cConv
& 0xFF) < pConvertData
->mnTrailStart
261 || (cConv
& 0xFF) > pConvertData
->mnTrailEnd
))
/* nothing found for a non-zero character */
267 if (cConv
== 0 && c
!= 0)
269 /* Map to EUDC ranges: */
270 ImplDBCSEUDCData
const * pEUDCTab
= pConvertData
->mpEUDCTab
;
272 for (i
= 0; i
< pConvertData
->mnEUDCCount
; ++i
)
274 if (c
>= pEUDCTab
->mnUniStart
&& c
<= pEUDCTab
->mnUniEnd
)
/* offset of c within this EUDC Unicode range; the quotient and */
/* remainder by mnTrailRangeCount are computed from it below */
276 sal_uInt32 nIndex
= c
- pEUDCTab
->mnUniStart
;
278 = nIndex
/ pEUDCTab
->mnTrailRangeCount
;
280 = nIndex
% pEUDCTab
->mnTrailRangeCount
;
283 ((pEUDCTab
->mnLeadStart
+ nLeadOff
) << 8);
/* size of the first trail range */
285 = pEUDCTab
->mnTrail1End
- pEUDCTab
->mnTrail1Start
+ 1;
286 if (nTrailOff
< nSize
)
288 cConv
|= pEUDCTab
->mnTrail1Start
+ nTrailOff
;
/* size of the second trail range */
293 = pEUDCTab
->mnTrail2End
- pEUDCTab
->mnTrail2Start
+ 1;
294 if (nTrailOff
< nSize
)
296 cConv
|= pEUDCTab
->mnTrail2Start
+ nTrailOff
;
300 cConv
|= pEUDCTab
->mnTrail3Start
+ nTrailOff
;
307 * SB: Not sure why this is in here. Plus, it does not work as
308 * intended when (c & 0xFF) == 0, because the next !cConv check
309 * will then think c has not yet been converted...
311 if (c
>= RTL_TEXTCVT_BYTE_PRIVATE_START
312 && c
<= RTL_TEXTCVT_BYTE_PRIVATE_END
)
314 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0
)
315 cConv
= static_cast< char >(static_cast< unsigned char >(c
& 0xFF));
321 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
326 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
331 /* Handle undefined and surrogates characters */
332 /* (all surrogates characters are undefined) */
333 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
334 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
, nFlags
,
/* no bits in the high byte: the result is written as one byte */
342 if ( !(cConv
& 0xFF00) )
344 if ( pDestBuf
== pEndDestBuf
)
346 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
350 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* two bytes are needed: high (lead) byte first, then low (trail) byte */
355 if ( pDestBuf
+1 >= pEndDestBuf
)
357 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
361 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
363 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* source chars consumed = total minus what is left before pEndSrcBuf */
370 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
371 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
374 /* ======================================================================= */
/* Offsets between EUC-JP lead/trail bytes and the indices of the JIS */
/* tables: ImplEUCJPToUnicode rejects bytes below these values and */
/* subtracts them from the lead and trail byte before the table lookup. */
376 #define JIS_EUC_LEAD_OFF 0x80
377 #define JIS_EUC_TRAIL_OFF 0x80
379 /* ----------------------------------------------------------------------- */
/*
 * Converts an EUC-JP byte string to sal_Unicode.
 *
 * pData        - converter tables; cast to const ImplEUCJPConvertData*.
 * (void*)      - unnamed, unused parameter.
 * pSrcBuf      - source bytes; nSrcBytes gives their count.
 * pDestBuf     - destination buffer; nDestChars gives its capacity.
 * nFlags       - RTL_TEXTTOUNICODE_FLAGS_* bits; the INVALID and MBUNDEFINED
 *                masks are compared against their _ERROR and _IGNORE values.
 * pInfo        - receives RTL_TEXTTOUNICODE_INFO_* status bits (OR-ed in).
 * pSrcCvtBytes - receives nSrcBytes minus the bytes left before pEndSrcBuf.
 *
 * Returns nDestChars minus the space left before pEndDestBuf.
 *
 * Three multi-byte forms are handled, per the comments in the body: SS2
 * (half-width katakana, trail byte A1..DF), SS3 (JIS 0212, three bytes) and
 * plain two-byte JIS 0208. For the JIS forms the lead and trail bytes are
 * reduced by JIS_EUC_LEAD_OFF / JIS_EUC_TRAIL_OFF and looked up in the
 * selected lead table.
 *
 * NOTE(review): this copy of the function shows no braces and lacks some
 * declarations/statements (e.g. the declaration of cConv and the tests that
 * select the SS2/SS3 branches); verify the exact control flow against the
 * complete source.
 */
381 sal_Size
ImplEUCJPToUnicode( const void* pData
,
382 SAL_UNUSED_PARAMETER
void*,
383 const char* pSrcBuf
, sal_Size nSrcBytes
,
384 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
385 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
386 sal_Size
* pSrcCvtBytes
)
388 unsigned char cLead
= '\0';
389 unsigned char cTrail
= '\0';
391 const ImplDBCSToUniLeadTab
* pLeadEntry
;
392 const ImplDBCSToUniLeadTab
* pLeadTab
;
393 const ImplEUCJPConvertData
* pConvertData
= static_cast<const ImplEUCJPConvertData
*>(pData
);
394 sal_Unicode
* pEndDestBuf
;
395 const char* pEndSrcBuf
;
/* one-past-the-end pointers of the destination and source buffers */
398 pEndDestBuf
= pDestBuf
+nDestChars
;
399 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
400 while ( pSrcBuf
< pEndSrcBuf
)
402 unsigned char c
= (unsigned char)*pSrcBuf
;
409 /* SS2 - Half-width katakana */
413 /* Source buffer too small */
414 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
416 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
421 c
= (unsigned char)*pSrcBuf
;
/* bytes A1..DF map linearly onto U+FF61..U+FF9F */
422 if ( (c
>= 0xA1) && (c
<= 0xDF) )
423 cConv
= 0xFF61+(c
-0xA1);
433 /* SS3 - JIS 0212-1990 */
434 /* 8F + A1-FE + A1-FE */
437 /* Source buffer too small */
/* the SS3 form needs three bytes in total */
438 if (pEndSrcBuf
- pSrcBuf
< 3)
440 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
445 cLead
= (unsigned char)*pSrcBuf
;
447 cTrail
= (unsigned char)*pSrcBuf
;
448 pLeadTab
= pConvertData
->mpJIS0212ToUniLeadTab
;
450 /* CodeSet 2 JIS 0208-1997 */
454 /* Source buffer too small */
455 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
457 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
463 cTrail
= (unsigned char)*pSrcBuf
;
464 pLeadTab
= pConvertData
->mpJIS0208ToUniLeadTab
;
467 /* Undefined Range */
468 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
/* remove the EUC offsets so the bytes can index the JIS tables */
472 cLead
-= JIS_EUC_LEAD_OFF
;
473 cTrail
-= JIS_EUC_TRAIL_OFF
;
474 pLeadEntry
= pLeadTab
+cLead
;
475 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
476 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
484 /* We compare the full range of the trail we defined, */
485 /* which can often be greater than the limit. We do this */
486 /* so that extensions that don't consider encodings */
487 /* correctly treat double-byte characters as a single */
488 /* character as much as possible. */
490 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
492 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
493 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
495 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
498 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
504 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* flag the multi-byte sequence as undefined */
508 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
509 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
511 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
514 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
520 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
/* destination buffer exhausted */
525 if ( pDestBuf
== pEndDestBuf
)
527 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
/* source bytes consumed = total minus what is left before pEndSrcBuf */
536 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
537 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
540 /* ----------------------------------------------------------------------- */
/*
 * Converts sal_Unicode characters to an EUC-JP byte string.
 *
 * pData        - converter tables; cast to const ImplEUCJPConvertData*.
 * (void*)      - unnamed, unused parameter.
 * pSrcBuf      - source characters; nSrcChars gives their count.
 * pDestBuf     - destination byte buffer; nDestBytes gives its capacity.
 * nFlags       - RTL_UNICODETOTEXT_FLAGS_* bits (UNDEFINED_REPLACE and
 *                UNDEFINED_REPLACESTR are tested here).
 * pInfo        - receives RTL_UNICODETOTEXT_INFO_* status bits (OR-ed in).
 * pSrcCvtChars - receives nSrcChars minus the chars left before pEndSrcBuf.
 *
 * Returns nDestBytes minus the space left before pEndDestBuf.
 *
 * Half-width katakana (U+FF61..U+FF9F) are encoded arithmetically with 0x8E
 * in the high byte. Other characters are split into high and low byte and
 * looked up in the JIS 0208 and JIS 0212 high tables. The converted value is
 * written as one, two or three bytes, most significant byte first, depending
 * on which of its bytes above the lowest are non-zero.
 *
 * NOTE(review): this copy of the function shows no braces and lacks some
 * declarations/statements (e.g. of cConv and c); verify the exact control
 * flow against the complete source.
 */
542 sal_Size
ImplUnicodeToEUCJP( const void* pData
,
543 SAL_UNUSED_PARAMETER
void*,
544 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
545 char* pDestBuf
, sal_Size nDestBytes
,
546 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
547 sal_Size
* pSrcCvtChars
)
551 unsigned char nHighChar
;
552 unsigned char nLowChar
;
553 const ImplUniToDBCSHighTab
* pHighEntry
;
554 const ImplUniToDBCSHighTab
* pHighTab
;
555 const ImplEUCJPConvertData
* pConvertData
= static_cast<const ImplEUCJPConvertData
*>(pData
);
557 const sal_Unicode
* pEndSrcBuf
;
/* one-past-the-end pointers of the destination and source buffers */
560 pEndDestBuf
= pDestBuf
+nDestBytes
;
561 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
562 while ( pSrcBuf
< pEndSrcBuf
)
569 /* Half-width katakana */
570 else if ( (c
>= 0xFF61) && (c
<= 0xFF9F) )
/* 0x8E in the high byte, 0xA1 plus the katakana offset in the low byte */
571 cConv
= 0x8E00+0xA1+(c
-0xFF61);
574 nHighChar
= (unsigned char)((c
>> 8) & 0xFF);
575 nLowChar
= (unsigned char)(c
& 0xFF);
/* JIS 0208 lookup */
578 pHighTab
= pConvertData
->mpUniToJIS0208HighTab
;
579 pHighEntry
= pHighTab
+nHighChar
;
580 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
582 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
/* JIS 0212 lookup */
592 pHighTab
= pConvertData
->mpUniToJIS0212HighTab
;
593 pHighEntry
= pHighTab
+nHighChar
;
594 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
596 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
603 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
608 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
613 /* Handle undefined and surrogates characters */
614 /* (all surrogates characters are undefined) */
615 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
616 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
,
/* only the lowest byte is set: write one byte */
626 if ( !(cConv
& 0xFFFF00) )
628 if ( pDestBuf
== pEndDestBuf
)
630 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
634 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* third byte is zero: write two bytes, high byte first */
638 else if ( !(cConv
& 0xFF0000) )
640 if ( pDestBuf
+1 >= pEndDestBuf
)
642 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
646 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
648 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* otherwise write three bytes, most significant first */
653 if ( pDestBuf
+2 >= pEndDestBuf
)
655 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
659 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 16) & 0xFF));
661 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
663 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
/* source chars consumed = total minus what is left before pEndSrcBuf */
670 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
671 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
674 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */