1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "sal/config.h"
22 #include "rtl/textcvt.h"
24 #include "handleundefinedunicodetotextchar.hxx"
25 #include "tenchelp.hxx"
26 #include "unichars.hxx"
/* ======================================================================= */

/* The DBCS to Unicode conversion routines use a lead table for the first */
/* byte, where we determine the trail table or, for single-byte chars, the */
/* Unicode value. We have a separate table for each lead byte, because we */
/* can then share many tables between different charset encodings. */

/* ======================================================================= */
37 sal_Size
ImplDBCSToUnicode( const void* pData
, SAL_UNUSED_PARAMETER
void*,
38 const char* pSrcBuf
, sal_Size nSrcBytes
,
39 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
40 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
41 sal_Size
* pSrcCvtBytes
)
46 const ImplDBCSToUniLeadTab
* pLeadEntry
;
47 const ImplDBCSConvertData
* pConvertData
= static_cast<const ImplDBCSConvertData
*>(pData
);
48 const ImplDBCSToUniLeadTab
* pLeadTab
= pConvertData
->mpToUniLeadTab
;
49 sal_Unicode
* pEndDestBuf
;
50 const char* pEndSrcBuf
;
53 pEndDestBuf
= pDestBuf
+nDestChars
;
54 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
55 while ( pSrcBuf
< pEndSrcBuf
)
57 cLead
= (unsigned char)*pSrcBuf
;
59 /* get entry for the lead byte */
60 pLeadEntry
= pLeadTab
+cLead
;
62 /* SingleByte char? */
63 if (pLeadEntry
->mpToUniTrailTab
== NULL
64 || cLead
< pConvertData
->mnLeadStart
65 || cLead
> pConvertData
->mnLeadEnd
)
67 cConv
= pLeadEntry
->mnUniChar
;
68 if ( !cConv
&& (cLead
!= 0) )
70 *pInfo
|= RTL_TEXTTOUNICODE_INFO_UNDEFINED
;
71 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
)
73 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
76 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_IGNORE
)
82 cConv
= ImplGetUndefinedUnicodeChar(cLead
, nFlags
);
87 /* Source buffer to small */
88 if ( pSrcBuf
+1 == pEndSrcBuf
)
90 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0 )
92 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
100 cTrail
= (unsigned char)*pSrcBuf
;
101 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
102 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
110 const ImplDBCSEUDCData
* pEUDCTab
= pConvertData
->mpEUDCTab
;
111 for ( i
= 0; i
< pConvertData
->mnEUDCCount
; i
++ )
113 if ( (cLead
>= pEUDCTab
->mnLeadStart
) &&
114 (cLead
<= pEUDCTab
->mnLeadEnd
) )
116 if ( (cTrail
>= pEUDCTab
->mnTrail1Start
) &&
117 (cTrail
<= pEUDCTab
->mnTrail1End
) )
119 cConv
= pEUDCTab
->mnUniStart
+
120 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
121 (cTrail
-pEUDCTab
->mnTrail1Start
);
126 sal_uInt16 nTrailCount
= pEUDCTab
->mnTrail1End
-pEUDCTab
->mnTrail1Start
+1;
127 if ( (pEUDCTab
->mnTrailCount
>= 2) &&
128 (cTrail
>= pEUDCTab
->mnTrail2Start
) &&
129 (cTrail
<= pEUDCTab
->mnTrail2End
) )
131 cConv
= pEUDCTab
->mnUniStart
+
132 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
134 (cTrail
-pEUDCTab
->mnTrail2Start
);
139 nTrailCount
= pEUDCTab
->mnTrail2End
-pEUDCTab
->mnTrail2Start
+1;
140 if ( (pEUDCTab
->mnTrailCount
>= 3) &&
141 (cTrail
>= pEUDCTab
->mnTrail3Start
) &&
142 (cTrail
<= pEUDCTab
->mnTrail3End
) )
144 cConv
= pEUDCTab
->mnUniStart
+
145 ((cLead
-pEUDCTab
->mnLeadStart
)*pEUDCTab
->mnTrailRangeCount
)+
147 (cTrail
-pEUDCTab
->mnTrail3Start
);
159 /* We compare the full range of the trail we defined, */
160 /* which can often be greater than the limit. We do this */
161 /* so that extensions that don't consider encodings */
162 /* correctly treat double-byte characters as a single */
163 /* character as much as possible. */
165 if (cLead
< pConvertData
->mnLeadStart
166 || cLead
> pConvertData
->mnLeadEnd
167 || cTrail
< pConvertData
->mnTrailStart
168 || cTrail
> pConvertData
->mnTrailEnd
)
170 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
171 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
173 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
176 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
182 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
189 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
190 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
192 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
195 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
201 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
205 if ( pDestBuf
== pEndDestBuf
)
207 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
216 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
217 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
220 /* ----------------------------------------------------------------------- */
222 sal_Size
ImplUnicodeToDBCS( const void* pData
, SAL_UNUSED_PARAMETER
void*,
223 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
224 char* pDestBuf
, sal_Size nDestBytes
,
225 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
226 sal_Size
* pSrcCvtChars
)
230 const ImplUniToDBCSHighTab
* pHighEntry
;
231 const ImplDBCSConvertData
* pConvertData
= static_cast<const ImplDBCSConvertData
*>(pData
);
232 const ImplUniToDBCSHighTab
* pHighTab
= pConvertData
->mpToDBCSHighTab
;
234 const sal_Unicode
* pEndSrcBuf
;
237 pConvertData
->mnLeadStart
!= 0 || pConvertData
->mnLeadEnd
!= 0xFF;
238 /* this statement has the effect that this extra check is only done for
239 EUC-KR, which uses the MS-949 tables, but does not support the full
243 pEndDestBuf
= pDestBuf
+nDestBytes
;
244 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
245 while ( pSrcBuf
< pEndSrcBuf
)
248 unsigned char nHighChar
= (unsigned char)((c
>> 8) & 0xFF);
249 unsigned char nLowChar
= (unsigned char)(c
& 0xFF);
251 /* get entry for the high byte */
252 pHighEntry
= pHighTab
+nHighChar
;
254 /* is low byte in the table range */
255 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
257 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
258 if (bCheckRange
&& cConv
> 0x7F
259 && ((cConv
>> 8) < pConvertData
->mnLeadStart
260 || (cConv
>> 8) > pConvertData
->mnLeadEnd
261 || (cConv
& 0xFF) < pConvertData
->mnTrailStart
262 || (cConv
& 0xFF) > pConvertData
->mnTrailEnd
))
268 if (cConv
== 0 && c
!= 0)
270 /* Map to EUDC ranges: */
271 ImplDBCSEUDCData
const * pEUDCTab
= pConvertData
->mpEUDCTab
;
273 for (i
= 0; i
< pConvertData
->mnEUDCCount
; ++i
)
275 if (c
>= pEUDCTab
->mnUniStart
&& c
<= pEUDCTab
->mnUniEnd
)
277 sal_uInt32 nIndex
= c
- pEUDCTab
->mnUniStart
;
279 = nIndex
/ pEUDCTab
->mnTrailRangeCount
;
281 = nIndex
% pEUDCTab
->mnTrailRangeCount
;
284 ((pEUDCTab
->mnLeadStart
+ nLeadOff
) << 8);
286 = pEUDCTab
->mnTrail1End
- pEUDCTab
->mnTrail1Start
+ 1;
287 if (nTrailOff
< nSize
)
289 cConv
|= pEUDCTab
->mnTrail1Start
+ nTrailOff
;
294 = pEUDCTab
->mnTrail2End
- pEUDCTab
->mnTrail2Start
+ 1;
295 if (nTrailOff
< nSize
)
297 cConv
|= pEUDCTab
->mnTrail2Start
+ nTrailOff
;
301 cConv
|= pEUDCTab
->mnTrail3Start
+ nTrailOff
;
308 * SB: Not sure why this is in here. Plus, it does not work as
309 * intended when (c & 0xFF) == 0, because the next !cConv check
310 * will then think c has not yet been converted...
312 if (c
>= RTL_TEXTCVT_BYTE_PRIVATE_START
313 && c
<= RTL_TEXTCVT_BYTE_PRIVATE_END
)
315 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_PRIVATE_MAPTO0
)
316 cConv
= static_cast< char >(static_cast< unsigned char >(c
& 0xFF));
322 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
327 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
332 /* Handle undefined and surrogates characters */
333 /* (all surrogates characters are undefined) */
334 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
335 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
, nFlags
,
343 if ( !(cConv
& 0xFF00) )
345 if ( pDestBuf
== pEndDestBuf
)
347 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
351 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
356 if ( pDestBuf
+1 >= pEndDestBuf
)
358 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
362 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
364 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
371 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
372 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
375 /* ======================================================================= */
/* EUC-JP stores each JIS lead/trail byte with this offset added, i.e. with */
/* the high bit set; the converters subtract or OR it in around the lookup. */
#define JIS_EUC_LEAD_OFF                                        0x80
#define JIS_EUC_TRAIL_OFF                                       0x80
380 /* ----------------------------------------------------------------------- */
382 sal_Size
ImplEUCJPToUnicode( const void* pData
,
383 SAL_UNUSED_PARAMETER
void*,
384 const char* pSrcBuf
, sal_Size nSrcBytes
,
385 sal_Unicode
* pDestBuf
, sal_Size nDestChars
,
386 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
387 sal_Size
* pSrcCvtBytes
)
389 unsigned char cLead
= '\0';
390 unsigned char cTrail
= '\0';
392 const ImplDBCSToUniLeadTab
* pLeadEntry
;
393 const ImplDBCSToUniLeadTab
* pLeadTab
;
394 const ImplEUCJPConvertData
* pConvertData
= static_cast<const ImplEUCJPConvertData
*>(pData
);
395 sal_Unicode
* pEndDestBuf
;
396 const char* pEndSrcBuf
;
399 pEndDestBuf
= pDestBuf
+nDestChars
;
400 pEndSrcBuf
= pSrcBuf
+nSrcBytes
;
401 while ( pSrcBuf
< pEndSrcBuf
)
403 unsigned char c
= (unsigned char)*pSrcBuf
;
410 /* SS2 - Half-width katakana */
414 /* Source buffer to small */
415 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
417 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
422 c
= (unsigned char)*pSrcBuf
;
423 if ( (c
>= 0xA1) && (c
<= 0xDF) )
424 cConv
= 0xFF61+(c
-0xA1);
434 /* SS3 - JIS 0212-1990 */
435 /* 8F + A1-FE + A1-FE */
438 /* Source buffer to small */
439 if (pEndSrcBuf
- pSrcBuf
< 3)
441 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
446 cLead
= (unsigned char)*pSrcBuf
;
448 cTrail
= (unsigned char)*pSrcBuf
;
449 pLeadTab
= pConvertData
->mpJIS0212ToUniLeadTab
;
451 /* CodeSet 2 JIS 0208-1997 */
455 /* Source buffer to small */
456 if ( pSrcBuf
+ 1 == pEndSrcBuf
)
458 *pInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
;
464 cTrail
= (unsigned char)*pSrcBuf
;
465 pLeadTab
= pConvertData
->mpJIS0208ToUniLeadTab
;
468 /* Undefined Range */
469 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
473 cLead
-= JIS_EUC_LEAD_OFF
;
474 cTrail
-= JIS_EUC_TRAIL_OFF
;
475 pLeadEntry
= pLeadTab
+cLead
;
476 if ( (cTrail
>= pLeadEntry
->mnTrailStart
) && (cTrail
<= pLeadEntry
->mnTrailEnd
) )
477 cConv
= pLeadEntry
->mpToUniTrailTab
[cTrail
-pLeadEntry
->mnTrailStart
];
485 /* We compare the full range of the trail we defined, */
486 /* which can often be greater than the limit. We do this */
487 /* so that extensions that don't consider encodings */
488 /* correctly treat double-byte characters as a single */
489 /* character as much as possible. */
491 if ( (cLead
< JIS_EUC_LEAD_OFF
) || (cTrail
< JIS_EUC_TRAIL_OFF
) )
493 *pInfo
|= RTL_TEXTTOUNICODE_INFO_INVALID
;
494 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR
)
496 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
499 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK
) == RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE
)
505 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
509 *pInfo
|= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED
;
510 if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
)
512 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
;
515 else if ( (nFlags
& RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK
) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE
)
521 cConv
= RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER
;
526 if ( pDestBuf
== pEndDestBuf
)
528 *pInfo
|= RTL_TEXTTOUNICODE_INFO_ERROR
| RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL
;
537 *pSrcCvtBytes
= nSrcBytes
- (pEndSrcBuf
-pSrcBuf
);
538 return (nDestChars
- (pEndDestBuf
-pDestBuf
));
541 /* ----------------------------------------------------------------------- */
543 sal_Size
ImplUnicodeToEUCJP( const void* pData
,
544 SAL_UNUSED_PARAMETER
void*,
545 const sal_Unicode
* pSrcBuf
, sal_Size nSrcChars
,
546 char* pDestBuf
, sal_Size nDestBytes
,
547 sal_uInt32 nFlags
, sal_uInt32
* pInfo
,
548 sal_Size
* pSrcCvtChars
)
552 unsigned char nHighChar
;
553 unsigned char nLowChar
;
554 const ImplUniToDBCSHighTab
* pHighEntry
;
555 const ImplUniToDBCSHighTab
* pHighTab
;
556 const ImplEUCJPConvertData
* pConvertData
= static_cast<const ImplEUCJPConvertData
*>(pData
);
558 const sal_Unicode
* pEndSrcBuf
;
561 pEndDestBuf
= pDestBuf
+nDestBytes
;
562 pEndSrcBuf
= pSrcBuf
+nSrcChars
;
563 while ( pSrcBuf
< pEndSrcBuf
)
570 /* Half-width katakana */
571 else if ( (c
>= 0xFF61) && (c
<= 0xFF9F) )
572 cConv
= 0x8E00+0xA1+(c
-0xFF61);
575 nHighChar
= (unsigned char)((c
>> 8) & 0xFF);
576 nLowChar
= (unsigned char)(c
& 0xFF);
579 pHighTab
= pConvertData
->mpUniToJIS0208HighTab
;
580 pHighEntry
= pHighTab
+nHighChar
;
581 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
583 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
593 pHighTab
= pConvertData
->mpUniToJIS0212HighTab
;
594 pHighEntry
= pHighTab
+nHighChar
;
595 if ( (nLowChar
>= pHighEntry
->mnLowStart
) && (nLowChar
<= pHighEntry
->mnLowEnd
) )
597 cConv
= pHighEntry
->mpToUniTrailTab
[nLowChar
-pHighEntry
->mnLowStart
];
604 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE
)
609 if ( nFlags
& RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACESTR
)
614 /* Handle undefined and surrogates characters */
615 /* (all surrogates characters are undefined) */
616 if (sal::detail::textenc::handleUndefinedUnicodeToTextChar(
617 &pSrcBuf
, pEndSrcBuf
, &pDestBuf
, pEndDestBuf
,
627 if ( !(cConv
& 0xFFFF00) )
629 if ( pDestBuf
== pEndDestBuf
)
631 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
635 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
639 else if ( !(cConv
& 0xFF0000) )
641 if ( pDestBuf
+1 >= pEndDestBuf
)
643 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
647 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
649 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
654 if ( pDestBuf
+2 >= pEndDestBuf
)
656 *pInfo
|= RTL_UNICODETOTEXT_INFO_ERROR
| RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
660 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 16) & 0xFF));
662 *pDestBuf
= static_cast< char >(static_cast< unsigned char >((cConv
>> 8) & 0xFF));
664 *pDestBuf
= static_cast< char >(static_cast< unsigned char >(cConv
& 0xFF));
671 *pSrcCvtChars
= nSrcChars
- (pEndSrcBuf
-pSrcBuf
);
672 return (nDestBytes
- (pEndDestBuf
-pDestBuf
));
675 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */