1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
24 #include <rtl/character.hxx>
25 #include <rtl/textcvt.h>
26 #include <sal/types.h>
28 #include "context.hxx"
29 #include "converter.hxx"
30 #include "convertgb18030.hxx"
// Decoder states used by ImplConvertGb18030ToUnicode; the state is kept in
// ImplGb18030ToUnicodeContext::m_eState between calls.
34 enum ImplGb18030ToUnicodeState
// STATE_0: value a context is created/reset with; the decoder also returns
// to it after each completed character.
36 IMPL_GB_18030_TO_UNICODE_STATE_0
,
// STATE_1: entered from STATE_0 after a first byte was stored as
// (byte - 0x81).
37 IMPL_GB_18030_TO_UNICODE_STATE_1
,
// STATE_2: entered from STATE_1 when the second byte is in 0x30..0x39.
38 IMPL_GB_18030_TO_UNICODE_STATE_2
,
// STATE_3: entered from STATE_2 when the third byte is in 0x81..0xFE; a
// fourth byte in 0x30..0x39 then completes the code.
39 IMPL_GB_18030_TO_UNICODE_STATE_3
// Decoder context handed around as an opaque void *.
// ImplConvertGb18030ToUnicode loads m_eState on entry and stores it back on
// exit, so a multi-byte sequence can be split across calls.
// NOTE(review): the decoder also reads and writes an m_nCode member through
// this type; its declaration is not among the lines visible here.
42 struct ImplGb18030ToUnicodeContext
44 ImplGb18030ToUnicodeState m_eState
;
// Allocates a decoder context with new and puts it into STATE_0.
// Only m_eState is assigned by the statements visible here.
// NOTE(review): the statement returning the new context is not visible in
// this listing; the matching release is ImplDestroyGb18030ToUnicodeContext,
// which deletes the object.
50 void * ImplCreateGb18030ToUnicodeContext()
52 ImplGb18030ToUnicodeContext
* pContext
= new ImplGb18030ToUnicodeContext
;
53 pContext
->m_eState
= IMPL_GB_18030_TO_UNICODE_STATE_0
;
// Puts an existing decoder context back into STATE_0, discarding any
// partially read multi-byte sequence.
// pContext is treated as an ImplGb18030ToUnicodeContext *.
// NOTE(review): one original line between the signature and the assignment
// is missing from this listing, so it cannot be told from here whether a
// null pContext is guarded against -- confirm against the real file.
57 void ImplResetGb18030ToUnicodeContext(void * pContext
)
60 static_cast< ImplGb18030ToUnicodeContext
* >(pContext
)->m_eState
61 = IMPL_GB_18030_TO_UNICODE_STATE_0
;
// Releases a decoder context.  The pointer is cast back to
// ImplGb18030ToUnicodeContext * before delete, matching the type that
// ImplCreateGb18030ToUnicodeContext allocates with new.
64 void ImplDestroyGb18030ToUnicodeContext(void * pContext
)
66 delete static_cast< ImplGb18030ToUnicodeContext
* >(pContext
);
// Decodes GB 18030 bytes into sal_Unicode (UTF-16) units with a four-state
// machine that consumes one source byte per loop iteration.
// Returns the number of sal_Unicode units written (pDestBufPtr - pDestBuf)
// and stores the number of consumed source bytes in *pSrcCvtBytes.
// NOTE(review): this listing omits some original lines (the embedded line
// numbers have gaps) -- among them the remaining parameters, the labels and
// several branch bodies.  The comments below describe only the statements
// that are visible.
69 sal_Size
ImplConvertGb18030ToUnicode(void const * pData
,
73 sal_Unicode
* pDestBuf
,
77 sal_Size
* pSrcCvtBytes
)
// pData is an ImplGb18030ConverterData; it supplies a table of Unicode
// values (pGb18030Data) and a table of code ranges (pGb18030Ranges).
79 sal_Unicode
const * pGb18030Data
80 = static_cast< ImplGb18030ConverterData
const * >(pData
)->m_pGb18030ToUnicodeData
;
81 ImplGb180302000ToUnicodeRange
const * pGb18030Ranges
82 = static_cast< ImplGb18030ConverterData
const * >(pData
)->
83 m_pGb18030ToUnicodeRanges
;
84 ImplGb18030ToUnicodeState eState
= IMPL_GB_18030_TO_UNICODE_STATE_0
;
// nConverted counts consumed source bytes.
87 sal_Size nConverted
= 0;
// Write cursor and one-past-the-end of the destination buffer.
88 sal_Unicode
* pDestBufPtr
= pDestBuf
;
89 sal_Unicode
* pDestBufEnd
= pDestBuf
+ nDestChars
;
// Source index just past the last completely converted character; the
// bad-input handling below rewinds nConverted to it.
90 sal_Size startOfCurrentChar
= 0;
// Resume from the state and the partial code stored in the context.
94 eState
= static_cast< ImplGb18030ToUnicodeContext
* >(pContext
)->m_eState
;
95 nCode
= static_cast< ImplGb18030ToUnicodeContext
* >(pContext
)->m_nCode
;
98 for (; nConverted
< nSrcBytes
; ++nConverted
)
// bUndefined is handed to the bad-input handler further down.
100 bool bUndefined
= true;
// Read the next source byte as an unsigned value and advance pSrcBuf.
101 sal_uInt32 nChar
= *reinterpret_cast<unsigned char const *>(pSrcBuf
++);
// STATE_0: start of a character.
104 case IMPL_GB_18030_TO_UNICODE_STATE_0
:
// The byte is written to the output unchanged when there is room, and the
// next character starts right after it.  (The condition selecting this
// branch is not visible in this listing.)
106 if (pDestBufPtr
!= pDestBufEnd
) {
107 *pDestBufPtr
++ = static_cast<sal_Unicode
>(nChar
);
108 startOfCurrentChar
= nConverted
+ 1;
// 0x80 is tested on its own; what is done with it is not visible here.
111 else if (nChar
== 0x80)
// Remaining bytes up to 0xFE start a multi-byte sequence: keep the first
// byte as an offset from 0x81 and wait for the second byte.
113 else if (nChar
<= 0xFE)
115 nCode
= nChar
- 0x81;
116 eState
= IMPL_GB_18030_TO_UNICODE_STATE_1
;
// STATE_1: second byte.
125 case IMPL_GB_18030_TO_UNICODE_STATE_1
:
// A second byte 0x30..0x39 means a four-byte sequence: append it as a
// decimal digit and continue with the third byte.
126 if (nChar
>= 0x30 && nChar
<= 0x39)
128 nCode
= nCode
* 10 + (nChar
- 0x30);
129 eState
= IMPL_GB_18030_TO_UNICODE_STATE_2
;
// A second byte 0x40..0x7E or 0x80..0xFE completes a two-byte sequence.
// 190 is the number of such second-byte values (63 + 127); the resulting
// nCode indexes pGb18030Data.
131 else if ((nChar
>= 0x40 && nChar
<= 0x7E)
132 || (nChar
>= 0x80 && nChar
<= 0xFE))
134 nCode
= nCode
* 190 + (nChar
<= 0x7E ? nChar
- 0x40 :
136 if (pDestBufPtr
!= pDestBufEnd
) {
137 *pDestBufPtr
++ = pGb18030Data
[nCode
];
138 startOfCurrentChar
= nConverted
+ 1;
141 eState
= IMPL_GB_18030_TO_UNICODE_STATE_0
;
// STATE_2: third byte of a four-byte sequence, valid range 0x81..0xFE
// (126 values).
150 case IMPL_GB_18030_TO_UNICODE_STATE_2
:
151 if (nChar
>= 0x81 && nChar
<= 0xFE)
153 nCode
= nCode
* 126 + (nChar
- 0x81);
154 eState
= IMPL_GB_18030_TO_UNICODE_STATE_3
;
// STATE_3: fourth byte, a digit 0x30..0x39.  Afterwards nCode is the linear
// index ((b1-0x81)*10 + (b2-0x30))*126*10 + (b3-0x81)*10 + (b4-0x30).
163 case IMPL_GB_18030_TO_UNICODE_STATE_3
:
164 if (nChar
>= 0x30 && nChar
<= 0x39)
166 nCode
= nCode
* 10 + (nChar
- 0x30);
168 // 90 30 81 30 to E3 32 9A 35 maps to U+10000 to U+10FFFF:
169 if (nCode
>= 189000 && nCode
<= 1237575)
// A supplementary character needs room for two UTF-16 units.
170 if (pDestBufEnd
- pDestBufPtr
>= 2)
// Rebase the linear index so that 189000 becomes 0x10000, then write the
// surrogate pair; the destination cursor advances by the value returned
// from rtl::splitSurrogates.
172 nCode
-= 189000 - 0x10000;
173 pDestBufPtr
+= rtl::splitSurrogates(nCode
, pDestBufPtr
);
174 startOfCurrentChar
= nConverted
+ 1;
// Other four-byte codes are looked up by walking the range table with
// pRange.  nFirstNonRange tracks the first code after the previous entry's
// linear part (it is set from m_nPastLinear below).
180 ImplGb180302000ToUnicodeRange
const * pRange
182 sal_uInt32 nFirstNonRange
= 0;
// An entry whose m_nNonRangeDataIndex is -1 is tested for first; the action
// taken is not visible here (presumably it ends the table -- confirm).
185 if (pRange
->m_nNonRangeDataIndex
== -1)
// Code lies before this entry's linear part: the output is read from a
// table at index m_nNonRangeDataIndex + (nCode - nFirstNonRange).  (The
// line naming that table is missing from this listing.)
187 else if (nCode
< pRange
->m_nFirstLinear
)
189 if (pDestBufPtr
!= pDestBufEnd
) {
192 pRange
->m_nNonRangeDataIndex
193 + (nCode
- nFirstNonRange
)];
194 startOfCurrentChar
= nConverted
+ 1;
// Code lies inside this entry's linear part: the output is computed from
// m_nFirstUnicode (the rest of the expression is missing from this listing).
199 else if (nCode
< pRange
->m_nPastLinear
)
201 if (pDestBufPtr
!= pDestBufEnd
) {
203 = static_cast<sal_Unicode
>(pRange
->m_nFirstUnicode
207 startOfCurrentChar
= nConverted
+ 1;
// Not in this entry: remember where its linear part ended and move on to
// the next entry.
212 nFirstNonRange
= (pRange
++)->m_nPastLinear
;
215 eState
= IMPL_GB_18030_TO_UNICODE_STATE_0
;
// Bad-input handling inside the loop; the helper's result selects one of
// the three cases below.
227 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
228 bUndefined
, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
// BAD_INPUT_STOP: drop the partial sequence.  The visible statements test
// whether the FLUSH flag is clear and rewind nConverted to the start of the
// current character (which branch performs the rewind is not visible here).
231 case sal::detail::textenc::BAD_INPUT_STOP
:
232 eState
= IMPL_GB_18030_TO_UNICODE_STATE_0
;
233 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0) {
236 nConverted
= startOfCurrentChar
;
// BAD_INPUT_CONTINUE: drop the partial sequence and treat the next byte as
// the start of a new character.
240 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
241 eState
= IMPL_GB_18030_TO_UNICODE_STATE_0
;
242 startOfCurrentChar
= nConverted
+ 1;
245 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
// Record that the destination buffer was too small.
252 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
// After the loop: eState != STATE_0 means the input ended inside a
// multi-byte sequence.  (The comparison applied to the masked nInfo value is
// on a line missing from this listing.)
256 if (eState
!= IMPL_GB_18030_TO_UNICODE_STATE_0
257 && (nInfo
& (RTL_TEXTTOUNICODE_INFO_ERROR
258 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
))
// When the FLUSH flag is clear, report the truncated sequence as "source
// buffer too small".
261 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) == 0)
262 nInfo
|= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL
;
// The bad-input helper is called with its first argument false here (the
// in-loop call passes bUndefined).
264 switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
265 false, true, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
,
268 case sal::detail::textenc::BAD_INPUT_STOP
:
269 if ((nFlags
& RTL_TEXTTOUNICODE_FLAGS_FLUSH
) != 0) {
270 nConverted
= startOfCurrentChar
;
273 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
274 eState
= IMPL_GB_18030_TO_UNICODE_STATE_0
;
277 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
278 nInfo
|= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL
;
// Save the state and the partial code for the next call.
285 static_cast< ImplGb18030ToUnicodeContext
* >(pContext
)->m_eState
= eState
;
286 static_cast< ImplGb18030ToUnicodeContext
* >(pContext
)->m_nCode
= nCode
;
// Report the number of consumed source bytes.
291 *pSrcCvtBytes
= nConverted
;
// Number of sal_Unicode units written.
293 return pDestBufPtr
- pDestBuf
;
// Encodes sal_Unicode (UTF-16) units as GB 18030 bytes, one source unit per
// loop iteration.
// Returns the number of bytes written (pDestBufPtr - pDestBuf) and stores
// the number of consumed source units in *pSrcCvtChars.
// NOTE(review): this listing omits some original lines (the embedded line
// numbers have gaps) -- among them the remaining parameters, several
// assignment targets and the bodies of some branches.  The comments below
// describe only the statements that are visible.
296 sal_Size
ImplConvertUnicodeToGb18030(void const * pData
,
298 sal_Unicode
const * pSrcBuf
,
304 sal_Size
* pSrcCvtChars
)
// pData is an ImplGb18030ConverterData; it supplies a table of GB 18030
// codes (pGb18030Data) and a table of Unicode ranges (pGb18030Ranges).
306 sal_uInt32
const * pGb18030Data
307 = static_cast< ImplGb18030ConverterData
const * >(pData
)->
308 m_pUnicodeToGb18030Data
;
309 ImplUnicodeToGb180302000Range
const * pGb18030Ranges
310 = static_cast< ImplGb18030ConverterData
const * >(pData
)->
311 m_pUnicodeToGb18030Ranges
;
// Pending high surrogate; 0 means none.
312 sal_Unicode nHighSurrogate
= 0;
313 sal_uInt32 nInfo
= 0;
// nConverted counts consumed source units.
314 sal_Size nConverted
= 0;
// Write cursor and one-past-the-end of the destination buffer.
315 char * pDestBufPtr
= pDestBuf
;
316 char * pDestBufEnd
= pDestBuf
+ nDestBytes
;
// Value read from the context's m_nHighSurrogate.  (The assignment target
// is on a line missing from this listing; presumably nHighSurrogate.)
320 = static_cast<ImplUnicodeToTextContext
*>(pContext
)->m_nHighSurrogate
;
322 for (; nConverted
< nSrcChars
; ++nConverted
)
// bUndefined is handed to the bad-input handler further down.
324 bool bUndefined
= true;
// Read the next UTF-16 unit and advance pSrcBuf.
325 sal_uInt32 nChar
= *pSrcBuf
++;
// No high surrogate pending.
326 if (nHighSurrogate
== 0)
// A high surrogate is remembered so it can be combined with the next unit.
328 if (rtl::isHighSurrogate(nChar
))
330 nHighSurrogate
= static_cast<sal_Unicode
>(nChar
);
// A low surrogate with no preceding high surrogate; its handling is not
// visible in this listing.
333 else if (rtl::isLowSurrogate(nChar
))
// High surrogate pending and a low surrogate follows: combine the two into
// a single code point.
339 else if (rtl::isLowSurrogate(nChar
))
340 nChar
= rtl::combineSurrogates(nHighSurrogate
, nChar
);
// From here on nChar is expected to be a Unicode scalar value.
347 assert(rtl::isUnicodeScalarValue(nChar
));
// nChar is written as a single byte when there is room.  (The condition
// selecting this branch is not visible in this listing.)
350 if (pDestBufPtr
!= pDestBufEnd
)
351 *pDestBufPtr
++ = static_cast< char >(nChar
);
// Code points below 0x10000 are looked up by walking the range table with
// pRange.  nFirstNonRange starts at 0x80 and is later advanced past each
// entry's last code point.
354 else if (nChar
< 0x10000)
356 ImplUnicodeToGb180302000Range
const * pRange
= pGb18030Ranges
;
357 sal_Unicode nFirstNonRange
= 0x80;
// nChar lies before this entry: its code is read from pGb18030Data at index
// m_nNonRangeDataIndex + (nChar - nFirstNonRange).
360 if (nChar
< pRange
->m_nFirstUnicode
)
363 = pGb18030Data
[pRange
->m_nNonRangeDataIndex
364 + (nChar
- nFirstNonRange
)];
// A code that fits in 16 bits needs 2 free bytes, otherwise 4.
365 if (pDestBufEnd
- pDestBufPtr
366 >= (nCode
<= 0xFFFF ? 2 : 4))
// The code is written most significant byte first.  (The guard around the
// two high-order bytes is not visible here; presumably they are written
// only for four-byte codes.)
370 *pDestBufPtr
++ = static_cast< char >(nCode
>> 24);
371 *pDestBufPtr
++ = static_cast< char >(nCode
>> 16 & 0xFF);
373 *pDestBufPtr
++ = static_cast< char >(nCode
>> 8 & 0xFF);
374 *pDestBufPtr
++ = static_cast< char >(nCode
& 0xFF);
// nChar lies inside this entry: a four-byte code is computed.
380 if (nChar
<= pRange
->m_nLastUnicode
)
382 if (pDestBufEnd
- pDestBufPtr
>= 4)
// Linear index = m_nFirstLinear + offset of nChar within the entry.
385 = pRange
->m_nFirstLinear
386 + (nChar
- pRange
->m_nFirstUnicode
);
// Split the linear index into four bytes: the first and third are based at
// 0x81 (the third has 126 values), the second and fourth are decimal digits
// based at 0x30.  12600 = 10 * 126 * 10 and 1260 = 126 * 10.
387 *pDestBufPtr
++ = static_cast< char >(nCode
/ 12600 + 0x81);
389 = static_cast< char >(nCode
/ 1260 % 10 + 0x30);
390 *pDestBufPtr
++ = static_cast< char >(nCode
/ 10 % 126 + 0x81);
391 *pDestBufPtr
++ = static_cast< char >(nCode
% 10 + 0x30);
// Not in this entry: compute the code point just past it and move on to the
// next entry.  (The assignment target is on a line missing from this
// listing; presumably nFirstNonRange.)
398 = static_cast<sal_Unicode
>((pRange
++)->m_nLastUnicode
+ 1);
// Code points from 0x10000 upwards always take four bytes.
402 if (pDestBufEnd
- pDestBufPtr
>= 4)
// Same byte split as above, but the first byte is based at 0x90.
404 sal_uInt32 nCode
= nChar
- 0x10000;
405 *pDestBufPtr
++ = static_cast< char >(nCode
/ 12600 + 0x90);
406 *pDestBufPtr
++ = static_cast< char >(nCode
/ 1260 % 10 + 0x30);
407 *pDestBufPtr
++ = static_cast< char >(nCode
/ 10 % 126 + 0x81);
408 *pDestBufPtr
++ = static_cast< char >(nCode
% 10 + 0x30);
// Bad-input handling inside the loop; the helper's result selects one of
// the three cases below (their bodies are not visible in this listing).
416 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
417 bUndefined
, nChar
, nFlags
, &pDestBufPtr
, pDestBufEnd
,
418 &nInfo
, nullptr, 0, nullptr))
420 case sal::detail::textenc::BAD_INPUT_STOP
:
424 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
428 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
// Record that the destination buffer was too small.
435 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
// After the loop: a non-zero nHighSurrogate means the input ended after a
// high surrogate.  (The comparison applied to the masked nInfo value is on
// a line missing from this listing.)
439 if (nHighSurrogate
!= 0
440 && (nInfo
& (RTL_UNICODETOTEXT_INFO_ERROR
441 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
))
// NOTE(review): "source buffer too small" is set here when the FLUSH flag
// IS set, whereas ImplConvertGb18030ToUnicode sets its counterpart when the
// FLUSH flag is clear -- confirm that the opposite sense is intended.
444 if ((nFlags
& RTL_UNICODETOTEXT_FLAGS_FLUSH
) != 0)
445 nInfo
|= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL
;
// The bad-input helper is called with first argument false and a character
// value of 0 here (the in-loop call passes bUndefined and nChar).
447 switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
448 false, 0, nFlags
, &pDestBufPtr
, pDestBufEnd
, &nInfo
,
449 nullptr, 0, nullptr))
451 case sal::detail::textenc::BAD_INPUT_STOP
:
452 case sal::detail::textenc::BAD_INPUT_CONTINUE
:
456 case sal::detail::textenc::BAD_INPUT_NO_OUTPUT
:
457 nInfo
|= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL
;
// The context's m_nHighSurrogate is updated here.  (The assigned value is on
// a line missing from this listing; presumably nHighSurrogate.)
463 static_cast<ImplUnicodeToTextContext
*>(pContext
)->m_nHighSurrogate
// Report the number of consumed source units.
468 *pSrcCvtChars
= nConverted
;
// Number of bytes written.
470 return pDestBufPtr
- pDestBuf
;
473 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */