1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
15 * The Original Code is mozilla.org code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
24 * Alternatively, the contents of this file may be used under the terms of
25 * either of the GNU General Public License Version 2 or later (the "GPL"),
26 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
38 * A character set converter from GBK to Unicode.
41 * @created 07/Sept/1999
42 * @author Yueheng Xu, Yueheng.Xu@intel.com
45 #include "nsGBKToUnicode.h"
46 #include "nsUCvCnDll.h"
50 //------------------------------------------------------------
51 // nsGBKUnique2BytesToUnicode
52 //------------------------------------------------------------
// Decoder class derived from nsTableDecoderSupport. An instance is created by
// nsGBKToUnicode::CreateExtensionDecoder() and used as the 2-byte "extension"
// decoder. The visible declaration lists a constructor and a virtual
// destructor.
53 class nsGBKUnique2BytesToUnicode
: public nsTableDecoderSupport
56 nsGBKUnique2BytesToUnicode();
57 virtual ~nsGBKUnique2BytesToUnicode()
// Mapping data for nsGBKUnique2BytesToUnicode: a file-static PRUint16 array
// whose initializer is pulled in from "gbkuniq2b.ut". The constructor passes
// its address to nsTableDecoderSupport as a uMappingTable*.
62 static const PRUint16 g_utGBKUnique2Bytes
[] = {
63 #include "gbkuniq2b.ut"
// Constructor: forwards u2BytesCharset, nsnull, the g_utGBKUnique2Bytes array
// (cast to uMappingTable*) and the literal 1 to the nsTableDecoderSupport
// constructor.
// NOTE(review): the role of each base-constructor argument (in particular the
// trailing 1) is not visible in this file -- confirm against
// nsTableDecoderSupport.
65 nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode()
66 : nsTableDecoderSupport(u2BytesCharset
, nsnull
,
67 (uMappingTable
*) &g_utGBKUnique2Bytes
, 1)
71 //------------------------------------------------------------
72 // nsGB18030Unique2BytesToUnicode
73 //------------------------------------------------------------
// Decoder class derived from nsTableDecoderSupport. An instance is created by
// nsGB18030ToUnicode::CreateExtensionDecoder() and used as the 2-byte
// "extension" decoder. The visible declaration lists a constructor and a
// virtual destructor.
74 class nsGB18030Unique2BytesToUnicode
: public nsTableDecoderSupport
77 nsGB18030Unique2BytesToUnicode();
78 virtual ~nsGB18030Unique2BytesToUnicode()
// Mapping data for nsGB18030Unique2BytesToUnicode: a file-static PRUint16
// array whose initializer is pulled in from "gb18030uniq2b.ut". The
// constructor passes its address to nsTableDecoderSupport as a uMappingTable*.
83 static const PRUint16 g_utGB18030Unique2Bytes
[] = {
84 #include "gb18030uniq2b.ut"
// Constructor: forwards u2BytesCharset, nsnull, the g_utGB18030Unique2Bytes
// array (cast to uMappingTable*) and the literal 1 to the
// nsTableDecoderSupport constructor.
// NOTE(review): the role of each base-constructor argument is not visible in
// this file -- confirm against nsTableDecoderSupport.
86 nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode()
87 : nsTableDecoderSupport(u2BytesCharset
, nsnull
,
88 (uMappingTable
*) &g_utGB18030Unique2Bytes
, 1)
92 //------------------------------------------------------------
93 // nsGB18030Unique4BytesToUnicode
94 //------------------------------------------------------------
// Decoder class derived from nsTableDecoderSupport. An instance is created by
// nsGB18030ToUnicode::Create4BytesDecoder() and used as the 4-byte decoder.
// The visible declaration lists a constructor and a virtual destructor.
95 class nsGB18030Unique4BytesToUnicode
: public nsTableDecoderSupport
98 nsGB18030Unique4BytesToUnicode();
99 virtual ~nsGB18030Unique4BytesToUnicode()
// Mapping data for nsGB18030Unique4BytesToUnicode: a file-static PRUint16
// array whose initializer is pulled in from "gb180304bytes.ut". The
// constructor passes its address to nsTableDecoderSupport as a uMappingTable*.
104 static const PRUint16 g_utGB18030Unique4Bytes
[] = {
105 #include "gb180304bytes.ut"
// Constructor: forwards u4BytesGB18030Charset, nsnull, the
// g_utGB18030Unique4Bytes array (cast to uMappingTable*) and the literal 1 to
// the nsTableDecoderSupport constructor. Unlike the two 2-byte decoders in
// this file, the first argument is u4BytesGB18030Charset.
// NOTE(review): the role of each base-constructor argument is not visible in
// this file -- confirm against nsTableDecoderSupport.
107 nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode()
108 : nsTableDecoderSupport(u4BytesGB18030Charset
, nsnull
,
109 (uMappingTable
*) &g_utGB18030Unique4Bytes
, 1)
114 //----------------------------------------------------------------------
115 // Class nsGBKToUnicode [implementation]
117 //----------------------------------------------------------------------
118 // Subclassing of nsTableDecoderSupport class [implementation]
// Byte-class predicates used by the GBK / GB18030 decoders below. All of them
// are built on UINT8_IN_RANGE, which is defined outside this file (presumably
// an inclusive lo <= (PRUint8)c <= hi test -- confirm).

// Lead byte of any multi-byte sequence: 0x81..0xFE.
120 #define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) \
121 (UINT8_IN_RANGE(0x81, (c), 0xFE))
// Lead byte in 0x90..0xFE. ConvertNoBuff sends a 4-byte sequence with such a
// lead byte to DecodeToSurrogate() instead of Try4BytesDecoder().
122 #define FIRST_BYTE_IS_SURROGATE(c) \
123 (UINT8_IN_RANGE(0x90, (c), 0xFE))
// Second byte of a 2-byte sequence: 0x40..0x7E or 0x80..0xFE (0x7F excluded).
124 #define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
125 (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))
// Second byte of a 4-byte sequence: 0x30..0x39.
126 #define LEGAL_GBK_4BYTE_SECOND_BYTE(c) \
127 (UINT8_IN_RANGE(0x30, (c), 0x39))
// Third byte of a 4-byte sequence: 0x81..0xFE.
128 #define LEGAL_GBK_4BYTE_THIRD_BYTE(c) \
129 (UINT8_IN_RANGE(0x81, (c), 0xFE))
// Fourth byte of a 4-byte sequence: 0x30..0x39. ("FORTH" is the spelling used
// by every caller in this file, so the name is left as is.)
130 #define LEGAL_GBK_4BYTE_FORTH_BYTE(c) \
131 (UINT8_IN_RANGE(0x30, (c), 0x39))
// Converts the GBK / GB18030 bytes at aSrc into PRUnichars written through
// aDest.
//
//   aSrc        - input bytes; read as aSrc[0..3] depending on the sequence.
//   aSrcLength  - in: number of input bytes available (copied to iSrcLength).
//   aDestLength - in: capacity of the output buffer in PRUnichars;
//                 out: set to iDestlen at the end of the function.
//
// What the visible lines establish:
//   * rv is set to NS_OK_UDEC_MOREOUTPUT when iDestlen has reached
//     *aDestLength, and to NS_OK_UDEC_MOREINPUT when a 2- or 4-byte sequence
//     is cut off by the end of the input.
//   * Sequences that cannot be mapped produce UCS2_NO_MAPPING.
//
// NOTE(review): several lines of this function are not visible in this view,
// including the declarations of aDest, i and rv, the statements that advance
// aSrc / aDest / i / iDestlen, the updates of *aSrcLength and the return.
// The comments below describe only what can be seen.
133 NS_IMETHODIMP
nsGBKToUnicode::ConvertNoBuff(const char* aSrc
,
134 PRInt32
* aSrcLength
,
136 PRInt32
* aDestLength
)
// Snapshot the input length; iDestlen starts at zero and is the value
// reported back through *aDestLength.
139 PRInt32 iSrcLength
= (*aSrcLength
);
140 PRInt32 iDestlen
= 0;
144 for (i
=0;i
<iSrcLength
;i
++)
// Output buffer exhausted: report that more output space is needed.
146 if ( iDestlen
>= (*aDestLength
) )
148 rv
= NS_OK_UDEC_MOREOUTPUT
;
151 // The valid range for the 1st byte is [0x81,0xFE]
152 if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc
))
// A lead byte with no following byte in the input: ask for more input.
154 if(i
+1 >= iSrcLength
)
156 rv
= NS_OK_UDEC_MOREINPUT
;
159 // To make sure, the second byte has to be checked as well.
160 // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
161 if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc
[1]))
// Common 2-byte mapping first.
164 *aDest
= mUtil
.GBKCharToUnicode(aSrc
[0], aSrc
[1]);
165 if(UCS2_NO_MAPPING
== *aDest
)
167 // We cannot map in the common mapping, let's call the
168 // delegate 2 byte decoder to decode the gbk or gb18030 unique
170 if(! TryExtensionDecoder(aSrc
, aDest
))
172 *aDest
= UCS2_NO_MAPPING
;
178 else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc
[1]))
180 // from the first 2 bytes, it looks like a 4 byte GB18030
181 if(i
+3 >= iSrcLength
) // make sure we got 4 bytes
183 rv
= NS_OK_UDEC_MOREINPUT
;
// Layout of a 4-byte sequence:
187 // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
190 if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc
[2]) &&
191 LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc
[3]))
// Lead bytes below 0x90 go to the table-driven 4-byte decoder; lead bytes
// 0x90..0xFE go to DecodeToSurrogate().
193 if ( ! FIRST_BYTE_IS_SURROGATE(aSrc
[0]))
195 // let's call the delegated 4 byte gb18030 converter to convert it
196 if(! Try4BytesDecoder(aSrc
, aDest
))
197 *aDest
= UCS2_NO_MAPPING
;
199 // let's try supplement mapping
// NOTE(review): (iDestlen+1) <= *aDestLength only guarantees room for ONE
// PRUnichar at aDest, whereas nsGB18030ToUnicode::DecodeToSurrogate() writes
// two (aOut[0] and aOut[1]). Room for a pair would need
// iDestlen+1 < *aDestLength -- confirm whether a write one element past the
// output buffer is possible here.
200 NS_ASSERTION(( (iDestlen
+1) <= (*aDestLength
) ), "no enouth output memory");
201 if ( (iDestlen
+1) <= (*aDestLength
) )
203 if(DecodeToSurrogate(aSrc
, aDest
))
205 // a surrogate pair occupies two PRUnichars
// DecodeToSurrogate() declined: emit the no-mapping marker.
209 *aDest
= UCS2_NO_MAPPING
;
// Guard at 201 failed: emit the no-mapping marker.
212 *aDest
= UCS2_NO_MAPPING
;
// Third / fourth byte outside the 4-byte pattern: emit the no-mapping marker.
216 *aDest
= UCS2_NO_MAPPING
;
221 else if ((PRUint8
) aSrc
[0] == (PRUint8
)0xA0 )
223 // stand-alone (not followed by a valid second byte) 0xA0 !
224 // treat it as valid a la Netscape 4.x
225 *aDest
= CAST_CHAR_TO_UNICHAR(*aSrc
);
228 // Invalid GBK code point (second byte should be 0x40 or higher)
229 *aDest
= UCS2_NO_MAPPING
;
235 // The source byte is ASCII
236 *aDest
= CAST_CHAR_TO_UNICHAR(*aSrc
);
// Single byte that is not copied through above: IS_GBK_EURO is tested on it.
// (The value stored when the test succeeds is on a line not visible here.)
239 if(IS_GBK_EURO(*aSrc
)) {
242 *aDest
= UCS2_NO_MAPPING
;
// Report the value of iDestlen back to the caller.
251 *aDestLength
= iDestlen
;
// Sets mExtensionDecoder to a newly allocated nsGBKUnique2BytesToUnicode.
// Called from TryExtensionDecoder() when mExtensionDecoder is still null.
256 void nsGBKToUnicode::CreateExtensionDecoder()
258 mExtensionDecoder
= new nsGBKUnique2BytesToUnicode();
// Sets m4BytesDecoder to nsnull: this class installs no 4-byte decoder.
// nsGB18030ToUnicode::Create4BytesDecoder() installs a real one instead.
260 void nsGBKToUnicode::Create4BytesDecoder()
262 m4BytesDecoder
= nsnull
;
// Sets mExtensionDecoder to a newly allocated nsGB18030Unique2BytesToUnicode
// (the nsGBKToUnicode version uses nsGBKUnique2BytesToUnicode instead).
264 void nsGB18030ToUnicode::CreateExtensionDecoder()
266 mExtensionDecoder
= new nsGB18030Unique2BytesToUnicode();
// Sets m4BytesDecoder to a newly allocated nsGB18030Unique4BytesToUnicode
// (the nsGBKToUnicode version stores nsnull instead).
268 void nsGB18030ToUnicode::Create4BytesDecoder()
270 m4BytesDecoder
= new nsGB18030Unique4BytesToUnicode();
// Turns a 4-byte sequence at aSrc[0..3] into a UTF-16 surrogate pair written
// to aOut[0] (0xD800-based) and aOut[1] (0xDC00-based).
//
//   aSrc - four input bytes; each is validated against the byte-class macros.
//   aOut - output; TWO PRUnichars are written, so the caller must provide
//          room for both.
//
// NOTE(review): the statements executed when a validation `if` fires, and the
// final return, are on lines not visible in this view (presumably
// `return PR_FALSE` / `return PR_TRUE` -- confirm).
272 PRBool
nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc
, PRUnichar
* aOut
)
// Debug-build checks of the four bytes...
274 NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc
[0]), "illegal first byte");
275 NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc
[1]), "illegal second byte");
276 NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc
[2]), "illegal third byte");
277 NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc
[3]), "illegal forth byte");
// ...followed by the same four checks as run-time tests.
278 if(! FIRST_BYTE_IS_SURROGATE(aSrc
[0]))
280 if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc
[1]))
282 if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc
[2]))
284 if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc
[3]))
// Load the four bytes as unsigned values.
287 PRUint8 a1
= (PRUint8
) aSrc
[0];
288 PRUint8 a2
= (PRUint8
) aSrc
[1];
289 PRUint8 a3
= (PRUint8
) aSrc
[2];
290 PRUint8 a4
= (PRUint8
) aSrc
[3];
// Mixed-radix combination of the four bytes with radices 10, 126 and 10,
// matching the sizes of the second (0x30..0x39), third (0x81..0xFE) and
// fourth (0x30..0x39) byte ranges.
// NOTE(review): no rebasing of a1..a4 to the start of their ranges is visible
// between the loads above and this line (original lines 291-294 are not in
// this view) -- confirm it happens before idx is computed.
295 PRUint32 idx
= (((a1
* 10 + a2
) * 126 + a3
) * 10) + a4
;
// High surrogate takes bits 10..19 of idx, low surrogate takes bits 0..9.
// NOTE(review): the masks silently drop any bits of idx above bit 19; no
// range check on idx is visible here.
297 *aOut
++ = 0xD800 | (0x000003FF & (idx
>> 10));
298 *aOut
= 0xDC00 | (0x000003FF & idx
);
// Attempts to decode the 2-byte sequence at aSrc with the extension decoder,
// writing the result through aOut.
//
//   aSrc - input bytes handed to mExtensionDecoder->Convert().
//   aOut - output location handed to mExtensionDecoder->Convert().
//
// ConvertNoBuff() treats a false result as "no mapping".
// NOTE(review): the declarations of len and dstlen and the return statements
// are on lines not visible in this view.
302 PRBool
nsGBKToUnicode::TryExtensionDecoder(const char* aSrc
, PRUnichar
* aOut
)
// Create the decoder on first use.
304 if(!mExtensionDecoder
)
305 CreateExtensionDecoder();
306 NS_ASSERTION(mExtensionDecoder
, "cannot creqte 2 bytes unique converter");
// Only proceed when a decoder actually exists.
307 if(mExtensionDecoder
)
// Reset the decoder before each conversion; a Reset() failure is only
// asserted on, not handled.
309 nsresult res
= mExtensionDecoder
->Reset();
310 NS_ASSERTION(NS_SUCCEEDED(res
), "2 bytes unique conversoin reset failed");
313 res
= mExtensionDecoder
->Convert(aSrc
,&len
, aOut
, &dstlen
);
// A successful conversion is expected to have consumed 2 bytes and produced
// exactly 1 PRUnichar.
314 NS_ASSERTION(NS_FAILED(res
) || ((len
==2) && (dstlen
== 1)),
315 "some strange conversion result");
316 // On failure the caller (ConvertNoBuff) stores UCS2_NO_MAPPING itself,
317 // so the specific error code in res is not propagated.
318 if(NS_SUCCEEDED(res
))
// nsGBKToUnicode's version of DecodeToSurrogate(); nsGB18030ToUnicode defines
// its own version that builds a surrogate pair.
// NOTE(review): the body is not visible in this view. ConvertNoBuff() stores
// UCS2_NO_MAPPING when this returns false; presumably this version always
// returns PR_FALSE -- confirm.
323 PRBool
nsGBKToUnicode::DecodeToSurrogate(const char* aSrc
, PRUnichar
* aOut
)
// Attempts to decode the 4-byte sequence at aSrc with m4BytesDecoder, writing
// the result through aOut.
//
//   aSrc - input bytes handed to m4BytesDecoder->Convert().
//   aOut - output location handed to m4BytesDecoder->Convert().
//
// ConvertNoBuff() treats a false result as "no mapping".
// NOTE(review): the declarations of len and dstlen, any guards around the
// calls below and the return statements are on lines not visible in this
// view. nsGBKToUnicode::Create4BytesDecoder() leaves m4BytesDecoder as nsnull,
// so the dereferences below must be protected by a null check -- confirm.
327 PRBool
nsGBKToUnicode::Try4BytesDecoder(const char* aSrc
, PRUnichar
* aOut
)
330 Create4BytesDecoder();
// Reset the decoder before each conversion; a Reset() failure is only
// asserted on, not handled.
333 nsresult res
= m4BytesDecoder
->Reset();
334 NS_ASSERTION(NS_SUCCEEDED(res
), "4 bytes unique conversoin reset failed");
337 res
= m4BytesDecoder
->Convert(aSrc
,&len
, aOut
, &dstlen
);
// A successful conversion is expected to have consumed 4 bytes and produced
// exactly 1 PRUnichar.
338 NS_ASSERTION(NS_FAILED(res
) || ((len
==4) && (dstlen
== 1)),
339 "some strange conversion result");
340 // On failure the caller (ConvertNoBuff) stores UCS2_NO_MAPPING itself,
341 // so the specific error code in res is not propagated.
342 if(NS_SUCCEEDED(res
))