Bug 468575 - Scrape some gunk off the config/ grout, r=ted
[wine-gecko.git] / intl / uconv / src / nsUTF8ToUnicode.cpp
blob6563dbdc0e2aa0e5bd661dcb0188b0971381776a
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
15 * The Original Code is Mozilla Communicator client code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
22 * Contributor(s):
24 * Alternatively, the contents of this file may be used under the terms of
25 * either of the GNU General Public License Version 2 or later (the "GPL"),
26 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
38 #include "nsUCSupport.h"
39 #include "nsUTF8ToUnicode.h"
41 #define UNICODE_BYTE_ORDER_MARK 0xFEFF
43 NS_IMETHODIMP NS_NewUTF8ToUnicode(nsISupports* aOuter,
44 const nsIID& aIID,
45 void** aResult)
47 if (!aResult) {
48 return NS_ERROR_NULL_POINTER;
50 if (aOuter) {
51 *aResult = nsnull;
52 return NS_ERROR_NO_AGGREGATION;
54 nsUTF8ToUnicode * inst = new nsUTF8ToUnicode();
55 if (!inst) {
56 *aResult = nsnull;
57 return NS_ERROR_OUT_OF_MEMORY;
59 nsresult res = inst->QueryInterface(aIID, aResult);
60 if (NS_FAILED(res)) {
61 *aResult = nsnull;
62 delete inst;
64 return res;
67 //----------------------------------------------------------------------
68 // Class nsUTF8ToUnicode [implementation]
70 nsUTF8ToUnicode::nsUTF8ToUnicode()
71 : nsBasicDecoderSupport()
73 Reset();
//----------------------------------------------------------------------
// Subclassing of nsBasicDecoderSupport class [implementation]
79 /**
80 * Normally the maximum length of the output of the UTF8 decoder in UTF16
81 * code units is the same as the length of the input in UTF8 code units,
82 * since 1-byte, 2-byte and 3-byte UTF-8 sequences decode to a single
83 * UTF-16 character, and 4-byte UTF-8 sequences decode to a surrogate pair.
85 * However, there is an edge case where the output can be longer than the
86 * input: if the previous buffer ended with an incomplete multi-byte
87 * sequence and this buffer does not begin with a valid continuation
88 * byte, we will return NS_ERROR_UNEXPECTED and the caller may insert a
89 * replacement character in the output buffer which corresponds to no
90 * character in the input buffer. So in the worst case the destination
91 * will need to be one code unit longer than the source.
92 * See bug 301797.
94 NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,
95 PRInt32 aSrcLength,
96 PRInt32 * aDestLength)
98 *aDestLength = aSrcLength + 1;
99 return NS_OK;
103 //----------------------------------------------------------------------
104 // Subclassing of nsBasicDecoderSupport class [implementation]
106 NS_IMETHODIMP nsUTF8ToUnicode::Reset()
109 mUcs4 = 0; // cached Unicode character
110 mState = 0; // cached expected number of octets after the current octet
111 // until the beginning of the next UTF8 character sequence
112 mBytes = 1; // cached expected number of octets in the current sequence
113 mFirst = PR_TRUE;
115 return NS_OK;
119 //----------------------------------------------------------------------
120 // Subclassing of nsBasicDecoderSupport class [implementation]
123 NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
124 PRInt32 * aSrcLength,
125 PRUnichar * aDest,
126 PRInt32 * aDestLength)
128 PRUint32 aSrcLen = (PRUint32) (*aSrcLength);
129 PRUint32 aDestLen = (PRUint32) (*aDestLength);
131 const char *in, *inend;
132 inend = aSrc + aSrcLen;
134 PRUnichar *out, *outend;
135 outend = aDest + aDestLen;
137 nsresult res = NS_OK; // conversion result
139 // Set mFirst to PR_FALSE now so we don't have to every time through the ASCII
140 // branch within the loop.
141 if (mFirst && aSrcLen && (0 == (0x80 & (*aSrc))))
142 mFirst = PR_FALSE;
144 for (in = aSrc, out = aDest; ((in < inend) && (out < outend)); ++in) {
145 if (0 == mState) {
146 // When mState is zero we expect either a US-ASCII character or a
147 // multi-octet sequence.
148 if (0 == (0x80 & (*in))) {
149 // US-ASCII, pass straight through.
150 *out++ = (PRUnichar)*in;
151 mBytes = 1;
152 } else if (0xC0 == (0xE0 & (*in))) {
153 // First octet of 2 octet sequence
154 mUcs4 = (PRUint32)(*in);
155 mUcs4 = (mUcs4 & 0x1F) << 6;
156 mState = 1;
157 mBytes = 2;
158 } else if (0xE0 == (0xF0 & (*in))) {
159 // First octet of 3 octet sequence
160 mUcs4 = (PRUint32)(*in);
161 mUcs4 = (mUcs4 & 0x0F) << 12;
162 mState = 2;
163 mBytes = 3;
164 } else if (0xF0 == (0xF8 & (*in))) {
165 // First octet of 4 octet sequence
166 mUcs4 = (PRUint32)(*in);
167 mUcs4 = (mUcs4 & 0x07) << 18;
168 mState = 3;
169 mBytes = 4;
170 } else if (0xF8 == (0xFC & (*in))) {
171 /* First octet of 5 octet sequence.
173 * This is illegal because the encoded codepoint must be either
174 * (a) not the shortest form or
175 * (b) outside the Unicode range of 0-0x10FFFF.
176 * Rather than trying to resynchronize, we will carry on until the end
177 * of the sequence and let the later error handling code catch it.
179 mUcs4 = (PRUint32)(*in);
180 mUcs4 = (mUcs4 & 0x03) << 24;
181 mState = 4;
182 mBytes = 5;
183 } else if (0xFC == (0xFE & (*in))) {
184 // First octet of 6 octet sequence, see comments for 5 octet sequence.
185 mUcs4 = (PRUint32)(*in);
186 mUcs4 = (mUcs4 & 1) << 30;
187 mState = 5;
188 mBytes = 6;
189 } else {
190 /* Current octet is neither in the US-ASCII range nor a legal first
191 * octet of a multi-octet sequence.
193 * Return an error condition. Caller is responsible for flushing and
194 * refilling the buffer and resetting state.
196 res = NS_ERROR_UNEXPECTED;
197 break;
199 } else {
200 // When mState is non-zero, we expect a continuation of the multi-octet
201 // sequence
202 if (0x80 == (0xC0 & (*in))) {
203 // Legal continuation.
204 PRUint32 shift = (mState - 1) * 6;
205 PRUint32 tmp = *in;
206 tmp = (tmp & 0x0000003FL) << shift;
207 mUcs4 |= tmp;
209 if (0 == --mState) {
210 /* End of the multi-octet sequence. mUcs4 now contains the final
211 * Unicode codepoint to be output
213 * Check for illegal sequences and codepoints.
216 // From Unicode 3.1, non-shortest form is illegal
217 if (((2 == mBytes) && (mUcs4 < 0x0080)) ||
218 ((3 == mBytes) && (mUcs4 < 0x0800)) ||
219 ((4 == mBytes) && (mUcs4 < 0x10000)) ||
220 (4 < mBytes) ||
221 // From Unicode 3.2, surrogate characters are illegal
222 ((mUcs4 & 0xFFFFF800) == 0xD800) ||
223 // Codepoints outside the Unicode range are illegal
224 (mUcs4 > 0x10FFFF)) {
225 res = NS_ERROR_UNEXPECTED;
226 break;
228 if (mUcs4 > 0xFFFF) {
229 // mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair
230 mUcs4 -= 0x00010000;
231 *out++ = 0xD800 | (0x000003FF & (mUcs4 >> 10));
232 *out++ = 0xDC00 | (0x000003FF & mUcs4);
233 } else if (UNICODE_BYTE_ORDER_MARK != mUcs4 || !mFirst) {
234 // Don't output the BOM only if it is the first character
235 *out++ = mUcs4;
237 //initialize UTF8 cache
238 mUcs4 = 0;
239 mState = 0;
240 mBytes = 1;
241 mFirst = PR_FALSE;
243 } else {
244 /* ((0xC0 & (*in) != 0x80) && (mState != 0))
246 * Incomplete multi-octet sequence. Unconsume this
247 * octet and return an error condition. Caller is responsible
248 * for flushing and refilling the buffer and resetting state.
250 in--;
251 res = NS_ERROR_UNEXPECTED;
252 break;
257 // output not finished, output buffer too short
258 if ((NS_OK == res) && (in < inend) && (out >= outend))
259 res = NS_OK_UDEC_MOREOUTPUT;
261 // last UCS4 is incomplete, make sure the caller
262 // returns with properly aligned continuation of the buffer
263 if ((NS_OK == res) && (mState != 0))
264 res = NS_OK_UDEC_MOREINPUT;
266 *aSrcLength = in - aSrc;
267 *aDestLength = out - aDest;
269 return(res);