Bug 470455 - test_database_sync_embed_visits.js leaks, r=sdwilsh
[wine-gecko.git] / extensions / universalchardet / src / base / nsUniversalDetector.cpp
blob0a3a79fabd0a27de859f9bea744f47295b9e9610
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
15 * The Original Code is Mozilla Universal charset detector code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 2001
20 * the Initial Developer. All Rights Reserved.
22 * Contributor(s):
23 * Shy Shalom <shooshX@gmail.com>
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
37 * ***** END LICENSE BLOCK ***** */
39 #include "nscore.h"
41 #include "nsUniversalDetector.h"
43 #include "nsMBCSGroupProber.h"
44 #include "nsSBCSGroupProber.h"
45 #include "nsEscCharsetProber.h"
46 #include "nsLatin1Prober.h"
48 nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
50 mDone = PR_FALSE;
51 mBestGuess = -1; //illegal value as signal
52 mInTag = PR_FALSE;
53 mEscCharSetProber = nsnull;
55 mStart = PR_TRUE;
56 mDetectedCharset = nsnull;
57 mGotData = PR_FALSE;
58 mInputState = ePureAscii;
59 mLastChar = '\0';
60 mLanguageFilter = aLanguageFilter;
62 PRUint32 i;
63 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
64 mCharSetProbers[i] = nsnull;
67 nsUniversalDetector::~nsUniversalDetector()
69 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
70 if (mCharSetProbers[i])
71 delete mCharSetProbers[i];
72 if (mEscCharSetProber)
73 delete mEscCharSetProber;
76 void
77 nsUniversalDetector::Reset()
79 mDone = PR_FALSE;
80 mBestGuess = -1; //illegal value as signal
81 mInTag = PR_FALSE;
83 mStart = PR_TRUE;
84 mDetectedCharset = nsnull;
85 mGotData = PR_FALSE;
86 mInputState = ePureAscii;
87 mLastChar = '\0';
89 if (mEscCharSetProber)
90 mEscCharSetProber->Reset();
92 PRUint32 i;
93 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
94 if (mCharSetProbers[i])
95 mCharSetProbers[i]->Reset();
//---------------------------------------------------------------------
// Confidence thresholds, expressed as float values.
// MINIMUM_THRESHOLD: DataEnd() reports a prober's charset only when the best
// confidence is strictly greater than this value.
// SHORTCUT_THRESHOLD: not referenced by the code in this part of the file.
#define SHORTCUT_THRESHOLD      static_cast<float>(0.95)
#define MINIMUM_THRESHOLD       static_cast<float>(0.20)
102 nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
104 if(mDone)
105 return NS_OK;
107 if (aLen > 0)
108 mGotData = PR_TRUE;
110 //If the data starts with BOM, we know it is UTF
111 if (mStart)
113 mStart = PR_FALSE;
114 if (aLen > 3)
115 switch (aBuf[0])
117 case '\xEF':
118 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
119 // EF BB BF UTF-8 encoded BOM
120 mDetectedCharset = "UTF-8";
121 break;
122 case '\xFE':
123 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
124 // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
125 mDetectedCharset = "X-ISO-10646-UCS-4-3412";
126 else if ('\xFF' == aBuf[1])
127 // FE FF UTF-16, big endian BOM
128 mDetectedCharset = "UTF-16";
129 break;
130 case '\x00':
131 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
132 // 00 00 FE FF UTF-32, big-endian BOM
133 mDetectedCharset = "UTF-32";
134 else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
135 // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
136 mDetectedCharset = "X-ISO-10646-UCS-4-2143";
137 break;
138 case '\xFF':
139 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
140 // FF FE 00 00 UTF-32, little-endian BOM
141 mDetectedCharset = "UTF-32";
142 else if ('\xFE' == aBuf[1])
143 // FF FE UTF-16, little endian BOM
144 mDetectedCharset = "UTF-16";
145 break;
146 } // switch
148 if (mDetectedCharset)
150 mDone = PR_TRUE;
151 return NS_OK;
155 PRUint32 i;
156 for (i = 0; i < aLen; i++)
158 //other than 0xa0, if every othe character is ascii, the page is ascii
159 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
161 //we got a non-ascii byte (high-byte)
162 if (mInputState != eHighbyte)
164 //adjust state
165 mInputState = eHighbyte;
167 //kill mEscCharSetProber if it is active
168 if (mEscCharSetProber) {
169 delete mEscCharSetProber;
170 mEscCharSetProber = nsnull;
173 //start multibyte and singlebyte charset prober
174 if (nsnull == mCharSetProbers[0])
176 mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
177 if (nsnull == mCharSetProbers[0])
178 return NS_ERROR_OUT_OF_MEMORY;
180 if (nsnull == mCharSetProbers[1] &&
181 (mLanguageFilter & NS_FILTER_NON_CJK))
183 mCharSetProbers[1] = new nsSBCSGroupProber;
184 if (nsnull == mCharSetProbers[1])
185 return NS_ERROR_OUT_OF_MEMORY;
187 mCharSetProbers[2] = new nsLatin1Prober;
188 if (nsnull == mCharSetProbers[2])
189 return NS_ERROR_OUT_OF_MEMORY;
192 else
194 //ok, just pure ascii so far
195 if ( ePureAscii == mInputState &&
196 (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
198 //found escape character or HZ "~{"
199 mInputState = eEscAscii;
201 mLastChar = aBuf[i];
205 nsProbingState st;
206 switch (mInputState)
208 case eEscAscii:
209 if (nsnull == mEscCharSetProber) {
210 mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
211 if (nsnull == mEscCharSetProber)
212 return NS_ERROR_OUT_OF_MEMORY;
214 st = mEscCharSetProber->HandleData(aBuf, aLen);
215 if (st == eFoundIt)
217 mDone = PR_TRUE;
218 mDetectedCharset = mEscCharSetProber->GetCharSetName();
220 break;
221 case eHighbyte:
222 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
224 if (mCharSetProbers[i])
226 st = mCharSetProbers[i]->HandleData(aBuf, aLen);
227 if (st == eFoundIt)
229 mDone = PR_TRUE;
230 mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
231 return NS_OK;
235 break;
237 default: //pure ascii
238 ;//do nothing here
240 return NS_OK;
244 //---------------------------------------------------------------------
245 void nsUniversalDetector::DataEnd()
247 if (!mGotData)
249 // we haven't got any data yet, return immediately
250 // caller program sometimes call DataEnd before anything has been sent to detector
251 return;
254 if (mDetectedCharset)
256 mDone = PR_TRUE;
257 Report(mDetectedCharset);
258 return;
261 switch (mInputState)
263 case eHighbyte:
265 float proberConfidence;
266 float maxProberConfidence = (float)0.0;
267 PRInt32 maxProber = 0;
269 for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
271 if (mCharSetProbers[i])
273 proberConfidence = mCharSetProbers[i]->GetConfidence();
274 if (proberConfidence > maxProberConfidence)
276 maxProberConfidence = proberConfidence;
277 maxProber = i;
281 //do not report anything because we are not confident of it, that's in fact a negative answer
282 if (maxProberConfidence > MINIMUM_THRESHOLD)
283 Report(mCharSetProbers[maxProber]->GetCharSetName());
285 break;
286 case eEscAscii:
287 break;
288 default:
291 return;