Bug 468575 - Scrape some gunk off the config/ grout, r=ted
[wine-gecko.git] / intl / chardet / tests / DetectCharset.cpp
blob4668dd0cffdc52d15ac3596c2d330d3f7aac95e0
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
15 * The Original Code is mozilla.org code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
22 * Contributor(s):
23 * Pierre Phaneuf <pp@ludusdesign.com>
25 * Alternatively, the contents of this file may be used under the terms of
26 * either of the GNU General Public License Version 2 or later (the "GPL"),
27 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
37 * ***** END LICENSE BLOCK ***** */
38 #include "nsISupports.h"
39 #include "nsIComponentManager.h"
40 #include "nsICharsetDetector.h"
41 #include "nsICharsetDetectionObserver.h"
42 #include "nsComponentManagerUtils.h"
43 #include "nsServiceManagerUtils.h"
45 #include <stdio.h>
46 #include <stdlib.h>
47 #if defined(XP_WIN) || defined(XP_OS2)
48 #include <io.h>
49 #endif
50 #ifdef XP_UNIX
51 #include <unistd.h>
52 #endif
55 class nsStatis {
56 public:
57 nsStatis() { };
58 virtual ~nsStatis() { };
59 virtual PRBool HandleData(const char* aBuf, PRUint32 aLen) = 0;
60 virtual void DataEnd() = 0;
61 virtual void Report()=0;
64 class nsBaseStatis : public nsStatis {
65 public:
66 nsBaseStatis(unsigned char aL, unsigned char aH, float aR) ;
67 virtual ~nsBaseStatis() {};
68 virtual PRBool HandleData(const char* aBuf, PRUint32 aLen);
69 virtual void DataEnd() ;
70 virtual void Report();
71 protected:
72 unsigned char mLWordHi;
73 unsigned char mLWordLo;
74 private:
75 PRUint32 mNumOf2Bytes;
76 PRUint32 mNumOfLChar;
77 PRUint32 mNumOfLWord;
78 PRUint32 mLWordLength;
79 PRUint32 mLWordLen[10];
80 float mR;
81 PRBool mTailByte;
82 PRBool mLastLChar;
84 nsBaseStatis::nsBaseStatis(unsigned char aL, unsigned char aH, float aR)
86 mNumOf2Bytes = mNumOfLWord = mLWordLength = mNumOfLChar= 0;
87 mTailByte = mLastLChar = PR_FALSE;
88 for(PRUint32 i =0;i < 20; i++)
89 mLWordLen[i] = 0;
90 mLWordHi = aH;
91 mLWordLo = aL;
92 mR = aR;
94 PRBool nsBaseStatis::HandleData(const char* aBuf, PRUint32 aLen)
96 for(PRUint32 i=0; i < aLen; i++)
98 if(mTailByte)
99 mTailByte = PR_FALSE;
100 else
102 mTailByte = (0x80 == ( aBuf[i] & 0x80));
103 if(mTailByte)
105 mNumOf2Bytes++;
106 unsigned char a = (unsigned char) aBuf[i];
107 PRBool thisLChar = (( mLWordLo <= a) && (a <= mLWordHi));
108 if(thisLChar)
110 mNumOfLChar++;
111 mLWordLength++;
112 } else {
113 if(mLastLChar) {
114 mNumOfLWord++;
115 mLWordLen[ (mLWordLength > 10) ? 9 : (mLWordLength-1)]++;
116 mLWordLength =0 ;
119 mLastLChar = thisLChar;
120 } else {
121 if(mLastLChar) {
122 mNumOfLWord++;
123 mLWordLen[ (mLWordLength > 10) ? 9 : (mLWordLength-1)]++;
124 mLWordLength =0 ;
125 mLastLChar = PR_FALSE;
130 return PR_TRUE;
132 void nsBaseStatis::DataEnd()
134 if(mLastLChar) {
135 mNumOfLWord++;
136 mLWordLen[ (mLWordLength > 10) ? 9 : (mLWordLength-1)]++;
139 void nsBaseStatis::Report()
141 if(mNumOf2Bytes > 0)
144 printf("LChar Ratio = %d : %d ( %5.3f)\n",
145 mNumOfLChar,
146 mNumOf2Bytes,
147 ((float)mNumOfLChar / (float)mNumOf2Bytes) * 100);
149 float rate = (float) mNumOfLChar / (float) mNumOf2Bytes;
150 float delta = (rate - mR) / mR;
151 delta *= delta * 1000;
152 #ifdef EXPERIMENT
153 printf("Exp = %f \n",delta);
154 #endif
159 if(mNumOfLChar > 0)
160 printf("LWord Word = %d : %d (%5.3f)\n",
161 mNumOfLWord,
162 mNumOfLChar,
163 ((float)mNumOfLWord / (float)mNumOfLChar) * 100);
164 if(mNumOfLWord > 0)
166 PRUint32 ac =0;
167 for(PRUint32 i=0;i<10;i++)
169 ac += mLWordLen[i];
170 printf("LWord Word Length[%d]= %d -> %5.3f%% %5.3f%%\n", i+1,
171 mLWordLen[i],
172 (((float)mLWordLen[i] / (float)mNumOfLWord) * 100),
173 (((float)ac / (float)mNumOfLWord) * 100));
180 class nsSimpleStatis : public nsStatis {
181 public:
182 nsSimpleStatis(unsigned char aL, unsigned char aH, float aR,const char* aCharset) ;
183 virtual ~nsSimpleStatis() {};
184 virtual PRBool HandleData(const char* aBuf, PRUint32 aLen);
185 virtual void DataEnd() ;
186 virtual void Report();
187 protected:
188 unsigned char mLWordHi;
189 unsigned char mLWordLo;
190 private:
191 PRUint32 mNumOf2Bytes;
192 PRUint32 mNumOfLChar;
193 float mR;
194 const char* mCharset;
195 PRBool mTailByte;
197 nsSimpleStatis::nsSimpleStatis(unsigned char aL, unsigned char aH, float aR, const char* aCharset)
199 mNumOf2Bytes = mNumOfLChar= 0;
200 mTailByte = PR_FALSE;
201 mLWordHi = aH;
202 mLWordLo = aL;
203 mR = aR;
204 mCharset = aCharset;
206 PRBool nsSimpleStatis::HandleData(const char* aBuf, PRUint32 aLen)
208 for(PRUint32 i=0; i < aLen; i++)
210 if(mTailByte)
211 mTailByte = PR_FALSE;
212 else
214 mTailByte = (0x80 == ( aBuf[i] & 0x80));
215 if(mTailByte)
217 mNumOf2Bytes++;
218 unsigned char a = (unsigned char) aBuf[i];
219 PRBool thisLChar = (( mLWordLo <= a) && (a <= mLWordHi));
220 if(thisLChar)
221 mNumOfLChar++;
225 return PR_TRUE;
227 void nsSimpleStatis::DataEnd()
230 void nsSimpleStatis::Report()
232 if(mNumOf2Bytes > 0)
234 float rate = (float) mNumOfLChar / (float) mNumOf2Bytes;
235 float delta = (rate - mR) / mR;
236 delta = delta * delta * (float)100;
237 #ifdef EXPERIMENT
238 printf("Exp = %f \n",delta);
239 if(delta < 1.0)
240 printf("This is %s\n" ,mCharset);
241 #endif
245 //==========================================================
// Largest accepted block size; also the size of the read buffer in main().
#define MAXBSIZE (1L << 13)

/**
 * Print the command-line synopsis, the list of detector names and the
 * accepted block size range to stdout.
 */
void usage() {
  // Detector names, printed as one comma-terminated list.
  static const char kDetectors[] =
    "ja_parallel_state_machine,"
    "ko_parallel_state_machine,"
    "zhcn_parallel_state_machine,"
    "zhtw_parallel_state_machine,"
    "zh_parallel_state_machine,"
    "cjk_parallel_state_machine,"
    "ruprob,"
    "ukprob,";

  printf("Usage: DetectFile detector blocksize\n"
         " detector: %s"
         "\n blocksize: 1 ~ %ld\n"
         " Data are passed in from STDIN\n",
         kDetectors, MAXBSIZE);
}
266 class nsReporter : public nsICharsetDetectionObserver
268 NS_DECL_ISUPPORTS
269 public:
270 nsReporter() { };
271 virtual ~nsReporter() { };
273 NS_IMETHOD Notify(const char* aCharset, nsDetectionConfident aConf)
275 printf("RESULT CHARSET : %s\n", aCharset);
276 printf("RESULT Confident : %d\n", aConf);
277 return NS_OK;
282 NS_IMPL_ISUPPORTS1(nsReporter, nsICharsetDetectionObserver)
284 nsresult GetDetector(const char* key, nsICharsetDetector** det)
286 char buf[128];
287 strcpy(buf, NS_CHARSET_DETECTOR_CONTRACTID_BASE);
288 strcat(buf, key);
289 return CallCreateInstance(buf, det);
293 nsresult GetObserver(nsICharsetDetectionObserver** aRes)
295 *aRes = nsnull;
296 nsReporter* rep = new nsReporter();
297 if(rep) {
298 return rep->QueryInterface(NS_GET_IID(nsICharsetDetectionObserver) ,
299 (void**)aRes);
301 return NS_ERROR_OUT_OF_MEMORY;
304 int main(int argc, char** argv) {
305 char buf[MAXBSIZE];
306 PRUint32 bs;
307 if( 3 != argc )
309 usage();
310 printf("Need 2 arguments\n");
311 return(-1);
313 bs = atoi(argv[2]);
314 if((bs <1)||(bs>MAXBSIZE))
316 usage();
317 printf("blocksize out of range - %s\n", argv[2]);
318 return(-1);
320 nsresult rev = NS_OK;
321 nsICharsetDetector *det = nsnull;
322 rev = GetDetector(argv[1], &det);
323 if(NS_FAILED(rev) || (nsnull == det) ){
324 usage();
325 printf("Invalid Detector - %s\n", argv[1]);
326 printf("XPCOM ERROR CODE = %x\n", rev);
327 return(-1);
329 nsICharsetDetectionObserver *obs = nsnull;
330 rev = GetObserver(&obs);
331 if(NS_SUCCEEDED(rev)) {
332 rev = det->Init(obs);
333 NS_IF_RELEASE(obs);
334 if(NS_FAILED(rev))
336 printf("XPCOM ERROR CODE = %x\n", rev);
337 return(-1);
339 } else {
340 printf("XPCOM ERROR CODE = %x\n", rev);
341 return(-1);
344 size_t sz;
345 PRBool done = PR_FALSE;
346 nsSimpleStatis ks(0xb0,0xc8, (float)0.95952, "EUC-KR");
347 nsSimpleStatis js(0xa4,0xa5, (float)0.45006, "EUC-JP");
348 nsStatis* stat[2] = {&ks, &js};
349 PRUint32 i;
352 sz = read(0, buf, bs);
353 if(sz > 0) {
354 if(! done) {
355 #ifdef DEBUG_DetectCharset
356 printf("call DoIt %d\n",sz);
357 #endif
358 rev = det->DoIt( buf, sz, &done);
359 #ifdef DEBUG_DetectCharset
360 printf("DoIt return Done = %d\n",done);
361 #endif
362 if(NS_FAILED(rev))
364 printf("XPCOM ERROR CODE = %x\n", rev);
365 return(-1);
368 for(i=0;i<2;i++)
369 stat[i]->HandleData(buf, sz);
371 // } while((sz > 0) && (!done) );
372 } while(sz > 0);
373 if(!done)
375 #ifdef DEBUG_DetectCharset
376 printf("Done = %d\n",done);
377 printf("call Done %d\n",sz);
378 #endif
379 rev = det->Done();
380 if(NS_FAILED(rev))
382 printf("XPCOM ERROR CODE = %x\n", rev);
383 return(-1);
386 for(i=0;i<2;i++) {
387 stat[i]->DataEnd();
388 stat[i]->Report();
390 #ifdef DEBUG_DetectCharset
391 printf( "Done\n");
392 #endif
394 NS_IF_RELEASE(det);
395 #ifdef DEBUG_DetectCharset
396 printf( "Done 2\n");
397 #endif
398 return (0);