1 ######################## BEGIN LICENSE BLOCK ########################
2 # The Original Code is Mozilla Communicator client code.
4 # The Initial Developer of the Original Code is
5 # Netscape Communications Corporation.
6 # Portions created by the Initial Developer are Copyright (C) 1998
7 # the Initial Developer. All Rights Reserved.
10 # Mark Pilgrim - port to Python
12 # This library is free software; you can redistribute it and/or
13 # modify it under the terms of the GNU Lesser General Public
14 # License as published by the Free Software Foundation; either
15 # version 2.1 of the License, or (at your option) any later version.
17 # This library is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 # Lesser General Public License for more details.
22 # You should have received a copy of the GNU Lesser General Public
23 # License along with this library; if not, write to the Free Software
24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 ######################### END LICENSE BLOCK #########################
29 from euctwfreq
import EUCTWCharToFreqOrder
, EUCTW_TABLE_SIZE
, EUCTW_TYPICAL_DISTRIBUTION_RATIO
30 from euckrfreq
import EUCKRCharToFreqOrder
, EUCKR_TABLE_SIZE
, EUCKR_TYPICAL_DISTRIBUTION_RATIO
31 from gb2312freq
import GB2312CharToFreqOrder
, GB2312_TABLE_SIZE
, GB2312_TYPICAL_DISTRIBUTION_RATIO
32 from big5freq
import Big5CharToFreqOrder
, BIG5_TABLE_SIZE
, BIG5_TYPICAL_DISTRIBUTION_RATIO
33 from jisfreq
import JISCharToFreqOrder
, JIS_TABLE_SIZE
, JIS_TYPICAL_DISTRIBUTION_RATIO
# Number of counted two-byte characters above which
# CharDistributionAnalysis.got_enough_data() reports that enough input has
# been seen to draw a conclusion.
ENOUGH_DATA_THRESHOLD = 1024
class CharDistributionAnalysis:
    """Base class for character-distribution analysers of two-byte encodings.

    The analyser counts the two-byte characters fed to it, tracks how many of
    them are "frequent" according to a per-language frequency table, and
    turns that ratio into a confidence value.  Subclasses provide the table,
    its size, the typical distribution ratio and an encoding-specific
    get_order() implementation.
    """

    # Upper and lower bounds of the reported confidence (we never want to be
    # 100% sure either way).
    # NOTE(review): the definitions of these constants are not visible in this
    # part of the file; the values follow the Mozilla reference
    # implementation -- confirm.
    _SURE_YES = 0.99
    _SURE_NO = 0.01

    def __init__(self):
        # Mapping table to get frequency order from char order (the value
        # returned by get_order()).
        self._mCharToFreqOrder = None
        # Size of the above table.
        self._mTableSize = None
        # A constant which varies from language to language, used in
        # calculating confidence.  See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self._mTypicalDistributionRatio = None
        self.reset()

    def reset(self):
        """reset analyser, clear any state"""
        # If this flag is set to True, detection is done and a conclusion has
        # been made.  The builtin is used because `constants.False` is not
        # valid syntax on Python 3.
        self._mDone = False
        # Total characters encountered.
        self._mTotalChars = 0
        # The number of characters whose frequency order is less than 512.
        self._mFreqChars = 0

    def feed(self, aStr, aCharLen):
        """feed a character with known length

        aStr     -- the encoded character (its first two bytes are examined)
        aCharLen -- length in bytes of the character; anything other than 2
                    is ignored
        """
        if aCharLen == 2:
            # we only care about 2-bytes character in our distribution analysis
            order = self.get_order(aStr)
        else:
            order = -1

        if order >= 0:
            self._mTotalChars += 1
            # order is valid: count it as frequent when it is covered by the
            # table and ranks among the 512 most frequent characters
            if order < self._mTableSize:
                if 512 > self._mCharToFreqOrder[order]:
                    self._mFreqChars += 1

    def get_confidence(self):
        """return confidence based on existing data

        The result is a float no greater than 0.99.
        """
        # if we didn't receive any character in our consideration range,
        # return negative answer
        if self._mTotalChars <= 0:
            return self._SURE_NO

        if self._mTotalChars != self._mFreqChars:
            # float() keeps this a true division on Python 2 even if the
            # ratio were an integer.
            r = float(self._mFreqChars) / (
                (self._mTotalChars - self._mFreqChars)
                * self._mTypicalDistributionRatio)
            if r < self._SURE_YES:
                return r

        # normalize confidence (we don't want to be 100% sure)
        return self._SURE_YES

    def got_enough_data(self):
        """Return True once more than ENOUGH_DATA_THRESHOLD characters were counted."""
        # It is not necessary to receive all data to draw conclusion. For
        # charset detection, certain amount of data is enough
        return self._mTotalChars > ENOUGH_DATA_THRESHOLD

    def get_order(self, aStr):
        """Return the frequency-table index for aStr, or a negative value.

        The base implementation recognises nothing and always returns -1;
        subclasses override it.
        """
        # We do not handle characters based on the original encoding string,
        # but convert this encoding string to a number, here called order.
        # This allows multiple encodings of a language to share one frequency
        # table.
        return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser for EUC-TW encoded text."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = EUCTWCharToFreqOrder
        self._mTableSize = EUCTW_TABLE_SIZE
        self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte EUC-TW character.

        aStr holds the two bytes of the character, either as a string of
        one-character items or as a byte sequence whose items are ints.
        Returns -1 when the first byte is below 0xC4.
        """
        # for euc-TW encoding, we are interested
        #   first  byte range: 0xc4 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        codes = [c if isinstance(c, int) else ord(c) for c in aStr[:2]]
        if codes[0] >= 0xC4:
            # 94 second-byte values (0xa1..0xfe) per first byte
            return 94 * (codes[0] - 0xC4) + codes[1] - 0xA1
        # NOTE(review): the fallback branch is not visible in this part of
        # the file; -1 is assumed to be the "not in range" marker -- confirm.
        return -1
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser for EUC-KR encoded text."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = EUCKRCharToFreqOrder
        self._mTableSize = EUCKR_TABLE_SIZE
        self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte EUC-KR character.

        aStr holds the two bytes of the character, either as a string of
        one-character items or as a byte sequence whose items are ints.
        Returns -1 when the first byte is below 0xB0.
        """
        # for euc-KR encoding, we are interested
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        codes = [c if isinstance(c, int) else ord(c) for c in aStr[:2]]
        if codes[0] >= 0xB0:
            # 94 second-byte values (0xa1..0xfe) per first byte
            return 94 * (codes[0] - 0xB0) + codes[1] - 0xA1
        # NOTE(review): the fallback branch is not visible in this part of
        # the file; -1 is assumed to be the "not in range" marker -- confirm.
        return -1
class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser for GB2312 encoded text."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = GB2312CharToFreqOrder
        self._mTableSize = GB2312_TABLE_SIZE
        self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte GB2312 character.

        aStr holds the two bytes of the character, either as a string of
        one-character items or as a byte sequence whose items are ints.
        Returns -1 unless the first byte is at least 0xB0 and the second
        byte is at least 0xA1.
        """
        # for GB2312 encoding, we are interested
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        codes = [c if isinstance(c, int) else ord(c) for c in aStr[:2]]
        if (codes[0] >= 0xB0) and (codes[1] >= 0xA1):
            # 94 second-byte values (0xa1..0xfe) per first byte
            return 94 * (codes[0] - 0xB0) + codes[1] - 0xA1
        # NOTE(review): the fallback branch is not visible in this part of
        # the file; -1 is assumed to be the "not in range" marker -- confirm.
        return -1
class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser for Big5 encoded text."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = Big5CharToFreqOrder
        self._mTableSize = BIG5_TABLE_SIZE
        self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte Big5 character.

        aStr holds the two bytes of the character, either as a string of
        one-character items or as a byte sequence whose items are ints.
        Returns -1 when the first byte is below 0xA4.
        """
        # for big5 encoding, we are interested
        #   first  byte range: 0xa4 -- 0xfe
        #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        codes = [c if isinstance(c, int) else ord(c) for c in aStr[:2]]
        if codes[0] >= 0xA4:
            row = 157 * (codes[0] - 0xA4)
            if codes[1] >= 0xA1:
                # upper second-byte range follows the 63 values 0x40..0x7e
                return row + codes[1] - 0xA1 + 63
            return row + codes[1] - 0x40
        # NOTE(review): the fallback branch is not visible in this part of
        # the file; -1 is assumed to be the "not in range" marker -- confirm.
        return -1
class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser for Shift_JIS encoded text."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte Shift_JIS character.

        aStr holds the two bytes of the character, either as a string of
        one-character items or as a byte sequence whose items are ints.
        Returns -1 when the first byte is outside 0x81..0x9F and 0xE0..0xEF.
        """
        # for sjis encoding, we are interested
        #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
        #   second byte range: 0x40 -- 0x7e,  0x81 -- 0xfe
        # no validation needed here. State machine has done that
        # (only first bytes up to 0xEF are mapped by the code below)
        codes = [c if isinstance(c, int) else ord(c) for c in aStr[:2]]
        lead = codes[0]
        if 0x81 <= lead <= 0x9F:
            order = 188 * (lead - 0x81)
        elif 0xE0 <= lead <= 0xEF:
            # the 31 first bytes 0x81..0x9f come before this range
            order = 188 * (lead - 0xE0 + 31)
        else:
            # NOTE(review): this branch is not visible in this part of the
            # file; -1 is assumed to be the "not in range" marker -- confirm.
            return -1
        order = order + codes[1] - 0x40
        # NOTE(review): the statements after the offset above are not visible
        # in this part of the file.  0x7F is skipped in the second-byte
        # range, so second bytes above it are shifted down by one, matching
        # the Mozilla C++ reference implementation -- confirm.
        if codes[1] > 0x7F:
            order -= 1
        return order
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Character-distribution analyser for EUC-JP encoded text."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte EUC-JP character.

        aStr holds the two bytes of the character, either as a string of
        one-character items or as a byte sequence whose items are ints.
        Returns -1 when the first byte is below 0xA0.
        """
        # for euc-JP encoding, we are interested
        #   first  byte range: 0xa0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        codes = [c if isinstance(c, int) else ord(c) for c in aStr[:2]]
        if codes[0] >= 0xA0:
            # The row offset is taken from 0xA1, so a first byte of exactly
            # 0xA0 yields a negative result.
            return 94 * (codes[0] - 0xA1) + codes[1] - 0xA1
        # NOTE(review): the fallback branch is not visible in this part of
        # the file; -1 is assumed to be the "not in range" marker -- confirm.
        return -1