Catch the exception if decoding failed.
[pymailheaders.git] / chardet / chardistribution.py
blob b893341845ba53a403abda8160950430cc173aad
1 ######################## BEGIN LICENSE BLOCK ########################
2 # The Original Code is Mozilla Communicator client code.
3 #
4 # The Initial Developer of the Original Code is
5 # Netscape Communications Corporation.
6 # Portions created by the Initial Developer are Copyright (C) 1998
7 # the Initial Developer. All Rights Reserved.
8 #
9 # Contributor(s):
10 # Mark Pilgrim - port to Python
12 # This library is free software; you can redistribute it and/or
13 # modify it under the terms of the GNU Lesser General Public
14 # License as published by the Free Software Foundation; either
15 # version 2.1 of the License, or (at your option) any later version.
17 # This library is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 # Lesser General Public License for more details.
22 # You should have received a copy of the GNU Lesser General Public
23 # License along with this library; if not, write to the Free Software
24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 # 02110-1301 USA
26 ######################### END LICENSE BLOCK #########################
28 import constants
29 from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
30 from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
31 from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
32 from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
33 from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
# Confidence bounds returned by CharDistributionAnalysis.get_confidence().
SURE_NO = 0.01   # returned when no character in the considered range was seen
SURE_YES = 0.99  # upper bound on the confidence (never report 100% sure)

# got_enough_data() is true once more than this many characters were counted.
ENOUGH_DATA_THRESHOLD = 1024
class CharDistributionAnalysis:
    """Base class for frequency-distribution analysis of 2-byte characters.

    Subclasses fill in the frequency table, its size and the typical
    distribution ratio, and override get_order() to turn a 2-byte character
    into its index ("order") in that table.
    """

    def __init__(self):
        # Mapping table to get frequency order from char order (the value
        # returned by get_order()).
        self._mCharToFreqOrder = None
        # Size of the table above.
        self._mTableSize = None
        # Constant which varies from language to language, used in calculating
        # confidence. See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self._mTypicalDistributionRatio = None
        self.reset()

    def reset(self):
        """reset analyser, clear any state"""
        # If this flag is set to True, detection is done and a conclusion has
        # been made. The builtin False is used instead of constants.False,
        # because `False` is a keyword on Python 3 and `constants.False` is a
        # SyntaxError there.
        self._mDone = False
        # Total characters encountered.
        self._mTotalChars = 0
        # The number of characters whose frequency order is less than 512.
        self._mFreqChars = 0

    def feed(self, aStr, aCharLen):
        """feed a character with known length

        aStr is the character and aCharLen its length in bytes. Only 2-byte
        characters for which get_order() returns a non-negative order are
        counted.
        """
        # We only care about 2-byte characters in our distribution analysis.
        if aCharLen != 2:
            return
        order = self.get_order(aStr)
        if order < 0:
            return
        # Order is valid: the character counts towards the total even when it
        # lies beyond the end of the frequency table.
        self._mTotalChars += 1
        if order < self._mTableSize and self._mCharToFreqOrder[order] < 512:
            self._mFreqChars += 1

    def get_confidence(self):
        """return confidence based on existing data

        Returns SURE_NO when no character has been counted, SURE_YES when the
        computed ratio reaches SURE_YES or every counted character was a
        frequent one, and the computed ratio otherwise.
        """
        # If we didn't receive any character in our consideration range,
        # return a negative answer.
        if self._mTotalChars <= 0:
            return SURE_NO

        # When every character is frequent the denominator would be zero, so
        # that case falls through to SURE_YES.
        if self._mTotalChars != self._mFreqChars:
            rare_chars = self._mTotalChars - self._mFreqChars
            r = self._mFreqChars / (rare_chars * self._mTypicalDistributionRatio)
            if r < SURE_YES:
                return r

        # Normalize confidence (we don't want to be 100% sure).
        return SURE_YES

    def got_enough_data(self):
        """Return whether more than ENOUGH_DATA_THRESHOLD characters were counted."""
        # It is not necessary to receive all data to draw a conclusion. For
        # charset detection, a certain amount of data is enough.
        return self._mTotalChars > ENOUGH_DATA_THRESHOLD

    def get_order(self, aStr):
        """Return the frequency-table order of aStr; the base class returns -1."""
        # We do not handle characters based on the original encoding string,
        # but convert this encoding string to a number, here called order.
        # This allows multiple encodings of a language to share one frequency
        # table.
        return -1
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for EUC-TW, using the EUCTW frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = EUCTW_TABLE_SIZE
        self._mCharToFreqOrder = EUCTWCharToFreqOrder

    def get_order(self, aStr):
        """Return the frequency-table order of the EUC-TW character aStr, or -1."""
        # For EUC-TW we are interested in
        #   first  byte range: 0xc4 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # No validation is needed here; the state machine has done that.
        lead = aStr[0]
        if lead < '\xC4':
            return -1
        # 94 second-byte slots per first byte.
        return 94 * (ord(lead) - 0xC4) + ord(aStr[1]) - 0xA1
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for EUC-KR, using the EUCKR frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = EUCKR_TABLE_SIZE
        self._mCharToFreqOrder = EUCKRCharToFreqOrder

    def get_order(self, aStr):
        """Return the frequency-table order of the EUC-KR character aStr, or -1."""
        # For EUC-KR we are interested in
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # No validation is needed here; the state machine has done that.
        lead = aStr[0]
        if lead < '\xB0':
            return -1
        # 94 second-byte slots per first byte.
        return 94 * (ord(lead) - 0xB0) + ord(aStr[1]) - 0xA1
class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for GB2312, using the GB2312 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = GB2312_TABLE_SIZE
        self._mCharToFreqOrder = GB2312CharToFreqOrder

    def get_order(self, aStr):
        """Return the frequency-table order of the GB2312 character aStr, or -1."""
        # For GB2312 we are interested in
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # No further validation is needed here; the state machine has done
        # that.
        if aStr[0] < '\xB0' or aStr[1] < '\xA1':
            return -1
        # 94 second-byte slots per first byte.
        return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for Big5, using the Big5 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = BIG5_TABLE_SIZE
        self._mCharToFreqOrder = Big5CharToFreqOrder

    def get_order(self, aStr):
        """Return the frequency-table order of the Big5 character aStr, or -1."""
        # For Big5 we are interested in
        #   first  byte range: 0xa4 -- 0xfe
        #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        # No validation is needed here; the state machine has done that.
        if aStr[0] < '\xA4':
            return -1
        # 157 second-byte slots per first byte: 63 for 0x40 -- 0x7e followed
        # by 94 for 0xa1 -- 0xfe.
        row_start = 157 * (ord(aStr[0]) - 0xA4)
        trail = aStr[1]
        if trail >= '\xA1':
            return row_start + ord(trail) - 0xA1 + 63
        return row_start + ord(trail) - 0x40
class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for Shift-JIS, using the JIS frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table order of the Shift-JIS character aStr, or -1.

        -1 is returned when the first byte is outside the ranges handled
        below.
        """
        # For sjis encoding, we are interested in
        #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
        #   second byte range: 0x40 -- 0x7e , 0x81 -- 0xfe
        # No validation is needed here; the state machine has done that.
        # NOTE: the code only accepts 0xe0 -- 0xef for the second first-byte
        # range; anything above 0xef yields -1.
        lead = aStr[0]
        if '\x81' <= lead <= '\x9F':
            order = 188 * (ord(lead) - 0x81)
        elif '\xE0' <= lead <= '\xEF':
            # The 31 first bytes 0x81 -- 0x9f come before this range.
            order = 188 * (ord(lead) - 0xE0 + 31)
        else:
            return -1
        order = order + ord(aStr[1]) - 0x40
        if aStr[1] > '\x7F':
            # 0x7f is not a valid second byte, so second bytes above it sit
            # one slot lower. This used to read `order =- 1`, which assigned
            # -1 and discarded every such character instead of decrementing.
            order -= 1
        return order
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for EUC-JP, using the JIS frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = JIS_TABLE_SIZE
        self._mCharToFreqOrder = JISCharToFreqOrder

    def get_order(self, aStr):
        """Return the frequency-table order of the EUC-JP character aStr, or -1."""
        # For EUC-JP we are interested in
        #   first  byte range: 0xa0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # No validation is needed here; the state machine has done that.
        lead = aStr[0]
        if lead < '\xA0':
            return -1
        # 94 second-byte slots per first byte; rows are counted from 0xa1.
        return 94 * (ord(lead) - 0xA1) + ord(aStr[1]) - 0xa1