Catch the exception if decoding failed.
[pymailheaders.git] / chardet / sbcharsetprober.py
blob da071163216fd3b74ed14146d1c35b467c476aae
1 ######################## BEGIN LICENSE BLOCK ########################
2 # The Original Code is Mozilla Universal charset detector code.
4 # The Initial Developer of the Original Code is
5 # Netscape Communications Corporation.
6 # Portions created by the Initial Developer are Copyright (C) 2001
7 # the Initial Developer. All Rights Reserved.
9 # Contributor(s):
10 # Mark Pilgrim - port to Python
11 # Shy Shalom - original C code
13 # This library is free software; you can redistribute it and/or
14 # modify it under the terms of the GNU Lesser General Public
15 # License as published by the Free Software Foundation; either
16 # version 2.1 of the License, or (at your option) any later version.
18 # This library is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # Lesser General Public License for more details.
23 # You should have received a copy of the GNU Lesser General Public
24 # License along with this library; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 # 02110-1301 USA
27 ######################### END LICENSE BLOCK #########################
29 import constants, sys
30 from charsetprober import CharSetProber
# Tuning constants for SingleByteCharSetProber.

# Character orders below this value fall in the sampling range; it is also
# the row stride used when indexing the model's flattened precedence matrix.
SAMPLE_SIZE = 64

# feed() only considers a shortcut once more than this many sequences
# have been counted.
SB_ENOUGH_REL_THRESHOLD = 1024

# Confidence above the positive threshold ends detection as "found it";
# confidence below the negative threshold ends it as "not me".
POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05

# Orders below this value are added to the total-character tally.
SYMBOL_CAT_ORDER = 250

# Number of sequence counters kept per prober; the last one is the
# counter read by get_confidence().
NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
# NEGATIVE_CAT = 0  (kept disabled)
class SingleByteCharSetProber(CharSetProber):
    """Score a byte buffer against one single-byte charset language model.

    The model is indexed by the keys 'charsetName', 'keepEnglishLetter',
    'charToOrderMap', 'precedenceMatrix' and 'mTypicalPositiveRatio'.
    The prober counts how often consecutive in-range characters form a
    pair that the precedence matrix places in the positive category and
    turns that ratio into a confidence value.
    """

    def __init__(self, model, reversed=False, nameProber=None):
        """Store the model and options, then reset all counters.

        model      -- mapping holding the language model (see class doc).
        reversed   -- true if every pair must be reversed in the model
                      lookup.
        nameProber -- optional auxiliary prober that supplies the charset
                      name instead of the model.
        """
        # The builtin False replaces ``constants.False``: that spelling is
        # a syntax error on Python 3 and has the same truthiness.
        CharSetProber.__init__(self)
        self._mModel = model
        self._mReversed = reversed
        self._mNameProber = nameProber
        self.reset()

    def reset(self):
        """Return the prober to its initial, nothing-seen-yet state."""
        CharSetProber.reset(self)
        # 255 is not below SAMPLE_SIZE, so the first character fed never
        # completes a pair.
        self._mLastOrder = 255  # char order of last character
        self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
        self._mTotalSeqs = 0
        self._mTotalChar = 0
        self._mFreqChar = 0  # characters that fall in our sampling range

    def get_charset_name(self):
        """Return the name from the auxiliary prober, else from the model."""
        if self._mNameProber:
            return self._mNameProber.get_charset_name()
        return self._mModel['charsetName']

    def feed(self, aBuf):
        """Update the statistics with aBuf and return the prober state.

        aBuf may yield one-character strings (Python 2 ``str``) or
        integers (``bytearray``, Python 3 ``bytes``) when iterated.
        """
        model = self._mModel
        if not model['keepEnglishLetter']:
            aBuf = self.filter_without_english_letters(aBuf)
        if not len(aBuf):
            return self.get_state()

        # Loop-invariant lookup; the buffer is known to be non-empty here,
        # so the key is read at least once either way.
        char_to_order = model['charToOrderMap']
        for c in aBuf:
            # Integers are used directly; ord() would raise TypeError.
            if isinstance(c, int):
                code = c
            else:
                code = ord(c)
            order = char_to_order[code]
            if order < SYMBOL_CAT_ORDER:
                self._mTotalChar += 1
            if order < SAMPLE_SIZE:
                self._mFreqChar += 1
                if self._mLastOrder < SAMPLE_SIZE:
                    self._mTotalSeqs += 1
                    if not self._mReversed:
                        pair = (self._mLastOrder * SAMPLE_SIZE) + order
                    else:
                        # reverse the order of the letters in the lookup
                        pair = (order * SAMPLE_SIZE) + self._mLastOrder
                    self._mSeqCounters[model['precedenceMatrix'][pair]] += 1
            self._mLastOrder = order

        # Only take a shortcut while still detecting and once enough
        # sequences have been observed.
        if self.get_state() == constants.eDetecting:
            if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD:
                cf = self.get_confidence()
                if cf > POSITIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf))
                    self._mState = constants.eFoundIt
                elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
                    self._mState = constants.eNotMe

        return self.get_state()

    def get_confidence(self):
        """Return the current confidence.

        With no sequences counted the result is 0.01. Otherwise it is the
        share of positive-category sequences, divided by the model's
        'mTypicalPositiveRatio' and scaled by the fraction of counted
        characters that fell in the sampling range; values of 1.0 or more
        are reported as 0.99.
        """
        r = 0.01
        if self._mTotalSeqs > 0:
            positive = 1.0 * self._mSeqCounters[POSITIVE_CAT]
            r = positive / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio']
            # _mTotalChar is non-zero here: every counted sequence also
            # incremented it, because SAMPLE_SIZE < SYMBOL_CAT_ORDER.
            r = r * self._mFreqChar / self._mTotalChar
            if r >= 1.0:
                r = 0.99
        return r