1 ######################## BEGIN LICENSE BLOCK ########################
2 # The Original Code is Mozilla Universal charset detector code.
4 # The Initial Developer of the Original Code is
5 # Netscape Communications Corporation.
6 # Portions created by the Initial Developer are Copyright (C) 2001
7 # the Initial Developer. All Rights Reserved.
10 # Mark Pilgrim - port to Python
11 # Shy Shalom - original C code
13 # This library is free software; you can redistribute it and/or
14 # modify it under the terms of the GNU Lesser General Public
15 # License as published by the Free Software Foundation; either
16 # version 2.1 of the License, or (at your option) any later version.
18 # This library is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # Lesser General Public License for more details.
23 # You should have received a copy of the GNU Lesser General Public
24 # License along with this library; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 ######################### END LICENSE BLOCK #########################
import sys

# NOTE(review): `constants` is referenced throughout this module
# (constants.eDetecting, constants.eFoundIt, constants.eNotMe) but no import
# was visible; imported here in the same style as `charsetprober` -- confirm.
import constants
from charsetprober import CharSetProber
# Tuning constants for the single-byte charset prober.

# Exclusive upper bound on character orders that take part in the pair
# (sequence) statistics; it is also the row stride used when indexing the
# model's flattened 'precedenceMatrix'.
# NOTE(review): this name is used by the prober but had no visible
# definition; 64 assumes a 64x64 precedence matrix -- confirm against the
# language models.
SAMPLE_SIZE = 64

# feed() only evaluates the confidence shortcuts once more than this many
# character pairs have been counted.
SB_ENOUGH_REL_THRESHOLD = 1024

# Confidence above this value makes feed() switch to constants.eFoundIt.
POSITIVE_SHORTCUT_THRESHOLD = 0.95

# Confidence below this value makes feed() switch to constants.eNotMe.
NEGATIVE_SHORTCUT_THRESHOLD = 0.05

# Character orders are compared against this bound in feed().
# NOTE(review): presumably orders at or above it denote symbols -- confirm.
SYMBOL_CAT_ORDER = 250

# Number of slots in the per-prober sequence counter list.
# NOTE(review): this name is used below and by the prober but had no visible
# definition; 4 is an assumption -- confirm against the values stored in the
# models' precedence matrices.
NUMBER_OF_SEQ_CAT = 4

# Index of the counter slot that get_confidence() uses as its numerator.
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
class SingleByteCharSetProber(CharSetProber):
    """Score a byte stream against one single-byte charset language model.

    The model is a mapping that this class reads through the keys
    'charsetName', 'keepEnglishLetter', 'charToOrderMap',
    'precedenceMatrix' and 'mTypicalPositiveRatio'.  Every byte is mapped to
    an "order"; consecutive pairs of orders below SAMPLE_SIZE are looked up
    in the precedence matrix and tallied per category, and the share of
    POSITIVE_CAT pairs drives the confidence value.
    """

    def __init__(self, model, reversed=False, nameProber=None):
        """Create a prober for *model*.

        model      -- the language model mapping (see class docstring).
        reversed   -- true if every pair must be reversed in the model lookup.
        nameProber -- optional auxiliary prober whose charset name is
                      reported instead of the model's own name.
        """
        CharSetProber.__init__(self)
        # The model was previously accepted but never stored, although every
        # other method reads self._mModel.
        self._mModel = model
        # TRUE if we need to reverse every pair in the model lookup
        self._mReversed = reversed
        # Optional auxiliary prober for name decision
        self._mNameProber = nameProber
        self.reset()

    # NOTE(review): the header of this method was missing from the garbled
    # source; the name follows the CharSetProber.reset() call it wraps.
    def reset(self):
        """Reset the base prober state and clear all counters."""
        CharSetProber.reset(self)
        self._mLastOrder = 255  # char order of last character
        self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
        # Both totals are read by get_confidence() and must start at zero.
        self._mTotalSeqs = 0
        self._mTotalChar = 0
        self._mFreqChar = 0  # characters that fall in our sampling range

    def get_charset_name(self):
        """Return the auxiliary prober's charset name if one was given,
        otherwise the model's 'charsetName'."""
        if self._mNameProber:
            return self._mNameProber.get_charset_name()
        return self._mModel['charsetName']

    # NOTE(review): the header of this method was missing from the garbled
    # source; the name `feed` is assumed from the prober protocol -- confirm.
    def feed(self, aBuf):
        """Consume *aBuf*, update the pair statistics and return the state.

        Once more than SB_ENOUGH_REL_THRESHOLD pairs have been seen while the
        prober is still detecting, a confidence above
        POSITIVE_SHORTCUT_THRESHOLD switches the state to constants.eFoundIt
        and one below NEGATIVE_SHORTCUT_THRESHOLD to constants.eNotMe.
        """
        if not self._mModel['keepEnglishLetter']:
            aBuf = self.filter_without_english_letters(aBuf)
        if not aBuf:
            # Nothing to analyse; report the current state unchanged.
            return self.get_state()

        char_to_order = self._mModel['charToOrderMap']
        precedence = self._mModel['precedenceMatrix']
        for c in aBuf:
            # Iterating bytes/bytearray yields ints on Python 3 but
            # one-character strings on Python 2; accept both.
            order = char_to_order[c if isinstance(c, int) else ord(c)]
            if order < SYMBOL_CAT_ORDER:
                # NOTE(review): the body of this branch was missing from the
                # garbled source; counting the character here is inferred
                # from get_confidence() dividing by _mTotalChar.
                self._mTotalChar += 1
            if order < SAMPLE_SIZE:
                self._mFreqChar += 1
                if self._mLastOrder < SAMPLE_SIZE:
                    self._mTotalSeqs += 1
                    if not self._mReversed:
                        pair = (self._mLastOrder * SAMPLE_SIZE) + order
                    else:
                        # reverse the order of the letters in the lookup
                        pair = (order * SAMPLE_SIZE) + self._mLastOrder
                    self._mSeqCounters[precedence[pair]] += 1
            self._mLastOrder = order

        if self.get_state() == constants.eDetecting:
            if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD:
                cf = self.get_confidence()
                # NOTE(review): diagnostics were written unconditionally in
                # the visible source; they are now emitted only when the
                # constants module exposes a true `_debug` flag -- confirm
                # that this is the intended switch.
                debug = getattr(constants, '_debug', False)
                if cf > POSITIVE_SHORTCUT_THRESHOLD:
                    if debug:
                        sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf))
                    self._mState = constants.eFoundIt
                elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
                    if debug:
                        sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
                    self._mState = constants.eNotMe

        return self.get_state()

    def get_confidence(self):
        """Return the confidence that the input matches this model.

        The value is the share of POSITIVE_CAT pairs among all counted
        pairs, divided by the model's 'mTypicalPositiveRatio' and scaled by
        the fraction of counted characters that fell in the sampling range.
        """
        # NOTE(review): the fallback 0.01 and the 0.99 cap below were not
        # visible in the garbled source; they keep the result a number that
        # feed() can compare against its thresholds -- confirm the values.
        r = 0.01
        if self._mTotalSeqs > 0:
            r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio']
            r = r * self._mFreqChar / self._mTotalChar
            if r >= 1.0:
                r = 0.99
        return r