1 ######################## BEGIN LICENSE BLOCK ########################
2 # The Original Code is Mozilla Universal charset detector code.
4 # The Initial Developer of the Original Code is
5 # Netscape Communications Corporation.
6 # Portions created by the Initial Developer are Copyright (C) 2001
7 # the Initial Developer. All Rights Reserved.
10 # Mark Pilgrim - port to Python
11 # Shy Shalom - original C code
13 # This library is free software; you can redistribute it and/or
14 # modify it under the terms of the GNU Lesser General Public
15 # License as published by the Free Software Foundation; either
16 # version 2.1 of the License, or (at your option) any later version.
18 # This library is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # Lesser General Public License for more details.
23 # You should have received a copy of the GNU Lesser General Public
24 # License along with this library; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 ######################### END LICENSE BLOCK #########################
29 from charsetprober
import CharSetProber
# Number of distinct scores stored in Latin1ClassModel (the table only holds
# the values 0..3); the prober keeps one counter per score.
# NOTE(review): this definition is not visible in the reviewed text even
# though the prober class reads it; 4 is the smallest size that covers every
# score in the table -- confirm it is not also defined elsewhere.
FREQ_CAT_NUM = 4

# Character classes.  The numbering must match the row/column order of
# Latin1ClassModel below (UDF, OTH, ASC, ASS, ACV, ACO, ASV, ASO).
# NOTE(review): UDF and OTH are used by both tables but were not defined in
# the reviewed text; their values follow from that column order -- confirm.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
CLASS_NUM = 8  # total classes

# Maps every byte value (0x00 - 0xFF) to one of the character classes above.
Latin1_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,  # 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,  # 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,  # C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,  # D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,  # D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,  # E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,  # F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,  # F8 - FF
)

# Transition scores, stored row-major as a flat CLASS_NUM x CLASS_NUM table:
# entry [previous_class * CLASS_NUM + current_class] is the score of seeing
# `current_class` right after `previous_class`.  Scores range over 0..3; the
# prober counts how often each score occurs, rewards score 3 and heavily
# penalises score 1 when computing its confidence.
Latin1ClassModel = (
    # UDF OTH ASC ASS ACV ACO ASV ASO
    0, 0, 0, 0, 0, 0, 0, 0,  # UDF
    0, 3, 3, 3, 3, 3, 3, 3,  # OTH
    0, 3, 3, 3, 3, 3, 3, 3,  # ASC
    0, 3, 3, 3, 1, 1, 3, 3,  # ASS
    0, 3, 3, 3, 1, 2, 1, 2,  # ACV
    0, 3, 3, 3, 3, 3, 3, 3,  # ACO
    0, 3, 1, 3, 1, 1, 1, 3,  # ASV
    0, 3, 1, 3, 1, 1, 3, 3,  # ASO
)
class Latin1Prober(CharSetProber):
    """Prober that scores a byte stream as windows-1252 text.

    Every byte is mapped to a character class through ``Latin1_CharToClass``
    and each (previous class, current class) transition is scored with
    ``Latin1ClassModel``.  The number of times each score occurs is kept in
    ``_mFreqCounter`` and turned into a confidence by ``get_confidence``.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self.reset()

    def reset(self):
        """Forget all statistics gathered so far and reset the base prober."""
        self._mLastCharClass = OTH
        self._mFreqCounter = [0] * FREQ_CAT_NUM
        CharSetProber.reset(self)

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "windows-1252"

    def feed(self, aBuf):
        """Update the transition statistics with the bytes in ``aBuf``.

        The buffer is first passed through the inherited
        ``filter_with_english_letters`` helper (presumably stripping content
        that carries no signal for this prober -- verify against the base
        class).  Returns the prober state reported by ``get_state()``.
        """
        aBuf = self.filter_with_english_letters(aBuf)
        for c in aBuf:
            # Iterating ``bytes`` yields ints on Python 3 but one-character
            # strings on Python 2; accept both instead of calling ord() on
            # an int.
            byteValue = c if isinstance(c, int) else ord(c)
            charClass = Latin1_CharToClass[byteValue]
            freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM)
                                    + charClass]
            if freq == 0:
                # NOTE(review): treating score 0 as "this transition cannot
                # occur, so rule this charset out and stop" is reconstructed;
                # the guarding condition was not visible in the reviewed
                # text -- confirm.
                self._mState = constants.eNotMe
                break
            self._mFreqCounter[freq] += 1
            self._mLastCharClass = charClass

        return self.get_state()

    def get_confidence(self):
        """Return a confidence in [0.0, 0.5] that the input is windows-1252.

        Score-3 transitions count in favour, score-1 transitions count
        against with a weight of 20, and the result is normalised by the
        total number of scored transitions.
        """
        if self.get_state() == constants.eNotMe:
            # NOTE(review): the value returned for a ruled-out prober was not
            # visible in the reviewed text; a small non-zero floor is used
            # here -- confirm the intended value.
            return 0.01

        # ``reduce`` is not a builtin on Python 3; ``sum`` is equivalent.
        total = sum(self._mFreqCounter)
        if total == 0:
            # Nothing has been scored yet; avoid dividing by zero.
            confidence = 0.0
        else:
            # Float arithmetic throughout: dividing the two integer counts
            # directly would truncate to 0 or 1 on Python 2.
            confidence = ((self._mFreqCounter[3]
                           - self._mFreqCounter[1] * 20.0) / total)
        # The penalty term can push the value below zero; clamp it.
        if confidence < 0.0:
            confidence = 0.0
        # Lower the confidence of latin1 so that other, more accurate
        # detectors win when they also match.
        return confidence * 0.5