4 // Copyright (C) 2004-2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using Lucene
.Net
.Analysis
;
31 namespace Beagle
.Daemon
{
33 class NoiseFilter
: TokenFilter
{
35 static int total_count
= 0;
36 static int noise_count
= 0;
38 TokenStream token_stream
;
40 public NoiseFilter (TokenStream input
) : base (input
)
45 // FIXME: we should add some heuristics that are stricter
46 // but explicitly try to avoid filtering out dates,
47 // phone numbers, etc.
48 private static bool IsNoise (string text
)
50 // Anything really long is almost certainly noise.
54 // Look at how often we switch between numbers and letters.
60 const int transitions_cutoff
= 4;
61 int last_type
= -1, last_non_punct_type
= -1, first_type
= -1;
62 bool has_letter
= false, has_digit
= false, has_punctuation
= false;
64 for (int i
= 0; i
< text
.Length
&& transitions
< transitions_cutoff
; ++i
) {
67 if (Char
.IsLetter (c
)) {
70 } else if (Char
.IsDigit (c
)) {
73 } else if (Char
.IsPunctuation (c
)) {
75 has_punctuation
= true;
80 if (type
!= last_type
) {
82 if (type
!= last_non_punct_type
)
94 last_non_punct_type
= type
;
98 // If we make too many transitions, it must be noise.
99 if (transitions
>= transitions_cutoff
)
102 // If we consist of nothing but digits and punctuation, treat it
103 // as noise if it is too long.
104 if (transitions
== 1 && first_type
!= 1 && text
.Length
> 10)
107 // We are very suspicious of long things that make lots of transitions.
109 if (transitions
> 3 && text
.Length
> 10)
112 // Beware of anything long that contains a little of everything.
113 if (has_letter
&& has_digit
&& has_punctuation
&& text
.Length
> 10)
116 //Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
121 public override Lucene
.Net
.Analysis
.Token
Next ()
123 Lucene
.Net
.Analysis
.Token token
;
124 while ( (token
= token_stream
.Next ()) != null) {
126 if (total_count
> 0 && total_count
% 5000 == 0)
127 Logger
.Log
.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)",
128 noise_count
, total_count
, 100.0 * noise_count
/ total_count
);
131 if (IsNoise (token
.TermText ())) {