4 // Copyright (C) 2004-2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using Lucene
.Net
.Analysis
;
30 using LNSA
= Lucene
.Net
.Analysis
.Standard
;
32 namespace Beagle
.Daemon
{
34 class NoiseFilter
: TokenFilter
{
// Class-wide (static) counters. Next () reports noise_count out of
// total_count in a debug message whenever total_count is a positive
// multiple of 5000.
// NOTE(review): the statements that increment these counters are not
// visible in this chunk -- confirm where they are updated.
36 static int total_count
= 0;
37 static int noise_count
= 0;
// The stream that Next () pulls its tokens from.
// NOTE(review): presumably the same stream that is handed to the
// constructor; the assignment is not visible in this chunk -- confirm.
39 TokenStream token_stream
;
// Creates a filter on top of the given token stream; 'input' is forwarded
// to the TokenFilter base constructor.
// NOTE(review): the constructor body is not visible in this chunk; it
// presumably also stores 'input' in token_stream -- confirm.
41 public NoiseFilter (TokenStream input
) : base (input
)
46 // FIXME: we should add some heuristics that are stricter
47 // but explicitly try to avoid filtering out dates,
48 // phone numbers, etc.
// Heuristic test of whether a token's text looks like noise.
//   text: the token text to inspect.
// NOTE(review): parts of this method are not visible in this chunk (the
// statements guarded by each test below, and the declarations of
// 'transitions', 'c' and 'type'). The notes here describe only what is
// shown; each test presumably returns true (= noise) -- confirm.
49 private static bool IsNoise (string text
)
51 // Anything really long is almost certainly noise.
55 // Look at how often we switch between numbers and letters.
// The character scan below stops early once this many transitions have
// been counted.
61 const int transitions_cutoff
= 4;
// Character-class bookkeeping for the scan; each starts out as -1.
62 int last_type
= -1, last_non_punct_type
= -1, first_type
= -1;
// Flags for which classes of character have been seen.
63 bool has_letter
= false, has_digit
= false, has_punctuation
= false;
// Walk the text one character at a time, giving up as soon as the
// transition count reaches transitions_cutoff.
65 for (int i
= 0; i
< text
.Length
&& transitions
< transitions_cutoff
; ++i
) {
// Classify the current character as letter, digit or punctuation.
68 if (Char
.IsLetter (c
)) {
71 } else if (Char
.IsDigit (c
)) {
74 } else if (Char
.IsPunctuation (c
)) {
76 has_punctuation
= true;
// Compare this character's class with last_type and, inside that test,
// with last_non_punct_type.
// NOTE(review): presumably this is where 'transitions' is incremented;
// the increment itself is not visible here -- confirm.
81 if (type
!= last_type
) {
83 if (type
!= last_non_punct_type
)
95 last_non_punct_type
= type
;
99 // If we make too many transitions, it must be noise.
100 if (transitions
>= transitions_cutoff
)
103 // If we consist of nothing but digits and punctuation, treat it
104 // as noise if it is too long.
// NOTE(review): 1 is presumably the type code used for letters (the
// assignments to 'type' are not visible here) -- confirm.
105 if (transitions
== 1 && first_type
!= 1 && text
.Length
> 10)
108 // We are very suspicious of long things that make lots of
// NOTE(review): with transitions_cutoff = 4, 'transitions > 3' is the same
// condition as the 'transitions >= transitions_cutoff' test above; if that
// test returns, this one can never be true -- confirm whether a lower
// threshold was intended.
110 if (transitions
> 3 && text
.Length
> 10)
113 // Beware of anything long that contains a little of everything.
114 if (has_letter
&& has_digit
&& has_punctuation
&& text
.Length
> 10)
117 //Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
122 // Don't scan these tokens for additional noise.
123 // Someone might like to search for emails, hostnames and
124 // phone numbers (which fall under type NUM)
// Token-type strings taken from StandardTokenizerConstants.tokenImage,
// indexed by the EMAIL, HOST and NUM constants. IgnoreNoise () compares
// them against the value returned by Token.Type ().

// Type string for EMAIL tokens.
125 private static readonly string tokentype_email
126 = LNSA
.StandardTokenizerConstants
.tokenImage
[LNSA
.StandardTokenizerConstants
.EMAIL
];
// Type string for HOST tokens.
127 private static readonly string tokentype_host
128 = LNSA
.StandardTokenizerConstants
.tokenImage
[LNSA
.StandardTokenizerConstants
.HOST
];
// Type string for NUM tokens.
129 private static readonly string tokentype_number
130 = LNSA
.StandardTokenizerConstants
.tokenImage
[LNSA
.StandardTokenizerConstants
.NUM
];
// Decides, from the token's type string, whether the token is exempt from
// the noise scan. Next () calls this before it calls IsNoise ().
//   token: the token whose Type () and TermText () are examined.
// Email and host tokens are singled out by the first test; number tokens
// qualify only when their term text is at most 10 characters long.
// NOTE(review): the statement guarded by the email/host test and the final
// fall-through result are not visible in this chunk -- presumably
// 'return true' and 'return false' respectively; confirm.
132 private bool IgnoreNoise (Lucene
.Net
.Analysis
.Token token
)
134 string type
= token
.Type ();
136 if (type
== tokentype_email
||
137 type
== tokentype_host
)
140 if (type
== tokentype_number
)
141 // nobody will remember more than 10 digits
142 return (token
.TermText ().Length
<= 10);
147 public override Lucene
.Net
.Analysis
.Token
Next ()
149 Lucene
.Net
.Analysis
.Token token
;
150 while ( (token
= token_stream
.Next ()) != null) {
152 if (total_count
> 0 && total_count
% 5000 == 0)
153 Logger
.Log
.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)",
154 noise_count
, total_count
, 100.0 * noise_count
/ total_count
);
157 if (IgnoreNoise (token
))
159 if (IsNoise (token
.TermText ())) {