2006-09-10 Francisco Javier F. Serrador <serrador@openshine.com>
[beagle.git] / beagled / NoiseFilter.cs
blob8639ff05c70e46f34bb0849cc973134063d2ed65
//
// NoiseFilter.cs
//
// Copyright (C) 2004-2005 Novell, Inc.
//
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
27 using System;
29 using Lucene.Net.Analysis;
30 using LNSA = Lucene.Net.Analysis.Standard;
namespace Beagle.Daemon {

	// Token filter that sits on top of another TokenStream and silently
	// drops tokens that look like "noise": over-long strings and strings
	// that keep flipping between letters, digits and punctuation.
	// Tokens the tokenizer has tagged as e-mail addresses, host names or
	// short numbers are always passed through untouched.
	class NoiseFilter : TokenFilter {

		// Running statistics shared by every NoiseFilter instance.  Within
		// this class they are only read by the debug logging in Next (),
		// which is currently compiled out with "#if false".
		// NOTE(review): plain ++ on shared statics is not atomic; that is
		// harmless for rough statistics, but confirm before relying on them.
		static int total_count = 0;
		static int noise_count = 0;

		// Terms longer than this are always treated as noise.
		const int max_term_length = 30;

		// Terms longer than this are considered "long" by the stricter
		// heuristics in IsNoise ().
		const int long_term_length = 10;

		// NUM tokens up to this length bypass the noise heuristics.
		const int max_number_length = 10;

		// A term whose transition score reaches this value is noise.
		const int transitions_cutoff = 4;

		// Character classes used by the transition scoring in IsNoise ().
		// None marks characters that are neither letters, digits nor
		// punctuation; such characters are skipped by the scoring.
		private enum CharClass {
			None = -1,
			Letter = 1,
			Digit = 2,
			Punctuation = 3
		}

		// The stream we pull tokens from (the same object handed to the
		// base class constructor).
		readonly TokenStream token_stream;

		// Wraps 'input' so that tokens read through this filter have
		// noise removed.
		public NoiseFilter (TokenStream input) : base (input)
		{
			token_stream = input;
		}

		// Maps a character onto the class used for transition scoring.
		// The checks run in the order letter, digit, punctuation.
		private static CharClass Classify (char c)
		{
			if (Char.IsLetter (c))
				return CharClass.Letter;
			if (Char.IsDigit (c))
				return CharClass.Digit;
			if (Char.IsPunctuation (c))
				return CharClass.Punctuation;
			return CharClass.None;
		}

		// FIXME: we should add some heuristics that are stricter
		// but explicitly try to avoid filtering out dates,
		// phone numbers, etc.
		//
		// Returns true if 'text' looks like noise and should be dropped.
		// An empty string is never noise.  'text' must not be null.
		private static bool IsNoise (string text)
		{
			// Anything really long is almost certainly noise.
			if (text.Length > max_term_length)
				return true;

			// Look at how often we switch between numbers and letters.
			// The first classified character always scores 1.  After that:
			//   <letter> <digit>       1
			//   <digit> <letter>       1
			//   <x> <punct>+ <x>       1   (entering the punctuation run)
			//   <x> <punct>+ <y>       2   (entering and leaving it)
			CharClass last_class = CharClass.None;
			CharClass last_non_punct_class = CharClass.None;
			CharClass first_class = CharClass.None;
			bool has_letter = false, has_digit = false, has_punctuation = false;
			int transitions = 0;

			// Stop scanning as soon as the cutoff is reached; the verdict
			// is already "noise" at that point.
			for (int i = 0; i < text.Length && transitions < transitions_cutoff; ++i) {
				CharClass cls = Classify (text [i]);

				// Unclassified characters (symbols, whitespace, ...) do
				// not take part in the scoring at all.
				if (cls == CharClass.None)
					continue;

				switch (cls) {
				case CharClass.Letter:
					has_letter = true;
					break;
				case CharClass.Digit:
					has_digit = true;
					break;
				case CharClass.Punctuation:
					has_punctuation = true;
					break;
				}

				if (cls != last_class) {
					// Coming out of a punctuation run back into the
					// class it interrupted is free; every other change
					// of class costs one transition.
					if (last_class != CharClass.Punctuation || cls != last_non_punct_class)
						++transitions;
				}

				if (first_class == CharClass.None)
					first_class = cls;

				last_class = cls;
				if (cls != CharClass.Punctuation)
					last_non_punct_class = cls;
			}

			// If we make too many transitions, it must be noise.
			if (transitions >= transitions_cutoff)
				return true;

			bool is_long = text.Length > long_term_length;

			// A score of exactly 1 means every classified character had
			// the same class.  If that class is digit or punctuation
			// (i.e. not letter), treat the term as noise when it is long.
			if (transitions == 1 && first_class != CharClass.Letter && is_long)
				return true;

			// An earlier extra rule ("more than 3 transitions and long")
			// was unreachable: any term scoring 4 has already been
			// rejected by the cutoff check above, so it has been dropped.

			// Beware of anything long that contains a little of everything.
			if (has_letter && has_digit && has_punctuation && is_long)
				return true;

			//Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
			return false;
		}

		// Dont scan these tokens for additional noise
		// Someone might like to search for emails, hostnames and
		// phone numbers (which fall under type NUM)
		private static readonly string tokentype_email
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.EMAIL];
		private static readonly string tokentype_host
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.HOST];
		private static readonly string tokentype_number
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.NUM];

		// Returns true if 'token' should bypass the noise heuristics
		// because of its tokenizer-assigned type: e-mail and host tokens
		// always do, number tokens only while they are short enough.
		private static bool IgnoreNoise (Lucene.Net.Analysis.Token token)
		{
			string type = token.Type ();

			if (type == tokentype_email || type == tokentype_host)
				return true;

			if (type == tokentype_number) {
				// nobody will remember more than 10 digits
				return token.TermText ().Length <= max_number_length;
			}

			return false;
		}

		// Returns the next token from the wrapped stream that is not
		// considered noise, or null once the wrapped stream is exhausted.
		public override Lucene.Net.Analysis.Token Next ()
		{
			Lucene.Net.Analysis.Token token;
			while ( (token = token_stream.Next ()) != null) {
#if false
				if (total_count > 0 && total_count % 5000 == 0)
					Logger.Log.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)",
							  noise_count, total_count, 100.0 * noise_count / total_count);
#endif
				++total_count;

				// Whitelisted token types are returned without running
				// the heuristics; everything else must pass IsNoise ().
				if (IgnoreNoise (token) || ! IsNoise (token.TermText ()))
					return token;

				// Noise: count it and move on to the next token.
				++noise_count;
			}

			return null;
		}
	}
}