Remove debug spew
[beagle.git] / beagled / NoiseFilter.cs
blob45afb95fd85b6a6f7d87b48efb67cdac9368d3fd
1 //
2 // NoiseFilter.cs
3 //
4 // Copyright (C) 2006 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004-2005 Novell, Inc.
6 //
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.IO;
30 using System.Collections;
32 using Lucene.Net.Analysis;
33 using LNSA = Lucene.Net.Analysis.Standard;
35 namespace Beagle.Daemon {
37 // TokenFilter which does several fancy things
38 // 1. Removes words which are potential noise like dhyhy8ju7q9
39 // 2. Splits email addresses into meaningful tokens
40 // 3. Splits hostnames into subparts
41 class NoiseEmailHostFilter : TokenFilter {
// When true, email and hostname tokens are additionally broken into
// sub-parts (see ProcessEmailToken / ProcessURLToken).
private bool tokenize_email_hostname;

// The wrapped stream that Next () pulls raw tokens from.
private TokenStream token_stream;

// Builds a filter on top of `input`.
//   input                   - upstream token stream; also handed to the base class.
//   tokenize_email_hostname - whether email/host tokens should be split into parts.
public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname)
	: base (input)
{
	this.tokenize_email_hostname = tokenize_email_hostname;
	token_stream = input;
}
54 // FIXME: we should add some heuristics that are stricter
55 // but explicitly try to avoid filtering out dates,
56 // phone numbers, etc.
// Heuristic noise detector.  Returns true when `text` looks like
// machine-generated junk rather than a word someone would search for.
//
// The scan classifies each character as letter, digit or punctuation
// (anything else is ignored) and counts class changes:
//   <letter> <digit>   1
//   <digit> <letter>   1
//   <x> <punct>+ <x>   1
//   <x> <punct>+ <y>   2
// The very first classified character also counts as one change, so a
// token made of a single class ends with a count of 1.
private static bool IsNoise (string text)
{
	const int kind_none = -1, kind_letter = 1, kind_digit = 2, kind_punct = 3;
	const int transitions_cutoff = 4;

	// Anything longer than 30 characters is rejected outright.
	if (text.Length > 30)
		return true;

	int switches = 0;
	int first_kind = kind_none;
	int prev_kind = kind_none;
	int prev_solid_kind = kind_none; // last class seen that was not punctuation
	bool saw_letter = false, saw_digit = false, saw_punct = false;

	foreach (char c in text) {
		// No need to keep scanning once the cutoff has been reached.
		if (switches >= transitions_cutoff)
			break;

		int kind;
		if (Char.IsLetter (c)) {
			kind = kind_letter;
			saw_letter = true;
		} else if (Char.IsDigit (c)) {
			kind = kind_digit;
			saw_digit = true;
		} else if (Char.IsPunctuation (c)) {
			kind = kind_punct;
			saw_punct = true;
		} else {
			// Characters outside the three classes do not take part.
			continue;
		}

		// Coming out of a punctuation run only counts when we land on a
		// different class than the one we had before the run.
		if (kind != prev_kind
		    && (prev_kind != kind_punct || kind != prev_solid_kind))
			++switches;

		if (first_kind == kind_none)
			first_kind = kind;

		prev_kind = kind;
		if (kind != kind_punct)
			prev_solid_kind = kind;
	}

	// Too many class changes: noise regardless of length.
	if (switches >= transitions_cutoff)
		return true;

	// The remaining rules only apply to tokens longer than 10 characters.
	if (text.Length > 10) {
		// A single class that is not letters (all digits or all punctuation).
		if (switches == 1 && first_kind != kind_letter)
			return true;

		// Long tokens with many changes.  With the cutoff at 4 this is
		// already covered by the check above; it only matters if the
		// cutoff is raised.
		if (switches > 3)
			return true;

		// A little of everything: letters, digits and punctuation together.
		if (saw_letter && saw_digit && saw_punct)
			return true;
	}

	return false;
}
130 // Don't scan these tokens for additional noise
131 // Someone might like to search for emails, hostnames and
132 // phone numbers (which fall under type NUM)
// Type labels looked up in the standard tokenizer's tokenImage table;
// ProcessToken compares them against Token.Type ().
private static readonly string tokentype_email
	= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.EMAIL];
private static readonly string tokentype_host
	= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.HOST];
private static readonly string tokentype_number
	= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.NUM];
private static readonly string tokentype_alphanum
	= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.ALPHANUM];

// Decides whether `token` should be emitted by the filter.
//
//   token - the token under inspection.  Passed by ref because an
//           all-digit ALPHANUM token with leading zeros is replaced by a
//           new token with those zeros stripped.
//
// Returns true if the (possibly replaced) token should be returned to the
// caller of Next (), false if it should be dropped.
//
// Side effect: for EMAIL and HOST tokens, when tokenize_email_hostname is
// set, ProcessEmailToken / ProcessURLToken are called for the token.
private bool ProcessToken (ref Lucene.Net.Analysis.Token token)
{
	string type = token.Type ();

	if (type == tokentype_email) {
		if (tokenize_email_hostname)
			ProcessEmailToken (token);
		return true;
	}

	if (type == tokentype_host) {
		if (tokenize_email_hostname)
			ProcessURLToken (token);
		return true;
	}

	if (type == tokentype_number) {
		// nobody will remember more than 20 digits
		return (token.TermText ().Length <= 20);
	}

	if (type == tokentype_alphanum) {
		string text = token.TermText ();

		// If the text is entirely digits, count its leading zeros;
		// any non-digit resets the count so nothing is stripped.
		int begin = 0;
		bool found = false;
		foreach (char c in text) {
			if (! Char.IsDigit (c)) {
				begin = 0;
				break;
			} else if (! found) {
				if (c == '0')
					begin ++;
				else
					found = true;
			}
		}

		// A token made only of zeros ("0", "000") would otherwise lose
		// every character and be emitted as an empty term.  Keep the
		// final zero so it normalizes to "0" instead.
		if (begin > 0 && begin == text.Length)
			begin --;

		if (begin == 0)
			return ! IsNoise (text);

		token = new Lucene.Net.Analysis.Token (
			text.Remove (0, begin),
			token.StartOffset (),
			token.EndOffset (),
			token.Type ());
		return true;
	}

	// FIXME: Noise should be only tested on token type alphanum
	return ! IsNoise (token.TermText ());
}
// Sub-parts (strings) of the most recent email/host token, waiting to be
// emitted by Next ().
private Queue parts = new Queue ();

// The last token pulled from token_stream; queued parts borrow its
// offsets and type.
private Lucene.Net.Analysis.Token token;

// Returns the next token of the filtered stream, or null when the
// underlying stream is exhausted.
//
// Queued parts are drained first, each emitted with position increment 0
// and the offsets/type of the token they came from.  Otherwise tokens are
// pulled from token_stream until ProcessToken accepts one.
public override Lucene.Net.Analysis.Token Next ()
{
	if (parts.Count == 0) {
		for (;;) {
			token = token_stream.Next ();
			if (token == null)
				return null;

			//Console.WriteLine ("Found token: [{0}]", token.TermText ());
			if (ProcessToken (ref token))
				return token;
		}
	}

	// FIXME: Searching for google.com will not match www.google.com.
	// If we decide to allow google-style "abcd.1234" which means
	// "abcd 1234" as a consecutive phrase, then adjusting
	// the startOffset and endOffset would enable matching
	// google.com to www.google.com
	Lucene.Net.Analysis.Token part_token = new Lucene.Net.Analysis.Token (
		(string) parts.Dequeue (),
		token.StartOffset (),
		token.EndOffset (),
		token.Type ());
	part_token.SetPositionIncrement (0);
	return part_token;
}
// Characters at which an email address is cut into parts.
private char[] replace_array = { '@', '.', '-', '_', '+' };

// Queues the sub-parts of an email token for Next () to emit.
//
// The address is split at every character in replace_array.  All pieces
// except the last one are queued in order, and the slot of the last piece
// is taken by the whole text before the '@', so the full user name is
// stored as one large token.
private void ProcessEmailToken (Lucene.Net.Analysis.Token token)
{
	string address = token.TermText ();
	string user_name = address.Substring (0, address.IndexOf ('@'));
	string[] pieces = address.Split (replace_array);

	for (int i = 0; i < pieces.Length - 1; ++i)
		parts.Enqueue (pieces [i]);

	parts.Enqueue (user_name);
}
// Queues the dot-separated labels of a host token for Next () to emit.
// A leading "www" label is skipped; every other label is queued in order.
private void ProcessURLToken (Lucene.Net.Analysis.Token token)
{
	string[] labels = token.TermText ().Split ('.');

	// FIXME: Remove final tld
	// Any string of form "(<alnum> '.')+<alnum>" has type HOST
	// Removing last token might remove important words from non-host
	// string of that form. To fix that, we need to match against the
	// huge list of TLDs.
	bool at_first_label = true;
	foreach (string label in labels) {
		// remove initial www
		bool is_leading_www = at_first_label && label == "www";
		at_first_label = false;

		if (! is_leading_www)
			parts.Enqueue (label);
	}
}
250 #if false
// Debugging aid (compiled out by the surrounding #if false): runs the
// indexing analyzer over a reader and prints every token it produces.
public class AnalyzerTest {
	// Tokenizes `reader` as field "Text" and writes each token to the console.
	public static void Analyze (TextReader reader)
	{
		Analyzer indexing_analyzer = new LuceneCommon.BeagleAnalyzer (true);
		TokenStream stream = indexing_analyzer.TokenStream ("Text", reader);

		// Running total of (position increment - 1), starting at 1.
		// It is updated per token but not printed.
		int position = 1;

		Lucene.Net.Analysis.Token t;
		while ((t = stream.Next ()) != null) {
			position += (t.GetPositionIncrement () - 1);
			Console.WriteLine (t);
		}
	}
}
266 #endif