Indexable is not marked _done_ until all the child indexables (including child of...
[beagle.git] / beagled / NoiseFilter.cs
blobd80466e5b3633e093a6661ba10939bef7f0f2337
1 //
2 // NoiseFilter.cs
3 //
4 // Copyright (C) 2006 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004-2005 Novell, Inc.
6 //
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
31 using Lucene.Net.Analysis;
32 using LNSA = Lucene.Net.Analysis.Standard;
namespace Beagle.Daemon {

	// TokenFilter which does several fancy things
	// 1. Removes words which are potential noise like dhyhy8ju7q9
	// 2. Splits email addresses into meaningful tokens
	// 3. Splits hostnames into subparts
	//
	// Tokens are pulled from the wrapped stream one at a time.  Email and
	// host tokens are passed through unchanged and, when
	// tokenize_email_hostname is set, their sub-parts are queued and handed
	// out by the following calls to Next () with a position increment of 0.
	class NoiseEmailHostFilter : TokenFilter {

		// Counters shared by all filter instances.  They are only
		// reported by the disabled (#if false) debug code in Next ().
		static int total_count = 0;
		static int noise_count = 0;

		// Whether email/host tokens should additionally be split into parts.
		private bool tokenize_email_hostname;

		// The stream this filter reads its tokens from.
		TokenStream token_stream;

		// input: the token stream to filter.
		// tokenize_email_hostname: if true, email and host tokens also
		// produce one extra token per sub-part.
		public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname)
			: base (input)
		{
			this.token_stream = input;
			this.tokenize_email_hostname = tokenize_email_hostname;
		}

		// Returns true if text looks like noise and should not be indexed.
		//
		// FIXME: we should add some heuristics that are stricter
		// but explicitly try to avoid filtering out dates,
		// phone numbers, etc.
		private static bool IsNoise (string text)
		{
			// Anything really long is almost certainly noise.
			if (text.Length > 30)
				return true;

			// Character classes; anything else (symbols, whitespace, ...)
			// is ignored entirely by the scan below.
			const int type_none = -1;
			const int type_letter = 1;
			const int type_digit = 2;
			const int type_punct = 3;

			// Look at how often we switch between numbers and letters.
			// Scoring:
			//   <letter> <digit>   1
			//   <digit> <letter>   1
			//   <x> <punct>+ <x>   1
			//   <x> <punct>+ <y>   2
			// The very first classified character also counts as one
			// transition, so a word made of a single class scores 1.
			const int transitions_cutoff = 4;
			int last_type = type_none, last_non_punct_type = type_none, first_type = type_none;
			bool has_letter = false, has_digit = false, has_punctuation = false;
			int transitions = 0;

			// Stop scanning as soon as the cutoff is reached; the word
			// is rejected below in that case no matter what follows.
			for (int i = 0; i < text.Length && transitions < transitions_cutoff; ++i) {
				char c = text [i];
				int type = type_none;

				if (Char.IsLetter (c)) {
					type = type_letter;
					has_letter = true;
				} else if (Char.IsDigit (c)) {
					type = type_digit;
					has_digit = true;
				} else if (Char.IsPunctuation (c)) {
					type = type_punct;
					has_punctuation = true;
				}

				if (type == type_none)
					continue;

				if (type != last_type) {
					if (last_type == type_punct) {
						// Leaving punctuation only costs a
						// transition if we come out in a different
						// class than we went in with.
						if (type != last_non_punct_type)
							++transitions;
					} else {
						++transitions;
					}
				}

				if (first_type == type_none)
					first_type = type;

				last_type = type;
				if (type != type_punct)
					last_non_punct_type = type;
			}

			// If we make too many transitions, it must be noise.
			// (This also covers every "long word with lots of
			// transitions" case, so no separate length-based check on
			// the transition count is needed.)
			if (transitions >= transitions_cutoff)
				return true;

			// A single run of one class that is not letters, i.e. only
			// digits or only punctuation, is noise if it is too long.
			if (transitions == 1 && first_type != type_letter && text.Length > 10)
				return true;

			// Beware of anything long that contains a little of everything.
			if (has_letter && has_digit && has_punctuation && text.Length > 10)
				return true;

			//Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
			return false;
		}

		// Don't scan these tokens for additional noise
		// Someone might like to search for emails, hostnames and
		// phone numbers (which fall under type NUM)
		private static readonly string tokentype_email
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.EMAIL];
		private static readonly string tokentype_host
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.HOST];
		private static readonly string tokentype_number
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.NUM];

		// Handles tokens that are exempt from the noise check.
		// Returns true if the token must be emitted as it is; returns
		// false if the caller should still run it through IsNoise ().
		// As a side effect, email and host tokens get their sub-parts
		// queued in "parts" when tokenize_email_hostname is set.
		private bool ProcessToken (Lucene.Net.Analysis.Token token)
		{
			string type = token.Type ();

			if (type == tokentype_email) {
				if (tokenize_email_hostname)
					ProcessEmailToken (token);
				return true;
			}

			if (type == tokentype_host) {
				if (tokenize_email_hostname)
					ProcessURLToken (token);
				return true;
			}

			if (type == tokentype_number) {
				// nobody will remember more than 10 digits
				return (token.TermText ().Length <= 10);
			}

			return false;
		}

		// Sub-parts of the most recently returned email/host token that
		// still have to be handed out by Next ().
		private Queue parts = new Queue ();

		// The token most recently read from token_stream; queued parts
		// borrow its offsets and type.
		private Lucene.Net.Analysis.Token token;

		// Returns the next token that survives filtering, or null when
		// the underlying stream is exhausted.
		public override Lucene.Net.Analysis.Token Next ()
		{
			// Drain pending sub-parts before reading any new token.
			if (parts.Count != 0) {
				string part = (string) parts.Dequeue ();
				Lucene.Net.Analysis.Token part_token;
				// FIXME: Searching for google.com will not match www.google.com.
				// If we decide to allow google-style "abcd.1234" which means
				// "abcd 1234" as a consecutive phrase, then adjusting
				// the startOffset and endOffset would enable matching
				// google.com to www.google.com
				part_token = new Lucene.Net.Analysis.Token (part,
									    token.StartOffset (),
									    token.EndOffset (),
									    token.Type ());
				// Position increment 0: the part shares the position
				// of the token it was derived from.
				part_token.SetPositionIncrement (0);
				return part_token;
			}

			while ( (token = token_stream.Next ()) != null) {
				//Console.WriteLine ("Found token: [{0}]", token.TermText ());
#if false
				if (total_count > 0 && total_count % 5000 == 0)
					Logger.Log.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)",
							  noise_count, total_count, 100.0 * noise_count / total_count);
#endif
				++total_count;

				// Emails, hosts and short numbers bypass the noise check.
				if (ProcessToken (token))
					return token;

				if (IsNoise (token.TermText ())) {
					++noise_count;
					continue;
				}

				return token;
			}

			return null;
		}

		// Characters at which an email address is broken into parts.
		private static readonly char[] replace_array = { '@', '.', '-', '_', '+' };

		// Queues a sub-part for Next (), ignoring empty fragments (which
		// String.Split produces for leading, trailing or adjacent
		// separators) so that no empty-term token is ever emitted.
		private void EnqueuePart (string part)
		{
			if (part.Length != 0)
				parts.Enqueue (part);
		}

		// Queues the sub-parts of an email token: every fragment between
		// separators, except that the last fragment is replaced by the
		// complete part before the '@'.
		private void ProcessEmailToken (Lucene.Net.Analysis.Token token)
		{
			string email = token.TermText ();

			// Presumably EMAIL tokens always contain an '@'; if one
			// does not, leave it unsplit instead of letting
			// Substring () throw on a negative length.
			int index_at = email.IndexOf ('@');
			if (index_at < 0)
				return;

			// Split () always returns at least one element, so the
			// last slot exists.
			string[] tmp = email.Split (replace_array);

			// store username part as a large token
			tmp [tmp.Length - 1] = email.Substring (0, index_at);

			foreach (string s in tmp)
				EnqueuePart (s);
		}

		// Queues the dot-separated parts of a host token, skipping a
		// leading "www".
		private void ProcessURLToken (Lucene.Net.Analysis.Token token)
		{
			string hostname = token.TermText ();
			string[] host_parts = hostname.Split ('.');

			// remove initial www
			int begin_index = (host_parts [0] == "www" ? 1 : 0);
			// FIXME: Remove final tld
			// Any string of form "(<alnum> '.')+<alnum>" has type HOST
			// Removing last token might remove important words from non-host
			// string of that form. To fix that, we need to match against the
			// huge list of TLDs.
			for (int i = begin_index; i < host_parts.Length; ++i)
				EnqueuePart (host_parts [i]);
		}
	}
}