Tokenize 001234 as 1234. Include a testing function in NoiseFilter to figure out...
[beagle.git] / beagled / NoiseFilter.cs
//
// NoiseFilter.cs
//
// Copyright (C) 2006 Debajyoti Bera <dbera.web@gmail.com>
// Copyright (C) 2004-2005 Novell, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//

using System;
using System.IO;
using System.Collections;

using Lucene.Net.Analysis;
using LNSA = Lucene.Net.Analysis.Standard;

namespace Beagle.Daemon {

	// TokenFilter which does several fancy things
	// 1. Removes words which are potential noise like dhyhy8ju7q9
	// 2. Splits email addresses into meaningful tokens
	// 3. Splits hostnames into subparts
	class NoiseEmailHostFilter : TokenFilter {

		private bool tokenize_email_hostname;

		TokenStream token_stream;

		public NoiseEmailHostFilter (TokenStream input, bool tokenize_email_hostname)
			: base (input)
		{
			this.token_stream = input;
			this.tokenize_email_hostname = tokenize_email_hostname;
		}

		// FIXME: we should add some heuristics that are stricter
		// but explicitly try to avoid filtering out dates,
		// phone numbers, etc.
		private static bool IsNoise (string text)
		{
			// Anything really long is almost certainly noise.
			if (text.Length > 30)
				return true;

			// Look at how often we switch between numbers and letters.
			// Scoring:
			// <letter> <digit>   1
			// <digit> <letter>   1
			// <x> <punct>+ <x>   1
			// <x> <punct>+ <y>   2
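			//
			// For example, IsNoise ("dhyhy8ju7q9") is true: the scan keeps
			// alternating between letters and digits and reaches the cutoff
			// of 4 transitions. IsNoise ("2006-05-12") is false: it scores
			// only 3 and is short enough to pass the remaining checks.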
			const int transitions_cutoff = 4;
			int last_type = -1, last_non_punct_type = -1, first_type = -1;
			bool has_letter = false, has_digit = false, has_punctuation = false;
			int transitions = 0;
			for (int i = 0; i < text.Length && transitions < transitions_cutoff; ++i) {
				char c = text [i];
				int type = -1;
				if (Char.IsLetter (c)) {
					type = 1;
					has_letter = true;
				} else if (Char.IsDigit (c)) {
					type = 2;
					has_digit = true;
				} else if (Char.IsPunctuation (c)) {
					type = 3;
					has_punctuation = true;
				}

				if (type != -1) {
					if (type != last_type) {
						if (last_type == 3) {
							if (type != last_non_punct_type)
								++transitions;
						} else {
							++transitions;
						}
					}

					if (first_type == -1)
						first_type = type;

					last_type = type;
					if (type != 3)
						last_non_punct_type = type;
				}
			}

			// If we make too many transitions, it must be noise.
			if (transitions >= transitions_cutoff)
				return true;

			// If we consist of nothing but digits and punctuation, treat it
			// as noise if it is too long.
			if (transitions == 1 && first_type != 1 && text.Length > 10)
				return true;

			// We are very suspicious of long things that make lots of
			// transitions
			if (transitions > 3 && text.Length > 10)
				return true;

			// Beware of anything long that contains a little of everything.
			if (has_letter && has_digit && has_punctuation && text.Length > 10)
				return true;

			//Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
			return false;
		}

		// Don't scan these tokens for additional noise.
		// Someone might like to search for emails, hostnames and
		// phone numbers (which fall under type NUM).
		private static readonly string tokentype_email
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.EMAIL];
		private static readonly string tokentype_host
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.HOST];
		private static readonly string tokentype_number
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.NUM];
		private static readonly string tokentype_alphanum
			= LNSA.StandardTokenizerConstants.tokenImage [LNSA.StandardTokenizerConstants.ALPHANUM];

		private bool ProcessToken (ref Lucene.Net.Analysis.Token token)
		{
			string type = token.Type ();

			if (type == tokentype_email) {
				if (tokenize_email_hostname)
					ProcessEmailToken (token);
				return true;
			} else if (type == tokentype_host) {
				if (tokenize_email_hostname)
					ProcessURLToken (token);
				return true;
			} else if (type == tokentype_number) {
				// nobody will remember more than 10 digits
				return (token.TermText ().Length <= 10);
			} else if (type == tokentype_alphanum) {
				string text = token.TermText ();
				int begin = 0;
				// If the token is all digits, strip the 0's from the
				// beginning; only count zeros while they are still leading.
				for (int i = 0; i < text.Length; ++i) {
					if (! Char.IsDigit (text [i])) {
						begin = 0;
						break;
					} else if (text [i] == '0' && begin == i)
						begin ++;
				}

				if (begin == 0)
					return ! IsNoise (text);

				// Re-emit the token without the leading zeros, keeping the
				// original offsets and type.
				token = new Lucene.Net.Analysis.Token (
					token.TermText ().Remove (0, begin),
					token.StartOffset (),
					token.EndOffset (),
					token.Type ());
				return true;
			} else
				// FIXME: Noise should only be tested on tokens of type alphanum
				return ! IsNoise (token.TermText ());
		}
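
		// For example, an ALPHANUM token "001234" is re-emitted as "1234"
		// with its original offsets, while a token of type NUM is kept only
		// if its text is at most 10 characters long.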

		private Queue parts = new Queue ();
		private Lucene.Net.Analysis.Token token;

		public override Lucene.Net.Analysis.Token Next ()
		{
			if (parts.Count != 0) {
				string part = (string) parts.Dequeue ();
				Lucene.Net.Analysis.Token part_token;
				// FIXME: Searching for google.com will not match www.google.com.
				// If we decide to allow google-style "abcd.1234", which means
				// "abcd 1234" as a consecutive phrase, then adjusting
				// the startOffset and endOffset would enable matching
				// google.com to www.google.com.
				part_token = new Lucene.Net.Analysis.Token (part,
									    token.StartOffset (),
									    token.EndOffset (),
									    token.Type ());
				part_token.SetPositionIncrement (0);
				return part_token;
			}

			while ((token = token_stream.Next ()) != null) {
				//Console.WriteLine ("Found token: [{0}]", token.TermText ());
				if (ProcessToken (ref token))
					return token;
			}

			return null;
		}
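
		// Every queued subpart is emitted with a position increment of 0,
		// i.e. at the same position as the original token, so a query for
		// just "google" will match a document containing "www.google.com".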

		char[] replace_array = { '@', '.', '-', '_', '+' };

		private void ProcessEmailToken (Lucene.Net.Analysis.Token token)
		{
			string email = token.TermText ();
			string[] tmp = email.Split (replace_array);
			int l = tmp.Length;

			// store username part as a large token
			int index_at = email.IndexOf ('@');
			tmp [l-1] = email.Substring (0, index_at);

			foreach (string s in tmp)
				parts.Enqueue (s);
		}
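
		// For example, "foo.bar@example.com" splits into "foo", "bar",
		// "example" and "com"; the last slot is then overwritten with the
		// full username "foo.bar", so the queued parts are "foo", "bar",
		// "example" and "foo.bar".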

		private void ProcessURLToken (Lucene.Net.Analysis.Token token)
		{
			string hostname = token.TermText ();
			string[] host_parts = hostname.Split ('.');

			// remove the initial www
			int begin_index = (host_parts [0] == "www" ? 1 : 0);
			// FIXME: Remove the final tld.
			// Any string of the form "(<alnum> '.')+ <alnum>" has type HOST.
			// Removing the last token might remove important words from
			// non-host strings of that form. To fix that, we need to match
			// against the huge list of TLDs.
			for (int i = begin_index; i < host_parts.Length; ++i)
				parts.Enqueue (host_parts [i]);
		}
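
		// For example, "www.google.com" queues "google" and "com" (the
		// leading "www" is dropped), while "mail.example.org" queues
		// "mail", "example" and "org".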
	}

#if false
	public class AnalyzerTest {
		public static void Analyze (TextReader reader)
		{
			Lucene.Net.Analysis.Token lastToken = null;
			Analyzer indexing_analyzer = new LuceneCommon.BeagleAnalyzer (true);
			TokenStream stream = indexing_analyzer.TokenStream ("Text", reader);

			int position = 1;
			for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next())
			{
				position += (t.GetPositionIncrement() - 1);
				Console.WriteLine (t);
			}
		}
	}
#endif
}
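
// A possible way to exercise the compiled-out AnalyzerTest above (a sketch,
// assuming the "#if false" is flipped on and LuceneCommon.BeagleAnalyzer is
// available as referenced in Analyze):
//
//	using (StringReader reader = new StringReader ("Tokenize 001234 as 1234"))
//		AnalyzerTest.Analyze (reader);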