Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / NoiseFilter.cs
blob4862d26ce3cd8b6516b604c8b0ae356f09581d38
//
// NoiseFilter.cs
//
// Copyright (C) 2004-2005 Novell, Inc.
//
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
27 using System;
29 using Lucene.Net.Analysis;
namespace Beagle.Daemon {

	// A token filter that drops every token whose text IsNoise () flags as
	// noise and passes all other tokens through unchanged.
	class NoiseFilter : TokenFilter {

		// Running totals shared by all NoiseFilter instances: how many
		// tokens Next () has examined and how many of those it dropped.
		// They are updated without any synchronization and are only
		// reported by the debugging code in Next () that is currently
		// compiled out.
		static int total_count = 0;
		static int noise_count = 0;

		// The stream we pull unfiltered tokens from.
		readonly TokenStream token_stream;

		// Wraps 'input'; tokens are read from it lazily by Next ().
		public NoiseFilter (TokenStream input) : base (input)
		{
			token_stream = input;
		}

		// Character classes used when counting transitions in IsNoise ().
		// A character that is neither a letter, a digit nor punctuation
		// is classified as None and ignored by the scoring.
		private enum CharClass {
			None,
			Letter,
			Digit,
			Punctuation
		}

		// Maps a character to its class.  The checks are made in the
		// order letter, digit, punctuation.
		private static CharClass Classify (char c)
		{
			if (Char.IsLetter (c))
				return CharClass.Letter;
			if (Char.IsDigit (c))
				return CharClass.Digit;
			if (Char.IsPunctuation (c))
				return CharClass.Punctuation;
			return CharClass.None;
		}

		// Decides whether a token's text looks like noise.
		//
		// text: the token text to examine; must not be null.
		// Returns true if the token should be dropped, false if it
		// should be kept.  The empty string is never noise.
		//
		// FIXME: we should add some heuristics that are stricter
		// but explicitly try to avoid filtering out dates,
		// phone numbers, etc.
		private static bool IsNoise (string text)
		{
			// Anything really long is almost certainly noise.
			if (text.Length > 30)
				return true;

			// Look at how often we switch between numbers and letters.
			// Scoring:
			//   <letter> <digit>    1
			//   <digit> <letter>    1
			//   <x> <punct>+ <x>    1
			//   <x> <punct>+ <y>    2
			// The very first classified character also counts as one
			// transition, so a token made of a single character class
			// scores exactly 1.
			const int transitions_cutoff = 4;

			CharClass first = CharClass.None;
			CharClass last = CharClass.None;
			CharClass last_non_punct = CharClass.None;
			bool has_letter = false, has_digit = false, has_punctuation = false;
			int transitions = 0;

			// Stop scanning as soon as the cutoff is reached; the token
			// is rejected in that case no matter what follows.
			for (int i = 0; i < text.Length && transitions < transitions_cutoff; ++i) {
				CharClass current = Classify (text [i]);
				if (current == CharClass.None)
					continue;

				switch (current) {
				case CharClass.Letter:
					has_letter = true;
					break;
				case CharClass.Digit:
					has_digit = true;
					break;
				case CharClass.Punctuation:
					has_punctuation = true;
					break;
				}

				if (current != last) {
					// Coming out of a run of punctuation only counts
					// if we land in a different class than the one
					// we were in before the punctuation started.
					if (last != CharClass.Punctuation || current != last_non_punct)
						++transitions;
				}

				if (first == CharClass.None)
					first = current;

				last = current;
				if (current != CharClass.Punctuation)
					last_non_punct = current;
			}

			// If we make too many transitions, it must be noise.
			if (transitions >= transitions_cutoff)
				return true;

			// A single transition means the token is built from just
			// one character class.  If that class is not letters (i.e.
			// the token is all digits or all punctuation), treat it as
			// noise if it is too long.
			if (transitions == 1 && first != CharClass.Letter && text.Length > 10)
				return true;

			// NOTE: an earlier "transitions > 3 && text.Length > 10" rule
			// used to live here.  It could never fire because anything
			// with more than 3 transitions has already been rejected by
			// the cutoff check above, so it has been removed.  A stricter
			// rule for long tokens would need a threshold below
			// transitions_cutoff.

			// Beware of anything long that contains a little of everything.
			if (has_letter && has_digit && has_punctuation && text.Length > 10)
				return true;

			//Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
			return false;
		}

		// Returns the next token from the wrapped stream that is not
		// noise, or null once the wrapped stream is exhausted.
		public override Lucene.Net.Analysis.Token Next ()
		{
			Lucene.Net.Analysis.Token token;
			while ( (token = token_stream.Next ()) != null) {
#if false
				if (total_count > 0 && total_count % 5000 == 0)
					Logger.Log.Debug ("BeagleNoiseFilter filtered {0} of {1} ({2:0.0}%)",
							  noise_count, total_count, 100.0 * noise_count / total_count);
#endif
				++total_count;
				if (IsNoise (token.TermText ())) {
					++noise_count;
					continue;
				}

				return token;
			}

			return null;
		}
	}
}