4 // Copyright (C) 2006 Debajyoti Bera <dbera.web@gmail.com>
5 // Copyright (C) 2004-2005 Novell, Inc.
9 // Permission is hereby granted, free of charge, to any person obtaining a
10 // copy of this software and associated documentation files (the "Software"),
11 // to deal in the Software without restriction, including without limitation
12 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 // and/or sell copies of the Software, and to permit persons to whom the
14 // Software is furnished to do so, subject to the following conditions:
16 // The above copyright notice and this permission notice shall be included in
17 // all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 // DEALINGS IN THE SOFTWARE.
30 using System
.Collections
;
32 using Lucene
.Net
.Analysis
;
33 using LNSA
= Lucene
.Net
.Analysis
.Standard
;
35 namespace Beagle
.Daemon
{
37 // TokenFilter which does several fancy things
38 // 1. Removes words which are potential noise like dhyhy8ju7q9
39 // 2. Splits email addresses into meaningful tokens
40 // 3. Splits hostnames into subparts
41 class NoiseEmailHostFilter
: TokenFilter
{
// When true, ProcessToken additionally hands email and host tokens to
// ProcessEmailToken / ProcessURLToken so they can be split into parts.
43 private bool tokenize_email_hostname
;
// The upstream token stream; Next () pulls its tokens from here.
45 TokenStream token_stream
;
// Constructor: remembers the wrapped stream and the splitting flag.
// NOTE(review): the lines between the parameter list and the first
// assignment are not present in this listing (no opening brace or base
// constructor call is visible) - confirm against the complete file.
47 public NoiseEmailHostFilter (TokenStream input
, bool tokenize_email_hostname
)
50 this.token_stream
= input
;
51 this.tokenize_email_hostname
= tokenize_email_hostname
;
// Heuristic test on the text of a single term.
// ProcessToken returns the negation of this result, so a true result is
// presumably what causes a token to be rejected.
// NOTE(review): the declarations of "transitions", "c" and "type", the
// statements inside most branches, and every return statement are not
// present in this listing; the comments below describe only the conditions
// that are visible.
54 // FIXME: we should add some heuristics that are stricter
55 // but explicitly try to avoid filtering out dates,
56 // phone numbers, etc.
57 private static bool IsNoise (string text
)
59 // Anything really long is almost certainly noise.
63 // Look at how often we switch between numbers and letters.
// The scan below stops as soon as this many transitions have been counted.
69 const int transitions_cutoff
= 4;
// Character-class bookkeeping. All three start at -1, presumably meaning
// "no character class seen yet" - TODO confirm against the missing lines.
70 int last_type
= -1, last_non_punct_type
= -1, first_type
= -1;
// Record which character classes occur anywhere in the scanned prefix.
71 bool has_letter
= false, has_digit
= false, has_punctuation
= false;
// Walk the text one character at a time; the second loop condition ends
// the scan early once the cutoff has been reached.
73 for (int i
= 0; i
< text
.Length
&& transitions
< transitions_cutoff
; ++i
) {
// Classify the current character as letter, digit or punctuation.
// Characters matching none of the three tests fall through every branch.
76 if (Char
.IsLetter (c
)) {
79 } else if (Char
.IsDigit (c
)) {
82 } else if (Char
.IsPunctuation (c
)) {
84 has_punctuation
= true;
// The class changed relative to the previous classified character.
89 if (type
!= last_type
) {
// Compare against the last class that was not punctuation.
// NOTE(review): the enclosing condition and the statement guarded by this
// test are not visible here.
91 if (type
!= last_non_punct_type
)
103 last_non_punct_type
= type
;
107 // If we make too many transitions, it must be noise.
108 if (transitions
>= transitions_cutoff
)
111 // If we consist of nothing but digits and punctuation, treat it
112 // as noise if it is too long.
// NOTE(review): this relies on 1 being the type code for letters; the
// assignment of type codes is not visible in this listing - confirm.
113 if (transitions
== 1 && first_type
!= 1 && text
.Length
> 10)
116 // We are very suspicious of long things that make lots of transitions.
// NOTE(review): with transitions_cutoff = 4, the earlier
// "transitions >= transitions_cutoff" test already covers every value
// greater than 3, so this check may never be reached - confirm.
118 if (transitions
> 3 && text
.Length
> 10)
121 // Beware of anything long that contains a little of everything.
122 if (has_letter
&& has_digit
&& has_punctuation
&& text
.Length
> 10)
125 //Logger.Log.Debug ("BeagleNoiseFilter accepted '{0}'", text);
130 // Don't scan these tokens for additional noise
131 // Someone might like to search for emails, hostnames and
132 // phone numbers (which fall under type NUM)
// Each constant below is the tokenImage entry of the standard tokenizer for
// one token kind. ProcessToken compares Token.Type () against these strings
// to decide how a token is handled.
// Type string for email tokens.
133 private static readonly string tokentype_email
134 = LNSA
.StandardTokenizerConstants
.tokenImage
[LNSA
.StandardTokenizerConstants
.EMAIL
];
// Type string for host name tokens.
135 private static readonly string tokentype_host
136 = LNSA
.StandardTokenizerConstants
.tokenImage
[LNSA
.StandardTokenizerConstants
.HOST
];
// Type string for numeric tokens.
137 private static readonly string tokentype_number
138 = LNSA
.StandardTokenizerConstants
.tokenImage
[LNSA
.StandardTokenizerConstants
.NUM
];
// Type string for alphanumeric tokens.
139 private static readonly string tokentype_alphanum
140 = LNSA
.StandardTokenizerConstants
.tokenImage
[LNSA
.StandardTokenizerConstants
.ALPHANUM
];
// Decides what to do with one token, based on its type string.
// The token is passed by reference because the alphanumeric branch may
// replace it with a newly constructed token.
// The boolean result is tested by Next (); the visible return statements
// yield true for tokens that pass the length / noise checks.
// NOTE(review): the declarations of "found" and "begin", the bodies of the
// digit-scanning loop, and the returns of the email and host branches are
// not present in this listing.
142 private bool ProcessToken (ref Lucene
.Net
.Analysis
.Token token
)
144 string type
= token
.Type ();
// Email token: optionally split it into parts.
146 if (type
== tokentype_email
) {
147 if (tokenize_email_hostname
)
148 ProcessEmailToken (token
);
// Host token: optionally split it into parts.
150 } else if (type
== tokentype_host
) {
151 if (tokenize_email_hostname
)
152 ProcessURLToken (token
);
// Numeric token: accepted only when its text is at most 20 characters.
154 } else if (type
== tokentype_number
) {
155 // nobody will remember more than 20 digits
156 return (token
.TermText ().Length
<= 20);
// Alphanumeric token: inspect the characters of its text.
157 } else if (type
== tokentype_alphanum
) {
158 string text
= token
.TermText ();
161 // Check if number, in that case strip 0's from beginning
162 foreach (char c
in text
) {
// A non-digit character was found.
// NOTE(review): the statements inside these branches are not visible.
163 if (! Char
.IsDigit (c
)) {
166 } else if (! found
) {
// Result of the noise heuristic, negated.
175 return ! IsNoise (text
);
// Replace the caller's token with one whose text has the first "begin"
// characters removed, keeping the original start offset.
// NOTE(review): the remaining constructor arguments are not visible.
176 token
= new Lucene
.Net
.Analysis
.Token (
177 token
.TermText ().Remove (0, begin
),
178 token
.StartOffset (),
183 // FIXME: Noise should be only tested on token type alphanum
// Fallback: apply the noise heuristic to the token text.
184 return ! IsNoise (token
.TermText ());
// Queue of pending sub-part strings. ProcessURLToken enqueues host name
// components here and Next () dequeues them one at a time as strings.
187 private Queue parts
= new Queue ();
// The token most recently read from token_stream by Next (); its start
// offset is reused when tokens are built for queued parts.
188 private Lucene
.Net
.Analysis
.Token token
;
// Produces the next token of the filtered stream.
// Queued sub-parts are served first; otherwise tokens are read from
// token_stream and passed through ProcessToken.
// NOTE(review): the return statements are not present in this listing.
// Presumably a part token or an accepted token is returned, and null once
// the upstream stream is exhausted - confirm against the complete file.
190 public override Lucene
.Net
.Analysis
.Token
Next ()
// Pending sub-parts from an earlier email / host token take priority.
192 if (parts
.Count
!= 0) {
193 string part
= (string) parts
.Dequeue ();
194 Lucene
.Net
.Analysis
.Token part_token
;
195 // FIXME: Searching for google.com will not match www.google.com.
196 // If we decide to allow google-style "abcd.1234" which means
197 // "abcd 1234" as a consecutive phrase, then adjusting
198 // the startOffset and endOffset would enable matching
199 // google.com to www.google.com
// Build a token for the part, starting at the start offset of the token it
// was split from.
// NOTE(review): the remaining constructor arguments are not visible.
200 part_token
= new Lucene
.Net
.Analysis
.Token (part
,
201 token
.StartOffset (),
// A position increment of 0 is set on every part token.
204 part_token
.SetPositionIncrement (0);
// Read upstream tokens until one passes ProcessToken or the stream ends.
208 while ( (token
= token_stream
.Next ()) != null) {
209 //Console.WriteLine ("Found token: [{0}]", token.TermText ());
210 if (ProcessToken (ref token
))
// Separator characters on which an email address is split.
216 char[] replace_array
= { '@', '.', '-', '_', '+' }
;
// Splits the text of an email token on the separator characters above.
// NOTE(review): the declaration of "l" and the body of the foreach loop are
// not present in this listing. Presumably "l" is tmp.Length and each element
// is enqueued on "parts" - confirm against the complete file.
217 private void ProcessEmailToken (Lucene
.Net
.Analysis
.Token token
)
219 string email
= token
.TermText ();
220 string[] tmp
= email
.Split (replace_array
);
223 // store username part as a large token
224 int index_at
= email
.IndexOf ('@');
// Overwrite slot l-1 of the split result with everything before the '@'.
// NOTE(review): Substring would throw if index_at were -1; this assumes
// tokens of the email type always contain '@' - confirm.
225 tmp
[l
-1] = email
.Substring (0, index_at
);
227 foreach (string s
in tmp
)
// Splits the text of a host token on '.' and queues the components on
// "parts", skipping a leading "www" component.
232 private void ProcessURLToken (Lucene
.Net
.Analysis
.Token token
)
234 string hostname
= token
.TermText ();
235 string[] host_parts
= hostname
.Split ('.');
237 // remove initial www
// Start at index 1 when the first component is exactly "www", else at 0.
238 int begin_index
= (host_parts
[0] == "www" ? 1 : 0);
239 // FIXME: Remove final tld
240 // Any string of form "(<alnum> '.')+<alnum>" has type HOST
241 // Removing last token might remove important words from non-host
242 // string of that form. To fix that, we need to match against the
243 // huge list of TLDs.
// Queue every remaining component, including the last one.
244 for (int i
= begin_index
; i
< host_parts
.Length
; ++i
)
245 parts
.Enqueue (host_parts
[i
]);
// Small driver that runs text through the indexing analyzer and prints the
// resulting tokens.
// NOTE(review): the declaration of "position" and the closing lines of this
// class are not present in this listing.
251 public class AnalyzerTest
{
// Tokenizes everything from the reader and writes each token to the console.
252 public static void Analyze (TextReader reader
)
// Initialized to null; no other use is visible in this listing.
254 Lucene
.Net
.Analysis
.Token lastToken
= null;
255 Analyzer indexing_analyzer
= new LuceneCommon
.BeagleAnalyzer (true);
// Obtain the token stream for the field named "Text".
256 TokenStream stream
= indexing_analyzer
.TokenStream ("Text", reader
);
// Iterate until the stream returns null.
259 for (Lucene
.Net
.Analysis
.Token t
= stream
.Next(); t
!= null; t
= stream
.Next())
// Accumulate (position increment - 1) for every token.
261 position
+= (t
.GetPositionIncrement() - 1);
262 Console
.WriteLine (t
);