Tokenize 001234 as 1234. Include a testing function in NoiseFilter to figure out...
[beagle.git] / beagled / SnippetFu.cs
blob7c6051cb648fb3a84d0515ab63120f1b7114794f
//
// SnippetFu.cs
//
// Copyright (C) 2005 Novell, Inc.
//

//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
using System;
using System.Collections;
using System.IO;
using System.Text;

using Beagle.Util;
// FIXME: Hack. Use Lucene.Net highlighting.
35 namespace Beagle.Daemon {
37 public class SnippetFu {
// Pull-style text source: each invocation yields the next chunk of text to
// scan.  GetSnippet stops pulling as soon as the delegate returns null.
public delegate string StringSource ();
// Returns true when `c` ends a token: any character that Char classifies as
// whitespace, as a Unicode separator, or as punctuation.  Symbol characters
// such as '+' fall in none of those classes and therefore stay inside tokens.
private static bool IsTokenSeparator (char c)
{
	if (Char.IsWhiteSpace (c))
		return true;

	if (Char.IsSeparator (c))
		return true;

	return Char.IsPunctuation (c);
}
// Maximum number of space-delimited words of context kept on each side of a
// highlighted token.
const int max_prior_words = 6;
const int max_following_words = 6;

// Returns the index in `stemmed_terms` of the first term that equals
// (ignoring case) the stemmed form of text [start, end), or -1 when no term
// matches.  Null entries are skipped.
private static int FindMatchingTerm (ArrayList stemmed_terms, string text, int start, int end)
{
	// The token is stemmed lazily and at most once, no matter how many
	// terms it is compared against.
	string stemmed_token = null;

	for (int i = 0; i < stemmed_terms.Count; i++) {
		string term = (string) stemmed_terms [i];

		// A term longer than the raw token is never considered a match.
		if (term == null || end - start < term.Length)
			continue;

		if (stemmed_token == null)
			stemmed_token = LuceneCommon.Stem (text.Substring (start, end - start));

		if (String.Compare (term, stemmed_token, true) == 0)
			return i;
	}

	return -1;
}

// Walks left from the token starting at `token_start` and returns the index
// at which its leading context begins: the character just after the
// (max_prior_words + 1)-th space, or 0 when the text runs out first.
// Index 0 itself is never inspected, so a space sitting there is simply kept
// as part of the context.
private static int FindContextStart (string text, int token_start)
{
	int spaces = 0;

	for (int i = token_start - 1; i >= 1; i--) {
		if (text [i] == ' ' && ++spaces > max_prior_words)
			return i + 1;
	}

	return 0;
}

// Walks right from `token_end` (the index just past the token) and returns
// the exclusive end of its trailing context: the index of the
// (max_following_words + 1)-th space, or text.Length when the text runs out
// first.
private static int FindContextStop (string text, int token_end)
{
	int spaces = 0;

	for (int i = token_end; i < text.Length; i++) {
		if (text [i] == ' ' && ++spaces > max_following_words)
			return i;
	}

	return text.Length;
}

// Scans `text` token by token (a token is a maximal run of characters for
// which IsTokenSeparator is false) and, for every token whose stem matches
// one of `stemmed_terms`, builds an excerpt made of up to max_prior_words
// words of leading context, the token wrapped in
// <font color="..."><b>...</b></font>, and up to max_following_words words
// of trailing context.  Excerpts whose context windows overlap are merged
// into a single entry.
//
// stemmed_terms: strings compared (ignoring case) against the output of
//                LuceneCommon.Stem, so they are expected to be stemmed
//                already.  The position of the matching term selects the
//                highlight color.
// text:          the text to scan; null yields no matches.
// matches:       receives the excerpt strings; allocated here if null.
//
// Returns the combined length, markup included, of the excerpts added to
// `matches` by this call.
//
// NOTE(review): `text` is copied into the markup without HTML escaping; if
// it can contain '<' or '&' the consumer of the snippet must cope with that.
private static int HighlightTerms (ArrayList stemmed_terms, string text, ref ArrayList matches)
{
	if (text == null || stemmed_terms == null || stemmed_terms.Count == 0)
		return 0;

	if (matches == null)
		matches = new ArrayList ();

	int pos = 0;

	// End of the context window and end of the highlighted token of the
	// excerpt currently being accumulated in prev_match.
	int prev_stop_pos = 0, prev_end_pos = 0;
	string prev_match = "";
	int length = 0;

	while (pos < text.Length) {

		// Find the beginning of the next token.
		if (IsTokenSeparator (text [pos])) {
			++pos;
			continue;
		}

		// Find the end of the next token.
		int next_pos = pos + 1;
		while (next_pos < text.Length && !IsTokenSeparator (text [next_pos]))
			++next_pos;

		int term_index = FindMatchingTerm (stemmed_terms, text, pos, next_pos);

		if (term_index >= 0) {
			int start_pos = FindContextStart (text, pos);
			int stop_pos = FindContextStop (text, next_pos);

			// If this window reaches back into the previous excerpt, drop
			// the previous excerpt's trailing context and continue it from
			// the end of its highlighted token instead.
			bool append_to_prev_match = false;
			if (prev_stop_pos > start_pos) {
				start_pos = prev_end_pos;
				prev_match = prev_match.Substring (0, prev_match.Length - (prev_stop_pos - prev_end_pos));
				append_to_prev_match = true;
			}

			string new_match = String.Concat (text.Substring (start_pos, pos - start_pos),
							  "<font color=\"",
							  colors [term_index % colors.Length],
							  "\"><b>",
							  text.Substring (pos, next_pos - pos),
							  "</b></font>",
							  text.Substring (next_pos, stop_pos - next_pos));

			if (append_to_prev_match) {
				prev_match += new_match;
			} else {
				// The previous excerpt is complete; flush it.
				if (prev_match != "") {
					matches.Add (prev_match);
					length += prev_match.Length;
				}
				prev_match = new_match;
			}

			prev_stop_pos = stop_pos;
			prev_end_pos = next_pos;
		}

		pos = next_pos;
	}

	// Add trailing match
	if (prev_match != "") {
		matches.Add (prev_match);
		length += prev_match.Length;
	}

	return length;
}
// Highlight palette.  HighlightTerms picks an entry by the position of the
// matching query term, wrapping around when there are more terms than colors.
private static readonly string[] colors = new string [] {"red", "blue", "green", "orange", "purple", "brown"};

// GetSnippet stops collecting excerpts once their combined length reaches
// this many characters.  The limit is soft: the excerpt that crosses it is
// kept whole.
const int soft_snippet_limit = 400;
// Builds a highlighted snippet for `query_terms` from the text produced by
// `string_source`.
//
// query_terms:   terms to highlight.  Stop words (per
//                LuceneCommon.IsStopWord) and null entries are ignored; a
//                null array is treated as empty.  The terms are compared
//                against stemmed tokens, so callers presumably pass stemmed
//                terms -- TODO confirm.
// string_source: called repeatedly for successive chunks of text until it
//                returns null or enough excerpts have been collected.
//
// Returns null when `string_source` is null; otherwise the excerpts found,
// each followed by " ... ", or "" when nothing matched.
public static string GetSnippet (string[] query_terms, StringSource string_source)
{
	// FIXME: If the query doesn't have search text (or is null), we should
	// generate a 'summary snippet'.

	if (string_source == null)
		return null;

	// Remove stop words from query_terms.
	ArrayList query_terms_list = new ArrayList ();
	if (query_terms != null) {
		foreach (string term in query_terms) {
			if (term == null || LuceneCommon.IsStopWord (term))
				continue;
			query_terms_list.Add (term);
		}
	}

	// With nothing to search for no excerpt can ever be produced, so don't
	// bother pulling any text from the source.
	if (query_terms_list.Count == 0)
		return "";

	ArrayList matches = new ArrayList ();
	int found_snippet_length = 0;

	string str;
	while ((str = string_source ()) != null) {
		found_snippet_length += HighlightTerms (query_terms_list, str, ref matches);
		if (found_snippet_length >= soft_snippet_limit)
			break;
	}

	// The length check happens before each append, so the excerpt that
	// crosses the soft limit is still included in full.
	StringBuilder snippet = new StringBuilder ();
	for (int i = 0; i < matches.Count && snippet.Length < soft_snippet_limit; i++) {
		snippet.Append ((string) matches [i]);
		snippet.Append (" ... ");
	}

	return snippet.ToString ();
}
// Convenience overload: builds the snippet from `reader`, one line at a time
// (reader.ReadLine is used as the text source).  Returns null when `reader`
// is null, mirroring the StringSource overload's handling of a null source.
// The reader is not closed here; that remains the caller's job.
public static string GetSnippet (string[] query_terms, TextReader reader)
{
	if (reader == null)
		return null;

	StringSource line_source = new StringSource (reader.ReadLine);
	return GetSnippet (query_terms, line_source);
}
// Builds a snippet for `query_terms` from the contents of the file at
// `filename`.  The file is opened read-only with FileShare.ReadWrite, and
// both the stream and the reader are disposed before returning, even when
// reading throws.  Exceptions raised while opening the file propagate to the
// caller.
public static string GetSnippetFromFile (string[] query_terms, string filename)
{
	using (FileStream stream = new FileStream (filename, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
	using (StreamReader reader = new StreamReader (stream)) {
		// GetSnippet returns a fully built string, so nothing still needs
		// the reader once it has returned.
		return GetSnippet (query_terms, reader);
	}
}
// Builds a snippet for `query_terms` from the reader that
// TextCache.UserCache.GetReader supplies for `filename`.  Returns null when
// no reader is available.
//
// NOTE(review): assumes GetReader may return null for an uncached entry and
// that the returned reader is owned by the caller, so closing it here is
// safe -- confirm against TextCache.
public static string GetSnippetFromTextCache (string[] query_terms, string filename)
{
	TextReader reader = TextCache.UserCache.GetReader (filename);
	if (reader == null)
		return null;

	try {
		return GetSnippet (query_terms, reader);
	} finally {
		reader.Close ();
	}
}