4 // Copyright (C) 2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System
.Collections
;
33 // FIXME: Hack. Use Lucene.Net highlighting.
35 namespace Beagle
.Daemon
{
37 public class SnippetFu
{
// Parameterless callback that returns a string. GetSnippet invokes it
// repeatedly and stops when it returns null.
39 public delegate string StringSource ();
// Reports whether `c` delimits tokens: true for whitespace, Unicode
// separator characters and punctuation, false for everything else.
static private bool IsTokenSeparator (char c)
{
	if (Char.IsWhiteSpace (c))
		return true;

	if (Char.IsSeparator (c))
		return true;

	return Char.IsPunctuation (c);
}
// Bounds for the context counters in HighlightTerms: its backward and
// forward scans around a matched token run while count <= these values.
48 const int max_prior_words
= 6;
49 const int max_following_words
= 6;
// Walks `text` token by token, stems each token and compares it against
// every entry of `stemmed_terms`; for each hit it builds an excerpt made of
// surrounding context plus the matched token and appends it to `matches`.
//
// stemmed_terms: list whose elements are cast to string.
// text:          the string being scanned.
// matches:       receives the excerpt strings via Add.
// Returns an int; presumably the accumulated `length` of the excerpts
// added -- the return statement is not visible here, TODO confirm.
//
// NOTE(review): this listing looks like a lossy extraction. Brace-only
// lines, several statements (the declarations of next_pos, start_pos,
// hl_offset and length, the loop bodies, the return) and some String.Concat
// arguments are not visible. The comments below describe only what is shown.
51 static private int HighlightTerms (ArrayList stemmed_terms
, string text
, ref ArrayList matches
)
// pos: scan cursor. prev_stop_pos / prev_end_pos: end of the previous
// excerpt's trailing context and end of its matched token (both are
// assigned at the bottom of the match handling).
53 int pos
= 0, prev_stop_pos
= 0, prev_end_pos
= 0;
// Excerpt still being assembled; it is only added to `matches` once a
// later, non-overlapping match (or the end of the text) is reached.
54 string prev_match
= "";
57 while (pos
< text
.Length
) {
59 // Find the beginning of the next token.
60 if (IsTokenSeparator (text
[pos
])) {
65 // Find the end of the next token
67 while (next_pos
< text
.Length
&& !IsTokenSeparator (text
[next_pos
]))
// Lazily computed stem of the current token (see the cache note below).
70 string stemmed_token
= null;
73 // Iterate through the stemmed terms and match the token
74 for (int i
= 0; i
< stemmed_terms
.Count
; i
++) {
76 // If this term is longer than the token in question, give up.
77 if (next_pos
- pos
< ((string)stemmed_terms
[i
]).Length
)
80 // We cache the token, so as to avoid stemming it more than once
81 // when considering multiple terms.
82 if (stemmed_token
== null) {
83 string token
= text
.Substring (pos
, next_pos
- pos
);
84 stemmed_token
= LuceneCommon
.Stem (token
);
// Case-insensitive comparison (ignoreCase = true) of the term against
// the stemmed token; a non-zero result means no match.
87 if (String
.Compare ((string) stemmed_terms
[i
], stemmed_token
, true) != 0)
// stop_pos starts at the end of the matched token and is pushed forward
// by the second loop below to take in trailing context.
93 int stop_pos
= next_pos
;
95 // FIXME: This is a hack, I should be shot.
// Moves start_pos backwards through the text, testing for ' ' at each
// step; bounded by max_prior_words and the start of the text. The body
// of the test is not visible (presumably it increments count).
96 for (int count
= 0; count
<= max_prior_words
&& start_pos
> 0; start_pos
--) {
97 if ((text
[start_pos
] == ' '))
// Same scan in the forward direction for the trailing context, bounded by
// max_following_words and the end of the text.
104 for (int count
= 0; count
<= max_following_words
&& stop_pos
< text
.Length
; stop_pos
++) {
105 if (text
[stop_pos
] == ' ')
// Adjustment applied when the forward scan stopped before the end of the
// text; the adjusting statement itself is not visible here.
109 if (stop_pos
!= text
.Length
)
112 bool append_to_prev_match
= false;
// Overlap case: this excerpt's leading context would begin before the
// previous excerpt's trailing context ended. Start instead right after the
// previous matched token and cut the previous excerpt's trailing context
// (prev_stop_pos - prev_end_pos characters) so the text is not duplicated.
114 if (prev_stop_pos
> start_pos
) {
115 start_pos
= prev_end_pos
;
116 prev_match
= prev_match
.Substring (0, prev_match
.Length
- (prev_stop_pos
- prev_end_pos
));
117 append_to_prev_match
= true;
// Excerpt = leading context + an entry of `colors` chosen by
// (i - hl_offset) modulo colors.Length + matched token + trailing context.
// NOTE(review): further Concat arguments between these pieces are not
// visible in this listing.
120 string new_match
= String
.Concat (text
.Substring (start_pos
, pos
- start_pos
),
122 colors
[(i
- hl_offset
) % colors
.Length
],
124 text
.Substring (pos
, next_pos
-pos
),
126 text
.Substring (next_pos
, stop_pos
-next_pos
));
// Overlapping excerpts are merged into the pending one. The statements
// after this branch flush the pending excerpt to `matches` and make the new
// one pending -- presumably as the else branch; the braces are not visible.
128 if (append_to_prev_match
) {
129 prev_match
+= new_match
;
131 if (prev_match
!= "") {
132 matches
.Add (prev_match
);
133 length
+= prev_match
.Length
;
135 prev_match
= new_match
;
// Remember where this excerpt's context and matched token end so the next
// match can detect an overlap.
138 prev_stop_pos
= stop_pos
;
139 prev_end_pos
= next_pos
;
147 // Add trailing match
148 if (prev_match
!= "") {
149 matches
.Add (prev_match
);
150 length
+= prev_match
.Length
;
// Strings spliced into each highlighted excerpt by HighlightTerms, which
// picks an entry by index modulo the array length.
156 static string[] colors
= new string [] {"red", "blue", "green", "orange", "purple", "brown"}
;
// Threshold used by GetSnippet, compared against both the accumulated match
// length and the length of the snippet being assembled.
158 const int soft_snippet_limit
= 400;
// Builds a snippet from the strings produced by `string_source`: each string
// is passed to HighlightTerms with the non-stop-word query terms, and the
// collected matches are then joined with " ... ".
//
// query_terms:   terms to look for; dereferenced without a visible null
//                check.
// string_source: called repeatedly until it returns null.
//
// NOTE(review): this listing looks like a lossy extraction. The action taken
// for a null string_source, the declarations of `str` and `snippet`, the
// action taken when the limit is reached and the return statement are not
// visible. The comments below describe only what is shown.
160 static public string GetSnippet (string[] query_terms
, StringSource string_source
)
162 // FIXME: If the query doesn't have search text (or is null), we should
163 // generate a 'summary snippet'.
165 if (string_source
== null)
// Excerpts collected by HighlightTerms, plus the running total of the
// lengths it reports.
168 ArrayList matches
= new ArrayList ();
169 int found_snippet_length
= 0;
171 // remove stop words from query_terms
172 ArrayList query_terms_list
= new ArrayList (query_terms
.Length
);
173 foreach (string term
in query_terms
) {
174 if (LuceneCommon
.IsStopWord (term
))
176 query_terms_list
.Add (term
);
// Pull strings from the source until it yields null, accumulating the value
// HighlightTerms returns for each one.
180 while ( (str
= string_source ()) != null) {
181 found_snippet_length
+= HighlightTerms (query_terms_list
, str
, ref matches
);
// Limit check; the statement it guards is not visible (presumably it
// leaves the loop).
182 if (found_snippet_length
>= soft_snippet_limit
)
// Append matches, each followed by " ... ", until they run out or the
// snippet has reached soft_snippet_limit characters. The length is tested
// before each append, so the result can end up longer than the limit.
188 for (int i
= 0; i
< matches
.Count
&& snippet
.Length
< soft_snippet_limit
; i
++)
189 snippet
+= String
.Concat((string)matches
[i
], " ... ");
// Convenience overload: wraps the reader's ReadLine method in a StringSource
// delegate and hands it to the StringSource-based GetSnippet, so the text is
// consumed one line at a time.
static public string GetSnippet (string[] query_terms, TextReader reader)
{
	StringSource line_source = new StringSource (reader.ReadLine);

	return GetSnippet (query_terms, line_source);
}
// Opens `filename` for reading (FileShare.ReadWrite, so other handles may
// keep it open for reading or writing) and returns the snippet that
// GetSnippet builds from its lines.
//
// query_terms: terms to highlight, passed through to GetSnippet.
// filename:    path of the file to read.
//
// The stream and reader are disposed before returning, including when
// GetSnippet throws; previously the file handle was left open until
// finalization. GetSnippet returns a string it has already assembled, so
// nothing reads from the stream after this method returns.
static public string GetSnippetFromFile (string[] query_terms, string filename)
{
	using (FileStream stream = new FileStream (filename, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
	using (StreamReader reader = new StreamReader (stream)) {
		return GetSnippet (query_terms, reader);
	}
}
// Obtains a reader for `filename` from TextCache.UserCache and returns the
// snippet that GetSnippet builds from its lines.
//
// query_terms: terms to highlight, passed through to GetSnippet.
// filename:    key handed to TextCache.UserCache.GetReader.
static public string GetSnippetFromTextCache (string[] query_terms, string filename)
{
	// Dispose the reader once the snippet has been built; it was previously
	// never closed. A using statement accepts a null resource, so if
	// GetReader returns null the call below behaves as it did before.
	// NOTE(review): assumes GetReader hands back a reader owned by the
	// caller rather than a shared instance -- confirm against TextCache.
	using (TextReader reader = TextCache.UserCache.GetReader (filename)) {
		return GetSnippet (query_terms, reader);
	}
}