(Back)port some changes from beagle-lucene-1-9-lockfile-branch: allow ext: queries...
[beagle.git] / Filters / FilterSource.cs
blob14af8558a661bcc344d93c4d20f3c7cbd966729a
1 //
2 // FilterSource.cs
3 //
4 // Copyright (C) 2004, 2005 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 using System;
29 using System.Collections;
30 using System.IO;
31 using System.Text;
32 namespace Beagle.Filters {
34 public abstract class FilterSource : Beagle.Daemon.Filter {
// Comment/string syntax family of the source language being filtered.
// ExtractTokens consults SrcLangType to decide which characters open
// or close a comment or a string constant.
protected enum LangType {
    // No language-specific comment markers are recognized.
    None            = 0,
    // "//" line comments and "/* ... */" block comments.
    C_Style         = 1,
    // Same markers as C_Style, but a trailing "\" never continues
    // a single-line comment onto the next line.
    C_Sharp_Style   = 2,
    // "#" line comments and triple-quoted (''' or """) strings.
    Python_Style    = 3,
    // "!" line comments; tokens are lower-cased before the keyword lookup.
    Fortran_Style   = 4,
    // "(* ... *)" and "{ ... }" block comments.
    Pascal_Style    = 5,
    // ";" line comments; identifiers may contain extra punctuation.
    Lisp_Style      = 6,
    // "%" line comments.
    Matlab_Style    = 7,
    // "#" line comments; back-quotes delimit strings as well.
    Shell_Style     = 8
}

// Syntax family of the source being processed. Starts out as
// LangType.None; presumably assigned by the derived filter.
protected LangType SrcLangType;

// Keyword table of the language: tokens found in it are not indexed.
protected Hashtable KeyWordsHash;
// What the scanner is currently inside of. The state is kept in a
// field, so a block comment or string constant that spans several
// lines is still tracked when the next line is tokenized.
private enum LineType {
    // Plain code.
    None              = 0,
    // A comment that normally ends with the current line.
    SingleLineComment = 1,
    // A comment that lasts until its closing marker.
    BlockComment      = 2,
    // A quoted string constant.
    StringConstant    = 3
}

// Current scanner state.
private LineType SrcLineType;

// Delimiter that opened the current string constant: a single quote
// character, or the three-character Python triple quote.
private string StrConstIdentifier;

// Text collected so far for the token, comment or string being built.
private StringBuilder token;
// Creates a filter with an idle scanner, no language selected and an
// empty keyword table.
public FilterSource ()
{
    // Scanner state: outside any comment/string, nothing collected yet.
    token = new StringBuilder ();
    StrConstIdentifier = " ";
    SrcLineType = LineType.None;

    // Language settings, left empty here.
    SrcLangType = LangType.None;
    KeyWordsHash = new Hashtable ();

    // Members not declared in this class (presumably inherited from
    // Beagle.Daemon.Filter); assigned in the same order as before.
    SnippetMode = true;
    OriginalIsText = true;
}
// Feeds one character of the current line into the pending token.
// Letters, digits and '_' are appended; any other character (a blank
// included) acts as a delimiter and is not stored.
// 'index' is the position of 'ch' in the line, 'length' the line length.
// Returns: false while the token may still grow, i.e. a word character
//          was appended and it is not the last character of the line;
//          true once the token is finished (a delimiter was seen, or
//          the appended character is the last one of the line).
private bool AppendToToken (char ch, int index, int length)
{
    bool wordChar = ch == '_' || Char.IsLetterOrDigit (ch);
    if (!wordChar)
        return true;

    token.Append (ch);

    bool lastOnLine = (index + 1) >= length;
    return lastOnLine;
}
96 // Tokenize the passed string and add the relevant
97 // tokens for indexing.
99 // FIXME: Perl has embedded "POD" (documentation) style, which needs a little processing.
101 protected void ExtractTokens (string str)
103 int index;
104 token.Length = 0;
105 string splCharSeq = "";
107 for (index = 0; index < str.Length; index++) {
108 if (((str[index] == '{'
109 || str[index] == '}'
110 || str[index] == '('
111 || str[index] == ')'
112 || str[index] == '*'
113 || str[index] == '/')
114 && SrcLangType == LangType.Pascal_Style)
115 || ((str[index] == '/'
116 || str[index] == '*')
117 && (SrcLangType == LangType.C_Style
118 || SrcLangType == LangType.C_Sharp_Style))) {
120 splCharSeq += str[index];
122 switch (splCharSeq) {
124 case "(*":
125 if (SrcLineType == LineType.None) {
126 SrcLineType = LineType.BlockComment;
127 token.Length = 0;
128 } else
129 token.Append (splCharSeq);
130 splCharSeq = "";
131 break;
133 case "*)":
134 if (SrcLineType == LineType.BlockComment) {
135 SrcLineType = LineType.None;
136 token.Append (" ");
137 AppendText (token.ToString());
138 token.Length = 0;
139 } else if (SrcLineType != LineType.None)
140 token.Append (splCharSeq);
141 splCharSeq = "";
142 break;
144 case "{":
145 if (SrcLineType == LineType.None) {
146 SrcLineType = LineType.BlockComment;
147 token.Length = 0;
148 } else
149 token.Append (splCharSeq);
150 splCharSeq = "";
151 break;
153 case "}":
154 if (SrcLineType == LineType.BlockComment) {
155 SrcLineType = LineType.None;
156 token.Append (" ");
157 AppendText (token.ToString());
158 token.Length = 0;
159 } else if (SrcLineType != LineType.None)
160 token.Append (splCharSeq);
161 splCharSeq = "";
162 break;
163 case "//":
164 if (SrcLineType == LineType.None) {
165 SrcLineType = LineType.SingleLineComment;
166 token.Length = 0;
167 } else
168 token.Append (splCharSeq);
169 splCharSeq = "";
170 break;
172 case "/*":
173 if (SrcLineType == LineType.None) {
174 SrcLineType = LineType.BlockComment;
175 token.Length = 0;
176 } else
177 token.Append (splCharSeq);
178 splCharSeq = "";
179 break;
181 case "*/":
182 if (SrcLineType == LineType.BlockComment) {
183 SrcLineType = LineType.None;
184 token.Append (" ");
185 AppendText (token.ToString());
186 token.Length = 0;
187 } else if (SrcLineType != LineType.None)
188 token.Append (splCharSeq);
189 splCharSeq = "";
190 break;
192 } else if ((str[index] == '#' && (SrcLangType == LangType.Python_Style ||
193 SrcLangType == LangType.Shell_Style)) ||
194 (str[index] == '!' && SrcLangType == LangType.Fortran_Style) ||
195 (str[index] == ';' && SrcLangType == LangType.Lisp_Style) ||
196 (str[index] == '%' && SrcLangType == LangType.Matlab_Style)) {
197 if (SrcLineType == LineType.None) {
198 SrcLineType = LineType.SingleLineComment;
199 token.Length = 0;
200 } else
201 token.Append (str[index]);
203 // FIXME: we evaluate *ALL* escape
204 // sequences on strings. Do we really need to
205 // do this for comments??? And also "\n", "\t" etc????
206 else if (SrcLineType == LineType.StringConstant &&
207 str[index] == '\\') {
208 if ((index + 1) <= (str.Length-1))
209 token.Append (str[index + 1]);
210 index ++;
212 // Well the typical python ''' or """ stuff
213 else if ((SrcLangType == LangType.Python_Style) &&
214 ((index + 2) <= (str.Length-1)) &&
215 (str[index] == '\"' || str[index] == '\'') &&
216 (str[index] == str[index + 1] && str[index] == str[index + 2]) &&
217 StrConstIdentifier[0] == str[index]) {
219 if (SrcLineType == LineType.StringConstant) {
220 SrcLineType = LineType.None;
221 token.Append (" ");
222 AppendText (token.ToString());
223 token.Length = 0;
224 } else {
225 StrConstIdentifier = str.Substring (index, 3);
226 SrcLineType = LineType.StringConstant;
227 token.Length = 0;
228 index += 2;
231 splCharSeq = "";
233 // Lisp: ignore the single quote character; do another iteration
234 else if (SrcLangType == LangType.Lisp_Style && str[index] == '\'') {
235 continue;
237 else if (str[index] == '\"' || str[index] == '\'' ||
238 (str[index] == '`' && SrcLangType == LangType.Shell_Style)) {
240 if (SrcLineType == LineType.StringConstant &&
241 StrConstIdentifier.Length == 1 &&
242 StrConstIdentifier[0] == str[index]) {
243 SrcLineType = LineType.None;
244 token.Append (" ");
245 AppendText (token.ToString());
246 token.Length = 0;
248 } else if (SrcLineType == LineType.None) {
249 StrConstIdentifier = str.Substring (index, 1);
250 SrcLineType = LineType.StringConstant;
251 token.Length = 0;
252 } else
253 token.Append (str[index]);
254 splCharSeq = "";
256 } else if (SrcLineType != LineType.None) {
257 token.Append (splCharSeq);
258 token.Append (str[index]);
259 splCharSeq = "";
261 } else if (SrcLineType == LineType.None) {
262 if (AppendToToken (str[index], index, str.Length)) {
263 if (SrcLangType == LangType.Lisp_Style) {
265 // Lisp identifiers: letters, digits, and:
266 // ! $ % & * + - . / : < = > ? @ ^ _ ~
267 switch (str[index]) {
268 case '!': case '$': case '%': case '&':
269 case '*': case '+': case '-': case '.':
270 case '/': case ':': case '<': case '=':
271 case '>': case '?': case '@': case '^':
272 case '_': case '~':
273 token.Append (str[index]);
274 continue;
277 //token = token.Replace(" ", "");
278 if (token.Length > 0) {
279 string tok;
280 if (SrcLangType == LangType.Fortran_Style)
281 tok = token.ToString().ToLower();
282 else
283 tok = token.ToString ();
284 if (!KeyWordsHash.Contains (tok)) {
285 if (!Char.IsDigit (token[0])) {
286 AppendText (tok);
287 AppendWhiteSpace ();
291 // reset the token
292 token.Length = 0;
294 splCharSeq = "";
297 if (SrcLineType != LineType.None) {
298 bool trailing_backslash = false;
300 token.Append (splCharSeq);
302 if (token.Length > 0 && token [token.Length - 1] == '\\') {
303 token = token.Remove (token.Length-1, 1);
304 trailing_backslash = true;
307 token.Append (" ");
308 AppendText (token.ToString());
310 // If a single-line comment ends with a "\",
311 // the lines that follow it are also treated as part of the comment,
312 // until a line without a trailing "\" is found.
313 // C# and Lisp don't follow this syntax.
314 if (SrcLineType == LineType.SingleLineComment)
315 if (!trailing_backslash
316 || SrcLangType == LangType.C_Sharp_Style
317 || SrcLangType == LangType.Lisp_Style)
318 SrcLineType = LineType.None;
319 } else if (token.Length > 0
320 && !Char.IsDigit (token[0])) {
321 /* we don't want any numeric const */
322 token.Append (" ");
323 AppendText (token.ToString());