4 // Copyright (C) 2004, 2005 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
29 using System
.Collections
;
32 namespace Beagle
.Filters
{
34 public abstract class FilterSource
: Beagle
.Daemon
.Filter
{
// Comment/string syntax family of the source language being filtered.
// The member list is not visible in this excerpt; values referenced elsewhere
// in the class include None, C_Style, C_Sharp_Style, Pascal_Style,
// Python_Style, Shell_Style, Fortran_Style, Lisp_Style and Matlab_Style.
36 protected enum LangType
{
// Language family used by ExtractTokens to decide which characters start
// comments and strings. The constructor resets it to LangType.None.
48 protected LangType SrcLangType
;
// Table that ExtractTokens queries with Contains() for each completed token.
// NOTE(review): presumably filled with language keywords by derived filters — confirm.
49 protected Hashtable KeyWordsHash
;
// Scanner state: what kind of construct the current position is inside.
// The member list is not visible in this excerpt; values referenced elsewhere
// in the class are None, BlockComment, SingleLineComment and StringConstant.
51 private enum LineType
{
// Delimiter that opened the string constant currently being scanned
// (a single quote character, or a three-character Python triple quote).
// ExtractTokens compares against it to recognise the matching closing delimiter.
59 string StrConstIdentifier
;
// Puts the scanner into its initial state: not inside any comment or string,
// no language family selected, a one-space placeholder as the string delimiter,
// an empty keyword table and an empty token buffer.
62 public FilterSource ()
64 // Start outside of any comment or string constant.
65 SrcLineType
= LineType
.None
;
// No language family yet.
// NOTE(review): presumably derived filters assign the real value — confirm.
66 SrcLangType
= LangType
.None
;
// One-character placeholder so StrConstIdentifier [0] can be read before
// any string delimiter has been seen.
67 StrConstIdentifier
= " ";
69 KeyWordsHash
= new Hashtable ();
// NOTE(review): OriginalIsText is not declared in this excerpt; presumably a
// member inherited from Beagle.Daemon.Filter — confirm its meaning there.
72 OriginalIsText
= true;
// Buffer in which ExtractTokens accumulates the current token/comment/string text.
73 token
= new StringBuilder ();
76 // Validate the character and append it to the token
77 // that will be added to the text pool of the filter.
78 // Returns: False, if the token is not yet complete enough to be added to the pool.
79 // True, if it is a valid word that can be added to the pool.
// Parameters:
//   ch     - the character being examined.
//   index  - position of ch in the string being scanned.
//   length - length of the string being scanned.
// NOTE(review): most of the body is not visible in this excerpt; only the two
// conditions below can be confirmed from here.
80 private bool AppendToToken (char ch
, int index
, int length
)
// Word characters are letters, digits and the underscore.
85 if (Char
.IsLetter (ch
) || Char
.IsDigit (ch
) || ch
== '_') {
// True while ch is not the last character of the string.
87 if ((index
+ 1) < length
)
96 // Tokenize the passed string and add the relevant
97 // tokens for indexing.
// The string is scanned one character at a time. SrcLineType records whether
// the scanner is currently inside a block comment, a single-line comment or a
// string constant; accumulated text is handed to AppendText.
// NOTE(review): brace-only lines, the switch case labels and some statements
// are missing from this excerpt, so block boundaries below are inferred from
// the visible conditions only.
99 // FIXME: Perl has embedded "POD" (documentation) style, which needs a little processing.
101 protected void ExtractTokens (string str
)
// Consecutive characters that may together form a comment delimiter.
105 string splCharSeq
= "";
107 for (index
= 0; index
< str
.Length
; index
++) {
// Comment-delimiter characters: '{' or '/' (among the visible tests) for
// Pascal-style sources, '/' or '*' for C-style and C#-style sources.
108 if (((str
[index
] == '{'
113 || str
[index
] == '/')
114 && SrcLangType
== LangType
.Pascal_Style
)
115 || ((str
[index
] == '/'
116 || str
[index
] == '*')
117 && (SrcLangType
== LangType
.C_Style
118 || SrcLangType
== LangType
.C_Sharp_Style
))) {
120 splCharSeq
+= str
[index
];
// Dispatch on the delimiter sequence collected so far (case labels are not
// visible in this excerpt).
122 switch (splCharSeq
) {
// Enter a block comment only when not already inside a comment or string.
125 if (SrcLineType
== LineType
.None
) {
126 SrcLineType
= LineType
.BlockComment
;
129 token
.Append (splCharSeq
);
// Leave a block comment and hand the collected text to AppendText.
134 if (SrcLineType
== LineType
.BlockComment
) {
135 SrcLineType
= LineType
.None
;
137 AppendText (token
.ToString());
// Inside some other construct the delimiter is kept as ordinary text.
139 } else if (SrcLineType
!= LineType
.None
)
140 token
.Append (splCharSeq
);
// Enter a block comment only when not already inside a comment or string.
145 if (SrcLineType
== LineType
.None
) {
146 SrcLineType
= LineType
.BlockComment
;
149 token
.Append (splCharSeq
);
// Leave a block comment and hand the collected text to AppendText.
154 if (SrcLineType
== LineType
.BlockComment
) {
155 SrcLineType
= LineType
.None
;
157 AppendText (token
.ToString());
159 } else if (SrcLineType
!= LineType
.None
)
160 token
.Append (splCharSeq
);
// Enter a single-line comment only when not already inside a comment or string.
164 if (SrcLineType
== LineType
.None
) {
165 SrcLineType
= LineType
.SingleLineComment
;
168 token
.Append (splCharSeq
);
// Enter a block comment only when not already inside a comment or string.
173 if (SrcLineType
== LineType
.None
) {
174 SrcLineType
= LineType
.BlockComment
;
177 token
.Append (splCharSeq
);
// Leave a block comment and hand the collected text to AppendText.
182 if (SrcLineType
== LineType
.BlockComment
) {
183 SrcLineType
= LineType
.None
;
185 AppendText (token
.ToString());
187 } else if (SrcLineType
!= LineType
.None
)
188 token
.Append (splCharSeq
);
// Single-character comment starters: '#' (Python/Shell), '!' (Fortran),
// ';' (Lisp) and '%' (Matlab).
192 } else if ((str
[index
] == '#' && (SrcLangType
== LangType
.Python_Style
||
193 SrcLangType
== LangType
.Shell_Style
)) ||
194 (str
[index
] == '!' && SrcLangType
== LangType
.Fortran_Style
) ||
195 (str
[index
] == ';' && SrcLangType
== LangType
.Lisp_Style
) ||
196 (str
[index
] == '%' && SrcLangType
== LangType
.Matlab_Style
)) {
197 if (SrcLineType
== LineType
.None
) {
198 SrcLineType
= LineType
.SingleLineComment
;
201 token
.Append (str
[index
]);
203 // FIXME: we evaluate *ALL* escape
204 // sequences on strings. Do we really need to
205 // do this for comments??? And also "\n", "\t" etc????
// Backslash inside a string constant: append the escaped character that
// follows it, provided one exists.
206 else if (SrcLineType
== LineType
.StringConstant
&&
207 str
[index
] == '\\') {
208 if ((index
+ 1) <= (str
.Length
-1))
209 token
.Append (str
[index
+ 1]);
212 // Well the typical python ''' or """ stuff
// Matches three identical quote characters starting at index, where that
// quote character also equals the first character of StrConstIdentifier.
213 else if ((SrcLangType
== LangType
.Python_Style
) &&
214 ((index
+ 2) <= (str
.Length
-1)) &&
215 (str
[index
] == '\"' || str
[index
] == '\'') &&
216 (str
[index
] == str
[index
+ 1] && str
[index
] == str
[index
+ 2]) &&
217 StrConstIdentifier
[0] == str
[index
]) {
// Already inside a string constant: close it and emit the collected text.
219 if (SrcLineType
== LineType
.StringConstant
) {
220 SrcLineType
= LineType
.None
;
222 AppendText (token
.ToString());
// Remember the three-character delimiter and enter a string constant.
225 StrConstIdentifier
= str
.Substring (index
, 3);
226 SrcLineType
= LineType
.StringConstant
;
233 // Lisp: ignore the single quote character; do another iteration
234 else if (SrcLangType
== LangType
.Lisp_Style
&& str
[index
] == '\'') {
// Ordinary string delimiters: double quote, single quote, and the backquote
// for shell-style sources.
237 else if (str
[index
] == '\"' || str
[index
] == '\'' ||
238 (str
[index
] == '`' && SrcLangType
== LangType
.Shell_Style
)) {
// Closing delimiter: must match the one-character delimiter that opened
// the string (a three-character Python delimiter never matches here).
240 if (SrcLineType
== LineType
.StringConstant
&&
241 StrConstIdentifier
.Length
== 1 &&
242 StrConstIdentifier
[0] == str
[index
]) {
243 SrcLineType
= LineType
.None
;
245 AppendText (token
.ToString());
// Opening delimiter: remember it and enter a string constant.
248 } else if (SrcLineType
== LineType
.None
) {
249 StrConstIdentifier
= str
.Substring (index
, 1);
250 SrcLineType
= LineType
.StringConstant
;
253 token
.Append (str
[index
]);
// Any other character inside a comment or string: keep it, preceded by
// whatever delimiter characters were pending in splCharSeq.
256 } else if (SrcLineType
!= LineType
.None
) {
257 token
.Append (splCharSeq
);
258 token
.Append (str
[index
]);
// Plain code: build identifier tokens through AppendToToken.
261 } else if (SrcLineType
== LineType
.None
) {
262 if (AppendToToken (str
[index
], index
, str
.Length
)) {
263 if (SrcLangType
== LangType
.Lisp_Style
) {
265 // Lisp identifiers: letters, digits, and:
266 // ! $ % & * + - . / : < = > ? @ ^ _ ~
267 switch (str
[index
]) {
268 case '!': case '$': case '%': case '&':
269 case '*': case '+': case '-': case '.':
270 case '/': case ':': case '<': case '=':
271 case '>': case '?': case '@': case '^':
273 token
.Append (str
[index
]);
277 //token = token.Replace(" ", "");
278 if (token
.Length
> 0) {
// Fortran-style tokens are lower-cased before the keyword lookup; all
// other languages are looked up as written.
280 if (SrcLangType
== LangType
.Fortran_Style
)
281 tok
= token
.ToString().ToLower();
283 tok
= token
.ToString ();
// Tokens found in KeyWordsHash, and tokens starting with a digit, fail
// these tests (what happens to the rest is not visible in this excerpt).
284 if (!KeyWordsHash
.Contains (tok
)) {
285 if (!Char
.IsDigit (token
[0])) {
// NOTE(review): presumably this runs after the per-character loop, i.e. at
// the end of the input string — confirm against the full file.
// Still inside a comment or string: flush what has been collected so far.
297 if (SrcLineType
!= LineType
.None
) {
298 bool trailing_backslash
= false;
300 token
.Append (splCharSeq
);
// Strip a trailing backslash and remember that it was there.
302 if (token
.Length
> 0 && token
[token
.Length
- 1] == '\\') {
303 token
= token
.Remove (token
.Length
-1, 1);
304 trailing_backslash
= true;
308 AppendText (token
.ToString());
310 // If a single-line comment ends with a "\",
311 // the lines that follow it are also considered part of the comment,
312 // until a line without a trailing "\" is found.
313 // C# and Lisp don't follow this syntax.
314 if (SrcLineType
== LineType
.SingleLineComment
)
315 if (!trailing_backslash
316 || SrcLangType
== LangType
.C_Sharp_Style
317 || SrcLangType
== LangType
.Lisp_Style
)
318 SrcLineType
= LineType
.None
;
// Outside any comment or string: emit a pending token unless it starts
// with a digit.
319 } else if (token
.Length
> 0
320 && !Char
.IsDigit (token
[0])) {
321 /* we don't want any numeric const */
323 AppendText (token
.ToString());