/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 using Analyzer
= Lucene
.Net
.Analysis
.Analyzer
;
18 using LowerCaseFilter
= Lucene
.Net
.Analysis
.LowerCaseFilter
;
19 using StopFilter
= Lucene
.Net
.Analysis
.StopFilter
;
20 using TokenStream
= Lucene
.Net
.Analysis
.TokenStream
;
21 using StandardFilter
= Lucene
.Net
.Analysis
.Standard
.StandardFilter
;
22 using StandardTokenizer
= Lucene
.Net
.Analysis
.Standard
.StandardTokenizer
;
23 namespace Lucene
.Net
.Analysis
.DE
/// <summary> Analyzer for the German language. Supports an external list of stopwords (words that
/// will not be indexed at all) and an external list of exclusions (words that will
/// not be stemmed, but indexed).
/// A default set of stopwords is used unless an alternative list is specified; the
/// exclusion list is empty by default.
/// </summary>
/// <author> Gerhard Schwarz </author>
/// <version> $Id: GermanAnalyzer.cs,v 1.2 2005/01/17 19:54:27 joeshaw Exp $ </version>
37 public class GermanAnalyzer
: Analyzer
/// <summary>
/// List of typical German stopwords.
/// </summary>
/// <remarks>
/// Declared <c>static readonly</c> so the array is allocated once for the type
/// rather than once per analyzer instance, and so the reference cannot be
/// reassigned. The entries "mit", "sein" and "wird" each appear twice in the list.
/// </remarks>
private static readonly System.String[] GERMAN_STOP_WORDS = new System.String[]
{
    "einer", "eine", "eines", "einem", "einen", "der", "die",
    "das", "dass", "daß", "du", "er", "sie", "es", "was", "wer",
    "wie", "wir", "und", "oder", "ohne", "mit", "am", "im", "in",
    "aus", "auf", "ist", "sein", "war", "wird", "ihr", "ihre",
    "ihres", "als", "für", "von", "mit", "dich", "dir", "mich",
    "mir", "mein", "sein", "kein", "durch", "wegen", "wird"
};
/// <summary>
/// Stop words that are handed to the StopFilter.
/// </summary>
private System.Collections.Hashtable stopSet = new System.Collections.Hashtable();
/// <summary>
/// Words that are still indexed but must not be stemmed.
/// </summary>
private System.Collections.Hashtable exclusionSet = new System.Collections.Hashtable();
/// <summary>
/// Builds an analyzer whose stop set is made from the built-in
/// German stop word list.
/// </summary>
public GermanAnalyzer()
{
    this.stopSet = StopFilter.MakeStopSet(GERMAN_STOP_WORDS);
}
/// <summary>
/// Builds an analyzer whose stop set is made from the given array of words.
/// </summary>
/// <param name="stopwords">Words passed to StopFilter.MakeStopSet to form the stop set.</param>
public GermanAnalyzer(System.String[] stopwords)
{
    this.stopSet = StopFilter.MakeStopSet(stopwords);
}
/// <summary>
/// Builds an analyzer whose stop set is a copy of the given table.
/// </summary>
/// <param name="stopwords">
/// Table of stop words. Its entries are copied, so adding or removing entries
/// in the caller's table afterwards does not change this analyzer's stop set.
/// </param>
/// <exception cref="System.ArgumentNullException">
/// Thrown by the Hashtable copy constructor when <paramref name="stopwords"/> is null.
/// </exception>
public GermanAnalyzer(System.Collections.Hashtable stopwords)
{
    // One copy is enough to detach from the caller's table; the previous
    // nested copy allocated an intermediate table that was thrown away.
    stopSet = new System.Collections.Hashtable(stopwords);
}
/// <summary>
/// Builds an analyzer whose stop set is the word set that
/// WordlistLoader.GetWordSet returns for the given file.
/// </summary>
/// <param name="stopwords">File handed to WordlistLoader.GetWordSet.</param>
public GermanAnalyzer(System.IO.FileInfo stopwords)
{
    this.stopSet = WordlistLoader.GetWordSet(stopwords);
}
/// <summary>
/// Replaces the stem exclusion set with one made from an array of strings.
/// </summary>
/// <param name="exclusionlist">Words passed to StopFilter.MakeStopSet to form the exclusion set.</param>
public virtual void SetStemExclusionTable(System.String[] exclusionlist)
{
    this.exclusionSet = StopFilter.MakeStopSet(exclusionlist);
}
/// <summary>
/// Replaces the stem exclusion set with a copy of the given Hashtable.
/// </summary>
/// <param name="exclusionlist">
/// Table of words to exclude from stemming. Its entries are copied, so adding or
/// removing entries in the caller's table afterwards does not change the exclusion set.
/// </param>
/// <exception cref="System.ArgumentNullException">
/// Thrown by the Hashtable copy constructor when <paramref name="exclusionlist"/> is null.
/// </exception>
public virtual void SetStemExclusionTable(System.Collections.Hashtable exclusionlist)
{
    // One copy is enough to detach from the caller's table; the previous
    // nested copy allocated an intermediate table that was thrown away.
    exclusionSet = new System.Collections.Hashtable(exclusionlist);
}
/// <summary>
/// Replaces the stem exclusion set with the word set that
/// WordlistLoader.GetWordSet returns for the given file.
/// </summary>
/// <param name="exclusionlist">File handed to WordlistLoader.GetWordSet.</param>
public virtual void SetStemExclusionTable(System.IO.FileInfo exclusionlist)
{
    this.exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
}
/// <summary> Creates a TokenStream which tokenizes all the text in the provided Reader.
/// </summary>
/// <returns> A TokenStream built from a StandardTokenizer filtered with
/// StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
/// </returns>
104 public override TokenStream
TokenStream(System
.String fieldName
, System
.IO
.TextReader reader
)
106 TokenStream result
= new StandardTokenizer(reader
);
107 result
= new StandardFilter(result
);
108 result
= new LowerCaseFilter(result
);
109 result
= new StopFilter(result
, stopSet
);
110 result
= new GermanStemFilter(result
, exclusionSet
);