Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / Lucene.Net / Analysis / DE / GermanAnalyzer.cs
blobf5cadde5c1f4fd4b9d58b151c4f6907da353a4d0
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 using System;
17 using Analyzer = Lucene.Net.Analysis.Analyzer;
18 using LowerCaseFilter = Lucene.Net.Analysis.LowerCaseFilter;
19 using StopFilter = Lucene.Net.Analysis.StopFilter;
20 using TokenStream = Lucene.Net.Analysis.TokenStream;
21 using StandardFilter = Lucene.Net.Analysis.Standard.StandardFilter;
22 using StandardTokenizer = Lucene.Net.Analysis.Standard.StandardTokenizer;
namespace Lucene.Net.Analysis.DE
{
    /// <summary> Analyzer for German language. Supports an external list of stopwords (words that
    /// will not be indexed at all) and an external list of exclusions (words that will
    /// not be stemmed, but indexed).
    /// A default set of stopwords is used unless an alternative list is specified; the
    /// exclusion list is empty by default.
    /// </summary>
    /// <author> Gerhard Schwarz
    /// </author>
    /// <version> $Id: GermanAnalyzer.cs,v 1.2 2005/01/17 19:54:27 joeshaw Exp $
    /// </version>
    public class GermanAnalyzer : Analyzer
    {
        /// <summary> List of typical German stopwords, used by the parameterless constructor.</summary>
        /// <remarks>
        /// Shared by all instances (static) because it is never modified.
        /// Every word appears exactly once: the earlier list repeated "mit", "sein" and
        /// "wird", which is redundant and would raise an ArgumentException if
        /// StopFilter.MakeStopSet inserts with Hashtable.Add (NOTE(review): MakeStopSet is
        /// defined elsewhere - confirm how it inserts).
        /// </remarks>
        private static readonly System.String[] GERMAN_STOP_WORDS = new System.String[]
        {
            "einer", "eine", "eines", "einem", "einen", "der", "die",
            "das", "dass", "daß", "du", "er", "sie", "es", "was", "wer",
            "wie", "wir", "und", "oder", "ohne", "mit", "am", "im", "in",
            "aus", "auf", "ist", "sein", "war", "wird", "ihr", "ihre",
            "ihres", "als", "für", "von", "dich", "dir", "mich",
            "mir", "mein", "kein", "durch", "wegen"
        };

        /// <summary> Contains the stopwords used with the StopFilter.</summary>
        /// <remarks> Assigned exactly once, by whichever constructor is used.</remarks>
        private readonly System.Collections.Hashtable stopSet;

        /// <summary> Contains words that should be indexed but not stemmed.</summary>
        /// <remarks> Empty until one of the SetStemExclusionTable overloads is called.</remarks>
        private System.Collections.Hashtable exclusionSet = new System.Collections.Hashtable();

        /// <summary> Builds an analyzer that uses the default German stopword list.</summary>
        public GermanAnalyzer()
        {
            stopSet = StopFilter.MakeStopSet(GERMAN_STOP_WORDS);
        }

        /// <summary> Builds an analyzer with the given stop words.</summary>
        /// <param name="stopwords"> Words to drop from the token stream; passed to StopFilter.MakeStopSet.</param>
        public GermanAnalyzer(System.String[] stopwords)
        {
            stopSet = StopFilter.MakeStopSet(stopwords);
        }

        /// <summary> Builds an analyzer with the given stop words.</summary>
        /// <param name="stopwords"> Table of stop words. It is copied, so later changes
        /// made by the caller do not affect this analyzer.</param>
        public GermanAnalyzer(System.Collections.Hashtable stopwords)
        {
            // A single defensive copy is sufficient; copying the copy only wasted an allocation.
            stopSet = new System.Collections.Hashtable(stopwords);
        }

        /// <summary> Builds an analyzer with the stop words obtained from the given file.</summary>
        /// <param name="stopwords"> File handed to WordlistLoader.GetWordSet to obtain the stop word set.</param>
        public GermanAnalyzer(System.IO.FileInfo stopwords)
        {
            stopSet = WordlistLoader.GetWordSet(stopwords);
        }

        /// <summary> Builds an exclusionlist from an array of Strings.</summary>
        /// <param name="exclusionlist"> Words that must not be stemmed; passed to StopFilter.MakeStopSet.</param>
        public virtual void SetStemExclusionTable(System.String[] exclusionlist)
        {
            exclusionSet = StopFilter.MakeStopSet(exclusionlist);
        }

        /// <summary> Builds an exclusionlist from a Hashtable.</summary>
        /// <param name="exclusionlist"> Table of words that must not be stemmed. It is copied,
        /// so later changes made by the caller do not affect this analyzer.</param>
        public virtual void SetStemExclusionTable(System.Collections.Hashtable exclusionlist)
        {
            // A single defensive copy is sufficient; copying the copy only wasted an allocation.
            exclusionSet = new System.Collections.Hashtable(exclusionlist);
        }

        /// <summary> Builds an exclusionlist from the words contained in the given file.</summary>
        /// <param name="exclusionlist"> File handed to WordlistLoader.GetWordSet to obtain the exclusion set.</param>
        public virtual void SetStemExclusionTable(System.IO.FileInfo exclusionlist)
        {
            exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
        }

        /// <summary> Creates a TokenStream which tokenizes all the text in the provided Reader.
        /// </summary>
        /// <param name="fieldName"> Name of the field being analyzed; not used by this analyzer.</param>
        /// <param name="reader"> Source of the text to tokenize.</param>
        /// <returns> A TokenStream built from a StandardTokenizer filtered with
        /// StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter (applied in that order).
        /// </returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            // Each filter wraps the previous stage, so the order below is the processing order:
            // lower-casing happens before stop word removal, and stemming runs last.
            TokenStream result = new StandardTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, stopSet);
            result = new GermanStemFilter(result, exclusionSet);
            return result;
        }
    }
}