beagled/Lucene.Net/Analysis/DE/GermanAnalyzer.cs

   1 /*
   2  * Copyright 2004 The Apache Software Foundation
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  * http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 using System;
  17 using Analyzer = Lucene.Net.Analysis.Analyzer;
  18 using LowerCaseFilter = Lucene.Net.Analysis.LowerCaseFilter;
  19 using StopFilter = Lucene.Net.Analysis.StopFilter;
  20 using TokenStream = Lucene.Net.Analysis.TokenStream;
  21 using StandardFilter = Lucene.Net.Analysis.Standard.StandardFilter;
  22 using StandardTokenizer = Lucene.Net.Analysis.Standard.StandardTokenizer;
  23 namespace Lucene.Net.Analysis.DE
  24 {
  25
  26         /// <summary> Analyzer for German language. Supports an external list of stopwords (words that
  27         /// will not be indexed at all) and an external list of exclusions (word that will
  28         /// not be stemmed, but indexed).
  29         /// A default set of stopwords is used unless an alternative list is specified, the
  30         /// exclusion list is empty by default.
  31         ///
  32         /// </summary>
  33         /// <author>  Gerhard Schwarz
  34         /// </author>
  35         /// <version>  $Id: GermanAnalyzer.cs,v 1.2 2005/01/17 19:54:27 joeshaw Exp $
  36         /// </version>
  37         public class GermanAnalyzer : Analyzer
  38         {
  39                 /// <summary> List of typical german stopwords.</summary>
  40                 private System.String[] GERMAN_STOP_WORDS = new System.String[]
  41             {
  42                 "einer", "eine", "eines", "einem", "einen", "der", "die",
  43                 "das", "dass", "daß", "du", "er", "sie", "es", "was", "wer",
  44                 "wie", "wir", "und", "oder", "ohne", "mit", "am", "im", "in",
  45                 "aus", "auf", "ist", "sein", "war", "wird", "ihr", "ihre",
  46                 "ihres", "als", "für", "von", "mit", "dich", "dir", "mich",
  47                 "mir", "mein", "sein", "kein", "durch", "wegen", "wird"
  48             };
  49
  50                 /// <summary> Contains the stopwords used with the StopFilter.</summary>
  51                 private System.Collections.Hashtable stopSet = new System.Collections.Hashtable();
  52
  53                 /// <summary> Contains words that should be indexed but not stemmed.</summary>
  54                 private System.Collections.Hashtable exclusionSet = new System.Collections.Hashtable();
  55
  56                 /// <summary> Builds an analyzer.</summary>
  57                 public GermanAnalyzer()
  58                 {
  59                         stopSet = StopFilter.MakeStopSet(GERMAN_STOP_WORDS);
  60                 }
  61
  62                 /// <summary> Builds an analyzer with the given stop words.</summary>
  63                 public GermanAnalyzer(System.String[] stopwords)
  64                 {
  65                         stopSet = StopFilter.MakeStopSet(stopwords);
  66                 }
  67
  68                 /// <summary> Builds an analyzer with the given stop words.</summary>
  69                 public GermanAnalyzer(System.Collections.Hashtable stopwords)
  70                 {
  71                         stopSet = new System.Collections.Hashtable(new System.Collections.Hashtable(stopwords));
  72                 }
  73
  74                 /// <summary> Builds an analyzer with the given stop words.</summary>
  75                 public GermanAnalyzer(System.IO.FileInfo stopwords)
  76                 {
  77                         stopSet = WordlistLoader.GetWordSet(stopwords);
  78                 }
  79
  80                 /// <summary> Builds an exclusionlist from an array of Strings.</summary>
  81                 public virtual void  SetStemExclusionTable(System.String[] exclusionlist)
  82                 {
  83                         exclusionSet = StopFilter.MakeStopSet(exclusionlist);
  84                 }
  85
  86                 /// <summary> Builds an exclusionlist from a Hashtable.</summary>
  87                 public virtual void  SetStemExclusionTable(System.Collections.Hashtable exclusionlist)
  88                 {
  89                         exclusionSet = new System.Collections.Hashtable(new System.Collections.Hashtable(exclusionlist));
  90                 }
  91
  92                 /// <summary> Builds an exclusionlist from the words contained in the given file.</summary>
  93                 public virtual void  SetStemExclusionTable(System.IO.FileInfo exclusionlist)
  94                 {
  95                         exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
  96                 }
  97
  98                 /// <summary> Creates a TokenStream which tokenizes all the text in the provided Reader.
  99                 ///
 100                 /// </summary>
 101                 /// <returns> A TokenStream build from a StandardTokenizer filtered with
 102                 /// StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
 103                 /// </returns>
 104                 public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
 105                 {
 106                         TokenStream result = new StandardTokenizer(reader);
 107                         result = new StandardFilter(result);
 108                         result = new LowerCaseFilter(result);
 109                         result = new StopFilter(result, stopSet);
 110                         result = new GermanStemFilter(result, exclusionSet);
 111                         return result;
 112                 }
 113         }
 114 }