Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / beagled / Lucene.Net / Search / FuzzyTermEnum.cs
blob84c291edeca7d6a566697390644e71cda1f852f2
1 /*
2 * Copyright 2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 using System;
17 using IndexReader = Lucene.Net.Index.IndexReader;
18 using Term = Lucene.Net.Index.Term;
19 namespace Lucene.Net.Search
22 /// <summary>Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term.
23 /// <p>Term enumerations are always ordered by Term.compareTo(). Each term in
24 /// the enumeration is greater than all that precede it.
25 /// </summary>
26 public sealed class FuzzyTermEnum:FilteredTermEnum
28 private void InitBlock()
30 for (int i = 0; i < 1; i++)
32 e[i] = new int[1];
35 internal double distance;
36 internal bool endEnum = false;
38 internal Term searchTerm = null;
39 internal System.String field = "";
40 internal System.String text = "";
41 internal int textlen;
42 internal System.String prefix = "";
43 internal int prefixLength = 0;
44 internal float minimumSimilarity;
45 internal double scale_factor;
48 /// <summary> Empty prefix and minSimilarity of 0.5f are used.
49 ///
50 /// </summary>
51 /// <param name="">reader
52 /// </param>
53 /// <param name="">term
54 /// </param>
55 /// <throws> IOException </throws>
56 /// <seealso cref="Term, float, int)">
57 /// </seealso>
58 public FuzzyTermEnum(IndexReader reader, Term term):this(reader, term, FuzzyQuery.defaultMinSimilarity, 0)
62 /// <summary> This is the standard FuzzyTermEnum with an empty prefix.
63 ///
64 /// </summary>
65 /// <param name="">reader
66 /// </param>
67 /// <param name="">term
68 /// </param>
69 /// <param name="">minSimilarity
70 /// </param>
71 /// <throws> IOException </throws>
72 /// <seealso cref="Term, float, int)">
73 /// </seealso>
74 public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity):this(reader, term, minSimilarity, 0)
78 /// <summary> Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
79 /// length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
80 /// <code>minSimilarity</code>.
81 ///
82 /// </summary>
83 /// <param name="reader">Delivers terms.
84 /// </param>
85 /// <param name="term">Pattern term.
86 /// </param>
87 /// <param name="minSimilarity">Minimum required similarity for terms from the reader. Default value is 0.5f.
88 /// </param>
89 /// <param name="prefixLength">Length of required common prefix. Default value is 0.
90 /// </param>
91 /// <throws> IOException </throws>
92 public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength):base()
94 InitBlock();
95 minimumSimilarity = minSimilarity;
96 scale_factor = 1.0f / (1.0f - minimumSimilarity);
97 searchTerm = term;
98 field = searchTerm.Field();
99 text = searchTerm.Text();
100 textlen = text.Length;
101 if (prefixLength > 0 && prefixLength < textlen)
103 this.prefixLength = prefixLength;
104 prefix = text.Substring(0, (prefixLength) - (0));
105 text = text.Substring(prefixLength);
106 textlen = text.Length;
108 SetEnum(reader.Terms(new Term(searchTerm.Field(), prefix)));
111 /// <summary>The termCompare method in FuzzyTermEnum uses Levenshtein distance to
112 /// calculate the distance between the given term and the comparing term.
113 /// </summary>
114 protected internal override bool TermCompare(Term term)
116 System.String termText = term.Text();
117 if ((System.Object) field == (System.Object) term.Field() && termText.StartsWith(prefix))
119 System.String target = termText.Substring(prefixLength);
120 int targetlen = target.Length;
121 int dist = EditDistance(text, target, textlen, targetlen);
122 distance = 1 - ((double) dist / (double) System.Math.Min(textlen, targetlen));
123 return (distance > minimumSimilarity);
125 endEnum = true;
126 return false;
129 public override float Difference()
131 return (float) ((distance - minimumSimilarity) * scale_factor);
134 public override bool EndEnum()
136 return endEnum;
139 /// <summary>***************************
140 /// Compute Levenshtein distance
141 /// ****************************
142 /// </summary>
144 /// <summary>Finds and returns the smallest of three integers </summary>
145 private static int Min(int a, int b, int c)
147 int t = (a < b) ? a : b;
148 return (t < c) ? t : c;
151 /// <summary> This static array saves us from the time required to create a new array
152 /// everytime editDistance is called.
153 /// </summary>
154 private int[][] e = new int[1][];
156 /// <summary>Levenshtein distance also known as edit distance is a measure of similiarity
157 /// between two strings where the distance is measured as the number of character
158 /// deletions, insertions or substitutions required to transform one string to
159 /// the other string.
160 /// <p>This method takes in four parameters; two strings and their respective
161 /// lengths to compute the Levenshtein distance between the two strings.
162 /// The result is returned as an integer.
163 /// </summary>
164 private int EditDistance(System.String s, System.String t, int n, int m)
166 if (e.Length <= n || e[0].Length <= m)
168 int[][] tmpArray = new int[System.Math.Max(e.Length, n + 1)][];
169 for (int i = 0; i < System.Math.Max(e.Length, n + 1); i++)
171 tmpArray[i] = new int[System.Math.Max(e[0].Length, m + 1)];
173 e = tmpArray;
175 int[][] d = e; // matrix
176 int i2; // iterates through s
177 int j; // iterates through t
178 char s_i; // ith character of s
180 if (n == 0)
181 return m;
182 if (m == 0)
183 return n;
185 // init matrix d
186 for (i2 = 0; i2 <= n; i2++)
187 d[i2][0] = i2;
188 for (j = 0; j <= m; j++)
189 d[0][j] = j;
191 // start computing edit distance
192 for (i2 = 1; i2 <= n; i2++)
194 s_i = s[i2 - 1];
195 for (j = 1; j <= m; j++)
197 if (s_i != t[j - 1])
198 d[i2][j] = Min(d[i2 - 1][j], d[i2][j - 1], d[i2 - 1][j - 1]) + 1;
199 else
200 d[i2][j] = Min(d[i2 - 1][j] + 1, d[i2][j - 1] + 1, d[i2 - 1][j - 1]);
204 // we got the result!
205 return d[n][m];
208 public override void Close()
210 base.Close();
211 searchTerm = null;
212 field = null;
213 text = null;