2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 using IndexReader
= Lucene
.Net
.Index
.IndexReader
;
19 using Term
= Lucene
.Net
.Index
.Term
;
21 namespace Lucene
.Net
.Search
24 /// <summary>Subclass of FilteredTermEnum for enumerating all terms that are similiar
25 /// to the specified filter term.
27 /// <p>Term enumerations are always ordered by Term.compareTo(). Each term in
28 /// the enumeration is greater than all that precede it.
30 public sealed class FuzzyTermEnum
: FilteredTermEnum
33 /* This should be somewhere around the average long word.
34 * If it is longer, we waste time and space. If it is shorter, we waste a
35 * little bit of time growing the array as we encounter longer words.
37 private const int TYPICAL_LONGEST_WORD_IN_INDEX
= 19;
39 /* Allows us save time required to create a new array
40 * everytime similarity is called.
44 private float similarity
;
45 private bool endEnum
= false;
47 private Term searchTerm
= null;
48 private System
.String field
;
49 private System
.String text
;
50 private System
.String prefix
;
52 private float minimumSimilarity
;
53 private float scale_factor
;
54 private int[] maxDistances
= new int[TYPICAL_LONGEST_WORD_IN_INDEX
];
56 /// <summary> Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
58 /// After calling the constructor the enumeration is already pointing to the first
59 /// valid term if such a term exists.
62 /// <param name="reader">
64 /// <param name="term">
66 /// <throws> IOException </throws>
67 /// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
69 public FuzzyTermEnum(IndexReader reader
, Term term
) : this(reader
, term
, FuzzyQuery
.defaultMinSimilarity
, FuzzyQuery
.defaultPrefixLength
)
73 /// <summary> Creates a FuzzyTermEnum with an empty prefix.
75 /// After calling the constructor the enumeration is already pointing to the first
76 /// valid term if such a term exists.
79 /// <param name="reader">
81 /// <param name="term">
83 /// <param name="minSimilarity">
85 /// <throws> IOException </throws>
86 /// <seealso cref="FuzzyTermEnum(IndexReader, Term, float, int)">
88 public FuzzyTermEnum(IndexReader reader
, Term term
, float minSimilarity
) : this(reader
, term
, minSimilarity
, FuzzyQuery
.defaultPrefixLength
)
92 /// <summary> Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
93 /// length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
94 /// <code>minSimilarity</code>.
96 /// After calling the constructor the enumeration is already pointing to the first
97 /// valid term if such a term exists.
100 /// <param name="reader">Delivers terms.
102 /// <param name="term">Pattern term.
104 /// <param name="minSimilarity">Minimum required similarity for terms from the reader. Default value is 0.5f.
106 /// <param name="prefixLength">Length of required common prefix. Default value is 0.
108 /// <throws> IOException </throws>
109 public FuzzyTermEnum(IndexReader reader
, Term term
, float minSimilarity
, int prefixLength
) : base()
112 if (minSimilarity
>= 1.0f
)
113 throw new System
.ArgumentException("minimumSimilarity cannot be greater than or equal to 1");
114 else if (minSimilarity
< 0.0f
)
115 throw new System
.ArgumentException("minimumSimilarity cannot be less than 0");
116 if (prefixLength
< 0)
117 throw new System
.ArgumentException("prefixLength cannot be less than 0");
119 this.minimumSimilarity
= minSimilarity
;
120 this.scale_factor
= 1.0f
/ (1.0f
- minimumSimilarity
);
121 this.searchTerm
= term
;
122 this.field
= searchTerm
.Field();
124 //The prefix could be longer than the word.
125 //It's kind of silly though. It means we must match the entire word.
126 int fullSearchTermLength
= searchTerm
.Text().Length
;
127 int realPrefixLength
= prefixLength
> fullSearchTermLength
?fullSearchTermLength
:prefixLength
;
129 this.text
= searchTerm
.Text().Substring(realPrefixLength
);
130 this.prefix
= searchTerm
.Text().Substring(0, (realPrefixLength
) - (0));
132 InitializeMaxDistances();
133 this.d
= InitDistanceArray();
135 SetEnum(reader
.Terms(new Term(searchTerm
.Field(), prefix
)));
138 /// <summary> The termCompare method in FuzzyTermEnum uses Levenshtein distance to
139 /// calculate the distance between the given term and the comparing term.
141 protected internal override bool TermCompare(Term term
)
143 if (field
== term
.Field() && term
.Text().StartsWith(prefix
))
145 System
.String target
= term
.Text().Substring(prefix
.Length
);
146 this.similarity
= Similarity(target
);
147 return (similarity
> minimumSimilarity
);
153 public override float Difference()
155 return (float) ((similarity
- minimumSimilarity
) * scale_factor
);
158 public override bool EndEnum()
163 /// <summary>***************************
164 /// Compute Levenshtein distance
165 /// ****************************
168 /// <summary> Finds and returns the smallest of three integers </summary>
169 private static int min(int a
, int b
, int c
)
171 int t
= (a
< b
) ? a
: b
;
172 return (t
< c
) ? t
: c
;
175 private int[][] InitDistanceArray()
177 int[][] tmpArray
= new int[this.text
.Length
+ 1][];
178 for (int i
= 0; i
< this.text
.Length
+ 1; i
++)
180 tmpArray
[i
] = new int[TYPICAL_LONGEST_WORD_IN_INDEX
];
185 /// <summary> <p>Similarity returns a number that is 1.0f or less (including negative numbers)
186 /// based on how similar the Term is compared to a target term. It returns
187 /// exactly 0.0f when
189 /// editDistance < maximumEditDistance</pre>
190 /// Otherwise it returns:
192 /// 1 - (editDistance / length)</pre>
193 /// where length is the length of the shortest term (text or target) including a
194 /// prefix that are identical and editDistance is the Levenshtein distance for
195 /// the two words.</p>
197 /// <p>Embedded within this algorithm is a fail-fast Levenshtein distance
198 /// algorithm. The fail-fast algorithm differs from the standard Levenshtein
199 /// distance algorithm in that it is aborted if it is discovered that the
200 /// mimimum distance between the words is greater than some threshold.
202 /// <p>To calculate the maximum distance threshold we use the following formula:
204 /// (1 - minimumSimilarity) * length</pre>
205 /// where length is the shortest term including any prefix that is not part of the
206 /// similarity comparision. This formula was derived by solving for what maximum value
207 /// of distance returns false for the following statements:
209 /// similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
210 /// return (similarity > minimumSimilarity);</pre>
211 /// where distance is the Levenshtein distance for the two words.
213 /// <p>Levenshtein distance (also known as edit distance) is a measure of similiarity
214 /// between two strings where the distance is measured as the number of character
215 /// deletions, insertions or substitutions required to transform one string to
216 /// the other string.
218 /// <param name="target">the target word or phrase
220 /// <returns> the similarity, 0.0 or less indicates that it matches less than the required
221 /// threshold and 1.0 indicates that the text and target are identical
223 private float Similarity(System
.String target
)
227 int m
= target
.Length
;
231 //we don't have anything to compare. That means if we just add
232 //the letters for m we get the new word
233 return prefix
.Length
== 0 ? 0.0f
: 1.0f
- ((float) m
/ prefix
.Length
);
237 return prefix
.Length
== 0 ? 0.0f
: 1.0f
- ((float) n
/ prefix
.Length
);
240 int maxDistance
= GetMaxDistance(m
);
242 if (maxDistance
< System
.Math
.Abs(m
- n
))
244 //just adding the characters of m to n or vice-versa results in
246 //for example "pre" length is 3 and "prefixes" length is 8. We can see that
247 //given this optimal circumstance, the edit distance cannot be less than 5.
248 //which is 8-3 or more precisesly Math.abs(3-8).
249 //if our maximum edit distance is 4, then we can discard this word
250 //without looking at it.
254 //let's make sure we have enough room in our array to do the distance calculations.
255 if (d
[0].Length
<= m
)
257 GrowDistanceArray(m
);
261 for (int i
= 0; i
<= n
; i
++)
263 for (int j
= 0; j
<= m
; j
++)
266 // start computing edit distance
267 for (int i
= 1; i
<= n
; i
++)
269 int bestPossibleEditDistance
= m
;
270 char s_i
= text
[i
- 1];
271 for (int j
= 1; j
<= m
; j
++)
273 if (s_i
!= target
[j
- 1])
275 d
[i
][j
] = min(d
[i
- 1][j
], d
[i
][j
- 1], d
[i
- 1][j
- 1]) + 1;
279 d
[i
][j
] = min(d
[i
- 1][j
] + 1, d
[i
][j
- 1] + 1, d
[i
- 1][j
- 1]);
281 bestPossibleEditDistance
= System
.Math
.Min(bestPossibleEditDistance
, d
[i
][j
]);
284 //After calculating row i, the best possible edit distance
285 //can be found by found by finding the smallest value in a given column.
286 //If the bestPossibleEditDistance is greater than the max distance, abort.
288 if (i
> maxDistance
&& bestPossibleEditDistance
> maxDistance
)
290 //equal is okay, but not greater
291 //the closest the target can be to the text is just too far away.
292 //this target is leaving the party early.
297 // this will return less than 0.0 when the edit distance is
298 // greater than the number of characters in the shorter word.
299 // but this was the formula that was previously used in FuzzyTermEnum,
300 // so it has not been changed (even though minimumSimilarity must be
302 return 1.0f
- ((float) d
[n
][m
] / (float) (prefix
.Length
+ System
.Math
.Min(n
, m
)));
306 /// <summary> Grow the second dimension of the array, so that we can calculate the
307 /// Levenshtein difference.
309 private void GrowDistanceArray(int m
)
311 for (int i
= 0; i
< d
.Length
; i
++)
313 d
[i
] = new int[m
+ 1];
317 /// <summary> The max Distance is the maximum Levenshtein distance for the text
318 /// compared to some other value that results in score that is
319 /// better than the minimum similarity.
321 /// <param name="m">the length of the "other value"
323 /// <returns> the maximum levenshtein distance that we care about
325 private int GetMaxDistance(int m
)
327 return (m
< maxDistances
.Length
)?maxDistances
[m
]:CalculateMaxDistance(m
);
330 private void InitializeMaxDistances()
332 for (int i
= 0; i
< maxDistances
.Length
; i
++)
334 maxDistances
[i
] = CalculateMaxDistance(i
);
338 private int CalculateMaxDistance(int m
)
340 return (int) ((1 - minimumSimilarity
) * (System
.Math
.Min(text
.Length
, m
) + prefix
.Length
));
343 public override void Close()
345 base.Close(); //call super.close() and let the garbage collector do its work.