2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 using IndexReader
= Lucene
.Net
.Index
.IndexReader
;
18 using Term
= Lucene
.Net
.Index
.Term
;
19 namespace Lucene
.Net
.Search
22 /// <summary>Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term.
23 /// <p>Term enumerations are always ordered by Term.compareTo(). Each term in
24 /// the enumeration is greater than all that precede it.
26 public sealed class FuzzyTermEnum
:FilteredTermEnum
28 private void InitBlock()
30 for (int i
= 0; i
< 1; i
++)
35 internal double distance
;
36 internal bool endEnum
= false;
38 internal Term searchTerm
= null;
39 internal System
.String field
= "";
40 internal System
.String text
= "";
42 internal System
.String prefix
= "";
43 internal int prefixLength
= 0;
44 internal float minimumSimilarity
;
45 internal double scale_factor
;
48 /// <summary> Empty prefix and minSimilarity of 0.5f are used.
51 /// <param name="">reader
53 /// <param name="">term
55 /// <throws> IOException </throws>
56 /// <seealso cref="Term, float, int)">
58 public FuzzyTermEnum(IndexReader reader
, Term term
):this(reader
, term
, FuzzyQuery
.defaultMinSimilarity
, 0)
62 /// <summary> This is the standard FuzzyTermEnum with an empty prefix.
65 /// <param name="">reader
67 /// <param name="">term
69 /// <param name="">minSimilarity
71 /// <throws> IOException </throws>
72 /// <seealso cref="Term, float, int)">
74 public FuzzyTermEnum(IndexReader reader
, Term term
, float minSimilarity
):this(reader
, term
, minSimilarity
, 0)
78 /// <summary> Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
79 /// length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
80 /// <code>minSimilarity</code>.
83 /// <param name="reader">Delivers terms.
85 /// <param name="term">Pattern term.
87 /// <param name="minSimilarity">Minimum required similarity for terms from the reader. Default value is 0.5f.
89 /// <param name="prefixLength">Length of required common prefix. Default value is 0.
91 /// <throws> IOException </throws>
92 public FuzzyTermEnum(IndexReader reader
, Term term
, float minSimilarity
, int prefixLength
):base()
95 minimumSimilarity
= minSimilarity
;
96 scale_factor
= 1.0f
/ (1.0f
- minimumSimilarity
);
98 field
= searchTerm
.Field();
99 text
= searchTerm
.Text();
100 textlen
= text
.Length
;
101 if (prefixLength
> 0 && prefixLength
< textlen
)
103 this.prefixLength
= prefixLength
;
104 prefix
= text
.Substring(0, (prefixLength
) - (0));
105 text
= text
.Substring(prefixLength
);
106 textlen
= text
.Length
;
108 SetEnum(reader
.Terms(new Term(searchTerm
.Field(), prefix
)));
111 /// <summary>The termCompare method in FuzzyTermEnum uses Levenshtein distance to
112 /// calculate the distance between the given term and the comparing term.
114 protected internal override bool TermCompare(Term term
)
116 System
.String termText
= term
.Text();
117 if ((System
.Object
) field
== (System
.Object
) term
.Field() && termText
.StartsWith(prefix
))
119 System
.String target
= termText
.Substring(prefixLength
);
120 int targetlen
= target
.Length
;
121 int dist
= EditDistance(text
, target
, textlen
, targetlen
);
122 distance
= 1 - ((double) dist
/ (double) System
.Math
.Min(textlen
, targetlen
));
123 return (distance
> minimumSimilarity
);
129 public override float Difference()
131 return (float) ((distance
- minimumSimilarity
) * scale_factor
);
134 public override bool EndEnum()
139 /// <summary>***************************
140 /// Compute Levenshtein distance
141 /// ****************************
144 /// <summary>Finds and returns the smallest of three integers </summary>
145 private static int Min(int a
, int b
, int c
)
147 int t
= (a
< b
) ? a
: b
;
148 return (t
< c
) ? t
: c
;
151 /// <summary> This static array saves us from the time required to create a new array
152 /// everytime editDistance is called.
154 private int[][] e
= new int[1][];
156 /// <summary>Levenshtein distance also known as edit distance is a measure of similiarity
157 /// between two strings where the distance is measured as the number of character
158 /// deletions, insertions or substitutions required to transform one string to
159 /// the other string.
160 /// <p>This method takes in four parameters; two strings and their respective
161 /// lengths to compute the Levenshtein distance between the two strings.
162 /// The result is returned as an integer.
164 private int EditDistance(System
.String s
, System
.String t
, int n
, int m
)
166 if (e
.Length
<= n
|| e
[0].Length
<= m
)
168 int[][] tmpArray
= new int[System
.Math
.Max(e
.Length
, n
+ 1)][];
169 for (int i
= 0; i
< System
.Math
.Max(e
.Length
, n
+ 1); i
++)
171 tmpArray
[i
] = new int[System
.Math
.Max(e
[0].Length
, m
+ 1)];
175 int[][] d
= e
; // matrix
176 int i2
; // iterates through s
177 int j
; // iterates through t
178 char s_i
; // ith character of s
186 for (i2
= 0; i2
<= n
; i2
++)
188 for (j
= 0; j
<= m
; j
++)
191 // start computing edit distance
192 for (i2
= 1; i2
<= n
; i2
++)
195 for (j
= 1; j
<= m
; j
++)
198 d
[i2
][j
] = Min(d
[i2
- 1][j
], d
[i2
][j
- 1], d
[i2
- 1][j
- 1]) + 1;
200 d
[i2
][j
] = Min(d
[i2
- 1][j
] + 1, d
[i2
][j
- 1] + 1, d
[i2
- 1][j
- 1]);
204 // we got the result!
208 public override void Close()