Initial revision
[beagle.git] / Lucene.Net / Search / FuzzyTermEnum.cs
blob9448f3bcdc62b23edca6699c905448b0dccc8e20
1 using System;
2 using Lucene.Net.Index;
4 namespace Lucene.Net.Search
6 /* ====================================================================
7 * The Apache Software License, Version 1.1
9 * Copyright (c) 2001 The Apache Software Foundation. All rights
10 * reserved.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
24 * 3. The end-user documentation included with the redistribution,
25 * if any, must include the following acknowledgment:
26 * "This product includes software developed by the
27 * Apache Software Foundation (http://www.apache.org/)."
28 * Alternately, this acknowledgment may appear in the software itself,
29 * if and wherever such third-party acknowledgments normally appear.
31 * 4. The names "Apache" and "Apache Software Foundation" and
32 * "Apache Lucene" must not be used to endorse or promote products
33 * derived from this software without prior written permission. For
34 * written permission, please contact apache@apache.org.
36 * 5. Products derived from this software may not be called "Apache",
37 * "Apache Lucene", nor may "Apache" appear in their name, without
38 * prior written permission of the Apache Software Foundation.
40 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
44 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 * ====================================================================
54 * This software consists of voluntary contributions made by many
55 * individuals on behalf of the Apache Software Foundation. For more
56 * information on the Apache Software Foundation, please see
57 * <http://www.apache.org/>.
60 /// <summary>
61 /// Subclass of FilteredTermEnum for enumerating all terms that are
62 /// similiar to the specified filter term.
63 /// <p>Term enumerations are always ordered by Term.CompareTo(). Each term in
64 /// the enumeration is greater than all that precede it.</p>
65 /// </summary>
66 public sealed class FuzzyTermEnum : FilteredTermEnum
68 internal double distance;
69 internal bool fieldMatch = false;
70 internal bool endEnum = false;
72 internal Term searchTerm = null;
73 internal String field = "";
74 internal String text = "";
75 internal int textlen;
77 public FuzzyTermEnum(IndexReader reader, Term term) : base(reader, term)
79 searchTerm = term;
80 field = searchTerm.Field();
81 text = searchTerm.Text();
82 textlen = text.Length;
83 SetEnum(reader.Terms(new Term(searchTerm.Field(), "")));
86 /// <summary>
87 /// The termCompare method in FuzzyTermEnum uses Levenshtein distance to
88 /// calculate the distance between the given term and the comparing term.
89 /// </summary>
90 /// <param name="term"></param>
91 /// <returns></returns>
92 protected override bool TermCompare(Term term)
94 if (field == term.Field())
96 String target = term.Text();
97 int targetlen = target.Length;
98 int dist = EditDistance(text, target, textlen, targetlen);
99 distance = 1 - ((double)dist / (double)Math.Min(textlen, targetlen));
100 return (distance > FUZZY_THRESHOLD);
102 endEnum = true;
103 return false;
106 public override float Difference()
108 return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
111 public override bool EndEnum()
113 return endEnum;
116 /******************************
117 * Compute Levenshtein distance
118 ******************************/
120 public const double FUZZY_THRESHOLD = 0.5;
121 public const double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
123 /// <summary>
124 /// Finds and returns the smallest of three integers
125 /// </summary>
126 /// <param name="a"></param>
127 /// <param name="b"></param>
128 /// <param name="c"></param>
129 /// <returns></returns>
130 private int Min(int a, int b, int c)
132 int t = (a < b) ? a : b;
133 return (t < c) ? t : c;
136 /// <summary>
137 /// This static array saves us from the time required to create a new array
138 /// everytime editDistance is called.
139 ///
140 /// </summary>
141 private int[][] e = null;
143 /// <summary>
144 /// Levenshtein distance also known as edit distance is a measure of similiarity
145 /// between two strings where the distance is measured as the number of character
146 /// deletions, insertions or substitutions required to transform one string to
147 /// the other string.
148 /// <p>This method takes in four parameters; two strings and their respective
149 /// lengths to compute the Levenshtein distance between the two strings.
150 /// The result is returned as an integer.</p>
151 /// </summary>
152 /// <param name="s"></param>
153 /// <param name="t"></param>
154 /// <param name="n"></param>
155 /// <param name="m"></param>
156 /// <returns></returns>
157 private int EditDistance(String s, String t, int n, int m)
159 if (e == null || e.Length <= n || e[0].Length <= m)
161 int l1, l2;
163 if (e == null)
165 l1 = n+1;
166 l2 = m+1;
168 else
170 l1 = Math.Max(e.Length, n+1);
171 l2 = Math.Max(e[0].Length, m+1);
174 e = new int[l1][];
175 for (int i1 = 0; i1 < l1; i1++)
177 e[i1] = new int[l2];
180 int[][] d = e; // matrix
181 int i; // iterates through s
182 int j; // iterates through t
183 char s_i; // ith character of s
185 if (n == 0) return m;
186 if (m == 0) return n;
188 // init matrix d
189 for (i = 0; i <= n; i++) d[i][0] = i;
190 for (j = 0; j <= m; j++) d[0][j] = j;
192 // start computing edit distance
193 for (i = 1; i <= n; i++)
195 s_i = s[i - 1];
196 for (j = 1; j <= m; j++)
198 if (s_i != t[j-1])
199 d[i][j] = Min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
200 else d[i][j] = Min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
204 // we got the result!
205 return d[n][m];
208 public override void Close()
210 base.Close();
211 searchTerm = null;
212 field = null;
213 text = null;