Some more fixes wrt child-indexables. Namely, fix proper handling of child indexables...
[beagle.git] / Util / StringMatcher.cs
blob447eaccc5205a3a9632e196d4f468930d14cf157
1 //
2 // StringMatcher.cs
3 //
4 // Copyright (C) 2004 Novell, Inc.
5 //
7 //
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This code implements the Knuth-Morris-Pratt algorithm, and is based
29 // on information and code I found at
30 // http://www.ics.uci.edu/~eppstein/161/960227.html
33 using System;
34 using System.Collections;
35 using System.IO;
37 namespace Beagle.Util {
39 public class StringMatcher {
41 private class Pattern {
42 public string Term;
43 public bool RequireLeadingWordBoundary;
44 public bool RequireTrailingWordBoundary;
47 public class Match {
48 protected string term;
49 protected int position;
51 public Match (string term, int position)
53 this.term = term;
54 this.position = position;
57 public string Term {
58 get { return term; }
61 public int Position {
62 get { return position; }
66 ArrayList pattern_array = new ArrayList ();
67 int [][] overlap_vector;
68 int [] state_vector;
70 TextReader reader;
71 char [] buffer = new char [8192];
72 int buffer_offset;
73 int buffer_len;
74 int buffer_pos;
75 bool finished;
77 public void Add (string term,
78 bool require_leading,
79 bool require_trailing)
81 Pattern pattern = new Pattern ();
82 pattern.Term = term;
83 pattern.RequireLeadingWordBoundary = require_leading;
84 pattern.RequireTrailingWordBoundary = require_trailing;
86 pattern_array.Add (pattern);
89 public void Study ()
95 public void Start (TextReader _reader)
97 int N = pattern_array.Count;
99 // Build the overlap vector
100 overlap_vector = new int [N] [];
101 for (int i = 0; i < N; ++i) {
102 Pattern pattern = pattern_array [i] as Pattern;
103 string term = pattern.Term;
105 int [] overlap = new int [term.Length];
106 overlap [0] = -1;
107 for (int j = 0; i < term.Length-1; ++j) {
108 overlap [j + 1] = overlap [j] + 1;
109 while (overlap [j + 1] > 0
110 && term [j] != term [overlap [j + 1] - 1])
111 overlap [j + 1] = overlap [overlap [j + 1] - 1] + 1;
114 overlap_vector [i] = overlap;
117 state_vector = new int [N];
119 reader = _reader;
121 buffer_offset = 0;
122 buffer_len = -1;
123 buffer_pos = 0;
125 finished = false;
128 public Match Find ()
130 if (finished)
131 return null;
133 int N = pattern_array.Count;
135 while (true) {
137 // If necessary, read another block of characters
138 // into our buffer.
139 if (buffer_pos >= buffer_len) {
140 if (buffer_len > 0)
141 buffer_offset += buffer_len;
142 buffer_len = reader.Read (buffer, 0, buffer.Length);
143 if (buffer_len <= 0) {
144 finished = true;
145 return null;
149 while (buffer_pos < buffer_len) {
150 char buffer_c = buffer [buffer_pos];
151 ++buffer_pos;
153 // Try each pattern in sequence
154 for (int i = 0; i < N; ++i) {
155 Pattern pattern = pattern_array [i] as Pattern;
156 string term = pattern.Term;
158 while (true) {
160 int state = state_vector [i];
161 if (buffer_c == term [state]) {
163 // If we match, move to the next state.
164 ++state_vector [i];
166 // if we reach the last state, return
167 // a Match object.
168 if (state_vector [i] == term.Length) {
169 state_vector [i] = 0;
170 return new Match (term, buffer_offset + buffer_pos - term.Length);
173 // Otherwise break out of the loop so that we can
174 // proceed to the next needle.
175 break;
177 } else if (state == 0) {
178 // If we don't get a match in the first
179 // state, there is no hope...
180 break;
181 } else {
182 // Try a shorter partial match
183 state_vector [i] = overlap_vector [i] [state];
191 #if false
192 public int Find (TextReader reader)
194 if (needle_vector == null || overlap_vector == null)
195 Study ();
197 int N = needle_vector.Length;
200 int [] offset_vector = new int [N];
202 while (true) {
204 if (buffer_len > 0)
205 buffer_offset += buffer_len;
207 buffer_len = reader.Read (buffer, 0, buffer.Length);
208 if (buffer_len <= 0)
209 break; // Oops... no matches found.
211 // Walk across the buffer
212 for (int i = 0; i < buffer_len; ++i) {
214 char buffer_c = buffer [i];
216 // Try each needle in sequences
217 for (int j = 0; j < N; ++j) {
218 string needle = needle_vector [j];
219 int k = offset_vector [j];
221 while (true) {
222 if (buffer_c == needle [k]) {
223 // If we match, move to the next state.
224 ++k;
225 // If we reach the last state, return the
226 // offset of the match.
227 if (k == needle.Length)
228 return buffer_offset + i - k + 1;
229 // Otherwise remember the state and
230 // break out of the loop so that we can
231 // proceed to the next needle.
232 offset_vector [j] = k;
233 break;
234 } else if (k == 0) {
235 // If we don't get a match in the first
236 // state, there is no hope...
237 offset_vector [j] = 0;
238 break;
239 } else {
240 // Try a shorter partial match
241 k = overlap_vector [j] [k];
248 return null;
250 #endif
252 #if false
253 static void Main ()
255 StringMatcher matcher = new StringMatcher ();
256 matcher.Add ("proceed");
257 matcher.Add ("remember");
259 TextReader reader = new StreamReader ("StringMatcher.cs");
260 int offset = matcher.Find (reader);
261 reader.Close ();
262 if (offset < 0) {
263 Console.WriteLine ("No matches found.");
264 } else {
265 FileStream stream = new FileStream ("StringMatcher.cs", FileMode.Open, FileAccess.Read);
266 stream.Seek (offset, SeekOrigin.Begin);
267 StreamReader foo = new StreamReader (stream);
268 string line = foo.ReadLine ();
270 Console.WriteLine ("Found match at offset {0}", offset);
271 Console.WriteLine ("[{0}]", line);
274 #endif