4 // Copyright (C) 2004 Novell, Inc.
8 // Permission is hereby granted, free of charge, to any person obtaining a
9 // copy of this software and associated documentation files (the "Software"),
10 // to deal in the Software without restriction, including without limitation
11 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 // and/or sell copies of the Software, and to permit persons to whom the
13 // Software is furnished to do so, subject to the following conditions:
15 // The above copyright notice and this permission notice shall be included in
16 // all copies or substantial portions of the Software.
18 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 // DEALINGS IN THE SOFTWARE.
28 // This code implements the Knuth-Morris-Pratt algorithm, and is based
29 // on information and code I found at
30 // http://www.ics.uci.edu/~eppstein/161/960227.html
34 using System
.Collections
;
37 namespace Beagle
.Util
{
39 public class StringMatcher
{
41 private class Pattern
{
43 public bool RequireLeadingWordBoundary
;
44 public bool RequireTrailingWordBoundary
;
48 protected string term
;
49 protected int position
;
51 public Match (string term
, int position
)
54 this.position
= position
;
62 get { return position; }
66 ArrayList pattern_array
= new ArrayList ();
67 int [][] overlap_vector
;
71 char [] buffer
= new char [8192];
77 public void Add (string term
,
79 bool require_trailing
)
81 Pattern pattern
= new Pattern ();
83 pattern
.RequireLeadingWordBoundary
= require_leading
;
84 pattern
.RequireTrailingWordBoundary
= require_trailing
;
86 pattern_array
.Add (pattern
);
95 public void Start (TextReader _reader
)
97 int N
= pattern_array
.Count
;
99 // Build the overlap vector
100 overlap_vector
= new int [N
] [];
101 for (int i
= 0; i
< N
; ++i
) {
102 Pattern pattern
= pattern_array
[i
] as Pattern
;
103 string term
= pattern
.Term
;
105 int [] overlap
= new int [term
.Length
];
107 for (int j
= 0; i
< term
.Length
-1; ++j
) {
108 overlap
[j
+ 1] = overlap
[j
] + 1;
109 while (overlap
[j
+ 1] > 0
110 && term
[j
] != term
[overlap
[j
+ 1] - 1])
111 overlap
[j
+ 1] = overlap
[overlap
[j
+ 1] - 1] + 1;
114 overlap_vector
[i
] = overlap
;
117 state_vector
= new int [N
];
133 int N
= pattern_array
.Count
;
137 // If necessary, read another block of characters
139 if (buffer_pos
>= buffer_len
) {
141 buffer_offset
+= buffer_len
;
142 buffer_len
= reader
.Read (buffer
, 0, buffer
.Length
);
143 if (buffer_len
<= 0) {
149 while (buffer_pos
< buffer_len
) {
150 char buffer_c
= buffer
[buffer_pos
];
153 // Try each pattern in sequence
154 for (int i
= 0; i
< N
; ++i
) {
155 Pattern pattern
= pattern_array
[i
] as Pattern
;
156 string term
= pattern
.Term
;
160 int state
= state_vector
[i
];
161 if (buffer_c
== term
[state
]) {
163 // If we match, move to the next state.
166 // if we reach the last state, return
168 if (state_vector
[i
] == term
.Length
) {
169 state_vector
[i
] = 0;
170 return new Match (term
, buffer_offset
+ buffer_pos
- term
.Length
);
173 // Otherwise break out of the loop so that we can
174 // proceed to the next needle.
177 } else if (state
== 0) {
178 // If we don't get a match in the first
179 // state, there is no hope...
182 // Try a shorter partial match
183 state_vector
[i
] = overlap_vector
[i
] [state
];
192 public int Find (TextReader reader
)
194 if (needle_vector
== null || overlap_vector
== null)
197 int N
= needle_vector
.Length
;
200 int [] offset_vector
= new int [N
];
205 buffer_offset
+= buffer_len
;
207 buffer_len
= reader
.Read (buffer
, 0, buffer
.Length
);
209 break; // Oops... no matches found.
211 // Walk across the buffer
212 for (int i
= 0; i
< buffer_len
; ++i
) {
214 char buffer_c
= buffer
[i
];
216 // Try each needle in sequences
217 for (int j
= 0; j
< N
; ++j
) {
218 string needle
= needle_vector
[j
];
219 int k
= offset_vector
[j
];
222 if (buffer_c
== needle
[k
]) {
223 // If we match, move to the next state.
225 // If we reach the last state, return the
226 // offset of the match.
227 if (k
== needle
.Length
)
228 return buffer_offset
+ i
- k
+ 1;
229 // Otherwise remember the state and
230 // break out of the loop so that we can
231 // proceed to the next needle.
232 offset_vector
[j
] = k
;
235 // If we don't get a match in the first
236 // state, there is no hope...
237 offset_vector
[j
] = 0;
240 // Try a shorter partial match
241 k
= overlap_vector
[j
] [k
];
255 StringMatcher matcher
= new StringMatcher ();
256 matcher
.Add ("proceed");
257 matcher
.Add ("remember");
259 TextReader reader
= new StreamReader ("StringMatcher.cs");
260 int offset
= matcher
.Find (reader
);
263 Console
.WriteLine ("No matches found.");
265 FileStream stream
= new FileStream ("StringMatcher.cs", FileMode
.Open
, FileAccess
.Read
);
266 stream
.Seek (offset
, SeekOrigin
.Begin
);
267 StreamReader foo
= new StreamReader (stream
);
268 string line
= foo
.ReadLine ();
270 Console
.WriteLine ("Found match at offset {0}", offset
);
271 Console
.WriteLine ("[{0}]", line
);