2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
11 #include <afs/param.h>
16 #include <WINNT/regexp.h>
20 * DEFINITIONS ________________________________________________________________
// Marker values that make up the compiled form of an expression, as written
// into m_achCompiled by Compile and read back by Matches/MatchSubset.

// Flag bit: Compile OR's this into a previously emitted marker, and
// MatchSubset has cases for combinations such as markCHARACTER|markREPEAT.
24 #define markREPEAT TEXT('\x01')
// Literal character; Matches reads the character itself from the next slot.
25 #define markCHARACTER TEXT('\x02')
// Emitted by Compile as a single marker (presumably for "." -- the case
// label is not visible in this listing).
26 #define markANYCHAR TEXT('\x04')
// Character set; Compile follows the marker with a length slot.
27 #define markCHARSET TEXT('\x06')
// Like markCHARSET, emitted when the character read at the start of the set
// is '^'.
28 #define markNONCHARSET TEXT('\x08')
// Back-reference; Compile follows it with (digit - '1') as the group index.
29 #define markREFERENCE TEXT('\x0A')
// Emitted by Compile after it reads '(' and ')' respectively; each is
// followed by one operand slot that MatchSubset uses as an index into
// aParenStarts / aParenEnds.
30 #define markLPAREN TEXT('\xFC')
31 #define markRPAREN TEXT('\xFD')
// Emitted when Compile records that end-of-line should be matched.
32 #define markENDLINE TEXT('\xFE')
// Written by Compile as the last marker of the compiled expression.
33 #define markENDPATTERN TEXT('\xFF')
37 * CLASS ROUTINES _____________________________________________________________
// NOTE(review): the function header for these statements is not visible in
// this listing; presumably this is the body of the default constructor --
// confirm against the full file.
// Put the object into its empty state: not anchored, empty compiled buffer.
43 m_fMatchFromStart
= FALSE
;
44 m_achCompiled
[0] = TEXT('\0');
47 REGEXP::REGEXP (LPCTSTR pszExpr
)
49 m_fMatchFromStart
= FALSE
;
50 m_achCompiled
[0] = TEXT('\0');
51 SetExpression (pszExpr
);
54 REGEXP::~REGEXP (void)
56 ; // nothing really to do here
59 BOOL
REGEXP::SetExpression (LPCTSTR pszExpr
)
61 return Compile (pszExpr
);
64 BOOL
REGEXP::Matches (LPCTSTR pszExpr
, LPCTSTR pszString
)
66 REGEXP
Expr (pszExpr
);
67 return Expr
.Matches (pszString
);
70 BOOL
REGEXP::fIsRegExp (void)
72 if (m_fMatchFromStart
) // started with "^"?
73 return TRUE
; // it's a regexp.
75 for (LPCTSTR pch
= m_achCompiled
; (*pch
) && (*pch
!= markENDPATTERN
); pch
+= 2)
77 if (*pch
!= markCHARACTER
)
81 return FALSE
; // just a string of characters
84 BOOL
REGEXP::fIsRegExp (LPCTSTR pszExpr
)
86 REGEXP
Expr (pszExpr
);
87 return Expr
.fIsRegExp();
92 * REGEXP _____________________________________________________________________
// Translate pszExpr into the marker form stored in m_achCompiled, resetting
// m_fMatchFromStart and the buffer first. Failures are reported through
// SetLastError (ERROR_INVALID_PARAMETER, ERROR_META_EXPANSION_TOO_LONG,
// ERROR_BAD_FORMAT).
// NOTE(review): this listing of Compile is incomplete -- the braces, the
// switch/case labels, the return statements and the declarations of ch, rc
// and nParens are not visible -- so the notes below describe only the
// statements that can be seen.
96 BOOL
REGEXP::Compile (LPCTSTR pszExpr
)
// aParens/pParen: pParen is compared with &aParens[0] further down to detect
// unbalanced parentheses, and is popped with *--pParen when ")" is handled.
98 BYTE aParens
[ nCOMPILED_PARENS_MAX
];
99 PBYTE pParen
= &aParens
[0];
// pchLastEx is later set to the write position (pchCompiled) and is the slot
// that receives markREPEAT.
100 LPTSTR pchLastEx
= NULL
;
103 // Erase any previous compiled expression
105 LPTSTR pchCompiled
= m_achCompiled
;
106 *pchCompiled
= TEXT('\0');
107 m_fMatchFromStart
= FALSE
;
// A NULL or empty expression is rejected.
109 if (!pszExpr
|| !*pszExpr
)
111 SetLastError (ERROR_INVALID_PARAMETER
);
115 // See if the expression starts with a "^"
117 if ((m_fMatchFromStart
= (*pszExpr
== TEXT('^'))) == TRUE
)
120 // Start stripping characters from the expression
123 for (rc
= TRUE
; rc
; )
// Bound check: bytes written so far versus the size of m_achCompiled.
// NOTE(review): the comparison is '>', so it still passes when pchCompiled
// is exactly at the end of the buffer -- confirm the writes that follow
// cannot overrun.
127 if ((sizeof(TCHAR
)*(pchCompiled
- m_achCompiled
)) > sizeof(m_achCompiled
))
129 SetLastError (ERROR_META_EXPANSION_TOO_LONG
);
// Fetch the next expression character into ch and advance.
134 if ((ch
= *pszExpr
++) == TEXT('\0'))
136 // We finally hit the end of this expression.
138 if (pParen
!= &aParens
[0])
140 SetLastError (ERROR_BAD_FORMAT
); // unmatched "\("
// Remember where the expression about to be emitted starts.
147 pchLastEx
= pchCompiled
;
153 *pchCompiled
++ = markANYCHAR
;
// With nothing repeatable before it (no previous expression, or the previous
// marker is a paren marker) a markCHARACTER is emitted instead.
157 if ((pchLastEx
== NULL
) || (*pchLastEx
== markLPAREN
) || (*pchLastEx
== markRPAREN
))
159 *pchCompiled
++ = markCHARACTER
;
162 else // record that we can repeat the last expression
164 *pchLastEx
|= markREPEAT
;
// When more expression text follows, a markCHARACTER is emitted here.
169 if (*pszExpr
!= TEXT('\0'))
171 *pchCompiled
++ = markCHARACTER
;
174 else // record that we should match end-of-line
176 *pchCompiled
++ = markENDLINE
;
// Character-set handling: a '^' read here selects markNONCHARSET, otherwise
// markCHARSET is emitted.
181 if ((ch
= *pszExpr
++) == '^')
183 *pchCompiled
++ = markNONCHARSET
;
188 *pchCompiled
++ = markCHARSET
;
191 *pchCompiled
++ = 1; // length; this is pchLastEx[1]
194 if (ch
== TEXT('\0'))
196 SetLastError (ERROR_BAD_FORMAT
); // NOTE(review): this comment originally read unmatched "\(", but the loop below ends at "]" -- it looks copied from the paren check above
// Handling of '-' between set members.
201 if ((ch
== TEXT('-')) && (*pchCompiled
!= pchLastEx
[2]))
// A '-' immediately followed by ']' is stored as a literal '-'.
203 if ((ch
= *pszExpr
++) == TEXT(']'))
205 *pchCompiled
++ = TEXT('-');
// Otherwise expand the range: keep writing the previous slot's value plus one
// while the previous slot is still below ch (compared as BYTE).
209 while ((BYTE
)pchCompiled
[-1] < (BYTE
)ch
)
211 *pchCompiled
= pchCompiled
[-1] + 1;
214 if ((sizeof(TCHAR
)*(pchCompiled
- m_achCompiled
)) > sizeof(m_achCompiled
))
216 SetLastError (ERROR_META_EXPANSION_TOO_LONG
);
227 if ((sizeof(TCHAR
)*(pchCompiled
- m_achCompiled
)) > sizeof(m_achCompiled
))
229 SetLastError (ERROR_META_EXPANSION_TOO_LONG
);
// The set ends at the next ']'.
235 } while ((ch
= *pszExpr
++) != TEXT(']'));
// The character read here selects between "(", ")", a back-reference digit
// and a plain character.
239 if ((ch
= *pszExpr
++) == TEXT('('))
// No more than nCOMPILED_PARENS_MAX groups are accepted.
241 if (nParens
>= nCOMPILED_PARENS_MAX
)
243 SetLastError (ERROR_META_EXPANSION_TOO_LONG
);
// markLPAREN is followed by the group number (nParens before the increment).
248 *pchCompiled
++ = markLPAREN
;
249 *pchCompiled
++ = nParens
++;
251 else if (ch
== TEXT(')'))
// pParen back at &aParens[0] means there is no open group to close.
253 if (pParen
== &aParens
[0])
255 SetLastError (ERROR_BAD_FORMAT
);
// markRPAREN is followed by the value popped from the aParens stack.
259 *pchCompiled
++ = markRPAREN
;
260 *pchCompiled
++ = *--pParen
;
// Digits from '1' up to (but excluding) '1' + nCOMPILED_PARENS_MAX become a
// back-reference with a zero-based operand.
262 else if ((ch
>= TEXT('1')) && (ch
< (TEXT('1') + nCOMPILED_PARENS_MAX
)))
264 *pchCompiled
++ = markREFERENCE
;
265 *pchCompiled
++ = ch
- '1';
269 *pchCompiled
++ = markCHARACTER
;
275 *pchCompiled
++ = markCHARACTER
;
// Terminate the compiled expression.
281 *pchCompiled
++ = markENDPATTERN
;
// Test pszString against the expression compiled into m_achCompiled.
// An anchored expression is tried once from pszString; otherwise
// MatchSubset is tried at successive positions of the string.
// NOTE(review): braces and some statements (including the returns that
// follow the MatchSubset tests) are not visible in this listing.
287 BOOL
REGEXP::Matches (LPCTSTR pszString
)
292 // Prepare a place to store information about \( and \) pairs
294 LPCTSTR aParenStarts
[ nCOMPILED_PARENS_MAX
];
295 LPCTSTR aParenEnds
[ nCOMPILED_PARENS_MAX
];
// Every group slot starts out as NULL.
297 for (size_t ii
= 0; ii
< nCOMPILED_PARENS_MAX
; ii
++)
299 aParenStarts
[ii
] = NULL
;
300 aParenEnds
[ii
] = NULL
;
303 // If the expression starts with "^", we can do a quick pattern-match...
305 if (m_fMatchFromStart
)
307 return MatchSubset (pszString
, m_achCompiled
, aParenStarts
, aParenEnds
);
310 // Otherwise, we have to work a little harder. If the expression
311 // at least starts with a recognized character, we can scan for that
312 // as the start of a pattern...
314 LPTSTR pchCompiled
= m_achCompiled
;
315 if (*pchCompiled
== markCHARACTER
)
// The literal to look for is the operand slot right after the marker.
317 TCHAR chStart
= pchCompiled
[1];
// Cheap pre-test on the first character before the MatchSubset call.
319 if (*pszString
!= chStart
)
321 if (MatchSubset (pszString
, pchCompiled
, aParenStarts
, aParenEnds
))
// The condition tests the current character and then advances, so the loop
// ends once the terminating NUL has been tested.
323 } while (*pszString
++);
328 // If the expression starts with something weird, we'll have to test
329 // against every character in the string.
332 if (MatchSubset (pszString
, pchCompiled
, aParenStarts
, aParenEnds
))
334 } while (*pszString
++);
// Match pszString against the compiled markers starting at pchCompiled.
// aParenStarts/aParenEnds receive the string positions recorded for group
// markers and are read back for back-references. Calls itself to try the
// remainder of the pattern while backing off from a repeated item.
// NOTE(review): braces, most case labels, break/return statements and the
// declarations of ii and cchPattern are not visible in this listing; the
// notes below describe only the statements that can be seen.
340 BOOL
REGEXP::MatchSubset (LPCTSTR pszString
, LPCTSTR pchCompiled
, LPCTSTR
*aParenStarts
, LPCTSTR
*aParenEnds
)
// Position where a repeated item began; the back-off loops stop there.
342 LPCTSTR pchStartOfEx
;
// Dispatch on the next marker, stepping past it.
347 switch (*pchCompiled
++)
// Compare the operand slot with the next string character, consuming both.
350 if (*pchCompiled
++ == *pszString
++)
// Test for the end of the string.
360 if (*pszString
== TEXT('\0'))
// Set test with the inclusive flag TRUE; on the visible path the set is then
// skipped using the length stored in its first slot.
368 if (fIsInCharSet (pchCompiled
, *pszString
++, TRUE
))
370 pchCompiled
+= *pchCompiled
;
// Same test with the inclusive flag FALSE.
376 if (fIsInCharSet (pchCompiled
, *pszString
++, FALSE
))
378 pchCompiled
+= *pchCompiled
;
// Record the current string position as the start of the group whose index
// is in the operand slot.
384 aParenStarts
[*pchCompiled
++] = pszString
;
// Record the current string position as the end of that group.
388 aParenEnds
[*pchCompiled
++] = pszString
;
// Back-reference: ii is taken from the operand slot; a group whose end was
// never recorded cannot be referenced.
392 if (aParenEnds
[ii
= *pchCompiled
++] == 0)
393 return FALSE
; // reference to invalid \(\) pair
// When the group's text matches here, step over it (end minus start).
394 if (CompareParen (ii
, pszString
, aParenStarts
, aParenEnds
))
396 pszString
+= aParenEnds
[ii
] - aParenStarts
[ii
];
// Repeated back-reference.
401 case markREFERENCE
|markREPEAT
:
402 if (aParenEnds
[ii
= *pchCompiled
++] == 0)
403 return FALSE
; // reference to invalid \(\) pair
404 pchStartOfEx
= pszString
;
405 cchPattern
= aParenEnds
[ii
] - aParenStarts
[ii
];
// Consume as many copies of the group as match.
// NOTE(review): if cchPattern is 0 neither this loop nor the back-off below
// changes pszString -- confirm an empty group cannot reach this case.
406 while (CompareParen (ii
, pszString
, aParenStarts
, aParenEnds
))
407 pszString
+= cchPattern
;
// Back off one copy at a time, trying the rest of the pattern each time.
408 while (pszString
>= pchStartOfEx
)
410 if (MatchSubset (pszString
, pchCompiled
, aParenStarts
, aParenEnds
))
412 pszString
-= cchPattern
;
416 case markANYCHAR
|markREPEAT
:
417 pchStartOfEx
= pszString
;
422 case markCHARACTER
|markREPEAT
:
423 pchStartOfEx
= pszString
;
// Advance while string characters equal the operand; the post-increment also
// steps past the first character that differs.
424 while (*pszString
++ == *pchCompiled
)
429 case markCHARSET
|markREPEAT
:
430 case markNONCHARSET
|markREPEAT
:
431 pchStartOfEx
= pszString
;
// The inclusive flag is derived from the marker just consumed
// (pchCompiled[-1]): TRUE for markCHARSET|markREPEAT, FALSE otherwise.
432 while (fIsInCharSet (pchCompiled
, *pszString
++, (pchCompiled
[-1] == (markCHARSET
|markREPEAT
))))
434 pchCompiled
+= *pchCompiled
;
// Back-off loop: try the rest of the pattern, repeating while pszString is
// still beyond pchStartOfEx.
440 if (MatchSubset (pszString
, pchCompiled
, aParenStarts
, aParenEnds
))
442 } while (pszString
> pchStartOfEx
);
446 return FALSE
; // damaged compiled string
451 BOOL
REGEXP::CompareParen (int ii
, LPCTSTR pszString
, LPCTSTR
*aParenStarts
, LPCTSTR
*aParenEnds
)
453 LPCTSTR pchInParen
= aParenStarts
[ii
];
454 while (*pchInParen
++ == *pszString
++)
455 if (pchInParen
>= aParenEnds
[ii
])
461 BOOL
REGEXP::fIsInCharSet (LPCTSTR pszCharSet
, TCHAR chTest
, int fInclusive
)
465 for (int n
= (int)(*pszCharSet
++); --n
; )
467 if (*pszCharSet
++ == chTest
)