1 // This is a part of the Active Template Library.
2 // Copyright (C) Microsoft Corporation
3 // All rights reserved.
5 // This source code is only intended as a supplement to the
6 // Active Template Library Reference and related
7 // electronic documentation provided with the library.
8 // See these sources for detailed information regarding the
9 // Active Template Library product.
20 #ifndef ATL_REGEXP_MIN_STACK
21 #define ATL_REGEXP_MIN_STACK 256
25 Regular Expression Grammar
27 R - top level grammar rule
28 RE - regular expression
29 AltE - Alternative expression
31 SE - simple expression
34 '^'RE (matches beginning of string)
42 E -> SE (RepeatOp '?'?)?
47 '\'EscapedChar (any character including reserved symbols)
48 '\'Digit+ (Arg back reference)
52 Symbol (any non-reserved character)
55 CharClass -> '[' '^'? CharSet ']'
57 CharItem -> Char('-'Char)?
61 Abbrev -> Abbreviation defined in CAtlRECharTraits
62 Abbrev Expansion Meaning
63 a ([a-zA-Z0-9]) alpha numeric
64 b ([ \\t]) white space (blank)
67 h ([0-9a-fA-F]) hex digit
68 n (\r|(\r?\n)) newline
69 q (\"[^\"]*\")|(\'[^\']*\') quoted string
70 w ([a-zA-Z]+) simple word
74 #pragma pack(push,_ATL_PACKING)
77 //Conversion utility classes used to convert char* to RECHAR.
78 //Used by rx debugging printing.
79 template <typename RECHARTYPE
=char>
83 CAToREChar(const char* psz
) throw()
87 operator const RECHARTYPE
*() const throw() { return m_psz
; }
92 class CAToREChar
<wchar_t>
95 CAToREChar(const char* psz
) throw()
99 operator const wchar_t*() const throw() { return (wchar_t*)m_a2w
; }
105 class CAtlRECharTraitsA
108 typedef char RECHARTYPE
;
// Returns the bit-field index for the character at *sz: the character is
// converted to unsigned char and then widened to size_t.
110 static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE
*sz
) throw()
// Unless ATL_NO_CHECK_BIT_FIELD is defined, assert that this traits class
// reports UseBitFieldForRange().
112 #ifndef ATL_NO_CHECK_BIT_FIELD
113 ATLASSERT(UseBitFieldForRange());
// The unsigned char cast is applied first, so a char with its high bit set
// is not sign-extended when widened to size_t (relevant where char is signed).
115 return static_cast<size_t>(static_cast<unsigned char>(*sz
));
// Returns a pointer to the element immediately after sz (sz+1).
// The C-style cast drops the const qualifier of the incoming pointer.
117 static RECHARTYPE
*Next(const RECHARTYPE
*sz
) throw()
119 return (RECHARTYPE
*) (sz
+1);
122 static int Strncmp(const RECHARTYPE
*szLeft
, const RECHARTYPE
*szRight
, size_t nCount
) throw()
124 return strncmp(szLeft
, szRight
, nCount
);
127 static int Strnicmp(const RECHARTYPE
*szLeft
, const RECHARTYPE
*szRight
, size_t nCount
) throw()
129 return _strnicmp(szLeft
, szRight
, nCount
);
132 _ATL_INSECURE_DEPRECATE("CAtlRECharTraitsA::Strlwr must be passed a buffer size.")
133 static RECHARTYPE
*Strlwr(RECHARTYPE
*sz
) throw()
135 #pragma warning (push)
136 #pragma warning(disable : 4996)
138 #pragma warning (pop)
141 static RECHARTYPE
*Strlwr(RECHARTYPE
*sz
, int nSize
) throw()
143 Checked::strlwr_s(sz
, nSize
);
147 static long Strtol(const RECHARTYPE
*sz
, RECHARTYPE
**szEnd
, int nBase
) throw()
149 return strtol(sz
, szEnd
, nBase
);
152 static int Isdigit(RECHARTYPE ch
) throw()
154 return isdigit(static_cast<unsigned char>(ch
));
157 static const RECHARTYPE
** GetAbbrevs()
159 static const RECHARTYPE
*s_szAbbrevs
[] =
161 "a([a-zA-Z0-9])", // alpha numeric
162 "b([ \\t])", // white space (blank)
163 "c([a-zA-Z])", // alpha
165 "h([0-9a-fA-F])", // hex digit
166 "n(\r|(\r?\n))", // newline
167 "q(\"[^\"]*\")|(\'[^\']*\')", // quoted string
168 "w([a-zA-Z]+)", // simple word
169 "z([0-9]+)", // integer
176 static BOOL
UseBitFieldForRange() throw()
// Returns strlen(sz) narrowed to int, i.e. the length of sz in bytes
// without the terminating null.
181 static int ByteLen(const RECHARTYPE
*sz
) throw()
183 return int(strlen(sz
));
187 class CAtlRECharTraitsW
190 typedef WCHAR RECHARTYPE
;
192 static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE
*sz
) throw()
194 #ifndef ATL_NO_CHECK_BIT_FIELD
195 ATLASSERT(UseBitFieldForRange());
197 return static_cast<size_t>(*sz
);
199 static RECHARTYPE
*Next(const RECHARTYPE
*sz
) throw()
201 return (RECHARTYPE
*) (sz
+1);
204 static int Strncmp(const RECHARTYPE
*szLeft
, const RECHARTYPE
*szRight
, size_t nCount
) throw()
206 return wcsncmp(szLeft
, szRight
, nCount
);
209 static int Strnicmp(const RECHARTYPE
*szLeft
, const RECHARTYPE
*szRight
, size_t nCount
) throw()
211 return _wcsnicmp(szLeft
, szRight
, nCount
);
214 _ATL_INSECURE_DEPRECATE("CAtlRECharTraitsW::Strlwr must be passed a buffer size.")
215 static RECHARTYPE
*Strlwr(RECHARTYPE
*sz
) throw()
217 #pragma warning (push)
218 #pragma warning(disable : 4996)
220 #pragma warning (pop)
223 static RECHARTYPE
*Strlwr(RECHARTYPE
*sz
, int nSize
) throw()
225 Checked::wcslwr_s(sz
, nSize
);
229 static long Strtol(const RECHARTYPE
*sz
, RECHARTYPE
**szEnd
, int nBase
) throw()
231 return wcstol(sz
, szEnd
, nBase
);
234 static int Isdigit(RECHARTYPE ch
) throw()
239 static const RECHARTYPE
** GetAbbrevs()
241 static const RECHARTYPE
*s_szAbbrevs
[] =
243 L
"a([a-zA-Z0-9])", // alpha numeric
244 L
"b([ \\t])", // white space (blank)
245 L
"c([a-zA-Z])", // alpha
246 L
"d([0-9])", // digit
247 L
"h([0-9a-fA-F])", // hex digit
248 L
"n(\r|(\r?\n))", // newline
249 L
"q(\"[^\"]*\")|(\'[^\']*\')", // quoted string
250 L
"w([a-zA-Z]+)", // simple word
251 L
"z([0-9]+)", // integer
258 static BOOL
UseBitFieldForRange() throw()
// Returns the length of sz in bytes, without the terminating null:
// wcslen(sz) multiplied by sizeof(WCHAR), narrowed to int.
263 static int ByteLen(const RECHARTYPE
*sz
) throw()
265 return int(wcslen(sz
)*sizeof(WCHAR
));
269 class CAtlRECharTraitsMB
272 typedef unsigned char RECHARTYPE
;
274 static size_t GetBitFieldForRangeArrayIndex(const RECHARTYPE
*sz
) throw()
276 #ifndef ATL_NO_CHECK_BIT_FIELD
277 ATLASSERT(UseBitFieldForRange());
280 return static_cast<size_t>(*sz
);
283 static RECHARTYPE
*Next(const RECHARTYPE
*sz
) throw()
288 static int Strncmp(const RECHARTYPE
*szLeft
, const RECHARTYPE
*szRight
, size_t nCount
) throw()
290 return _mbsncmp(szLeft
, szRight
, nCount
);
293 static int Strnicmp(const RECHARTYPE
*szLeft
, const RECHARTYPE
*szRight
, size_t nCount
) throw()
295 return _mbsnicmp(szLeft
, szRight
, nCount
);
298 _ATL_INSECURE_DEPRECATE("CAtlRECharTraitsMB::Strlwr must be passed a buffer size.")
299 static RECHARTYPE
*Strlwr(RECHARTYPE
*sz
) throw()
301 #pragma warning (push)
302 #pragma warning(disable : 4996)
304 #pragma warning (pop)
307 static RECHARTYPE
*Strlwr(RECHARTYPE
*sz
, int nSize
) throw()
309 Checked::mbslwr_s(sz
, nSize
);
313 static long Strtol(const RECHARTYPE
*sz
, RECHARTYPE
**szEnd
, int nBase
) throw()
315 return strtol((const char *) sz
, (char **) szEnd
, nBase
);
318 static int Isdigit(RECHARTYPE ch
) throw()
320 return _ismbcdigit((unsigned int) ch
);
323 static const RECHARTYPE
** GetAbbrevs()
325 return reinterpret_cast<const RECHARTYPE
**>(CAtlRECharTraitsA::GetAbbrevs());
328 static BOOL
UseBitFieldForRange() throw()
// Returns the length of sz in bytes, without the terminating null.
// The unsigned char string is reinterpreted as const char* so that strlen
// can be used; the result is narrowed to int.
333 static int ByteLen(const RECHARTYPE
*sz
) throw()
335 return (int)strlen((const char *) sz
);
340 typedef CAtlRECharTraitsA CAtlRECharTraits
;
342 typedef CAtlRECharTraitsW CAtlRECharTraits
;
344 // Note: If you want to use CAtlRECharTraitsMB you must pass it in
345 // as a template argument
347 template <class CharTraits
=CAtlRECharTraits
>
348 class CAtlRegExp
; // forward declaration
350 template <class CharTraits
=CAtlRECharTraits
>
351 class CAtlREMatchContext
354 friend CAtlRegExp
<CharTraits
>;
355 typedef typename
CharTraits::RECHARTYPE RECHAR
;
359 const RECHAR
*szStart
;
// Copies the start and end pointers stored for match group nIndex into
// *szStart and *szEnd.
// Both output pointers and the index are checked with ATLENSURE before
// anything is written.
367 void GetMatch(UINT nIndex
, const RECHAR
**szStart
, const RECHAR
**szEnd
)
369 ATLENSURE(szStart
!= NULL
);
370 ATLENSURE(szEnd
!= NULL
);
// nIndex is a UINT, so the `>=0` half of this condition always holds; the
// effective check is nIndex < m_uNumGroups.
371 ATLENSURE(nIndex
>=0 && nIndex
< m_uNumGroups
);
372 *szStart
= m_Matches
[nIndex
].szStart
;
373 *szEnd
= m_Matches
[nIndex
].szEnd
;
// Copies the start and end pointers stored for match group nIndex into
// pGroup->szStart and pGroup->szEnd.
// pGroup and the index are checked with ATLENSURE before anything is written.
376 void GetMatch(UINT nIndex
, MatchGroup
*pGroup
)
379 ATLENSURE(pGroup
!= NULL
);
// nIndex is already a UINT, so both the `>=0` test and the static_cast are
// no-ops; the effective check is nIndex < m_uNumGroups.
380 ATLENSURE(nIndex
>=0&&(static_cast<UINT
>(nIndex
))< m_uNumGroups
);
381 pGroup
->szStart
= m_Matches
[nIndex
].szStart
;
382 pGroup
->szEnd
= m_Matches
[nIndex
].szEnd
;
386 CAutoVectorPtr
<void *> m_Mem
;
387 CAutoVectorPtr
<MatchGroup
> m_Matches
;
388 CAtlArray
<void *> m_stack
;
// Constructs a match context.
// nInitStackSize: number of entries m_stack is sized to; defaults to
// ATL_REGEXP_MIN_STACK.
// NOTE(review): the result of SetCount is not checked in the lines visible
// here.
392 CAtlREMatchContext(size_t nInitStackSize
=ATL_REGEXP_MIN_STACK
)
396 m_stack
.SetCount(nInitStackSize
);
// Start with no overall match recorded.
397 m_Match
.szStart
= NULL
;
398 m_Match
.szEnd
= NULL
;
// Prepares the context for a match run: allocates uNumGroups MatchGroup
// entries in m_Matches and uRequiredMem void* slots in m_Mem, then zero-fills
// both arrays.
// NOTE(review): the statements executed when either Allocate call fails, and
// the return values, are not visible in this view.
402 BOOL
Initialize(UINT uRequiredMem
, UINT uNumGroups
) throw()
409 if (!m_Matches
.Allocate(uNumGroups
))
412 m_uNumGroups
= uNumGroups
;
416 if (!m_Mem
.Allocate(uRequiredMem
))
// Clear every memory slot (uRequiredMem pointers).
419 memset(m_Mem
.m_p
, 0x00, uRequiredMem
*sizeof(void *));
// Clear every group so its szStart/szEnd begin as zero.
421 memset(m_Matches
, 0x00, m_uNumGroups
* sizeof(MatchGroup
));
428 if (m_stack
.GetCount() <= (UINT
) m_nTos
)
430 if (!m_stack
.SetCount((m_nTos
+1)*2))
442 return Push((void *) n
);
450 // this should never happen at match time.
451 // (the parsing succeeded when it shouldn't have)
455 void *p
= m_stack
[m_nTos
];
462 REPARSE_ERROR_OK
= 0, // No error occurred
463 REPARSE_ERROR_OUTOFMEMORY
, // Out of memory
464 REPARSE_ERROR_BRACE_EXPECTED
, // A closing brace was expected
465 REPARSE_ERROR_PAREN_EXPECTED
, // A closing parenthesis was expected
466 REPARSE_ERROR_BRACKET_EXPECTED
, // A closing bracket was expected
467 REPARSE_ERROR_UNEXPECTED
, // An unspecified fatal error occurred
468 REPARSE_ERROR_EMPTY_RANGE
, // A range expression was empty
469 REPARSE_ERROR_INVALID_GROUP
, // A backreference was made to a group
470 // that did not exist
471 REPARSE_ERROR_INVALID_RANGE
, // An invalid range was specified
472 REPARSE_ERROR_EMPTY_REPEATOP
, // A possibly empty * or + was detected
473 REPARSE_ERROR_INVALID_INPUT
, // The input string was invalid
476 template <class CharTraits
/* =CAtlRECharTraits */>
484 m_bCaseSensitive
= TRUE
;
485 m_LastError
= REPARSE_ERROR_OK
;
488 typedef typename
CharTraits::RECHARTYPE RECHAR
;
491 // Parses the regular expression
492 // returns REPARSE_ERROR_OK if successful, an REParseError otherwise
493 REParseError
Parse(const RECHAR
*szRE
, BOOL bCaseSensitive
=TRUE
)
497 return REPARSE_ERROR_INVALID_INPUT
;
501 m_bCaseSensitive
= bCaseSensitive
;
503 const RECHAR
*szInput
= szRE
;
508 int nSize
= CharTraits::ByteLen(szRE
)+sizeof(RECHAR
);
509 szInput
= (const RECHAR
*) malloc(nSize
);
511 return REPARSE_ERROR_OUTOFMEMORY
;
513 Checked::memcpy_s((char *) szInput
, nSize
, szRE
, nSize
);
515 CharTraits::Strlwr(const_cast<RECHAR
*>(szInput
), nSize
/sizeof(RECHAR
));
517 const RECHAR
*sz
= szInput
;
519 int nCall
= AddInstruction(RE_CALL
);
521 return REPARSE_ERROR_OUTOFMEMORY
;
525 if (AddInstruction(RE_FAIL
) < 0)
526 return REPARSE_ERROR_OUTOFMEMORY
;
531 if (AddInstruction(RE_ADVANCE
) < 0)
532 return REPARSE_ERROR_OUTOFMEMORY
;
536 ParseRE(&sz
, bEmpty
);
537 if (!GetLastParseError())
539 GetInstruction(nCall
).call
.nTarget
= 2;
541 if (AddInstruction(RE_MATCH
) < 0)
542 return REPARSE_ERROR_OUTOFMEMORY
;
546 free((void *) szInput
);
548 return GetLastParseError();
551 BOOL
Match(const RECHAR
*szIn
, CAtlREMatchContext
<CharTraits
> *pContext
, const RECHAR
**ppszEnd
=NULL
)
556 if (!szIn
|| !pContext
)
562 const RECHAR
*szInput
= szIn
;
564 if (!m_bCaseSensitive
)
566 int nSize
= CharTraits::ByteLen(szIn
)+sizeof(RECHAR
);
567 szInput
= (const RECHAR
*) malloc(nSize
);
571 Checked::memcpy_s((char *) szInput
, nSize
, szIn
, nSize
);
572 CharTraits::Strlwr(const_cast<RECHAR
*>(szInput
), nSize
/sizeof(RECHAR
));
575 if (!pContext
->Initialize(m_uRequiredMem
, m_uNumGroups
))
578 free((void *) szInput
);
584 const RECHAR
*sz
= szInput
;
585 const RECHAR
*szCurrInput
= szInput
;
587 #pragma warning(push)
588 #pragma warning(disable:4127) // conditional expression is constant
593 OnDebugEvent(ip
, szInput
, sz
, pContext
);
596 pContext
->m_Match
.szStart
= sz
;
598 switch (GetInstruction(ip
).type
)
605 if (GetInstruction(ip
).symbol
.nSymbol
== static_cast<size_t>(static_cast<_TUCHAR
>(*sz
)))
607 sz
= CharTraits::Next(sz
);
612 ip
= (size_t) pContext
->Pop();
619 sz
= CharTraits::Next(sz
);
624 ip
= (size_t) pContext
->Pop();
629 pContext
->m_Matches
[GetInstruction(ip
).group
.nGroup
].szStart
= sz
;
634 pContext
->m_Matches
[GetInstruction(ip
).group
.nGroup
].szEnd
= sz
;
638 case RE_PUSH_CHARPOS
:
639 pContext
->Push((void *) sz
);
644 sz
= (RECHAR
*) pContext
->Pop();
649 pContext
->Push(ip
+1);
650 ip
= GetInstruction(ip
).call
.nTarget
;
654 ip
= GetInstruction(ip
).jmp
.nTarget
;
658 ip
= (size_t) pContext
->Pop();
662 pContext
->Push((void *) (pContext
->m_Mem
[GetInstruction(ip
).memory
.nIndex
]));
667 pContext
->m_Mem
[GetInstruction(ip
).memory
.nIndex
] = pContext
->Pop();
671 case RE_STORE_CHARPOS
:
672 pContext
->m_Mem
[GetInstruction(ip
).memory
.nIndex
] = (void *) sz
;
677 sz
= (RECHAR
*) pContext
->m_Mem
[GetInstruction(ip
).memory
.nIndex
];
681 case RE_STORE_STACKPOS
:
682 pContext
->m_Mem
[GetInstruction(ip
).memory
.nIndex
] = (void *) pContext
->m_nTos
;
686 case RE_GET_STACKPOS
:
687 pContext
->m_nTos
= (size_t) pContext
->m_Mem
[GetInstruction(ip
).memory
.nIndex
];
692 if (sz
== (RECHAR
*) pContext
->m_Mem
[GetInstruction(ip
).memory
.nIndex
])
695 ip
= (size_t) pContext
->Pop();
702 sz
= CharTraits::Next(szCurrInput
);
707 pContext
->m_nTos
= 0;
717 ip
= (size_t) pContext
->Pop();
721 RECHAR
*pBits
= reinterpret_cast<RECHAR
*>((&m_Instructions
[ip
]+1));
722 size_t u
= CharTraits::GetBitFieldForRangeArrayIndex(sz
);
723 if (pBits
[u
>> 3] & 1 << (u
& 0x7))
725 ip
+= InstructionsPerRangeBitField();
727 sz
= CharTraits::Next(sz
);
731 ip
= (size_t) pContext
->Pop();
740 ip
= (size_t) pContext
->Pop();
744 RECHAR
*pBits
= reinterpret_cast<RECHAR
*>((&m_Instructions
[ip
]+1));
745 size_t u
= static_cast<size_t>(static_cast<_TUCHAR
>(* ((RECHAR
*) sz
)));
746 if (pBits
[u
>> 3] & 1 << (u
& 0x7))
748 ip
= (size_t) pContext
->Pop();
752 ip
+= InstructionsPerRangeBitField();
754 sz
= CharTraits::Next(sz
);
763 ip
= (size_t) pContext
->Pop();
768 size_t inEnd
= GetInstruction(ip
).range
.nTarget
;
773 if (static_cast<size_t>(static_cast<_TUCHAR
>(*sz
)) >= GetInstruction(ip
).memory
.nIndex
&&
774 static_cast<size_t>(static_cast<_TUCHAR
>(*sz
)) <= GetInstruction(ip
+1).memory
.nIndex
)
776 // if we match, we jump to the end
777 sz
= CharTraits::Next(sz
);
788 ip
= (size_t) pContext
->Pop();
797 ip
= (size_t) pContext
->Pop();
802 size_t inEnd
= GetInstruction(ip
).range
.nTarget
;
807 if (static_cast<size_t>(static_cast<_TUCHAR
>(*sz
)) >= GetInstruction(ip
).memory
.nIndex
&&
808 static_cast<size_t>(static_cast<_TUCHAR
>(*sz
)) <= GetInstruction(ip
+1).memory
.nIndex
)
810 ip
= (size_t) pContext
->Pop();
816 // if we match, we jump to the end
821 sz
= CharTraits::Next(sz
);
828 if (m_bCaseSensitive
)
830 bMatch
= !CharTraits::Strncmp(sz
, pContext
->m_Matches
[GetInstruction(ip
).prev
.nGroup
].szStart
,
831 pContext
->m_Matches
[GetInstruction(ip
).prev
.nGroup
].szEnd
-pContext
->m_Matches
[GetInstruction(ip
).prev
.nGroup
].szStart
);
835 bMatch
= !CharTraits::Strnicmp(sz
, pContext
->m_Matches
[GetInstruction(ip
).prev
.nGroup
].szStart
,
836 pContext
->m_Matches
[GetInstruction(ip
).prev
.nGroup
].szEnd
-pContext
->m_Matches
[GetInstruction(ip
).prev
.nGroup
].szStart
);
840 sz
+= pContext
->m_Matches
[GetInstruction(ip
).prev
.nGroup
].szEnd
-pContext
->m_Matches
[GetInstruction(ip
).prev
.nGroup
].szStart
;
844 ip
= (size_t) pContext
->Pop();
849 pContext
->m_Match
.szEnd
= sz
;
850 if (!m_bCaseSensitive
)
851 FixupMatchContext(pContext
, szIn
, szInput
);
853 *ppszEnd
= szIn
+ (sz
- szInput
);
855 free((void *) szInput
);
860 pContext
->Push((void *) pContext
->m_Matches
[GetInstruction(ip
).group
.nGroup
].szStart
);
861 pContext
->Push((void *) pContext
->m_Matches
[GetInstruction(ip
).group
.nGroup
].szEnd
);
866 pContext
->m_Matches
[GetInstruction(ip
).group
.nGroup
].szEnd
= (const RECHAR
*) pContext
->Pop();
867 pContext
->m_Matches
[GetInstruction(ip
).group
.nGroup
].szStart
= (const RECHAR
*) pContext
->Pop();
877 #pragma warning(pop) // 4127
881 pContext
->m_Match
.szEnd
= sz
;
882 if (!m_bCaseSensitive
)
883 FixupMatchContext(pContext
, szIn
, szInput
);
885 *ppszEnd
= szIn
+ (sz
- szInput
);
887 free((void *) szInput
);
892 REParseError m_LastError
;
894 REParseError
GetLastParseError() throw()
899 void SetLastParseError(REParseError Error
) throw()
904 // Removes all instructions to allow reparsing into the same instance
907 m_Instructions
.RemoveAll();
909 m_bCaseSensitive
= TRUE
;
911 SetLastParseError(REPARSE_ERROR_OK
);
915 enum REInstructionType
{
951 struct INSTRUCTION_SYMBOL
956 struct INSTRUCTION_JMP
961 struct INSTRUCTION_GROUP
966 struct INSTRUCTION_CALL
971 struct INSTRUCTION_MEMORY
976 struct INSTRUCTION_PREVIOUS
981 struct INSTRUCTION_RANGE_EX
988 REInstructionType type
;
991 INSTRUCTION_SYMBOL symbol
;
993 INSTRUCTION_GROUP group
;
994 INSTRUCTION_CALL call
;
995 INSTRUCTION_MEMORY memory
;
996 INSTRUCTION_PREVIOUS prev
;
997 INSTRUCTION_RANGE_EX range
;
// Returns how many INSTRUCTION-sized slots are needed to hold a 256-bit
// (256/8 = 32 byte) bit field: 32 / sizeof(INSTRUCTION), plus one more slot
// when 32 is not an exact multiple of sizeof(INSTRUCTION).
1001 inline int InstructionsPerRangeBitField() throw()
1003 return (256/8) / sizeof(INSTRUCTION
) + (((256/8) % sizeof(INSTRUCTION
)) ? 1 : 0);
1006 CAtlArray
<INSTRUCTION
> m_Instructions
;
1009 UINT m_uRequiredMem
;
1010 BOOL m_bCaseSensitive
;
1013 // class used internally to restore
1014 // parsing state when unwinding
1018 int m_nNumInstructions
;
1020 UINT m_uRequiredMem
;
// Takes a snapshot of pRegExp's parsing state: the current instruction count
// (narrowed to int), the group count and the required-memory count.
1022 CParseState(CAtlRegExp
*pRegExp
) throw()
1024 m_nNumInstructions
= (int) pRegExp
->m_Instructions
.GetCount();
1025 m_uNumGroups
= pRegExp
->m_uNumGroups
;
1026 m_uRequiredMem
= pRegExp
->m_uRequiredMem
;
// Writes the saved snapshot back into pRegExp: resizes its instruction array
// to the saved count and restores the saved group and required-memory counts.
// NOTE(review): the result of SetCount is not checked in the lines visible
// here.
1029 void Restore(CAtlRegExp
*pRegExp
)
1031 pRegExp
->m_Instructions
.SetCount(m_nNumInstructions
);
1032 pRegExp
->m_uNumGroups
= m_uNumGroups
;
1033 pRegExp
->m_uRequiredMem
= m_uRequiredMem
;
// Appends one instruction of the given type to m_Instructions and returns
// the index of the new instruction.
// If the array cannot be grown, REPARSE_ERROR_OUTOFMEMORY is recorded through
// SetLastParseError.
// NOTE(review): the value returned on that failure path is not visible here;
// callers in this file test the result with `< 0` - confirm.
1037 int AddInstruction(REInstructionType type
)
1039 if (!m_Instructions
.SetCount(m_Instructions
.GetCount()+1))
1041 SetLastParseError(REPARSE_ERROR_OUTOFMEMORY
);
// The new instruction is the last element; only its type field is set here.
1045 m_Instructions
[m_Instructions
.GetCount()-1].type
= type
;
1046 return (int) m_Instructions
.GetCount()-1;
1049 BOOL
PeekToken(const RECHAR
**ppszRE
, int ch
) throw()
// Checks for the token ch at *ppszRE with PeekToken and advances *ppszRE by
// one character using CharTraits::Next.
// NOTE(review): the statement guarded by the `if` and the return values are
// not visible here; presumably the function returns early without advancing
// when PeekToken fails - confirm.
1056 BOOL
MatchToken(const RECHAR
**ppszRE
, int ch
) throw()
1058 if (!PeekToken(ppszRE
, ch
))
1060 *ppszRE
= CharTraits::Next(*ppszRE
);
// Returns a mutable reference to the instruction stored at nIndex in
// m_Instructions. No range check on nIndex is performed in this function.
1064 INSTRUCTION
&GetInstruction(size_t nIndex
) throw()
1066 return m_Instructions
[nIndex
];
1069 // ParseArg: parse grammar rule Arg
1070 int ParseArg(const RECHAR
**ppszRE
, bool &bEmpty
)
1072 int nPushGroup
= AddInstruction(RE_PUSH_GROUP
);
1076 GetInstruction(nPushGroup
).group
.nGroup
= m_uNumGroups
;
1078 int p
= AddInstruction(RE_GROUP_START
);
1081 GetInstruction(p
).group
.nGroup
= m_uNumGroups
++;
1083 int nCall
= AddInstruction(RE_CALL
);
1087 int nPopGroup
= AddInstruction(RE_POP_GROUP
);
1090 GetInstruction(nPopGroup
).group
.nGroup
= GetInstruction(nPushGroup
).group
.nGroup
;
1092 if (AddInstruction(RE_RETURN
) < 0)
1095 int nAlt
= ParseRE(ppszRE
, bEmpty
);
1098 if (GetLastParseError())
1101 if (!PeekToken(ppszRE
, '}'))
1103 SetLastParseError(REPARSE_ERROR_BRACE_EXPECTED
);
1107 // in the case of an empty group, we add a nop
1108 nAlt
= AddInstruction(RE_NOP
);
1113 GetInstruction(nCall
).call
.nTarget
= nAlt
;
1115 if (!MatchToken(ppszRE
, '}'))
1117 SetLastParseError(REPARSE_ERROR_BRACE_EXPECTED
);
1121 int nEnd
= AddInstruction(RE_GROUP_END
);
1124 GetInstruction(nEnd
).group
.nGroup
= GetInstruction(p
).group
.nGroup
;
1128 // ParseGroup: parse grammar rule Group
1129 int ParseGroup(const RECHAR
**ppszRE
, bool &bEmpty
)
1131 int nCall
= AddInstruction(RE_CALL
);
1135 if (AddInstruction(RE_RETURN
) < 0)
1138 int nAlt
= ParseRE(ppszRE
, bEmpty
);
1141 if (GetLastParseError())
1144 if (!PeekToken(ppszRE
, ')'))
1146 SetLastParseError(REPARSE_ERROR_PAREN_EXPECTED
);
1150 // in the case of an empty group, we add a nop
1151 nAlt
= AddInstruction(RE_NOP
);
1156 GetInstruction(nCall
).call
.nTarget
= nAlt
;
1158 if (!MatchToken(ppszRE
, ')'))
1160 SetLastParseError(REPARSE_ERROR_PAREN_EXPECTED
);
1167 RECHAR
GetEscapedChar(RECHAR ch
) throw()
1174 // ParseCharItem: parse grammar rule CharItem
1175 int ParseCharItem(const RECHAR
**ppszRE
, RECHAR
*pchStartChar
, RECHAR
*pchEndChar
) throw()
1177 if (**ppszRE
== '\\')
1179 *ppszRE
= CharTraits::Next(*ppszRE
);
1180 *pchStartChar
= GetEscapedChar(**ppszRE
);
1183 *pchStartChar
= **ppszRE
;
1184 *ppszRE
= CharTraits::Next(*ppszRE
);
1186 if (!MatchToken(ppszRE
, '-'))
1188 *pchEndChar
= *pchStartChar
;
1192 // check for unterminated range
1193 if (!**ppszRE
|| PeekToken(ppszRE
, ']'))
1195 SetLastParseError(REPARSE_ERROR_BRACKET_EXPECTED
);
1199 *pchEndChar
= **ppszRE
;
1200 *ppszRE
= CharTraits::Next(*ppszRE
);
1202 if (*pchEndChar
< *pchStartChar
)
1204 SetLastParseError(REPARSE_ERROR_INVALID_RANGE
);
// Grows m_Instructions by nNumInstructions entries in a single SetCount call.
// If the array cannot be grown, REPARSE_ERROR_OUTOFMEMORY is recorded through
// SetLastParseError.
// NOTE(review): the return statements are not visible here; the caller in
// ParseCharSet tests the result with `< 0` - confirm.
1210 int AddInstructions(int nNumInstructions
)
// Remember the count before growing.
1212 size_t nCurr
= m_Instructions
.GetCount();
1213 if (!m_Instructions
.SetCount(nCurr
+nNumInstructions
))
1215 SetLastParseError(REPARSE_ERROR_OUTOFMEMORY
);
1221 // ParseCharSet: parse grammar rule CharSet
1222 int ParseCharSet(const RECHAR
**ppszRE
, BOOL bNot
)
1226 unsigned char *pBits
= NULL
;
1228 if (CharTraits::UseBitFieldForRange())
1230 // we use a bit field to represent the characters
1231 // a 1 bit means match against the character
1232 // the upper 5 bits of the character (ch >> 3) are used as an index into
1233 // the byte array, and the lower 3 bits (ch & 0x7)
1234 // are used to index into the selected byte
1236 p
= AddInstruction(bNot
? RE_NOTRANGE
: RE_RANGE
);
1240 // add the required space to hold the character
1241 // set. We use one bit per character for ansi
1242 if (AddInstructions(InstructionsPerRangeBitField()) < 0)
1245 pBits
= (unsigned char *) (&m_Instructions
[p
+1]);
1246 memset(pBits
, 0x00, 256/8);
1250 p
= AddInstruction(bNot
? RE_NOTRANGE_EX
: RE_RANGE_EX
);
1258 while (**ppszRE
&& **ppszRE
!= ']')
1260 if (ParseCharItem(ppszRE
, &chStart
, &chEnd
))
1263 if (CharTraits::UseBitFieldForRange())
1265 for (int i
=chStart
; i
<=chEnd
; i
++)
1266 pBits
[i
>> 3] |= 1 << (i
& 0x7);
1270 int nStart
= AddInstruction(RE_NOP
);
1274 int nEnd
= AddInstruction(RE_NOP
);
1278 GetInstruction(nStart
).memory
.nIndex
= (int) chStart
;
1279 GetInstruction(nEnd
).memory
.nIndex
= (int) chEnd
;
1283 if (!CharTraits::UseBitFieldForRange())
1284 GetInstruction(p
).range
.nTarget
= m_Instructions
.GetCount();
1289 // ParseCharClass: parse grammar rule CharClass
1290 int ParseCharClass(const RECHAR
**ppszRE
, bool &bEmpty
)
1293 if (MatchToken(ppszRE
, ']'))
1295 SetLastParseError(REPARSE_ERROR_EMPTY_RANGE
);
1300 if (MatchToken(ppszRE
, '^'))
1303 if (MatchToken(ppszRE
, ']'))
1305 SetLastParseError(REPARSE_ERROR_EMPTY_RANGE
);
1309 int p
= ParseCharSet(ppszRE
, bNot
);
1312 if (!MatchToken(ppszRE
, ']'))
1314 SetLastParseError(REPARSE_ERROR_BRACKET_EXPECTED
);
// Adds an instruction of the given type through AddInstruction and gives it
// the next free memory-slot index.
// NOTE(review): any check of p and the return statement are not visible here.
1321 int AddMemInstruction(REInstructionType type
)
1323 int p
= AddInstruction(type
);
// Post-increment: the instruction gets the current value of m_uRequiredMem,
// and the counter then advances.
1326 GetInstruction(p
).memory
.nIndex
= m_uRequiredMem
++;
1330 // helper for parsing !SE
1331 int ParseNot(const RECHAR
**ppszRE
, bool &bEmpty
)
1333 int nStoreCP
= AddMemInstruction(RE_STORE_CHARPOS
);
1334 int nStoreSP
= AddMemInstruction(RE_STORE_STACKPOS
);
1336 int nCall
= AddInstruction(RE_CALL
);
1340 int nGetCP
= AddInstruction(RE_GET_CHARPOS
);
1343 GetInstruction(nGetCP
).memory
.nIndex
= GetInstruction(nStoreCP
).memory
.nIndex
;
1345 int nGetSP
= AddInstruction(RE_GET_STACKPOS
);
1348 GetInstruction(nGetSP
).memory
.nIndex
= GetInstruction(nStoreSP
).memory
.nIndex
;
1350 int nJmp
= AddInstruction(RE_JMP
);
1354 int nSE
= ParseSE(ppszRE
, bEmpty
);
1359 GetInstruction(nCall
).call
.nTarget
= nSE
;
1361 int nGetCP1
= AddInstruction(RE_GET_CHARPOS
);
1364 GetInstruction(nGetCP1
).memory
.nIndex
= GetInstruction(nStoreCP
).memory
.nIndex
;
1366 int nGetSP1
= AddInstruction(RE_GET_STACKPOS
);
1369 GetInstruction(nGetSP1
).memory
.nIndex
= GetInstruction(nStoreSP
).memory
.nIndex
;
1371 int nRet
= AddInstruction(RE_RETURN
);
1375 GetInstruction(nJmp
).jmp
.nTarget
= nRet
+1;
1380 // ParseAbbrev: parse grammar rule Abbrev
1381 int ParseAbbrev(const RECHAR
**ppszRE
, bool &bEmpty
)
1383 const RECHAR
**szAbbrevs
= CharTraits::GetAbbrevs();
1387 if (**ppszRE
== **szAbbrevs
)
1389 const RECHAR
*szAbbrev
= (*szAbbrevs
)+1;
1390 int p
= ParseE(&szAbbrev
, bEmpty
);
1393 SetLastParseError(REPARSE_ERROR_UNEXPECTED
);
1396 *ppszRE
= CharTraits::Next(*ppszRE
);
1404 // ParseSE: parse grammar rule SE (simple expression)
1405 int ParseSE(const RECHAR
**ppszRE
, bool &bEmpty
)
1408 if (MatchToken(ppszRE
, '{'))
1409 return ParseArg(ppszRE
, bEmpty
);
1410 if (MatchToken(ppszRE
, '('))
1411 return ParseGroup(ppszRE
, bEmpty
);
1412 if (MatchToken(ppszRE
, '['))
1413 return ParseCharClass(ppszRE
, bEmpty
);
1415 if (MatchToken(ppszRE
, '\\'))
1417 if (!CharTraits::Isdigit(**ppszRE
))
1419 // check for abbreviations
1421 p
= ParseAbbrev(ppszRE
, bEmpty
);
1425 if (GetLastParseError())
1429 p
= AddInstruction(RE_SYMBOL
);
1432 GetInstruction(p
).symbol
.nSymbol
= (int) **ppszRE
;
1433 *ppszRE
= CharTraits::Next(*ppszRE
);
1438 int nPrev
= AddInstruction(RE_PREVIOUS
);
1442 UINT uValue
= (UINT
) CharTraits::Strtol(*ppszRE
, (RECHAR
**) ppszRE
, 10);
1443 if (uValue
>= m_uNumGroups
)
1445 SetLastParseError(REPARSE_ERROR_INVALID_GROUP
);
1448 GetInstruction(nPrev
).prev
.nGroup
= (size_t) uValue
;
1452 if (MatchToken(ppszRE
, '!'))
1453 return ParseNot(ppszRE
, bEmpty
);
1455 if (**ppszRE
== '}' || **ppszRE
== ']' || **ppszRE
== ')')
1460 if (**ppszRE
== '\0')
1466 if (**ppszRE
== '.')
1468 p
= AddInstruction(RE_ANY
);
1473 else if (**ppszRE
== '$' && (*ppszRE
)[1] == '\0')
1475 p
= AddInstruction(RE_SYMBOL
);
1478 GetInstruction(p
).symbol
.nSymbol
= 0;
1483 p
= AddInstruction(RE_SYMBOL
);
1486 GetInstruction(p
).symbol
.nSymbol
= (int) **ppszRE
;
1489 *ppszRE
= CharTraits::Next(*ppszRE
);
1493 // ParseE: parse grammar rule E (expression)
1494 int ParseE(const RECHAR
**ppszRE
, bool &bEmpty
)
1496 CParseState
ParseState(this);
1497 const RECHAR
*sz
= *ppszRE
;
1501 int nFirst
= ParseSE(ppszRE
, bEmpty
);
1505 REInstructionType type
= RE_MATCH
;
1507 if (MatchToken(ppszRE
, '*'))
1508 if(MatchToken(ppszRE
, '?'))
1509 type
= RE_NG_STAR_BEGIN
;
1511 type
= RE_STAR_BEGIN
;
1514 else if (MatchToken(ppszRE
, '+'))
1515 if(MatchToken(ppszRE
, '?'))
1520 else if (MatchToken(ppszRE
, '?'))
1521 if(MatchToken(ppszRE
, '?'))
1522 type
= RE_NG_QUESTION
;
1527 if (type
== RE_MATCH
)
1530 if (type
== RE_STAR_BEGIN
|| type
== RE_QUESTION
|| type
== RE_NG_STAR_BEGIN
|| type
== RE_NG_QUESTION
)
1532 ParseState
.Restore(this);
1536 m_uNumGroups
= ParseState
.m_uNumGroups
;
1542 if (type
== RE_NG_STAR_BEGIN
|| type
== RE_NG_PLUS
|| type
== RE_NG_QUESTION
) // Non-Greedy
1544 int nCall
= AddInstruction(RE_CALL
);
1550 nSE
= ParseSE(ppszRE
, bEmpty
);
1554 if (bEmpty
&& (type
== RE_NG_STAR_BEGIN
|| type
== RE_NG_PLUS
))
1556 SetLastParseError(REPARSE_ERROR_EMPTY_REPEATOP
);
1561 *ppszRE
= CharTraits::Next(*ppszRE
);
1562 *ppszRE
= CharTraits::Next(*ppszRE
);
1564 if (type
== RE_NG_STAR_BEGIN
|| type
== RE_NG_PLUS
)
1566 int nJmp
= AddInstruction(RE_JMP
);
1569 GetInstruction(nCall
).call
.nTarget
= nJmp
+1;
1570 GetInstruction(nJmp
).jmp
.nTarget
= nCall
;
1573 GetInstruction(nCall
).call
.nTarget
= nSE
+1;
1575 if (type
== RE_NG_PLUS
)
1583 int nPushMem
= AddInstruction(RE_PUSH_MEMORY
);
1587 int nStore
= AddInstruction(RE_STORE_CHARPOS
);
1591 if (AddInstruction(RE_PUSH_CHARPOS
) < 0)
1594 int nCall
= AddInstruction(RE_CALL
);
1598 if (AddInstruction(RE_POP_CHARPOS
) < 0)
1601 int nPopMem
= AddInstruction(RE_POP_MEMORY
);
1605 int nJmp
= AddInstruction(RE_JMP
);
1609 GetInstruction(nPushMem
).memory
.nIndex
= m_uRequiredMem
++;
1610 GetInstruction(nStore
).memory
.nIndex
= GetInstruction(nPushMem
).memory
.nIndex
;
1611 GetInstruction(nCall
).call
.nTarget
= nJmp
+1;
1612 GetInstruction(nPopMem
).memory
.nIndex
= GetInstruction(nPushMem
).memory
.nIndex
;
1616 nSE
= ParseSE(ppszRE
, bEmpty
);
1620 if (bEmpty
&& (type
== RE_STAR_BEGIN
|| type
== RE_PLUS
))
1622 SetLastParseError(REPARSE_ERROR_EMPTY_REPEATOP
);
1626 if (type
!= RE_PLUS
&& type
!= RE_NG_PLUS
)
1629 *ppszRE
= CharTraits::Next(*ppszRE
);
1632 int nRetNoMatch
= AddInstruction(RE_RET_NOMATCH
);
1633 if (nRetNoMatch
< 0)
1636 int nStore1
= AddInstruction(RE_STORE_CHARPOS
);
1640 GetInstruction(nRetNoMatch
).memory
.nIndex
= GetInstruction(nPushMem
).memory
.nIndex
;
1641 GetInstruction(nStore1
).memory
.nIndex
= GetInstruction(nPushMem
).memory
.nIndex
;
1643 if (type
!= RE_QUESTION
)
1645 int nJmp1
= AddInstruction(RE_JMP
);
1648 GetInstruction(nJmp1
).jmp
.nTarget
= nPushMem
;
1651 GetInstruction(nJmp
).jmp
.nTarget
= m_Instructions
.GetCount();
1652 if (type
== RE_PLUS
)
1662 // ParseAltE: parse grammar rule AltE
1663 int ParseAltE(const RECHAR
**ppszRE
, bool &bEmpty
)
1665 const RECHAR
*sz
= *ppszRE
;
1666 CParseState
ParseState(this);
1668 int nPush
= AddInstruction(RE_PUSH_CHARPOS
);
1672 int nCall
= AddInstruction(RE_CALL
);
1676 GetInstruction(nCall
).call
.nTarget
= nPush
+4;
1677 if (AddInstruction(RE_POP_CHARPOS
) < 0)
1680 int nJmpNext
= AddInstruction(RE_JMP
);
1684 int nE
= ParseE(ppszRE
, bEmpty
);
1687 if (GetLastParseError())
1689 ParseState
.Restore(this);
1693 int nJmpEnd
= AddInstruction(RE_JMP
);
1697 GetInstruction(nJmpNext
).jmp
.nTarget
= nJmpEnd
+1;
1699 if (!MatchToken(ppszRE
, '|'))
1701 ParseState
.Restore(this);
1704 return ParseE(ppszRE
, bEmpty
);
1708 int nAltE
= ParseAltE(ppszRE
, bEmptyAltE
);
1709 GetInstruction(nJmpEnd
).jmp
.nTarget
= m_Instructions
.GetCount();
1710 GetInstruction(nJmpNext
).jmp
.nTarget
= nAltE
;
1713 if (GetLastParseError())
1715 ParseState
.Restore(this);
1718 bEmpty
= bEmpty
| bEmptyAltE
;
1722 // ParseRE: parse grammar rule RE (regular expression)
1723 int ParseRE(const RECHAR
**ppszRE
, bool &bEmpty
)
1725 if (**ppszRE
== '\0')
1728 int p
= ParseAltE(ppszRE
, bEmpty
);
1732 bool bEmptyRE
= true;
1733 ParseRE(ppszRE
, bEmptyRE
);
1734 if (GetLastParseError())
1736 bEmpty
= bEmpty
&& bEmptyRE
;
1740 //The pointers to the matched string and matched groups currently point into an internally allocated
1741 //buffer that holds a copy of the input string.
1742 //This function fixes these pointers to point into the original, user-supplied buffer (first param to Match method).
1743 //Example: If a ptr (szStart) currently points to <internal buffer>+3, it is fixed to <user supplied buffer>+3
// Rebases every pointer recorded in pContext from the buffer starting at
// szNew onto the buffer starting at szOrig, keeping each pointer's offset.
//   pContext - match context whose m_Match and m_Matches entries are rewritten
//   szOrig   - start of the buffer the pointers should refer to afterwards
//   szNew    - start of the buffer the pointers currently refer to
// Match calls this with (szIn, szInput) when m_bCaseSensitive is false.
1744 void FixupMatchContext(CAtlREMatchContext
<CharTraits
> *pContext
, const RECHAR
*szOrig
, const RECHAR
*szNew
)
1746 ATLENSURE(pContext
);
// Overall match: new pointer = szOrig + (old pointer - szNew).
1750 pContext
->m_Match
.szStart
= szOrig
+ (pContext
->m_Match
.szStart
- szNew
);
1751 pContext
->m_Match
.szEnd
= szOrig
+ (pContext
->m_Match
.szEnd
- szNew
);
// Apply the same rebasing to each match group.
1752 for (UINT i
=0; i
<pContext
->m_uNumGroups
; i
++)
// A group with a NULL start or end is left as it is.
1754 if (pContext
->m_Matches
[i
].szStart
==NULL
|| pContext
->m_Matches
[i
].szEnd
==NULL
)
1756 continue; //Do not fix unmatched groups.
1758 pContext
->m_Matches
[i
].szStart
= szOrig
+ (pContext
->m_Matches
[i
].szStart
- szNew
);
1759 pContext
->m_Matches
[i
].szEnd
= szOrig
+ (pContext
->m_Matches
[i
].szEnd
- szNew
);
1763 // helpers for dumping and debugging the rx engine
1765 #ifdef ATL_REGEXP_DUMP
1766 size_t DumpInstruction(size_t ip
)
1768 printf("%08x ", ip
);
1769 switch (GetInstruction(ip
).type
)
1777 AtlprintfT
<RECHAR
>(CAToREChar
<RECHAR
>("Symbol %c\n"),GetInstruction(ip
).symbol
.nSymbol
);
1789 ip
+= InstructionsPerRangeBitField();
1793 printf("NOT Range\n");
1795 ip
+= InstructionsPerRangeBitField();
1799 printf("RangeEx %08x\n", GetInstruction(ip
).range
.nTarget
);
1803 case RE_NOTRANGE_EX
:
1804 printf("NotRangeEx %08x\n", GetInstruction(ip
).range
.nTarget
);
1808 case RE_GROUP_START
:
1809 printf("Start group %d\n", GetInstruction(ip
).group
.nGroup
);
1814 printf("Group end %d\n", GetInstruction(ip
).group
.nGroup
);
1818 case RE_PUSH_CHARPOS
:
1819 printf("Push char pos\n");
1823 case RE_POP_CHARPOS
:
1824 printf("Pop char pos\n");
1828 case RE_STORE_CHARPOS
:
1829 printf("Store char pos %d\n", GetInstruction(ip
).memory
.nIndex
);
1833 case RE_GET_CHARPOS
:
1834 printf("Get char pos %d\n", GetInstruction(ip
).memory
.nIndex
);
1838 case RE_STORE_STACKPOS
:
1839 printf("Store stack pos %d\n", GetInstruction(ip
).memory
.nIndex
);
1843 case RE_GET_STACKPOS
:
1844 printf("Get stack pos %d\n", GetInstruction(ip
).memory
.nIndex
);
1849 printf("Call %08x\n", GetInstruction(ip
).call
.nTarget
);
1854 printf("Jump %08x\n", GetInstruction(ip
).jmp
.nTarget
);
1863 case RE_PUSH_MEMORY
:
1864 printf("Push memory %08x\n", GetInstruction(ip
).memory
.nIndex
);
1869 printf("Pop memory %08x\n", GetInstruction(ip
).memory
.nIndex
);
1873 case RE_RET_NOMATCH
:
1874 printf("Return no match %08x\n", GetInstruction(ip
).memory
.nIndex
);
1884 printf("ADVANCE\n");
1894 printf("Prev %d\n", GetInstruction(ip
).prev
.nGroup
);
1899 printf("Push group %d\n", GetInstruction(ip
).group
.nGroup
);
1904 printf("Pop group %d\n", GetInstruction(ip
).group
.nGroup
);
1917 void Dump(size_t ipCurrent
= 0)
1921 while (ip
< m_Instructions
.GetCount())
1923 if (ip
== ipCurrent
)
1925 ip
= DumpInstruction(ip
);
1931 void cls( HANDLE hConsole
)
1933 COORD coordScreen
= { 0, 0 }; /* here's where we'll home the
1936 DWORD cCharsWritten
;
1937 CONSOLE_SCREEN_BUFFER_INFO csbi
; /* to get buffer info */
1938 DWORD dwConSize
; /* number of character cells in
1939 the current buffer */
1941 /* get the number of character cells in the current buffer */
1943 bSuccess
= GetConsoleScreenBufferInfo( hConsole
, &csbi
);
1944 dwConSize
= csbi
.dwSize
.X
* csbi
.dwSize
.Y
;
1946 /* fill the entire screen with blanks */
1948 bSuccess
= FillConsoleOutputCharacter( hConsole
, (TCHAR
) ' ',
1949 dwConSize
, coordScreen
, &cCharsWritten
);
1951 /* get the current text attribute */
1953 bSuccess
= GetConsoleScreenBufferInfo( hConsole
, &csbi
);
1955 /* now set the buffer's attributes accordingly */
1957 bSuccess
= FillConsoleOutputAttribute( hConsole
, csbi
.wAttributes
,
1958 dwConSize
, coordScreen
, &cCharsWritten
);
1960 /* put the cursor at (0, 0) */
1962 bSuccess
= SetConsoleCursorPosition( hConsole
, coordScreen
);
1966 void DumpStack(CAtlREMatchContext
<CharTraits
> *pContext
)
1968 for (size_t i
=pContext
->m_nTos
; i
>0; i
--)
1970 if (pContext
->m_stack
[i
] < (void *) m_Instructions
.GetCount())
1971 printf("0x%p\n", pContext
->m_stack
[i
]);
1974 // assume a pointer into the input
1975 AtlprintfT
<RECHAR
>(CAToREChar
<RECHAR
>("%s\n"), pContext
->m_stack
[i
]);
1980 void DumpMemory(CAtlREMatchContext
<CharTraits
> *pContext
)
1982 for (UINT i
=0; i
<m_uRequiredMem
; i
++)
1984 AtlprintfT
<RECHAR
>(CAToREChar
<RECHAR
>("%d: %s\n"), i
, pContext
->m_Mem
.m_p
[i
]);
1988 virtual void OnDebugEvent(size_t ip
, const RECHAR
*szIn
, const RECHAR
*sz
, CAtlREMatchContext
<CharTraits
> *pContext
)
1990 cls(GetStdHandle(STD_OUTPUT_HANDLE
));
1991 printf("----------Code---------\n");
1993 printf("----------Input---------\n");
1994 AtlprintfT
<RECHAR
>(CAToREChar
<RECHAR
>("%s\n"), szIn
);
1995 for (int s
=0; szIn
+s
< sz
; s
++)
2000 printf("----------Memory---------\n");
2001 DumpMemory(pContext
);
2002 printf("----------Stack---------\n");
2003 DumpStack(pContext
);
2013 #endif // __ATLRX_H__