2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
38 #pragma error_messages (off,E_END_OF_LOOP_CODE_NOT_REACHED)
43 static char copyright
[] =
44 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
47 #include "structmember.h" /* offsetof */
53 /* name of this module, minus the leading underscore */
54 #if !defined(SRE_MODULE)
55 #define SRE_MODULE "sre"
58 /* defining this one enables tracing */
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
68 /* -------------------------------------------------------------------- */
69 /* optional features */
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
74 /* enables aggressive inlining (always on for Visual C) */
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
84 /* -------------------------------------------------------------------- */
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
94 #define LOCAL(type) static type
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
104 #define TRACE(v) printf v
109 /* -------------------------------------------------------------------- */
110 /* search engine state */
112 /* default character predicates (run sre_chars.py to regenerate tables) */
114 #define SRE_DIGIT_MASK 1
115 #define SRE_SPACE_MASK 2
116 #define SRE_LINEBREAK_MASK 4
117 #define SRE_ALNUM_MASK 8
118 #define SRE_WORD_MASK 16
120 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
122 static char sre_char_info
[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
123 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
125 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
126 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
127 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
128 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130 static char sre_char_lower
[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
131 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
132 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
133 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
134 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
135 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
136 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
137 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
138 120, 121, 122, 123, 124, 125, 126, 127 };
140 #define SRE_IS_DIGIT(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
142 #define SRE_IS_SPACE(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
144 #define SRE_IS_LINEBREAK(ch)\
145 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
146 #define SRE_IS_ALNUM(ch)\
147 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
148 #define SRE_IS_WORD(ch)\
149 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151 static unsigned int sre_lower(unsigned int ch
)
153 return ((ch
) < 128 ? (unsigned int)sre_char_lower
[ch
] : ch
);
156 /* locale-specific character predicates */
157 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
158 * warnings when c's type supports only numbers < N+1 */
159 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
160 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
161 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
162 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
163 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165 static unsigned int sre_lower_locale(unsigned int ch
)
167 return ((ch
) < 256 ? (unsigned int)tolower((ch
)) : ch
);
170 /* unicode-specific character predicates */
172 #if defined(HAVE_UNICODE)
174 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180 static unsigned int sre_lower_unicode(unsigned int ch
)
182 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE
)(ch
));
188 sre_category(SRE_CODE category
, unsigned int ch
)
192 case SRE_CATEGORY_DIGIT
:
193 return SRE_IS_DIGIT(ch
);
194 case SRE_CATEGORY_NOT_DIGIT
:
195 return !SRE_IS_DIGIT(ch
);
196 case SRE_CATEGORY_SPACE
:
197 return SRE_IS_SPACE(ch
);
198 case SRE_CATEGORY_NOT_SPACE
:
199 return !SRE_IS_SPACE(ch
);
200 case SRE_CATEGORY_WORD
:
201 return SRE_IS_WORD(ch
);
202 case SRE_CATEGORY_NOT_WORD
:
203 return !SRE_IS_WORD(ch
);
204 case SRE_CATEGORY_LINEBREAK
:
205 return SRE_IS_LINEBREAK(ch
);
206 case SRE_CATEGORY_NOT_LINEBREAK
:
207 return !SRE_IS_LINEBREAK(ch
);
209 case SRE_CATEGORY_LOC_WORD
:
210 return SRE_LOC_IS_WORD(ch
);
211 case SRE_CATEGORY_LOC_NOT_WORD
:
212 return !SRE_LOC_IS_WORD(ch
);
214 #if defined(HAVE_UNICODE)
215 case SRE_CATEGORY_UNI_DIGIT
:
216 return SRE_UNI_IS_DIGIT(ch
);
217 case SRE_CATEGORY_UNI_NOT_DIGIT
:
218 return !SRE_UNI_IS_DIGIT(ch
);
219 case SRE_CATEGORY_UNI_SPACE
:
220 return SRE_UNI_IS_SPACE(ch
);
221 case SRE_CATEGORY_UNI_NOT_SPACE
:
222 return !SRE_UNI_IS_SPACE(ch
);
223 case SRE_CATEGORY_UNI_WORD
:
224 return SRE_UNI_IS_WORD(ch
);
225 case SRE_CATEGORY_UNI_NOT_WORD
:
226 return !SRE_UNI_IS_WORD(ch
);
227 case SRE_CATEGORY_UNI_LINEBREAK
:
228 return SRE_UNI_IS_LINEBREAK(ch
);
229 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
230 return !SRE_UNI_IS_LINEBREAK(ch
);
232 case SRE_CATEGORY_UNI_DIGIT
:
233 return SRE_IS_DIGIT(ch
);
234 case SRE_CATEGORY_UNI_NOT_DIGIT
:
235 return !SRE_IS_DIGIT(ch
);
236 case SRE_CATEGORY_UNI_SPACE
:
237 return SRE_IS_SPACE(ch
);
238 case SRE_CATEGORY_UNI_NOT_SPACE
:
239 return !SRE_IS_SPACE(ch
);
240 case SRE_CATEGORY_UNI_WORD
:
241 return SRE_LOC_IS_WORD(ch
);
242 case SRE_CATEGORY_UNI_NOT_WORD
:
243 return !SRE_LOC_IS_WORD(ch
);
244 case SRE_CATEGORY_UNI_LINEBREAK
:
245 return SRE_IS_LINEBREAK(ch
);
246 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
247 return !SRE_IS_LINEBREAK(ch
);
256 data_stack_dealloc(SRE_STATE
* state
)
258 if (state
->data_stack
) {
259 free(state
->data_stack
);
260 state
->data_stack
= NULL
;
262 state
->data_stack_size
= state
->data_stack_base
= 0;
266 data_stack_grow(SRE_STATE
* state
, int size
)
268 int minsize
, cursize
;
269 minsize
= state
->data_stack_base
+size
;
270 cursize
= state
->data_stack_size
;
271 if (cursize
< minsize
) {
273 cursize
= minsize
+minsize
/4+1024;
274 TRACE(("allocate/grow stack %d\n", cursize
));
275 stack
= realloc(state
->data_stack
, cursize
);
277 data_stack_dealloc(state
);
278 return SRE_ERROR_MEMORY
;
280 state
->data_stack
= stack
;
281 state
->data_stack_size
= cursize
;
286 /* generate 8-bit version */
288 #define SRE_CHAR unsigned char
289 #define SRE_AT sre_at
290 #define SRE_COUNT sre_count
291 #define SRE_CHARSET sre_charset
292 #define SRE_INFO sre_info
293 #define SRE_MATCH sre_match
294 #define SRE_MATCH_CONTEXT sre_match_context
295 #define SRE_SEARCH sre_search
296 #define SRE_LITERAL_TEMPLATE sre_literal_template
298 #if defined(HAVE_UNICODE)
300 #define SRE_RECURSIVE
304 #undef SRE_LITERAL_TEMPLATE
307 #undef SRE_MATCH_CONTEXT
314 /* generate 16-bit unicode version */
316 #define SRE_CHAR Py_UNICODE
317 #define SRE_AT sre_uat
318 #define SRE_COUNT sre_ucount
319 #define SRE_CHARSET sre_ucharset
320 #define SRE_INFO sre_uinfo
321 #define SRE_MATCH sre_umatch
322 #define SRE_MATCH_CONTEXT sre_umatch_context
323 #define SRE_SEARCH sre_usearch
324 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
327 #endif /* SRE_RECURSIVE */
329 /* -------------------------------------------------------------------- */
330 /* String matching engine */
332 /* the following section is compiled twice, with different character
336 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
338 /* check if pointer is at given position */
344 case SRE_AT_BEGINNING
:
345 case SRE_AT_BEGINNING_STRING
:
346 return ((void*) ptr
== state
->beginning
);
348 case SRE_AT_BEGINNING_LINE
:
349 return ((void*) ptr
== state
->beginning
||
350 SRE_IS_LINEBREAK((int) ptr
[-1]));
353 return (((void*) (ptr
+1) == state
->end
&&
354 SRE_IS_LINEBREAK((int) ptr
[0])) ||
355 ((void*) ptr
== state
->end
));
357 case SRE_AT_END_LINE
:
358 return ((void*) ptr
== state
->end
||
359 SRE_IS_LINEBREAK((int) ptr
[0]));
361 case SRE_AT_END_STRING
:
362 return ((void*) ptr
== state
->end
);
364 case SRE_AT_BOUNDARY
:
365 if (state
->beginning
== state
->end
)
367 that
= ((void*) ptr
> state
->beginning
) ?
368 SRE_IS_WORD((int) ptr
[-1]) : 0;
369 this = ((void*) ptr
< state
->end
) ?
370 SRE_IS_WORD((int) ptr
[0]) : 0;
373 case SRE_AT_NON_BOUNDARY
:
374 if (state
->beginning
== state
->end
)
376 that
= ((void*) ptr
> state
->beginning
) ?
377 SRE_IS_WORD((int) ptr
[-1]) : 0;
378 this = ((void*) ptr
< state
->end
) ?
379 SRE_IS_WORD((int) ptr
[0]) : 0;
382 case SRE_AT_LOC_BOUNDARY
:
383 if (state
->beginning
== state
->end
)
385 that
= ((void*) ptr
> state
->beginning
) ?
386 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
387 this = ((void*) ptr
< state
->end
) ?
388 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
391 case SRE_AT_LOC_NON_BOUNDARY
:
392 if (state
->beginning
== state
->end
)
394 that
= ((void*) ptr
> state
->beginning
) ?
395 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
396 this = ((void*) ptr
< state
->end
) ?
397 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
400 #if defined(HAVE_UNICODE)
401 case SRE_AT_UNI_BOUNDARY
:
402 if (state
->beginning
== state
->end
)
404 that
= ((void*) ptr
> state
->beginning
) ?
405 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
406 this = ((void*) ptr
< state
->end
) ?
407 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
410 case SRE_AT_UNI_NON_BOUNDARY
:
411 if (state
->beginning
== state
->end
)
413 that
= ((void*) ptr
> state
->beginning
) ?
414 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
415 this = ((void*) ptr
< state
->end
) ?
416 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
426 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
428 /* check if character is a member of the given set */
439 /* <LITERAL> <code> */
445 case SRE_OP_CATEGORY
:
446 /* <CATEGORY> <code> */
447 if (sre_category(set
[0], (int) ch
))
453 if (sizeof(SRE_CODE
) == 2) {
454 /* <CHARSET> <bitmap> (16 bits per code word) */
455 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
460 /* <CHARSET> <bitmap> (32 bits per code word) */
461 if (ch
< 256 && (set
[ch
>> 5] & (1 << (ch
& 31))))
468 /* <RANGE> <lower> <upper> */
469 if (set
[0] <= ch
&& ch
<= set
[1])
478 case SRE_OP_BIGCHARSET
:
479 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
484 if (sizeof(SRE_CODE
) == 2) {
485 block
= ((unsigned char*)set
)[ch
>> 8];
487 if (set
[block
*16 + ((ch
& 255)>>4)] & (1 << (ch
& 15)))
492 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
493 * warnings when c's type supports only numbers < N+1 */
495 block
= ((unsigned char*)set
)[ch
>> 8];
500 (set
[block
*8 + ((ch
& 255)>>5)] & (1 << (ch
& 31))))
508 /* internal error -- there's not much we can do about it
509 here, so let's just pretend it didn't match... */
515 LOCAL(int) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
);
518 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, int maxcount
)
521 SRE_CHAR
* ptr
= state
->ptr
;
522 SRE_CHAR
* end
= state
->end
;
526 if (maxcount
< end
- ptr
&& maxcount
!= 65535)
527 end
= ptr
+ maxcount
;
529 switch (pattern
[0]) {
533 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
534 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
539 /* repeated dot wildcard. */
540 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
541 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
546 /* repeated dot wildcare. skip to the end of the target
547 string, and backtrack from there */
548 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
553 /* repeated literal */
555 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
556 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
560 case SRE_OP_LITERAL_IGNORE
:
561 /* repeated literal */
563 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
564 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
568 case SRE_OP_NOT_LITERAL
:
569 /* repeated non-literal */
571 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
572 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
576 case SRE_OP_NOT_LITERAL_IGNORE
:
577 /* repeated non-literal */
579 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
580 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
585 /* repeated single character pattern */
586 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
587 while ((SRE_CHAR
*) state
->ptr
< end
) {
588 i
= SRE_MATCH(state
, pattern
);
594 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
,
595 (SRE_CHAR
*) state
->ptr
- ptr
));
596 return (SRE_CHAR
*) state
->ptr
- ptr
;
599 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
, ptr
- (SRE_CHAR
*) state
->ptr
));
600 return ptr
- (SRE_CHAR
*) state
->ptr
;
603 #if 0 /* not used in this release */
605 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
607 /* check if an SRE_OP_INFO block matches at the current position.
608 returns the number of SRE_CODE objects to skip if successful, 0
611 SRE_CHAR
* end
= state
->end
;
612 SRE_CHAR
* ptr
= state
->ptr
;
615 /* check minimal length */
616 if (pattern
[3] && (end
- ptr
) < pattern
[3])
619 /* check known prefix */
620 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
621 /* <length> <skip> <prefix data> <overlap data> */
622 for (i
= 0; i
< pattern
[5]; i
++)
623 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
625 return pattern
[0] + 2 * pattern
[6];
631 /* The macros below should be used to protect recursive SRE_MATCH()
632 * calls that *failed* and do *not* return immediately (IOW, those
633 * that will backtrack). Explaining:
635 * - Recursive SRE_MATCH() returned true: that's usually a success
636 * (besides atypical cases like ASSERT_NOT), therefore there's no
637 * reason to restore lastmark;
639 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
640 * is returning to the caller: If the current SRE_MATCH() is the
641 * top function of the recursion, returning false will be a matching
642 * failure, and it doesn't matter where lastmark is pointing to.
643 * If it's *not* the top function, it will be a recursive SRE_MATCH()
644 * failure by itself, and the calling SRE_MATCH() will have to deal
645 * with the failure by the same rules explained here (it will restore
646 * lastmark by itself if necessary);
648 * - Recursive SRE_MATCH() returned false, and will continue the
649 * outside 'for' loop: must be protected when breaking, since the next
650 * OP could potentially depend on lastmark;
652 * - Recursive SRE_MATCH() returned false, and will be called again
653 * inside a local for/while loop: must be protected between each
654 * loop iteration, since the recursive SRE_MATCH() could do anything,
655 * and could potentially depend on lastmark.
657 * For more information, check the discussion at SF patch #712900.
659 #define LASTMARK_SAVE() \
661 ctx->lastmark = state->lastmark; \
662 ctx->lastindex = state->lastindex; \
664 #define LASTMARK_RESTORE() \
666 state->lastmark = ctx->lastmark; \
667 state->lastindex = ctx->lastindex; \
670 #define RETURN_ERROR(i) do { return i; } while(0)
671 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
672 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
674 #define RETURN_ON_ERROR(i) \
675 do { if (i < 0) RETURN_ERROR(i); } while (0)
676 #define RETURN_ON_SUCCESS(i) \
677 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
678 #define RETURN_ON_FAILURE(i) \
679 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
683 #define DATA_STACK_ALLOC(state, type, ptr) \
685 alloc_pos = state->data_stack_base; \
686 TRACE(("allocating %s in %d (%d)\n", \
687 SFY(type), alloc_pos, sizeof(type))); \
688 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
689 int j = data_stack_grow(state, sizeof(type)); \
690 if (j < 0) return j; \
692 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
694 ptr = (type*)(state->data_stack+alloc_pos); \
695 state->data_stack_base += sizeof(type); \
698 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
700 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
701 ptr = (type*)(state->data_stack+pos); \
704 #define DATA_STACK_PUSH(state, data, size) \
706 TRACE(("copy data in %p to %d (%d)\n", \
707 data, state->data_stack_base, size)); \
708 if (state->data_stack_size < state->data_stack_base+size) { \
709 int j = data_stack_grow(state, size); \
710 if (j < 0) return j; \
712 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
714 memcpy(state->data_stack+state->data_stack_base, data, size); \
715 state->data_stack_base += size; \
718 #define DATA_STACK_POP(state, data, size, discard) \
720 TRACE(("copy data to %p from %d (%d)\n", \
721 data, state->data_stack_base-size, size)); \
722 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
724 state->data_stack_base -= size; \
727 #define DATA_STACK_POP_DISCARD(state, size) \
729 TRACE(("discard data from %d (%d)\n", \
730 state->data_stack_base-size, size)); \
731 state->data_stack_base -= size; \
734 #define DATA_PUSH(x) \
735 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
736 #define DATA_POP(x) \
737 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
738 #define DATA_POP_DISCARD(x) \
739 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
740 #define DATA_ALLOC(t,p) \
741 DATA_STACK_ALLOC(state, t, p)
742 #define DATA_LOOKUP_AT(t,p,pos) \
743 DATA_STACK_LOOKUP_AT(state,t,p,pos)
745 #define MARK_PUSH(lastmark) \
746 do if (lastmark > 0) { \
747 i = lastmark; /* ctx->lastmark may change if reallocated */ \
748 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
750 #define MARK_POP(lastmark) \
751 do if (lastmark > 0) { \
752 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
754 #define MARK_POP_KEEP(lastmark) \
755 do if (lastmark > 0) { \
756 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
758 #define MARK_POP_DISCARD(lastmark) \
759 do if (lastmark > 0) { \
760 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
764 #define JUMP_MAX_UNTIL_1 1
765 #define JUMP_MAX_UNTIL_2 2
766 #define JUMP_MAX_UNTIL_3 3
767 #define JUMP_MIN_UNTIL_1 4
768 #define JUMP_MIN_UNTIL_2 5
769 #define JUMP_MIN_UNTIL_3 6
770 #define JUMP_REPEAT 7
771 #define JUMP_REPEAT_ONE_1 8
772 #define JUMP_REPEAT_ONE_2 9
773 #define JUMP_MIN_REPEAT_ONE 10
774 #define JUMP_BRANCH 11
775 #define JUMP_ASSERT 12
776 #define JUMP_ASSERT_NOT 13
778 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
779 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
780 nextctx->last_ctx_pos = ctx_pos; \
781 nextctx->jump = jumpvalue; \
782 nextctx->pattern = nextpattern; \
783 ctx_pos = alloc_pos; \
787 while (0) /* gcc doesn't like labels at end of scopes */ \
803 /* check if string matches the given pattern. returns <0 for
804 error, 0 for failure, and 1 for success */
806 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
808 SRE_CHAR
* end
= state
->end
;
809 int alloc_pos
, ctx_pos
= -1;
813 SRE_MATCH_CONTEXT
* ctx
;
814 SRE_MATCH_CONTEXT
* nextctx
;
816 TRACE(("|%p|%p|ENTER\n", pattern
, state
->ptr
));
818 DATA_ALLOC(SRE_MATCH_CONTEXT
, ctx
);
819 ctx
->last_ctx_pos
= -1;
820 ctx
->jump
= JUMP_NONE
;
821 ctx
->pattern
= pattern
;
826 ctx
->ptr
= state
->ptr
;
828 if (ctx
->pattern
[0] == SRE_OP_INFO
) {
829 /* optimization info block */
830 /* <INFO> <1=skip> <2=flags> <3=min> ... */
831 if (ctx
->pattern
[3] && (end
- ctx
->ptr
) < ctx
->pattern
[3]) {
832 TRACE(("reject (got %d chars, need %d)\n",
833 (end
- ctx
->ptr
), ctx
->pattern
[3]));
836 ctx
->pattern
+= ctx
->pattern
[1] + 1;
841 switch (*ctx
->pattern
++) {
846 TRACE(("|%p|%p|MARK %d\n", ctx
->pattern
,
847 ctx
->ptr
, ctx
->pattern
[0]));
850 state
->lastindex
= i
/2 + 1;
851 if (i
> state
->lastmark
) {
852 /* state->lastmark is the highest valid index in the
853 state->mark array. If it is increased by more than 1,
854 the intervening marks must be set to NULL to signal
855 that these marks have not been encountered. */
856 int j
= state
->lastmark
+ 1;
858 state
->mark
[j
++] = NULL
;
861 state
->mark
[i
] = ctx
->ptr
;
866 /* match literal string */
867 /* <LITERAL> <code> */
868 TRACE(("|%p|%p|LITERAL %d\n", ctx
->pattern
,
869 ctx
->ptr
, *ctx
->pattern
));
870 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] != ctx
->pattern
[0])
876 case SRE_OP_NOT_LITERAL
:
877 /* match anything that is not literal character */
878 /* <NOT_LITERAL> <code> */
879 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx
->pattern
,
880 ctx
->ptr
, *ctx
->pattern
));
881 if (ctx
->ptr
>= end
|| (SRE_CODE
) ctx
->ptr
[0] == ctx
->pattern
[0])
889 TRACE(("|%p|%p|SUCCESS\n", ctx
->pattern
, ctx
->ptr
));
890 state
->ptr
= ctx
->ptr
;
894 /* match at given position */
896 TRACE(("|%p|%p|AT %d\n", ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
897 if (!SRE_AT(state
, ctx
->ptr
, *ctx
->pattern
))
902 case SRE_OP_CATEGORY
:
903 /* match at given category */
904 /* <CATEGORY> <code> */
905 TRACE(("|%p|%p|CATEGORY %d\n", ctx
->pattern
,
906 ctx
->ptr
, *ctx
->pattern
));
907 if (ctx
->ptr
>= end
|| !sre_category(ctx
->pattern
[0], ctx
->ptr
[0]))
914 /* match anything (except a newline) */
916 TRACE(("|%p|%p|ANY\n", ctx
->pattern
, ctx
->ptr
));
917 if (ctx
->ptr
>= end
|| SRE_IS_LINEBREAK(ctx
->ptr
[0]))
925 TRACE(("|%p|%p|ANY_ALL\n", ctx
->pattern
, ctx
->ptr
));
932 /* match set member (or non_member) */
933 /* <IN> <skip> <set> */
934 TRACE(("|%p|%p|IN\n", ctx
->pattern
, ctx
->ptr
));
935 if (ctx
->ptr
>= end
|| !SRE_CHARSET(ctx
->pattern
+ 1, *ctx
->ptr
))
937 ctx
->pattern
+= ctx
->pattern
[0];
941 case SRE_OP_LITERAL_IGNORE
:
942 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
943 ctx
->pattern
, ctx
->ptr
, ctx
->pattern
[0]));
944 if (ctx
->ptr
>= end
||
945 state
->lower(*ctx
->ptr
) != state
->lower(*ctx
->pattern
))
951 case SRE_OP_NOT_LITERAL_IGNORE
:
952 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
953 ctx
->pattern
, ctx
->ptr
, *ctx
->pattern
));
954 if (ctx
->ptr
>= end
||
955 state
->lower(*ctx
->ptr
) == state
->lower(*ctx
->pattern
))
961 case SRE_OP_IN_IGNORE
:
962 TRACE(("|%p|%p|IN_IGNORE\n", ctx
->pattern
, ctx
->ptr
));
964 || !SRE_CHARSET(ctx
->pattern
+1,
965 (SRE_CODE
)state
->lower(*ctx
->ptr
)))
967 ctx
->pattern
+= ctx
->pattern
[0];
974 /* <JUMP> <offset> */
975 TRACE(("|%p|%p|JUMP %d\n", ctx
->pattern
,
976 ctx
->ptr
, ctx
->pattern
[0]));
977 ctx
->pattern
+= ctx
->pattern
[0];
982 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
983 TRACE(("|%p|%p|BRANCH\n", ctx
->pattern
, ctx
->ptr
));
985 ctx
->u
.rep
= state
->repeat
;
987 MARK_PUSH(ctx
->lastmark
);
988 for (; ctx
->pattern
[0]; ctx
->pattern
+= ctx
->pattern
[0]) {
989 if (ctx
->pattern
[1] == SRE_OP_LITERAL
&&
991 (SRE_CODE
) *ctx
->ptr
!= ctx
->pattern
[2]))
993 if (ctx
->pattern
[1] == SRE_OP_IN
&&
995 !SRE_CHARSET(ctx
->pattern
+ 3, (SRE_CODE
) *ctx
->ptr
)))
997 state
->ptr
= ctx
->ptr
;
998 DO_JUMP(JUMP_BRANCH
, jump_branch
, ctx
->pattern
+1);
1001 MARK_POP_DISCARD(ctx
->lastmark
);
1002 RETURN_ON_ERROR(ret
);
1006 MARK_POP_KEEP(ctx
->lastmark
);
1010 MARK_POP_DISCARD(ctx
->lastmark
);
1013 case SRE_OP_REPEAT_ONE
:
1014 /* match repeated sequence (maximizing regexp) */
1016 /* this operator only works if the repeated item is
1017 exactly one character wide, and we're not already
1018 collecting backtracking points. for other cases,
1019 use the MAX_REPEAT operator */
1021 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1023 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1024 ctx
->pattern
[1], ctx
->pattern
[2]));
1026 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1027 RETURN_FAILURE
; /* cannot match */
1029 state
->ptr
= ctx
->ptr
;
1031 ctx
->count
= SRE_COUNT(state
, ctx
->pattern
+3, ctx
->pattern
[2]);
1032 RETURN_ON_ERROR(ctx
->count
);
1034 ctx
->ptr
+= ctx
->count
;
1036 /* when we arrive here, count contains the number of
1037 matches, and ctx->ptr points to the tail of the target
1038 string. check if the rest of the pattern matches,
1039 and backtrack if not. */
1041 if (ctx
->count
< (int) ctx
->pattern
[1])
1044 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1045 /* tail is empty. we're finished */
1046 state
->ptr
= ctx
->ptr
;
1052 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_LITERAL
) {
1053 /* tail starts with a literal. skip positions where
1054 the rest of the pattern cannot possibly match */
1055 ctx
->u
.chr
= ctx
->pattern
[ctx
->pattern
[0]+1];
1057 while (ctx
->count
>= (int) ctx
->pattern
[1] &&
1058 (ctx
->ptr
>= end
|| *ctx
->ptr
!= ctx
->u
.chr
)) {
1062 if (ctx
->count
< (int) ctx
->pattern
[1])
1064 state
->ptr
= ctx
->ptr
;
1065 DO_JUMP(JUMP_REPEAT_ONE_1
, jump_repeat_one_1
,
1066 ctx
->pattern
+ctx
->pattern
[0]);
1068 RETURN_ON_ERROR(ret
);
1080 while (ctx
->count
>= (int) ctx
->pattern
[1]) {
1081 state
->ptr
= ctx
->ptr
;
1082 DO_JUMP(JUMP_REPEAT_ONE_2
, jump_repeat_one_2
,
1083 ctx
->pattern
+ctx
->pattern
[0]);
1085 RETURN_ON_ERROR(ret
);
1095 case SRE_OP_MIN_REPEAT_ONE
:
1096 /* match repeated sequence (minimizing regexp) */
1098 /* this operator only works if the repeated item is
1099 exactly one character wide, and we're not already
1100 collecting backtracking points. for other cases,
1101 use the MIN_REPEAT operator */
1103 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1105 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx
->pattern
, ctx
->ptr
,
1106 ctx
->pattern
[1], ctx
->pattern
[2]));
1108 if (ctx
->ptr
+ ctx
->pattern
[1] > end
)
1109 RETURN_FAILURE
; /* cannot match */
1111 state
->ptr
= ctx
->ptr
;
1113 if (ctx
->pattern
[1] == 0)
1116 /* count using pattern min as the maximum */
1117 ctx
->count
= SRE_COUNT(state
, ctx
->pattern
+3,
1119 RETURN_ON_ERROR(ctx
->count
);
1120 if (ctx
->count
< (int) ctx
->pattern
[1])
1121 /* didn't match minimum number of times */
1123 /* advance past minimum matches of repeat */
1124 ctx
->ptr
+= ctx
->count
;
1127 if (ctx
->pattern
[ctx
->pattern
[0]] == SRE_OP_SUCCESS
) {
1128 /* tail is empty. we're finished */
1129 state
->ptr
= ctx
->ptr
;
1135 while ((int)ctx
->pattern
[2] == 65535
1136 || ctx
->count
<= (int)ctx
->pattern
[2]) {
1137 state
->ptr
= ctx
->ptr
;
1138 DO_JUMP(JUMP_MIN_REPEAT_ONE
,jump_min_repeat_one
,
1139 ctx
->pattern
+ctx
->pattern
[0]);
1141 RETURN_ON_ERROR(ret
);
1144 state
->ptr
= ctx
->ptr
;
1145 ret
= SRE_COUNT(state
, ctx
->pattern
+3, 1);
1146 RETURN_ON_ERROR(ret
);
1158 /* create repeat context. all the hard work is done
1159 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1160 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1161 TRACE(("|%p|%p|REPEAT %d %d\n", ctx
->pattern
, ctx
->ptr
,
1162 ctx
->pattern
[1], ctx
->pattern
[2]));
1164 /* install new repeat context */
1165 ctx
->u
.rep
= (SRE_REPEAT
*) malloc(sizeof(*ctx
->u
.rep
));
1166 ctx
->u
.rep
->count
= -1;
1167 ctx
->u
.rep
->pattern
= ctx
->pattern
;
1168 ctx
->u
.rep
->prev
= state
->repeat
;
1169 ctx
->u
.rep
->last_ptr
= NULL
;
1170 state
->repeat
= ctx
->u
.rep
;
1172 state
->ptr
= ctx
->ptr
;
1173 DO_JUMP(JUMP_REPEAT
, jump_repeat
, ctx
->pattern
+ctx
->pattern
[0]);
1174 state
->repeat
= ctx
->u
.rep
->prev
;
1178 RETURN_ON_ERROR(ret
);
1183 case SRE_OP_MAX_UNTIL
:
1184 /* maximizing repeat */
1185 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1187 /* FIXME: we probably need to deal with zero-width
1188 matches in here... */
1190 ctx
->u
.rep
= state
->repeat
;
1192 RETURN_ERROR(SRE_ERROR_STATE
);
1194 state
->ptr
= ctx
->ptr
;
1196 ctx
->count
= ctx
->u
.rep
->count
+1;
1198 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx
->pattern
,
1199 ctx
->ptr
, ctx
->count
));
1201 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1202 /* not enough matches */
1203 ctx
->u
.rep
->count
= ctx
->count
;
1204 DO_JUMP(JUMP_MAX_UNTIL_1
, jump_max_until_1
,
1205 ctx
->u
.rep
->pattern
+3);
1207 RETURN_ON_ERROR(ret
);
1210 ctx
->u
.rep
->count
= ctx
->count
-1;
1211 state
->ptr
= ctx
->ptr
;
1215 if ((ctx
->count
< ctx
->u
.rep
->pattern
[2] ||
1216 ctx
->u
.rep
->pattern
[2] == 65535) &&
1217 state
->ptr
!= ctx
->u
.rep
->last_ptr
) {
1218 /* we may have enough matches, but if we can
1219 match another item, do so */
1220 ctx
->u
.rep
->count
= ctx
->count
;
1222 MARK_PUSH(ctx
->lastmark
);
1223 /* zero-width match protection */
1224 DATA_PUSH(&ctx
->u
.rep
->last_ptr
);
1225 ctx
->u
.rep
->last_ptr
= state
->ptr
;
1226 DO_JUMP(JUMP_MAX_UNTIL_2
, jump_max_until_2
,
1227 ctx
->u
.rep
->pattern
+3);
1228 DATA_POP(&ctx
->u
.rep
->last_ptr
);
1230 MARK_POP_DISCARD(ctx
->lastmark
);
1231 RETURN_ON_ERROR(ret
);
1234 MARK_POP(ctx
->lastmark
);
1236 ctx
->u
.rep
->count
= ctx
->count
-1;
1237 state
->ptr
= ctx
->ptr
;
1240 /* cannot match more repeated items here. make sure the
1242 state
->repeat
= ctx
->u
.rep
->prev
;
1243 DO_JUMP(JUMP_MAX_UNTIL_3
, jump_max_until_3
, ctx
->pattern
);
1244 RETURN_ON_SUCCESS(ret
);
1245 state
->repeat
= ctx
->u
.rep
;
1246 state
->ptr
= ctx
->ptr
;
1249 case SRE_OP_MIN_UNTIL
:
1250 /* minimizing repeat */
1251 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1253 ctx
->u
.rep
= state
->repeat
;
1255 RETURN_ERROR(SRE_ERROR_STATE
);
1257 state
->ptr
= ctx
->ptr
;
1259 ctx
->count
= ctx
->u
.rep
->count
+1;
1261 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx
->pattern
,
1262 ctx
->ptr
, ctx
->count
, ctx
->u
.rep
->pattern
));
1264 if (ctx
->count
< ctx
->u
.rep
->pattern
[1]) {
1265 /* not enough matches */
1266 ctx
->u
.rep
->count
= ctx
->count
;
1267 DO_JUMP(JUMP_MIN_UNTIL_1
, jump_min_until_1
,
1268 ctx
->u
.rep
->pattern
+3);
1270 RETURN_ON_ERROR(ret
);
1273 ctx
->u
.rep
->count
= ctx
->count
-1;
1274 state
->ptr
= ctx
->ptr
;
1280 /* see if the tail matches */
1281 state
->repeat
= ctx
->u
.rep
->prev
;
1282 DO_JUMP(JUMP_MIN_UNTIL_2
, jump_min_until_2
, ctx
->pattern
);
1284 RETURN_ON_ERROR(ret
);
1288 state
->repeat
= ctx
->u
.rep
;
1289 state
->ptr
= ctx
->ptr
;
1293 if (ctx
->count
>= ctx
->u
.rep
->pattern
[2]
1294 && ctx
->u
.rep
->pattern
[2] != 65535)
1297 ctx
->u
.rep
->count
= ctx
->count
;
1298 DO_JUMP(JUMP_MIN_UNTIL_3
,jump_min_until_3
,
1299 ctx
->u
.rep
->pattern
+3);
1301 RETURN_ON_ERROR(ret
);
1304 ctx
->u
.rep
->count
= ctx
->count
-1;
1305 state
->ptr
= ctx
->ptr
;
1308 case SRE_OP_GROUPREF
:
1309 /* match backreference */
1310 TRACE(("|%p|%p|GROUPREF %d\n", ctx
->pattern
,
1311 ctx
->ptr
, ctx
->pattern
[0]));
1312 i
= ctx
->pattern
[0];
1315 if (groupref
>= state
->lastmark
) {
1318 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1319 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1320 if (!p
|| !e
|| e
< p
)
1323 if (ctx
->ptr
>= end
|| *ctx
->ptr
!= *p
)
1332 case SRE_OP_GROUPREF_IGNORE
:
1333 /* match backreference */
1334 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx
->pattern
,
1335 ctx
->ptr
, ctx
->pattern
[0]));
1336 i
= ctx
->pattern
[0];
1339 if (groupref
>= state
->lastmark
) {
1342 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1343 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1344 if (!p
|| !e
|| e
< p
)
1347 if (ctx
->ptr
>= end
||
1348 state
->lower(*ctx
->ptr
) != state
->lower(*p
))
1357 case SRE_OP_GROUPREF_EXISTS
:
1358 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx
->pattern
,
1359 ctx
->ptr
, ctx
->pattern
[0]));
1360 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1361 i
= ctx
->pattern
[0];
1364 if (groupref
>= state
->lastmark
) {
1365 ctx
->pattern
+= ctx
->pattern
[1];
1368 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[groupref
];
1369 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[groupref
+1];
1370 if (!p
|| !e
|| e
< p
) {
1371 ctx
->pattern
+= ctx
->pattern
[1];
1380 /* assert subpattern */
1381 /* <ASSERT> <skip> <back> <pattern> */
1382 TRACE(("|%p|%p|ASSERT %d\n", ctx
->pattern
,
1383 ctx
->ptr
, ctx
->pattern
[1]));
1384 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1385 if (state
->ptr
< state
->beginning
)
1387 DO_JUMP(JUMP_ASSERT
, jump_assert
, ctx
->pattern
+2);
1388 RETURN_ON_FAILURE(ret
);
1389 ctx
->pattern
+= ctx
->pattern
[0];
1392 case SRE_OP_ASSERT_NOT
:
1393 /* assert not subpattern */
1394 /* <ASSERT_NOT> <skip> <back> <pattern> */
1395 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx
->pattern
,
1396 ctx
->ptr
, ctx
->pattern
[1]));
1397 state
->ptr
= ctx
->ptr
- ctx
->pattern
[1];
1398 if (state
->ptr
>= state
->beginning
) {
1399 DO_JUMP(JUMP_ASSERT_NOT
, jump_assert_not
, ctx
->pattern
+2);
1401 RETURN_ON_ERROR(ret
);
1405 ctx
->pattern
+= ctx
->pattern
[0];
1408 case SRE_OP_FAILURE
:
1409 /* immediate failure */
1410 TRACE(("|%p|%p|FAILURE\n", ctx
->pattern
, ctx
->ptr
));
1414 TRACE(("|%p|%p|UNKNOWN %d\n", ctx
->pattern
, ctx
->ptr
,
1416 RETURN_ERROR(SRE_ERROR_ILLEGAL
);
1421 ctx_pos
= ctx
->last_ctx_pos
;
1423 DATA_POP_DISCARD(ctx
);
1426 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT
, ctx
, ctx_pos
);
1429 case JUMP_MAX_UNTIL_2
:
1430 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1431 goto jump_max_until_2
;
1432 case JUMP_MAX_UNTIL_3
:
1433 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1434 goto jump_max_until_3
;
1435 case JUMP_MIN_UNTIL_2
:
1436 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx
->pattern
, ctx
->ptr
));
1437 goto jump_min_until_2
;
1438 case JUMP_MIN_UNTIL_3
:
1439 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx
->pattern
, ctx
->ptr
));
1440 goto jump_min_until_3
;
1442 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx
->pattern
, ctx
->ptr
));
1444 case JUMP_MAX_UNTIL_1
:
1445 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1446 goto jump_max_until_1
;
1447 case JUMP_MIN_UNTIL_1
:
1448 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx
->pattern
, ctx
->ptr
));
1449 goto jump_min_until_1
;
1451 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx
->pattern
, ctx
->ptr
));
1453 case JUMP_REPEAT_ONE_1
:
1454 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx
->pattern
, ctx
->ptr
));
1455 goto jump_repeat_one_1
;
1456 case JUMP_REPEAT_ONE_2
:
1457 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx
->pattern
, ctx
->ptr
));
1458 goto jump_repeat_one_2
;
1459 case JUMP_MIN_REPEAT_ONE
:
1460 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx
->pattern
, ctx
->ptr
));
1461 goto jump_min_repeat_one
;
1463 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx
->pattern
, ctx
->ptr
));
1465 case JUMP_ASSERT_NOT
:
1466 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx
->pattern
, ctx
->ptr
));
1467 goto jump_assert_not
;
1469 TRACE(("|%p|%p|RETURN %d\n", ctx
->pattern
, ctx
->ptr
, ret
));
1473 return ret
; /* should never get here */
1477 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1479 SRE_CHAR
* ptr
= state
->start
;
1480 SRE_CHAR
* end
= state
->end
;
1483 int prefix_skip
= 0;
1484 SRE_CODE
* prefix
= NULL
;
1485 SRE_CODE
* charset
= NULL
;
1486 SRE_CODE
* overlap
= NULL
;
1489 if (pattern
[0] == SRE_OP_INFO
) {
1490 /* optimization info block */
1491 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1495 if (pattern
[3] > 1) {
1496 /* adjust end point (but make sure we leave at least one
1497 character in there, so literal search will work) */
1498 end
-= pattern
[3]-1;
1503 if (flags
& SRE_INFO_PREFIX
) {
1504 /* pattern starts with a known prefix */
1505 /* <length> <skip> <prefix data> <overlap data> */
1506 prefix_len
= pattern
[5];
1507 prefix_skip
= pattern
[6];
1508 prefix
= pattern
+ 7;
1509 overlap
= prefix
+ prefix_len
- 1;
1510 } else if (flags
& SRE_INFO_CHARSET
)
1511 /* pattern starts with a character from a known set */
1513 charset
= pattern
+ 5;
1515 pattern
+= 1 + pattern
[1];
1518 TRACE(("prefix = %p %d %d\n", prefix
, prefix_len
, prefix_skip
));
1519 TRACE(("charset = %p\n", charset
));
1521 #if defined(USE_FAST_SEARCH)
1522 if (prefix_len
> 1) {
1523 /* pattern starts with a known prefix. use the overlap
1524 table to skip forward as fast as we possibly can */
1529 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1535 if (++i
== prefix_len
) {
1536 /* found a potential match */
1537 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1538 state
->start
= ptr
+ 1 - prefix_len
;
1539 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1540 if (flags
& SRE_INFO_LITERAL
)
1541 return 1; /* we got all of it */
1542 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
);
1545 /* close but no cigar -- try again */
1558 if (pattern
[0] == SRE_OP_LITERAL
) {
1559 /* pattern starts with a literal character. this is used
1560 for short prefixes, and if fast search is disabled */
1561 SRE_CODE chr
= pattern
[1];
1564 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1568 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1571 if (flags
& SRE_INFO_LITERAL
)
1572 return 1; /* we got all of it */
1573 status
= SRE_MATCH(state
, pattern
+ 2);
1577 } else if (charset
) {
1578 /* pattern starts with a character from a known set */
1581 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1585 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1588 status
= SRE_MATCH(state
, pattern
);
1595 while (ptr
<= end
) {
1596 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1597 state
->start
= state
->ptr
= ptr
++;
1598 status
= SRE_MATCH(state
, pattern
);
1607 SRE_LITERAL_TEMPLATE(SRE_CHAR
* ptr
, int len
)
1609 /* check if given string is a literal template (i.e. no escapes) */
1616 #if !defined(SRE_RECURSIVE)
1618 /* -------------------------------------------------------------------- */
1619 /* factories and destructors */
1621 /* see sre.h for object declarations */
1623 static PyTypeObject Pattern_Type
;
1624 static PyTypeObject Match_Type
;
1625 static PyTypeObject Scanner_Type
;
1628 _compile(PyObject
* self_
, PyObject
* args
)
1630 /* "compile" pattern descriptor to pattern object */
1632 PatternObject
* self
;
1639 PyObject
* groupindex
= NULL
;
1640 PyObject
* indexgroup
= NULL
;
1641 if (!PyArg_ParseTuple(args
, "OiO!|iOO", &pattern
, &flags
,
1642 &PyList_Type
, &code
, &groups
,
1643 &groupindex
, &indexgroup
))
1646 n
= PyList_GET_SIZE(code
);
1648 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
1654 for (i
= 0; i
< n
; i
++) {
1655 PyObject
*o
= PyList_GET_ITEM(code
, i
);
1657 self
->code
[i
] = (SRE_CODE
) PyInt_AsLong(o
);
1659 self
->code
[i
] = (SRE_CODE
) PyLong_AsUnsignedLong(o
);
1662 if (PyErr_Occurred()) {
1668 self
->pattern
= pattern
;
1670 self
->flags
= flags
;
1672 self
->groups
= groups
;
1674 Py_XINCREF(groupindex
);
1675 self
->groupindex
= groupindex
;
1677 Py_XINCREF(indexgroup
);
1678 self
->indexgroup
= indexgroup
;
1680 self
->weakreflist
= NULL
;
1682 return (PyObject
*) self
;
1686 sre_codesize(PyObject
* self
, PyObject
* args
)
1688 return Py_BuildValue("i", sizeof(SRE_CODE
));
1692 sre_getlower(PyObject
* self
, PyObject
* args
)
1694 int character
, flags
;
1695 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1697 if (flags
& SRE_FLAG_LOCALE
)
1698 return Py_BuildValue("i", sre_lower_locale(character
));
1699 if (flags
& SRE_FLAG_UNICODE
)
1700 #if defined(HAVE_UNICODE)
1701 return Py_BuildValue("i", sre_lower_unicode(character
));
1703 return Py_BuildValue("i", sre_lower_locale(character
));
1705 return Py_BuildValue("i", sre_lower(character
));
1709 state_reset(SRE_STATE
* state
)
1711 /* FIXME: dynamic! */
1712 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1714 state
->lastmark
= -1;
1715 state
->lastindex
= -1;
1717 state
->repeat
= NULL
;
1719 data_stack_dealloc(state
);
1723 getstring(PyObject
* string
, int* p_length
, int* p_charsize
)
1725 /* given a python object, return a data pointer, a length (in
1726 characters), and a character size. return NULL if the object
1727 is not a string (or not compatible) */
1729 PyBufferProcs
*buffer
;
1730 int size
, bytes
, charsize
;
1733 #if defined(HAVE_UNICODE)
1734 if (PyUnicode_Check(string
)) {
1735 /* unicode strings doesn't always support the buffer interface */
1736 ptr
= (void*) PyUnicode_AS_DATA(string
);
1737 bytes
= PyUnicode_GET_DATA_SIZE(string
);
1738 size
= PyUnicode_GET_SIZE(string
);
1739 charsize
= sizeof(Py_UNICODE
);
1744 /* get pointer to string buffer */
1745 buffer
= string
->ob_type
->tp_as_buffer
;
1746 if (!buffer
|| !buffer
->bf_getreadbuffer
|| !buffer
->bf_getsegcount
||
1747 buffer
->bf_getsegcount(string
, NULL
) != 1) {
1748 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1752 /* determine buffer size */
1753 bytes
= buffer
->bf_getreadbuffer(string
, 0, &ptr
);
1755 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1759 /* determine character size */
1760 #if PY_VERSION_HEX >= 0x01060000
1761 size
= PyObject_Size(string
);
1763 size
= PyObject_Length(string
);
1766 if (PyString_Check(string
) || bytes
== size
)
1768 #if defined(HAVE_UNICODE)
1769 else if (bytes
== (int) (size
* sizeof(Py_UNICODE
)))
1770 charsize
= sizeof(Py_UNICODE
);
1773 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1777 #if defined(HAVE_UNICODE)
1782 *p_charsize
= charsize
;
1788 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1791 /* prepare state object */
1797 memset(state
, 0, sizeof(SRE_STATE
));
1799 state
->lastmark
= -1;
1800 state
->lastindex
= -1;
1802 ptr
= getstring(string
, &length
, &charsize
);
1806 /* adjust boundaries */
1809 else if (start
> length
)
1814 else if (end
> length
)
1817 state
->charsize
= charsize
;
1819 state
->beginning
= ptr
;
1821 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1822 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1825 state
->string
= string
;
1827 state
->endpos
= end
;
1829 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1830 state
->lower
= sre_lower_locale
;
1831 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1832 #if defined(HAVE_UNICODE)
1833 state
->lower
= sre_lower_unicode
;
1835 state
->lower
= sre_lower_locale
;
1838 state
->lower
= sre_lower
;
1844 state_fini(SRE_STATE
* state
)
1846 Py_XDECREF(state
->string
);
1847 data_stack_dealloc(state
);
1850 /* calculate offset from start of string */
1851 #define STATE_OFFSET(state, member)\
1852 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1855 state_getslice(SRE_STATE
* state
, int index
, PyObject
* string
, int empty
)
1859 index
= (index
- 1) * 2;
1861 if (string
== Py_None
|| index
>= state
->lastmark
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1863 /* want empty string */
1870 i
= STATE_OFFSET(state
, state
->mark
[index
]);
1871 j
= STATE_OFFSET(state
, state
->mark
[index
+1]);
1874 return PySequence_GetSlice(string
, i
, j
);
1878 pattern_error(int status
)
1881 case SRE_ERROR_RECURSION_LIMIT
:
1884 "maximum recursion limit exceeded"
1887 case SRE_ERROR_MEMORY
:
1891 /* other error codes indicate compiler/engine bugs */
1894 "internal error in regular expression engine"
1900 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
1902 /* create match object (from state object) */
1911 /* create match object (with room for extra group marks) */
1912 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
1913 2*(pattern
->groups
+1));
1918 match
->pattern
= pattern
;
1920 Py_INCREF(state
->string
);
1921 match
->string
= state
->string
;
1924 match
->groups
= pattern
->groups
+1;
1926 /* fill in group slices */
1928 base
= (char*) state
->beginning
;
1929 n
= state
->charsize
;
1931 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
1932 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
1934 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
1935 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
1936 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
1937 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
1939 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
1941 match
->pos
= state
->pos
;
1942 match
->endpos
= state
->endpos
;
1944 match
->lastindex
= state
->lastindex
;
1946 return (PyObject
*) match
;
1948 } else if (status
== 0) {
1956 /* internal error */
1957 pattern_error(status
);
1962 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
1964 /* create search state object */
1966 ScannerObject
* self
;
1971 if (!PyArg_ParseTuple(args
, "O|ii:scanner", &string
, &start
, &end
))
1974 /* create scanner object */
1975 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
1979 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
1986 self
->pattern
= (PyObject
*) pattern
;
1988 return (PyObject
*) self
;
1992 pattern_dealloc(PatternObject
* self
)
1994 if (self
->weakreflist
!= NULL
)
1995 PyObject_ClearWeakRefs((PyObject
*) self
);
1996 Py_XDECREF(self
->pattern
);
1997 Py_XDECREF(self
->groupindex
);
1998 Py_XDECREF(self
->indexgroup
);
2003 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2011 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
2012 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:match", kwlist
,
2013 &string
, &start
, &end
))
2016 string
= state_init(&state
, self
, string
, start
, end
);
2020 state
.ptr
= state
.start
;
2022 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
2024 if (state
.charsize
== 1) {
2025 status
= sre_match(&state
, PatternObject_GetCode(self
));
2027 #if defined(HAVE_UNICODE)
2028 status
= sre_umatch(&state
, PatternObject_GetCode(self
));
2032 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
2036 return pattern_new_match(self
, &state
, status
);
2040 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2048 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
2049 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:search", kwlist
,
2050 &string
, &start
, &end
))
2053 string
= state_init(&state
, self
, string
, start
, end
);
2057 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
2059 if (state
.charsize
== 1) {
2060 status
= sre_search(&state
, PatternObject_GetCode(self
));
2062 #if defined(HAVE_UNICODE)
2063 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2067 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
2071 return pattern_new_match(self
, &state
, status
);
2075 call(char* module
, char* function
, PyObject
* args
)
2084 name
= PyString_FromString(module
);
2087 mod
= PyImport_Import(name
);
2091 func
= PyObject_GetAttrString(mod
, function
);
2095 result
= PyObject_CallObject(func
, args
);
2101 #ifdef USE_BUILTIN_COPY
2103 deepcopy(PyObject
** object
, PyObject
* memo
)
2109 PyTuple_Pack(2, *object
, memo
)
2117 return 1; /* success */
2122 join_list(PyObject
* list
, PyObject
* pattern
)
2124 /* join list elements */
2127 #if PY_VERSION_HEX >= 0x01060000
2133 switch (PyList_GET_SIZE(list
)) {
2136 return PySequence_GetSlice(pattern
, 0, 0);
2138 result
= PyList_GET_ITEM(list
, 0);
2144 /* two or more elements: slice out a suitable separator from the
2145 first member, and use that to join the entire list */
2147 joiner
= PySequence_GetSlice(pattern
, 0, 0);
2151 #if PY_VERSION_HEX >= 0x01060000
2152 function
= PyObject_GetAttrString(joiner
, "join");
2157 args
= PyTuple_New(1);
2159 Py_DECREF(function
);
2163 PyTuple_SET_ITEM(args
, 0, list
);
2164 result
= PyObject_CallObject(function
, args
);
2165 Py_DECREF(args
); /* also removes list */
2166 Py_DECREF(function
);
2170 PyTuple_Pack(2, list
, joiner
)
2179 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2189 static char* kwlist
[] = { "source", "pos", "endpos", NULL
};
2190 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:findall", kwlist
,
2191 &string
, &start
, &end
))
2194 string
= state_init(&state
, self
, string
, start
, end
);
2198 list
= PyList_New(0);
2204 while (state
.start
<= state
.end
) {
2208 state_reset(&state
);
2210 state
.ptr
= state
.start
;
2212 if (state
.charsize
== 1) {
2213 status
= sre_search(&state
, PatternObject_GetCode(self
));
2215 #if defined(HAVE_UNICODE)
2216 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2223 pattern_error(status
);
2227 /* don't bother to build a match object */
2228 switch (self
->groups
) {
2230 b
= STATE_OFFSET(&state
, state
.start
);
2231 e
= STATE_OFFSET(&state
, state
.ptr
);
2232 item
= PySequence_GetSlice(string
, b
, e
);
2237 item
= state_getslice(&state
, 1, string
, 1);
2242 item
= PyTuple_New(self
->groups
);
2245 for (i
= 0; i
< self
->groups
; i
++) {
2246 PyObject
* o
= state_getslice(&state
, i
+1, string
, 1);
2251 PyTuple_SET_ITEM(item
, i
, o
);
2256 status
= PyList_Append(list
, item
);
2261 if (state
.ptr
== state
.start
)
2262 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2264 state
.start
= state
.ptr
;
2278 #if PY_VERSION_HEX >= 0x02020000
2280 pattern_finditer(PatternObject
* pattern
, PyObject
* args
)
2286 scanner
= pattern_scanner(pattern
, args
);
2290 search
= PyObject_GetAttrString(scanner
, "search");
2295 iterator
= PyCallIter_New(search
, Py_None
);
2303 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2315 static char* kwlist
[] = { "source", "maxsplit", NULL
};
2316 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|i:split", kwlist
,
2317 &string
, &maxsplit
))
2320 string
= state_init(&state
, self
, string
, 0, INT_MAX
);
2324 list
= PyList_New(0);
2333 while (!maxsplit
|| n
< maxsplit
) {
2335 state_reset(&state
);
2337 state
.ptr
= state
.start
;
2339 if (state
.charsize
== 1) {
2340 status
= sre_search(&state
, PatternObject_GetCode(self
));
2342 #if defined(HAVE_UNICODE)
2343 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2350 pattern_error(status
);
2354 if (state
.start
== state
.ptr
) {
2355 if (last
== state
.end
)
2357 /* skip one character */
2358 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2362 /* get segment before this match */
2363 item
= PySequence_GetSlice(
2364 string
, STATE_OFFSET(&state
, last
),
2365 STATE_OFFSET(&state
, state
.start
)
2369 status
= PyList_Append(list
, item
);
2374 /* add groups (if any) */
2375 for (i
= 0; i
< self
->groups
; i
++) {
2376 item
= state_getslice(&state
, i
+1, string
, 0);
2379 status
= PyList_Append(list
, item
);
2387 last
= state
.start
= state
.ptr
;
2391 /* get segment following last match (even if empty) */
2392 item
= PySequence_GetSlice(
2393 string
, STATE_OFFSET(&state
, last
), state
.endpos
2397 status
= PyList_Append(list
, item
);
2413 pattern_subx(PatternObject
* self
, PyObject
* template, PyObject
* string
,
2414 int count
, int subn
)
2426 int filter_is_callable
;
2428 if (PyCallable_Check(template)) {
2429 /* sub/subn takes either a function or a template */
2432 filter_is_callable
= 1;
2434 /* if not callable, check if it's a literal string */
2436 ptr
= getstring(template, &n
, &b
);
2439 literal
= sre_literal_template(ptr
, n
);
2441 #if defined(HAVE_UNICODE)
2442 literal
= sre_uliteral_template(ptr
, n
);
2452 filter_is_callable
= 0;
2454 /* not a literal; hand it over to the template compiler */
2456 SRE_MODULE
, "_subx",
2457 PyTuple_Pack(2, self
, template)
2461 filter_is_callable
= PyCallable_Check(filter
);
2465 string
= state_init(&state
, self
, string
, 0, INT_MAX
);
2471 list
= PyList_New(0);
2480 while (!count
|| n
< count
) {
2482 state_reset(&state
);
2484 state
.ptr
= state
.start
;
2486 if (state
.charsize
== 1) {
2487 status
= sre_search(&state
, PatternObject_GetCode(self
));
2489 #if defined(HAVE_UNICODE)
2490 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
2497 pattern_error(status
);
2501 b
= STATE_OFFSET(&state
, state
.start
);
2502 e
= STATE_OFFSET(&state
, state
.ptr
);
2505 /* get segment before this match */
2506 item
= PySequence_GetSlice(string
, i
, b
);
2509 status
= PyList_Append(list
, item
);
2514 } else if (i
== b
&& i
== e
&& n
> 0)
2515 /* ignore empty match on latest position */
2518 if (filter_is_callable
) {
2519 /* pass match object through filter */
2520 match
= pattern_new_match(self
, &state
, 1);
2523 args
= PyTuple_Pack(1, match
);
2528 item
= PyObject_CallObject(filter
, args
);
2534 /* filter is literal string */
2540 if (item
!= Py_None
) {
2541 status
= PyList_Append(list
, item
);
2552 if (state
.ptr
== state
.start
)
2553 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
2555 state
.start
= state
.ptr
;
2559 /* get segment following last match */
2560 if (i
< state
.endpos
) {
2561 item
= PySequence_GetSlice(string
, i
, state
.endpos
);
2564 status
= PyList_Append(list
, item
);
2574 /* convert list to single string (also removes list) */
2575 item
= join_list(list
, self
->pattern
);
2581 return Py_BuildValue("Ni", item
, n
);
2594 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2599 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2600 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|i:sub", kwlist
,
2601 &template, &string
, &count
))
2604 return pattern_subx(self
, template, string
, count
, 0);
2608 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
2613 static char* kwlist
[] = { "repl", "string", "count", NULL
};
2614 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|i:subn", kwlist
,
2615 &template, &string
, &count
))
2618 return pattern_subx(self
, template, string
, count
, 1);
2622 pattern_copy(PatternObject
* self
, PyObject
* args
)
2624 #ifdef USE_BUILTIN_COPY
2625 PatternObject
* copy
;
2628 if (args
!= Py_None
&& !PyArg_ParseTuple(args
, ":__copy__"))
2631 copy
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, self
->codesize
);
2635 offset
= offsetof(PatternObject
, groups
);
2637 Py_XINCREF(self
->groupindex
);
2638 Py_XINCREF(self
->indexgroup
);
2639 Py_XINCREF(self
->pattern
);
2641 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
2642 sizeof(PatternObject
) + self
->codesize
* sizeof(SRE_CODE
) - offset
);
2643 copy
->weakreflist
= NULL
;
2645 return (PyObject
*) copy
;
2647 PyErr_SetString(PyExc_TypeError
, "cannot copy this pattern object");
2653 pattern_deepcopy(PatternObject
* self
, PyObject
* args
)
2655 #ifdef USE_BUILTIN_COPY
2656 PatternObject
* copy
;
2659 if (!PyArg_ParseTuple(args
, "O:__deepcopy__", &memo
))
2662 copy
= (PatternObject
*) pattern_copy(self
, Py_None
);
2666 if (!deepcopy(©
->groupindex
, memo
) ||
2667 !deepcopy(©
->indexgroup
, memo
) ||
2668 !deepcopy(©
->pattern
, memo
)) {
2674 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this pattern object");
2679 static PyMethodDef pattern_methods
[] = {
2680 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
},
2681 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
},
2682 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
},
2683 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
},
2684 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
},
2685 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
},
2686 #if PY_VERSION_HEX >= 0x02020000
2687 {"finditer", (PyCFunction
) pattern_finditer
, METH_VARARGS
},
2689 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
2690 {"__copy__", (PyCFunction
) pattern_copy
, METH_VARARGS
},
2691 {"__deepcopy__", (PyCFunction
) pattern_deepcopy
, METH_VARARGS
},
2696 pattern_getattr(PatternObject
* self
, char* name
)
2700 res
= Py_FindMethod(pattern_methods
, (PyObject
*) self
, name
);
2708 if (!strcmp(name
, "pattern")) {
2709 Py_INCREF(self
->pattern
);
2710 return self
->pattern
;
2713 if (!strcmp(name
, "flags"))
2714 return Py_BuildValue("i", self
->flags
);
2716 if (!strcmp(name
, "groups"))
2717 return Py_BuildValue("i", self
->groups
);
2719 if (!strcmp(name
, "groupindex") && self
->groupindex
) {
2720 Py_INCREF(self
->groupindex
);
2721 return self
->groupindex
;
2724 PyErr_SetString(PyExc_AttributeError
, name
);
2728 statichere PyTypeObject Pattern_Type
= {
2729 PyObject_HEAD_INIT(NULL
)
2730 0, "_" SRE_MODULE
".SRE_Pattern",
2731 sizeof(PatternObject
), sizeof(SRE_CODE
),
2732 (destructor
)pattern_dealloc
, /*tp_dealloc*/
2734 (getattrfunc
)pattern_getattr
, /*tp_getattr*/
2738 0, /* tp_as_number */
2739 0, /* tp_as_sequence */
2740 0, /* tp_as_mapping */
2744 0, /* tp_getattro */
2745 0, /* tp_setattro */
2746 0, /* tp_as_buffer */
2747 Py_TPFLAGS_HAVE_WEAKREFS
, /* tp_flags */
2749 0, /* tp_traverse */
2751 0, /* tp_richcompare */
2752 offsetof(PatternObject
, weakreflist
), /* tp_weaklistoffset */
2755 /* -------------------------------------------------------------------- */
2759 match_dealloc(MatchObject
* self
)
2761 Py_XDECREF(self
->regs
);
2762 Py_XDECREF(self
->string
);
2763 Py_DECREF(self
->pattern
);
2768 match_getslice_by_index(MatchObject
* self
, int index
, PyObject
* def
)
2770 if (index
< 0 || index
>= self
->groups
) {
2771 /* raise IndexError if we were given a bad group number */
2781 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
2782 /* return default value if the string or group is undefined */
2787 return PySequence_GetSlice(
2788 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
2793 match_getindex(MatchObject
* self
, PyObject
* index
)
2797 if (PyInt_Check(index
))
2798 return (int) PyInt_AS_LONG(index
);
2802 if (self
->pattern
->groupindex
) {
2803 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
2805 if (PyInt_Check(index
))
2806 i
= (int) PyInt_AS_LONG(index
);
2816 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
2818 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
2822 match_expand(MatchObject
* self
, PyObject
* args
)
2825 if (!PyArg_ParseTuple(args
, "O:expand", &template))
2828 /* delegate to Python code */
2830 SRE_MODULE
, "_expand",
2831 PyTuple_Pack(3, self
->pattern
, self
, template)
2836 match_group(MatchObject
* self
, PyObject
* args
)
2841 size
= PyTuple_GET_SIZE(args
);
2845 result
= match_getslice(self
, Py_False
, Py_None
);
2848 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
2851 /* fetch multiple items */
2852 result
= PyTuple_New(size
);
2855 for (i
= 0; i
< size
; i
++) {
2856 PyObject
* item
= match_getslice(
2857 self
, PyTuple_GET_ITEM(args
, i
), Py_None
2863 PyTuple_SET_ITEM(result
, i
, item
);
2871 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
2876 PyObject
* def
= Py_None
;
2877 static char* kwlist
[] = { "default", NULL
};
2878 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
2881 result
= PyTuple_New(self
->groups
-1);
2885 for (index
= 1; index
< self
->groups
; index
++) {
2887 item
= match_getslice_by_index(self
, index
, def
);
2892 PyTuple_SET_ITEM(result
, index
-1, item
);
2899 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
2905 PyObject
* def
= Py_None
;
2906 static char* kwlist
[] = { "default", NULL
};
2907 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
2910 result
= PyDict_New();
2911 if (!result
|| !self
->pattern
->groupindex
)
2914 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
2918 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
2922 key
= PyList_GET_ITEM(keys
, index
);
2925 value
= match_getslice(self
, key
, def
);
2930 status
= PyDict_SetItem(result
, key
, value
);
2947 match_start(MatchObject
* self
, PyObject
* args
)
2951 PyObject
* index_
= Py_False
; /* zero */
2952 if (!PyArg_ParseTuple(args
, "|O:start", &index_
))
2955 index
= match_getindex(self
, index_
);
2957 if (index
< 0 || index
>= self
->groups
) {
2965 /* mark is -1 if group is undefined */
2966 return Py_BuildValue("i", self
->mark
[index
*2]);
2970 match_end(MatchObject
* self
, PyObject
* args
)
2974 PyObject
* index_
= Py_False
; /* zero */
2975 if (!PyArg_ParseTuple(args
, "|O:end", &index_
))
2978 index
= match_getindex(self
, index_
);
2980 if (index
< 0 || index
>= self
->groups
) {
2988 /* mark is -1 if group is undefined */
2989 return Py_BuildValue("i", self
->mark
[index
*2+1]);
2993 _pair(int i1
, int i2
)
2998 pair
= PyTuple_New(2);
3002 item
= PyInt_FromLong(i1
);
3005 PyTuple_SET_ITEM(pair
, 0, item
);
3007 item
= PyInt_FromLong(i2
);
3010 PyTuple_SET_ITEM(pair
, 1, item
);
3020 match_span(MatchObject
* self
, PyObject
* args
)
3024 PyObject
* index_
= Py_False
; /* zero */
3025 if (!PyArg_ParseTuple(args
, "|O:span", &index_
))
3028 index
= match_getindex(self
, index_
);
3030 if (index
< 0 || index
>= self
->groups
) {
3038 /* marks are -1 if group is undefined */
3039 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3043 match_regs(MatchObject
* self
)
3049 regs
= PyTuple_New(self
->groups
);
3053 for (index
= 0; index
< self
->groups
; index
++) {
3054 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
3059 PyTuple_SET_ITEM(regs
, index
, item
);
3069 match_copy(MatchObject
* self
, PyObject
* args
)
3071 #ifdef USE_BUILTIN_COPY
3075 if (args
!= Py_None
&& !PyArg_ParseTuple(args
, ":__copy__"))
3078 slots
= 2 * (self
->pattern
->groups
+1);
3080 copy
= PyObject_NEW_VAR(MatchObject
, &Match_Type
, slots
);
3084 /* this value a constant, but any compiler should be able to
3085 figure that out all by itself */
3086 offset
= offsetof(MatchObject
, string
);
3088 Py_XINCREF(self
->pattern
);
3089 Py_XINCREF(self
->string
);
3090 Py_XINCREF(self
->regs
);
3092 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
3093 sizeof(MatchObject
) + slots
* sizeof(int) - offset
);
3095 return (PyObject
*) copy
;
3097 PyErr_SetString(PyExc_TypeError
, "cannot copy this match object");
3103 match_deepcopy(MatchObject
* self
, PyObject
* args
)
3105 #ifdef USE_BUILTIN_COPY
3109 if (!PyArg_ParseTuple(args
, "O:__deepcopy__", &memo
))
3112 copy
= (MatchObject
*) match_copy(self
, Py_None
);
3116 if (!deepcopy((PyObject
**) ©
->pattern
, memo
) ||
3117 !deepcopy(©
->string
, memo
) ||
3118 !deepcopy(©
->regs
, memo
)) {
3124 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this match object");
3129 static PyMethodDef match_methods
[] = {
3130 {"group", (PyCFunction
) match_group
, METH_VARARGS
},
3131 {"start", (PyCFunction
) match_start
, METH_VARARGS
},
3132 {"end", (PyCFunction
) match_end
, METH_VARARGS
},
3133 {"span", (PyCFunction
) match_span
, METH_VARARGS
},
3134 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
},
3135 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
},
3136 {"expand", (PyCFunction
) match_expand
, METH_VARARGS
},
3137 {"__copy__", (PyCFunction
) match_copy
, METH_VARARGS
},
3138 {"__deepcopy__", (PyCFunction
) match_deepcopy
, METH_VARARGS
},
3143 match_getattr(MatchObject
* self
, char* name
)
3147 res
= Py_FindMethod(match_methods
, (PyObject
*) self
, name
);
3153 if (!strcmp(name
, "lastindex")) {
3154 if (self
->lastindex
>= 0)
3155 return Py_BuildValue("i", self
->lastindex
);
3160 if (!strcmp(name
, "lastgroup")) {
3161 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
3162 PyObject
* result
= PySequence_GetItem(
3163 self
->pattern
->indexgroup
, self
->lastindex
3173 if (!strcmp(name
, "string")) {
3175 Py_INCREF(self
->string
);
3176 return self
->string
;
3183 if (!strcmp(name
, "regs")) {
3185 Py_INCREF(self
->regs
);
3188 return match_regs(self
);
3191 if (!strcmp(name
, "re")) {
3192 Py_INCREF(self
->pattern
);
3193 return (PyObject
*) self
->pattern
;
3196 if (!strcmp(name
, "pos"))
3197 return Py_BuildValue("i", self
->pos
);
3199 if (!strcmp(name
, "endpos"))
3200 return Py_BuildValue("i", self
->endpos
);
3202 PyErr_SetString(PyExc_AttributeError
, name
);
3206 /* FIXME: implement setattr("string", None) as a special case (to
3207 detach the associated string, if any */
3209 statichere PyTypeObject Match_Type
= {
3210 PyObject_HEAD_INIT(NULL
)
3211 0, "_" SRE_MODULE
".SRE_Match",
3212 sizeof(MatchObject
), sizeof(int),
3213 (destructor
)match_dealloc
, /*tp_dealloc*/
3215 (getattrfunc
)match_getattr
/*tp_getattr*/
3218 /* -------------------------------------------------------------------- */
3219 /* scanner methods (experimental) */
3222 scanner_dealloc(ScannerObject
* self
)
3224 state_fini(&self
->state
);
3225 Py_DECREF(self
->pattern
);
3230 scanner_match(ScannerObject
* self
, PyObject
* args
)
3232 SRE_STATE
* state
= &self
->state
;
3238 state
->ptr
= state
->start
;
3240 if (state
->charsize
== 1) {
3241 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
));
3243 #if defined(HAVE_UNICODE)
3244 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
));
3248 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3251 if ((status
== 0 || state
->ptr
== state
->start
) &&
3252 state
->ptr
< state
->end
)
3253 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3255 state
->start
= state
->ptr
;
3262 scanner_search(ScannerObject
* self
, PyObject
* args
)
3264 SRE_STATE
* state
= &self
->state
;
3270 state
->ptr
= state
->start
;
3272 if (state
->charsize
== 1) {
3273 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
3275 #if defined(HAVE_UNICODE)
3276 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
3280 match
= pattern_new_match((PatternObject
*) self
->pattern
,
3283 if ((status
== 0 || state
->ptr
== state
->start
) &&
3284 state
->ptr
< state
->end
)
3285 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
3287 state
->start
= state
->ptr
;
3292 static PyMethodDef scanner_methods
[] = {
3293 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3294 /* METH_OLDARGS is not in Python 1.5.2 */
3295 {"match", (PyCFunction
) scanner_match
, 0},
3296 {"search", (PyCFunction
) scanner_search
, 0},
3301 scanner_getattr(ScannerObject
* self
, char* name
)
3305 res
= Py_FindMethod(scanner_methods
, (PyObject
*) self
, name
);
3312 if (!strcmp(name
, "pattern")) {
3313 Py_INCREF(self
->pattern
);
3314 return self
->pattern
;
3317 PyErr_SetString(PyExc_AttributeError
, name
);
3321 statichere PyTypeObject Scanner_Type
= {
3322 PyObject_HEAD_INIT(NULL
)
3323 0, "_" SRE_MODULE
".SRE_Scanner",
3324 sizeof(ScannerObject
), 0,
3325 (destructor
)scanner_dealloc
, /*tp_dealloc*/
3327 (getattrfunc
)scanner_getattr
, /*tp_getattr*/
3330 static PyMethodDef _functions
[] = {
3331 {"compile", _compile
, METH_VARARGS
},
3332 {"getcodesize", sre_codesize
, METH_VARARGS
},
3333 {"getlower", sre_getlower
, METH_VARARGS
},
3337 #if PY_VERSION_HEX < 0x02030000
3338 DL_EXPORT(void) init_sre(void)
3340 PyMODINIT_FUNC
init_sre(void)
3347 /* Patch object types */
3348 Pattern_Type
.ob_type
= Match_Type
.ob_type
=
3349 Scanner_Type
.ob_type
= &PyType_Type
;
3351 m
= Py_InitModule("_" SRE_MODULE
, _functions
);
3352 d
= PyModule_GetDict(m
);
3354 x
= PyInt_FromLong(SRE_MAGIC
);
3356 PyDict_SetItemString(d
, "MAGIC", x
);
3360 x
= PyInt_FromLong(sizeof(SRE_CODE
));
3362 PyDict_SetItemString(d
, "CODESIZE", x
);
3366 x
= PyString_FromString(copyright
);
3368 PyDict_SetItemString(d
, "copyright", x
);
3373 #endif /* !defined(SRE_RECURSIVE) */