2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-06-30 fl added fast search optimization
10 * 2000-06-30 fl added assert (lookahead) primitives, etc
11 * 2000-07-02 fl added charset optimizations, etc
12 * 2000-07-03 fl store code in pattern object, lookbehind, etc
13 * 2000-07-08 fl added regs attribute
14 * 2000-07-21 fl reset lastindex in scanner methods
15 * 2000-08-01 fl fixes for 1.6b1
16 * 2000-08-03 fl added recursion limit
17 * 2000-08-07 fl use PyOS_CheckStack() if available
18 * 2000-08-08 fl changed findall to return empty strings instead of None
19 * 2000-08-27 fl properly propagate memory errors
20 * 2000-09-02 fl return -1 instead of None for start/end/span
21 * 2000-09-20 fl added expand method
22 * 2000-09-21 fl don't use the buffer interface for unicode strings
23 * 2000-10-03 fl fixed assert_not primitive; support keyword arguments
24 * 2000-10-24 fl really fixed assert_not; reset groups in findall
25 * 2000-12-21 fl fixed memory leak in groupdict
26 * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
27 * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
28 * 2001-01-16 fl fixed memory leak in pattern destructor
29 * 2001-03-20 fl lots of fixes for 2.1b2
30 * 2001-04-15 fl export copyright as Python attribute, not global
32 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
34 * This version of the SRE library can be redistributed under CNRI's
35 * Python 1.6 license. For any other use, please contact Secret Labs
36 * AB (info@pythonware.com).
38 * Portions of this engine have been developed in cooperation with
39 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
40 * other compatibility work.
/* Version / copyright banner string for this build of the SRE engine. */
static char copyright[] =
    " SRE 2.1b2 Copyright (c) 1997-2001 by Secret Labs AB ";
54 /* name of this module, minus the leading underscore */
55 #if !defined(SRE_MODULE)
56 #define SRE_MODULE "sre"
59 /* defining this one enables tracing */
62 #if PY_VERSION_HEX >= 0x01060000
63 /* defining this enables unicode support (default under 1.6a1 and later) */
67 /* -------------------------------------------------------------------- */
68 /* optional features */
70 /* prevent run-away recursion (bad patterns on long strings) */
72 #if !defined(USE_STACKCHECK)
73 #if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
74 /* require smaller recursion limit for a number of 64-bit platforms:
75 Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
76 /* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
77 #define USE_RECURSION_LIMIT 7500
79 #define USE_RECURSION_LIMIT 10000
83 /* enables fast searching */
84 #define USE_FAST_SEARCH
86 /* enables aggressive inlining (always on for Visual C) */
89 #if PY_VERSION_HEX < 0x01060000
90 #define PyObject_DEL(op) PyMem_DEL((op))
93 /* -------------------------------------------------------------------- */
96 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
97 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
98 /* fastest possible local call under MSVC */
99 #define LOCAL(type) static __inline type __fastcall
100 #elif defined(USE_INLINE)
101 #define LOCAL(type) static inline type
103 #define LOCAL(type) static type
107 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
108 #define SRE_ERROR_STATE -2 /* illegal state */
109 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
110 #define SRE_ERROR_MEMORY -9 /* out of memory */
113 #define TRACE(v) printf v
118 /* -------------------------------------------------------------------- */
119 /* search engine state */
/* default character predicates (run sre_chars.py to regenerate tables) */

/* Bit flags stored per character in sre_char_info. */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* Character-class bits for code points 0..127, indexed by code point.
   Each entry is an OR of the SRE_*_MASK values above (e.g. digits are
   25 = DIGIT | ALNUM | WORD, '\n' is 6 = SPACE | LINEBREAK). */
static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };

/* Lower-case mapping for code points 0..127: 'A'..'Z' (65..90) map to
   'a'..'z' (97..122); every other entry maps to itself. */
static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };

/* Each predicate yields a non-zero mask bit when ch (< 128) has the
   property, and 0 otherwise; anything >= 128 is never a member.
   Note that ch is evaluated more than once. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)

/* Lower-case ch using the built-in ASCII table.  Code points >= 128 are
   returned unchanged. */
static unsigned int sre_lower(unsigned int ch)
{
    return ((ch) < 128 ? sre_char_lower[ch] : ch);
}
/* locale-specific character predicates */

/* These defer to the <ctype.h> classifiers for code points below 256 and
   report "not a member" (0) for anything larger.  A linebreak is only
   '\n'; a word character is an alphanumeric or '_'.
   Note that ch is evaluated more than once. */
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case ch with the C library's tolower().  Code points >= 256 are
   returned unchanged, since tolower() is only called for values < 256. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    return ((ch) < 256 ? tolower((ch)) : ch);
}
/* unicode-specific character predicates */

#if defined(HAVE_UNICODE)

/* Thin wrappers over the Py_UNICODE_* classifiers; ch is cast to
   Py_UNICODE before the call.  A word character is an alphanumeric
   or '_'. */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')

/* Lower-case ch via Py_UNICODE_TOLOWER.  Only available when the build
   defines HAVE_UNICODE. */
static unsigned int sre_lower_unicode(unsigned int ch)
{
    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
}

#endif
194 sre_category(SRE_CODE category
, unsigned int ch
)
198 case SRE_CATEGORY_DIGIT
:
199 return SRE_IS_DIGIT(ch
);
200 case SRE_CATEGORY_NOT_DIGIT
:
201 return !SRE_IS_DIGIT(ch
);
202 case SRE_CATEGORY_SPACE
:
203 return SRE_IS_SPACE(ch
);
204 case SRE_CATEGORY_NOT_SPACE
:
205 return !SRE_IS_SPACE(ch
);
206 case SRE_CATEGORY_WORD
:
207 return SRE_IS_WORD(ch
);
208 case SRE_CATEGORY_NOT_WORD
:
209 return !SRE_IS_WORD(ch
);
210 case SRE_CATEGORY_LINEBREAK
:
211 return SRE_IS_LINEBREAK(ch
);
212 case SRE_CATEGORY_NOT_LINEBREAK
:
213 return !SRE_IS_LINEBREAK(ch
);
215 case SRE_CATEGORY_LOC_WORD
:
216 return SRE_LOC_IS_WORD(ch
);
217 case SRE_CATEGORY_LOC_NOT_WORD
:
218 return !SRE_LOC_IS_WORD(ch
);
220 #if defined(HAVE_UNICODE)
221 case SRE_CATEGORY_UNI_DIGIT
:
222 return SRE_UNI_IS_DIGIT(ch
);
223 case SRE_CATEGORY_UNI_NOT_DIGIT
:
224 return !SRE_UNI_IS_DIGIT(ch
);
225 case SRE_CATEGORY_UNI_SPACE
:
226 return SRE_UNI_IS_SPACE(ch
);
227 case SRE_CATEGORY_UNI_NOT_SPACE
:
228 return !SRE_UNI_IS_SPACE(ch
);
229 case SRE_CATEGORY_UNI_WORD
:
230 return SRE_UNI_IS_WORD(ch
);
231 case SRE_CATEGORY_UNI_NOT_WORD
:
232 return !SRE_UNI_IS_WORD(ch
);
233 case SRE_CATEGORY_UNI_LINEBREAK
:
234 return SRE_UNI_IS_LINEBREAK(ch
);
235 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
236 return !SRE_UNI_IS_LINEBREAK(ch
);
238 case SRE_CATEGORY_UNI_DIGIT
:
239 return SRE_IS_DIGIT(ch
);
240 case SRE_CATEGORY_UNI_NOT_DIGIT
:
241 return !SRE_IS_DIGIT(ch
);
242 case SRE_CATEGORY_UNI_SPACE
:
243 return SRE_IS_SPACE(ch
);
244 case SRE_CATEGORY_UNI_NOT_SPACE
:
245 return !SRE_IS_SPACE(ch
);
246 case SRE_CATEGORY_UNI_WORD
:
247 return SRE_LOC_IS_WORD(ch
);
248 case SRE_CATEGORY_UNI_NOT_WORD
:
249 return !SRE_LOC_IS_WORD(ch
);
250 case SRE_CATEGORY_UNI_LINEBREAK
:
251 return SRE_IS_LINEBREAK(ch
);
252 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
253 return !SRE_IS_LINEBREAK(ch
);
262 mark_fini(SRE_STATE
* state
)
264 if (state
->mark_stack
) {
265 free(state
->mark_stack
);
266 state
->mark_stack
= NULL
;
268 state
->mark_stack_size
= state
->mark_stack_base
= 0;
272 mark_save(SRE_STATE
* state
, int lo
, int hi
)
276 int minsize
, newsize
;
281 size
= (hi
- lo
) + 1;
283 newsize
= state
->mark_stack_size
;
284 minsize
= state
->mark_stack_base
+ size
;
286 if (newsize
< minsize
) {
287 /* create new stack */
290 if (newsize
< minsize
)
292 TRACE(("allocate stack %d\n", newsize
));
293 stack
= malloc(sizeof(void*) * newsize
);
296 while (newsize
< minsize
)
298 TRACE(("grow stack to %d\n", newsize
));
299 stack
= realloc(state
->mark_stack
, sizeof(void*) * newsize
);
303 return SRE_ERROR_MEMORY
;
305 state
->mark_stack
= stack
;
306 state
->mark_stack_size
= newsize
;
309 TRACE(("copy %d:%d to %d (%d)\n", lo
, hi
, state
->mark_stack_base
, size
));
311 memcpy(state
->mark_stack
+ state
->mark_stack_base
, state
->mark
+ lo
,
312 size
* sizeof(void*));
314 state
->mark_stack_base
+= size
;
320 mark_restore(SRE_STATE
* state
, int lo
, int hi
)
327 size
= (hi
- lo
) + 1;
329 state
->mark_stack_base
-= size
;
331 TRACE(("copy %d:%d from %d\n", lo
, hi
, state
->mark_stack_base
));
333 memcpy(state
->mark
+ lo
, state
->mark_stack
+ state
->mark_stack_base
,
334 size
* sizeof(void*));
339 /* generate 8-bit version */
341 #define SRE_CHAR unsigned char
342 #define SRE_AT sre_at
343 #define SRE_COUNT sre_count
344 #define SRE_CHARSET sre_charset
345 #define SRE_INFO sre_info
346 #define SRE_MATCH sre_match
347 #define SRE_SEARCH sre_search
349 #if defined(HAVE_UNICODE)
351 #define SRE_RECURSIVE
363 /* generate 16-bit unicode version */
365 #define SRE_CHAR Py_UNICODE
366 #define SRE_AT sre_uat
367 #define SRE_COUNT sre_ucount
368 #define SRE_CHARSET sre_ucharset
369 #define SRE_INFO sre_uinfo
370 #define SRE_MATCH sre_umatch
371 #define SRE_SEARCH sre_usearch
374 #endif /* SRE_RECURSIVE */
376 /* -------------------------------------------------------------------- */
377 /* String matching engine */
379 /* the following section is compiled twice, with different character
383 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
385 /* check if pointer is at given position */
391 case SRE_AT_BEGINNING
:
392 case SRE_AT_BEGINNING_STRING
:
393 return ((void*) ptr
== state
->beginning
);
395 case SRE_AT_BEGINNING_LINE
:
396 return ((void*) ptr
== state
->beginning
||
397 SRE_IS_LINEBREAK((int) ptr
[-1]));
400 return (((void*) (ptr
+1) == state
->end
&&
401 SRE_IS_LINEBREAK((int) ptr
[0])) ||
402 ((void*) ptr
== state
->end
));
404 case SRE_AT_END_LINE
:
405 return ((void*) ptr
== state
->end
||
406 SRE_IS_LINEBREAK((int) ptr
[0]));
408 case SRE_AT_END_STRING
:
409 return ((void*) ptr
== state
->end
);
411 case SRE_AT_BOUNDARY
:
412 if (state
->beginning
== state
->end
)
414 that
= ((void*) ptr
> state
->beginning
) ?
415 SRE_IS_WORD((int) ptr
[-1]) : 0;
416 this = ((void*) ptr
< state
->end
) ?
417 SRE_IS_WORD((int) ptr
[0]) : 0;
420 case SRE_AT_NON_BOUNDARY
:
421 if (state
->beginning
== state
->end
)
423 that
= ((void*) ptr
> state
->beginning
) ?
424 SRE_IS_WORD((int) ptr
[-1]) : 0;
425 this = ((void*) ptr
< state
->end
) ?
426 SRE_IS_WORD((int) ptr
[0]) : 0;
429 case SRE_AT_LOC_BOUNDARY
:
430 if (state
->beginning
== state
->end
)
432 that
= ((void*) ptr
> state
->beginning
) ?
433 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
434 this = ((void*) ptr
< state
->end
) ?
435 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
438 case SRE_AT_LOC_NON_BOUNDARY
:
439 if (state
->beginning
== state
->end
)
441 that
= ((void*) ptr
> state
->beginning
) ?
442 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
443 this = ((void*) ptr
< state
->end
) ?
444 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
447 case SRE_AT_UNI_BOUNDARY
:
448 if (state
->beginning
== state
->end
)
450 that
= ((void*) ptr
> state
->beginning
) ?
451 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
452 this = ((void*) ptr
< state
->end
) ?
453 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
456 case SRE_AT_UNI_NON_BOUNDARY
:
457 if (state
->beginning
== state
->end
)
459 that
= ((void*) ptr
> state
->beginning
) ?
460 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
461 this = ((void*) ptr
< state
->end
) ?
462 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
470 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
472 /* check if character is a member of the given set */
480 /* <LITERAL> <code> */
487 /* <RANGE> <lower> <upper> */
488 if (set
[0] <= ch
&& ch
<= set
[1])
494 /* <CHARSET> <bitmap> (16 bits per code word) */
495 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
500 case SRE_OP_CATEGORY
:
501 /* <CATEGORY> <code> */
502 if (sre_category(set
[0], (int) ch
))
515 /* internal error -- there's not much we can do about it
516 here, so let's just pretend it didn't match... */
522 LOCAL(int) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
, int level
);
525 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, int maxcount
, int level
)
528 SRE_CHAR
* ptr
= state
->ptr
;
529 SRE_CHAR
* end
= state
->end
;
533 if (maxcount
< end
- ptr
&& maxcount
!= 65535)
534 end
= ptr
+ maxcount
;
536 switch (pattern
[0]) {
539 /* repeated dot wildcard. */
540 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
541 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
546 /* repeated dot wildcard. skip to the end of the target
547 string, and backtrack from there */
548 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
553 /* repeated literal */
555 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
556 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
560 case SRE_OP_LITERAL_IGNORE
:
561 /* repeated literal */
563 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
564 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
568 case SRE_OP_NOT_LITERAL
:
569 /* repeated non-literal */
571 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
572 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
576 case SRE_OP_NOT_LITERAL_IGNORE
:
577 /* repeated non-literal */
579 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
580 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
586 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
587 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
592 /* repeated single character pattern */
593 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
594 while ((SRE_CHAR
*) state
->ptr
< end
) {
595 i
= SRE_MATCH(state
, pattern
, level
);
601 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
,
602 (SRE_CHAR
*) state
->ptr
- ptr
));
603 return (SRE_CHAR
*) state
->ptr
- ptr
;
606 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
, ptr
- (SRE_CHAR
*) state
->ptr
));
607 return ptr
- (SRE_CHAR
*) state
->ptr
;
610 #if 0 /* not used in this release */
612 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
614 /* check if an SRE_OP_INFO block matches at the current position.
615 returns the number of SRE_CODE objects to skip if successful, 0
618 SRE_CHAR
* end
= state
->end
;
619 SRE_CHAR
* ptr
= state
->ptr
;
622 /* check minimal length */
623 if (pattern
[3] && (end
- ptr
) < pattern
[3])
626 /* check known prefix */
627 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
628 /* <length> <skip> <prefix data> <overlap data> */
629 for (i
= 0; i
< pattern
[5]; i
++)
630 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
632 return pattern
[0] + 2 * pattern
[6];
639 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
, int level
)
641 /* check if string matches the given pattern. returns <0 for
642 error, 0 for failure, and 1 for success */
644 SRE_CHAR
* end
= state
->end
;
645 SRE_CHAR
* ptr
= state
->ptr
;
651 SRE_REPEAT rep
; /* FIXME: <fl> allocate in STATE instead */
653 TRACE(("|%p|%p|ENTER %d\n", pattern
, ptr
, level
));
655 #if defined(USE_STACKCHECK)
656 if (level
% 10 == 0 && PyOS_CheckStack())
657 return SRE_ERROR_RECURSION_LIMIT
;
660 #if defined(USE_RECURSION_LIMIT)
661 if (level
> USE_RECURSION_LIMIT
)
662 return SRE_ERROR_RECURSION_LIMIT
;
665 if (pattern
[0] == SRE_OP_INFO
) {
666 /* optimization info block */
667 /* <INFO> <1=skip> <2=flags> <3=min> ... */
668 if (pattern
[3] && (end
- ptr
) < pattern
[3]) {
669 TRACE(("reject (got %d chars, need %d)\n",
670 (end
- ptr
), pattern
[3]));
673 pattern
+= pattern
[1] + 1;
678 switch (*pattern
++) {
681 /* immediate failure */
682 TRACE(("|%p|%p|FAILURE\n", pattern
, ptr
));
687 TRACE(("|%p|%p|SUCCESS\n", pattern
, ptr
));
692 /* match at given position */
694 TRACE(("|%p|%p|AT %d\n", pattern
, ptr
, *pattern
));
695 if (!SRE_AT(state
, ptr
, *pattern
))
700 case SRE_OP_CATEGORY
:
701 /* match at given category */
702 /* <CATEGORY> <code> */
703 TRACE(("|%p|%p|CATEGORY %d\n", pattern
, ptr
, *pattern
));
704 if (ptr
>= end
|| !sre_category(pattern
[0], ptr
[0]))
711 /* match literal string */
712 /* <LITERAL> <code> */
713 TRACE(("|%p|%p|LITERAL %d\n", pattern
, ptr
, *pattern
));
714 if (ptr
>= end
|| (SRE_CODE
) ptr
[0] != pattern
[0])
720 case SRE_OP_NOT_LITERAL
:
721 /* match anything that is not literal character */
722 /* <NOT_LITERAL> <code> */
723 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern
, ptr
, *pattern
));
724 if (ptr
>= end
|| (SRE_CODE
) ptr
[0] == pattern
[0])
731 /* match anything (except a newline) */
733 TRACE(("|%p|%p|ANY\n", pattern
, ptr
));
734 if (ptr
>= end
|| SRE_IS_LINEBREAK(ptr
[0]))
742 TRACE(("|%p|%p|ANY_ALL\n", pattern
, ptr
));
749 /* match set member (or non_member) */
750 /* <IN> <skip> <set> */
751 TRACE(("|%p|%p|IN\n", pattern
, ptr
));
752 if (ptr
>= end
|| !SRE_CHARSET(pattern
+ 1, *ptr
))
754 pattern
+= pattern
[0];
758 case SRE_OP_GROUPREF
:
759 /* match backreference */
760 TRACE(("|%p|%p|GROUPREF %d\n", pattern
, ptr
, pattern
[0]));
763 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[i
+i
];
764 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[i
+i
+1];
765 if (!p
|| !e
|| e
< p
)
768 if (ptr
>= end
|| *ptr
!= *p
)
776 case SRE_OP_GROUPREF_IGNORE
:
777 /* match backreference */
778 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern
, ptr
, pattern
[0]));
781 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[i
+i
];
782 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[i
+i
+1];
783 if (!p
|| !e
|| e
< p
)
787 state
->lower(*ptr
) != state
->lower(*p
))
795 case SRE_OP_LITERAL_IGNORE
:
796 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern
, ptr
, pattern
[0]));
798 state
->lower(*ptr
) != state
->lower(*pattern
))
804 case SRE_OP_NOT_LITERAL_IGNORE
:
805 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, *pattern
));
807 state
->lower(*ptr
) == state
->lower(*pattern
))
813 case SRE_OP_IN_IGNORE
:
814 TRACE(("|%p|%p|IN_IGNORE\n", pattern
, ptr
));
816 || !SRE_CHARSET(pattern
+ 1, (SRE_CODE
) state
->lower(*ptr
)))
818 pattern
+= pattern
[0];
825 TRACE(("|%p|%p|MARK %d\n", pattern
, ptr
, pattern
[0]));
828 state
->lastindex
= i
/2 + 1;
829 if (i
> state
->lastmark
)
831 state
->mark
[i
] = ptr
;
838 /* <JUMP> <offset> */
839 TRACE(("|%p|%p|JUMP %d\n", pattern
, ptr
, pattern
[0]));
840 pattern
+= pattern
[0];
844 /* assert subpattern */
845 /* <ASSERT> <skip> <back> <pattern> */
846 TRACE(("|%p|%p|ASSERT %d\n", pattern
, ptr
, pattern
[1]));
847 state
->ptr
= ptr
- pattern
[1];
848 if (state
->ptr
< state
->beginning
)
850 i
= SRE_MATCH(state
, pattern
+ 2, level
+ 1);
853 pattern
+= pattern
[0];
856 case SRE_OP_ASSERT_NOT
:
857 /* assert not subpattern */
858 /* <ASSERT_NOT> <skip> <back> <pattern> */
859 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern
, ptr
, pattern
[1]));
860 state
->ptr
= ptr
- pattern
[1];
861 if (state
->ptr
>= state
->beginning
) {
862 i
= SRE_MATCH(state
, pattern
+ 2, level
+ 1);
868 pattern
+= pattern
[0];
873 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
874 TRACE(("|%p|%p|BRANCH\n", pattern
, ptr
));
875 lastmark
= state
->lastmark
;
876 for (; pattern
[0]; pattern
+= pattern
[0]) {
877 if (pattern
[1] == SRE_OP_LITERAL
&&
878 (ptr
>= end
|| (SRE_CODE
) *ptr
!= pattern
[2]))
880 if (pattern
[1] == SRE_OP_IN
&&
881 (ptr
>= end
|| !SRE_CHARSET(pattern
+ 3, (SRE_CODE
) *ptr
)))
884 i
= SRE_MATCH(state
, pattern
+ 1, level
+ 1);
887 if (state
->lastmark
> lastmark
) {
889 state
->mark
+ lastmark
+ 1, 0,
890 (state
->lastmark
- lastmark
) * sizeof(void*)
892 state
->lastmark
= lastmark
;
897 case SRE_OP_REPEAT_ONE
:
898 /* match repeated sequence (maximizing regexp) */
900 /* this operator only works if the repeated item is
901 exactly one character wide, and we're not already
902 collecting backtracking points. for other cases,
903 use the MAX_REPEAT operator */
905 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
907 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern
, ptr
,
908 pattern
[1], pattern
[2]));
910 if (ptr
+ pattern
[1] > end
)
911 return 0; /* cannot match */
915 count
= SRE_COUNT(state
, pattern
+ 3, pattern
[2], level
+ 1);
921 /* when we arrive here, count contains the number of
922 matches, and ptr points to the tail of the target
923 string. check if the rest of the pattern matches,
924 and backtrack if not. */
926 if (count
< (int) pattern
[1])
929 if (pattern
[pattern
[0]] == SRE_OP_SUCCESS
) {
930 /* tail is empty. we're finished */
934 } else if (pattern
[pattern
[0]] == SRE_OP_LITERAL
) {
935 /* tail starts with a literal. skip positions where
936 the rest of the pattern cannot possibly match */
937 chr
= pattern
[pattern
[0]+1];
939 while (count
>= (int) pattern
[1] &&
940 (ptr
>= end
|| *ptr
!= chr
)) {
944 if (count
< (int) pattern
[1])
947 i
= SRE_MATCH(state
, pattern
+ pattern
[0], level
+ 1);
956 lastmark
= state
->lastmark
;
957 while (count
>= (int) pattern
[1]) {
959 i
= SRE_MATCH(state
, pattern
+ pattern
[0], level
+ 1);
964 if (state
->lastmark
> lastmark
) {
966 state
->mark
+ lastmark
+ 1, 0,
967 (state
->lastmark
- lastmark
) * sizeof(void*)
969 state
->lastmark
= lastmark
;
976 /* create repeat context. all the hard work is done
977 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
978 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
979 TRACE(("|%p|%p|REPEAT %d %d\n", pattern
, ptr
,
980 pattern
[1], pattern
[2]));
983 rep
.pattern
= pattern
;
985 /* install new repeat context */
986 rep
.prev
= state
->repeat
;
987 state
->repeat
= &rep
;
990 i
= SRE_MATCH(state
, pattern
+ pattern
[0], level
+ 1);
992 state
->repeat
= rep
.prev
;
996 case SRE_OP_MAX_UNTIL
:
997 /* maximizing repeat */
998 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1000 /* FIXME: we probably need to deal with zero-width
1001 matches in here... */
1005 return SRE_ERROR_STATE
;
1009 count
= rp
->count
+ 1;
1011 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern
, ptr
, count
));
1013 if (count
< rp
->pattern
[1]) {
1014 /* not enough matches */
1017 i
= SRE_MATCH(state
, rp
->pattern
+ 3, level
+ 1);
1020 rp
->count
= count
- 1;
1025 if (count
< rp
->pattern
[2] || rp
->pattern
[2] == 65535) {
1026 /* we may have enough matches, but if we can
1027 match another item, do so */
1029 lastmark
= state
->lastmark
;
1030 i
= mark_save(state
, 0, lastmark
);
1034 i
= SRE_MATCH(state
, rp
->pattern
+ 3, level
+ 1);
1037 i
= mark_restore(state
, 0, lastmark
);
1040 rp
->count
= count
- 1;
1044 /* cannot match more repeated items here. make sure the
1046 state
->repeat
= rp
->prev
;
1047 i
= SRE_MATCH(state
, pattern
, level
+ 1);
1054 case SRE_OP_MIN_UNTIL
:
1055 /* minimizing repeat */
1056 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1060 return SRE_ERROR_STATE
;
1062 count
= rp
->count
+ 1;
1064 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern
, ptr
, count
,
1069 if (count
< rp
->pattern
[1]) {
1070 /* not enough matches */
1073 i
= SRE_MATCH(state
, rp
->pattern
+ 3, level
+ 1);
1076 rp
->count
= count
-1;
1081 /* see if the tail matches */
1082 state
->repeat
= rp
->prev
;
1083 /* FIXME: the following fix doesn't always work (#133283) */
1084 if (0 && rp
->pattern
[2] == 65535) {
1085 /* unbounded repeat */
1087 i
= SRE_MATCH(state
, pattern
, level
+ 1);
1088 if (i
|| ptr
>= end
)
1093 i
= SRE_MATCH(state
, pattern
, level
+ 1);
1102 if (count
>= rp
->pattern
[2] && rp
->pattern
[2] != 65535)
1107 i
= SRE_MATCH(state
, rp
->pattern
+ 3, level
+ 1);
1110 rp
->count
= count
- 1;
1115 TRACE(("|%p|%p|UNKNOWN %d\n", pattern
, ptr
, pattern
[-1]));
1116 return SRE_ERROR_ILLEGAL
;
1120 /* shouldn't end up here */
1121 return SRE_ERROR_ILLEGAL
;
1125 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1127 SRE_CHAR
* ptr
= state
->start
;
1128 SRE_CHAR
* end
= state
->end
;
1131 int prefix_skip
= 0;
1132 SRE_CODE
* prefix
= NULL
;
1133 SRE_CODE
* charset
= NULL
;
1134 SRE_CODE
* overlap
= NULL
;
1137 if (pattern
[0] == SRE_OP_INFO
) {
1138 /* optimization info block */
1139 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1143 if (pattern
[3] > 0) {
1144 /* adjust end point (but make sure we leave at least one
1145 character in there, so literal search will work) */
1146 end
-= pattern
[3]-1;
1151 if (flags
& SRE_INFO_PREFIX
) {
1152 /* pattern starts with a known prefix */
1153 /* <length> <skip> <prefix data> <overlap data> */
1154 prefix_len
= pattern
[5];
1155 prefix_skip
= pattern
[6];
1156 prefix
= pattern
+ 7;
1157 overlap
= prefix
+ prefix_len
- 1;
1158 } else if (flags
& SRE_INFO_CHARSET
)
1159 /* pattern starts with a character from a known set */
1161 charset
= pattern
+ 5;
1163 pattern
+= 1 + pattern
[1];
1166 TRACE(("prefix = %p %d %d\n", prefix
, prefix_len
, prefix_skip
));
1167 TRACE(("charset = %p\n", charset
));
1169 #if defined(USE_FAST_SEARCH)
1170 if (prefix_len
> 1) {
1171 /* pattern starts with a known prefix. use the overlap
1172 table to skip forward as fast as we possibly can */
1177 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1183 if (++i
== prefix_len
) {
1184 /* found a potential match */
1185 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1186 state
->start
= ptr
+ 1 - prefix_len
;
1187 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1188 if (flags
& SRE_INFO_LITERAL
)
1189 return 1; /* we got all of it */
1190 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
, 1);
1193 /* close but no cigar -- try again */
1206 if (pattern
[0] == SRE_OP_LITERAL
) {
1207 /* pattern starts with a literal character. this is used
1208 for short prefixes, and if fast search is disabled */
1209 SRE_CODE chr
= pattern
[1];
1212 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1216 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1219 status
= SRE_MATCH(state
, pattern
+ 2, 1);
1223 } else if (charset
) {
1224 /* pattern starts with a character from a known set */
1227 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1231 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1234 status
= SRE_MATCH(state
, pattern
, 1);
1241 while (ptr
<= end
) {
1242 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1243 state
->start
= state
->ptr
= ptr
++;
1244 status
= SRE_MATCH(state
, pattern
, 1);
1253 #if !defined(SRE_RECURSIVE)
1255 /* -------------------------------------------------------------------- */
1256 /* factories and destructors */
1258 /* see sre.h for object declarations */
1260 staticforward PyTypeObject Pattern_Type
;
1261 staticforward PyTypeObject Match_Type
;
1262 staticforward PyTypeObject Scanner_Type
;
1265 _compile(PyObject
* self_
, PyObject
* args
)
1267 /* "compile" pattern descriptor to pattern object */
1269 PatternObject
* self
;
1276 PyObject
* groupindex
= NULL
;
1277 PyObject
* indexgroup
= NULL
;
1278 if (!PyArg_ParseTuple(args
, "OiO!|iOO", &pattern
, &flags
,
1279 &PyList_Type
, &code
, &groups
,
1280 &groupindex
, &indexgroup
))
1283 n
= PyList_GET_SIZE(code
);
1285 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
1289 for (i
= 0; i
< n
; i
++) {
1290 PyObject
*o
= PyList_GET_ITEM(code
, i
);
1291 self
->code
[i
] = (SRE_CODE
) PyInt_AsLong(o
);
1294 if (PyErr_Occurred()) {
1300 self
->pattern
= pattern
;
1302 self
->flags
= flags
;
1304 self
->groups
= groups
;
1306 Py_XINCREF(groupindex
);
1307 self
->groupindex
= groupindex
;
1309 Py_XINCREF(indexgroup
);
1310 self
->indexgroup
= indexgroup
;
1312 return (PyObject
*) self
;
1316 sre_codesize(PyObject
* self
, PyObject
* args
)
1318 return Py_BuildValue("i", sizeof(SRE_CODE
));
1322 sre_getlower(PyObject
* self
, PyObject
* args
)
1324 int character
, flags
;
1325 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1327 if (flags
& SRE_FLAG_LOCALE
)
1328 return Py_BuildValue("i", sre_lower_locale(character
));
1329 if (flags
& SRE_FLAG_UNICODE
)
1330 #if defined(HAVE_UNICODE)
1331 return Py_BuildValue("i", sre_lower_unicode(character
));
1333 return Py_BuildValue("i", sre_lower_locale(character
));
1335 return Py_BuildValue("i", sre_lower(character
));
1339 state_reset(SRE_STATE
* state
)
1343 state
->lastmark
= 0;
1345 /* FIXME: dynamic! */
1346 for (i
= 0; i
< SRE_MARK_SIZE
; i
++)
1347 state
->mark
[i
] = NULL
;
1349 state
->lastindex
= -1;
1351 state
->repeat
= NULL
;
1357 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1360 /* prepare state object */
1362 PyBufferProcs
*buffer
;
1366 memset(state
, 0, sizeof(SRE_STATE
));
1368 state
->lastindex
= -1;
1370 #if defined(HAVE_UNICODE)
1371 if (PyUnicode_Check(string
)) {
1372 /* unicode strings don't always support the buffer interface */
1373 ptr
= (void*) PyUnicode_AS_DATA(string
);
1374 bytes
= PyUnicode_GET_DATA_SIZE(string
);
1375 size
= PyUnicode_GET_SIZE(string
);
1376 state
->charsize
= sizeof(Py_UNICODE
);
1381 /* get pointer to string buffer */
1382 buffer
= string
->ob_type
->tp_as_buffer
;
1383 if (!buffer
|| !buffer
->bf_getreadbuffer
|| !buffer
->bf_getsegcount
||
1384 buffer
->bf_getsegcount(string
, NULL
) != 1) {
1385 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1389 /* determine buffer size */
1390 bytes
= buffer
->bf_getreadbuffer(string
, 0, &ptr
);
1392 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1396 /* determine character size */
1397 #if PY_VERSION_HEX >= 0x01060000
1398 size
= PyObject_Size(string
);
1400 size
= PyObject_Length(string
);
1403 if (PyString_Check(string
) || bytes
== size
)
1404 state
->charsize
= 1;
1405 #if defined(HAVE_UNICODE)
1406 else if (bytes
== (int) (size
* sizeof(Py_UNICODE
)))
1407 state
->charsize
= sizeof(Py_UNICODE
);
1410 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1414 #if defined(HAVE_UNICODE)
1418 /* adjust boundaries */
1421 else if (start
> size
)
1426 else if (end
> size
)
1429 state
->beginning
= ptr
;
1431 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1432 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1435 state
->string
= string
;
1437 state
->endpos
= end
;
1439 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1440 state
->lower
= sre_lower_locale
;
1441 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1442 #if defined(HAVE_UNICODE)
1443 state
->lower
= sre_lower_unicode
;
1445 state
->lower
= sre_lower_locale
;
1448 state
->lower
= sre_lower
;
1454 state_fini(SRE_STATE
* state
)
1456 Py_XDECREF(state
->string
);
1461 state_getslice(SRE_STATE
* state
, int index
, PyObject
* string
)
1465 index
= (index
- 1) * 2;
1467 if (string
== Py_None
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1470 i
= ((char*)state
->mark
[index
] - (char*)state
->beginning
) /
1472 j
= ((char*)state
->mark
[index
+1] - (char*)state
->beginning
) /
1476 return PySequence_GetSlice(string
, i
, j
);
/* pattern_error: translate an engine status code into a Python error.
   The visible cases are SRE_ERROR_RECURSION_LIMIT (message "maximum
   recursion limit exceeded"), SRE_ERROR_MEMORY, and a fallback for all
   other codes.  NOTE(review): the exception types raised are on lines
   that are not visible in this listing. */
1480 pattern_error(int status
)
1483 case SRE_ERROR_RECURSION_LIMIT
:
1486 "maximum recursion limit exceeded"
1489 case SRE_ERROR_MEMORY
:
1493 /* other error codes indicate compiler/engine bugs */
1496 "internal error in regular expression engine"
/* pattern_new_match: turn the outcome of a match/search run into a Python
   result.

   pattern -- the pattern object that was executed
   state   -- search state after the run (marks, ptr, start, ...)
   status  -- engine status code

   In the success branch a MatchObject is allocated and filled in from
   `state`, then returned.  A status of 0 has its own branch (body not
   visible in this listing); remaining codes go to pattern_error(). */
1502 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
1504 /* create match object (from state object) */
1513 /* create match object (with room for extra group marks) */
/* two mark slots (start, end) per group, plus one pair for the whole match */
1514 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
1515 2*(pattern
->groups
+1));
1520 match
->pattern
= pattern
;
/* the match keeps its own reference to the subject string */
1522 Py_INCREF(state
->string
);
1523 match
->string
= state
->string
;
/* match->groups counts the whole match as well as the pattern's groups */
1526 match
->groups
= pattern
->groups
+1;
1528 /* fill in group slices */
/* marks in the state are raw pointers; the match stores them as indexes,
   i.e. byte distance from the buffer start divided by the character size */
1530 base
= (char*) state
->beginning
;
1531 n
= state
->charsize
;
/* slots 0/1: span of the whole match (state->start .. state->ptr) */
1533 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
1534 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
/* i walks the groups, j walks the state's mark pairs; match->mark is
   shifted by 2 because of the whole-match pair stored above */
1536 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
/* a group is defined only if both of its marks are within lastmark and
   non-NULL */
1537 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
1538 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
1539 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
1541 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
1543 match
->pos
= state
->pos
;
1544 match
->endpos
= state
->endpos
;
1546 match
->lastindex
= state
->lastindex
;
1548 return (PyObject
*) match
;
1550 } else if (status
== 0) {
1558 /* internal error */
1559 pattern_error(status
);
/* pattern_scanner: implement pattern.scanner(string[, start[, end]]).
   Creates a ScannerObject whose embedded search state is initialised over
   `string` by state_init(), stores the pattern in it and returns it. */
1564 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
1566 /* create search state object */
1568 ScannerObject
* self
;
/* one required object plus two optional ints (start, end) */
1573 if (!PyArg_ParseTuple(args
, "O|ii:scanner", &string
, &start
, &end
))
1576 /* create scanner object */
1577 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
/* state_init's return value replaces `string`; the check on it is not
   visible in this listing */
1581 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
1588 self
->pattern
= (PyObject
*) pattern
;
1590 return (PyObject
*) self
;
/* pattern_dealloc: destructor for pattern objects.  Releases the three
   object references held by the pattern (each may be NULL, hence
   Py_XDECREF).  NOTE(review): the statement that frees the object itself
   is not visible in this listing. */
1594 pattern_dealloc(PatternObject
* self
)
1596 Py_XDECREF(self
->pattern
);
1597 Py_XDECREF(self
->groupindex
);
1598 Py_XDECREF(self
->indexgroup
);
/* pattern_match: implement pattern.match(string[, pos[, endpos]]).
   Initialises a search state over the subject, runs the matcher from the
   start position and wraps the outcome with pattern_new_match(). */
1603 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
/* note: the subject string's keyword name is "pattern" here */
1611 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1612 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:match", kwlist
,
1613 &string
, &start
, &end
))
1616 string
= state_init(&state
, self
, string
, start
, end
);
/* begin matching at the (adjusted) start position */
1620 state
.ptr
= state
.start
;
1622 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
/* one-byte characters use sre_match; the HAVE_UNICODE build also has
   sre_umatch for wider characters */
1624 if (state
.charsize
== 1) {
1625 status
= sre_match(&state
, PatternObject_GetCode(self
), 1);
1627 #if defined(HAVE_UNICODE)
1628 status
= sre_umatch(&state
, PatternObject_GetCode(self
), 1);
1632 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1636 return pattern_new_match(self
, &state
, status
);
/* pattern_search: implement pattern.search(string[, pos[, endpos]]).
   Same shape as pattern_match, but runs the search primitives
   (sre_search / sre_usearch) instead of the anchored matcher. */
1640 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
/* note: the subject string's keyword name is "pattern" here */
1648 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1649 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:search", kwlist
,
1650 &string
, &start
, &end
))
1653 string
= state_init(&state
, self
, string
, start
, end
);
1657 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
/* pick the engine variant that matches the character width */
1659 if (state
.charsize
== 1) {
1660 status
= sre_search(&state
, PatternObject_GetCode(self
));
1662 #if defined(HAVE_UNICODE)
1663 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
1667 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1671 return pattern_new_match(self
, &state
, status
);
/* call: invoke a function implemented in the Python half of the module.

   function -- attribute name to look up on the imported module
   args     -- argument tuple handed to PyObject_CallObject

   NOTE(review): error checks and reference releases for name/module/
   func/args are on lines not visible in this listing -- confirm that
   `args` is released on every path and that a NULL `args` is handled. */
1675 call(char* function
, PyObject
* args
)
/* import the module whose name is SRE_MODULE */
1682 name
= PyString_FromString(SRE_MODULE
);
1685 module
= PyImport_Import(name
);
1689 func
= PyObject_GetAttrString(module
, function
);
1693 result
= PyObject_CallObject(func
, args
);
/* pattern_sub: implement pattern.sub(repl, string[, count]) by forwarding
   (self, template, string, count) to the Python-level "_sub" helper. */
1700 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
/* default count when the caller passes none */
1704 PyObject
* count
= Py_False
; /* zero */
1705 static char* kwlist
[] = { "repl", "string", "count", NULL
};
1706 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|O:sub", kwlist
,
1707 &template, &string
, &count
))
1710 /* delegate to Python code */
/* NOTE(review): the Py_BuildValue result is passed on unchecked and may
   be NULL on failure -- confirm call() copes with that */
1711 return call("_sub", Py_BuildValue("OOOO", self
, template, string
, count
));
/* pattern_subn: implement pattern.subn(repl, string[, count]) by
   forwarding (self, template, string, count) to the Python-level "_subn"
   helper. */
1715 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
/* default count when the caller passes none */
1719 PyObject
* count
= Py_False
; /* zero */
1720 static char* kwlist
[] = { "repl", "string", "count", NULL
};
1721 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|O:subn", kwlist
,
1722 &template, &string
, &count
))
1725 /* delegate to Python code */
/* NOTE(review): the Py_BuildValue result is passed on unchecked and may
   be NULL on failure -- confirm call() copes with that */
1726 return call("_subn", Py_BuildValue("OOOO", self
, template, string
, count
));
/* pattern_split: implement pattern.split(source[, maxsplit]) by
   forwarding (self, string, maxsplit) to the Python-level "_split"
   helper. */
1730 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
/* default maxsplit when the caller passes none */
1733 PyObject
* maxsplit
= Py_False
; /* zero */
1734 static char* kwlist
[] = { "source", "maxsplit", NULL
};
1735 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|O:split", kwlist
,
1736 &string
, &maxsplit
))
1739 /* delegate to Python code */
/* NOTE(review): the Py_BuildValue result is passed on unchecked and may
   be NULL on failure -- confirm call() copes with that */
1740 return call("_split", Py_BuildValue("OOO", self
, string
, maxsplit
));
/* pattern_findall: implement pattern.findall(source[, pos[, endpos]]).
   Repeatedly searches from state.start, appends one item per hit to a
   list and advances state.start past the hit.  The item built depends on
   the pattern's group count (see the switch below). */
1744 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1754 static char* kwlist
[] = { "source", "pos", "endpos", NULL
};
1755 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:findall", kwlist
,
1756 &string
, &start
, &end
))
1759 string
= state_init(&state
, self
, string
, start
, end
);
1763 list
= PyList_New(0);
/* "<=" lets the search also run with start positioned at the very end */
1765 while (state
.start
<= state
.end
) {
/* clear per-search data before each new search */
1769 state_reset(&state
);
1771 state
.ptr
= state
.start
;
/* pick the engine variant that matches the character width */
1773 if (state
.charsize
== 1) {
1774 status
= sre_search(&state
, PatternObject_GetCode(self
));
1776 #if defined(HAVE_UNICODE)
1777 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
1783 /* don't bother to build a match object */
/* the case labels are not visible in this listing; the three bodies
   build: a slice of the whole match, the slice of group 1, or a tuple
   holding the slice of every group */
1784 switch (self
->groups
) {
1786 item
= PySequence_GetSlice(
1788 ((char*) state
.start
- (char*) state
.beginning
) /
1790 ((char*) state
.ptr
- (char*) state
.beginning
) /
1796 item
= state_getslice(&state
, 1, string
);
1801 item
= PyTuple_New(self
->groups
);
1804 for (i
= 0; i
< self
->groups
; i
++) {
/* state_getslice takes 1-based group numbers */
1805 PyObject
* o
= state_getslice(&state
, i
+1, string
);
1810 PyTuple_SET_ITEM(item
, i
, o
);
1815 status
= PyList_Append(list
, item
);
/* an empty match (ptr did not move) would repeat forever, so step one
   character forward; otherwise continue right after the match */
1821 if (state
.ptr
== state
.start
)
1822 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
1824 state
.start
= state
.ptr
;
1831 pattern_error(status
);
/* Method table for pattern objects, looked up by pattern_getattr via
   Py_FindMethod.  All entries accept keyword arguments except "scanner",
   which is positional-only. */
1847 static PyMethodDef pattern_methods
[] = {
1848 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
},
1849 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
},
1850 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
},
1851 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
},
1852 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
},
1853 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
},
1855 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
/* pattern_getattr: attribute lookup for pattern objects.  Methods are
   tried first; then the data attributes "pattern", "flags", "groups" and
   "groupindex".  Anything else raises AttributeError carrying the
   requested name. */
1860 pattern_getattr(PatternObject
* self
, char* name
)
1864 res
= Py_FindMethod(pattern_methods
, (PyObject
*) self
, name
);
/* object-valued attributes are returned as new references */
1872 if (!strcmp(name
, "pattern")) {
1873 Py_INCREF(self
->pattern
);
1874 return self
->pattern
;
1877 if (!strcmp(name
, "flags"))
1878 return Py_BuildValue("i", self
->flags
);
1880 if (!strcmp(name
, "groups"))
1881 return Py_BuildValue("i", self
->groups
);
/* "groupindex" is only served when the pattern actually has one;
   otherwise the lookup falls through to the AttributeError below */
1883 if (!strcmp(name
, "groupindex") && self
->groupindex
) {
1884 Py_INCREF(self
->groupindex
);
1885 return self
->groupindex
;
1888 PyErr_SetString(PyExc_AttributeError
, name
);
/* Type object for compiled patterns.  The two sizes are the fixed object
   size and the per-item size (one SRE_CODE each); ob_type is left NULL
   here and patched during module initialisation. */
1892 statichere PyTypeObject Pattern_Type
= {
1893 PyObject_HEAD_INIT(NULL
)
1895 sizeof(PatternObject
), sizeof(SRE_CODE
),
1896 (destructor
)pattern_dealloc
, /*tp_dealloc*/
1898 (getattrfunc
)pattern_getattr
/*tp_getattr*/
1901 /* -------------------------------------------------------------------- */
/* match_dealloc: destructor for match objects.  regs and string may be
   NULL (Py_XDECREF); the pattern reference is released with Py_DECREF,
   which requires it to be non-NULL.  NOTE(review): the statement that
   frees the object itself is not visible in this listing. */
1905 match_dealloc(MatchObject
* self
)
1907 Py_XDECREF(self
->regs
);
1908 Py_XDECREF(self
->string
);
1909 Py_DECREF(self
->pattern
);
/* match_getslice_by_index: return the text of group `index` as a slice
   of the match's subject string, or the caller's default `def` when the
   string is Py_None or the group did not participate.

   NOTE(review): sibling accessors read mark[index*2] / mark[index*2+1],
   while this function reads mark[index] / mark[index+1]; the statement
   that scales `index` is presumably on a line missing from this listing
   -- confirm against the full file. */
1914 match_getslice_by_index(MatchObject
* self
, int index
, PyObject
* def
)
/* valid group numbers are 0 .. self->groups-1 */
1916 if (index
< 0 || index
>= self
->groups
) {
1917 /* raise IndexError if we were given a bad group number */
/* a negative start mark flags an undefined group */
1927 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
1928 /* return default value if the string or group is undefined */
1933 return PySequence_GetSlice(
1934 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
/* match_getindex: convert a group designator into a group number.
   Integers are returned directly; other objects are looked up in the
   pattern's groupindex mapping (when present) and used if the mapped
   value is an integer.  NOTE(review): the value returned when no
   integer is found is set on a line not visible in this listing. */
1939 match_getindex(MatchObject
* self
, PyObject
* index
)
1943 if (PyInt_Check(index
))
1944 return (int) PyInt_AS_LONG(index
);
1948 if (self
->pattern
->groupindex
) {
/* `index` is reused to hold the looked-up value from here on */
1949 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
1951 if (PyInt_Check(index
))
1952 i
= (int) PyInt_AS_LONG(index
);
/* match_getslice: resolve a group designator (number or name) with
   match_getindex and return that group's text via
   match_getslice_by_index, passing `def` through as the default. */
1962 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
1964 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
/* match_expand: implement match.expand(template).  Takes one argument
   and hands (pattern, match, template) to Python code.  NOTE(review):
   the name of the Python helper being called is on a line not visible
   in this listing. */
1968 match_expand(MatchObject
* self
, PyObject
* args
)
1971 if (!PyArg_ParseTuple(args
, "O:expand", &template))
1974 /* delegate to Python code */
1977 Py_BuildValue("OOO", self
->pattern
, self
, template)
/* match_group: implement match.group([group1, ...]).  Three result
   shapes are built below: group 0 (Py_False is used as the integer
   zero), the single requested group, or a tuple with one entry per
   argument.  Undefined groups yield Py_None.  NOTE(review): the
   switch/case lines selecting between them -- presumably on `size` --
   are not visible in this listing. */
1982 match_group(MatchObject
* self
, PyObject
* args
)
1987 size
= PyTuple_GET_SIZE(args
);
/* whole match */
1991 result
= match_getslice(self
, Py_False
, Py_None
);
/* exactly one group requested */
1994 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
1997 /* fetch multiple items */
1998 result
= PyTuple_New(size
);
2001 for (i
= 0; i
< size
; i
++) {
2002 PyObject
* item
= match_getslice(
2003 self
, PyTuple_GET_ITEM(args
, i
), Py_None
/* PyTuple_SET_ITEM takes over the reference to `item` */
2009 PyTuple_SET_ITEM(result
, i
, item
);
/* match_groups: implement match.groups([default]).  Returns a tuple
   with the text of every group except group 0; groups that did not
   participate are replaced by `default` (Py_None unless given). */
2017 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
2022 PyObject
* def
= Py_None
;
2023 static char* kwlist
[] = { "default", NULL
};
2024 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
/* self->groups counts the whole match too, hence the -1 */
2027 result
= PyTuple_New(self
->groups
-1);
/* start at 1 to skip the whole match */
2031 for (index
= 1; index
< self
->groups
; index
++) {
2033 item
= match_getslice_by_index(self
, index
, def
);
2038 PyTuple_SET_ITEM(result
, index
-1, item
);
/* match_groupdict: implement match.groupdict([default]).  Builds a dict
   that maps each key of the pattern's groupindex to that group's text;
   groups that did not participate get `default` (Py_None unless
   given). */
2045 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
2051 PyObject
* def
= Py_None
;
2052 static char* kwlist
[] = { "default", NULL
};
2053 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
2056 result
= PyDict_New();
/* early exit when allocation failed or the pattern has no groupindex
   (the returned value is on a line not visible in this listing) */
2057 if (!result
|| !self
->pattern
->groupindex
)
2060 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
2064 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
/* PyList_GET_ITEM yields a borrowed reference */
2068 key
= PyList_GET_ITEM(keys
, index
);
2071 value
= match_getslice(self
, key
, def
);
2076 status
= PyDict_SetItem(result
, key
, value
);
/* match_start: implement match.start([group]).  Returns the start index
   of the given group (default: group 0, spelled Py_False), or -1 when
   the group did not participate. */
2093 match_start(MatchObject
* self
, PyObject
* args
)
2097 PyObject
* index_
= Py_False
; /* zero */
2098 if (!PyArg_ParseTuple(args
, "|O:start", &index_
))
2101 index
= match_getindex(self
, index_
);
/* out-of-range group numbers are rejected (error raised on lines not
   visible in this listing) */
2103 if (index
< 0 || index
>= self
->groups
) {
2111 /* mark is -1 if group is undefined */
/* even slots of mark[] hold group start positions */
2112 return Py_BuildValue("i", self
->mark
[index
*2]);
/* match_end: implement match.end([group]).  Returns the end index of the
   given group (default: group 0, spelled Py_False), or -1 when the group
   did not participate. */
2116 match_end(MatchObject
* self
, PyObject
* args
)
2120 PyObject
* index_
= Py_False
; /* zero */
2121 if (!PyArg_ParseTuple(args
, "|O:end", &index_
))
2124 index
= match_getindex(self
, index_
);
/* out-of-range group numbers are rejected (error raised on lines not
   visible in this listing) */
2126 if (index
< 0 || index
>= self
->groups
) {
2134 /* mark is -1 if group is undefined */
/* odd slots of mark[] hold group end positions */
2135 return Py_BuildValue("i", self
->mark
[index
*2+1]);
/* _pair: build the 2-tuple (i1, i2) of Python ints.  NOTE(review): the
   failure handling between the steps is on lines not visible in this
   listing. */
2139 _pair(int i1
, int i2
)
2144 pair
= PyTuple_New(2);
2148 item
= PyInt_FromLong(i1
);
/* PyTuple_SET_ITEM takes over the reference to `item` */
2151 PyTuple_SET_ITEM(pair
, 0, item
);
2153 item
= PyInt_FromLong(i2
);
2156 PyTuple_SET_ITEM(pair
, 1, item
);
/* match_span: implement match.span([group]).  Returns the tuple
   (start, end) for the given group (default: group 0, spelled Py_False);
   both values are -1 when the group did not participate. */
2166 match_span(MatchObject
* self
, PyObject
* args
)
2170 PyObject
* index_
= Py_False
; /* zero */
2171 if (!PyArg_ParseTuple(args
, "|O:span", &index_
))
2174 index
= match_getindex(self
, index_
);
/* out-of-range group numbers are rejected (error raised on lines not
   visible in this listing) */
2176 if (index
< 0 || index
>= self
->groups
) {
2184 /* marks are -1 if group is undefined */
2185 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
/* match_regs: build the value of the "regs" attribute: a tuple with one
   (start, end) pair per group, group 0 included.  NOTE(review):
   match_getattr returns self->regs when it is already set; the line
   that stores the new tuple there is not visible in this listing. */
2189 match_regs(MatchObject
* self
)
2195 regs
= PyTuple_New(self
->groups
);
2199 for (index
= 0; index
< self
->groups
; index
++) {
2200 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
/* PyTuple_SET_ITEM takes over the reference to `item` */
2205 PyTuple_SET_ITEM(regs
, index
, item
);
/* Method table for match objects, looked up by match_getattr via
   Py_FindMethod.  Only "groups" and "groupdict" accept keyword
   arguments. */
2214 static PyMethodDef match_methods
[] = {
2215 {"group", (PyCFunction
) match_group
, METH_VARARGS
},
2216 {"start", (PyCFunction
) match_start
, METH_VARARGS
},
2217 {"end", (PyCFunction
) match_end
, METH_VARARGS
},
2218 {"span", (PyCFunction
) match_span
, METH_VARARGS
},
2219 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
},
2220 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
},
2221 {"expand", (PyCFunction
) match_expand
, METH_VARARGS
},
/* match_getattr: attribute lookup for match objects.  Methods are tried
   first; then the data attributes "lastindex", "lastgroup", "string",
   "regs", "re", "pos" and "endpos".  Anything else raises
   AttributeError carrying the requested name. */
2226 match_getattr(MatchObject
* self
, char* name
)
2230 res
= Py_FindMethod(match_methods
, (PyObject
*) self
, name
);
/* a negative lastindex means "no group"; what is returned in that case
   is on lines not visible in this listing */
2236 if (!strcmp(name
, "lastindex")) {
2237 if (self
->lastindex
>= 0)
2238 return Py_BuildValue("i", self
->lastindex
);
/* lastgroup: the entry of the pattern's indexgroup sequence at position
   lastindex, when both are available */
2243 if (!strcmp(name
, "lastgroup")) {
2244 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
2245 PyObject
* result
= PySequence_GetItem(
2246 self
->pattern
->indexgroup
, self
->lastindex
2256 if (!strcmp(name
, "string")) {
2258 Py_INCREF(self
->string
);
2259 return self
->string
;
/* regs: the existing self->regs gets a new reference; match_regs()
   builds the tuple otherwise (the selecting condition is not visible in
   this listing) */
2266 if (!strcmp(name
, "regs")) {
2268 Py_INCREF(self
->regs
);
2271 return match_regs(self
);
2274 if (!strcmp(name
, "re")) {
2275 Py_INCREF(self
->pattern
);
2276 return (PyObject
*) self
->pattern
;
2279 if (!strcmp(name
, "pos"))
2280 return Py_BuildValue("i", self
->pos
);
2282 if (!strcmp(name
, "endpos"))
2283 return Py_BuildValue("i", self
->endpos
);
2285 PyErr_SetString(PyExc_AttributeError
, name
);
2289 /* FIXME: implement setattr("string", None) as a special case (to
2290 detach the associated string, if any) */
/* Type object for match objects.  The two sizes are the fixed object
   size and the per-item size (sizeof(int), matching the ints stored in
   mark[]); ob_type is left NULL here and patched during module
   initialisation. */
2292 statichere PyTypeObject Match_Type
= {
2293 PyObject_HEAD_INIT(NULL
)
2295 sizeof(MatchObject
), sizeof(int),
2296 (destructor
)match_dealloc
, /*tp_dealloc*/
2298 (getattrfunc
)match_getattr
/*tp_getattr*/
2301 /* -------------------------------------------------------------------- */
2302 /* scanner methods (experimental) */
/* scanner_dealloc: destructor for scanner objects.  Finalises the
   embedded search state and drops the reference to the pattern.
   NOTE(review): the statement that frees the object itself is not
   visible in this listing. */
2305 scanner_dealloc(ScannerObject
* self
)
2307 state_fini(&self
->state
);
2308 Py_DECREF(self
->pattern
);
/* scanner_match: implement scanner.match().  Runs the anchored matcher
   at the scanner's current start position, builds the result with
   pattern_new_match() and then advances the start position for the next
   call. */
2313 scanner_match(ScannerObject
* self
, PyObject
* args
)
2315 SRE_STATE
* state
= &self
->state
;
2321 state
->ptr
= state
->start
;
/* pick the engine variant that matches the character width */
2323 if (state
->charsize
== 1) {
2324 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
), 1);
2326 #if defined(HAVE_UNICODE)
2327 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
), 1);
2331 match
= pattern_new_match((PatternObject
*) self
->pattern
,
/* on no match, or on an empty match, step one character forward so the
   next call makes progress; otherwise resume right after the match */
2334 if (status
== 0 || state
->ptr
== state
->start
)
2335 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
2337 state
->start
= state
->ptr
;
/* scanner_search: implement scanner.search().  Same shape as
   scanner_match, but runs the search primitives (sre_search /
   sre_usearch) from the scanner's current start position. */
2344 scanner_search(ScannerObject
* self
, PyObject
* args
)
2346 SRE_STATE
* state
= &self
->state
;
2352 state
->ptr
= state
->start
;
/* pick the engine variant that matches the character width */
2354 if (state
->charsize
== 1) {
2355 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
2357 #if defined(HAVE_UNICODE)
2358 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
2362 match
= pattern_new_match((PatternObject
*) self
->pattern
,
/* on no match, or on an empty match, step one character forward so the
   next call makes progress; otherwise resume right after the match */
2365 if (status
== 0 || state
->ptr
== state
->start
)
2366 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
2368 state
->start
= state
->ptr
;
/* Method table for scanner objects, looked up by scanner_getattr via
   Py_FindMethod.  Both entries are registered with flags value 0, unlike
   the METH_VARARGS entries in the pattern and match tables. */
2373 static PyMethodDef scanner_methods
[] = {
2374 {"match", (PyCFunction
) scanner_match
, 0},
2375 {"search", (PyCFunction
) scanner_search
, 0},
/* scanner_getattr: attribute lookup for scanner objects.  Methods are
   tried first; the only data attribute is "pattern", returned as a new
   reference.  Anything else raises AttributeError carrying the requested
   name. */
2380 scanner_getattr(ScannerObject
* self
, char* name
)
2384 res
= Py_FindMethod(scanner_methods
, (PyObject
*) self
, name
);
2391 if (!strcmp(name
, "pattern")) {
2392 Py_INCREF(self
->pattern
);
2393 return self
->pattern
;
2396 PyErr_SetString(PyExc_AttributeError
, name
);
/* Type object for scanner objects.  Fixed-size object (item size 0);
   ob_type is left NULL here and patched during module initialisation. */
2400 statichere PyTypeObject Scanner_Type
= {
2401 PyObject_HEAD_INIT(NULL
)
2403 sizeof(ScannerObject
), 0,
2404 (destructor
)scanner_dealloc
, /*tp_dealloc*/
2406 (getattrfunc
)scanner_getattr
, /*tp_getattr*/
/* Module-level function table handed to Py_InitModule: "compile",
   "getcodesize" and "getlower".  The literal flag value 1 is used for
   every entry (numerically the METH_VARARGS convention -- confirm
   against the Python headers in use). */
2409 static PyMethodDef _functions
[] = {
2410 {"compile", _compile
, 1},
2411 {"getcodesize", sre_codesize
, 1},
2412 {"getlower", sre_getlower
, 1},
/* Body of the module initialisation function (its header and local
   declarations are not visible in this listing).  It completes the three
   static type objects, creates the module named "_" SRE_MODULE and
   publishes the "MAGIC" and "copyright" values in the module dict. */
2422 /* Patch object types */
/* the type objects were declared with a NULL ob_type; fill it in now */
2423 Pattern_Type
.ob_type
= Match_Type
.ob_type
=
2424 Scanner_Type
.ob_type
= &PyType_Type
;
2426 m
= Py_InitModule("_" SRE_MODULE
, _functions
);
2427 d
= PyModule_GetDict(m
);
/* NOTE(review): PyDict_SetItemString does not take ownership of the
   value, so the objects created inline below are never released as
   shown, and a NULL result from the constructors is passed on
   unchecked -- confirm and fix */
2429 PyDict_SetItemString(
2430 d
, "MAGIC", (PyObject
*) PyInt_FromLong(SRE_MAGIC
)
2433 PyDict_SetItemString(
2434 d
, "copyright", (PyObject
*) PyString_FromString(copyright
)
2439 #endif /* !defined(SRE_RECURSIVE) */