2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-06-30 fl added fast search optimization
10 * 2000-06-30 fl added assert (lookahead) primitives, etc
11 * 2000-07-02 fl added charset optimizations, etc
12 * 2000-07-03 fl store code in pattern object, lookbehind, etc
13 * 2000-07-08 fl added regs attribute
14 * 2000-07-21 fl reset lastindex in scanner methods
15 * 2000-08-01 fl fixes for 1.6b1
16 * 2000-08-03 fl added recursion limit
17 * 2000-08-07 fl use PyOS_CheckStack() if available
18 * 2000-08-08 fl changed findall to return empty strings instead of None
19 * 2000-08-27 fl properly propagate memory errors
20 * 2000-09-02 fl return -1 instead of None for start/end/span
21 * 2000-09-20 fl added expand method
22 * 2000-09-21 fl don't use the buffer interface for unicode strings
23 * 2000-10-03 fl fixed assert_not primitive; support keyword arguments
24 * 2000-10-24 fl really fixed assert_not; reset groups in findall
25 * 2000-12-21 fl fixed memory leak in groupdict
26 * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
27 * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
28 * 2001-01-16 fl fixed memory leak in pattern destructor
29 * 2001-03-20 fl lots of fixes for 2.1b2
30 * 2001-04-15 fl export copyright as Python attribute, not global
31 * 2001-04-28 fl added __copy__ methods (work in progress)
32 * 2001-05-14 fl fixes for 1.5.2
33 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
35 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
37 * This version of the SRE library can be redistributed under CNRI's
38 * Python 1.6 license. For any other use, please contact Secret Labs
39 * AB (info@pythonware.com).
41 * Portions of this engine have been developed in cooperation with
42 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
43 * other compatibility work.
48 static char copyright
[] =
49 " SRE 2.1.1 Copyright (c) 1997-2001 by Secret Labs AB ";
52 #include "structmember.h" /* offsetof */
58 /* name of this module, minus the leading underscore */
59 #if !defined(SRE_MODULE)
60 #define SRE_MODULE "sre"
63 /* defining this one enables tracing */
66 #if PY_VERSION_HEX >= 0x01060000
67 /* defining this enables unicode support (default under 1.6a1 and later) */
71 /* -------------------------------------------------------------------- */
72 /* optional features */
74 /* prevent run-away recursion (bad patterns on long strings) */
76 #if !defined(USE_STACKCHECK)
77 #if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
78 /* require smaller recursion limit for a number of 64-bit platforms:
79 Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
80 /* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
81 #define USE_RECURSION_LIMIT 7500
83 #define USE_RECURSION_LIMIT 10000
87 /* enables fast searching */
88 #define USE_FAST_SEARCH
90 /* enables aggressive inlining (always on for Visual C) */
93 /* enables copy/deepcopy handling (work in progress) */
94 #undef USE_BUILTIN_COPY
96 #if PY_VERSION_HEX < 0x01060000
97 #define PyObject_DEL(op) PyMem_DEL((op))
100 /* -------------------------------------------------------------------- */
102 #if defined(_MSC_VER)
103 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
104 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
105 /* fastest possible local call under MSVC */
106 #define LOCAL(type) static __inline type __fastcall
107 #elif defined(USE_INLINE)
108 #define LOCAL(type) static inline type
110 #define LOCAL(type) static type
114 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
115 #define SRE_ERROR_STATE -2 /* illegal state */
116 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
117 #define SRE_ERROR_MEMORY -9 /* out of memory */
120 #define TRACE(v) printf v
125 /* -------------------------------------------------------------------- */
126 /* search engine state */
128 /* default character predicates (run sre_chars.py to regenerate tables) */
130 #define SRE_DIGIT_MASK 1
131 #define SRE_SPACE_MASK 2
132 #define SRE_LINEBREAK_MASK 4
133 #define SRE_ALNUM_MASK 8
134 #define SRE_WORD_MASK 16
136 static char sre_char_info
[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
137 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
139 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
140 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
141 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
142 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
144 static char sre_char_lower
[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
145 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
146 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
147 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
148 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
149 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
150 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
151 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
152 120, 121, 122, 123, 124, 125, 126, 127 };
154 #define SRE_IS_DIGIT(ch)\
155 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
156 #define SRE_IS_SPACE(ch)\
157 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
158 #define SRE_IS_LINEBREAK(ch)\
159 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
160 #define SRE_IS_ALNUM(ch)\
161 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
162 #define SRE_IS_WORD(ch)\
163 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
165 static unsigned int sre_lower(unsigned int ch
)
167 return ((ch
) < 128 ? sre_char_lower
[ch
] : ch
);
170 /* locale-specific character predicates */
172 #define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
173 #define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
174 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
175 #define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
176 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
178 static unsigned int sre_lower_locale(unsigned int ch
)
180 return ((ch
) < 256 ? tolower((ch
)) : ch
);
183 /* unicode-specific character predicates */
185 #if defined(HAVE_UNICODE)
187 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
188 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
189 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
190 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
191 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
193 static unsigned int sre_lower_unicode(unsigned int ch
)
195 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE
)(ch
));
201 sre_category(SRE_CODE category
, unsigned int ch
)
205 case SRE_CATEGORY_DIGIT
:
206 return SRE_IS_DIGIT(ch
);
207 case SRE_CATEGORY_NOT_DIGIT
:
208 return !SRE_IS_DIGIT(ch
);
209 case SRE_CATEGORY_SPACE
:
210 return SRE_IS_SPACE(ch
);
211 case SRE_CATEGORY_NOT_SPACE
:
212 return !SRE_IS_SPACE(ch
);
213 case SRE_CATEGORY_WORD
:
214 return SRE_IS_WORD(ch
);
215 case SRE_CATEGORY_NOT_WORD
:
216 return !SRE_IS_WORD(ch
);
217 case SRE_CATEGORY_LINEBREAK
:
218 return SRE_IS_LINEBREAK(ch
);
219 case SRE_CATEGORY_NOT_LINEBREAK
:
220 return !SRE_IS_LINEBREAK(ch
);
222 case SRE_CATEGORY_LOC_WORD
:
223 return SRE_LOC_IS_WORD(ch
);
224 case SRE_CATEGORY_LOC_NOT_WORD
:
225 return !SRE_LOC_IS_WORD(ch
);
227 #if defined(HAVE_UNICODE)
228 case SRE_CATEGORY_UNI_DIGIT
:
229 return SRE_UNI_IS_DIGIT(ch
);
230 case SRE_CATEGORY_UNI_NOT_DIGIT
:
231 return !SRE_UNI_IS_DIGIT(ch
);
232 case SRE_CATEGORY_UNI_SPACE
:
233 return SRE_UNI_IS_SPACE(ch
);
234 case SRE_CATEGORY_UNI_NOT_SPACE
:
235 return !SRE_UNI_IS_SPACE(ch
);
236 case SRE_CATEGORY_UNI_WORD
:
237 return SRE_UNI_IS_WORD(ch
);
238 case SRE_CATEGORY_UNI_NOT_WORD
:
239 return !SRE_UNI_IS_WORD(ch
);
240 case SRE_CATEGORY_UNI_LINEBREAK
:
241 return SRE_UNI_IS_LINEBREAK(ch
);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
243 return !SRE_UNI_IS_LINEBREAK(ch
);
245 case SRE_CATEGORY_UNI_DIGIT
:
246 return SRE_IS_DIGIT(ch
);
247 case SRE_CATEGORY_UNI_NOT_DIGIT
:
248 return !SRE_IS_DIGIT(ch
);
249 case SRE_CATEGORY_UNI_SPACE
:
250 return SRE_IS_SPACE(ch
);
251 case SRE_CATEGORY_UNI_NOT_SPACE
:
252 return !SRE_IS_SPACE(ch
);
253 case SRE_CATEGORY_UNI_WORD
:
254 return SRE_LOC_IS_WORD(ch
);
255 case SRE_CATEGORY_UNI_NOT_WORD
:
256 return !SRE_LOC_IS_WORD(ch
);
257 case SRE_CATEGORY_UNI_LINEBREAK
:
258 return SRE_IS_LINEBREAK(ch
);
259 case SRE_CATEGORY_UNI_NOT_LINEBREAK
:
260 return !SRE_IS_LINEBREAK(ch
);
269 mark_fini(SRE_STATE
* state
)
271 if (state
->mark_stack
) {
272 free(state
->mark_stack
);
273 state
->mark_stack
= NULL
;
275 state
->mark_stack_size
= state
->mark_stack_base
= 0;
279 mark_save(SRE_STATE
* state
, int lo
, int hi
)
283 int minsize
, newsize
;
288 size
= (hi
- lo
) + 1;
290 newsize
= state
->mark_stack_size
;
291 minsize
= state
->mark_stack_base
+ size
;
293 if (newsize
< minsize
) {
294 /* create new stack */
297 if (newsize
< minsize
)
299 TRACE(("allocate stack %d\n", newsize
));
300 stack
= malloc(sizeof(void*) * newsize
);
303 while (newsize
< minsize
)
305 TRACE(("grow stack to %d\n", newsize
));
306 stack
= realloc(state
->mark_stack
, sizeof(void*) * newsize
);
310 return SRE_ERROR_MEMORY
;
312 state
->mark_stack
= stack
;
313 state
->mark_stack_size
= newsize
;
316 TRACE(("copy %d:%d to %d (%d)\n", lo
, hi
, state
->mark_stack_base
, size
));
318 memcpy(state
->mark_stack
+ state
->mark_stack_base
, state
->mark
+ lo
,
319 size
* sizeof(void*));
321 state
->mark_stack_base
+= size
;
327 mark_restore(SRE_STATE
* state
, int lo
, int hi
)
334 size
= (hi
- lo
) + 1;
336 state
->mark_stack_base
-= size
;
338 TRACE(("copy %d:%d from %d\n", lo
, hi
, state
->mark_stack_base
));
340 memcpy(state
->mark
+ lo
, state
->mark_stack
+ state
->mark_stack_base
,
341 size
* sizeof(void*));
346 /* generate 8-bit version */
348 #define SRE_CHAR unsigned char
349 #define SRE_AT sre_at
350 #define SRE_COUNT sre_count
351 #define SRE_CHARSET sre_charset
352 #define SRE_INFO sre_info
353 #define SRE_MATCH sre_match
354 #define SRE_SEARCH sre_search
356 #if defined(HAVE_UNICODE)
358 #define SRE_RECURSIVE
370 /* generate 16-bit unicode version */
372 #define SRE_CHAR Py_UNICODE
373 #define SRE_AT sre_uat
374 #define SRE_COUNT sre_ucount
375 #define SRE_CHARSET sre_ucharset
376 #define SRE_INFO sre_uinfo
377 #define SRE_MATCH sre_umatch
378 #define SRE_SEARCH sre_usearch
381 #endif /* SRE_RECURSIVE */
383 /* -------------------------------------------------------------------- */
384 /* String matching engine */
386 /* the following section is compiled twice, with different character
390 SRE_AT(SRE_STATE
* state
, SRE_CHAR
* ptr
, SRE_CODE at
)
392 /* check if pointer is at given position */
398 case SRE_AT_BEGINNING
:
399 case SRE_AT_BEGINNING_STRING
:
400 return ((void*) ptr
== state
->beginning
);
402 case SRE_AT_BEGINNING_LINE
:
403 return ((void*) ptr
== state
->beginning
||
404 SRE_IS_LINEBREAK((int) ptr
[-1]));
407 return (((void*) (ptr
+1) == state
->end
&&
408 SRE_IS_LINEBREAK((int) ptr
[0])) ||
409 ((void*) ptr
== state
->end
));
411 case SRE_AT_END_LINE
:
412 return ((void*) ptr
== state
->end
||
413 SRE_IS_LINEBREAK((int) ptr
[0]));
415 case SRE_AT_END_STRING
:
416 return ((void*) ptr
== state
->end
);
418 case SRE_AT_BOUNDARY
:
419 if (state
->beginning
== state
->end
)
421 that
= ((void*) ptr
> state
->beginning
) ?
422 SRE_IS_WORD((int) ptr
[-1]) : 0;
423 this = ((void*) ptr
< state
->end
) ?
424 SRE_IS_WORD((int) ptr
[0]) : 0;
427 case SRE_AT_NON_BOUNDARY
:
428 if (state
->beginning
== state
->end
)
430 that
= ((void*) ptr
> state
->beginning
) ?
431 SRE_IS_WORD((int) ptr
[-1]) : 0;
432 this = ((void*) ptr
< state
->end
) ?
433 SRE_IS_WORD((int) ptr
[0]) : 0;
436 case SRE_AT_LOC_BOUNDARY
:
437 if (state
->beginning
== state
->end
)
439 that
= ((void*) ptr
> state
->beginning
) ?
440 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
441 this = ((void*) ptr
< state
->end
) ?
442 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
445 case SRE_AT_LOC_NON_BOUNDARY
:
446 if (state
->beginning
== state
->end
)
448 that
= ((void*) ptr
> state
->beginning
) ?
449 SRE_LOC_IS_WORD((int) ptr
[-1]) : 0;
450 this = ((void*) ptr
< state
->end
) ?
451 SRE_LOC_IS_WORD((int) ptr
[0]) : 0;
454 #if defined(HAVE_UNICODE)
455 case SRE_AT_UNI_BOUNDARY
:
456 if (state
->beginning
== state
->end
)
458 that
= ((void*) ptr
> state
->beginning
) ?
459 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
460 this = ((void*) ptr
< state
->end
) ?
461 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
464 case SRE_AT_UNI_NON_BOUNDARY
:
465 if (state
->beginning
== state
->end
)
467 that
= ((void*) ptr
> state
->beginning
) ?
468 SRE_UNI_IS_WORD((int) ptr
[-1]) : 0;
469 this = ((void*) ptr
< state
->end
) ?
470 SRE_UNI_IS_WORD((int) ptr
[0]) : 0;
480 SRE_CHARSET(SRE_CODE
* set
, SRE_CODE ch
)
482 /* check if character is a member of the given set */
490 /* <LITERAL> <code> */
497 /* <RANGE> <lower> <upper> */
498 if (set
[0] <= ch
&& ch
<= set
[1])
504 /* <CHARSET> <bitmap> (16 bits per code word) */
505 if (ch
< 256 && (set
[ch
>> 4] & (1 << (ch
& 15))))
510 case SRE_OP_BIGCHARSET
:
511 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
515 block
= ((unsigned char*)set
)[ch
>> 8];
517 if (set
[block
*16 + ((ch
& 255)>>4)] & (1 << (ch
& 15)))
523 case SRE_OP_CATEGORY
:
524 /* <CATEGORY> <code> */
525 if (sre_category(set
[0], (int) ch
))
538 /* internal error -- there's not much we can do about it
539 here, so let's just pretend it didn't match... */
545 LOCAL(int) SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
, int level
);
548 SRE_COUNT(SRE_STATE
* state
, SRE_CODE
* pattern
, int maxcount
, int level
)
551 SRE_CHAR
* ptr
= state
->ptr
;
552 SRE_CHAR
* end
= state
->end
;
556 if (maxcount
< end
- ptr
&& maxcount
!= 65535)
557 end
= ptr
+ maxcount
;
559 switch (pattern
[0]) {
562 /* repeated dot wildcard. */
563 TRACE(("|%p|%p|COUNT ANY\n", pattern
, ptr
));
564 while (ptr
< end
&& !SRE_IS_LINEBREAK(*ptr
))
569 /* repeated dot wildcare. skip to the end of the target
570 string, and backtrack from there */
571 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern
, ptr
));
576 /* repeated literal */
578 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern
, ptr
, chr
));
579 while (ptr
< end
&& (SRE_CODE
) *ptr
== chr
)
583 case SRE_OP_LITERAL_IGNORE
:
584 /* repeated literal */
586 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
587 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) == chr
)
591 case SRE_OP_NOT_LITERAL
:
592 /* repeated non-literal */
594 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern
, ptr
, chr
));
595 while (ptr
< end
&& (SRE_CODE
) *ptr
!= chr
)
599 case SRE_OP_NOT_LITERAL_IGNORE
:
600 /* repeated non-literal */
602 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, chr
));
603 while (ptr
< end
&& (SRE_CODE
) state
->lower(*ptr
) != chr
)
609 TRACE(("|%p|%p|COUNT IN\n", pattern
, ptr
));
610 while (ptr
< end
&& SRE_CHARSET(pattern
+ 2, *ptr
))
615 /* repeated single character pattern */
616 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern
, ptr
));
617 while ((SRE_CHAR
*) state
->ptr
< end
) {
618 i
= SRE_MATCH(state
, pattern
, level
);
624 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
,
625 (SRE_CHAR
*) state
->ptr
- ptr
));
626 return (SRE_CHAR
*) state
->ptr
- ptr
;
629 TRACE(("|%p|%p|COUNT %d\n", pattern
, ptr
, ptr
- (SRE_CHAR
*) state
->ptr
));
630 return ptr
- (SRE_CHAR
*) state
->ptr
;
633 #if 0 /* not used in this release */
635 SRE_INFO(SRE_STATE
* state
, SRE_CODE
* pattern
)
637 /* check if an SRE_OP_INFO block matches at the current position.
638 returns the number of SRE_CODE objects to skip if successful, 0
641 SRE_CHAR
* end
= state
->end
;
642 SRE_CHAR
* ptr
= state
->ptr
;
645 /* check minimal length */
646 if (pattern
[3] && (end
- ptr
) < pattern
[3])
649 /* check known prefix */
650 if (pattern
[2] & SRE_INFO_PREFIX
&& pattern
[5] > 1) {
651 /* <length> <skip> <prefix data> <overlap data> */
652 for (i
= 0; i
< pattern
[5]; i
++)
653 if ((SRE_CODE
) ptr
[i
] != pattern
[7 + i
])
655 return pattern
[0] + 2 * pattern
[6];
662 SRE_MATCH(SRE_STATE
* state
, SRE_CODE
* pattern
, int level
)
664 /* check if string matches the given pattern. returns <0 for
665 error, 0 for failure, and 1 for success */
667 SRE_CHAR
* end
= state
->end
;
668 SRE_CHAR
* ptr
= state
->ptr
;
674 SRE_REPEAT rep
; /* FIXME: <fl> allocate in STATE instead */
676 TRACE(("|%p|%p|ENTER %d\n", pattern
, ptr
, level
));
678 #if defined(USE_STACKCHECK)
679 if (level
% 10 == 0 && PyOS_CheckStack())
680 return SRE_ERROR_RECURSION_LIMIT
;
683 #if defined(USE_RECURSION_LIMIT)
684 if (level
> USE_RECURSION_LIMIT
)
685 return SRE_ERROR_RECURSION_LIMIT
;
688 if (pattern
[0] == SRE_OP_INFO
) {
689 /* optimization info block */
690 /* <INFO> <1=skip> <2=flags> <3=min> ... */
691 if (pattern
[3] && (end
- ptr
) < pattern
[3]) {
692 TRACE(("reject (got %d chars, need %d)\n",
693 (end
- ptr
), pattern
[3]));
696 pattern
+= pattern
[1] + 1;
701 switch (*pattern
++) {
704 /* immediate failure */
705 TRACE(("|%p|%p|FAILURE\n", pattern
, ptr
));
710 TRACE(("|%p|%p|SUCCESS\n", pattern
, ptr
));
715 /* match at given position */
717 TRACE(("|%p|%p|AT %d\n", pattern
, ptr
, *pattern
));
718 if (!SRE_AT(state
, ptr
, *pattern
))
723 case SRE_OP_CATEGORY
:
724 /* match at given category */
725 /* <CATEGORY> <code> */
726 TRACE(("|%p|%p|CATEGORY %d\n", pattern
, ptr
, *pattern
));
727 if (ptr
>= end
|| !sre_category(pattern
[0], ptr
[0]))
734 /* match literal string */
735 /* <LITERAL> <code> */
736 TRACE(("|%p|%p|LITERAL %d\n", pattern
, ptr
, *pattern
));
737 if (ptr
>= end
|| (SRE_CODE
) ptr
[0] != pattern
[0])
743 case SRE_OP_NOT_LITERAL
:
744 /* match anything that is not literal character */
745 /* <NOT_LITERAL> <code> */
746 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern
, ptr
, *pattern
));
747 if (ptr
>= end
|| (SRE_CODE
) ptr
[0] == pattern
[0])
754 /* match anything (except a newline) */
756 TRACE(("|%p|%p|ANY\n", pattern
, ptr
));
757 if (ptr
>= end
|| SRE_IS_LINEBREAK(ptr
[0]))
765 TRACE(("|%p|%p|ANY_ALL\n", pattern
, ptr
));
772 /* match set member (or non_member) */
773 /* <IN> <skip> <set> */
774 TRACE(("|%p|%p|IN\n", pattern
, ptr
));
775 if (ptr
>= end
|| !SRE_CHARSET(pattern
+ 1, *ptr
))
777 pattern
+= pattern
[0];
781 case SRE_OP_GROUPREF
:
782 /* match backreference */
783 TRACE(("|%p|%p|GROUPREF %d\n", pattern
, ptr
, pattern
[0]));
786 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[i
+i
];
787 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[i
+i
+1];
788 if (!p
|| !e
|| e
< p
)
791 if (ptr
>= end
|| *ptr
!= *p
)
799 case SRE_OP_GROUPREF_IGNORE
:
800 /* match backreference */
801 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern
, ptr
, pattern
[0]));
804 SRE_CHAR
* p
= (SRE_CHAR
*) state
->mark
[i
+i
];
805 SRE_CHAR
* e
= (SRE_CHAR
*) state
->mark
[i
+i
+1];
806 if (!p
|| !e
|| e
< p
)
810 state
->lower(*ptr
) != state
->lower(*p
))
818 case SRE_OP_LITERAL_IGNORE
:
819 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern
, ptr
, pattern
[0]));
821 state
->lower(*ptr
) != state
->lower(*pattern
))
827 case SRE_OP_NOT_LITERAL_IGNORE
:
828 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern
, ptr
, *pattern
));
830 state
->lower(*ptr
) == state
->lower(*pattern
))
836 case SRE_OP_IN_IGNORE
:
837 TRACE(("|%p|%p|IN_IGNORE\n", pattern
, ptr
));
839 || !SRE_CHARSET(pattern
+ 1, (SRE_CODE
) state
->lower(*ptr
)))
841 pattern
+= pattern
[0];
848 TRACE(("|%p|%p|MARK %d\n", pattern
, ptr
, pattern
[0]));
851 state
->lastindex
= i
/2 + 1;
852 if (i
> state
->lastmark
)
854 state
->mark
[i
] = ptr
;
861 /* <JUMP> <offset> */
862 TRACE(("|%p|%p|JUMP %d\n", pattern
, ptr
, pattern
[0]));
863 pattern
+= pattern
[0];
867 /* assert subpattern */
868 /* <ASSERT> <skip> <back> <pattern> */
869 TRACE(("|%p|%p|ASSERT %d\n", pattern
, ptr
, pattern
[1]));
870 state
->ptr
= ptr
- pattern
[1];
871 if (state
->ptr
< state
->beginning
)
873 i
= SRE_MATCH(state
, pattern
+ 2, level
+ 1);
876 pattern
+= pattern
[0];
879 case SRE_OP_ASSERT_NOT
:
880 /* assert not subpattern */
881 /* <ASSERT_NOT> <skip> <back> <pattern> */
882 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern
, ptr
, pattern
[1]));
883 state
->ptr
= ptr
- pattern
[1];
884 if (state
->ptr
>= state
->beginning
) {
885 i
= SRE_MATCH(state
, pattern
+ 2, level
+ 1);
891 pattern
+= pattern
[0];
896 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
897 TRACE(("|%p|%p|BRANCH\n", pattern
, ptr
));
898 lastmark
= state
->lastmark
;
899 for (; pattern
[0]; pattern
+= pattern
[0]) {
900 if (pattern
[1] == SRE_OP_LITERAL
&&
901 (ptr
>= end
|| (SRE_CODE
) *ptr
!= pattern
[2]))
903 if (pattern
[1] == SRE_OP_IN
&&
904 (ptr
>= end
|| !SRE_CHARSET(pattern
+ 3, (SRE_CODE
) *ptr
)))
907 i
= SRE_MATCH(state
, pattern
+ 1, level
+ 1);
910 if (state
->lastmark
> lastmark
) {
912 state
->mark
+ lastmark
+ 1, 0,
913 (state
->lastmark
- lastmark
) * sizeof(void*)
915 state
->lastmark
= lastmark
;
920 case SRE_OP_REPEAT_ONE
:
921 /* match repeated sequence (maximizing regexp) */
923 /* this operator only works if the repeated item is
924 exactly one character wide, and we're not already
925 collecting backtracking points. for other cases,
926 use the MAX_REPEAT operator */
928 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
930 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern
, ptr
,
931 pattern
[1], pattern
[2]));
933 if (ptr
+ pattern
[1] > end
)
934 return 0; /* cannot match */
938 count
= SRE_COUNT(state
, pattern
+ 3, pattern
[2], level
+ 1);
944 /* when we arrive here, count contains the number of
945 matches, and ptr points to the tail of the target
946 string. check if the rest of the pattern matches,
947 and backtrack if not. */
949 if (count
< (int) pattern
[1])
952 if (pattern
[pattern
[0]] == SRE_OP_SUCCESS
) {
953 /* tail is empty. we're finished */
957 } else if (pattern
[pattern
[0]] == SRE_OP_LITERAL
) {
958 /* tail starts with a literal. skip positions where
959 the rest of the pattern cannot possibly match */
960 chr
= pattern
[pattern
[0]+1];
962 while (count
>= (int) pattern
[1] &&
963 (ptr
>= end
|| *ptr
!= chr
)) {
967 if (count
< (int) pattern
[1])
970 i
= SRE_MATCH(state
, pattern
+ pattern
[0], level
+ 1);
979 lastmark
= state
->lastmark
;
980 while (count
>= (int) pattern
[1]) {
982 i
= SRE_MATCH(state
, pattern
+ pattern
[0], level
+ 1);
987 if (state
->lastmark
> lastmark
) {
989 state
->mark
+ lastmark
+ 1, 0,
990 (state
->lastmark
- lastmark
) * sizeof(void*)
992 state
->lastmark
= lastmark
;
999 /* create repeat context. all the hard work is done
1000 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1001 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1002 TRACE(("|%p|%p|REPEAT %d %d\n", pattern
, ptr
,
1003 pattern
[1], pattern
[2]));
1006 rep
.pattern
= pattern
;
1008 /* install new repeat context */
1009 rep
.prev
= state
->repeat
;
1010 state
->repeat
= &rep
;
1013 i
= SRE_MATCH(state
, pattern
+ pattern
[0], level
+ 1);
1015 state
->repeat
= rep
.prev
;
1019 case SRE_OP_MAX_UNTIL
:
1020 /* maximizing repeat */
1021 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1023 /* FIXME: we probably need to deal with zero-width
1024 matches in here... */
1028 return SRE_ERROR_STATE
;
1032 count
= rp
->count
+ 1;
1034 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern
, ptr
, count
));
1036 if (count
< rp
->pattern
[1]) {
1037 /* not enough matches */
1040 i
= SRE_MATCH(state
, rp
->pattern
+ 3, level
+ 1);
1043 rp
->count
= count
- 1;
1048 if (count
< rp
->pattern
[2] || rp
->pattern
[2] == 65535) {
1049 /* we may have enough matches, but if we can
1050 match another item, do so */
1052 lastmark
= state
->lastmark
;
1053 i
= mark_save(state
, 0, lastmark
);
1057 i
= SRE_MATCH(state
, rp
->pattern
+ 3, level
+ 1);
1060 i
= mark_restore(state
, 0, lastmark
);
1063 rp
->count
= count
- 1;
1067 /* cannot match more repeated items here. make sure the
1069 state
->repeat
= rp
->prev
;
1070 i
= SRE_MATCH(state
, pattern
, level
+ 1);
1077 case SRE_OP_MIN_UNTIL
:
1078 /* minimizing repeat */
1079 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1083 return SRE_ERROR_STATE
;
1085 count
= rp
->count
+ 1;
1087 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern
, ptr
, count
,
1092 if (count
< rp
->pattern
[1]) {
1093 /* not enough matches */
1096 i
= SRE_MATCH(state
, rp
->pattern
+ 3, level
+ 1);
1099 rp
->count
= count
-1;
1104 /* see if the tail matches */
1105 state
->repeat
= rp
->prev
;
1106 /* FIXME: the following fix doesn't always work (#133283) */
1107 if (rp
->pattern
[2] == 65535) {
1108 /* unbounded repeat */
1110 i
= SRE_MATCH(state
, pattern
, level
+ 1);
1111 if (i
|| ptr
>= end
)
1116 i
= SRE_MATCH(state
, pattern
, level
+ 1);
1125 if (count
>= rp
->pattern
[2] && rp
->pattern
[2] != 65535)
1130 i
= SRE_MATCH(state
, rp
->pattern
+ 3, level
+ 1);
1133 rp
->count
= count
- 1;
1138 TRACE(("|%p|%p|UNKNOWN %d\n", pattern
, ptr
, pattern
[-1]));
1139 return SRE_ERROR_ILLEGAL
;
1143 /* shouldn't end up here */
1144 return SRE_ERROR_ILLEGAL
;
1148 SRE_SEARCH(SRE_STATE
* state
, SRE_CODE
* pattern
)
1150 SRE_CHAR
* ptr
= state
->start
;
1151 SRE_CHAR
* end
= state
->end
;
1154 int prefix_skip
= 0;
1155 SRE_CODE
* prefix
= NULL
;
1156 SRE_CODE
* charset
= NULL
;
1157 SRE_CODE
* overlap
= NULL
;
1160 if (pattern
[0] == SRE_OP_INFO
) {
1161 /* optimization info block */
1162 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1166 if (pattern
[3] > 0) {
1167 /* adjust end point (but make sure we leave at least one
1168 character in there, so literal search will work) */
1169 end
-= pattern
[3]-1;
1174 if (flags
& SRE_INFO_PREFIX
) {
1175 /* pattern starts with a known prefix */
1176 /* <length> <skip> <prefix data> <overlap data> */
1177 prefix_len
= pattern
[5];
1178 prefix_skip
= pattern
[6];
1179 prefix
= pattern
+ 7;
1180 overlap
= prefix
+ prefix_len
- 1;
1181 } else if (flags
& SRE_INFO_CHARSET
)
1182 /* pattern starts with a character from a known set */
1184 charset
= pattern
+ 5;
1186 pattern
+= 1 + pattern
[1];
1189 TRACE(("prefix = %p %d %d\n", prefix
, prefix_len
, prefix_skip
));
1190 TRACE(("charset = %p\n", charset
));
1192 #if defined(USE_FAST_SEARCH)
1193 if (prefix_len
> 1) {
1194 /* pattern starts with a known prefix. use the overlap
1195 table to skip forward as fast as we possibly can */
1200 if ((SRE_CODE
) ptr
[0] != prefix
[i
]) {
1206 if (++i
== prefix_len
) {
1207 /* found a potential match */
1208 TRACE(("|%p|%p|SEARCH SCAN\n", pattern
, ptr
));
1209 state
->start
= ptr
+ 1 - prefix_len
;
1210 state
->ptr
= ptr
+ 1 - prefix_len
+ prefix_skip
;
1211 if (flags
& SRE_INFO_LITERAL
)
1212 return 1; /* we got all of it */
1213 status
= SRE_MATCH(state
, pattern
+ 2*prefix_skip
, 1);
1216 /* close but no cigar -- try again */
1229 if (pattern
[0] == SRE_OP_LITERAL
) {
1230 /* pattern starts with a literal character. this is used
1231 for short prefixes, and if fast search is disabled */
1232 SRE_CODE chr
= pattern
[1];
1235 while (ptr
< end
&& (SRE_CODE
) ptr
[0] != chr
)
1239 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern
, ptr
));
1242 status
= SRE_MATCH(state
, pattern
+ 2, 1);
1246 } else if (charset
) {
1247 /* pattern starts with a character from a known set */
1250 while (ptr
< end
&& !SRE_CHARSET(charset
, ptr
[0]))
1254 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern
, ptr
));
1257 status
= SRE_MATCH(state
, pattern
, 1);
1264 while (ptr
<= end
) {
1265 TRACE(("|%p|%p|SEARCH\n", pattern
, ptr
));
1266 state
->start
= state
->ptr
= ptr
++;
1267 status
= SRE_MATCH(state
, pattern
, 1);
1276 #if !defined(SRE_RECURSIVE)
1278 /* -------------------------------------------------------------------- */
1279 /* factories and destructors */
1281 /* see sre.h for object declarations */
1283 staticforward PyTypeObject Pattern_Type
;
1284 staticforward PyTypeObject Match_Type
;
1285 staticforward PyTypeObject Scanner_Type
;
1288 _compile(PyObject
* self_
, PyObject
* args
)
1290 /* "compile" pattern descriptor to pattern object */
1292 PatternObject
* self
;
1299 PyObject
* groupindex
= NULL
;
1300 PyObject
* indexgroup
= NULL
;
1301 if (!PyArg_ParseTuple(args
, "OiO!|iOO", &pattern
, &flags
,
1302 &PyList_Type
, &code
, &groups
,
1303 &groupindex
, &indexgroup
))
1306 n
= PyList_GET_SIZE(code
);
1308 self
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, n
);
1314 for (i
= 0; i
< n
; i
++) {
1315 PyObject
*o
= PyList_GET_ITEM(code
, i
);
1316 self
->code
[i
] = (SRE_CODE
) PyInt_AsLong(o
);
1319 if (PyErr_Occurred()) {
1325 self
->pattern
= pattern
;
1327 self
->flags
= flags
;
1329 self
->groups
= groups
;
1331 Py_XINCREF(groupindex
);
1332 self
->groupindex
= groupindex
;
1334 Py_XINCREF(indexgroup
);
1335 self
->indexgroup
= indexgroup
;
1337 return (PyObject
*) self
;
1341 sre_codesize(PyObject
* self
, PyObject
* args
)
1343 return Py_BuildValue("i", sizeof(SRE_CODE
));
1347 sre_getlower(PyObject
* self
, PyObject
* args
)
1349 int character
, flags
;
1350 if (!PyArg_ParseTuple(args
, "ii", &character
, &flags
))
1352 if (flags
& SRE_FLAG_LOCALE
)
1353 return Py_BuildValue("i", sre_lower_locale(character
));
1354 if (flags
& SRE_FLAG_UNICODE
)
1355 #if defined(HAVE_UNICODE)
1356 return Py_BuildValue("i", sre_lower_unicode(character
));
1358 return Py_BuildValue("i", sre_lower_locale(character
));
1360 return Py_BuildValue("i", sre_lower(character
));
1364 state_reset(SRE_STATE
* state
)
1368 state
->lastmark
= 0;
1370 /* FIXME: dynamic! */
1371 for (i
= 0; i
< SRE_MARK_SIZE
; i
++)
1372 state
->mark
[i
] = NULL
;
1374 state
->lastindex
= -1;
1376 state
->repeat
= NULL
;
1382 state_init(SRE_STATE
* state
, PatternObject
* pattern
, PyObject
* string
,
1385 /* prepare state object */
1387 PyBufferProcs
*buffer
;
1391 memset(state
, 0, sizeof(SRE_STATE
));
1393 state
->lastindex
= -1;
1395 #if defined(HAVE_UNICODE)
1396 if (PyUnicode_Check(string
)) {
1397 /* unicode strings doesn't always support the buffer interface */
1398 ptr
= (void*) PyUnicode_AS_DATA(string
);
1399 bytes
= PyUnicode_GET_DATA_SIZE(string
);
1400 size
= PyUnicode_GET_SIZE(string
);
1401 state
->charsize
= sizeof(Py_UNICODE
);
1406 /* get pointer to string buffer */
1407 buffer
= string
->ob_type
->tp_as_buffer
;
1408 if (!buffer
|| !buffer
->bf_getreadbuffer
|| !buffer
->bf_getsegcount
||
1409 buffer
->bf_getsegcount(string
, NULL
) != 1) {
1410 PyErr_SetString(PyExc_TypeError
, "expected string or buffer");
1414 /* determine buffer size */
1415 bytes
= buffer
->bf_getreadbuffer(string
, 0, &ptr
);
1417 PyErr_SetString(PyExc_TypeError
, "buffer has negative size");
1421 /* determine character size */
1422 #if PY_VERSION_HEX >= 0x01060000
1423 size
= PyObject_Size(string
);
1425 size
= PyObject_Length(string
);
1428 if (PyString_Check(string
) || bytes
== size
)
1429 state
->charsize
= 1;
1430 #if defined(HAVE_UNICODE)
1431 else if (bytes
== (int) (size
* sizeof(Py_UNICODE
)))
1432 state
->charsize
= sizeof(Py_UNICODE
);
1435 PyErr_SetString(PyExc_TypeError
, "buffer size mismatch");
1439 #if defined(HAVE_UNICODE)
1443 /* adjust boundaries */
1446 else if (start
> size
)
1451 else if (end
> size
)
1454 state
->beginning
= ptr
;
1456 state
->start
= (void*) ((char*) ptr
+ start
* state
->charsize
);
1457 state
->end
= (void*) ((char*) ptr
+ end
* state
->charsize
);
1460 state
->string
= string
;
1462 state
->endpos
= end
;
1464 if (pattern
->flags
& SRE_FLAG_LOCALE
)
1465 state
->lower
= sre_lower_locale
;
1466 else if (pattern
->flags
& SRE_FLAG_UNICODE
)
1467 #if defined(HAVE_UNICODE)
1468 state
->lower
= sre_lower_unicode
;
1470 state
->lower
= sre_lower_locale
;
1473 state
->lower
= sre_lower
;
1479 state_fini(SRE_STATE
* state
)
1481 Py_XDECREF(state
->string
);
1486 state_getslice(SRE_STATE
* state
, int index
, PyObject
* string
)
1490 index
= (index
- 1) * 2;
1492 if (string
== Py_None
|| !state
->mark
[index
] || !state
->mark
[index
+1]) {
1495 i
= ((char*)state
->mark
[index
] - (char*)state
->beginning
) /
1497 j
= ((char*)state
->mark
[index
+1] - (char*)state
->beginning
) /
1501 return PySequence_GetSlice(string
, i
, j
);
1505 pattern_error(int status
)
1508 case SRE_ERROR_RECURSION_LIMIT
:
1511 "maximum recursion limit exceeded"
1514 case SRE_ERROR_MEMORY
:
1518 /* other error codes indicate compiler/engine bugs */
1521 "internal error in regular expression engine"
1527 pattern_new_match(PatternObject
* pattern
, SRE_STATE
* state
, int status
)
1529 /* create match object (from state object) */
1538 /* create match object (with room for extra group marks) */
1539 match
= PyObject_NEW_VAR(MatchObject
, &Match_Type
,
1540 2*(pattern
->groups
+1));
1545 match
->pattern
= pattern
;
1547 Py_INCREF(state
->string
);
1548 match
->string
= state
->string
;
1551 match
->groups
= pattern
->groups
+1;
1553 /* fill in group slices */
1555 base
= (char*) state
->beginning
;
1556 n
= state
->charsize
;
1558 match
->mark
[0] = ((char*) state
->start
- base
) / n
;
1559 match
->mark
[1] = ((char*) state
->ptr
- base
) / n
;
1561 for (i
= j
= 0; i
< pattern
->groups
; i
++, j
+=2)
1562 if (j
+1 <= state
->lastmark
&& state
->mark
[j
] && state
->mark
[j
+1]) {
1563 match
->mark
[j
+2] = ((char*) state
->mark
[j
] - base
) / n
;
1564 match
->mark
[j
+3] = ((char*) state
->mark
[j
+1] - base
) / n
;
1566 match
->mark
[j
+2] = match
->mark
[j
+3] = -1; /* undefined */
1568 match
->pos
= state
->pos
;
1569 match
->endpos
= state
->endpos
;
1571 match
->lastindex
= state
->lastindex
;
1573 return (PyObject
*) match
;
1575 } else if (status
== 0) {
1583 /* internal error */
1584 pattern_error(status
);
1589 pattern_scanner(PatternObject
* pattern
, PyObject
* args
)
1591 /* create search state object */
1593 ScannerObject
* self
;
1598 if (!PyArg_ParseTuple(args
, "O|ii:scanner", &string
, &start
, &end
))
1601 /* create scanner object */
1602 self
= PyObject_NEW(ScannerObject
, &Scanner_Type
);
1606 string
= state_init(&self
->state
, pattern
, string
, start
, end
);
1613 self
->pattern
= (PyObject
*) pattern
;
1615 return (PyObject
*) self
;
1619 pattern_dealloc(PatternObject
* self
)
1621 Py_XDECREF(self
->pattern
);
1622 Py_XDECREF(self
->groupindex
);
1623 Py_XDECREF(self
->indexgroup
);
1628 pattern_match(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1636 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1637 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:match", kwlist
,
1638 &string
, &start
, &end
))
1641 string
= state_init(&state
, self
, string
, start
, end
);
1645 state
.ptr
= state
.start
;
1647 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self
), state
.ptr
));
1649 if (state
.charsize
== 1) {
1650 status
= sre_match(&state
, PatternObject_GetCode(self
), 1);
1652 #if defined(HAVE_UNICODE)
1653 status
= sre_umatch(&state
, PatternObject_GetCode(self
), 1);
1657 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1661 return pattern_new_match(self
, &state
, status
);
1665 pattern_search(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1673 static char* kwlist
[] = { "pattern", "pos", "endpos", NULL
};
1674 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:search", kwlist
,
1675 &string
, &start
, &end
))
1678 string
= state_init(&state
, self
, string
, start
, end
);
1682 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self
), state
.ptr
));
1684 if (state
.charsize
== 1) {
1685 status
= sre_search(&state
, PatternObject_GetCode(self
));
1687 #if defined(HAVE_UNICODE)
1688 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
1692 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self
), state
.ptr
));
1696 return pattern_new_match(self
, &state
, status
);
1700 call(char* module
, char* function
, PyObject
* args
)
1707 name
= PyString_FromString(module
);
1710 mod
= PyImport_Import(name
);
1714 func
= PyObject_GetAttrString(mod
, function
);
1718 result
= PyObject_CallObject(func
, args
);
1724 #ifdef USE_BUILTIN_COPY
1726 deepcopy(PyObject
** object
, PyObject
* memo
)
1732 Py_BuildValue("OO", *object
, memo
)
1740 return 1; /* success */
1745 pattern_sub(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1749 PyObject
* count
= Py_False
; /* zero */
1750 static char* kwlist
[] = { "repl", "string", "count", NULL
};
1751 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|O:sub", kwlist
,
1752 &template, &string
, &count
))
1755 /* delegate to Python code */
1758 Py_BuildValue("OOOO", self
, template, string
, count
)
1763 pattern_subn(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1767 PyObject
* count
= Py_False
; /* zero */
1768 static char* kwlist
[] = { "repl", "string", "count", NULL
};
1769 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "OO|O:subn", kwlist
,
1770 &template, &string
, &count
))
1773 /* delegate to Python code */
1775 SRE_MODULE
, "_subn",
1776 Py_BuildValue("OOOO", self
, template, string
, count
)
1781 pattern_split(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1784 PyObject
* maxsplit
= Py_False
; /* zero */
1785 static char* kwlist
[] = { "source", "maxsplit", NULL
};
1786 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|O:split", kwlist
,
1787 &string
, &maxsplit
))
1790 /* delegate to Python code */
1792 SRE_MODULE
, "_split",
1793 Py_BuildValue("OOO", self
, string
, maxsplit
)
1798 pattern_findall(PatternObject
* self
, PyObject
* args
, PyObject
* kw
)
1808 static char* kwlist
[] = { "source", "pos", "endpos", NULL
};
1809 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "O|ii:findall", kwlist
,
1810 &string
, &start
, &end
))
1813 string
= state_init(&state
, self
, string
, start
, end
);
1817 list
= PyList_New(0);
1819 while (state
.start
<= state
.end
) {
1823 state_reset(&state
);
1825 state
.ptr
= state
.start
;
1827 if (state
.charsize
== 1) {
1828 status
= sre_search(&state
, PatternObject_GetCode(self
));
1830 #if defined(HAVE_UNICODE)
1831 status
= sre_usearch(&state
, PatternObject_GetCode(self
));
1837 /* don't bother to build a match object */
1838 switch (self
->groups
) {
1840 item
= PySequence_GetSlice(
1842 ((char*) state
.start
- (char*) state
.beginning
) /
1844 ((char*) state
.ptr
- (char*) state
.beginning
) /
1850 item
= state_getslice(&state
, 1, string
);
1855 item
= PyTuple_New(self
->groups
);
1858 for (i
= 0; i
< self
->groups
; i
++) {
1859 PyObject
* o
= state_getslice(&state
, i
+1, string
);
1864 PyTuple_SET_ITEM(item
, i
, o
);
1869 status
= PyList_Append(list
, item
);
1875 if (state
.ptr
== state
.start
)
1876 state
.start
= (void*) ((char*) state
.ptr
+ state
.charsize
);
1878 state
.start
= state
.ptr
;
1885 pattern_error(status
);
1902 pattern_copy(PatternObject
* self
, PyObject
* args
)
1904 #ifdef USE_BUILTIN_COPY
1905 PatternObject
* copy
;
1908 if (args
!= Py_None
&& !PyArg_ParseTuple(args
, ":__copy__"))
1911 copy
= PyObject_NEW_VAR(PatternObject
, &Pattern_Type
, self
->codesize
);
1915 offset
= offsetof(PatternObject
, groups
);
1917 Py_XINCREF(self
->groupindex
);
1918 Py_XINCREF(self
->indexgroup
);
1919 Py_XINCREF(self
->pattern
);
1921 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
1922 sizeof(PatternObject
) + self
->codesize
* sizeof(SRE_CODE
) - offset
);
1924 return (PyObject
*) copy
;
1926 PyErr_SetString(PyExc_TypeError
, "cannot copy this pattern object");
1932 pattern_deepcopy(PatternObject
* self
, PyObject
* args
)
1934 #ifdef USE_BUILTIN_COPY
1935 PatternObject
* copy
;
1938 if (!PyArg_ParseTuple(args
, "O:__deepcopy__", &memo
))
1941 copy
= (PatternObject
*) pattern_copy(self
, Py_None
);
1945 if (!deepcopy(©
->groupindex
, memo
) ||
1946 !deepcopy(©
->indexgroup
, memo
) ||
1947 !deepcopy(©
->pattern
, memo
)) {
1953 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this pattern object");
1959 pattern_isliteral(PatternObject
* self
, PyObject
* args
)
1961 /* internal: return true if pattern consists of literal text only */
1964 PyObject
* isliteral
;
1966 if (!PyArg_ParseTuple(args
, ":_isliteral"))
1969 code
= PatternObject_GetCode(self
);
1971 if (code
[0] == SRE_OP_INFO
&& code
[2] & SRE_INFO_LITERAL
)
1972 isliteral
= Py_True
;
1974 isliteral
= Py_False
;
1976 Py_INCREF(isliteral
);
1980 static PyMethodDef pattern_methods
[] = {
1981 {"match", (PyCFunction
) pattern_match
, METH_VARARGS
|METH_KEYWORDS
},
1982 {"search", (PyCFunction
) pattern_search
, METH_VARARGS
|METH_KEYWORDS
},
1983 {"sub", (PyCFunction
) pattern_sub
, METH_VARARGS
|METH_KEYWORDS
},
1984 {"subn", (PyCFunction
) pattern_subn
, METH_VARARGS
|METH_KEYWORDS
},
1985 {"split", (PyCFunction
) pattern_split
, METH_VARARGS
|METH_KEYWORDS
},
1986 {"findall", (PyCFunction
) pattern_findall
, METH_VARARGS
|METH_KEYWORDS
},
1987 {"scanner", (PyCFunction
) pattern_scanner
, METH_VARARGS
},
1988 {"__copy__", (PyCFunction
) pattern_copy
, METH_VARARGS
},
1989 {"__deepcopy__", (PyCFunction
) pattern_deepcopy
, METH_VARARGS
},
1990 {"_isliteral", (PyCFunction
) pattern_isliteral
, METH_VARARGS
},
1995 pattern_getattr(PatternObject
* self
, char* name
)
1999 res
= Py_FindMethod(pattern_methods
, (PyObject
*) self
, name
);
2007 if (!strcmp(name
, "pattern")) {
2008 Py_INCREF(self
->pattern
);
2009 return self
->pattern
;
2012 if (!strcmp(name
, "flags"))
2013 return Py_BuildValue("i", self
->flags
);
2015 if (!strcmp(name
, "groups"))
2016 return Py_BuildValue("i", self
->groups
);
2018 if (!strcmp(name
, "groupindex") && self
->groupindex
) {
2019 Py_INCREF(self
->groupindex
);
2020 return self
->groupindex
;
2023 PyErr_SetString(PyExc_AttributeError
, name
);
2027 statichere PyTypeObject Pattern_Type
= {
2028 PyObject_HEAD_INIT(NULL
)
2030 sizeof(PatternObject
), sizeof(SRE_CODE
),
2031 (destructor
)pattern_dealloc
, /*tp_dealloc*/
2033 (getattrfunc
)pattern_getattr
/*tp_getattr*/
2036 /* -------------------------------------------------------------------- */
2040 match_dealloc(MatchObject
* self
)
2042 Py_XDECREF(self
->regs
);
2043 Py_XDECREF(self
->string
);
2044 Py_DECREF(self
->pattern
);
2049 match_getslice_by_index(MatchObject
* self
, int index
, PyObject
* def
)
2051 if (index
< 0 || index
>= self
->groups
) {
2052 /* raise IndexError if we were given a bad group number */
2062 if (self
->string
== Py_None
|| self
->mark
[index
] < 0) {
2063 /* return default value if the string or group is undefined */
2068 return PySequence_GetSlice(
2069 self
->string
, self
->mark
[index
], self
->mark
[index
+1]
2074 match_getindex(MatchObject
* self
, PyObject
* index
)
2078 if (PyInt_Check(index
))
2079 return (int) PyInt_AS_LONG(index
);
2083 if (self
->pattern
->groupindex
) {
2084 index
= PyObject_GetItem(self
->pattern
->groupindex
, index
);
2086 if (PyInt_Check(index
))
2087 i
= (int) PyInt_AS_LONG(index
);
2097 match_getslice(MatchObject
* self
, PyObject
* index
, PyObject
* def
)
2099 return match_getslice_by_index(self
, match_getindex(self
, index
), def
);
2103 match_expand(MatchObject
* self
, PyObject
* args
)
2106 if (!PyArg_ParseTuple(args
, "O:expand", &template))
2109 /* delegate to Python code */
2111 SRE_MODULE
, "_expand",
2112 Py_BuildValue("OOO", self
->pattern
, self
, template)
2117 match_group(MatchObject
* self
, PyObject
* args
)
2122 size
= PyTuple_GET_SIZE(args
);
2126 result
= match_getslice(self
, Py_False
, Py_None
);
2129 result
= match_getslice(self
, PyTuple_GET_ITEM(args
, 0), Py_None
);
2132 /* fetch multiple items */
2133 result
= PyTuple_New(size
);
2136 for (i
= 0; i
< size
; i
++) {
2137 PyObject
* item
= match_getslice(
2138 self
, PyTuple_GET_ITEM(args
, i
), Py_None
2144 PyTuple_SET_ITEM(result
, i
, item
);
2152 match_groups(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
2157 PyObject
* def
= Py_None
;
2158 static char* kwlist
[] = { "default", NULL
};
2159 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groups", kwlist
, &def
))
2162 result
= PyTuple_New(self
->groups
-1);
2166 for (index
= 1; index
< self
->groups
; index
++) {
2168 item
= match_getslice_by_index(self
, index
, def
);
2173 PyTuple_SET_ITEM(result
, index
-1, item
);
2180 match_groupdict(MatchObject
* self
, PyObject
* args
, PyObject
* kw
)
2186 PyObject
* def
= Py_None
;
2187 static char* kwlist
[] = { "default", NULL
};
2188 if (!PyArg_ParseTupleAndKeywords(args
, kw
, "|O:groupdict", kwlist
, &def
))
2191 result
= PyDict_New();
2192 if (!result
|| !self
->pattern
->groupindex
)
2195 keys
= PyMapping_Keys(self
->pattern
->groupindex
);
2199 for (index
= 0; index
< PyList_GET_SIZE(keys
); index
++) {
2203 key
= PyList_GET_ITEM(keys
, index
);
2206 value
= match_getslice(self
, key
, def
);
2211 status
= PyDict_SetItem(result
, key
, value
);
2228 match_start(MatchObject
* self
, PyObject
* args
)
2232 PyObject
* index_
= Py_False
; /* zero */
2233 if (!PyArg_ParseTuple(args
, "|O:start", &index_
))
2236 index
= match_getindex(self
, index_
);
2238 if (index
< 0 || index
>= self
->groups
) {
2246 /* mark is -1 if group is undefined */
2247 return Py_BuildValue("i", self
->mark
[index
*2]);
2251 match_end(MatchObject
* self
, PyObject
* args
)
2255 PyObject
* index_
= Py_False
; /* zero */
2256 if (!PyArg_ParseTuple(args
, "|O:end", &index_
))
2259 index
= match_getindex(self
, index_
);
2261 if (index
< 0 || index
>= self
->groups
) {
2269 /* mark is -1 if group is undefined */
2270 return Py_BuildValue("i", self
->mark
[index
*2+1]);
2274 _pair(int i1
, int i2
)
2279 pair
= PyTuple_New(2);
2283 item
= PyInt_FromLong(i1
);
2286 PyTuple_SET_ITEM(pair
, 0, item
);
2288 item
= PyInt_FromLong(i2
);
2291 PyTuple_SET_ITEM(pair
, 1, item
);
2301 match_span(MatchObject
* self
, PyObject
* args
)
2305 PyObject
* index_
= Py_False
; /* zero */
2306 if (!PyArg_ParseTuple(args
, "|O:span", &index_
))
2309 index
= match_getindex(self
, index_
);
2311 if (index
< 0 || index
>= self
->groups
) {
2319 /* marks are -1 if group is undefined */
2320 return _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
2324 match_regs(MatchObject
* self
)
2330 regs
= PyTuple_New(self
->groups
);
2334 for (index
= 0; index
< self
->groups
; index
++) {
2335 item
= _pair(self
->mark
[index
*2], self
->mark
[index
*2+1]);
2340 PyTuple_SET_ITEM(regs
, index
, item
);
2350 match_copy(MatchObject
* self
, PyObject
* args
)
2352 #ifdef USE_BUILTIN_COPY
2356 if (args
!= Py_None
&& !PyArg_ParseTuple(args
, ":__copy__"))
2359 slots
= 2 * (self
->pattern
->groups
+1);
2361 copy
= PyObject_NEW_VAR(MatchObject
, &Match_Type
, slots
);
2365 /* this value a constant, but any compiler should be able to
2366 figure that out all by itself */
2367 offset
= offsetof(MatchObject
, string
);
2369 Py_XINCREF(self
->pattern
);
2370 Py_XINCREF(self
->string
);
2371 Py_XINCREF(self
->regs
);
2373 memcpy((char*) copy
+ offset
, (char*) self
+ offset
,
2374 sizeof(MatchObject
) + slots
* sizeof(int) - offset
);
2376 return (PyObject
*) copy
;
2378 PyErr_SetString(PyExc_TypeError
, "cannot copy this match object");
2384 match_deepcopy(MatchObject
* self
, PyObject
* args
)
2386 #ifdef USE_BUILTIN_COPY
2390 if (!PyArg_ParseTuple(args
, "O:__deepcopy__", &memo
))
2393 copy
= (MatchObject
*) match_copy(self
, Py_None
);
2397 if (!deepcopy((PyObject
**) ©
->pattern
, memo
) ||
2398 !deepcopy(©
->string
, memo
) ||
2399 !deepcopy(©
->regs
, memo
)) {
2405 PyErr_SetString(PyExc_TypeError
, "cannot deepcopy this match object");
2410 static PyMethodDef match_methods
[] = {
2411 {"group", (PyCFunction
) match_group
, METH_VARARGS
},
2412 {"start", (PyCFunction
) match_start
, METH_VARARGS
},
2413 {"end", (PyCFunction
) match_end
, METH_VARARGS
},
2414 {"span", (PyCFunction
) match_span
, METH_VARARGS
},
2415 {"groups", (PyCFunction
) match_groups
, METH_VARARGS
|METH_KEYWORDS
},
2416 {"groupdict", (PyCFunction
) match_groupdict
, METH_VARARGS
|METH_KEYWORDS
},
2417 {"expand", (PyCFunction
) match_expand
, METH_VARARGS
},
2418 {"__copy__", (PyCFunction
) match_copy
, METH_VARARGS
},
2419 {"__deepcopy__", (PyCFunction
) match_deepcopy
, METH_VARARGS
},
2424 match_getattr(MatchObject
* self
, char* name
)
2428 res
= Py_FindMethod(match_methods
, (PyObject
*) self
, name
);
2434 if (!strcmp(name
, "lastindex")) {
2435 if (self
->lastindex
>= 0)
2436 return Py_BuildValue("i", self
->lastindex
);
2441 if (!strcmp(name
, "lastgroup")) {
2442 if (self
->pattern
->indexgroup
&& self
->lastindex
>= 0) {
2443 PyObject
* result
= PySequence_GetItem(
2444 self
->pattern
->indexgroup
, self
->lastindex
2454 if (!strcmp(name
, "string")) {
2456 Py_INCREF(self
->string
);
2457 return self
->string
;
2464 if (!strcmp(name
, "regs")) {
2466 Py_INCREF(self
->regs
);
2469 return match_regs(self
);
2472 if (!strcmp(name
, "re")) {
2473 Py_INCREF(self
->pattern
);
2474 return (PyObject
*) self
->pattern
;
2477 if (!strcmp(name
, "pos"))
2478 return Py_BuildValue("i", self
->pos
);
2480 if (!strcmp(name
, "endpos"))
2481 return Py_BuildValue("i", self
->endpos
);
2483 PyErr_SetString(PyExc_AttributeError
, name
);
2487 /* FIXME: implement setattr("string", None) as a special case (to
2488 detach the associated string, if any */
2490 statichere PyTypeObject Match_Type
= {
2491 PyObject_HEAD_INIT(NULL
)
2493 sizeof(MatchObject
), sizeof(int),
2494 (destructor
)match_dealloc
, /*tp_dealloc*/
2496 (getattrfunc
)match_getattr
/*tp_getattr*/
2499 /* -------------------------------------------------------------------- */
2500 /* scanner methods (experimental) */
2503 scanner_dealloc(ScannerObject
* self
)
2505 state_fini(&self
->state
);
2506 Py_DECREF(self
->pattern
);
2511 scanner_match(ScannerObject
* self
, PyObject
* args
)
2513 SRE_STATE
* state
= &self
->state
;
2519 state
->ptr
= state
->start
;
2521 if (state
->charsize
== 1) {
2522 status
= sre_match(state
, PatternObject_GetCode(self
->pattern
), 1);
2524 #if defined(HAVE_UNICODE)
2525 status
= sre_umatch(state
, PatternObject_GetCode(self
->pattern
), 1);
2529 match
= pattern_new_match((PatternObject
*) self
->pattern
,
2532 if (status
== 0 || state
->ptr
== state
->start
)
2533 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
2535 state
->start
= state
->ptr
;
2542 scanner_search(ScannerObject
* self
, PyObject
* args
)
2544 SRE_STATE
* state
= &self
->state
;
2550 state
->ptr
= state
->start
;
2552 if (state
->charsize
== 1) {
2553 status
= sre_search(state
, PatternObject_GetCode(self
->pattern
));
2555 #if defined(HAVE_UNICODE)
2556 status
= sre_usearch(state
, PatternObject_GetCode(self
->pattern
));
2560 match
= pattern_new_match((PatternObject
*) self
->pattern
,
2563 if (status
== 0 || state
->ptr
== state
->start
)
2564 state
->start
= (void*) ((char*) state
->ptr
+ state
->charsize
);
2566 state
->start
= state
->ptr
;
2571 static PyMethodDef scanner_methods
[] = {
2572 {"match", (PyCFunction
) scanner_match
, 0},
2573 {"search", (PyCFunction
) scanner_search
, 0},
2578 scanner_getattr(ScannerObject
* self
, char* name
)
2582 res
= Py_FindMethod(scanner_methods
, (PyObject
*) self
, name
);
2589 if (!strcmp(name
, "pattern")) {
2590 Py_INCREF(self
->pattern
);
2591 return self
->pattern
;
2594 PyErr_SetString(PyExc_AttributeError
, name
);
2598 statichere PyTypeObject Scanner_Type
= {
2599 PyObject_HEAD_INIT(NULL
)
2601 sizeof(ScannerObject
), 0,
2602 (destructor
)scanner_dealloc
, /*tp_dealloc*/
2604 (getattrfunc
)scanner_getattr
, /*tp_getattr*/
2607 static PyMethodDef _functions
[] = {
2608 {"compile", _compile
, 1},
2609 {"getcodesize", sre_codesize
, 1},
2610 {"getlower", sre_getlower
, 1},
2620 /* Patch object types */
2621 Pattern_Type
.ob_type
= Match_Type
.ob_type
=
2622 Scanner_Type
.ob_type
= &PyType_Type
;
2624 m
= Py_InitModule("_" SRE_MODULE
, _functions
);
2625 d
= PyModule_GetDict(m
);
2627 PyDict_SetItemString(
2628 d
, "MAGIC", (PyObject
*) PyInt_FromLong(SRE_MAGIC
)
2631 PyDict_SetItemString(
2632 d
, "copyright", (PyObject
*) PyString_FromString(copyright
)
2637 #endif /* !defined(SRE_RECURSIVE) */