Tagging 3.0a4.
[python/dscho.git] / Modules / _sre.c
blobbfe4ae93d9d5de88048ff5d6abc41c033e3748c9
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 #ifndef SRE_RECURSIVE
39 static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
42 #define PY_SSIZE_T_CLEAN
44 #include "Python.h"
45 #include "structmember.h" /* offsetof */
47 #include "sre.h"
49 #include <ctype.h>
51 /* name of this module, minus the leading underscore */
52 #if !defined(SRE_MODULE)
53 #define SRE_MODULE "sre"
54 #endif
56 #define SRE_PY_MODULE "re"
58 /* defining this one enables tracing */
59 #undef VERBOSE
61 /* defining this enables unicode support (default under 1.6a1 and later) */
62 #define HAVE_UNICODE
64 /* -------------------------------------------------------------------- */
65 /* optional features */
67 /* enables fast searching */
68 #define USE_FAST_SEARCH
70 /* enables aggressive inlining (always on for Visual C) */
71 #undef USE_INLINE
73 /* enables copy/deepcopy handling (work in progress) */
74 #undef USE_BUILTIN_COPY
76 #if PY_VERSION_HEX < 0x01060000
77 #define PyObject_DEL(op) PyMem_DEL((op))
78 #endif
80 /* -------------------------------------------------------------------- */
82 #if defined(_MSC_VER)
83 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
84 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
85 /* fastest possible local call under MSVC */
86 #define LOCAL(type) static __inline type __fastcall
87 #elif defined(USE_INLINE)
88 #define LOCAL(type) static inline type
89 #else
90 #define LOCAL(type) static type
91 #endif
93 /* error codes */
94 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
95 #define SRE_ERROR_STATE -2 /* illegal state */
96 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
97 #define SRE_ERROR_MEMORY -9 /* out of memory */
98 #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
100 #if defined(VERBOSE)
101 #define TRACE(v) printf v
102 #else
103 #define TRACE(v)
104 #endif
106 /* -------------------------------------------------------------------- */
107 /* search engine state */
109 /* default character predicates (run sre_chars.py to regenerate tables) */
111 #define SRE_DIGIT_MASK 1
112 #define SRE_SPACE_MASK 2
113 #define SRE_LINEBREAK_MASK 4
114 #define SRE_ALNUM_MASK 8
115 #define SRE_WORD_MASK 16
117 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
119 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
120 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
122 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
123 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
124 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
125 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
127 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
128 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
129 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
130 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
131 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
132 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
133 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
134 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
135 120, 121, 122, 123, 124, 125, 126, 127 };
137 #define SRE_IS_DIGIT(ch)\
138 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
139 #define SRE_IS_SPACE(ch)\
140 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
141 #define SRE_IS_LINEBREAK(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
143 #define SRE_IS_ALNUM(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
145 #define SRE_IS_WORD(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
148 static unsigned int sre_lower(unsigned int ch)
150 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
153 /* locale-specific character predicates */
154 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
155 * warnings when c's type supports only numbers < N+1 */
156 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
157 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
158 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
159 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
160 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
162 static unsigned int sre_lower_locale(unsigned int ch)
164 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
167 /* unicode-specific character predicates */
169 #if defined(HAVE_UNICODE)
171 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
172 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
173 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
174 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
177 static unsigned int sre_lower_unicode(unsigned int ch)
179 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
182 #endif
184 LOCAL(int)
185 sre_category(SRE_CODE category, unsigned int ch)
187 switch (category) {
189 case SRE_CATEGORY_DIGIT:
190 return SRE_IS_DIGIT(ch);
191 case SRE_CATEGORY_NOT_DIGIT:
192 return !SRE_IS_DIGIT(ch);
193 case SRE_CATEGORY_SPACE:
194 return SRE_IS_SPACE(ch);
195 case SRE_CATEGORY_NOT_SPACE:
196 return !SRE_IS_SPACE(ch);
197 case SRE_CATEGORY_WORD:
198 return SRE_IS_WORD(ch);
199 case SRE_CATEGORY_NOT_WORD:
200 return !SRE_IS_WORD(ch);
201 case SRE_CATEGORY_LINEBREAK:
202 return SRE_IS_LINEBREAK(ch);
203 case SRE_CATEGORY_NOT_LINEBREAK:
204 return !SRE_IS_LINEBREAK(ch);
206 case SRE_CATEGORY_LOC_WORD:
207 return SRE_LOC_IS_WORD(ch);
208 case SRE_CATEGORY_LOC_NOT_WORD:
209 return !SRE_LOC_IS_WORD(ch);
211 #if defined(HAVE_UNICODE)
212 case SRE_CATEGORY_UNI_DIGIT:
213 return SRE_UNI_IS_DIGIT(ch);
214 case SRE_CATEGORY_UNI_NOT_DIGIT:
215 return !SRE_UNI_IS_DIGIT(ch);
216 case SRE_CATEGORY_UNI_SPACE:
217 return SRE_UNI_IS_SPACE(ch);
218 case SRE_CATEGORY_UNI_NOT_SPACE:
219 return !SRE_UNI_IS_SPACE(ch);
220 case SRE_CATEGORY_UNI_WORD:
221 return SRE_UNI_IS_WORD(ch);
222 case SRE_CATEGORY_UNI_NOT_WORD:
223 return !SRE_UNI_IS_WORD(ch);
224 case SRE_CATEGORY_UNI_LINEBREAK:
225 return SRE_UNI_IS_LINEBREAK(ch);
226 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
227 return !SRE_UNI_IS_LINEBREAK(ch);
228 #else
229 case SRE_CATEGORY_UNI_DIGIT:
230 return SRE_IS_DIGIT(ch);
231 case SRE_CATEGORY_UNI_NOT_DIGIT:
232 return !SRE_IS_DIGIT(ch);
233 case SRE_CATEGORY_UNI_SPACE:
234 return SRE_IS_SPACE(ch);
235 case SRE_CATEGORY_UNI_NOT_SPACE:
236 return !SRE_IS_SPACE(ch);
237 case SRE_CATEGORY_UNI_WORD:
238 return SRE_LOC_IS_WORD(ch);
239 case SRE_CATEGORY_UNI_NOT_WORD:
240 return !SRE_LOC_IS_WORD(ch);
241 case SRE_CATEGORY_UNI_LINEBREAK:
242 return SRE_IS_LINEBREAK(ch);
243 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
244 return !SRE_IS_LINEBREAK(ch);
245 #endif
247 return 0;
250 /* helpers */
252 static void
253 data_stack_dealloc(SRE_STATE* state)
255 if (state->data_stack) {
256 PyMem_FREE(state->data_stack);
257 state->data_stack = NULL;
259 state->data_stack_size = state->data_stack_base = 0;
262 static int
263 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
265 Py_ssize_t minsize, cursize;
266 minsize = state->data_stack_base+size;
267 cursize = state->data_stack_size;
268 if (cursize < minsize) {
269 void* stack;
270 cursize = minsize+minsize/4+1024;
271 TRACE(("allocate/grow stack %d\n", cursize));
272 stack = PyMem_REALLOC(state->data_stack, cursize);
273 if (!stack) {
274 data_stack_dealloc(state);
275 return SRE_ERROR_MEMORY;
277 state->data_stack = (char *)stack;
278 state->data_stack_size = cursize;
280 return 0;
283 /* generate 8-bit version */
285 #define SRE_CHAR unsigned char
286 #define SRE_AT sre_at
287 #define SRE_COUNT sre_count
288 #define SRE_CHARSET sre_charset
289 #define SRE_INFO sre_info
290 #define SRE_MATCH sre_match
291 #define SRE_MATCH_CONTEXT sre_match_context
292 #define SRE_SEARCH sre_search
293 #define SRE_LITERAL_TEMPLATE sre_literal_template
295 #if defined(HAVE_UNICODE)
297 #define SRE_RECURSIVE
298 #include "_sre.c"
299 #undef SRE_RECURSIVE
301 #undef SRE_LITERAL_TEMPLATE
302 #undef SRE_SEARCH
303 #undef SRE_MATCH
304 #undef SRE_MATCH_CONTEXT
305 #undef SRE_INFO
306 #undef SRE_CHARSET
307 #undef SRE_COUNT
308 #undef SRE_AT
309 #undef SRE_CHAR
311 /* generate 16-bit unicode version */
313 #define SRE_CHAR Py_UNICODE
314 #define SRE_AT sre_uat
315 #define SRE_COUNT sre_ucount
316 #define SRE_CHARSET sre_ucharset
317 #define SRE_INFO sre_uinfo
318 #define SRE_MATCH sre_umatch
319 #define SRE_MATCH_CONTEXT sre_umatch_context
320 #define SRE_SEARCH sre_usearch
321 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
322 #endif
324 #endif /* SRE_RECURSIVE */
326 /* -------------------------------------------------------------------- */
327 /* String matching engine */
329 /* the following section is compiled twice, with different character
330 settings */
332 LOCAL(int)
333 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
335 /* check if pointer is at given position */
337 Py_ssize_t thisp, thatp;
339 switch (at) {
341 case SRE_AT_BEGINNING:
342 case SRE_AT_BEGINNING_STRING:
343 return ((void*) ptr == state->beginning);
345 case SRE_AT_BEGINNING_LINE:
346 return ((void*) ptr == state->beginning ||
347 SRE_IS_LINEBREAK((int) ptr[-1]));
349 case SRE_AT_END:
350 return (((void*) (ptr+1) == state->end &&
351 SRE_IS_LINEBREAK((int) ptr[0])) ||
352 ((void*) ptr == state->end));
354 case SRE_AT_END_LINE:
355 return ((void*) ptr == state->end ||
356 SRE_IS_LINEBREAK((int) ptr[0]));
358 case SRE_AT_END_STRING:
359 return ((void*) ptr == state->end);
361 case SRE_AT_BOUNDARY:
362 if (state->beginning == state->end)
363 return 0;
364 thatp = ((void*) ptr > state->beginning) ?
365 SRE_IS_WORD((int) ptr[-1]) : 0;
366 thisp = ((void*) ptr < state->end) ?
367 SRE_IS_WORD((int) ptr[0]) : 0;
368 return thisp != thatp;
370 case SRE_AT_NON_BOUNDARY:
371 if (state->beginning == state->end)
372 return 0;
373 thatp = ((void*) ptr > state->beginning) ?
374 SRE_IS_WORD((int) ptr[-1]) : 0;
375 thisp = ((void*) ptr < state->end) ?
376 SRE_IS_WORD((int) ptr[0]) : 0;
377 return thisp == thatp;
379 case SRE_AT_LOC_BOUNDARY:
380 if (state->beginning == state->end)
381 return 0;
382 thatp = ((void*) ptr > state->beginning) ?
383 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
384 thisp = ((void*) ptr < state->end) ?
385 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
386 return thisp != thatp;
388 case SRE_AT_LOC_NON_BOUNDARY:
389 if (state->beginning == state->end)
390 return 0;
391 thatp = ((void*) ptr > state->beginning) ?
392 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
393 thisp = ((void*) ptr < state->end) ?
394 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
395 return thisp == thatp;
397 #if defined(HAVE_UNICODE)
398 case SRE_AT_UNI_BOUNDARY:
399 if (state->beginning == state->end)
400 return 0;
401 thatp = ((void*) ptr > state->beginning) ?
402 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
403 thisp = ((void*) ptr < state->end) ?
404 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
405 return thisp != thatp;
407 case SRE_AT_UNI_NON_BOUNDARY:
408 if (state->beginning == state->end)
409 return 0;
410 thatp = ((void*) ptr > state->beginning) ?
411 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
412 thisp = ((void*) ptr < state->end) ?
413 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
414 return thisp == thatp;
415 #endif
419 return 0;
422 LOCAL(int)
423 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
425 /* check if character is a member of the given set */
427 int ok = 1;
429 for (;;) {
430 switch (*set++) {
432 case SRE_OP_FAILURE:
433 return !ok;
435 case SRE_OP_LITERAL:
436 /* <LITERAL> <code> */
437 if (ch == set[0])
438 return ok;
439 set++;
440 break;
442 case SRE_OP_CATEGORY:
443 /* <CATEGORY> <code> */
444 if (sre_category(set[0], (int) ch))
445 return ok;
446 set += 1;
447 break;
449 case SRE_OP_CHARSET:
450 if (sizeof(SRE_CODE) == 2) {
451 /* <CHARSET> <bitmap> (16 bits per code word) */
452 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
453 return ok;
454 set += 16;
456 else {
457 /* <CHARSET> <bitmap> (32 bits per code word) */
458 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
459 return ok;
460 set += 8;
462 break;
464 case SRE_OP_RANGE:
465 /* <RANGE> <lower> <upper> */
466 if (set[0] <= ch && ch <= set[1])
467 return ok;
468 set += 2;
469 break;
471 case SRE_OP_NEGATE:
472 ok = !ok;
473 break;
475 case SRE_OP_BIGCHARSET:
476 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
478 Py_ssize_t count, block;
479 count = *(set++);
481 if (sizeof(SRE_CODE) == 2) {
482 block = ((unsigned char*)set)[ch >> 8];
483 set += 128;
484 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
485 return ok;
486 set += count*16;
488 else {
489 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
490 * warnings when c's type supports only numbers < N+1 */
491 if (!(ch & ~65535))
492 block = ((unsigned char*)set)[ch >> 8];
493 else
494 block = -1;
495 set += 64;
496 if (block >=0 &&
497 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
498 return ok;
499 set += count*8;
501 break;
504 default:
505 /* internal error -- there's not much we can do about it
506 here, so let's just pretend it didn't match... */
507 return 0;
512 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
514 LOCAL(Py_ssize_t)
515 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
517 SRE_CODE chr;
518 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
519 SRE_CHAR* end = (SRE_CHAR *)state->end;
520 Py_ssize_t i;
522 /* adjust end */
523 if (maxcount < end - ptr && maxcount != 65535)
524 end = ptr + maxcount;
526 switch (pattern[0]) {
528 case SRE_OP_IN:
529 /* repeated set */
530 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
531 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
532 ptr++;
533 break;
535 case SRE_OP_ANY:
536 /* repeated dot wildcard. */
537 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
538 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
539 ptr++;
540 break;
542 case SRE_OP_ANY_ALL:
543 /* repeated dot wildcard. skip to the end of the target
544 string, and backtrack from there */
545 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
546 ptr = end;
547 break;
549 case SRE_OP_LITERAL:
550 /* repeated literal */
551 chr = pattern[1];
552 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
553 while (ptr < end && (SRE_CODE) *ptr == chr)
554 ptr++;
555 break;
557 case SRE_OP_LITERAL_IGNORE:
558 /* repeated literal */
559 chr = pattern[1];
560 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
561 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
562 ptr++;
563 break;
565 case SRE_OP_NOT_LITERAL:
566 /* repeated non-literal */
567 chr = pattern[1];
568 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
569 while (ptr < end && (SRE_CODE) *ptr != chr)
570 ptr++;
571 break;
573 case SRE_OP_NOT_LITERAL_IGNORE:
574 /* repeated non-literal */
575 chr = pattern[1];
576 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
577 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
578 ptr++;
579 break;
581 default:
582 /* repeated single character pattern */
583 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
584 while ((SRE_CHAR*) state->ptr < end) {
585 i = SRE_MATCH(state, pattern);
586 if (i < 0)
587 return i;
588 if (!i)
589 break;
591 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
592 (SRE_CHAR*) state->ptr - ptr));
593 return (SRE_CHAR*) state->ptr - ptr;
596 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
597 return ptr - (SRE_CHAR*) state->ptr;
600 #if 0 /* not used in this release */
601 LOCAL(int)
602 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
604 /* check if an SRE_OP_INFO block matches at the current position.
605 returns the number of SRE_CODE objects to skip if successful, 0
606 if no match */
608 SRE_CHAR* end = state->end;
609 SRE_CHAR* ptr = state->ptr;
610 Py_ssize_t i;
612 /* check minimal length */
613 if (pattern[3] && (end - ptr) < pattern[3])
614 return 0;
616 /* check known prefix */
617 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
618 /* <length> <skip> <prefix data> <overlap data> */
619 for (i = 0; i < pattern[5]; i++)
620 if ((SRE_CODE) ptr[i] != pattern[7 + i])
621 return 0;
622 return pattern[0] + 2 * pattern[6];
624 return pattern[0];
626 #endif
628 /* The macros below should be used to protect recursive SRE_MATCH()
629 * calls that *failed* and do *not* return immediately (IOW, those
630 * that will backtrack). Explaining:
632 * - Recursive SRE_MATCH() returned true: that's usually a success
633 * (besides atypical cases like ASSERT_NOT), therefore there's no
634 * reason to restore lastmark;
636 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
637 * is returning to the caller: If the current SRE_MATCH() is the
638 * top function of the recursion, returning false will be a matching
639 * failure, and it doesn't matter where lastmark is pointing to.
640 * If it's *not* the top function, it will be a recursive SRE_MATCH()
641 * failure by itself, and the calling SRE_MATCH() will have to deal
642 * with the failure by the same rules explained here (it will restore
643 * lastmark by itself if necessary);
645 * - Recursive SRE_MATCH() returned false, and will continue the
646 * outside 'for' loop: must be protected when breaking, since the next
647 * OP could potentially depend on lastmark;
649 * - Recursive SRE_MATCH() returned false, and will be called again
650 * inside a local for/while loop: must be protected between each
651 * loop iteration, since the recursive SRE_MATCH() could do anything,
652 * and could potentially depend on lastmark.
654 * For more information, check the discussion at SF patch #712900.
656 #define LASTMARK_SAVE() \
657 do { \
658 ctx->lastmark = state->lastmark; \
659 ctx->lastindex = state->lastindex; \
660 } while (0)
661 #define LASTMARK_RESTORE() \
662 do { \
663 state->lastmark = ctx->lastmark; \
664 state->lastindex = ctx->lastindex; \
665 } while (0)
667 #define RETURN_ERROR(i) do { return i; } while(0)
668 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
669 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
671 #define RETURN_ON_ERROR(i) \
672 do { if (i < 0) RETURN_ERROR(i); } while (0)
673 #define RETURN_ON_SUCCESS(i) \
674 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
675 #define RETURN_ON_FAILURE(i) \
676 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
678 #define SFY(x) #x
680 #define DATA_STACK_ALLOC(state, type, ptr) \
681 do { \
682 alloc_pos = state->data_stack_base; \
683 TRACE(("allocating %s in %d (%d)\n", \
684 SFY(type), alloc_pos, sizeof(type))); \
685 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
686 int j = data_stack_grow(state, sizeof(type)); \
687 if (j < 0) return j; \
688 if (ctx_pos != -1) \
689 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
691 ptr = (type*)(state->data_stack+alloc_pos); \
692 state->data_stack_base += sizeof(type); \
693 } while (0)
695 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
696 do { \
697 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
698 ptr = (type*)(state->data_stack+pos); \
699 } while (0)
701 #define DATA_STACK_PUSH(state, data, size) \
702 do { \
703 TRACE(("copy data in %p to %d (%d)\n", \
704 data, state->data_stack_base, size)); \
705 if (state->data_stack_size < state->data_stack_base+size) { \
706 int j = data_stack_grow(state, size); \
707 if (j < 0) return j; \
708 if (ctx_pos != -1) \
709 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
711 memcpy(state->data_stack+state->data_stack_base, data, size); \
712 state->data_stack_base += size; \
713 } while (0)
715 #define DATA_STACK_POP(state, data, size, discard) \
716 do { \
717 TRACE(("copy data to %p from %d (%d)\n", \
718 data, state->data_stack_base-size, size)); \
719 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
720 if (discard) \
721 state->data_stack_base -= size; \
722 } while (0)
724 #define DATA_STACK_POP_DISCARD(state, size) \
725 do { \
726 TRACE(("discard data from %d (%d)\n", \
727 state->data_stack_base-size, size)); \
728 state->data_stack_base -= size; \
729 } while(0)
731 #define DATA_PUSH(x) \
732 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
733 #define DATA_POP(x) \
734 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
735 #define DATA_POP_DISCARD(x) \
736 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
737 #define DATA_ALLOC(t,p) \
738 DATA_STACK_ALLOC(state, t, p)
739 #define DATA_LOOKUP_AT(t,p,pos) \
740 DATA_STACK_LOOKUP_AT(state,t,p,pos)
742 #define MARK_PUSH(lastmark) \
743 do if (lastmark > 0) { \
744 i = lastmark; /* ctx->lastmark may change if reallocated */ \
745 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
746 } while (0)
747 #define MARK_POP(lastmark) \
748 do if (lastmark > 0) { \
749 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
750 } while (0)
751 #define MARK_POP_KEEP(lastmark) \
752 do if (lastmark > 0) { \
753 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
754 } while (0)
755 #define MARK_POP_DISCARD(lastmark) \
756 do if (lastmark > 0) { \
757 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
758 } while (0)
760 #define JUMP_NONE 0
761 #define JUMP_MAX_UNTIL_1 1
762 #define JUMP_MAX_UNTIL_2 2
763 #define JUMP_MAX_UNTIL_3 3
764 #define JUMP_MIN_UNTIL_1 4
765 #define JUMP_MIN_UNTIL_2 5
766 #define JUMP_MIN_UNTIL_3 6
767 #define JUMP_REPEAT 7
768 #define JUMP_REPEAT_ONE_1 8
769 #define JUMP_REPEAT_ONE_2 9
770 #define JUMP_MIN_REPEAT_ONE 10
771 #define JUMP_BRANCH 11
772 #define JUMP_ASSERT 12
773 #define JUMP_ASSERT_NOT 13
775 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
776 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
777 nextctx->last_ctx_pos = ctx_pos; \
778 nextctx->jump = jumpvalue; \
779 nextctx->pattern = nextpattern; \
780 ctx_pos = alloc_pos; \
781 ctx = nextctx; \
782 goto entrance; \
783 jumplabel: \
784 while (0) /* gcc doesn't like labels at end of scopes */ \
786 typedef struct {
787 Py_ssize_t last_ctx_pos;
788 Py_ssize_t jump;
789 SRE_CHAR* ptr;
790 SRE_CODE* pattern;
791 Py_ssize_t count;
792 Py_ssize_t lastmark;
793 Py_ssize_t lastindex;
794 union {
795 SRE_CODE chr;
796 SRE_REPEAT* rep;
797 } u;
798 } SRE_MATCH_CONTEXT;
800 /* check if string matches the given pattern. returns <0 for
801 error, 0 for failure, and 1 for success */
802 LOCAL(Py_ssize_t)
803 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
805 SRE_CHAR* end = (SRE_CHAR *)state->end;
806 Py_ssize_t alloc_pos, ctx_pos = -1;
807 Py_ssize_t i, ret = 0;
808 Py_ssize_t jump;
809 unsigned int sigcount=0;
811 SRE_MATCH_CONTEXT* ctx;
812 SRE_MATCH_CONTEXT* nextctx;
814 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
816 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
817 ctx->last_ctx_pos = -1;
818 ctx->jump = JUMP_NONE;
819 ctx->pattern = pattern;
820 ctx_pos = alloc_pos;
822 entrance:
824 ctx->ptr = (SRE_CHAR *)state->ptr;
826 if (ctx->pattern[0] == SRE_OP_INFO) {
827 /* optimization info block */
828 /* <INFO> <1=skip> <2=flags> <3=min> ... */
829 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
830 TRACE(("reject (got %d chars, need %d)\n",
831 (end - ctx->ptr), ctx->pattern[3]));
832 RETURN_FAILURE;
834 ctx->pattern += ctx->pattern[1] + 1;
837 for (;;) {
838 ++sigcount;
839 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
840 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
842 switch (*ctx->pattern++) {
844 case SRE_OP_MARK:
845 /* set mark */
846 /* <MARK> <gid> */
847 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
848 ctx->ptr, ctx->pattern[0]));
849 i = ctx->pattern[0];
850 if (i & 1)
851 state->lastindex = i/2 + 1;
852 if (i > state->lastmark) {
853 /* state->lastmark is the highest valid index in the
854 state->mark array. If it is increased by more than 1,
855 the intervening marks must be set to NULL to signal
856 that these marks have not been encountered. */
857 Py_ssize_t j = state->lastmark + 1;
858 while (j < i)
859 state->mark[j++] = NULL;
860 state->lastmark = i;
862 state->mark[i] = ctx->ptr;
863 ctx->pattern++;
864 break;
866 case SRE_OP_LITERAL:
867 /* match literal string */
868 /* <LITERAL> <code> */
869 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
870 ctx->ptr, *ctx->pattern));
871 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
872 RETURN_FAILURE;
873 ctx->pattern++;
874 ctx->ptr++;
875 break;
877 case SRE_OP_NOT_LITERAL:
878 /* match anything that is not literal character */
879 /* <NOT_LITERAL> <code> */
880 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
881 ctx->ptr, *ctx->pattern));
882 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
883 RETURN_FAILURE;
884 ctx->pattern++;
885 ctx->ptr++;
886 break;
888 case SRE_OP_SUCCESS:
889 /* end of pattern */
890 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
891 state->ptr = ctx->ptr;
892 RETURN_SUCCESS;
894 case SRE_OP_AT:
895 /* match at given position */
896 /* <AT> <code> */
897 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
898 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
899 RETURN_FAILURE;
900 ctx->pattern++;
901 break;
903 case SRE_OP_CATEGORY:
904 /* match at given category */
905 /* <CATEGORY> <code> */
906 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
907 ctx->ptr, *ctx->pattern));
908 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
909 RETURN_FAILURE;
910 ctx->pattern++;
911 ctx->ptr++;
912 break;
914 case SRE_OP_ANY:
915 /* match anything (except a newline) */
916 /* <ANY> */
917 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
918 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
919 RETURN_FAILURE;
920 ctx->ptr++;
921 break;
923 case SRE_OP_ANY_ALL:
924 /* match anything */
925 /* <ANY_ALL> */
926 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
927 if (ctx->ptr >= end)
928 RETURN_FAILURE;
929 ctx->ptr++;
930 break;
932 case SRE_OP_IN:
933 /* match set member (or non_member) */
934 /* <IN> <skip> <set> */
935 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
936 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
937 RETURN_FAILURE;
938 ctx->pattern += ctx->pattern[0];
939 ctx->ptr++;
940 break;
942 case SRE_OP_LITERAL_IGNORE:
943 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
944 ctx->pattern, ctx->ptr, ctx->pattern[0]));
945 if (ctx->ptr >= end ||
946 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
947 RETURN_FAILURE;
948 ctx->pattern++;
949 ctx->ptr++;
950 break;
952 case SRE_OP_NOT_LITERAL_IGNORE:
953 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
954 ctx->pattern, ctx->ptr, *ctx->pattern));
955 if (ctx->ptr >= end ||
956 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
957 RETURN_FAILURE;
958 ctx->pattern++;
959 ctx->ptr++;
960 break;
962 case SRE_OP_IN_IGNORE:
963 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
964 if (ctx->ptr >= end
965 || !SRE_CHARSET(ctx->pattern+1,
966 (SRE_CODE)state->lower(*ctx->ptr)))
967 RETURN_FAILURE;
968 ctx->pattern += ctx->pattern[0];
969 ctx->ptr++;
970 break;
972 case SRE_OP_JUMP:
973 case SRE_OP_INFO:
974 /* jump forward */
975 /* <JUMP> <offset> */
976 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
977 ctx->ptr, ctx->pattern[0]));
978 ctx->pattern += ctx->pattern[0];
979 break;
981 case SRE_OP_BRANCH:
982 /* alternation */
983 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
984 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
985 LASTMARK_SAVE();
986 ctx->u.rep = state->repeat;
987 if (ctx->u.rep)
988 MARK_PUSH(ctx->lastmark);
989 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
990 if (ctx->pattern[1] == SRE_OP_LITERAL &&
991 (ctx->ptr >= end ||
992 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
993 continue;
994 if (ctx->pattern[1] == SRE_OP_IN &&
995 (ctx->ptr >= end ||
996 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
997 continue;
998 state->ptr = ctx->ptr;
999 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1000 if (ret) {
1001 if (ctx->u.rep)
1002 MARK_POP_DISCARD(ctx->lastmark);
1003 RETURN_ON_ERROR(ret);
1004 RETURN_SUCCESS;
1006 if (ctx->u.rep)
1007 MARK_POP_KEEP(ctx->lastmark);
1008 LASTMARK_RESTORE();
1010 if (ctx->u.rep)
1011 MARK_POP_DISCARD(ctx->lastmark);
1012 RETURN_FAILURE;
1014 case SRE_OP_REPEAT_ONE:
1015 /* match repeated sequence (maximizing regexp) */
1017 /* this operator only works if the repeated item is
1018 exactly one character wide, and we're not already
1019 collecting backtracking points. for other cases,
1020 use the MAX_REPEAT operator */
1022 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1024 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1025 ctx->pattern[1], ctx->pattern[2]));
1027 if (ctx->ptr + ctx->pattern[1] > end)
1028 RETURN_FAILURE; /* cannot match */
1030 state->ptr = ctx->ptr;
1032 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1033 RETURN_ON_ERROR(ret);
1034 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1035 ctx->count = ret;
1036 ctx->ptr += ctx->count;
1038 /* when we arrive here, count contains the number of
1039 matches, and ctx->ptr points to the tail of the target
1040 string. check if the rest of the pattern matches,
1041 and backtrack if not. */
1043 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1044 RETURN_FAILURE;
1046 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1047 /* tail is empty. we're finished */
1048 state->ptr = ctx->ptr;
1049 RETURN_SUCCESS;
1052 LASTMARK_SAVE();
1054 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1055 /* tail starts with a literal. skip positions where
1056 the rest of the pattern cannot possibly match */
1057 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1058 for (;;) {
1059 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1060 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1061 ctx->ptr--;
1062 ctx->count--;
1064 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1065 break;
1066 state->ptr = ctx->ptr;
1067 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1068 ctx->pattern+ctx->pattern[0]);
1069 if (ret) {
1070 RETURN_ON_ERROR(ret);
1071 RETURN_SUCCESS;
1074 LASTMARK_RESTORE();
1076 ctx->ptr--;
1077 ctx->count--;
1080 } else {
1081 /* general case */
1082 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1083 state->ptr = ctx->ptr;
1084 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1085 ctx->pattern+ctx->pattern[0]);
1086 if (ret) {
1087 RETURN_ON_ERROR(ret);
1088 RETURN_SUCCESS;
1090 ctx->ptr--;
1091 ctx->count--;
1092 LASTMARK_RESTORE();
1095 RETURN_FAILURE;
1097 case SRE_OP_MIN_REPEAT_ONE:
1098 /* match repeated sequence (minimizing regexp) */
1100 /* this operator only works if the repeated item is
1101 exactly one character wide, and we're not already
1102 collecting backtracking points. for other cases,
1103 use the MIN_REPEAT operator */
1105 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1107 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1108 ctx->pattern[1], ctx->pattern[2]));
1110 if (ctx->ptr + ctx->pattern[1] > end)
1111 RETURN_FAILURE; /* cannot match */
1113 state->ptr = ctx->ptr;
1115 if (ctx->pattern[1] == 0)
1116 ctx->count = 0;
1117 else {
1118 /* count using pattern min as the maximum */
1119 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1120 RETURN_ON_ERROR(ret);
1121 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1122 if (ret < (Py_ssize_t) ctx->pattern[1])
1123 /* didn't match minimum number of times */
1124 RETURN_FAILURE;
1125 /* advance past minimum matches of repeat */
1126 ctx->count = ret;
1127 ctx->ptr += ctx->count;
1130 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1131 /* tail is empty. we're finished */
1132 state->ptr = ctx->ptr;
1133 RETURN_SUCCESS;
1135 } else {
1136 /* general case */
1137 LASTMARK_SAVE();
1138 while ((Py_ssize_t)ctx->pattern[2] == 65535
1139 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1140 state->ptr = ctx->ptr;
1141 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1142 ctx->pattern+ctx->pattern[0]);
1143 if (ret) {
1144 RETURN_ON_ERROR(ret);
1145 RETURN_SUCCESS;
1147 state->ptr = ctx->ptr;
1148 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1149 RETURN_ON_ERROR(ret);
1150 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1151 if (ret == 0)
1152 break;
1153 assert(ret == 1);
1154 ctx->ptr++;
1155 ctx->count++;
1156 LASTMARK_RESTORE();
1159 RETURN_FAILURE;
1161 case SRE_OP_REPEAT:
1162 /* create repeat context. all the hard work is done
1163 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1164 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1165 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1166 ctx->pattern[1], ctx->pattern[2]));
1168 /* install new repeat context */
1169 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1170 if (!ctx->u.rep) {
1171 PyErr_NoMemory();
1172 RETURN_FAILURE;
1174 ctx->u.rep->count = -1;
1175 ctx->u.rep->pattern = ctx->pattern;
1176 ctx->u.rep->prev = state->repeat;
1177 ctx->u.rep->last_ptr = NULL;
1178 state->repeat = ctx->u.rep;
1180 state->ptr = ctx->ptr;
1181 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1182 state->repeat = ctx->u.rep->prev;
1183 PyObject_FREE(ctx->u.rep);
1185 if (ret) {
1186 RETURN_ON_ERROR(ret);
1187 RETURN_SUCCESS;
1189 RETURN_FAILURE;
1191 case SRE_OP_MAX_UNTIL:
1192 /* maximizing repeat */
1193 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1195 /* FIXME: we probably need to deal with zero-width
1196 matches in here... */
1198 ctx->u.rep = state->repeat;
1199 if (!ctx->u.rep)
1200 RETURN_ERROR(SRE_ERROR_STATE);
1202 state->ptr = ctx->ptr;
1204 ctx->count = ctx->u.rep->count+1;
1206 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1207 ctx->ptr, ctx->count));
1209 if (ctx->count < ctx->u.rep->pattern[1]) {
1210 /* not enough matches */
1211 ctx->u.rep->count = ctx->count;
1212 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1213 ctx->u.rep->pattern+3);
1214 if (ret) {
1215 RETURN_ON_ERROR(ret);
1216 RETURN_SUCCESS;
1218 ctx->u.rep->count = ctx->count-1;
1219 state->ptr = ctx->ptr;
1220 RETURN_FAILURE;
1223 if ((ctx->count < ctx->u.rep->pattern[2] ||
1224 ctx->u.rep->pattern[2] == 65535) &&
1225 state->ptr != ctx->u.rep->last_ptr) {
1226 /* we may have enough matches, but if we can
1227 match another item, do so */
1228 ctx->u.rep->count = ctx->count;
1229 LASTMARK_SAVE();
1230 MARK_PUSH(ctx->lastmark);
1231 /* zero-width match protection */
1232 DATA_PUSH(&ctx->u.rep->last_ptr);
1233 ctx->u.rep->last_ptr = state->ptr;
1234 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1235 ctx->u.rep->pattern+3);
1236 DATA_POP(&ctx->u.rep->last_ptr);
1237 if (ret) {
1238 MARK_POP_DISCARD(ctx->lastmark);
1239 RETURN_ON_ERROR(ret);
1240 RETURN_SUCCESS;
1242 MARK_POP(ctx->lastmark);
1243 LASTMARK_RESTORE();
1244 ctx->u.rep->count = ctx->count-1;
1245 state->ptr = ctx->ptr;
1248 /* cannot match more repeated items here. make sure the
1249 tail matches */
1250 state->repeat = ctx->u.rep->prev;
1251 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1252 RETURN_ON_SUCCESS(ret);
1253 state->repeat = ctx->u.rep;
1254 state->ptr = ctx->ptr;
1255 RETURN_FAILURE;
1257 case SRE_OP_MIN_UNTIL:
1258 /* minimizing repeat */
1259 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1261 ctx->u.rep = state->repeat;
1262 if (!ctx->u.rep)
1263 RETURN_ERROR(SRE_ERROR_STATE);
1265 state->ptr = ctx->ptr;
1267 ctx->count = ctx->u.rep->count+1;
1269 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1270 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1272 if (ctx->count < ctx->u.rep->pattern[1]) {
1273 /* not enough matches */
1274 ctx->u.rep->count = ctx->count;
1275 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1276 ctx->u.rep->pattern+3);
1277 if (ret) {
1278 RETURN_ON_ERROR(ret);
1279 RETURN_SUCCESS;
1281 ctx->u.rep->count = ctx->count-1;
1282 state->ptr = ctx->ptr;
1283 RETURN_FAILURE;
1286 LASTMARK_SAVE();
1288 /* see if the tail matches */
1289 state->repeat = ctx->u.rep->prev;
1290 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1291 if (ret) {
1292 RETURN_ON_ERROR(ret);
1293 RETURN_SUCCESS;
1296 state->repeat = ctx->u.rep;
1297 state->ptr = ctx->ptr;
1299 LASTMARK_RESTORE();
1301 if (ctx->count >= ctx->u.rep->pattern[2]
1302 && ctx->u.rep->pattern[2] != 65535)
1303 RETURN_FAILURE;
1305 ctx->u.rep->count = ctx->count;
1306 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1307 ctx->u.rep->pattern+3);
1308 if (ret) {
1309 RETURN_ON_ERROR(ret);
1310 RETURN_SUCCESS;
1312 ctx->u.rep->count = ctx->count-1;
1313 state->ptr = ctx->ptr;
1314 RETURN_FAILURE;
1316 case SRE_OP_GROUPREF:
1317 /* match backreference */
1318 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1319 ctx->ptr, ctx->pattern[0]));
1320 i = ctx->pattern[0];
1322 Py_ssize_t groupref = i+i;
1323 if (groupref >= state->lastmark) {
1324 RETURN_FAILURE;
1325 } else {
1326 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1327 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1328 if (!p || !e || e < p)
1329 RETURN_FAILURE;
1330 while (p < e) {
1331 if (ctx->ptr >= end || *ctx->ptr != *p)
1332 RETURN_FAILURE;
1333 p++; ctx->ptr++;
1337 ctx->pattern++;
1338 break;
1340 case SRE_OP_GROUPREF_IGNORE:
1341 /* match backreference */
1342 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1343 ctx->ptr, ctx->pattern[0]));
1344 i = ctx->pattern[0];
1346 Py_ssize_t groupref = i+i;
1347 if (groupref >= state->lastmark) {
1348 RETURN_FAILURE;
1349 } else {
1350 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1351 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1352 if (!p || !e || e < p)
1353 RETURN_FAILURE;
1354 while (p < e) {
1355 if (ctx->ptr >= end ||
1356 state->lower(*ctx->ptr) != state->lower(*p))
1357 RETURN_FAILURE;
1358 p++; ctx->ptr++;
1362 ctx->pattern++;
1363 break;
1365 case SRE_OP_GROUPREF_EXISTS:
1366 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1367 ctx->ptr, ctx->pattern[0]));
1368 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1369 i = ctx->pattern[0];
1371 Py_ssize_t groupref = i+i;
1372 if (groupref >= state->lastmark) {
1373 ctx->pattern += ctx->pattern[1];
1374 break;
1375 } else {
1376 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1377 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1378 if (!p || !e || e < p) {
1379 ctx->pattern += ctx->pattern[1];
1380 break;
1384 ctx->pattern += 2;
1385 break;
1387 case SRE_OP_ASSERT:
1388 /* assert subpattern */
1389 /* <ASSERT> <skip> <back> <pattern> */
1390 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1391 ctx->ptr, ctx->pattern[1]));
1392 state->ptr = ctx->ptr - ctx->pattern[1];
1393 if (state->ptr < state->beginning)
1394 RETURN_FAILURE;
1395 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1396 RETURN_ON_FAILURE(ret);
1397 ctx->pattern += ctx->pattern[0];
1398 break;
1400 case SRE_OP_ASSERT_NOT:
1401 /* assert not subpattern */
1402 /* <ASSERT_NOT> <skip> <back> <pattern> */
1403 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1404 ctx->ptr, ctx->pattern[1]));
1405 state->ptr = ctx->ptr - ctx->pattern[1];
1406 if (state->ptr >= state->beginning) {
1407 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1408 if (ret) {
1409 RETURN_ON_ERROR(ret);
1410 RETURN_FAILURE;
1413 ctx->pattern += ctx->pattern[0];
1414 break;
1416 case SRE_OP_FAILURE:
1417 /* immediate failure */
1418 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1419 RETURN_FAILURE;
1421 default:
1422 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1423 ctx->pattern[-1]));
1424 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1428 exit:
1429 ctx_pos = ctx->last_ctx_pos;
1430 jump = ctx->jump;
1431 DATA_POP_DISCARD(ctx);
1432 if (ctx_pos == -1)
1433 return ret;
1434 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1436 switch (jump) {
1437 case JUMP_MAX_UNTIL_2:
1438 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1439 goto jump_max_until_2;
1440 case JUMP_MAX_UNTIL_3:
1441 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1442 goto jump_max_until_3;
1443 case JUMP_MIN_UNTIL_2:
1444 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1445 goto jump_min_until_2;
1446 case JUMP_MIN_UNTIL_3:
1447 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1448 goto jump_min_until_3;
1449 case JUMP_BRANCH:
1450 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1451 goto jump_branch;
1452 case JUMP_MAX_UNTIL_1:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1454 goto jump_max_until_1;
1455 case JUMP_MIN_UNTIL_1:
1456 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1457 goto jump_min_until_1;
1458 case JUMP_REPEAT:
1459 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1460 goto jump_repeat;
1461 case JUMP_REPEAT_ONE_1:
1462 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1463 goto jump_repeat_one_1;
1464 case JUMP_REPEAT_ONE_2:
1465 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1466 goto jump_repeat_one_2;
1467 case JUMP_MIN_REPEAT_ONE:
1468 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1469 goto jump_min_repeat_one;
1470 case JUMP_ASSERT:
1471 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1472 goto jump_assert;
1473 case JUMP_ASSERT_NOT:
1474 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1475 goto jump_assert_not;
1476 case JUMP_NONE:
1477 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1478 break;
1481 return ret; /* should never get here */
1484 LOCAL(Py_ssize_t)
1485 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1487 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1488 SRE_CHAR* end = (SRE_CHAR *)state->end;
1489 Py_ssize_t status = 0;
1490 Py_ssize_t prefix_len = 0;
1491 Py_ssize_t prefix_skip = 0;
1492 SRE_CODE* prefix = NULL;
1493 SRE_CODE* charset = NULL;
1494 SRE_CODE* overlap = NULL;
1495 int flags = 0;
1497 if (pattern[0] == SRE_OP_INFO) {
1498 /* optimization info block */
1499 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1501 flags = pattern[2];
1503 if (pattern[3] > 1) {
1504 /* adjust end point (but make sure we leave at least one
1505 character in there, so literal search will work) */
1506 end -= pattern[3]-1;
1507 if (end <= ptr)
1508 end = ptr+1;
1511 if (flags & SRE_INFO_PREFIX) {
1512 /* pattern starts with a known prefix */
1513 /* <length> <skip> <prefix data> <overlap data> */
1514 prefix_len = pattern[5];
1515 prefix_skip = pattern[6];
1516 prefix = pattern + 7;
1517 overlap = prefix + prefix_len - 1;
1518 } else if (flags & SRE_INFO_CHARSET)
1519 /* pattern starts with a character from a known set */
1520 /* <charset> */
1521 charset = pattern + 5;
1523 pattern += 1 + pattern[1];
1526 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1527 TRACE(("charset = %p\n", charset));
1529 #if defined(USE_FAST_SEARCH)
1530 if (prefix_len > 1) {
1531 /* pattern starts with a known prefix. use the overlap
1532 table to skip forward as fast as we possibly can */
1533 Py_ssize_t i = 0;
1534 end = (SRE_CHAR *)state->end;
1535 while (ptr < end) {
1536 for (;;) {
1537 if ((SRE_CODE) ptr[0] != prefix[i]) {
1538 if (!i)
1539 break;
1540 else
1541 i = overlap[i];
1542 } else {
1543 if (++i == prefix_len) {
1544 /* found a potential match */
1545 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1546 state->start = ptr + 1 - prefix_len;
1547 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1548 if (flags & SRE_INFO_LITERAL)
1549 return 1; /* we got all of it */
1550 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1551 if (status != 0)
1552 return status;
1553 /* close but no cigar -- try again */
1554 i = overlap[i];
1556 break;
1559 ptr++;
1561 return 0;
1563 #endif
1565 if (pattern[0] == SRE_OP_LITERAL) {
1566 /* pattern starts with a literal character. this is used
1567 for short prefixes, and if fast search is disabled */
1568 SRE_CODE chr = pattern[1];
1569 end = (SRE_CHAR *)state->end;
1570 for (;;) {
1571 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1572 ptr++;
1573 if (ptr >= end)
1574 return 0;
1575 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1576 state->start = ptr;
1577 state->ptr = ++ptr;
1578 if (flags & SRE_INFO_LITERAL)
1579 return 1; /* we got all of it */
1580 status = SRE_MATCH(state, pattern + 2);
1581 if (status != 0)
1582 break;
1584 } else if (charset) {
1585 /* pattern starts with a character from a known set */
1586 end = (SRE_CHAR *)state->end;
1587 for (;;) {
1588 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1589 ptr++;
1590 if (ptr >= end)
1591 return 0;
1592 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1593 state->start = ptr;
1594 state->ptr = ptr;
1595 status = SRE_MATCH(state, pattern);
1596 if (status != 0)
1597 break;
1598 ptr++;
1600 } else
1601 /* general case */
1602 while (ptr <= end) {
1603 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1604 state->start = state->ptr = ptr++;
1605 status = SRE_MATCH(state, pattern);
1606 if (status != 0)
1607 break;
1610 return status;
1613 LOCAL(int)
1614 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1616 /* check if given string is a literal template (i.e. no escapes) */
1617 while (len-- > 0)
1618 if (*ptr++ == '\\')
1619 return 0;
1620 return 1;
1623 #if !defined(SRE_RECURSIVE)
1625 /* -------------------------------------------------------------------- */
1626 /* factories and destructors */
1628 /* see sre.h for object declarations */
1629 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1630 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1632 static PyObject *
1633 sre_codesize(PyObject* self, PyObject *unused)
1635 return Py_BuildValue("l", sizeof(SRE_CODE));
1638 static PyObject *
1639 sre_getlower(PyObject* self, PyObject* args)
1641 int character, flags;
1642 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1643 return NULL;
1644 if (flags & SRE_FLAG_LOCALE)
1645 return Py_BuildValue("i", sre_lower_locale(character));
1646 if (flags & SRE_FLAG_UNICODE)
1647 #if defined(HAVE_UNICODE)
1648 return Py_BuildValue("i", sre_lower_unicode(character));
1649 #else
1650 return Py_BuildValue("i", sre_lower_locale(character));
1651 #endif
1652 return Py_BuildValue("i", sre_lower(character));
1655 LOCAL(void)
1656 state_reset(SRE_STATE* state)
1658 /* FIXME: dynamic! */
1659 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1661 state->lastmark = -1;
1662 state->lastindex = -1;
1664 state->repeat = NULL;
1666 data_stack_dealloc(state);
1669 static void*
1670 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1672 /* given a python object, return a data pointer, a length (in
1673 characters), and a character size. return NULL if the object
1674 is not a string (or not compatible) */
1676 PyBufferProcs *buffer;
1677 Py_ssize_t size, bytes;
1678 int charsize;
1679 void* ptr;
1680 Py_buffer view;
1682 /* Unicode objects do not support the buffer API. So, get the data
1683 directly instead. */
1684 if (PyUnicode_Check(string)) {
1685 ptr = (void *)PyUnicode_AS_DATA(string);
1686 *p_length = PyUnicode_GET_SIZE(string);
1687 *p_charsize = sizeof(Py_UNICODE);
1688 return ptr;
1691 /* get pointer to string buffer */
1692 view.len = -1;
1693 buffer = Py_TYPE(string)->tp_as_buffer;
1694 if (!buffer || !buffer->bf_getbuffer ||
1695 (*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
1696 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1697 return NULL;
1700 /* determine buffer size */
1701 bytes = view.len;
1702 ptr = view.buf;
1704 /* Release the buffer immediately --- possibly dangerous
1705 but doing something else would require some re-factoring
1707 PyObject_ReleaseBuffer(string, &view);
1709 if (bytes < 0) {
1710 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1711 return NULL;
1714 /* determine character size */
1715 size = PyObject_Size(string);
1717 if (PyString_Check(string) || bytes == size)
1718 charsize = 1;
1719 #if defined(HAVE_UNICODE)
1720 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1721 charsize = sizeof(Py_UNICODE);
1722 #endif
1723 else {
1724 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1725 return NULL;
1728 *p_length = size;
1729 *p_charsize = charsize;
1731 if (ptr == NULL) {
1732 PyErr_SetString(PyExc_ValueError,
1733 "Buffer is NULL");
1735 return ptr;
1738 LOCAL(PyObject*)
1739 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1740 Py_ssize_t start, Py_ssize_t end)
1742 /* prepare state object */
1744 Py_ssize_t length;
1745 int charsize;
1746 void* ptr;
1748 memset(state, 0, sizeof(SRE_STATE));
1750 state->lastmark = -1;
1751 state->lastindex = -1;
1753 ptr = getstring(string, &length, &charsize);
1754 if (!ptr)
1755 return NULL;
1757 /* adjust boundaries */
1758 if (start < 0)
1759 start = 0;
1760 else if (start > length)
1761 start = length;
1763 if (end < 0)
1764 end = 0;
1765 else if (end > length)
1766 end = length;
1768 state->charsize = charsize;
1770 state->beginning = ptr;
1772 state->start = (void*) ((char*) ptr + start * state->charsize);
1773 state->end = (void*) ((char*) ptr + end * state->charsize);
1775 Py_INCREF(string);
1776 state->string = string;
1777 state->pos = start;
1778 state->endpos = end;
1780 if (pattern->flags & SRE_FLAG_LOCALE)
1781 state->lower = sre_lower_locale;
1782 else if (pattern->flags & SRE_FLAG_UNICODE)
1783 #if defined(HAVE_UNICODE)
1784 state->lower = sre_lower_unicode;
1785 #else
1786 state->lower = sre_lower_locale;
1787 #endif
1788 else
1789 state->lower = sre_lower;
1791 return string;
1794 LOCAL(void)
1795 state_fini(SRE_STATE* state)
1797 Py_XDECREF(state->string);
1798 data_stack_dealloc(state);
1801 /* calculate offset from start of string */
1802 #define STATE_OFFSET(state, member)\
1803 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1805 LOCAL(PyObject*)
1806 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1808 Py_ssize_t i, j;
1810 index = (index - 1) * 2;
1812 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1813 if (empty)
1814 /* want empty string */
1815 i = j = 0;
1816 else {
1817 Py_INCREF(Py_None);
1818 return Py_None;
1820 } else {
1821 i = STATE_OFFSET(state, state->mark[index]);
1822 j = STATE_OFFSET(state, state->mark[index+1]);
1825 return PySequence_GetSlice(string, i, j);
1828 static void
1829 pattern_error(int status)
1831 switch (status) {
1832 case SRE_ERROR_RECURSION_LIMIT:
1833 PyErr_SetString(
1834 PyExc_RuntimeError,
1835 "maximum recursion limit exceeded"
1837 break;
1838 case SRE_ERROR_MEMORY:
1839 PyErr_NoMemory();
1840 break;
1841 case SRE_ERROR_INTERRUPTED:
1842 /* An exception has already been raised, so let it fly */
1843 break;
1844 default:
1845 /* other error codes indicate compiler/engine bugs */
1846 PyErr_SetString(
1847 PyExc_RuntimeError,
1848 "internal error in regular expression engine"
1853 static void
1854 pattern_dealloc(PatternObject* self)
1856 if (self->weakreflist != NULL)
1857 PyObject_ClearWeakRefs((PyObject *) self);
1858 Py_XDECREF(self->pattern);
1859 Py_XDECREF(self->groupindex);
1860 Py_XDECREF(self->indexgroup);
1861 PyObject_DEL(self);
1864 static PyObject*
1865 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1867 SRE_STATE state;
1868 int status;
1870 PyObject* string;
1871 Py_ssize_t start = 0;
1872 Py_ssize_t end = PY_SSIZE_T_MAX;
1873 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1874 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1875 &string, &start, &end))
1876 return NULL;
1878 string = state_init(&state, self, string, start, end);
1879 if (!string)
1880 return NULL;
1882 state.ptr = state.start;
1884 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1886 if (state.charsize == 1) {
1887 status = sre_match(&state, PatternObject_GetCode(self));
1888 } else {
1889 #if defined(HAVE_UNICODE)
1890 status = sre_umatch(&state, PatternObject_GetCode(self));
1891 #endif
1894 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1895 if (PyErr_Occurred())
1896 return NULL;
1898 state_fini(&state);
1900 return pattern_new_match(self, &state, status);
1903 static PyObject*
1904 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1906 SRE_STATE state;
1907 int status;
1909 PyObject* string;
1910 Py_ssize_t start = 0;
1911 Py_ssize_t end = PY_SSIZE_T_MAX;
1912 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1913 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1914 &string, &start, &end))
1915 return NULL;
1917 string = state_init(&state, self, string, start, end);
1918 if (!string)
1919 return NULL;
1921 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1923 if (state.charsize == 1) {
1924 status = sre_search(&state, PatternObject_GetCode(self));
1925 } else {
1926 #if defined(HAVE_UNICODE)
1927 status = sre_usearch(&state, PatternObject_GetCode(self));
1928 #endif
1931 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1933 state_fini(&state);
1935 if (PyErr_Occurred())
1936 return NULL;
1938 return pattern_new_match(self, &state, status);
1941 static PyObject*
1942 call(char* module, char* function, PyObject* args)
1944 PyObject* name;
1945 PyObject* mod;
1946 PyObject* func;
1947 PyObject* result;
1949 if (!args)
1950 return NULL;
1951 name = PyUnicode_FromString(module);
1952 if (!name)
1953 return NULL;
1954 mod = PyImport_Import(name);
1955 Py_DECREF(name);
1956 if (!mod)
1957 return NULL;
1958 func = PyObject_GetAttrString(mod, function);
1959 Py_DECREF(mod);
1960 if (!func)
1961 return NULL;
1962 result = PyObject_CallObject(func, args);
1963 Py_DECREF(func);
1964 Py_DECREF(args);
1965 return result;
1968 #ifdef USE_BUILTIN_COPY
1969 static int
1970 deepcopy(PyObject** object, PyObject* memo)
1972 PyObject* copy;
1974 copy = call(
1975 "copy", "deepcopy",
1976 PyTuple_Pack(2, *object, memo)
1978 if (!copy)
1979 return 0;
1981 Py_DECREF(*object);
1982 *object = copy;
1984 return 1; /* success */
1986 #endif
1988 static PyObject*
1989 join_list(PyObject* list, PyObject* string)
1991 /* join list elements */
1993 PyObject* joiner;
1994 #if PY_VERSION_HEX >= 0x01060000
1995 PyObject* function;
1996 PyObject* args;
1997 #endif
1998 PyObject* result;
2000 joiner = PySequence_GetSlice(string, 0, 0);
2001 if (!joiner)
2002 return NULL;
2004 if (PyList_GET_SIZE(list) == 0) {
2005 Py_DECREF(list);
2006 return joiner;
2009 #if PY_VERSION_HEX >= 0x01060000
2010 function = PyObject_GetAttrString(joiner, "join");
2011 if (!function) {
2012 Py_DECREF(joiner);
2013 return NULL;
2015 args = PyTuple_New(1);
2016 if (!args) {
2017 Py_DECREF(function);
2018 Py_DECREF(joiner);
2019 return NULL;
2021 PyTuple_SET_ITEM(args, 0, list);
2022 result = PyObject_CallObject(function, args);
2023 Py_DECREF(args); /* also removes list */
2024 Py_DECREF(function);
2025 #else
2026 result = call(
2027 "string", "join",
2028 PyTuple_Pack(2, list, joiner)
2030 #endif
2031 Py_DECREF(joiner);
2033 return result;
2036 static PyObject*
2037 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2039 SRE_STATE state;
2040 PyObject* list;
2041 int status;
2042 Py_ssize_t i, b, e;
2044 PyObject* string;
2045 Py_ssize_t start = 0;
2046 Py_ssize_t end = PY_SSIZE_T_MAX;
2047 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2048 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2049 &string, &start, &end))
2050 return NULL;
2052 string = state_init(&state, self, string, start, end);
2053 if (!string)
2054 return NULL;
2056 list = PyList_New(0);
2057 if (!list) {
2058 state_fini(&state);
2059 return NULL;
2062 while (state.start <= state.end) {
2064 PyObject* item;
2066 state_reset(&state);
2068 state.ptr = state.start;
2070 if (state.charsize == 1) {
2071 status = sre_search(&state, PatternObject_GetCode(self));
2072 } else {
2073 #if defined(HAVE_UNICODE)
2074 status = sre_usearch(&state, PatternObject_GetCode(self));
2075 #endif
2078 if (PyErr_Occurred())
2079 goto error;
2081 if (status <= 0) {
2082 if (status == 0)
2083 break;
2084 pattern_error(status);
2085 goto error;
2088 /* don't bother to build a match object */
2089 switch (self->groups) {
2090 case 0:
2091 b = STATE_OFFSET(&state, state.start);
2092 e = STATE_OFFSET(&state, state.ptr);
2093 item = PySequence_GetSlice(string, b, e);
2094 if (!item)
2095 goto error;
2096 break;
2097 case 1:
2098 item = state_getslice(&state, 1, string, 1);
2099 if (!item)
2100 goto error;
2101 break;
2102 default:
2103 item = PyTuple_New(self->groups);
2104 if (!item)
2105 goto error;
2106 for (i = 0; i < self->groups; i++) {
2107 PyObject* o = state_getslice(&state, i+1, string, 1);
2108 if (!o) {
2109 Py_DECREF(item);
2110 goto error;
2112 PyTuple_SET_ITEM(item, i, o);
2114 break;
2117 status = PyList_Append(list, item);
2118 Py_DECREF(item);
2119 if (status < 0)
2120 goto error;
2122 if (state.ptr == state.start)
2123 state.start = (void*) ((char*) state.ptr + state.charsize);
2124 else
2125 state.start = state.ptr;
2129 state_fini(&state);
2130 return list;
2132 error:
2133 Py_DECREF(list);
2134 state_fini(&state);
2135 return NULL;
2139 #if PY_VERSION_HEX >= 0x02020000
2140 static PyObject*
2141 pattern_finditer(PatternObject* pattern, PyObject* args)
2143 PyObject* scanner;
2144 PyObject* search;
2145 PyObject* iterator;
2147 scanner = pattern_scanner(pattern, args);
2148 if (!scanner)
2149 return NULL;
2151 search = PyObject_GetAttrString(scanner, "search");
2152 Py_DECREF(scanner);
2153 if (!search)
2154 return NULL;
2156 iterator = PyCallIter_New(search, Py_None);
2157 Py_DECREF(search);
2159 return iterator;
2161 #endif
2163 static PyObject*
2164 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2166 SRE_STATE state;
2167 PyObject* list;
2168 PyObject* item;
2169 int status;
2170 Py_ssize_t n;
2171 Py_ssize_t i;
2172 void* last;
2174 PyObject* string;
2175 Py_ssize_t maxsplit = 0;
2176 static char* kwlist[] = { "source", "maxsplit", NULL };
2177 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2178 &string, &maxsplit))
2179 return NULL;
2181 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2182 if (!string)
2183 return NULL;
2185 list = PyList_New(0);
2186 if (!list) {
2187 state_fini(&state);
2188 return NULL;
2191 n = 0;
2192 last = state.start;
2194 while (!maxsplit || n < maxsplit) {
2196 state_reset(&state);
2198 state.ptr = state.start;
2200 if (state.charsize == 1) {
2201 status = sre_search(&state, PatternObject_GetCode(self));
2202 } else {
2203 #if defined(HAVE_UNICODE)
2204 status = sre_usearch(&state, PatternObject_GetCode(self));
2205 #endif
2208 if (PyErr_Occurred())
2209 goto error;
2211 if (status <= 0) {
2212 if (status == 0)
2213 break;
2214 pattern_error(status);
2215 goto error;
2218 if (state.start == state.ptr) {
2219 if (last == state.end)
2220 break;
2221 /* skip one character */
2222 state.start = (void*) ((char*) state.ptr + state.charsize);
2223 continue;
2226 /* get segment before this match */
2227 item = PySequence_GetSlice(
2228 string, STATE_OFFSET(&state, last),
2229 STATE_OFFSET(&state, state.start)
2231 if (!item)
2232 goto error;
2233 status = PyList_Append(list, item);
2234 Py_DECREF(item);
2235 if (status < 0)
2236 goto error;
2238 /* add groups (if any) */
2239 for (i = 0; i < self->groups; i++) {
2240 item = state_getslice(&state, i+1, string, 0);
2241 if (!item)
2242 goto error;
2243 status = PyList_Append(list, item);
2244 Py_DECREF(item);
2245 if (status < 0)
2246 goto error;
2249 n = n + 1;
2251 last = state.start = state.ptr;
2255 /* get segment following last match (even if empty) */
2256 item = PySequence_GetSlice(
2257 string, STATE_OFFSET(&state, last), state.endpos
2259 if (!item)
2260 goto error;
2261 status = PyList_Append(list, item);
2262 Py_DECREF(item);
2263 if (status < 0)
2264 goto error;
2266 state_fini(&state);
2267 return list;
2269 error:
2270 Py_DECREF(list);
2271 state_fini(&state);
2272 return NULL;
2276 static PyObject*
2277 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2278 Py_ssize_t count, Py_ssize_t subn)
2280 SRE_STATE state;
2281 PyObject* list;
2282 PyObject* item;
2283 PyObject* filter;
2284 PyObject* args;
2285 PyObject* match;
2286 void* ptr;
2287 int status;
2288 Py_ssize_t n;
2289 Py_ssize_t i, b, e;
2290 int bint;
2291 int filter_is_callable;
2293 if (PyCallable_Check(ptemplate)) {
2294 /* sub/subn takes either a function or a template */
2295 filter = ptemplate;
2296 Py_INCREF(filter);
2297 filter_is_callable = 1;
2298 } else {
2299 /* if not callable, check if it's a literal string */
2300 int literal;
2301 ptr = getstring(ptemplate, &n, &bint);
2302 b = bint;
2303 if (ptr) {
2304 if (b == 1) {
2305 literal = sre_literal_template((unsigned char *)ptr, n);
2306 } else {
2307 #if defined(HAVE_UNICODE)
2308 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2309 #endif
2311 } else {
2312 PyErr_Clear();
2313 literal = 0;
2315 if (literal) {
2316 filter = ptemplate;
2317 Py_INCREF(filter);
2318 filter_is_callable = 0;
2319 } else {
2320 /* not a literal; hand it over to the template compiler */
2321 filter = call(
2322 SRE_PY_MODULE, "_subx",
2323 PyTuple_Pack(2, self, ptemplate)
2325 if (!filter)
2326 return NULL;
2327 filter_is_callable = PyCallable_Check(filter);
2331 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2332 if (!string) {
2333 Py_DECREF(filter);
2334 return NULL;
2337 list = PyList_New(0);
2338 if (!list) {
2339 Py_DECREF(filter);
2340 state_fini(&state);
2341 return NULL;
2344 n = i = 0;
2346 while (!count || n < count) {
2348 state_reset(&state);
2350 state.ptr = state.start;
2352 if (state.charsize == 1) {
2353 status = sre_search(&state, PatternObject_GetCode(self));
2354 } else {
2355 #if defined(HAVE_UNICODE)
2356 status = sre_usearch(&state, PatternObject_GetCode(self));
2357 #endif
2360 if (PyErr_Occurred())
2361 goto error;
2363 if (status <= 0) {
2364 if (status == 0)
2365 break;
2366 pattern_error(status);
2367 goto error;
2370 b = STATE_OFFSET(&state, state.start);
2371 e = STATE_OFFSET(&state, state.ptr);
2373 if (i < b) {
2374 /* get segment before this match */
2375 item = PySequence_GetSlice(string, i, b);
2376 if (!item)
2377 goto error;
2378 status = PyList_Append(list, item);
2379 Py_DECREF(item);
2380 if (status < 0)
2381 goto error;
2383 } else if (i == b && i == e && n > 0)
2384 /* ignore empty match on latest position */
2385 goto next;
2387 if (filter_is_callable) {
2388 /* pass match object through filter */
2389 match = pattern_new_match(self, &state, 1);
2390 if (!match)
2391 goto error;
2392 args = PyTuple_Pack(1, match);
2393 if (!args) {
2394 Py_DECREF(match);
2395 goto error;
2397 item = PyObject_CallObject(filter, args);
2398 Py_DECREF(args);
2399 Py_DECREF(match);
2400 if (!item)
2401 goto error;
2402 } else {
2403 /* filter is literal string */
2404 item = filter;
2405 Py_INCREF(item);
2408 /* add to list */
2409 if (item != Py_None) {
2410 status = PyList_Append(list, item);
2411 Py_DECREF(item);
2412 if (status < 0)
2413 goto error;
2416 i = e;
2417 n = n + 1;
2419 next:
2420 /* move on */
2421 if (state.ptr == state.start)
2422 state.start = (void*) ((char*) state.ptr + state.charsize);
2423 else
2424 state.start = state.ptr;
2428 /* get segment following last match */
2429 if (i < state.endpos) {
2430 item = PySequence_GetSlice(string, i, state.endpos);
2431 if (!item)
2432 goto error;
2433 status = PyList_Append(list, item);
2434 Py_DECREF(item);
2435 if (status < 0)
2436 goto error;
2439 state_fini(&state);
2441 Py_DECREF(filter);
2443 /* convert list to single string (also removes list) */
2444 item = join_list(list, string);
2446 if (!item)
2447 return NULL;
2449 if (subn)
2450 return Py_BuildValue("Ni", item, n);
2452 return item;
2454 error:
2455 Py_DECREF(list);
2456 state_fini(&state);
2457 Py_DECREF(filter);
2458 return NULL;
2462 static PyObject*
2463 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2465 PyObject* ptemplate;
2466 PyObject* string;
2467 Py_ssize_t count = 0;
2468 static char* kwlist[] = { "repl", "string", "count", NULL };
2469 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2470 &ptemplate, &string, &count))
2471 return NULL;
2473 return pattern_subx(self, ptemplate, string, count, 0);
2476 static PyObject*
2477 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2479 PyObject* ptemplate;
2480 PyObject* string;
2481 Py_ssize_t count = 0;
2482 static char* kwlist[] = { "repl", "string", "count", NULL };
2483 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2484 &ptemplate, &string, &count))
2485 return NULL;
2487 return pattern_subx(self, ptemplate, string, count, 1);
2490 static PyObject*
2491 pattern_copy(PatternObject* self, PyObject *unused)
2493 #ifdef USE_BUILTIN_COPY
2494 PatternObject* copy;
2495 int offset;
2497 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2498 if (!copy)
2499 return NULL;
2501 offset = offsetof(PatternObject, groups);
2503 Py_XINCREF(self->groupindex);
2504 Py_XINCREF(self->indexgroup);
2505 Py_XINCREF(self->pattern);
2507 memcpy((char*) copy + offset, (char*) self + offset,
2508 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2509 copy->weakreflist = NULL;
2511 return (PyObject*) copy;
2512 #else
2513 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2514 return NULL;
2515 #endif
2518 static PyObject*
2519 pattern_deepcopy(PatternObject* self, PyObject* memo)
2521 #ifdef USE_BUILTIN_COPY
2522 PatternObject* copy;
2524 copy = (PatternObject*) pattern_copy(self);
2525 if (!copy)
2526 return NULL;
2528 if (!deepcopy(&copy->groupindex, memo) ||
2529 !deepcopy(&copy->indexgroup, memo) ||
2530 !deepcopy(&copy->pattern, memo)) {
2531 Py_DECREF(copy);
2532 return NULL;
2535 #else
2536 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2537 return NULL;
2538 #endif
2541 PyDoc_STRVAR(pattern_match_doc,
2542 "match(string[, pos[, endpos]]) --> match object or None.\n\
2543 Matches zero or more characters at the beginning of the string");
2545 PyDoc_STRVAR(pattern_search_doc,
2546 "search(string[, pos[, endpos]]) --> match object or None.\n\
2547 Scan through string looking for a match, and return a corresponding\n\
2548 MatchObject instance. Return None if no position in the string matches.");
2550 PyDoc_STRVAR(pattern_split_doc,
2551 "split(string[, maxsplit = 0]) --> list.\n\
2552 Split string by the occurrences of pattern.");
2554 PyDoc_STRVAR(pattern_findall_doc,
2555 "findall(string[, pos[, endpos]]) --> list.\n\
2556 Return a list of all non-overlapping matches of pattern in string.");
2558 PyDoc_STRVAR(pattern_finditer_doc,
2559 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2560 Return an iterator over all non-overlapping matches for the \n\
2561 RE pattern in string. For each match, the iterator returns a\n\
2562 match object.");
2564 PyDoc_STRVAR(pattern_sub_doc,
2565 "sub(repl, string[, count = 0]) --> newstring\n\
2566 Return the string obtained by replacing the leftmost non-overlapping\n\
2567 occurrences of pattern in string by the replacement repl.");
2569 PyDoc_STRVAR(pattern_subn_doc,
2570 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2571 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2572 the leftmost non-overlapping occurrences of pattern with the\n\
2573 replacement repl.");
2575 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2577 static PyMethodDef pattern_methods[] = {
2578 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2579 pattern_match_doc},
2580 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2581 pattern_search_doc},
2582 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2583 pattern_sub_doc},
2584 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2585 pattern_subn_doc},
2586 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2587 pattern_split_doc},
2588 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2589 pattern_findall_doc},
2590 #if PY_VERSION_HEX >= 0x02020000
2591 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2592 pattern_finditer_doc},
2593 #endif
2594 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2595 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2596 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2597 {NULL, NULL}
2600 static PyObject*
2601 pattern_getattr(PatternObject* self, char* name)
2603 PyObject* res;
2605 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2607 if (res)
2608 return res;
2610 PyErr_Clear();
2612 /* attributes */
2613 if (!strcmp(name, "pattern")) {
2614 Py_INCREF(self->pattern);
2615 return self->pattern;
2618 if (!strcmp(name, "flags"))
2619 return Py_BuildValue("i", self->flags);
2621 if (!strcmp(name, "groups"))
2622 return Py_BuildValue("i", self->groups);
2624 if (!strcmp(name, "groupindex") && self->groupindex) {
2625 Py_INCREF(self->groupindex);
2626 return self->groupindex;
2629 PyErr_SetString(PyExc_AttributeError, name);
2630 return NULL;
2633 static PyTypeObject Pattern_Type = {
2634 PyVarObject_HEAD_INIT(NULL, 0)
2635 "_" SRE_MODULE ".SRE_Pattern",
2636 sizeof(PatternObject), sizeof(SRE_CODE),
2637 (destructor)pattern_dealloc, /*tp_dealloc*/
2638 0, /*tp_print*/
2639 (getattrfunc)pattern_getattr, /*tp_getattr*/
2640 0, /* tp_setattr */
2641 0, /* tp_compare */
2642 0, /* tp_repr */
2643 0, /* tp_as_number */
2644 0, /* tp_as_sequence */
2645 0, /* tp_as_mapping */
2646 0, /* tp_hash */
2647 0, /* tp_call */
2648 0, /* tp_str */
2649 0, /* tp_getattro */
2650 0, /* tp_setattro */
2651 0, /* tp_as_buffer */
2652 Py_TPFLAGS_DEFAULT, /* tp_flags */
2653 pattern_doc, /* tp_doc */
2654 0, /* tp_traverse */
2655 0, /* tp_clear */
2656 0, /* tp_richcompare */
2657 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2660 static PyObject *
2661 _compile(PyObject* self_, PyObject* args)
2663 /* "compile" pattern descriptor to pattern object */
2665 PatternObject* self;
2666 Py_ssize_t i, n;
2668 PyObject* pattern;
2669 int flags = 0;
2670 PyObject* code;
2671 Py_ssize_t groups = 0;
2672 PyObject* groupindex = NULL;
2673 PyObject* indexgroup = NULL;
2674 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2675 &PyList_Type, &code, &groups,
2676 &groupindex, &indexgroup))
2677 return NULL;
2679 n = PyList_GET_SIZE(code);
2680 /* coverity[ampersand_in_size] */
2681 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2682 if (!self)
2683 return NULL;
2685 self->codesize = n;
2687 for (i = 0; i < n; i++) {
2688 PyObject *o = PyList_GET_ITEM(code, i);
2689 unsigned long value = PyLong_AsUnsignedLong(o);
2690 self->code[i] = (SRE_CODE) value;
2691 if ((unsigned long) self->code[i] != value) {
2692 PyErr_SetString(PyExc_OverflowError,
2693 "regular expression code size limit exceeded");
2694 break;
2698 if (PyErr_Occurred()) {
2699 PyObject_DEL(self);
2700 return NULL;
2703 Py_INCREF(pattern);
2704 self->pattern = pattern;
2706 self->flags = flags;
2708 self->groups = groups;
2710 Py_XINCREF(groupindex);
2711 self->groupindex = groupindex;
2713 Py_XINCREF(indexgroup);
2714 self->indexgroup = indexgroup;
2716 self->weakreflist = NULL;
2718 return (PyObject*) self;
2721 /* -------------------------------------------------------------------- */
2722 /* match methods */
2724 static void
2725 match_dealloc(MatchObject* self)
2727 Py_XDECREF(self->regs);
2728 Py_XDECREF(self->string);
2729 Py_DECREF(self->pattern);
2730 PyObject_DEL(self);
2733 static PyObject*
2734 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
2736 if (index < 0 || index >= self->groups) {
2737 /* raise IndexError if we were given a bad group number */
2738 PyErr_SetString(
2739 PyExc_IndexError,
2740 "no such group"
2742 return NULL;
2745 index *= 2;
2747 if (self->string == Py_None || self->mark[index] < 0) {
2748 /* return default value if the string or group is undefined */
2749 Py_INCREF(def);
2750 return def;
2753 return PySequence_GetSlice(
2754 self->string, self->mark[index], self->mark[index+1]
2758 static Py_ssize_t
2759 match_getindex(MatchObject* self, PyObject* index)
2761 Py_ssize_t i;
2763 if (index == NULL)
2764 /* Default value */
2765 return 0;
2767 if (PyLong_Check(index))
2768 return PyLong_AsSsize_t(index);
2770 i = -1;
2772 if (self->pattern->groupindex) {
2773 index = PyObject_GetItem(self->pattern->groupindex, index);
2774 if (index) {
2775 if (PyLong_Check(index))
2776 i = PyLong_AsSsize_t(index);
2777 Py_DECREF(index);
2778 } else
2779 PyErr_Clear();
2782 return i;
2785 static PyObject*
2786 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2788 return match_getslice_by_index(self, match_getindex(self, index), def);
2791 static PyObject*
2792 match_expand(MatchObject* self, PyObject* ptemplate)
2794 /* delegate to Python code */
2795 return call(
2796 SRE_PY_MODULE, "_expand",
2797 PyTuple_Pack(3, self->pattern, self, ptemplate)
2801 static PyObject*
2802 match_group(MatchObject* self, PyObject* args)
2804 PyObject* result;
2805 Py_ssize_t i, size;
2807 size = PyTuple_GET_SIZE(args);
2809 switch (size) {
2810 case 0:
2811 result = match_getslice(self, Py_False, Py_None);
2812 break;
2813 case 1:
2814 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2815 break;
2816 default:
2817 /* fetch multiple items */
2818 result = PyTuple_New(size);
2819 if (!result)
2820 return NULL;
2821 for (i = 0; i < size; i++) {
2822 PyObject* item = match_getslice(
2823 self, PyTuple_GET_ITEM(args, i), Py_None
2825 if (!item) {
2826 Py_DECREF(result);
2827 return NULL;
2829 PyTuple_SET_ITEM(result, i, item);
2831 break;
2833 return result;
2836 static PyObject*
2837 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2839 PyObject* result;
2840 Py_ssize_t index;
2842 PyObject* def = Py_None;
2843 static char* kwlist[] = { "default", NULL };
2844 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2845 return NULL;
2847 result = PyTuple_New(self->groups-1);
2848 if (!result)
2849 return NULL;
2851 for (index = 1; index < self->groups; index++) {
2852 PyObject* item;
2853 item = match_getslice_by_index(self, index, def);
2854 if (!item) {
2855 Py_DECREF(result);
2856 return NULL;
2858 PyTuple_SET_ITEM(result, index-1, item);
2861 return result;
2864 static PyObject*
2865 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2867 PyObject* result;
2868 PyObject* keys;
2869 Py_ssize_t index;
2871 PyObject* def = Py_None;
2872 static char* kwlist[] = { "default", NULL };
2873 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2874 return NULL;
2876 result = PyDict_New();
2877 if (!result || !self->pattern->groupindex)
2878 return result;
2880 keys = PyMapping_Keys(self->pattern->groupindex);
2881 if (!keys)
2882 goto failed;
2884 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2885 int status;
2886 PyObject* key;
2887 PyObject* value;
2888 key = PyList_GET_ITEM(keys, index);
2889 if (!key)
2890 goto failed;
2891 value = match_getslice(self, key, def);
2892 if (!value) {
2893 Py_DECREF(key);
2894 goto failed;
2896 status = PyDict_SetItem(result, key, value);
2897 Py_DECREF(value);
2898 if (status < 0)
2899 goto failed;
2902 Py_DECREF(keys);
2904 return result;
2906 failed:
2907 Py_XDECREF(keys);
2908 Py_DECREF(result);
2909 return NULL;
2912 static PyObject*
2913 match_start(MatchObject* self, PyObject* args)
2915 Py_ssize_t index;
2917 PyObject* index_ = NULL;
2918 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
2919 return NULL;
2921 index = match_getindex(self, index_);
2923 if (index < 0 || index >= self->groups) {
2924 PyErr_SetString(
2925 PyExc_IndexError,
2926 "no such group"
2928 return NULL;
2931 /* mark is -1 if group is undefined */
2932 return Py_BuildValue("i", self->mark[index*2]);
2935 static PyObject*
2936 match_end(MatchObject* self, PyObject* args)
2938 Py_ssize_t index;
2940 PyObject* index_ = NULL;
2941 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
2942 return NULL;
2944 index = match_getindex(self, index_);
2946 if (index < 0 || index >= self->groups) {
2947 PyErr_SetString(
2948 PyExc_IndexError,
2949 "no such group"
2951 return NULL;
2954 /* mark is -1 if group is undefined */
2955 return Py_BuildValue("i", self->mark[index*2+1]);
2958 LOCAL(PyObject*)
2959 _pair(Py_ssize_t i1, Py_ssize_t i2)
2961 PyObject* pair;
2962 PyObject* item;
2964 pair = PyTuple_New(2);
2965 if (!pair)
2966 return NULL;
2968 item = PyLong_FromSsize_t(i1);
2969 if (!item)
2970 goto error;
2971 PyTuple_SET_ITEM(pair, 0, item);
2973 item = PyLong_FromSsize_t(i2);
2974 if (!item)
2975 goto error;
2976 PyTuple_SET_ITEM(pair, 1, item);
2978 return pair;
2980 error:
2981 Py_DECREF(pair);
2982 return NULL;
2985 static PyObject*
2986 match_span(MatchObject* self, PyObject* args)
2988 Py_ssize_t index;
2990 PyObject* index_ = NULL;
2991 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
2992 return NULL;
2994 index = match_getindex(self, index_);
2996 if (index < 0 || index >= self->groups) {
2997 PyErr_SetString(
2998 PyExc_IndexError,
2999 "no such group"
3001 return NULL;
3004 /* marks are -1 if group is undefined */
3005 return _pair(self->mark[index*2], self->mark[index*2+1]);
3008 static PyObject*
3009 match_regs(MatchObject* self)
3011 PyObject* regs;
3012 PyObject* item;
3013 Py_ssize_t index;
3015 regs = PyTuple_New(self->groups);
3016 if (!regs)
3017 return NULL;
3019 for (index = 0; index < self->groups; index++) {
3020 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3021 if (!item) {
3022 Py_DECREF(regs);
3023 return NULL;
3025 PyTuple_SET_ITEM(regs, index, item);
3028 Py_INCREF(regs);
3029 self->regs = regs;
3031 return regs;
3034 static PyObject*
3035 match_copy(MatchObject* self, PyObject *unused)
3037 #ifdef USE_BUILTIN_COPY
3038 MatchObject* copy;
3039 Py_ssize_t slots, offset;
3041 slots = 2 * (self->pattern->groups+1);
3043 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3044 if (!copy)
3045 return NULL;
3047 /* this value a constant, but any compiler should be able to
3048 figure that out all by itself */
3049 offset = offsetof(MatchObject, string);
3051 Py_XINCREF(self->pattern);
3052 Py_XINCREF(self->string);
3053 Py_XINCREF(self->regs);
3055 memcpy((char*) copy + offset, (char*) self + offset,
3056 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3058 return (PyObject*) copy;
3059 #else
3060 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3061 return NULL;
3062 #endif
3065 static PyObject*
3066 match_deepcopy(MatchObject* self, PyObject* memo)
3068 #ifdef USE_BUILTIN_COPY
3069 MatchObject* copy;
3071 copy = (MatchObject*) match_copy(self);
3072 if (!copy)
3073 return NULL;
3075 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3076 !deepcopy(&copy->string, memo) ||
3077 !deepcopy(&copy->regs, memo)) {
3078 Py_DECREF(copy);
3079 return NULL;
3082 #else
3083 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3084 return NULL;
3085 #endif
3088 static PyMethodDef match_methods[] = {
3089 {"group", (PyCFunction) match_group, METH_VARARGS},
3090 {"start", (PyCFunction) match_start, METH_VARARGS},
3091 {"end", (PyCFunction) match_end, METH_VARARGS},
3092 {"span", (PyCFunction) match_span, METH_VARARGS},
3093 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3094 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3095 {"expand", (PyCFunction) match_expand, METH_O},
3096 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3097 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3098 {NULL, NULL}
3101 static PyObject*
3102 match_getattr(MatchObject* self, char* name)
3104 PyObject* res;
3106 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3107 if (res)
3108 return res;
3110 PyErr_Clear();
3112 if (!strcmp(name, "lastindex")) {
3113 if (self->lastindex >= 0)
3114 return Py_BuildValue("i", self->lastindex);
3115 Py_INCREF(Py_None);
3116 return Py_None;
3119 if (!strcmp(name, "lastgroup")) {
3120 if (self->pattern->indexgroup && self->lastindex >= 0) {
3121 PyObject* result = PySequence_GetItem(
3122 self->pattern->indexgroup, self->lastindex
3124 if (result)
3125 return result;
3126 PyErr_Clear();
3128 Py_INCREF(Py_None);
3129 return Py_None;
3132 if (!strcmp(name, "string")) {
3133 if (self->string) {
3134 Py_INCREF(self->string);
3135 return self->string;
3136 } else {
3137 Py_INCREF(Py_None);
3138 return Py_None;
3142 if (!strcmp(name, "regs")) {
3143 if (self->regs) {
3144 Py_INCREF(self->regs);
3145 return self->regs;
3146 } else
3147 return match_regs(self);
3150 if (!strcmp(name, "re")) {
3151 Py_INCREF(self->pattern);
3152 return (PyObject*) self->pattern;
3155 if (!strcmp(name, "pos"))
3156 return Py_BuildValue("i", self->pos);
3158 if (!strcmp(name, "endpos"))
3159 return Py_BuildValue("i", self->endpos);
3161 PyErr_SetString(PyExc_AttributeError, name);
3162 return NULL;
3165 /* FIXME: implement setattr("string", None) as a special case (to
3166 detach the associated string, if any */
3168 static PyTypeObject Match_Type = {
3169 PyVarObject_HEAD_INIT(NULL,0)
3170 "_" SRE_MODULE ".SRE_Match",
3171 sizeof(MatchObject), sizeof(Py_ssize_t),
3172 (destructor)match_dealloc, /*tp_dealloc*/
3173 0, /*tp_print*/
3174 (getattrfunc)match_getattr /*tp_getattr*/
3177 static PyObject*
3178 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3180 /* create match object (from state object) */
3182 MatchObject* match;
3183 Py_ssize_t i, j;
3184 char* base;
3185 int n;
3187 if (status > 0) {
3189 /* create match object (with room for extra group marks) */
3190 /* coverity[ampersand_in_size] */
3191 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3192 2*(pattern->groups+1));
3193 if (!match)
3194 return NULL;
3196 Py_INCREF(pattern);
3197 match->pattern = pattern;
3199 Py_INCREF(state->string);
3200 match->string = state->string;
3202 match->regs = NULL;
3203 match->groups = pattern->groups+1;
3205 /* fill in group slices */
3207 base = (char*) state->beginning;
3208 n = state->charsize;
3210 match->mark[0] = ((char*) state->start - base) / n;
3211 match->mark[1] = ((char*) state->ptr - base) / n;
3213 for (i = j = 0; i < pattern->groups; i++, j+=2)
3214 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3215 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3216 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3217 } else
3218 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3220 match->pos = state->pos;
3221 match->endpos = state->endpos;
3223 match->lastindex = state->lastindex;
3225 return (PyObject*) match;
3227 } else if (status == 0) {
3229 /* no match */
3230 Py_INCREF(Py_None);
3231 return Py_None;
3235 /* internal error */
3236 pattern_error(status);
3237 return NULL;
3241 /* -------------------------------------------------------------------- */
3242 /* scanner methods (experimental) */
3244 static void
3245 scanner_dealloc(ScannerObject* self)
3247 state_fini(&self->state);
3248 Py_DECREF(self->pattern);
3249 PyObject_DEL(self);
3252 static PyObject*
3253 scanner_match(ScannerObject* self, PyObject *unused)
3255 SRE_STATE* state = &self->state;
3256 PyObject* match;
3257 int status;
3259 state_reset(state);
3261 state->ptr = state->start;
3263 if (state->charsize == 1) {
3264 status = sre_match(state, PatternObject_GetCode(self->pattern));
3265 } else {
3266 #if defined(HAVE_UNICODE)
3267 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3268 #endif
3270 if (PyErr_Occurred())
3271 return NULL;
3273 match = pattern_new_match((PatternObject*) self->pattern,
3274 state, status);
3276 if (status == 0 || state->ptr == state->start)
3277 state->start = (void*) ((char*) state->ptr + state->charsize);
3278 else
3279 state->start = state->ptr;
3281 return match;
3285 static PyObject*
3286 scanner_search(ScannerObject* self, PyObject *unused)
3288 SRE_STATE* state = &self->state;
3289 PyObject* match;
3290 int status;
3292 state_reset(state);
3294 state->ptr = state->start;
3296 if (state->charsize == 1) {
3297 status = sre_search(state, PatternObject_GetCode(self->pattern));
3298 } else {
3299 #if defined(HAVE_UNICODE)
3300 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3301 #endif
3303 if (PyErr_Occurred())
3304 return NULL;
3306 match = pattern_new_match((PatternObject*) self->pattern,
3307 state, status);
3309 if (status == 0 || state->ptr == state->start)
3310 state->start = (void*) ((char*) state->ptr + state->charsize);
3311 else
3312 state->start = state->ptr;
3314 return match;
3317 static PyMethodDef scanner_methods[] = {
3318 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3319 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3320 {NULL, NULL}
3323 static PyObject*
3324 scanner_getattr(ScannerObject* self, char* name)
3326 PyObject* res;
3328 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3329 if (res)
3330 return res;
3332 PyErr_Clear();
3334 /* attributes */
3335 if (!strcmp(name, "pattern")) {
3336 Py_INCREF(self->pattern);
3337 return self->pattern;
3340 PyErr_SetString(PyExc_AttributeError, name);
3341 return NULL;
3344 static PyTypeObject Scanner_Type = {
3345 PyVarObject_HEAD_INIT(NULL, 0)
3346 "_" SRE_MODULE ".SRE_Scanner",
3347 sizeof(ScannerObject), 0,
3348 (destructor)scanner_dealloc, /*tp_dealloc*/
3349 0, /*tp_print*/
3350 (getattrfunc)scanner_getattr, /*tp_getattr*/
3353 static PyObject*
3354 pattern_scanner(PatternObject* pattern, PyObject* args)
3356 /* create search state object */
3358 ScannerObject* self;
3360 PyObject* string;
3361 Py_ssize_t start = 0;
3362 Py_ssize_t end = PY_SSIZE_T_MAX;
3363 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3364 return NULL;
3366 /* create scanner object */
3367 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3368 if (!self)
3369 return NULL;
3371 string = state_init(&self->state, pattern, string, start, end);
3372 if (!string) {
3373 PyObject_DEL(self);
3374 return NULL;
3377 Py_INCREF(pattern);
3378 self->pattern = (PyObject*) pattern;
3380 return (PyObject*) self;
3383 static PyMethodDef _functions[] = {
3384 {"compile", _compile, METH_VARARGS},
3385 {"getcodesize", sre_codesize, METH_NOARGS},
3386 {"getlower", sre_getlower, METH_VARARGS},
3387 {NULL, NULL}
3390 PyMODINIT_FUNC init_sre(void)
3392 PyObject* m;
3393 PyObject* d;
3394 PyObject* x;
3396 /* Initialize object types */
3397 if (PyType_Ready(&Pattern_Type) < 0)
3398 return;
3399 if (PyType_Ready(&Match_Type) < 0)
3400 return;
3401 if (PyType_Ready(&Scanner_Type) < 0)
3402 return;
3404 m = Py_InitModule("_" SRE_MODULE, _functions);
3405 if (m == NULL)
3406 return;
3407 d = PyModule_GetDict(m);
3409 x = PyLong_FromLong(SRE_MAGIC);
3410 if (x) {
3411 PyDict_SetItemString(d, "MAGIC", x);
3412 Py_DECREF(x);
3415 x = PyLong_FromLong(sizeof(SRE_CODE));
3416 if (x) {
3417 PyDict_SetItemString(d, "CODESIZE", x);
3418 Py_DECREF(x);
3421 x = PyUnicode_FromString(copyright);
3422 if (x) {
3423 PyDict_SetItemString(d, "copyright", x);
3424 Py_DECREF(x);
3428 #endif /* !defined(SRE_RECURSIVE) */
3430 /* vim:ts=4:sw=4:et