Merged release21-maint changes.
[python/dscho.git] / Modules / _sre.c
blobcaf47aab29a3ae6e71b6ae5a5183724242ab7f53
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-06-30 fl added fast search optimization
10 * 2000-06-30 fl added assert (lookahead) primitives, etc
11 * 2000-07-02 fl added charset optimizations, etc
12 * 2000-07-03 fl store code in pattern object, lookbehind, etc
13 * 2000-07-08 fl added regs attribute
14 * 2000-07-21 fl reset lastindex in scanner methods
15 * 2000-08-01 fl fixes for 1.6b1
16 * 2000-08-03 fl added recursion limit
17 * 2000-08-07 fl use PyOS_CheckStack() if available
18 * 2000-08-08 fl changed findall to return empty strings instead of None
19 * 2000-08-27 fl properly propagate memory errors
20 * 2000-09-02 fl return -1 instead of None for start/end/span
21 * 2000-09-20 fl added expand method
22 * 2000-09-21 fl don't use the buffer interface for unicode strings
23 * 2000-10-03 fl fixed assert_not primitive; support keyword arguments
24 * 2000-10-24 fl really fixed assert_not; reset groups in findall
25 * 2000-12-21 fl fixed memory leak in groupdict
26 * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
27 * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
28 * 2001-01-16 fl fixed memory leak in pattern destructor
29 * 2001-03-20 fl lots of fixes for 2.1b2
30 * 2001-04-15 fl export copyright as Python attribute, not global
31 * 2001-04-28 fl added __copy__ methods (work in progress)
32 * 2001-05-14 fl fixes for 1.5.2
33 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
35 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
37 * This version of the SRE library can be redistributed under CNRI's
38 * Python 1.6 license. For any other use, please contact Secret Labs
39 * AB (info@pythonware.com).
41 * Portions of this engine have been developed in cooperation with
42 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
43 * other compatibility work.
46 #ifndef SRE_RECURSIVE
48 static char copyright[] =
49 " SRE 2.1.1 Copyright (c) 1997-2001 by Secret Labs AB ";
51 #include "Python.h"
52 #include "structmember.h" /* offsetof */
54 #include "sre.h"
56 #include <ctype.h>
58 /* name of this module, minus the leading underscore */
59 #if !defined(SRE_MODULE)
60 #define SRE_MODULE "sre"
61 #endif
63 /* defining this one enables tracing */
64 #undef VERBOSE
66 #if PY_VERSION_HEX >= 0x01060000
67 /* defining this enables unicode support (default under 1.6a1 and later) */
68 #define HAVE_UNICODE
69 #endif
71 /* -------------------------------------------------------------------- */
72 /* optional features */
74 /* prevent run-away recursion (bad patterns on long strings) */
76 #if !defined(USE_STACKCHECK)
77 #if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
78 /* require smaller recursion limit for a number of 64-bit platforms:
79 Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
80 /* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
81 #define USE_RECURSION_LIMIT 7500
82 #else
83 #define USE_RECURSION_LIMIT 10000
84 #endif
85 #endif
87 /* enables fast searching */
88 #define USE_FAST_SEARCH
90 /* enables aggressive inlining (always on for Visual C) */
91 #undef USE_INLINE
93 /* enables copy/deepcopy handling (work in progress) */
94 #undef USE_BUILTIN_COPY
96 #if PY_VERSION_HEX < 0x01060000
97 #define PyObject_DEL(op) PyMem_DEL((op))
98 #endif
100 /* -------------------------------------------------------------------- */
102 #if defined(_MSC_VER)
103 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
104 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
105 /* fastest possible local call under MSVC */
106 #define LOCAL(type) static __inline type __fastcall
107 #elif defined(USE_INLINE)
108 #define LOCAL(type) static inline type
109 #else
110 #define LOCAL(type) static type
111 #endif
113 /* error codes */
114 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
115 #define SRE_ERROR_STATE -2 /* illegal state */
116 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
117 #define SRE_ERROR_MEMORY -9 /* out of memory */
119 #if defined(VERBOSE)
120 #define TRACE(v) printf v
121 #else
122 #define TRACE(v)
123 #endif
125 /* -------------------------------------------------------------------- */
126 /* search engine state */
128 /* default character predicates (run sre_chars.py to regenerate tables) */
130 #define SRE_DIGIT_MASK 1
131 #define SRE_SPACE_MASK 2
132 #define SRE_LINEBREAK_MASK 4
133 #define SRE_ALNUM_MASK 8
134 #define SRE_WORD_MASK 16
136 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
137 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
139 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
140 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
141 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
142 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
144 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
145 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
146 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
147 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
148 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
149 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
150 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
151 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
152 120, 121, 122, 123, 124, 125, 126, 127 };
154 #define SRE_IS_DIGIT(ch)\
155 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
156 #define SRE_IS_SPACE(ch)\
157 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
158 #define SRE_IS_LINEBREAK(ch)\
159 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
160 #define SRE_IS_ALNUM(ch)\
161 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
162 #define SRE_IS_WORD(ch)\
163 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
165 static unsigned int sre_lower(unsigned int ch)
167 return ((ch) < 128 ? sre_char_lower[ch] : ch);
170 /* locale-specific character predicates */
172 #define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
173 #define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
174 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
175 #define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
176 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
178 static unsigned int sre_lower_locale(unsigned int ch)
180 return ((ch) < 256 ? tolower((ch)) : ch);
183 /* unicode-specific character predicates */
185 #if defined(HAVE_UNICODE)
187 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
188 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
189 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
190 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
191 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
193 static unsigned int sre_lower_unicode(unsigned int ch)
195 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
198 #endif
200 LOCAL(int)
201 sre_category(SRE_CODE category, unsigned int ch)
203 switch (category) {
205 case SRE_CATEGORY_DIGIT:
206 return SRE_IS_DIGIT(ch);
207 case SRE_CATEGORY_NOT_DIGIT:
208 return !SRE_IS_DIGIT(ch);
209 case SRE_CATEGORY_SPACE:
210 return SRE_IS_SPACE(ch);
211 case SRE_CATEGORY_NOT_SPACE:
212 return !SRE_IS_SPACE(ch);
213 case SRE_CATEGORY_WORD:
214 return SRE_IS_WORD(ch);
215 case SRE_CATEGORY_NOT_WORD:
216 return !SRE_IS_WORD(ch);
217 case SRE_CATEGORY_LINEBREAK:
218 return SRE_IS_LINEBREAK(ch);
219 case SRE_CATEGORY_NOT_LINEBREAK:
220 return !SRE_IS_LINEBREAK(ch);
222 case SRE_CATEGORY_LOC_WORD:
223 return SRE_LOC_IS_WORD(ch);
224 case SRE_CATEGORY_LOC_NOT_WORD:
225 return !SRE_LOC_IS_WORD(ch);
227 #if defined(HAVE_UNICODE)
228 case SRE_CATEGORY_UNI_DIGIT:
229 return SRE_UNI_IS_DIGIT(ch);
230 case SRE_CATEGORY_UNI_NOT_DIGIT:
231 return !SRE_UNI_IS_DIGIT(ch);
232 case SRE_CATEGORY_UNI_SPACE:
233 return SRE_UNI_IS_SPACE(ch);
234 case SRE_CATEGORY_UNI_NOT_SPACE:
235 return !SRE_UNI_IS_SPACE(ch);
236 case SRE_CATEGORY_UNI_WORD:
237 return SRE_UNI_IS_WORD(ch);
238 case SRE_CATEGORY_UNI_NOT_WORD:
239 return !SRE_UNI_IS_WORD(ch);
240 case SRE_CATEGORY_UNI_LINEBREAK:
241 return SRE_UNI_IS_LINEBREAK(ch);
242 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
243 return !SRE_UNI_IS_LINEBREAK(ch);
244 #else
245 case SRE_CATEGORY_UNI_DIGIT:
246 return SRE_IS_DIGIT(ch);
247 case SRE_CATEGORY_UNI_NOT_DIGIT:
248 return !SRE_IS_DIGIT(ch);
249 case SRE_CATEGORY_UNI_SPACE:
250 return SRE_IS_SPACE(ch);
251 case SRE_CATEGORY_UNI_NOT_SPACE:
252 return !SRE_IS_SPACE(ch);
253 case SRE_CATEGORY_UNI_WORD:
254 return SRE_LOC_IS_WORD(ch);
255 case SRE_CATEGORY_UNI_NOT_WORD:
256 return !SRE_LOC_IS_WORD(ch);
257 case SRE_CATEGORY_UNI_LINEBREAK:
258 return SRE_IS_LINEBREAK(ch);
259 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
260 return !SRE_IS_LINEBREAK(ch);
261 #endif
263 return 0;
266 /* helpers */
268 static void
269 mark_fini(SRE_STATE* state)
271 if (state->mark_stack) {
272 free(state->mark_stack);
273 state->mark_stack = NULL;
275 state->mark_stack_size = state->mark_stack_base = 0;
278 static int
279 mark_save(SRE_STATE* state, int lo, int hi)
281 void* stack;
282 int size;
283 int minsize, newsize;
285 if (hi <= lo)
286 return 0;
288 size = (hi - lo) + 1;
290 newsize = state->mark_stack_size;
291 minsize = state->mark_stack_base + size;
293 if (newsize < minsize) {
294 /* create new stack */
295 if (!newsize) {
296 newsize = 512;
297 if (newsize < minsize)
298 newsize = minsize;
299 TRACE(("allocate stack %d\n", newsize));
300 stack = malloc(sizeof(void*) * newsize);
301 } else {
302 /* grow the stack */
303 while (newsize < minsize)
304 newsize += newsize;
305 TRACE(("grow stack to %d\n", newsize));
306 stack = realloc(state->mark_stack, sizeof(void*) * newsize);
308 if (!stack) {
309 mark_fini(state);
310 return SRE_ERROR_MEMORY;
312 state->mark_stack = stack;
313 state->mark_stack_size = newsize;
316 TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
318 memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
319 size * sizeof(void*));
321 state->mark_stack_base += size;
323 return 0;
326 static int
327 mark_restore(SRE_STATE* state, int lo, int hi)
329 int size;
331 if (hi <= lo)
332 return 0;
334 size = (hi - lo) + 1;
336 state->mark_stack_base -= size;
338 TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
340 memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
341 size * sizeof(void*));
343 return 0;
346 /* generate 8-bit version */
348 #define SRE_CHAR unsigned char
349 #define SRE_AT sre_at
350 #define SRE_COUNT sre_count
351 #define SRE_CHARSET sre_charset
352 #define SRE_INFO sre_info
353 #define SRE_MATCH sre_match
354 #define SRE_SEARCH sre_search
356 #if defined(HAVE_UNICODE)
358 #define SRE_RECURSIVE
359 #include "_sre.c"
360 #undef SRE_RECURSIVE
362 #undef SRE_SEARCH
363 #undef SRE_MATCH
364 #undef SRE_INFO
365 #undef SRE_CHARSET
366 #undef SRE_COUNT
367 #undef SRE_AT
368 #undef SRE_CHAR
370 /* generate 16-bit unicode version */
372 #define SRE_CHAR Py_UNICODE
373 #define SRE_AT sre_uat
374 #define SRE_COUNT sre_ucount
375 #define SRE_CHARSET sre_ucharset
376 #define SRE_INFO sre_uinfo
377 #define SRE_MATCH sre_umatch
378 #define SRE_SEARCH sre_usearch
379 #endif
381 #endif /* SRE_RECURSIVE */
383 /* -------------------------------------------------------------------- */
384 /* String matching engine */
386 /* the following section is compiled twice, with different character
387 settings */
389 LOCAL(int)
390 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
392 /* check if pointer is at given position */
394 int this, that;
396 switch (at) {
398 case SRE_AT_BEGINNING:
399 case SRE_AT_BEGINNING_STRING:
400 return ((void*) ptr == state->beginning);
402 case SRE_AT_BEGINNING_LINE:
403 return ((void*) ptr == state->beginning ||
404 SRE_IS_LINEBREAK((int) ptr[-1]));
406 case SRE_AT_END:
407 return (((void*) (ptr+1) == state->end &&
408 SRE_IS_LINEBREAK((int) ptr[0])) ||
409 ((void*) ptr == state->end));
411 case SRE_AT_END_LINE:
412 return ((void*) ptr == state->end ||
413 SRE_IS_LINEBREAK((int) ptr[0]));
415 case SRE_AT_END_STRING:
416 return ((void*) ptr == state->end);
418 case SRE_AT_BOUNDARY:
419 if (state->beginning == state->end)
420 return 0;
421 that = ((void*) ptr > state->beginning) ?
422 SRE_IS_WORD((int) ptr[-1]) : 0;
423 this = ((void*) ptr < state->end) ?
424 SRE_IS_WORD((int) ptr[0]) : 0;
425 return this != that;
427 case SRE_AT_NON_BOUNDARY:
428 if (state->beginning == state->end)
429 return 0;
430 that = ((void*) ptr > state->beginning) ?
431 SRE_IS_WORD((int) ptr[-1]) : 0;
432 this = ((void*) ptr < state->end) ?
433 SRE_IS_WORD((int) ptr[0]) : 0;
434 return this == that;
436 case SRE_AT_LOC_BOUNDARY:
437 if (state->beginning == state->end)
438 return 0;
439 that = ((void*) ptr > state->beginning) ?
440 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
441 this = ((void*) ptr < state->end) ?
442 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
443 return this != that;
445 case SRE_AT_LOC_NON_BOUNDARY:
446 if (state->beginning == state->end)
447 return 0;
448 that = ((void*) ptr > state->beginning) ?
449 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
450 this = ((void*) ptr < state->end) ?
451 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
452 return this == that;
454 #if defined(HAVE_UNICODE)
455 case SRE_AT_UNI_BOUNDARY:
456 if (state->beginning == state->end)
457 return 0;
458 that = ((void*) ptr > state->beginning) ?
459 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
460 this = ((void*) ptr < state->end) ?
461 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
462 return this != that;
464 case SRE_AT_UNI_NON_BOUNDARY:
465 if (state->beginning == state->end)
466 return 0;
467 that = ((void*) ptr > state->beginning) ?
468 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
469 this = ((void*) ptr < state->end) ?
470 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
471 return this == that;
472 #endif
476 return 0;
479 LOCAL(int)
480 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
482 /* check if character is a member of the given set */
484 int ok = 1;
486 for (;;) {
487 switch (*set++) {
489 case SRE_OP_LITERAL:
490 /* <LITERAL> <code> */
491 if (ch == set[0])
492 return ok;
493 set++;
494 break;
496 case SRE_OP_RANGE:
497 /* <RANGE> <lower> <upper> */
498 if (set[0] <= ch && ch <= set[1])
499 return ok;
500 set += 2;
501 break;
503 case SRE_OP_CHARSET:
504 /* <CHARSET> <bitmap> (16 bits per code word) */
505 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
506 return ok;
507 set += 16;
508 break;
510 case SRE_OP_BIGCHARSET:
511 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
513 int count, block;
514 count = *(set++);
515 block = ((unsigned char*)set)[ch >> 8];
516 set += 128;
517 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
518 return ok;
519 set += count*16;
520 break;
523 case SRE_OP_CATEGORY:
524 /* <CATEGORY> <code> */
525 if (sre_category(set[0], (int) ch))
526 return ok;
527 set += 1;
528 break;
530 case SRE_OP_NEGATE:
531 ok = !ok;
532 break;
534 case SRE_OP_FAILURE:
535 return !ok;
537 default:
538 /* internal error -- there's not much we can do about it
539 here, so let's just pretend it didn't match... */
540 return 0;
545 LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
547 LOCAL(int)
548 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
550 SRE_CODE chr;
551 SRE_CHAR* ptr = state->ptr;
552 SRE_CHAR* end = state->end;
553 int i;
555 /* adjust end */
556 if (maxcount < end - ptr && maxcount != 65535)
557 end = ptr + maxcount;
559 switch (pattern[0]) {
561 case SRE_OP_ANY:
562 /* repeated dot wildcard. */
563 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
564 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
565 ptr++;
566 break;
568 case SRE_OP_ANY_ALL:
569 /* repeated dot wildcare. skip to the end of the target
570 string, and backtrack from there */
571 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
572 ptr = end;
573 break;
575 case SRE_OP_LITERAL:
576 /* repeated literal */
577 chr = pattern[1];
578 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
579 while (ptr < end && (SRE_CODE) *ptr == chr)
580 ptr++;
581 break;
583 case SRE_OP_LITERAL_IGNORE:
584 /* repeated literal */
585 chr = pattern[1];
586 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
587 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
588 ptr++;
589 break;
591 case SRE_OP_NOT_LITERAL:
592 /* repeated non-literal */
593 chr = pattern[1];
594 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
595 while (ptr < end && (SRE_CODE) *ptr != chr)
596 ptr++;
597 break;
599 case SRE_OP_NOT_LITERAL_IGNORE:
600 /* repeated non-literal */
601 chr = pattern[1];
602 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
603 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
604 ptr++;
605 break;
607 case SRE_OP_IN:
608 /* repeated set */
609 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
610 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
611 ptr++;
612 break;
614 default:
615 /* repeated single character pattern */
616 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
617 while ((SRE_CHAR*) state->ptr < end) {
618 i = SRE_MATCH(state, pattern, level);
619 if (i < 0)
620 return i;
621 if (!i)
622 break;
624 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
625 (SRE_CHAR*) state->ptr - ptr));
626 return (SRE_CHAR*) state->ptr - ptr;
629 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
630 return ptr - (SRE_CHAR*) state->ptr;
633 #if 0 /* not used in this release */
634 LOCAL(int)
635 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
637 /* check if an SRE_OP_INFO block matches at the current position.
638 returns the number of SRE_CODE objects to skip if successful, 0
639 if no match */
641 SRE_CHAR* end = state->end;
642 SRE_CHAR* ptr = state->ptr;
643 int i;
645 /* check minimal length */
646 if (pattern[3] && (end - ptr) < pattern[3])
647 return 0;
649 /* check known prefix */
650 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
651 /* <length> <skip> <prefix data> <overlap data> */
652 for (i = 0; i < pattern[5]; i++)
653 if ((SRE_CODE) ptr[i] != pattern[7 + i])
654 return 0;
655 return pattern[0] + 2 * pattern[6];
657 return pattern[0];
659 #endif
661 LOCAL(int)
662 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
664 /* check if string matches the given pattern. returns <0 for
665 error, 0 for failure, and 1 for success */
667 SRE_CHAR* end = state->end;
668 SRE_CHAR* ptr = state->ptr;
669 int i, count;
670 SRE_REPEAT* rp;
671 int lastmark;
672 SRE_CODE chr;
674 SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
676 TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level));
678 #if defined(USE_STACKCHECK)
679 if (level % 10 == 0 && PyOS_CheckStack())
680 return SRE_ERROR_RECURSION_LIMIT;
681 #endif
683 #if defined(USE_RECURSION_LIMIT)
684 if (level > USE_RECURSION_LIMIT)
685 return SRE_ERROR_RECURSION_LIMIT;
686 #endif
688 if (pattern[0] == SRE_OP_INFO) {
689 /* optimization info block */
690 /* <INFO> <1=skip> <2=flags> <3=min> ... */
691 if (pattern[3] && (end - ptr) < pattern[3]) {
692 TRACE(("reject (got %d chars, need %d)\n",
693 (end - ptr), pattern[3]));
694 return 0;
696 pattern += pattern[1] + 1;
699 for (;;) {
701 switch (*pattern++) {
703 case SRE_OP_FAILURE:
704 /* immediate failure */
705 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
706 return 0;
708 case SRE_OP_SUCCESS:
709 /* end of pattern */
710 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
711 state->ptr = ptr;
712 return 1;
714 case SRE_OP_AT:
715 /* match at given position */
716 /* <AT> <code> */
717 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
718 if (!SRE_AT(state, ptr, *pattern))
719 return 0;
720 pattern++;
721 break;
723 case SRE_OP_CATEGORY:
724 /* match at given category */
725 /* <CATEGORY> <code> */
726 TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern));
727 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
728 return 0;
729 pattern++;
730 ptr++;
731 break;
733 case SRE_OP_LITERAL:
734 /* match literal string */
735 /* <LITERAL> <code> */
736 TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
737 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
738 return 0;
739 pattern++;
740 ptr++;
741 break;
743 case SRE_OP_NOT_LITERAL:
744 /* match anything that is not literal character */
745 /* <NOT_LITERAL> <code> */
746 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
747 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
748 return 0;
749 pattern++;
750 ptr++;
751 break;
753 case SRE_OP_ANY:
754 /* match anything (except a newline) */
755 /* <ANY> */
756 TRACE(("|%p|%p|ANY\n", pattern, ptr));
757 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
758 return 0;
759 ptr++;
760 break;
762 case SRE_OP_ANY_ALL:
763 /* match anything */
764 /* <ANY_ALL> */
765 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
766 if (ptr >= end)
767 return 0;
768 ptr++;
769 break;
771 case SRE_OP_IN:
772 /* match set member (or non_member) */
773 /* <IN> <skip> <set> */
774 TRACE(("|%p|%p|IN\n", pattern, ptr));
775 if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr))
776 return 0;
777 pattern += pattern[0];
778 ptr++;
779 break;
781 case SRE_OP_GROUPREF:
782 /* match backreference */
783 TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
784 i = pattern[0];
786 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
787 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
788 if (!p || !e || e < p)
789 return 0;
790 while (p < e) {
791 if (ptr >= end || *ptr != *p)
792 return 0;
793 p++; ptr++;
796 pattern++;
797 break;
799 case SRE_OP_GROUPREF_IGNORE:
800 /* match backreference */
801 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
802 i = pattern[0];
804 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
805 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
806 if (!p || !e || e < p)
807 return 0;
808 while (p < e) {
809 if (ptr >= end ||
810 state->lower(*ptr) != state->lower(*p))
811 return 0;
812 p++; ptr++;
815 pattern++;
816 break;
818 case SRE_OP_LITERAL_IGNORE:
819 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0]));
820 if (ptr >= end ||
821 state->lower(*ptr) != state->lower(*pattern))
822 return 0;
823 pattern++;
824 ptr++;
825 break;
827 case SRE_OP_NOT_LITERAL_IGNORE:
828 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern));
829 if (ptr >= end ||
830 state->lower(*ptr) == state->lower(*pattern))
831 return 0;
832 pattern++;
833 ptr++;
834 break;
836 case SRE_OP_IN_IGNORE:
837 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
838 if (ptr >= end
839 || !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr)))
840 return 0;
841 pattern += pattern[0];
842 ptr++;
843 break;
845 case SRE_OP_MARK:
846 /* set mark */
847 /* <MARK> <gid> */
848 TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
849 i = pattern[0];
850 if (i & 1)
851 state->lastindex = i/2 + 1;
852 if (i > state->lastmark)
853 state->lastmark = i;
854 state->mark[i] = ptr;
855 pattern++;
856 break;
858 case SRE_OP_JUMP:
859 case SRE_OP_INFO:
860 /* jump forward */
861 /* <JUMP> <offset> */
862 TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0]));
863 pattern += pattern[0];
864 break;
866 case SRE_OP_ASSERT:
867 /* assert subpattern */
868 /* <ASSERT> <skip> <back> <pattern> */
869 TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
870 state->ptr = ptr - pattern[1];
871 if (state->ptr < state->beginning)
872 return 0;
873 i = SRE_MATCH(state, pattern + 2, level + 1);
874 if (i <= 0)
875 return i;
876 pattern += pattern[0];
877 break;
879 case SRE_OP_ASSERT_NOT:
880 /* assert not subpattern */
881 /* <ASSERT_NOT> <skip> <back> <pattern> */
882 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
883 state->ptr = ptr - pattern[1];
884 if (state->ptr >= state->beginning) {
885 i = SRE_MATCH(state, pattern + 2, level + 1);
886 if (i < 0)
887 return i;
888 if (i)
889 return 0;
891 pattern += pattern[0];
892 break;
894 case SRE_OP_BRANCH:
895 /* alternation */
896 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
897 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
898 lastmark = state->lastmark;
899 for (; pattern[0]; pattern += pattern[0]) {
900 if (pattern[1] == SRE_OP_LITERAL &&
901 (ptr >= end || (SRE_CODE) *ptr != pattern[2]))
902 continue;
903 if (pattern[1] == SRE_OP_IN &&
904 (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr)))
905 continue;
906 state->ptr = ptr;
907 i = SRE_MATCH(state, pattern + 1, level + 1);
908 if (i)
909 return i;
910 if (state->lastmark > lastmark) {
911 memset(
912 state->mark + lastmark + 1, 0,
913 (state->lastmark - lastmark) * sizeof(void*)
915 state->lastmark = lastmark;
918 return 0;
920 case SRE_OP_REPEAT_ONE:
921 /* match repeated sequence (maximizing regexp) */
923 /* this operator only works if the repeated item is
924 exactly one character wide, and we're not already
925 collecting backtracking points. for other cases,
926 use the MAX_REPEAT operator */
928 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
930 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
931 pattern[1], pattern[2]));
933 if (ptr + pattern[1] > end)
934 return 0; /* cannot match */
936 state->ptr = ptr;
938 count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1);
939 if (count < 0)
940 return count;
942 ptr += count;
944 /* when we arrive here, count contains the number of
945 matches, and ptr points to the tail of the target
946 string. check if the rest of the pattern matches,
947 and backtrack if not. */
949 if (count < (int) pattern[1])
950 return 0;
952 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
953 /* tail is empty. we're finished */
954 state->ptr = ptr;
955 return 1;
957 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
958 /* tail starts with a literal. skip positions where
959 the rest of the pattern cannot possibly match */
960 chr = pattern[pattern[0]+1];
961 for (;;) {
962 while (count >= (int) pattern[1] &&
963 (ptr >= end || *ptr != chr)) {
964 ptr--;
965 count--;
967 if (count < (int) pattern[1])
968 break;
969 state->ptr = ptr;
970 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
971 if (i)
972 return i;
973 ptr--;
974 count--;
977 } else {
978 /* general case */
979 lastmark = state->lastmark;
980 while (count >= (int) pattern[1]) {
981 state->ptr = ptr;
982 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
983 if (i)
984 return i;
985 ptr--;
986 count--;
987 if (state->lastmark > lastmark) {
988 memset(
989 state->mark + lastmark + 1, 0,
990 (state->lastmark - lastmark) * sizeof(void*)
992 state->lastmark = lastmark;
996 return 0;
998 case SRE_OP_REPEAT:
999 /* create repeat context. all the hard work is done
1000 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1001 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1002 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
1003 pattern[1], pattern[2]));
1005 rep.count = -1;
1006 rep.pattern = pattern;
1008 /* install new repeat context */
1009 rep.prev = state->repeat;
1010 state->repeat = &rep;
1012 state->ptr = ptr;
1013 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
1015 state->repeat = rep.prev;
1017 return i;
1019 case SRE_OP_MAX_UNTIL:
1020 /* maximizing repeat */
1021 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1023 /* FIXME: we probably need to deal with zero-width
1024 matches in here... */
1026 rp = state->repeat;
1027 if (!rp)
1028 return SRE_ERROR_STATE;
1030 state->ptr = ptr;
1032 count = rp->count + 1;
1034 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count));
1036 if (count < rp->pattern[1]) {
1037 /* not enough matches */
1038 rp->count = count;
1039 /* RECURSIVE */
1040 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1041 if (i)
1042 return i;
1043 rp->count = count - 1;
1044 state->ptr = ptr;
1045 return 0;
1048 if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
1049 /* we may have enough matches, but if we can
1050 match another item, do so */
1051 rp->count = count;
1052 lastmark = state->lastmark;
1053 i = mark_save(state, 0, lastmark);
1054 if (i < 0)
1055 return i;
1056 /* RECURSIVE */
1057 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1058 if (i)
1059 return i;
1060 i = mark_restore(state, 0, lastmark);
1061 if (i < 0)
1062 return i;
1063 rp->count = count - 1;
1064 state->ptr = ptr;
1067 /* cannot match more repeated items here. make sure the
1068 tail matches */
1069 state->repeat = rp->prev;
1070 i = SRE_MATCH(state, pattern, level + 1);
1071 if (i)
1072 return i;
1073 state->repeat = rp;
1074 state->ptr = ptr;
1075 return 0;
1077 case SRE_OP_MIN_UNTIL:
1078 /* minimizing repeat */
1079 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1081 rp = state->repeat;
1082 if (!rp)
1083 return SRE_ERROR_STATE;
1085 count = rp->count + 1;
1087 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
1088 rp->pattern));
1090 state->ptr = ptr;
1092 if (count < rp->pattern[1]) {
1093 /* not enough matches */
1094 rp->count = count;
1095 /* RECURSIVE */
1096 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1097 if (i)
1098 return i;
1099 rp->count = count-1;
1100 state->ptr = ptr;
1101 return 0;
1104 /* see if the tail matches */
1105 state->repeat = rp->prev;
1106 /* FIXME: the following fix doesn't always work (#133283) */
1107 if (rp->pattern[2] == 65535) {
1108 /* unbounded repeat */
1109 for (;;) {
1110 i = SRE_MATCH(state, pattern, level + 1);
1111 if (i || ptr >= end)
1112 break;
1113 state->ptr = ++ptr;
1115 } else
1116 i = SRE_MATCH(state, pattern, level + 1);
1117 if (i) {
1118 /* free(rp); */
1119 return i;
1122 state->ptr = ptr;
1123 state->repeat = rp;
1125 if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
1126 return 0;
1128 rp->count = count;
1129 /* RECURSIVE */
1130 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1131 if (i)
1132 return i;
1133 rp->count = count - 1;
1134 state->ptr = ptr;
1135 return 0;
1137 default:
1138 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1]));
1139 return SRE_ERROR_ILLEGAL;
1143 /* shouldn't end up here */
1144 return SRE_ERROR_ILLEGAL;
1147 LOCAL(int)
1148 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1150 SRE_CHAR* ptr = state->start;
1151 SRE_CHAR* end = state->end;
1152 int status = 0;
1153 int prefix_len = 0;
1154 int prefix_skip = 0;
1155 SRE_CODE* prefix = NULL;
1156 SRE_CODE* charset = NULL;
1157 SRE_CODE* overlap = NULL;
1158 int flags = 0;
1160 if (pattern[0] == SRE_OP_INFO) {
1161 /* optimization info block */
1162 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1164 flags = pattern[2];
1166 if (pattern[3] > 0) {
1167 /* adjust end point (but make sure we leave at least one
1168 character in there, so literal search will work) */
1169 end -= pattern[3]-1;
1170 if (end <= ptr)
1171 end = ptr+1;
1174 if (flags & SRE_INFO_PREFIX) {
1175 /* pattern starts with a known prefix */
1176 /* <length> <skip> <prefix data> <overlap data> */
1177 prefix_len = pattern[5];
1178 prefix_skip = pattern[6];
1179 prefix = pattern + 7;
1180 overlap = prefix + prefix_len - 1;
1181 } else if (flags & SRE_INFO_CHARSET)
1182 /* pattern starts with a character from a known set */
1183 /* <charset> */
1184 charset = pattern + 5;
1186 pattern += 1 + pattern[1];
1189 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1190 TRACE(("charset = %p\n", charset));
1192 #if defined(USE_FAST_SEARCH)
1193 if (prefix_len > 1) {
1194 /* pattern starts with a known prefix. use the overlap
1195 table to skip forward as fast as we possibly can */
1196 int i = 0;
1197 end = state->end;
1198 while (ptr < end) {
1199 for (;;) {
1200 if ((SRE_CODE) ptr[0] != prefix[i]) {
1201 if (!i)
1202 break;
1203 else
1204 i = overlap[i];
1205 } else {
1206 if (++i == prefix_len) {
1207 /* found a potential match */
1208 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1209 state->start = ptr + 1 - prefix_len;
1210 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1211 if (flags & SRE_INFO_LITERAL)
1212 return 1; /* we got all of it */
1213 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
1214 if (status != 0)
1215 return status;
1216 /* close but no cigar -- try again */
1217 i = overlap[i];
1219 break;
1223 ptr++;
1225 return 0;
1227 #endif
1229 if (pattern[0] == SRE_OP_LITERAL) {
1230 /* pattern starts with a literal character. this is used
1231 for short prefixes, and if fast search is disabled */
1232 SRE_CODE chr = pattern[1];
1233 end = state->end;
1234 for (;;) {
1235 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1236 ptr++;
1237 if (ptr == end)
1238 return 0;
1239 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1240 state->start = ptr;
1241 state->ptr = ++ptr;
1242 status = SRE_MATCH(state, pattern + 2, 1);
1243 if (status != 0)
1244 break;
1246 } else if (charset) {
1247 /* pattern starts with a character from a known set */
1248 end = state->end;
1249 for (;;) {
1250 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1251 ptr++;
1252 if (ptr == end)
1253 return 0;
1254 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1255 state->start = ptr;
1256 state->ptr = ptr;
1257 status = SRE_MATCH(state, pattern, 1);
1258 if (status != 0)
1259 break;
1260 ptr++;
1262 } else
1263 /* general case */
1264 while (ptr <= end) {
1265 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1266 state->start = state->ptr = ptr++;
1267 status = SRE_MATCH(state, pattern, 1);
1268 if (status != 0)
1269 break;
1272 return status;
1276 #if !defined(SRE_RECURSIVE)
1278 /* -------------------------------------------------------------------- */
1279 /* factories and destructors */
1281 /* see sre.h for object declarations */
1283 staticforward PyTypeObject Pattern_Type;
1284 staticforward PyTypeObject Match_Type;
1285 staticforward PyTypeObject Scanner_Type;
1287 static PyObject *
1288 _compile(PyObject* self_, PyObject* args)
1290 /* "compile" pattern descriptor to pattern object */
1292 PatternObject* self;
1293 int i, n;
1295 PyObject* pattern;
1296 int flags = 0;
1297 PyObject* code;
1298 int groups = 0;
1299 PyObject* groupindex = NULL;
1300 PyObject* indexgroup = NULL;
1301 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1302 &PyList_Type, &code, &groups,
1303 &groupindex, &indexgroup))
1304 return NULL;
1306 n = PyList_GET_SIZE(code);
1308 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
1309 if (!self)
1310 return NULL;
1312 self->codesize = n;
1314 for (i = 0; i < n; i++) {
1315 PyObject *o = PyList_GET_ITEM(code, i);
1316 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1319 if (PyErr_Occurred()) {
1320 PyObject_DEL(self);
1321 return NULL;
1324 Py_INCREF(pattern);
1325 self->pattern = pattern;
1327 self->flags = flags;
1329 self->groups = groups;
1331 Py_XINCREF(groupindex);
1332 self->groupindex = groupindex;
1334 Py_XINCREF(indexgroup);
1335 self->indexgroup = indexgroup;
1337 return (PyObject*) self;
1340 static PyObject *
1341 sre_codesize(PyObject* self, PyObject* args)
1343 return Py_BuildValue("i", sizeof(SRE_CODE));
1346 static PyObject *
1347 sre_getlower(PyObject* self, PyObject* args)
1349 int character, flags;
1350 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1351 return NULL;
1352 if (flags & SRE_FLAG_LOCALE)
1353 return Py_BuildValue("i", sre_lower_locale(character));
1354 if (flags & SRE_FLAG_UNICODE)
1355 #if defined(HAVE_UNICODE)
1356 return Py_BuildValue("i", sre_lower_unicode(character));
1357 #else
1358 return Py_BuildValue("i", sre_lower_locale(character));
1359 #endif
1360 return Py_BuildValue("i", sre_lower(character));
1363 LOCAL(void)
1364 state_reset(SRE_STATE* state)
1366 int i;
1368 state->lastmark = 0;
1370 /* FIXME: dynamic! */
1371 for (i = 0; i < SRE_MARK_SIZE; i++)
1372 state->mark[i] = NULL;
1374 state->lastindex = -1;
1376 state->repeat = NULL;
1378 mark_fini(state);
1381 LOCAL(PyObject*)
1382 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1383 int start, int end)
1385 /* prepare state object */
1387 PyBufferProcs *buffer;
1388 int size, bytes;
1389 void* ptr;
1391 memset(state, 0, sizeof(SRE_STATE));
1393 state->lastindex = -1;
1395 #if defined(HAVE_UNICODE)
1396 if (PyUnicode_Check(string)) {
1397 /* unicode strings doesn't always support the buffer interface */
1398 ptr = (void*) PyUnicode_AS_DATA(string);
1399 bytes = PyUnicode_GET_DATA_SIZE(string);
1400 size = PyUnicode_GET_SIZE(string);
1401 state->charsize = sizeof(Py_UNICODE);
1403 } else {
1404 #endif
1406 /* get pointer to string buffer */
1407 buffer = string->ob_type->tp_as_buffer;
1408 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1409 buffer->bf_getsegcount(string, NULL) != 1) {
1410 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1411 return NULL;
1414 /* determine buffer size */
1415 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1416 if (bytes < 0) {
1417 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1418 return NULL;
1421 /* determine character size */
1422 #if PY_VERSION_HEX >= 0x01060000
1423 size = PyObject_Size(string);
1424 #else
1425 size = PyObject_Length(string);
1426 #endif
1428 if (PyString_Check(string) || bytes == size)
1429 state->charsize = 1;
1430 #if defined(HAVE_UNICODE)
1431 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1432 state->charsize = sizeof(Py_UNICODE);
1433 #endif
1434 else {
1435 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1436 return NULL;
1439 #if defined(HAVE_UNICODE)
1441 #endif
1443 /* adjust boundaries */
1444 if (start < 0)
1445 start = 0;
1446 else if (start > size)
1447 start = size;
1449 if (end < 0)
1450 end = 0;
1451 else if (end > size)
1452 end = size;
1454 state->beginning = ptr;
1456 state->start = (void*) ((char*) ptr + start * state->charsize);
1457 state->end = (void*) ((char*) ptr + end * state->charsize);
1459 Py_INCREF(string);
1460 state->string = string;
1461 state->pos = start;
1462 state->endpos = end;
1464 if (pattern->flags & SRE_FLAG_LOCALE)
1465 state->lower = sre_lower_locale;
1466 else if (pattern->flags & SRE_FLAG_UNICODE)
1467 #if defined(HAVE_UNICODE)
1468 state->lower = sre_lower_unicode;
1469 #else
1470 state->lower = sre_lower_locale;
1471 #endif
1472 else
1473 state->lower = sre_lower;
1475 return string;
1478 LOCAL(void)
1479 state_fini(SRE_STATE* state)
1481 Py_XDECREF(state->string);
1482 mark_fini(state);
1485 LOCAL(PyObject*)
1486 state_getslice(SRE_STATE* state, int index, PyObject* string)
1488 int i, j;
1490 index = (index - 1) * 2;
1492 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1493 i = j = 0;
1494 } else {
1495 i = ((char*)state->mark[index] - (char*)state->beginning) /
1496 state->charsize;
1497 j = ((char*)state->mark[index+1] - (char*)state->beginning) /
1498 state->charsize;
1501 return PySequence_GetSlice(string, i, j);
1504 static void
1505 pattern_error(int status)
1507 switch (status) {
1508 case SRE_ERROR_RECURSION_LIMIT:
1509 PyErr_SetString(
1510 PyExc_RuntimeError,
1511 "maximum recursion limit exceeded"
1513 break;
1514 case SRE_ERROR_MEMORY:
1515 PyErr_NoMemory();
1516 break;
1517 default:
1518 /* other error codes indicate compiler/engine bugs */
1519 PyErr_SetString(
1520 PyExc_RuntimeError,
1521 "internal error in regular expression engine"
1526 static PyObject*
1527 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
1529 /* create match object (from state object) */
1531 MatchObject* match;
1532 int i, j;
1533 char* base;
1534 int n;
1536 if (status > 0) {
1538 /* create match object (with room for extra group marks) */
1539 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
1540 2*(pattern->groups+1));
1541 if (!match)
1542 return NULL;
1544 Py_INCREF(pattern);
1545 match->pattern = pattern;
1547 Py_INCREF(state->string);
1548 match->string = state->string;
1550 match->regs = NULL;
1551 match->groups = pattern->groups+1;
1553 /* fill in group slices */
1555 base = (char*) state->beginning;
1556 n = state->charsize;
1558 match->mark[0] = ((char*) state->start - base) / n;
1559 match->mark[1] = ((char*) state->ptr - base) / n;
1561 for (i = j = 0; i < pattern->groups; i++, j+=2)
1562 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1563 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1564 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1565 } else
1566 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1568 match->pos = state->pos;
1569 match->endpos = state->endpos;
1571 match->lastindex = state->lastindex;
1573 return (PyObject*) match;
1575 } else if (status == 0) {
1577 /* no match */
1578 Py_INCREF(Py_None);
1579 return Py_None;
1583 /* internal error */
1584 pattern_error(status);
1585 return NULL;
1588 static PyObject*
1589 pattern_scanner(PatternObject* pattern, PyObject* args)
1591 /* create search state object */
1593 ScannerObject* self;
1595 PyObject* string;
1596 int start = 0;
1597 int end = INT_MAX;
1598 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1599 return NULL;
1601 /* create scanner object */
1602 self = PyObject_NEW(ScannerObject, &Scanner_Type);
1603 if (!self)
1604 return NULL;
1606 string = state_init(&self->state, pattern, string, start, end);
1607 if (!string) {
1608 PyObject_DEL(self);
1609 return NULL;
1612 Py_INCREF(pattern);
1613 self->pattern = (PyObject*) pattern;
1615 return (PyObject*) self;
1618 static void
1619 pattern_dealloc(PatternObject* self)
1621 Py_XDECREF(self->pattern);
1622 Py_XDECREF(self->groupindex);
1623 Py_XDECREF(self->indexgroup);
1624 PyObject_DEL(self);
1627 static PyObject*
1628 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1630 SRE_STATE state;
1631 int status;
1633 PyObject* string;
1634 int start = 0;
1635 int end = INT_MAX;
1636 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1637 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1638 &string, &start, &end))
1639 return NULL;
1641 string = state_init(&state, self, string, start, end);
1642 if (!string)
1643 return NULL;
1645 state.ptr = state.start;
1647 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1649 if (state.charsize == 1) {
1650 status = sre_match(&state, PatternObject_GetCode(self), 1);
1651 } else {
1652 #if defined(HAVE_UNICODE)
1653 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
1654 #endif
1657 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1659 state_fini(&state);
1661 return pattern_new_match(self, &state, status);
1664 static PyObject*
1665 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1667 SRE_STATE state;
1668 int status;
1670 PyObject* string;
1671 int start = 0;
1672 int end = INT_MAX;
1673 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1674 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1675 &string, &start, &end))
1676 return NULL;
1678 string = state_init(&state, self, string, start, end);
1679 if (!string)
1680 return NULL;
1682 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1684 if (state.charsize == 1) {
1685 status = sre_search(&state, PatternObject_GetCode(self));
1686 } else {
1687 #if defined(HAVE_UNICODE)
1688 status = sre_usearch(&state, PatternObject_GetCode(self));
1689 #endif
1692 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1694 state_fini(&state);
1696 return pattern_new_match(self, &state, status);
1699 static PyObject*
1700 call(char* module, char* function, PyObject* args)
1702 PyObject* name;
1703 PyObject* mod;
1704 PyObject* func;
1705 PyObject* result;
1707 name = PyString_FromString(module);
1708 if (!name)
1709 return NULL;
1710 mod = PyImport_Import(name);
1711 Py_DECREF(name);
1712 if (!mod)
1713 return NULL;
1714 func = PyObject_GetAttrString(mod, function);
1715 Py_DECREF(mod);
1716 if (!func)
1717 return NULL;
1718 result = PyObject_CallObject(func, args);
1719 Py_DECREF(func);
1720 Py_DECREF(args);
1721 return result;
1724 #ifdef USE_BUILTIN_COPY
1725 static int
1726 deepcopy(PyObject** object, PyObject* memo)
1728 PyObject* copy;
1730 copy = call(
1731 "copy", "deepcopy",
1732 Py_BuildValue("OO", *object, memo)
1734 if (!copy)
1735 return 0;
1737 Py_DECREF(*object);
1738 *object = copy;
1740 return 1; /* success */
1742 #endif
1744 static PyObject*
1745 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
1747 PyObject* template;
1748 PyObject* string;
1749 PyObject* count = Py_False; /* zero */
1750 static char* kwlist[] = { "repl", "string", "count", NULL };
1751 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:sub", kwlist,
1752 &template, &string, &count))
1753 return NULL;
1755 /* delegate to Python code */
1756 return call(
1757 SRE_MODULE, "_sub",
1758 Py_BuildValue("OOOO", self, template, string, count)
1762 static PyObject*
1763 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
1765 PyObject* template;
1766 PyObject* string;
1767 PyObject* count = Py_False; /* zero */
1768 static char* kwlist[] = { "repl", "string", "count", NULL };
1769 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|O:subn", kwlist,
1770 &template, &string, &count))
1771 return NULL;
1773 /* delegate to Python code */
1774 return call(
1775 SRE_MODULE, "_subn",
1776 Py_BuildValue("OOOO", self, template, string, count)
1780 static PyObject*
1781 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
1783 PyObject* string;
1784 PyObject* maxsplit = Py_False; /* zero */
1785 static char* kwlist[] = { "source", "maxsplit", NULL };
1786 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|O:split", kwlist,
1787 &string, &maxsplit))
1788 return NULL;
1790 /* delegate to Python code */
1791 return call(
1792 SRE_MODULE, "_split",
1793 Py_BuildValue("OOO", self, string, maxsplit)
1797 static PyObject*
1798 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
1800 SRE_STATE state;
1801 PyObject* list;
1802 int status;
1803 int i;
1805 PyObject* string;
1806 int start = 0;
1807 int end = INT_MAX;
1808 static char* kwlist[] = { "source", "pos", "endpos", NULL };
1809 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
1810 &string, &start, &end))
1811 return NULL;
1813 string = state_init(&state, self, string, start, end);
1814 if (!string)
1815 return NULL;
1817 list = PyList_New(0);
1819 while (state.start <= state.end) {
1821 PyObject* item;
1823 state_reset(&state);
1825 state.ptr = state.start;
1827 if (state.charsize == 1) {
1828 status = sre_search(&state, PatternObject_GetCode(self));
1829 } else {
1830 #if defined(HAVE_UNICODE)
1831 status = sre_usearch(&state, PatternObject_GetCode(self));
1832 #endif
1835 if (status > 0) {
1837 /* don't bother to build a match object */
1838 switch (self->groups) {
1839 case 0:
1840 item = PySequence_GetSlice(
1841 string,
1842 ((char*) state.start - (char*) state.beginning) /
1843 state.charsize,
1844 ((char*) state.ptr - (char*) state.beginning) /
1845 state.charsize);
1846 if (!item)
1847 goto error;
1848 break;
1849 case 1:
1850 item = state_getslice(&state, 1, string);
1851 if (!item)
1852 goto error;
1853 break;
1854 default:
1855 item = PyTuple_New(self->groups);
1856 if (!item)
1857 goto error;
1858 for (i = 0; i < self->groups; i++) {
1859 PyObject* o = state_getslice(&state, i+1, string);
1860 if (!o) {
1861 Py_DECREF(item);
1862 goto error;
1864 PyTuple_SET_ITEM(item, i, o);
1866 break;
1869 status = PyList_Append(list, item);
1870 Py_DECREF(item);
1872 if (status < 0)
1873 goto error;
1875 if (state.ptr == state.start)
1876 state.start = (void*) ((char*) state.ptr + state.charsize);
1877 else
1878 state.start = state.ptr;
1880 } else {
1882 if (status == 0)
1883 break;
1885 pattern_error(status);
1886 goto error;
1891 state_fini(&state);
1892 return list;
1894 error:
1895 Py_DECREF(list);
1896 state_fini(&state);
1897 return NULL;
1901 static PyObject*
1902 pattern_copy(PatternObject* self, PyObject* args)
1904 #ifdef USE_BUILTIN_COPY
1905 PatternObject* copy;
1906 int offset;
1908 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
1909 return NULL;
1911 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
1912 if (!copy)
1913 return NULL;
1915 offset = offsetof(PatternObject, groups);
1917 Py_XINCREF(self->groupindex);
1918 Py_XINCREF(self->indexgroup);
1919 Py_XINCREF(self->pattern);
1921 memcpy((char*) copy + offset, (char*) self + offset,
1922 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
1924 return (PyObject*) copy;
1925 #else
1926 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
1927 return NULL;
1928 #endif
1931 static PyObject*
1932 pattern_deepcopy(PatternObject* self, PyObject* args)
1934 #ifdef USE_BUILTIN_COPY
1935 PatternObject* copy;
1937 PyObject* memo;
1938 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
1939 return NULL;
1941 copy = (PatternObject*) pattern_copy(self, Py_None);
1942 if (!copy)
1943 return NULL;
1945 if (!deepcopy(&copy->groupindex, memo) ||
1946 !deepcopy(&copy->indexgroup, memo) ||
1947 !deepcopy(&copy->pattern, memo)) {
1948 Py_DECREF(copy);
1949 return NULL;
1952 #else
1953 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
1954 return NULL;
1955 #endif
1958 static PyObject*
1959 pattern_isliteral(PatternObject* self, PyObject* args)
1961 /* internal: return true if pattern consists of literal text only */
1963 SRE_CODE* code;
1964 PyObject* isliteral;
1966 if (!PyArg_ParseTuple(args, ":_isliteral"))
1967 return NULL;
1969 code = PatternObject_GetCode(self);
1971 if (code[0] == SRE_OP_INFO && code[2] & SRE_INFO_LITERAL)
1972 isliteral = Py_True;
1973 else
1974 isliteral = Py_False;
1976 Py_INCREF(isliteral);
1977 return isliteral;
1980 static PyMethodDef pattern_methods[] = {
1981 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
1982 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
1983 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
1984 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
1985 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
1986 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
1987 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
1988 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
1989 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
1990 {"_isliteral", (PyCFunction) pattern_isliteral, METH_VARARGS},
1991 {NULL, NULL}
1994 static PyObject*
1995 pattern_getattr(PatternObject* self, char* name)
1997 PyObject* res;
1999 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2001 if (res)
2002 return res;
2004 PyErr_Clear();
2006 /* attributes */
2007 if (!strcmp(name, "pattern")) {
2008 Py_INCREF(self->pattern);
2009 return self->pattern;
2012 if (!strcmp(name, "flags"))
2013 return Py_BuildValue("i", self->flags);
2015 if (!strcmp(name, "groups"))
2016 return Py_BuildValue("i", self->groups);
2018 if (!strcmp(name, "groupindex") && self->groupindex) {
2019 Py_INCREF(self->groupindex);
2020 return self->groupindex;
2023 PyErr_SetString(PyExc_AttributeError, name);
2024 return NULL;
2027 statichere PyTypeObject Pattern_Type = {
2028 PyObject_HEAD_INIT(NULL)
2029 0, "SRE_Pattern",
2030 sizeof(PatternObject), sizeof(SRE_CODE),
2031 (destructor)pattern_dealloc, /*tp_dealloc*/
2032 0, /*tp_print*/
2033 (getattrfunc)pattern_getattr /*tp_getattr*/
2036 /* -------------------------------------------------------------------- */
2037 /* match methods */
2039 static void
2040 match_dealloc(MatchObject* self)
2042 Py_XDECREF(self->regs);
2043 Py_XDECREF(self->string);
2044 Py_DECREF(self->pattern);
2045 PyObject_DEL(self);
2048 static PyObject*
2049 match_getslice_by_index(MatchObject* self, int index, PyObject* def)
2051 if (index < 0 || index >= self->groups) {
2052 /* raise IndexError if we were given a bad group number */
2053 PyErr_SetString(
2054 PyExc_IndexError,
2055 "no such group"
2057 return NULL;
2060 index *= 2;
2062 if (self->string == Py_None || self->mark[index] < 0) {
2063 /* return default value if the string or group is undefined */
2064 Py_INCREF(def);
2065 return def;
2068 return PySequence_GetSlice(
2069 self->string, self->mark[index], self->mark[index+1]
2073 static int
2074 match_getindex(MatchObject* self, PyObject* index)
2076 int i;
2078 if (PyInt_Check(index))
2079 return (int) PyInt_AS_LONG(index);
2081 i = -1;
2083 if (self->pattern->groupindex) {
2084 index = PyObject_GetItem(self->pattern->groupindex, index);
2085 if (index) {
2086 if (PyInt_Check(index))
2087 i = (int) PyInt_AS_LONG(index);
2088 Py_DECREF(index);
2089 } else
2090 PyErr_Clear();
2093 return i;
2096 static PyObject*
2097 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2099 return match_getslice_by_index(self, match_getindex(self, index), def);
2102 static PyObject*
2103 match_expand(MatchObject* self, PyObject* args)
2105 PyObject* template;
2106 if (!PyArg_ParseTuple(args, "O:expand", &template))
2107 return NULL;
2109 /* delegate to Python code */
2110 return call(
2111 SRE_MODULE, "_expand",
2112 Py_BuildValue("OOO", self->pattern, self, template)
2116 static PyObject*
2117 match_group(MatchObject* self, PyObject* args)
2119 PyObject* result;
2120 int i, size;
2122 size = PyTuple_GET_SIZE(args);
2124 switch (size) {
2125 case 0:
2126 result = match_getslice(self, Py_False, Py_None);
2127 break;
2128 case 1:
2129 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2130 break;
2131 default:
2132 /* fetch multiple items */
2133 result = PyTuple_New(size);
2134 if (!result)
2135 return NULL;
2136 for (i = 0; i < size; i++) {
2137 PyObject* item = match_getslice(
2138 self, PyTuple_GET_ITEM(args, i), Py_None
2140 if (!item) {
2141 Py_DECREF(result);
2142 return NULL;
2144 PyTuple_SET_ITEM(result, i, item);
2146 break;
2148 return result;
2151 static PyObject*
2152 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2154 PyObject* result;
2155 int index;
2157 PyObject* def = Py_None;
2158 static char* kwlist[] = { "default", NULL };
2159 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2160 return NULL;
2162 result = PyTuple_New(self->groups-1);
2163 if (!result)
2164 return NULL;
2166 for (index = 1; index < self->groups; index++) {
2167 PyObject* item;
2168 item = match_getslice_by_index(self, index, def);
2169 if (!item) {
2170 Py_DECREF(result);
2171 return NULL;
2173 PyTuple_SET_ITEM(result, index-1, item);
2176 return result;
2179 static PyObject*
2180 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2182 PyObject* result;
2183 PyObject* keys;
2184 int index;
2186 PyObject* def = Py_None;
2187 static char* kwlist[] = { "default", NULL };
2188 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2189 return NULL;
2191 result = PyDict_New();
2192 if (!result || !self->pattern->groupindex)
2193 return result;
2195 keys = PyMapping_Keys(self->pattern->groupindex);
2196 if (!keys)
2197 goto failed;
2199 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2200 int status;
2201 PyObject* key;
2202 PyObject* value;
2203 key = PyList_GET_ITEM(keys, index);
2204 if (!key)
2205 goto failed;
2206 value = match_getslice(self, key, def);
2207 if (!value) {
2208 Py_DECREF(key);
2209 goto failed;
2211 status = PyDict_SetItem(result, key, value);
2212 Py_DECREF(value);
2213 if (status < 0)
2214 goto failed;
2217 Py_DECREF(keys);
2219 return result;
2221 failed:
2222 Py_DECREF(keys);
2223 Py_DECREF(result);
2224 return NULL;
2227 static PyObject*
2228 match_start(MatchObject* self, PyObject* args)
2230 int index;
2232 PyObject* index_ = Py_False; /* zero */
2233 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2234 return NULL;
2236 index = match_getindex(self, index_);
2238 if (index < 0 || index >= self->groups) {
2239 PyErr_SetString(
2240 PyExc_IndexError,
2241 "no such group"
2243 return NULL;
2246 /* mark is -1 if group is undefined */
2247 return Py_BuildValue("i", self->mark[index*2]);
2250 static PyObject*
2251 match_end(MatchObject* self, PyObject* args)
2253 int index;
2255 PyObject* index_ = Py_False; /* zero */
2256 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2257 return NULL;
2259 index = match_getindex(self, index_);
2261 if (index < 0 || index >= self->groups) {
2262 PyErr_SetString(
2263 PyExc_IndexError,
2264 "no such group"
2266 return NULL;
2269 /* mark is -1 if group is undefined */
2270 return Py_BuildValue("i", self->mark[index*2+1]);
2273 LOCAL(PyObject*)
2274 _pair(int i1, int i2)
2276 PyObject* pair;
2277 PyObject* item;
2279 pair = PyTuple_New(2);
2280 if (!pair)
2281 return NULL;
2283 item = PyInt_FromLong(i1);
2284 if (!item)
2285 goto error;
2286 PyTuple_SET_ITEM(pair, 0, item);
2288 item = PyInt_FromLong(i2);
2289 if (!item)
2290 goto error;
2291 PyTuple_SET_ITEM(pair, 1, item);
2293 return pair;
2295 error:
2296 Py_DECREF(pair);
2297 return NULL;
2300 static PyObject*
2301 match_span(MatchObject* self, PyObject* args)
2303 int index;
2305 PyObject* index_ = Py_False; /* zero */
2306 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2307 return NULL;
2309 index = match_getindex(self, index_);
2311 if (index < 0 || index >= self->groups) {
2312 PyErr_SetString(
2313 PyExc_IndexError,
2314 "no such group"
2316 return NULL;
2319 /* marks are -1 if group is undefined */
2320 return _pair(self->mark[index*2], self->mark[index*2+1]);
2323 static PyObject*
2324 match_regs(MatchObject* self)
2326 PyObject* regs;
2327 PyObject* item;
2328 int index;
2330 regs = PyTuple_New(self->groups);
2331 if (!regs)
2332 return NULL;
2334 for (index = 0; index < self->groups; index++) {
2335 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2336 if (!item) {
2337 Py_DECREF(regs);
2338 return NULL;
2340 PyTuple_SET_ITEM(regs, index, item);
2343 Py_INCREF(regs);
2344 self->regs = regs;
2346 return regs;
2349 static PyObject*
2350 match_copy(MatchObject* self, PyObject* args)
2352 #ifdef USE_BUILTIN_COPY
2353 MatchObject* copy;
2354 int slots, offset;
2356 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2357 return NULL;
2359 slots = 2 * (self->pattern->groups+1);
2361 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2362 if (!copy)
2363 return NULL;
2365 /* this value a constant, but any compiler should be able to
2366 figure that out all by itself */
2367 offset = offsetof(MatchObject, string);
2369 Py_XINCREF(self->pattern);
2370 Py_XINCREF(self->string);
2371 Py_XINCREF(self->regs);
2373 memcpy((char*) copy + offset, (char*) self + offset,
2374 sizeof(MatchObject) + slots * sizeof(int) - offset);
2376 return (PyObject*) copy;
2377 #else
2378 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
2379 return NULL;
2380 #endif
2383 static PyObject*
2384 match_deepcopy(MatchObject* self, PyObject* args)
2386 #ifdef USE_BUILTIN_COPY
2387 MatchObject* copy;
2389 PyObject* memo;
2390 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2391 return NULL;
2393 copy = (MatchObject*) match_copy(self, Py_None);
2394 if (!copy)
2395 return NULL;
2397 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2398 !deepcopy(&copy->string, memo) ||
2399 !deepcopy(&copy->regs, memo)) {
2400 Py_DECREF(copy);
2401 return NULL;
2404 #else
2405 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2406 return NULL;
2407 #endif
2410 static PyMethodDef match_methods[] = {
2411 {"group", (PyCFunction) match_group, METH_VARARGS},
2412 {"start", (PyCFunction) match_start, METH_VARARGS},
2413 {"end", (PyCFunction) match_end, METH_VARARGS},
2414 {"span", (PyCFunction) match_span, METH_VARARGS},
2415 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
2416 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
2417 {"expand", (PyCFunction) match_expand, METH_VARARGS},
2418 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
2419 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
2420 {NULL, NULL}
2423 static PyObject*
2424 match_getattr(MatchObject* self, char* name)
2426 PyObject* res;
2428 res = Py_FindMethod(match_methods, (PyObject*) self, name);
2429 if (res)
2430 return res;
2432 PyErr_Clear();
2434 if (!strcmp(name, "lastindex")) {
2435 if (self->lastindex >= 0)
2436 return Py_BuildValue("i", self->lastindex);
2437 Py_INCREF(Py_None);
2438 return Py_None;
2441 if (!strcmp(name, "lastgroup")) {
2442 if (self->pattern->indexgroup && self->lastindex >= 0) {
2443 PyObject* result = PySequence_GetItem(
2444 self->pattern->indexgroup, self->lastindex
2446 if (result)
2447 return result;
2448 PyErr_Clear();
2450 Py_INCREF(Py_None);
2451 return Py_None;
2454 if (!strcmp(name, "string")) {
2455 if (self->string) {
2456 Py_INCREF(self->string);
2457 return self->string;
2458 } else {
2459 Py_INCREF(Py_None);
2460 return Py_None;
2464 if (!strcmp(name, "regs")) {
2465 if (self->regs) {
2466 Py_INCREF(self->regs);
2467 return self->regs;
2468 } else
2469 return match_regs(self);
2472 if (!strcmp(name, "re")) {
2473 Py_INCREF(self->pattern);
2474 return (PyObject*) self->pattern;
2477 if (!strcmp(name, "pos"))
2478 return Py_BuildValue("i", self->pos);
2480 if (!strcmp(name, "endpos"))
2481 return Py_BuildValue("i", self->endpos);
2483 PyErr_SetString(PyExc_AttributeError, name);
2484 return NULL;
2487 /* FIXME: implement setattr("string", None) as a special case (to
2488 detach the associated string, if any */
2490 statichere PyTypeObject Match_Type = {
2491 PyObject_HEAD_INIT(NULL)
2492 0, "SRE_Match",
2493 sizeof(MatchObject), sizeof(int),
2494 (destructor)match_dealloc, /*tp_dealloc*/
2495 0, /*tp_print*/
2496 (getattrfunc)match_getattr /*tp_getattr*/
2499 /* -------------------------------------------------------------------- */
2500 /* scanner methods (experimental) */
2502 static void
2503 scanner_dealloc(ScannerObject* self)
2505 state_fini(&self->state);
2506 Py_DECREF(self->pattern);
2507 PyObject_DEL(self);
2510 static PyObject*
2511 scanner_match(ScannerObject* self, PyObject* args)
2513 SRE_STATE* state = &self->state;
2514 PyObject* match;
2515 int status;
2517 state_reset(state);
2519 state->ptr = state->start;
2521 if (state->charsize == 1) {
2522 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
2523 } else {
2524 #if defined(HAVE_UNICODE)
2525 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
2526 #endif
2529 match = pattern_new_match((PatternObject*) self->pattern,
2530 state, status);
2532 if (status == 0 || state->ptr == state->start)
2533 state->start = (void*) ((char*) state->ptr + state->charsize);
2534 else
2535 state->start = state->ptr;
2537 return match;
2541 static PyObject*
2542 scanner_search(ScannerObject* self, PyObject* args)
2544 SRE_STATE* state = &self->state;
2545 PyObject* match;
2546 int status;
2548 state_reset(state);
2550 state->ptr = state->start;
2552 if (state->charsize == 1) {
2553 status = sre_search(state, PatternObject_GetCode(self->pattern));
2554 } else {
2555 #if defined(HAVE_UNICODE)
2556 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
2557 #endif
2560 match = pattern_new_match((PatternObject*) self->pattern,
2561 state, status);
2563 if (status == 0 || state->ptr == state->start)
2564 state->start = (void*) ((char*) state->ptr + state->charsize);
2565 else
2566 state->start = state->ptr;
2568 return match;
2571 static PyMethodDef scanner_methods[] = {
2572 {"match", (PyCFunction) scanner_match, 0},
2573 {"search", (PyCFunction) scanner_search, 0},
2574 {NULL, NULL}
2577 static PyObject*
2578 scanner_getattr(ScannerObject* self, char* name)
2580 PyObject* res;
2582 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
2583 if (res)
2584 return res;
2586 PyErr_Clear();
2588 /* attributes */
2589 if (!strcmp(name, "pattern")) {
2590 Py_INCREF(self->pattern);
2591 return self->pattern;
2594 PyErr_SetString(PyExc_AttributeError, name);
2595 return NULL;
2598 statichere PyTypeObject Scanner_Type = {
2599 PyObject_HEAD_INIT(NULL)
2600 0, "SRE_Scanner",
2601 sizeof(ScannerObject), 0,
2602 (destructor)scanner_dealloc, /*tp_dealloc*/
2603 0, /*tp_print*/
2604 (getattrfunc)scanner_getattr, /*tp_getattr*/
2607 static PyMethodDef _functions[] = {
2608 {"compile", _compile, 1},
2609 {"getcodesize", sre_codesize, 1},
2610 {"getlower", sre_getlower, 1},
2611 {NULL, NULL}
2614 DL_EXPORT(void)
2615 init_sre(void)
2617 PyObject* m;
2618 PyObject* d;
2620 /* Patch object types */
2621 Pattern_Type.ob_type = Match_Type.ob_type =
2622 Scanner_Type.ob_type = &PyType_Type;
2624 m = Py_InitModule("_" SRE_MODULE, _functions);
2625 d = PyModule_GetDict(m);
2627 PyDict_SetItemString(
2628 d, "MAGIC", (PyObject*) PyInt_FromLong(SRE_MAGIC)
2631 PyDict_SetItemString(
2632 d, "copyright", (PyObject*) PyString_FromString(copyright)
2637 #endif /* !defined(SRE_RECURSIVE) */