This commit was manufactured by cvs2svn to create tag 'r221c2'.
[python/dscho.git] / Modules / _sre.c
blob769965f51a96d23914d43fec9e457b9767dfad48
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-06-30 fl added fast search optimization
10 * 2000-06-30 fl added assert (lookahead) primitives, etc
11 * 2000-07-02 fl added charset optimizations, etc
12 * 2000-07-03 fl store code in pattern object, lookbehind, etc
13 * 2000-07-08 fl added regs attribute
14 * 2000-07-21 fl reset lastindex in scanner methods
15 * 2000-08-01 fl fixes for 1.6b1
16 * 2000-08-03 fl added recursion limit
17 * 2000-08-07 fl use PyOS_CheckStack() if available
18 * 2000-08-08 fl changed findall to return empty strings instead of None
19 * 2000-08-27 fl properly propagate memory errors
20 * 2000-09-02 fl return -1 instead of None for start/end/span
21 * 2000-09-20 fl added expand method
22 * 2000-09-21 fl don't use the buffer interface for unicode strings
23 * 2000-10-03 fl fixed assert_not primitive; support keyword arguments
24 * 2000-10-24 fl really fixed assert_not; reset groups in findall
25 * 2000-12-21 fl fixed memory leak in groupdict
26 * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
27 * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
28 * 2001-01-16 fl fixed memory leak in pattern destructor
29 * 2001-03-20 fl lots of fixes for 2.1b2
30 * 2001-04-15 fl export copyright as Python attribute, not global
31 * 2001-04-28 fl added __copy__ methods (work in progress)
32 * 2001-05-14 fl fixes for 1.5.2
33 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
34 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
35 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
36 * 2001-10-21 fl added sub/subn primitive
37 * 2001-10-22 fl check for literal sub/subn templates
38 * 2001-10-24 fl added finditer primitive (for 2.2 only)
39 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
41 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
43 * This version of the SRE library can be redistributed under CNRI's
44 * Python 1.6 license. For any other use, please contact Secret Labs
45 * AB (info@pythonware.com).
47 * Portions of this engine have been developed in cooperation with
48 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
49 * other compatibility work.
52 #ifndef SRE_RECURSIVE
54 static char copyright[] =
55 " SRE 2.2.1 Copyright (c) 1997-2001 by Secret Labs AB ";
57 #include "Python.h"
58 #include "structmember.h" /* offsetof */
60 #include "sre.h"
62 #include <ctype.h>
64 /* name of this module, minus the leading underscore */
65 #if !defined(SRE_MODULE)
66 #define SRE_MODULE "sre"
67 #endif
69 /* defining this one enables tracing */
70 #undef VERBOSE
72 #if PY_VERSION_HEX >= 0x01060000
73 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
74 /* defining this enables unicode support (default under 1.6a1 and later) */
75 #define HAVE_UNICODE
76 #endif
77 #endif
79 /* -------------------------------------------------------------------- */
80 /* optional features */
82 /* prevent run-away recursion (bad patterns on long strings) */
84 #if !defined(USE_STACKCHECK)
85 #if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
86 /* require smaller recursion limit for a number of 64-bit platforms:
87 Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
88 /* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
89 #define USE_RECURSION_LIMIT 7500
90 #else
91 #define USE_RECURSION_LIMIT 10000
92 #endif
93 #endif
95 /* enables fast searching */
96 #define USE_FAST_SEARCH
98 /* enables aggressive inlining (always on for Visual C) */
99 #undef USE_INLINE
101 /* enables copy/deepcopy handling (work in progress) */
102 #undef USE_BUILTIN_COPY
104 #if PY_VERSION_HEX < 0x01060000
105 #define PyObject_DEL(op) PyMem_DEL((op))
106 #endif
108 /* -------------------------------------------------------------------- */
110 #if defined(_MSC_VER)
111 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
112 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
113 /* fastest possible local call under MSVC */
114 #define LOCAL(type) static __inline type __fastcall
115 #elif defined(USE_INLINE)
116 #define LOCAL(type) static inline type
117 #else
118 #define LOCAL(type) static type
119 #endif
121 /* error codes */
122 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
123 #define SRE_ERROR_STATE -2 /* illegal state */
124 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
125 #define SRE_ERROR_MEMORY -9 /* out of memory */
127 #if defined(VERBOSE)
128 #define TRACE(v) printf v
129 #else
130 #define TRACE(v)
131 #endif
133 /* -------------------------------------------------------------------- */
134 /* search engine state */
136 /* default character predicates (run sre_chars.py to regenerate tables) */
138 #define SRE_DIGIT_MASK 1
139 #define SRE_SPACE_MASK 2
140 #define SRE_LINEBREAK_MASK 4
141 #define SRE_ALNUM_MASK 8
142 #define SRE_WORD_MASK 16
144 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
146 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
147 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
148 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
149 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
150 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
151 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
152 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
154 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
155 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
156 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
157 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
158 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
159 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
160 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
161 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
162 120, 121, 122, 123, 124, 125, 126, 127 };
164 #define SRE_IS_DIGIT(ch)\
165 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
166 #define SRE_IS_SPACE(ch)\
167 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
168 #define SRE_IS_LINEBREAK(ch)\
169 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
170 #define SRE_IS_ALNUM(ch)\
171 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
172 #define SRE_IS_WORD(ch)\
173 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
175 static unsigned int sre_lower(unsigned int ch)
177 return ((ch) < 128 ? sre_char_lower[ch] : ch);
180 /* locale-specific character predicates */
182 #define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
183 #define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
184 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
185 #define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
186 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
188 static unsigned int sre_lower_locale(unsigned int ch)
190 return ((ch) < 256 ? tolower((ch)) : ch);
193 /* unicode-specific character predicates */
195 #if defined(HAVE_UNICODE)
197 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
198 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
199 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
200 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
201 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
203 static unsigned int sre_lower_unicode(unsigned int ch)
205 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
208 #endif
210 LOCAL(int)
211 sre_category(SRE_CODE category, unsigned int ch)
213 switch (category) {
215 case SRE_CATEGORY_DIGIT:
216 return SRE_IS_DIGIT(ch);
217 case SRE_CATEGORY_NOT_DIGIT:
218 return !SRE_IS_DIGIT(ch);
219 case SRE_CATEGORY_SPACE:
220 return SRE_IS_SPACE(ch);
221 case SRE_CATEGORY_NOT_SPACE:
222 return !SRE_IS_SPACE(ch);
223 case SRE_CATEGORY_WORD:
224 return SRE_IS_WORD(ch);
225 case SRE_CATEGORY_NOT_WORD:
226 return !SRE_IS_WORD(ch);
227 case SRE_CATEGORY_LINEBREAK:
228 return SRE_IS_LINEBREAK(ch);
229 case SRE_CATEGORY_NOT_LINEBREAK:
230 return !SRE_IS_LINEBREAK(ch);
232 case SRE_CATEGORY_LOC_WORD:
233 return SRE_LOC_IS_WORD(ch);
234 case SRE_CATEGORY_LOC_NOT_WORD:
235 return !SRE_LOC_IS_WORD(ch);
237 #if defined(HAVE_UNICODE)
238 case SRE_CATEGORY_UNI_DIGIT:
239 return SRE_UNI_IS_DIGIT(ch);
240 case SRE_CATEGORY_UNI_NOT_DIGIT:
241 return !SRE_UNI_IS_DIGIT(ch);
242 case SRE_CATEGORY_UNI_SPACE:
243 return SRE_UNI_IS_SPACE(ch);
244 case SRE_CATEGORY_UNI_NOT_SPACE:
245 return !SRE_UNI_IS_SPACE(ch);
246 case SRE_CATEGORY_UNI_WORD:
247 return SRE_UNI_IS_WORD(ch);
248 case SRE_CATEGORY_UNI_NOT_WORD:
249 return !SRE_UNI_IS_WORD(ch);
250 case SRE_CATEGORY_UNI_LINEBREAK:
251 return SRE_UNI_IS_LINEBREAK(ch);
252 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
253 return !SRE_UNI_IS_LINEBREAK(ch);
254 #else
255 case SRE_CATEGORY_UNI_DIGIT:
256 return SRE_IS_DIGIT(ch);
257 case SRE_CATEGORY_UNI_NOT_DIGIT:
258 return !SRE_IS_DIGIT(ch);
259 case SRE_CATEGORY_UNI_SPACE:
260 return SRE_IS_SPACE(ch);
261 case SRE_CATEGORY_UNI_NOT_SPACE:
262 return !SRE_IS_SPACE(ch);
263 case SRE_CATEGORY_UNI_WORD:
264 return SRE_LOC_IS_WORD(ch);
265 case SRE_CATEGORY_UNI_NOT_WORD:
266 return !SRE_LOC_IS_WORD(ch);
267 case SRE_CATEGORY_UNI_LINEBREAK:
268 return SRE_IS_LINEBREAK(ch);
269 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
270 return !SRE_IS_LINEBREAK(ch);
271 #endif
273 return 0;
276 /* helpers */
278 static void
279 mark_fini(SRE_STATE* state)
281 if (state->mark_stack) {
282 free(state->mark_stack);
283 state->mark_stack = NULL;
285 state->mark_stack_size = state->mark_stack_base = 0;
288 static int
289 mark_save(SRE_STATE* state, int lo, int hi)
291 void* stack;
292 int size;
293 int minsize, newsize;
295 if (hi <= lo)
296 return 0;
298 size = (hi - lo) + 1;
300 newsize = state->mark_stack_size;
301 minsize = state->mark_stack_base + size;
303 if (newsize < minsize) {
304 /* create new stack */
305 if (!newsize) {
306 newsize = 512;
307 if (newsize < minsize)
308 newsize = minsize;
309 TRACE(("allocate stack %d\n", newsize));
310 stack = malloc(sizeof(void*) * newsize);
311 } else {
312 /* grow the stack */
313 while (newsize < minsize)
314 newsize += newsize;
315 TRACE(("grow stack to %d\n", newsize));
316 stack = realloc(state->mark_stack, sizeof(void*) * newsize);
318 if (!stack) {
319 mark_fini(state);
320 return SRE_ERROR_MEMORY;
322 state->mark_stack = stack;
323 state->mark_stack_size = newsize;
326 TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
328 memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
329 size * sizeof(void*));
331 state->mark_stack_base += size;
333 return 0;
336 static int
337 mark_restore(SRE_STATE* state, int lo, int hi)
339 int size;
341 if (hi <= lo)
342 return 0;
344 size = (hi - lo) + 1;
346 state->mark_stack_base -= size;
348 TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
350 memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
351 size * sizeof(void*));
353 return 0;
356 /* generate 8-bit version */
358 #define SRE_CHAR unsigned char
359 #define SRE_AT sre_at
360 #define SRE_COUNT sre_count
361 #define SRE_CHARSET sre_charset
362 #define SRE_INFO sre_info
363 #define SRE_MATCH sre_match
364 #define SRE_SEARCH sre_search
365 #define SRE_LITERAL_TEMPLATE sre_literal_template
367 #if defined(HAVE_UNICODE)
369 #define SRE_RECURSIVE
370 #include "_sre.c"
371 #undef SRE_RECURSIVE
373 #undef SRE_LITERAL_TEMPLATE
374 #undef SRE_SEARCH
375 #undef SRE_MATCH
376 #undef SRE_INFO
377 #undef SRE_CHARSET
378 #undef SRE_COUNT
379 #undef SRE_AT
380 #undef SRE_CHAR
382 /* generate 16-bit unicode version */
384 #define SRE_CHAR Py_UNICODE
385 #define SRE_AT sre_uat
386 #define SRE_COUNT sre_ucount
387 #define SRE_CHARSET sre_ucharset
388 #define SRE_INFO sre_uinfo
389 #define SRE_MATCH sre_umatch
390 #define SRE_SEARCH sre_usearch
391 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
392 #endif
394 #endif /* SRE_RECURSIVE */
396 /* -------------------------------------------------------------------- */
397 /* String matching engine */
399 /* the following section is compiled twice, with different character
400 settings */
402 LOCAL(int)
403 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
405 /* check if pointer is at given position */
407 int this, that;
409 switch (at) {
411 case SRE_AT_BEGINNING:
412 case SRE_AT_BEGINNING_STRING:
413 return ((void*) ptr == state->beginning);
415 case SRE_AT_BEGINNING_LINE:
416 return ((void*) ptr == state->beginning ||
417 SRE_IS_LINEBREAK((int) ptr[-1]));
419 case SRE_AT_END:
420 return (((void*) (ptr+1) == state->end &&
421 SRE_IS_LINEBREAK((int) ptr[0])) ||
422 ((void*) ptr == state->end));
424 case SRE_AT_END_LINE:
425 return ((void*) ptr == state->end ||
426 SRE_IS_LINEBREAK((int) ptr[0]));
428 case SRE_AT_END_STRING:
429 return ((void*) ptr == state->end);
431 case SRE_AT_BOUNDARY:
432 if (state->beginning == state->end)
433 return 0;
434 that = ((void*) ptr > state->beginning) ?
435 SRE_IS_WORD((int) ptr[-1]) : 0;
436 this = ((void*) ptr < state->end) ?
437 SRE_IS_WORD((int) ptr[0]) : 0;
438 return this != that;
440 case SRE_AT_NON_BOUNDARY:
441 if (state->beginning == state->end)
442 return 0;
443 that = ((void*) ptr > state->beginning) ?
444 SRE_IS_WORD((int) ptr[-1]) : 0;
445 this = ((void*) ptr < state->end) ?
446 SRE_IS_WORD((int) ptr[0]) : 0;
447 return this == that;
449 case SRE_AT_LOC_BOUNDARY:
450 if (state->beginning == state->end)
451 return 0;
452 that = ((void*) ptr > state->beginning) ?
453 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
454 this = ((void*) ptr < state->end) ?
455 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
456 return this != that;
458 case SRE_AT_LOC_NON_BOUNDARY:
459 if (state->beginning == state->end)
460 return 0;
461 that = ((void*) ptr > state->beginning) ?
462 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
463 this = ((void*) ptr < state->end) ?
464 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
465 return this == that;
467 #if defined(HAVE_UNICODE)
468 case SRE_AT_UNI_BOUNDARY:
469 if (state->beginning == state->end)
470 return 0;
471 that = ((void*) ptr > state->beginning) ?
472 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
473 this = ((void*) ptr < state->end) ?
474 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
475 return this != that;
477 case SRE_AT_UNI_NON_BOUNDARY:
478 if (state->beginning == state->end)
479 return 0;
480 that = ((void*) ptr > state->beginning) ?
481 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
482 this = ((void*) ptr < state->end) ?
483 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
484 return this == that;
485 #endif
489 return 0;
492 LOCAL(int)
493 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
495 /* check if character is a member of the given set */
497 int ok = 1;
499 for (;;) {
500 switch (*set++) {
502 case SRE_OP_LITERAL:
503 /* <LITERAL> <code> */
504 if (ch == set[0])
505 return ok;
506 set++;
507 break;
509 case SRE_OP_RANGE:
510 /* <RANGE> <lower> <upper> */
511 if (set[0] <= ch && ch <= set[1])
512 return ok;
513 set += 2;
514 break;
516 case SRE_OP_CHARSET:
517 /* <CHARSET> <bitmap> (16 bits per code word) */
518 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
519 return ok;
520 set += 16;
521 break;
523 case SRE_OP_BIGCHARSET:
524 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
526 int count, block;
527 count = *(set++);
528 block = ((unsigned char*)set)[ch >> 8];
529 set += 128;
530 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
531 return ok;
532 set += count*16;
533 break;
536 case SRE_OP_CATEGORY:
537 /* <CATEGORY> <code> */
538 if (sre_category(set[0], (int) ch))
539 return ok;
540 set += 1;
541 break;
543 case SRE_OP_NEGATE:
544 ok = !ok;
545 break;
547 case SRE_OP_FAILURE:
548 return !ok;
550 default:
551 /* internal error -- there's not much we can do about it
552 here, so let's just pretend it didn't match... */
553 return 0;
558 LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
560 LOCAL(int)
561 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
563 SRE_CODE chr;
564 SRE_CHAR* ptr = state->ptr;
565 SRE_CHAR* end = state->end;
566 int i;
568 /* adjust end */
569 if (maxcount < end - ptr && maxcount != 65535)
570 end = ptr + maxcount;
572 switch (pattern[0]) {
574 case SRE_OP_ANY:
575 /* repeated dot wildcard. */
576 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
577 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
578 ptr++;
579 break;
581 case SRE_OP_ANY_ALL:
582 /* repeated dot wildcare. skip to the end of the target
583 string, and backtrack from there */
584 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
585 ptr = end;
586 break;
588 case SRE_OP_LITERAL:
589 /* repeated literal */
590 chr = pattern[1];
591 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
592 while (ptr < end && (SRE_CODE) *ptr == chr)
593 ptr++;
594 break;
596 case SRE_OP_LITERAL_IGNORE:
597 /* repeated literal */
598 chr = pattern[1];
599 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
600 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
601 ptr++;
602 break;
604 case SRE_OP_NOT_LITERAL:
605 /* repeated non-literal */
606 chr = pattern[1];
607 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
608 while (ptr < end && (SRE_CODE) *ptr != chr)
609 ptr++;
610 break;
612 case SRE_OP_NOT_LITERAL_IGNORE:
613 /* repeated non-literal */
614 chr = pattern[1];
615 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
616 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
617 ptr++;
618 break;
620 case SRE_OP_IN:
621 /* repeated set */
622 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
623 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
624 ptr++;
625 break;
627 default:
628 /* repeated single character pattern */
629 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
630 while ((SRE_CHAR*) state->ptr < end) {
631 i = SRE_MATCH(state, pattern, level);
632 if (i < 0)
633 return i;
634 if (!i)
635 break;
637 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
638 (SRE_CHAR*) state->ptr - ptr));
639 return (SRE_CHAR*) state->ptr - ptr;
642 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
643 return ptr - (SRE_CHAR*) state->ptr;
646 #if 0 /* not used in this release */
647 LOCAL(int)
648 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
650 /* check if an SRE_OP_INFO block matches at the current position.
651 returns the number of SRE_CODE objects to skip if successful, 0
652 if no match */
654 SRE_CHAR* end = state->end;
655 SRE_CHAR* ptr = state->ptr;
656 int i;
658 /* check minimal length */
659 if (pattern[3] && (end - ptr) < pattern[3])
660 return 0;
662 /* check known prefix */
663 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
664 /* <length> <skip> <prefix data> <overlap data> */
665 for (i = 0; i < pattern[5]; i++)
666 if ((SRE_CODE) ptr[i] != pattern[7 + i])
667 return 0;
668 return pattern[0] + 2 * pattern[6];
670 return pattern[0];
672 #endif
674 LOCAL(int)
675 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
677 /* check if string matches the given pattern. returns <0 for
678 error, 0 for failure, and 1 for success */
680 SRE_CHAR* end = state->end;
681 SRE_CHAR* ptr = state->ptr;
682 int i, count;
683 SRE_REPEAT* rp;
684 int lastmark;
685 SRE_CODE chr;
687 SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
689 TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level));
691 #if defined(USE_STACKCHECK)
692 if (level % 10 == 0 && PyOS_CheckStack())
693 return SRE_ERROR_RECURSION_LIMIT;
694 #endif
696 #if defined(USE_RECURSION_LIMIT)
697 if (level > USE_RECURSION_LIMIT)
698 return SRE_ERROR_RECURSION_LIMIT;
699 #endif
701 if (pattern[0] == SRE_OP_INFO) {
702 /* optimization info block */
703 /* <INFO> <1=skip> <2=flags> <3=min> ... */
704 if (pattern[3] && (end - ptr) < pattern[3]) {
705 TRACE(("reject (got %d chars, need %d)\n",
706 (end - ptr), pattern[3]));
707 return 0;
709 pattern += pattern[1] + 1;
712 for (;;) {
714 switch (*pattern++) {
716 case SRE_OP_FAILURE:
717 /* immediate failure */
718 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
719 return 0;
721 case SRE_OP_SUCCESS:
722 /* end of pattern */
723 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
724 state->ptr = ptr;
725 return 1;
727 case SRE_OP_AT:
728 /* match at given position */
729 /* <AT> <code> */
730 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
731 if (!SRE_AT(state, ptr, *pattern))
732 return 0;
733 pattern++;
734 break;
736 case SRE_OP_CATEGORY:
737 /* match at given category */
738 /* <CATEGORY> <code> */
739 TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern));
740 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
741 return 0;
742 pattern++;
743 ptr++;
744 break;
746 case SRE_OP_LITERAL:
747 /* match literal string */
748 /* <LITERAL> <code> */
749 TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
750 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
751 return 0;
752 pattern++;
753 ptr++;
754 break;
756 case SRE_OP_NOT_LITERAL:
757 /* match anything that is not literal character */
758 /* <NOT_LITERAL> <code> */
759 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
760 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
761 return 0;
762 pattern++;
763 ptr++;
764 break;
766 case SRE_OP_ANY:
767 /* match anything (except a newline) */
768 /* <ANY> */
769 TRACE(("|%p|%p|ANY\n", pattern, ptr));
770 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
771 return 0;
772 ptr++;
773 break;
775 case SRE_OP_ANY_ALL:
776 /* match anything */
777 /* <ANY_ALL> */
778 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
779 if (ptr >= end)
780 return 0;
781 ptr++;
782 break;
784 case SRE_OP_IN:
785 /* match set member (or non_member) */
786 /* <IN> <skip> <set> */
787 TRACE(("|%p|%p|IN\n", pattern, ptr));
788 if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr))
789 return 0;
790 pattern += pattern[0];
791 ptr++;
792 break;
794 case SRE_OP_GROUPREF:
795 /* match backreference */
796 TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
797 i = pattern[0];
799 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
800 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
801 if (!p || !e || e < p)
802 return 0;
803 while (p < e) {
804 if (ptr >= end || *ptr != *p)
805 return 0;
806 p++; ptr++;
809 pattern++;
810 break;
812 case SRE_OP_GROUPREF_IGNORE:
813 /* match backreference */
814 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
815 i = pattern[0];
817 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
818 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
819 if (!p || !e || e < p)
820 return 0;
821 while (p < e) {
822 if (ptr >= end ||
823 state->lower(*ptr) != state->lower(*p))
824 return 0;
825 p++; ptr++;
828 pattern++;
829 break;
831 case SRE_OP_LITERAL_IGNORE:
832 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0]));
833 if (ptr >= end ||
834 state->lower(*ptr) != state->lower(*pattern))
835 return 0;
836 pattern++;
837 ptr++;
838 break;
840 case SRE_OP_NOT_LITERAL_IGNORE:
841 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern));
842 if (ptr >= end ||
843 state->lower(*ptr) == state->lower(*pattern))
844 return 0;
845 pattern++;
846 ptr++;
847 break;
849 case SRE_OP_IN_IGNORE:
850 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
851 if (ptr >= end
852 || !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr)))
853 return 0;
854 pattern += pattern[0];
855 ptr++;
856 break;
858 case SRE_OP_MARK:
859 /* set mark */
860 /* <MARK> <gid> */
861 TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
862 i = pattern[0];
863 if (i & 1)
864 state->lastindex = i/2 + 1;
865 if (i > state->lastmark)
866 state->lastmark = i;
867 state->mark[i] = ptr;
868 pattern++;
869 break;
871 case SRE_OP_JUMP:
872 case SRE_OP_INFO:
873 /* jump forward */
874 /* <JUMP> <offset> */
875 TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0]));
876 pattern += pattern[0];
877 break;
879 case SRE_OP_ASSERT:
880 /* assert subpattern */
881 /* <ASSERT> <skip> <back> <pattern> */
882 TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
883 state->ptr = ptr - pattern[1];
884 if (state->ptr < state->beginning)
885 return 0;
886 i = SRE_MATCH(state, pattern + 2, level + 1);
887 if (i <= 0)
888 return i;
889 pattern += pattern[0];
890 break;
892 case SRE_OP_ASSERT_NOT:
893 /* assert not subpattern */
894 /* <ASSERT_NOT> <skip> <back> <pattern> */
895 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
896 state->ptr = ptr - pattern[1];
897 if (state->ptr >= state->beginning) {
898 i = SRE_MATCH(state, pattern + 2, level + 1);
899 if (i < 0)
900 return i;
901 if (i)
902 return 0;
904 pattern += pattern[0];
905 break;
907 case SRE_OP_BRANCH:
908 /* alternation */
909 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
910 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
911 lastmark = state->lastmark;
912 for (; pattern[0]; pattern += pattern[0]) {
913 if (pattern[1] == SRE_OP_LITERAL &&
914 (ptr >= end || (SRE_CODE) *ptr != pattern[2]))
915 continue;
916 if (pattern[1] == SRE_OP_IN &&
917 (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr)))
918 continue;
919 state->ptr = ptr;
920 i = SRE_MATCH(state, pattern + 1, level + 1);
921 if (i)
922 return i;
923 if (state->lastmark > lastmark) {
924 memset(
925 state->mark + lastmark + 1, 0,
926 (state->lastmark - lastmark) * sizeof(void*)
928 state->lastmark = lastmark;
931 return 0;
933 case SRE_OP_REPEAT_ONE:
934 /* match repeated sequence (maximizing regexp) */
936 /* this operator only works if the repeated item is
937 exactly one character wide, and we're not already
938 collecting backtracking points. for other cases,
939 use the MAX_REPEAT operator */
941 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
943 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
944 pattern[1], pattern[2]));
946 if (ptr + pattern[1] > end)
947 return 0; /* cannot match */
949 state->ptr = ptr;
951 count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1);
952 if (count < 0)
953 return count;
955 ptr += count;
957 /* when we arrive here, count contains the number of
958 matches, and ptr points to the tail of the target
959 string. check if the rest of the pattern matches,
960 and backtrack if not. */
962 if (count < (int) pattern[1])
963 return 0;
965 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
966 /* tail is empty. we're finished */
967 state->ptr = ptr;
968 return 1;
970 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
971 /* tail starts with a literal. skip positions where
972 the rest of the pattern cannot possibly match */
973 chr = pattern[pattern[0]+1];
974 for (;;) {
975 while (count >= (int) pattern[1] &&
976 (ptr >= end || *ptr != chr)) {
977 ptr--;
978 count--;
980 if (count < (int) pattern[1])
981 break;
982 state->ptr = ptr;
983 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
984 if (i)
985 return i;
986 ptr--;
987 count--;
990 } else {
991 /* general case */
992 lastmark = state->lastmark;
993 while (count >= (int) pattern[1]) {
994 state->ptr = ptr;
995 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
996 if (i)
997 return i;
998 ptr--;
999 count--;
1000 if (state->lastmark > lastmark) {
1001 memset(
1002 state->mark + lastmark + 1, 0,
1003 (state->lastmark - lastmark) * sizeof(void*)
1005 state->lastmark = lastmark;
1009 return 0;
1011 case SRE_OP_REPEAT:
1012 /* create repeat context. all the hard work is done
1013 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1014 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1015 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
1016 pattern[1], pattern[2]));
1018 rep.count = -1;
1019 rep.pattern = pattern;
1021 /* install new repeat context */
1022 rep.prev = state->repeat;
1023 state->repeat = &rep;
1025 state->ptr = ptr;
1026 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
1028 state->repeat = rep.prev;
1030 return i;
1032 case SRE_OP_MAX_UNTIL:
1033 /* maximizing repeat */
1034 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1036 /* FIXME: we probably need to deal with zero-width
1037 matches in here... */
1039 rp = state->repeat;
1040 if (!rp)
1041 return SRE_ERROR_STATE;
1043 state->ptr = ptr;
1045 count = rp->count + 1;
1047 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count));
1049 if (count < rp->pattern[1]) {
1050 /* not enough matches */
1051 rp->count = count;
1052 /* RECURSIVE */
1053 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1054 if (i)
1055 return i;
1056 rp->count = count - 1;
1057 state->ptr = ptr;
1058 return 0;
1061 if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
1062 /* we may have enough matches, but if we can
1063 match another item, do so */
1064 rp->count = count;
1065 lastmark = state->lastmark;
1066 i = mark_save(state, 0, lastmark);
1067 if (i < 0)
1068 return i;
1069 /* RECURSIVE */
1070 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1071 if (i)
1072 return i;
1073 i = mark_restore(state, 0, lastmark);
1074 state->lastmark = lastmark;
1075 if (i < 0)
1076 return i;
1077 rp->count = count - 1;
1078 state->ptr = ptr;
1081 /* cannot match more repeated items here. make sure the
1082 tail matches */
1083 state->repeat = rp->prev;
1084 i = SRE_MATCH(state, pattern, level + 1);
1085 if (i)
1086 return i;
1087 state->repeat = rp;
1088 state->ptr = ptr;
1089 return 0;
1091 case SRE_OP_MIN_UNTIL:
1092 /* minimizing repeat */
1093 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1095 rp = state->repeat;
1096 if (!rp)
1097 return SRE_ERROR_STATE;
1099 count = rp->count + 1;
1101 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
1102 rp->pattern));
1104 state->ptr = ptr;
1106 if (count < rp->pattern[1]) {
1107 /* not enough matches */
1108 rp->count = count;
1109 /* RECURSIVE */
1110 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1111 if (i)
1112 return i;
1113 rp->count = count-1;
1114 state->ptr = ptr;
1115 return 0;
1118 /* see if the tail matches */
1119 state->repeat = rp->prev;
1120 i = SRE_MATCH(state, pattern, level + 1);
1121 if (i)
1122 return i;
1124 state->ptr = ptr;
1125 state->repeat = rp;
1127 if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
1128 return 0;
1130 rp->count = count;
1131 /* RECURSIVE */
1132 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1133 if (i)
1134 return i;
1135 rp->count = count - 1;
1136 state->ptr = ptr;
1137 return 0;
1139 default:
1140 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1]));
1141 return SRE_ERROR_ILLEGAL;
1145 /* can't end up here */
1146 /* return SRE_ERROR_ILLEGAL; -- see python-dev discussion */
1149 LOCAL(int)
1150 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1152 SRE_CHAR* ptr = state->start;
1153 SRE_CHAR* end = state->end;
1154 int status = 0;
1155 int prefix_len = 0;
1156 int prefix_skip = 0;
1157 SRE_CODE* prefix = NULL;
1158 SRE_CODE* charset = NULL;
1159 SRE_CODE* overlap = NULL;
1160 int flags = 0;
1162 if (pattern[0] == SRE_OP_INFO) {
1163 /* optimization info block */
1164 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1166 flags = pattern[2];
1168 if (pattern[3] > 0) {
1169 /* adjust end point (but make sure we leave at least one
1170 character in there, so literal search will work) */
1171 end -= pattern[3]-1;
1172 if (end <= ptr)
1173 end = ptr+1;
1176 if (flags & SRE_INFO_PREFIX) {
1177 /* pattern starts with a known prefix */
1178 /* <length> <skip> <prefix data> <overlap data> */
1179 prefix_len = pattern[5];
1180 prefix_skip = pattern[6];
1181 prefix = pattern + 7;
1182 overlap = prefix + prefix_len - 1;
1183 } else if (flags & SRE_INFO_CHARSET)
1184 /* pattern starts with a character from a known set */
1185 /* <charset> */
1186 charset = pattern + 5;
1188 pattern += 1 + pattern[1];
1191 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1192 TRACE(("charset = %p\n", charset));
1194 #if defined(USE_FAST_SEARCH)
1195 if (prefix_len > 1) {
1196 /* pattern starts with a known prefix. use the overlap
1197 table to skip forward as fast as we possibly can */
1198 int i = 0;
1199 end = state->end;
1200 while (ptr < end) {
1201 for (;;) {
1202 if ((SRE_CODE) ptr[0] != prefix[i]) {
1203 if (!i)
1204 break;
1205 else
1206 i = overlap[i];
1207 } else {
1208 if (++i == prefix_len) {
1209 /* found a potential match */
1210 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1211 state->start = ptr + 1 - prefix_len;
1212 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1213 if (flags & SRE_INFO_LITERAL)
1214 return 1; /* we got all of it */
1215 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
1216 if (status != 0)
1217 return status;
1218 /* close but no cigar -- try again */
1219 i = overlap[i];
1221 break;
1225 ptr++;
1227 return 0;
1229 #endif
1231 if (pattern[0] == SRE_OP_LITERAL) {
1232 /* pattern starts with a literal character. this is used
1233 for short prefixes, and if fast search is disabled */
1234 SRE_CODE chr = pattern[1];
1235 end = state->end;
1236 for (;;) {
1237 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1238 ptr++;
1239 if (ptr == end)
1240 return 0;
1241 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1242 state->start = ptr;
1243 state->ptr = ++ptr;
1244 if (flags & SRE_INFO_LITERAL)
1245 return 1; /* we got all of it */
1246 status = SRE_MATCH(state, pattern + 2, 1);
1247 if (status != 0)
1248 break;
1250 } else if (charset) {
1251 /* pattern starts with a character from a known set */
1252 end = state->end;
1253 for (;;) {
1254 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1255 ptr++;
1256 if (ptr == end)
1257 return 0;
1258 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1259 state->start = ptr;
1260 state->ptr = ptr;
1261 status = SRE_MATCH(state, pattern, 1);
1262 if (status != 0)
1263 break;
1264 ptr++;
1266 } else
1267 /* general case */
1268 while (ptr <= end) {
1269 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1270 state->start = state->ptr = ptr++;
1271 status = SRE_MATCH(state, pattern, 1);
1272 if (status != 0)
1273 break;
1276 return status;
1279 LOCAL(int)
1280 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1282 /* check if given string is a literal template (i.e. no escapes) */
1283 while (len-- > 0)
1284 if (*ptr++ == '\\')
1285 return 0;
1286 return 1;
1289 #if !defined(SRE_RECURSIVE)
1291 /* -------------------------------------------------------------------- */
1292 /* factories and destructors */
1294 /* see sre.h for object declarations */
1296 staticforward PyTypeObject Pattern_Type;
1297 staticforward PyTypeObject Match_Type;
1298 staticforward PyTypeObject Scanner_Type;
1300 static PyObject *
1301 _compile(PyObject* self_, PyObject* args)
1303 /* "compile" pattern descriptor to pattern object */
1305 PatternObject* self;
1306 int i, n;
1308 PyObject* pattern;
1309 int flags = 0;
1310 PyObject* code;
1311 int groups = 0;
1312 PyObject* groupindex = NULL;
1313 PyObject* indexgroup = NULL;
1314 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1315 &PyList_Type, &code, &groups,
1316 &groupindex, &indexgroup))
1317 return NULL;
1319 n = PyList_GET_SIZE(code);
1321 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
1322 if (!self)
1323 return NULL;
1325 self->codesize = n;
1327 for (i = 0; i < n; i++) {
1328 PyObject *o = PyList_GET_ITEM(code, i);
1329 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1332 if (PyErr_Occurred()) {
1333 PyObject_DEL(self);
1334 return NULL;
1337 Py_INCREF(pattern);
1338 self->pattern = pattern;
1340 self->flags = flags;
1342 self->groups = groups;
1344 Py_XINCREF(groupindex);
1345 self->groupindex = groupindex;
1347 Py_XINCREF(indexgroup);
1348 self->indexgroup = indexgroup;
1350 return (PyObject*) self;
1353 static PyObject *
1354 sre_codesize(PyObject* self, PyObject* args)
1356 return Py_BuildValue("i", sizeof(SRE_CODE));
1359 static PyObject *
1360 sre_getlower(PyObject* self, PyObject* args)
1362 int character, flags;
1363 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1364 return NULL;
1365 if (flags & SRE_FLAG_LOCALE)
1366 return Py_BuildValue("i", sre_lower_locale(character));
1367 if (flags & SRE_FLAG_UNICODE)
1368 #if defined(HAVE_UNICODE)
1369 return Py_BuildValue("i", sre_lower_unicode(character));
1370 #else
1371 return Py_BuildValue("i", sre_lower_locale(character));
1372 #endif
1373 return Py_BuildValue("i", sre_lower(character));
1376 LOCAL(void)
1377 state_reset(SRE_STATE* state)
1379 int i;
1381 state->lastmark = 0;
1383 /* FIXME: dynamic! */
1384 for (i = 0; i < SRE_MARK_SIZE; i++)
1385 state->mark[i] = NULL;
1387 state->lastindex = -1;
1389 state->repeat = NULL;
1391 mark_fini(state);
1394 static void*
1395 getstring(PyObject* string, int* p_length, int* p_charsize)
1397 /* given a python object, return a data pointer, a length (in
1398 characters), and a character size. return NULL if the object
1399 is not a string (or not compatible) */
1401 PyBufferProcs *buffer;
1402 int size, bytes, charsize;
1403 void* ptr;
1405 #if defined(HAVE_UNICODE)
1406 if (PyUnicode_Check(string)) {
1407 /* unicode strings doesn't always support the buffer interface */
1408 ptr = (void*) PyUnicode_AS_DATA(string);
1409 bytes = PyUnicode_GET_DATA_SIZE(string);
1410 size = PyUnicode_GET_SIZE(string);
1411 charsize = sizeof(Py_UNICODE);
1413 } else {
1414 #endif
1416 /* get pointer to string buffer */
1417 buffer = string->ob_type->tp_as_buffer;
1418 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1419 buffer->bf_getsegcount(string, NULL) != 1) {
1420 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1421 return NULL;
1424 /* determine buffer size */
1425 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1426 if (bytes < 0) {
1427 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1428 return NULL;
1431 /* determine character size */
1432 #if PY_VERSION_HEX >= 0x01060000
1433 size = PyObject_Size(string);
1434 #else
1435 size = PyObject_Length(string);
1436 #endif
1438 if (PyString_Check(string) || bytes == size)
1439 charsize = 1;
1440 #if defined(HAVE_UNICODE)
1441 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1442 charsize = sizeof(Py_UNICODE);
1443 #endif
1444 else {
1445 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1446 return NULL;
1449 #if defined(HAVE_UNICODE)
1451 #endif
1453 *p_length = size;
1454 *p_charsize = charsize;
1456 return ptr;
1459 LOCAL(PyObject*)
1460 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1461 int start, int end)
1463 /* prepare state object */
1465 int length;
1466 int charsize;
1467 void* ptr;
1469 memset(state, 0, sizeof(SRE_STATE));
1471 state->lastindex = -1;
1473 ptr = getstring(string, &length, &charsize);
1474 if (!ptr)
1475 return NULL;
1477 /* adjust boundaries */
1478 if (start < 0)
1479 start = 0;
1480 else if (start > length)
1481 start = length;
1483 if (end < 0)
1484 end = 0;
1485 else if (end > length)
1486 end = length;
1488 state->charsize = charsize;
1490 state->beginning = ptr;
1492 state->start = (void*) ((char*) ptr + start * state->charsize);
1493 state->end = (void*) ((char*) ptr + end * state->charsize);
1495 Py_INCREF(string);
1496 state->string = string;
1497 state->pos = start;
1498 state->endpos = end;
1500 if (pattern->flags & SRE_FLAG_LOCALE)
1501 state->lower = sre_lower_locale;
1502 else if (pattern->flags & SRE_FLAG_UNICODE)
1503 #if defined(HAVE_UNICODE)
1504 state->lower = sre_lower_unicode;
1505 #else
1506 state->lower = sre_lower_locale;
1507 #endif
1508 else
1509 state->lower = sre_lower;
1511 return string;
1514 LOCAL(void)
1515 state_fini(SRE_STATE* state)
1517 Py_XDECREF(state->string);
1518 mark_fini(state);
1521 /* calculate offset from start of string */
1522 #define STATE_OFFSET(state, member)\
1523 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1525 LOCAL(PyObject*)
1526 state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
1528 int i, j;
1530 index = (index - 1) * 2;
1532 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1533 if (empty)
1534 /* want empty string */
1535 i = j = 0;
1536 else {
1537 Py_INCREF(Py_None);
1538 return Py_None;
1540 } else {
1541 i = STATE_OFFSET(state, state->mark[index]);
1542 j = STATE_OFFSET(state, state->mark[index+1]);
1545 return PySequence_GetSlice(string, i, j);
1548 static void
1549 pattern_error(int status)
1551 switch (status) {
1552 case SRE_ERROR_RECURSION_LIMIT:
1553 PyErr_SetString(
1554 PyExc_RuntimeError,
1555 "maximum recursion limit exceeded"
1557 break;
1558 case SRE_ERROR_MEMORY:
1559 PyErr_NoMemory();
1560 break;
1561 default:
1562 /* other error codes indicate compiler/engine bugs */
1563 PyErr_SetString(
1564 PyExc_RuntimeError,
1565 "internal error in regular expression engine"
1570 static PyObject*
1571 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
1573 /* create match object (from state object) */
1575 MatchObject* match;
1576 int i, j;
1577 char* base;
1578 int n;
1580 if (status > 0) {
1582 /* create match object (with room for extra group marks) */
1583 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
1584 2*(pattern->groups+1));
1585 if (!match)
1586 return NULL;
1588 Py_INCREF(pattern);
1589 match->pattern = pattern;
1591 Py_INCREF(state->string);
1592 match->string = state->string;
1594 match->regs = NULL;
1595 match->groups = pattern->groups+1;
1597 /* fill in group slices */
1599 base = (char*) state->beginning;
1600 n = state->charsize;
1602 match->mark[0] = ((char*) state->start - base) / n;
1603 match->mark[1] = ((char*) state->ptr - base) / n;
1605 for (i = j = 0; i < pattern->groups; i++, j+=2)
1606 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1607 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1608 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1609 } else
1610 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1612 match->pos = state->pos;
1613 match->endpos = state->endpos;
1615 match->lastindex = state->lastindex;
1617 return (PyObject*) match;
1619 } else if (status == 0) {
1621 /* no match */
1622 Py_INCREF(Py_None);
1623 return Py_None;
1627 /* internal error */
1628 pattern_error(status);
1629 return NULL;
1632 static PyObject*
1633 pattern_scanner(PatternObject* pattern, PyObject* args)
1635 /* create search state object */
1637 ScannerObject* self;
1639 PyObject* string;
1640 int start = 0;
1641 int end = INT_MAX;
1642 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1643 return NULL;
1645 /* create scanner object */
1646 self = PyObject_NEW(ScannerObject, &Scanner_Type);
1647 if (!self)
1648 return NULL;
1650 string = state_init(&self->state, pattern, string, start, end);
1651 if (!string) {
1652 PyObject_DEL(self);
1653 return NULL;
1656 Py_INCREF(pattern);
1657 self->pattern = (PyObject*) pattern;
1659 return (PyObject*) self;
1662 static void
1663 pattern_dealloc(PatternObject* self)
1665 Py_XDECREF(self->pattern);
1666 Py_XDECREF(self->groupindex);
1667 Py_XDECREF(self->indexgroup);
1668 PyObject_DEL(self);
1671 static PyObject*
1672 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1674 SRE_STATE state;
1675 int status;
1677 PyObject* string;
1678 int start = 0;
1679 int end = INT_MAX;
1680 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1681 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1682 &string, &start, &end))
1683 return NULL;
1685 string = state_init(&state, self, string, start, end);
1686 if (!string)
1687 return NULL;
1689 state.ptr = state.start;
1691 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1693 if (state.charsize == 1) {
1694 status = sre_match(&state, PatternObject_GetCode(self), 1);
1695 } else {
1696 #if defined(HAVE_UNICODE)
1697 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
1698 #endif
1701 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1703 state_fini(&state);
1705 return pattern_new_match(self, &state, status);
1708 static PyObject*
1709 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1711 SRE_STATE state;
1712 int status;
1714 PyObject* string;
1715 int start = 0;
1716 int end = INT_MAX;
1717 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1718 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1719 &string, &start, &end))
1720 return NULL;
1722 string = state_init(&state, self, string, start, end);
1723 if (!string)
1724 return NULL;
1726 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1728 if (state.charsize == 1) {
1729 status = sre_search(&state, PatternObject_GetCode(self));
1730 } else {
1731 #if defined(HAVE_UNICODE)
1732 status = sre_usearch(&state, PatternObject_GetCode(self));
1733 #endif
1736 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1738 state_fini(&state);
1740 return pattern_new_match(self, &state, status);
1743 static PyObject*
1744 call(char* module, char* function, PyObject* args)
1746 PyObject* name;
1747 PyObject* mod;
1748 PyObject* func;
1749 PyObject* result;
1751 if (!args)
1752 return NULL;
1753 name = PyString_FromString(module);
1754 if (!name)
1755 return NULL;
1756 mod = PyImport_Import(name);
1757 Py_DECREF(name);
1758 if (!mod)
1759 return NULL;
1760 func = PyObject_GetAttrString(mod, function);
1761 Py_DECREF(mod);
1762 if (!func)
1763 return NULL;
1764 result = PyObject_CallObject(func, args);
1765 Py_DECREF(func);
1766 Py_DECREF(args);
1767 return result;
1770 #ifdef USE_BUILTIN_COPY
1771 static int
1772 deepcopy(PyObject** object, PyObject* memo)
1774 PyObject* copy;
1776 copy = call(
1777 "copy", "deepcopy",
1778 Py_BuildValue("OO", *object, memo)
1780 if (!copy)
1781 return 0;
1783 Py_DECREF(*object);
1784 *object = copy;
1786 return 1; /* success */
1788 #endif
1790 static PyObject*
1791 join(PyObject* list, PyObject* pattern)
1793 /* join list elements */
1795 PyObject* joiner;
1796 #if PY_VERSION_HEX >= 0x01060000
1797 PyObject* function;
1798 PyObject* args;
1799 #endif
1800 PyObject* result;
1802 switch (PyList_GET_SIZE(list)) {
1803 case 0:
1804 Py_DECREF(list);
1805 return PyString_FromString("");
1806 case 1:
1807 result = PyList_GET_ITEM(list, 0);
1808 Py_INCREF(result);
1809 Py_DECREF(list);
1810 return result;
1813 /* two or more elements: slice out a suitable separator from the
1814 first member, and use that to join the entire list */
1816 joiner = PySequence_GetSlice(pattern, 0, 0);
1817 if (!joiner)
1818 return NULL;
1820 #if PY_VERSION_HEX >= 0x01060000
1821 function = PyObject_GetAttrString(joiner, "join");
1822 if (!function) {
1823 Py_DECREF(joiner);
1824 return NULL;
1826 args = PyTuple_New(1);
1827 if (!args) {
1828 Py_DECREF(function);
1829 Py_DECREF(joiner);
1830 return NULL;
1832 PyTuple_SET_ITEM(args, 0, list);
1833 result = PyObject_CallObject(function, args);
1834 Py_DECREF(args); /* also removes list */
1835 Py_DECREF(function);
1836 #else
1837 result = call(
1838 "string", "join",
1839 Py_BuildValue("OO", list, joiner)
1841 #endif
1842 Py_DECREF(joiner);
1844 return result;
1847 static PyObject*
1848 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
1850 SRE_STATE state;
1851 PyObject* list;
1852 int status;
1853 int i, b, e;
1855 PyObject* string;
1856 int start = 0;
1857 int end = INT_MAX;
1858 static char* kwlist[] = { "source", "pos", "endpos", NULL };
1859 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
1860 &string, &start, &end))
1861 return NULL;
1863 string = state_init(&state, self, string, start, end);
1864 if (!string)
1865 return NULL;
1867 list = PyList_New(0);
1868 if (!list) {
1869 state_fini(&state);
1870 return NULL;
1873 while (state.start <= state.end) {
1875 PyObject* item;
1877 state_reset(&state);
1879 state.ptr = state.start;
1881 if (state.charsize == 1) {
1882 status = sre_search(&state, PatternObject_GetCode(self));
1883 } else {
1884 #if defined(HAVE_UNICODE)
1885 status = sre_usearch(&state, PatternObject_GetCode(self));
1886 #endif
1889 if (status <= 0) {
1890 if (status == 0)
1891 break;
1892 pattern_error(status);
1893 goto error;
1896 /* don't bother to build a match object */
1897 switch (self->groups) {
1898 case 0:
1899 b = STATE_OFFSET(&state, state.start);
1900 e = STATE_OFFSET(&state, state.ptr);
1901 item = PySequence_GetSlice(string, b, e);
1902 if (!item)
1903 goto error;
1904 break;
1905 case 1:
1906 item = state_getslice(&state, 1, string, 1);
1907 if (!item)
1908 goto error;
1909 break;
1910 default:
1911 item = PyTuple_New(self->groups);
1912 if (!item)
1913 goto error;
1914 for (i = 0; i < self->groups; i++) {
1915 PyObject* o = state_getslice(&state, i+1, string, 1);
1916 if (!o) {
1917 Py_DECREF(item);
1918 goto error;
1920 PyTuple_SET_ITEM(item, i, o);
1922 break;
1925 status = PyList_Append(list, item);
1926 Py_DECREF(item);
1927 if (status < 0)
1928 goto error;
1930 if (state.ptr == state.start)
1931 state.start = (void*) ((char*) state.ptr + state.charsize);
1932 else
1933 state.start = state.ptr;
1937 state_fini(&state);
1938 return list;
1940 error:
1941 Py_DECREF(list);
1942 state_fini(&state);
1943 return NULL;
1947 #if PY_VERSION_HEX >= 0x02020000
1948 static PyObject*
1949 pattern_finditer(PatternObject* pattern, PyObject* args)
1951 PyObject* scanner;
1952 PyObject* search;
1953 PyObject* iterator;
1955 scanner = pattern_scanner(pattern, args);
1956 if (!scanner)
1957 return NULL;
1959 search = PyObject_GetAttrString(scanner, "search");
1960 Py_DECREF(scanner);
1961 if (!search)
1962 return NULL;
1964 iterator = PyCallIter_New(search, Py_None);
1965 Py_DECREF(search);
1967 return iterator;
1969 #endif
1971 static PyObject*
1972 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
1974 SRE_STATE state;
1975 PyObject* list;
1976 PyObject* item;
1977 int status;
1978 int n;
1979 int i;
1980 void* last;
1982 PyObject* string;
1983 int maxsplit = 0;
1984 static char* kwlist[] = { "source", "maxsplit", NULL };
1985 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
1986 &string, &maxsplit))
1987 return NULL;
1989 string = state_init(&state, self, string, 0, INT_MAX);
1990 if (!string)
1991 return NULL;
1993 list = PyList_New(0);
1994 if (!list) {
1995 state_fini(&state);
1996 return NULL;
1999 n = 0;
2000 last = state.start;
2002 while (!maxsplit || n < maxsplit) {
2004 state_reset(&state);
2006 state.ptr = state.start;
2008 if (state.charsize == 1) {
2009 status = sre_search(&state, PatternObject_GetCode(self));
2010 } else {
2011 #if defined(HAVE_UNICODE)
2012 status = sre_usearch(&state, PatternObject_GetCode(self));
2013 #endif
2016 if (status <= 0) {
2017 if (status == 0)
2018 break;
2019 pattern_error(status);
2020 goto error;
2023 if (state.start == state.ptr) {
2024 if (last == state.end)
2025 break;
2026 /* skip one character */
2027 state.start = (void*) ((char*) state.ptr + state.charsize);
2028 continue;
2031 /* get segment before this match */
2032 item = PySequence_GetSlice(
2033 string, STATE_OFFSET(&state, last),
2034 STATE_OFFSET(&state, state.start)
2036 if (!item)
2037 goto error;
2038 status = PyList_Append(list, item);
2039 Py_DECREF(item);
2040 if (status < 0)
2041 goto error;
2043 /* add groups (if any) */
2044 for (i = 0; i < self->groups; i++) {
2045 item = state_getslice(&state, i+1, string, 0);
2046 if (!item)
2047 goto error;
2048 status = PyList_Append(list, item);
2049 Py_DECREF(item);
2050 if (status < 0)
2051 goto error;
2054 n = n + 1;
2056 last = state.start = state.ptr;
2060 /* get segment following last match (even if empty) */
2061 item = PySequence_GetSlice(
2062 string, STATE_OFFSET(&state, last), state.endpos
2064 if (!item)
2065 goto error;
2066 status = PyList_Append(list, item);
2067 Py_DECREF(item);
2068 if (status < 0)
2069 goto error;
2071 state_fini(&state);
2072 return list;
2074 error:
2075 Py_DECREF(list);
2076 state_fini(&state);
2077 return NULL;
2081 static PyObject*
2082 pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2083 int count, int subn)
2085 SRE_STATE state;
2086 PyObject* list;
2087 PyObject* item;
2088 PyObject* filter;
2089 PyObject* args;
2090 PyObject* match;
2091 void* ptr;
2092 int status;
2093 int n;
2094 int i, b, e;
2095 int filter_is_callable;
2097 if (PyCallable_Check(template)) {
2098 /* sub/subn takes either a function or a template */
2099 filter = template;
2100 Py_INCREF(filter);
2101 filter_is_callable = 1;
2102 } else {
2103 /* if not callable, check if it's a literal string */
2104 int literal;
2105 ptr = getstring(template, &n, &b);
2106 if (ptr) {
2107 if (b == 1) {
2108 literal = sre_literal_template(ptr, n);
2109 } else {
2110 #if defined(HAVE_UNICODE)
2111 literal = sre_uliteral_template(ptr, n);
2112 #endif
2114 } else {
2115 PyErr_Clear();
2116 literal = 0;
2118 if (literal) {
2119 filter = template;
2120 Py_INCREF(filter);
2121 filter_is_callable = 0;
2122 } else {
2123 /* not a literal; hand it over to the template compiler */
2124 filter = call(
2125 SRE_MODULE, "_subx",
2126 Py_BuildValue("OO", self, template)
2128 if (!filter)
2129 return NULL;
2130 filter_is_callable = PyCallable_Check(filter);
2134 string = state_init(&state, self, string, 0, INT_MAX);
2135 if (!string) {
2136 Py_DECREF(filter);
2137 return NULL;
2140 list = PyList_New(0);
2141 if (!list) {
2142 Py_DECREF(filter);
2143 state_fini(&state);
2144 return NULL;
2147 n = i = 0;
2149 while (!count || n < count) {
2151 state_reset(&state);
2153 state.ptr = state.start;
2155 if (state.charsize == 1) {
2156 status = sre_search(&state, PatternObject_GetCode(self));
2157 } else {
2158 #if defined(HAVE_UNICODE)
2159 status = sre_usearch(&state, PatternObject_GetCode(self));
2160 #endif
2163 if (status <= 0) {
2164 if (status == 0)
2165 break;
2166 pattern_error(status);
2167 goto error;
2170 b = STATE_OFFSET(&state, state.start);
2171 e = STATE_OFFSET(&state, state.ptr);
2173 if (i < b) {
2174 /* get segment before this match */
2175 item = PySequence_GetSlice(string, i, b);
2176 if (!item)
2177 goto error;
2178 status = PyList_Append(list, item);
2179 Py_DECREF(item);
2180 if (status < 0)
2181 goto error;
2183 } else if (i == b && i == e && n > 0)
2184 /* ignore empty match on latest position */
2185 goto next;
2187 if (filter_is_callable) {
2188 /* pass match object through filter */
2189 match = pattern_new_match(self, &state, 1);
2190 if (!match)
2191 goto error;
2192 args = Py_BuildValue("(O)", match);
2193 if (!args) {
2194 Py_DECREF(match);
2195 goto error;
2197 item = PyObject_CallObject(filter, args);
2198 Py_DECREF(args);
2199 Py_DECREF(match);
2200 if (!item)
2201 goto error;
2202 } else {
2203 /* filter is literal string */
2204 item = filter;
2205 Py_INCREF(item);
2208 /* add to list */
2209 if (item != Py_None) {
2210 status = PyList_Append(list, item);
2211 Py_DECREF(item);
2212 if (status < 0)
2213 goto error;
2216 i = e;
2217 n = n + 1;
2219 next:
2220 /* move on */
2221 if (state.ptr == state.start)
2222 state.start = (void*) ((char*) state.ptr + state.charsize);
2223 else
2224 state.start = state.ptr;
2228 /* get segment following last match */
2229 if (i < state.endpos) {
2230 item = PySequence_GetSlice(string, i, state.endpos);
2231 if (!item)
2232 goto error;
2233 status = PyList_Append(list, item);
2234 Py_DECREF(item);
2235 if (status < 0)
2236 goto error;
2239 state_fini(&state);
2241 Py_DECREF(filter);
2243 /* convert list to single string (also removes list) */
2244 item = join(list, self->pattern);
2246 if (!item)
2247 return NULL;
2249 if (subn)
2250 return Py_BuildValue("Ni", item, n);
2252 return item;
2254 error:
2255 Py_DECREF(list);
2256 state_fini(&state);
2257 Py_DECREF(filter);
2258 return NULL;
2262 static PyObject*
2263 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2265 PyObject* template;
2266 PyObject* string;
2267 int count = 0;
2268 static char* kwlist[] = { "repl", "string", "count", NULL };
2269 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2270 &template, &string, &count))
2271 return NULL;
2273 return pattern_subx(self, template, string, count, 0);
2276 static PyObject*
2277 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2279 PyObject* template;
2280 PyObject* string;
2281 int count = 0;
2282 static char* kwlist[] = { "repl", "string", "count", NULL };
2283 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2284 &template, &string, &count))
2285 return NULL;
2287 return pattern_subx(self, template, string, count, 1);
2290 static PyObject*
2291 pattern_copy(PatternObject* self, PyObject* args)
2293 #ifdef USE_BUILTIN_COPY
2294 PatternObject* copy;
2295 int offset;
2297 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2298 return NULL;
2300 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2301 if (!copy)
2302 return NULL;
2304 offset = offsetof(PatternObject, groups);
2306 Py_XINCREF(self->groupindex);
2307 Py_XINCREF(self->indexgroup);
2308 Py_XINCREF(self->pattern);
2310 memcpy((char*) copy + offset, (char*) self + offset,
2311 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2313 return (PyObject*) copy;
2314 #else
2315 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2316 return NULL;
2317 #endif
2320 static PyObject*
2321 pattern_deepcopy(PatternObject* self, PyObject* args)
2323 #ifdef USE_BUILTIN_COPY
2324 PatternObject* copy;
2326 PyObject* memo;
2327 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2328 return NULL;
2330 copy = (PatternObject*) pattern_copy(self, Py_None);
2331 if (!copy)
2332 return NULL;
2334 if (!deepcopy(&copy->groupindex, memo) ||
2335 !deepcopy(&copy->indexgroup, memo) ||
2336 !deepcopy(&copy->pattern, memo)) {
2337 Py_DECREF(copy);
2338 return NULL;
2341 #else
2342 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2343 return NULL;
2344 #endif
2347 static PyMethodDef pattern_methods[] = {
2348 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
2349 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
2350 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
2351 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
2352 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
2353 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
2354 #if PY_VERSION_HEX >= 0x02020000
2355 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS},
2356 #endif
2357 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2358 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2359 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
2360 {NULL, NULL}
2363 static PyObject*
2364 pattern_getattr(PatternObject* self, char* name)
2366 PyObject* res;
2368 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2370 if (res)
2371 return res;
2373 PyErr_Clear();
2375 /* attributes */
2376 if (!strcmp(name, "pattern")) {
2377 Py_INCREF(self->pattern);
2378 return self->pattern;
2381 if (!strcmp(name, "flags"))
2382 return Py_BuildValue("i", self->flags);
2384 if (!strcmp(name, "groups"))
2385 return Py_BuildValue("i", self->groups);
2387 if (!strcmp(name, "groupindex") && self->groupindex) {
2388 Py_INCREF(self->groupindex);
2389 return self->groupindex;
2392 PyErr_SetString(PyExc_AttributeError, name);
2393 return NULL;
2396 statichere PyTypeObject Pattern_Type = {
2397 PyObject_HEAD_INIT(NULL)
2398 0, "_" SRE_MODULE ".SRE_Pattern",
2399 sizeof(PatternObject), sizeof(SRE_CODE),
2400 (destructor)pattern_dealloc, /*tp_dealloc*/
2401 0, /*tp_print*/
2402 (getattrfunc)pattern_getattr /*tp_getattr*/
2405 /* -------------------------------------------------------------------- */
2406 /* match methods */
2408 static void
2409 match_dealloc(MatchObject* self)
2411 Py_XDECREF(self->regs);
2412 Py_XDECREF(self->string);
2413 Py_DECREF(self->pattern);
2414 PyObject_DEL(self);
2417 static PyObject*
2418 match_getslice_by_index(MatchObject* self, int index, PyObject* def)
2420 if (index < 0 || index >= self->groups) {
2421 /* raise IndexError if we were given a bad group number */
2422 PyErr_SetString(
2423 PyExc_IndexError,
2424 "no such group"
2426 return NULL;
2429 index *= 2;
2431 if (self->string == Py_None || self->mark[index] < 0) {
2432 /* return default value if the string or group is undefined */
2433 Py_INCREF(def);
2434 return def;
2437 return PySequence_GetSlice(
2438 self->string, self->mark[index], self->mark[index+1]
2442 static int
2443 match_getindex(MatchObject* self, PyObject* index)
2445 int i;
2447 if (PyInt_Check(index))
2448 return (int) PyInt_AS_LONG(index);
2450 i = -1;
2452 if (self->pattern->groupindex) {
2453 index = PyObject_GetItem(self->pattern->groupindex, index);
2454 if (index) {
2455 if (PyInt_Check(index))
2456 i = (int) PyInt_AS_LONG(index);
2457 Py_DECREF(index);
2458 } else
2459 PyErr_Clear();
2462 return i;
2465 static PyObject*
2466 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2468 return match_getslice_by_index(self, match_getindex(self, index), def);
2471 static PyObject*
2472 match_expand(MatchObject* self, PyObject* args)
2474 PyObject* template;
2475 if (!PyArg_ParseTuple(args, "O:expand", &template))
2476 return NULL;
2478 /* delegate to Python code */
2479 return call(
2480 SRE_MODULE, "_expand",
2481 Py_BuildValue("OOO", self->pattern, self, template)
2485 static PyObject*
2486 match_group(MatchObject* self, PyObject* args)
2488 PyObject* result;
2489 int i, size;
2491 size = PyTuple_GET_SIZE(args);
2493 switch (size) {
2494 case 0:
2495 result = match_getslice(self, Py_False, Py_None);
2496 break;
2497 case 1:
2498 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2499 break;
2500 default:
2501 /* fetch multiple items */
2502 result = PyTuple_New(size);
2503 if (!result)
2504 return NULL;
2505 for (i = 0; i < size; i++) {
2506 PyObject* item = match_getslice(
2507 self, PyTuple_GET_ITEM(args, i), Py_None
2509 if (!item) {
2510 Py_DECREF(result);
2511 return NULL;
2513 PyTuple_SET_ITEM(result, i, item);
2515 break;
2517 return result;
2520 static PyObject*
2521 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2523 PyObject* result;
2524 int index;
2526 PyObject* def = Py_None;
2527 static char* kwlist[] = { "default", NULL };
2528 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2529 return NULL;
2531 result = PyTuple_New(self->groups-1);
2532 if (!result)
2533 return NULL;
2535 for (index = 1; index < self->groups; index++) {
2536 PyObject* item;
2537 item = match_getslice_by_index(self, index, def);
2538 if (!item) {
2539 Py_DECREF(result);
2540 return NULL;
2542 PyTuple_SET_ITEM(result, index-1, item);
2545 return result;
2548 static PyObject*
2549 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2551 PyObject* result;
2552 PyObject* keys;
2553 int index;
2555 PyObject* def = Py_None;
2556 static char* kwlist[] = { "default", NULL };
2557 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2558 return NULL;
2560 result = PyDict_New();
2561 if (!result || !self->pattern->groupindex)
2562 return result;
2564 keys = PyMapping_Keys(self->pattern->groupindex);
2565 if (!keys)
2566 goto failed;
2568 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2569 int status;
2570 PyObject* key;
2571 PyObject* value;
2572 key = PyList_GET_ITEM(keys, index);
2573 if (!key)
2574 goto failed;
2575 value = match_getslice(self, key, def);
2576 if (!value) {
2577 Py_DECREF(key);
2578 goto failed;
2580 status = PyDict_SetItem(result, key, value);
2581 Py_DECREF(value);
2582 if (status < 0)
2583 goto failed;
2586 Py_DECREF(keys);
2588 return result;
2590 failed:
2591 Py_DECREF(keys);
2592 Py_DECREF(result);
2593 return NULL;
2596 static PyObject*
2597 match_start(MatchObject* self, PyObject* args)
2599 int index;
2601 PyObject* index_ = Py_False; /* zero */
2602 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2603 return NULL;
2605 index = match_getindex(self, index_);
2607 if (index < 0 || index >= self->groups) {
2608 PyErr_SetString(
2609 PyExc_IndexError,
2610 "no such group"
2612 return NULL;
2615 /* mark is -1 if group is undefined */
2616 return Py_BuildValue("i", self->mark[index*2]);
2619 static PyObject*
2620 match_end(MatchObject* self, PyObject* args)
2622 int index;
2624 PyObject* index_ = Py_False; /* zero */
2625 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2626 return NULL;
2628 index = match_getindex(self, index_);
2630 if (index < 0 || index >= self->groups) {
2631 PyErr_SetString(
2632 PyExc_IndexError,
2633 "no such group"
2635 return NULL;
2638 /* mark is -1 if group is undefined */
2639 return Py_BuildValue("i", self->mark[index*2+1]);
2642 LOCAL(PyObject*)
2643 _pair(int i1, int i2)
2645 PyObject* pair;
2646 PyObject* item;
2648 pair = PyTuple_New(2);
2649 if (!pair)
2650 return NULL;
2652 item = PyInt_FromLong(i1);
2653 if (!item)
2654 goto error;
2655 PyTuple_SET_ITEM(pair, 0, item);
2657 item = PyInt_FromLong(i2);
2658 if (!item)
2659 goto error;
2660 PyTuple_SET_ITEM(pair, 1, item);
2662 return pair;
2664 error:
2665 Py_DECREF(pair);
2666 return NULL;
2669 static PyObject*
2670 match_span(MatchObject* self, PyObject* args)
2672 int index;
2674 PyObject* index_ = Py_False; /* zero */
2675 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2676 return NULL;
2678 index = match_getindex(self, index_);
2680 if (index < 0 || index >= self->groups) {
2681 PyErr_SetString(
2682 PyExc_IndexError,
2683 "no such group"
2685 return NULL;
2688 /* marks are -1 if group is undefined */
2689 return _pair(self->mark[index*2], self->mark[index*2+1]);
2692 static PyObject*
2693 match_regs(MatchObject* self)
2695 PyObject* regs;
2696 PyObject* item;
2697 int index;
2699 regs = PyTuple_New(self->groups);
2700 if (!regs)
2701 return NULL;
2703 for (index = 0; index < self->groups; index++) {
2704 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2705 if (!item) {
2706 Py_DECREF(regs);
2707 return NULL;
2709 PyTuple_SET_ITEM(regs, index, item);
2712 Py_INCREF(regs);
2713 self->regs = regs;
2715 return regs;
2718 static PyObject*
2719 match_copy(MatchObject* self, PyObject* args)
2721 #ifdef USE_BUILTIN_COPY
2722 MatchObject* copy;
2723 int slots, offset;
2725 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2726 return NULL;
2728 slots = 2 * (self->pattern->groups+1);
2730 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2731 if (!copy)
2732 return NULL;
2734 /* this value a constant, but any compiler should be able to
2735 figure that out all by itself */
2736 offset = offsetof(MatchObject, string);
2738 Py_XINCREF(self->pattern);
2739 Py_XINCREF(self->string);
2740 Py_XINCREF(self->regs);
2742 memcpy((char*) copy + offset, (char*) self + offset,
2743 sizeof(MatchObject) + slots * sizeof(int) - offset);
2745 return (PyObject*) copy;
2746 #else
2747 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
2748 return NULL;
2749 #endif
2752 static PyObject*
2753 match_deepcopy(MatchObject* self, PyObject* args)
2755 #ifdef USE_BUILTIN_COPY
2756 MatchObject* copy;
2758 PyObject* memo;
2759 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2760 return NULL;
2762 copy = (MatchObject*) match_copy(self, Py_None);
2763 if (!copy)
2764 return NULL;
2766 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2767 !deepcopy(&copy->string, memo) ||
2768 !deepcopy(&copy->regs, memo)) {
2769 Py_DECREF(copy);
2770 return NULL;
2773 #else
2774 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2775 return NULL;
2776 #endif
2779 static PyMethodDef match_methods[] = {
2780 {"group", (PyCFunction) match_group, METH_VARARGS},
2781 {"start", (PyCFunction) match_start, METH_VARARGS},
2782 {"end", (PyCFunction) match_end, METH_VARARGS},
2783 {"span", (PyCFunction) match_span, METH_VARARGS},
2784 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
2785 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
2786 {"expand", (PyCFunction) match_expand, METH_VARARGS},
2787 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
2788 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
2789 {NULL, NULL}
2792 static PyObject*
2793 match_getattr(MatchObject* self, char* name)
2795 PyObject* res;
2797 res = Py_FindMethod(match_methods, (PyObject*) self, name);
2798 if (res)
2799 return res;
2801 PyErr_Clear();
2803 if (!strcmp(name, "lastindex")) {
2804 if (self->lastindex >= 0)
2805 return Py_BuildValue("i", self->lastindex);
2806 Py_INCREF(Py_None);
2807 return Py_None;
2810 if (!strcmp(name, "lastgroup")) {
2811 if (self->pattern->indexgroup && self->lastindex >= 0) {
2812 PyObject* result = PySequence_GetItem(
2813 self->pattern->indexgroup, self->lastindex
2815 if (result)
2816 return result;
2817 PyErr_Clear();
2819 Py_INCREF(Py_None);
2820 return Py_None;
2823 if (!strcmp(name, "string")) {
2824 if (self->string) {
2825 Py_INCREF(self->string);
2826 return self->string;
2827 } else {
2828 Py_INCREF(Py_None);
2829 return Py_None;
2833 if (!strcmp(name, "regs")) {
2834 if (self->regs) {
2835 Py_INCREF(self->regs);
2836 return self->regs;
2837 } else
2838 return match_regs(self);
2841 if (!strcmp(name, "re")) {
2842 Py_INCREF(self->pattern);
2843 return (PyObject*) self->pattern;
2846 if (!strcmp(name, "pos"))
2847 return Py_BuildValue("i", self->pos);
2849 if (!strcmp(name, "endpos"))
2850 return Py_BuildValue("i", self->endpos);
2852 PyErr_SetString(PyExc_AttributeError, name);
2853 return NULL;
2856 /* FIXME: implement setattr("string", None) as a special case (to
2857 detach the associated string, if any */
2859 statichere PyTypeObject Match_Type = {
2860 PyObject_HEAD_INIT(NULL)
2861 0, "_" SRE_MODULE ".SRE_Match",
2862 sizeof(MatchObject), sizeof(int),
2863 (destructor)match_dealloc, /*tp_dealloc*/
2864 0, /*tp_print*/
2865 (getattrfunc)match_getattr /*tp_getattr*/
2868 /* -------------------------------------------------------------------- */
2869 /* scanner methods (experimental) */
2871 static void
2872 scanner_dealloc(ScannerObject* self)
2874 state_fini(&self->state);
2875 Py_DECREF(self->pattern);
2876 PyObject_DEL(self);
2879 static PyObject*
2880 scanner_match(ScannerObject* self, PyObject* args)
2882 SRE_STATE* state = &self->state;
2883 PyObject* match;
2884 int status;
2886 state_reset(state);
2888 state->ptr = state->start;
2890 if (state->charsize == 1) {
2891 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
2892 } else {
2893 #if defined(HAVE_UNICODE)
2894 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
2895 #endif
2898 match = pattern_new_match((PatternObject*) self->pattern,
2899 state, status);
2901 if (status == 0 || state->ptr == state->start)
2902 state->start = (void*) ((char*) state->ptr + state->charsize);
2903 else
2904 state->start = state->ptr;
2906 return match;
2910 static PyObject*
2911 scanner_search(ScannerObject* self, PyObject* args)
2913 SRE_STATE* state = &self->state;
2914 PyObject* match;
2915 int status;
2917 state_reset(state);
2919 state->ptr = state->start;
2921 if (state->charsize == 1) {
2922 status = sre_search(state, PatternObject_GetCode(self->pattern));
2923 } else {
2924 #if defined(HAVE_UNICODE)
2925 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
2926 #endif
2929 match = pattern_new_match((PatternObject*) self->pattern,
2930 state, status);
2932 if (status == 0 || state->ptr == state->start)
2933 state->start = (void*) ((char*) state->ptr + state->charsize);
2934 else
2935 state->start = state->ptr;
2937 return match;
2940 static PyMethodDef scanner_methods[] = {
2941 {"match", (PyCFunction) scanner_match, 0},
2942 {"search", (PyCFunction) scanner_search, 0},
2943 {NULL, NULL}
2946 static PyObject*
2947 scanner_getattr(ScannerObject* self, char* name)
2949 PyObject* res;
2951 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
2952 if (res)
2953 return res;
2955 PyErr_Clear();
2957 /* attributes */
2958 if (!strcmp(name, "pattern")) {
2959 Py_INCREF(self->pattern);
2960 return self->pattern;
2963 PyErr_SetString(PyExc_AttributeError, name);
2964 return NULL;
2967 statichere PyTypeObject Scanner_Type = {
2968 PyObject_HEAD_INIT(NULL)
2969 0, "_" SRE_MODULE ".SRE_Scanner",
2970 sizeof(ScannerObject), 0,
2971 (destructor)scanner_dealloc, /*tp_dealloc*/
2972 0, /*tp_print*/
2973 (getattrfunc)scanner_getattr, /*tp_getattr*/
2976 static PyMethodDef _functions[] = {
2977 {"compile", _compile, 1},
2978 {"getcodesize", sre_codesize, 1},
2979 {"getlower", sre_getlower, 1},
2980 {NULL, NULL}
2983 DL_EXPORT(void)
2984 init_sre(void)
2986 PyObject* m;
2987 PyObject* d;
2988 PyObject* x;
2990 /* Patch object types */
2991 Pattern_Type.ob_type = Match_Type.ob_type =
2992 Scanner_Type.ob_type = &PyType_Type;
2994 m = Py_InitModule("_" SRE_MODULE, _functions);
2995 d = PyModule_GetDict(m);
2997 x = PyInt_FromLong(SRE_MAGIC);
2998 if (x) {
2999 PyDict_SetItemString(d, "MAGIC", x);
3000 Py_DECREF(x);
3003 x = PyString_FromString(copyright);
3004 if (x) {
3005 PyDict_SetItemString(d, "copyright", x);
3006 Py_DECREF(x);
3010 #endif /* !defined(SRE_RECURSIVE) */