append(): Fixing the test for convertability after consultation with
[python/dscho.git] / Modules / _sre.c
blobf4dbef042f13734284f3335b56151ac8f1dc2860
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-06-30 fl added fast search optimization
10 * 2000-06-30 fl added assert (lookahead) primitives, etc
11 * 2000-07-02 fl added charset optimizations, etc
12 * 2000-07-03 fl store code in pattern object, lookbehind, etc
13 * 2000-07-08 fl added regs attribute
14 * 2000-07-21 fl reset lastindex in scanner methods
15 * 2000-08-01 fl fixes for 1.6b1
16 * 2000-08-03 fl added recursion limit
17 * 2000-08-07 fl use PyOS_CheckStack() if available
18 * 2000-08-08 fl changed findall to return empty strings instead of None
19 * 2000-08-27 fl properly propagate memory errors
20 * 2000-09-02 fl return -1 instead of None for start/end/span
21 * 2000-09-20 fl added expand method
22 * 2000-09-21 fl don't use the buffer interface for unicode strings
23 * 2000-10-03 fl fixed assert_not primitive; support keyword arguments
24 * 2000-10-24 fl really fixed assert_not; reset groups in findall
25 * 2000-12-21 fl fixed memory leak in groupdict
26 * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
27 * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
28 * 2001-01-16 fl fixed memory leak in pattern destructor
29 * 2001-03-20 fl lots of fixes for 2.1b2
30 * 2001-04-15 fl export copyright as Python attribute, not global
31 * 2001-04-28 fl added __copy__ methods (work in progress)
32 * 2001-05-14 fl fixes for 1.5.2
33 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
34 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
35 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
36 * 2001-10-21 fl added sub/subn primitive
37 * 2001-10-22 fl check for literal sub/subn templates
38 * 2001-10-24 fl added finditer primitive (for 2.2 only)
39 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
41 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
43 * This version of the SRE library can be redistributed under CNRI's
44 * Python 1.6 license. For any other use, please contact Secret Labs
45 * AB (info@pythonware.com).
47 * Portions of this engine have been developed in cooperation with
48 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
49 * other compatibility work.
52 #ifndef SRE_RECURSIVE
54 static char copyright[] =
55 " SRE 2.2.1 Copyright (c) 1997-2001 by Secret Labs AB ";
57 #include "Python.h"
58 #include "structmember.h" /* offsetof */
60 #include "sre.h"
62 #include <ctype.h>
64 /* name of this module, minus the leading underscore */
65 #if !defined(SRE_MODULE)
66 #define SRE_MODULE "sre"
67 #endif
69 /* defining this one enables tracing */
70 #undef VERBOSE
72 #if PY_VERSION_HEX >= 0x01060000
73 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
74 /* defining this enables unicode support (default under 1.6a1 and later) */
75 #define HAVE_UNICODE
76 #endif
77 #endif
79 /* -------------------------------------------------------------------- */
80 /* optional features */
82 /* prevent run-away recursion (bad patterns on long strings) */
84 #if !defined(USE_STACKCHECK)
85 #if defined(MS_WIN64) || defined(__LP64__) || defined(_LP64)
86 /* require smaller recursion limit for a number of 64-bit platforms:
87 Win64 (MS_WIN64), Linux64 (__LP64__), Monterey (64-bit AIX) (_LP64) */
88 /* FIXME: maybe the limit should be 40000 / sizeof(void*) ? */
89 #define USE_RECURSION_LIMIT 7500
90 #else
91 #define USE_RECURSION_LIMIT 10000
92 #endif
93 #endif
95 /* enables fast searching */
96 #define USE_FAST_SEARCH
98 /* enables aggressive inlining (always on for Visual C) */
99 #undef USE_INLINE
101 /* enables copy/deepcopy handling (work in progress) */
102 #undef USE_BUILTIN_COPY
104 #if PY_VERSION_HEX < 0x01060000
105 #define PyObject_DEL(op) PyMem_DEL((op))
106 #endif
108 /* -------------------------------------------------------------------- */
110 #if defined(_MSC_VER)
111 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
112 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
113 /* fastest possible local call under MSVC */
114 #define LOCAL(type) static __inline type __fastcall
115 #elif defined(USE_INLINE)
116 #define LOCAL(type) static inline type
117 #else
118 #define LOCAL(type) static type
119 #endif
121 /* error codes */
122 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
123 #define SRE_ERROR_STATE -2 /* illegal state */
124 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
125 #define SRE_ERROR_MEMORY -9 /* out of memory */
127 #if defined(VERBOSE)
128 #define TRACE(v) printf v
129 #else
130 #define TRACE(v)
131 #endif
133 /* -------------------------------------------------------------------- */
134 /* search engine state */
136 /* default character predicates (run sre_chars.py to regenerate tables) */
138 #define SRE_DIGIT_MASK 1
139 #define SRE_SPACE_MASK 2
140 #define SRE_LINEBREAK_MASK 4
141 #define SRE_ALNUM_MASK 8
142 #define SRE_WORD_MASK 16
144 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
146 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
147 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
148 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
149 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
150 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
151 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
152 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
154 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
155 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
156 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
157 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
158 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
159 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
160 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
161 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
162 120, 121, 122, 123, 124, 125, 126, 127 };
164 #define SRE_IS_DIGIT(ch)\
165 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
166 #define SRE_IS_SPACE(ch)\
167 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
168 #define SRE_IS_LINEBREAK(ch)\
169 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
170 #define SRE_IS_ALNUM(ch)\
171 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
172 #define SRE_IS_WORD(ch)\
173 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
175 static unsigned int sre_lower(unsigned int ch)
177 return ((ch) < 128 ? sre_char_lower[ch] : ch);
180 /* locale-specific character predicates */
182 #define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
183 #define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
184 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
185 #define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
186 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
188 static unsigned int sre_lower_locale(unsigned int ch)
190 return ((ch) < 256 ? tolower((ch)) : ch);
193 /* unicode-specific character predicates */
195 #if defined(HAVE_UNICODE)
197 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
198 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
199 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
200 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
201 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
203 static unsigned int sre_lower_unicode(unsigned int ch)
205 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
208 #endif
210 LOCAL(int)
211 sre_category(SRE_CODE category, unsigned int ch)
213 switch (category) {
215 case SRE_CATEGORY_DIGIT:
216 return SRE_IS_DIGIT(ch);
217 case SRE_CATEGORY_NOT_DIGIT:
218 return !SRE_IS_DIGIT(ch);
219 case SRE_CATEGORY_SPACE:
220 return SRE_IS_SPACE(ch);
221 case SRE_CATEGORY_NOT_SPACE:
222 return !SRE_IS_SPACE(ch);
223 case SRE_CATEGORY_WORD:
224 return SRE_IS_WORD(ch);
225 case SRE_CATEGORY_NOT_WORD:
226 return !SRE_IS_WORD(ch);
227 case SRE_CATEGORY_LINEBREAK:
228 return SRE_IS_LINEBREAK(ch);
229 case SRE_CATEGORY_NOT_LINEBREAK:
230 return !SRE_IS_LINEBREAK(ch);
232 case SRE_CATEGORY_LOC_WORD:
233 return SRE_LOC_IS_WORD(ch);
234 case SRE_CATEGORY_LOC_NOT_WORD:
235 return !SRE_LOC_IS_WORD(ch);
237 #if defined(HAVE_UNICODE)
238 case SRE_CATEGORY_UNI_DIGIT:
239 return SRE_UNI_IS_DIGIT(ch);
240 case SRE_CATEGORY_UNI_NOT_DIGIT:
241 return !SRE_UNI_IS_DIGIT(ch);
242 case SRE_CATEGORY_UNI_SPACE:
243 return SRE_UNI_IS_SPACE(ch);
244 case SRE_CATEGORY_UNI_NOT_SPACE:
245 return !SRE_UNI_IS_SPACE(ch);
246 case SRE_CATEGORY_UNI_WORD:
247 return SRE_UNI_IS_WORD(ch);
248 case SRE_CATEGORY_UNI_NOT_WORD:
249 return !SRE_UNI_IS_WORD(ch);
250 case SRE_CATEGORY_UNI_LINEBREAK:
251 return SRE_UNI_IS_LINEBREAK(ch);
252 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
253 return !SRE_UNI_IS_LINEBREAK(ch);
254 #else
255 case SRE_CATEGORY_UNI_DIGIT:
256 return SRE_IS_DIGIT(ch);
257 case SRE_CATEGORY_UNI_NOT_DIGIT:
258 return !SRE_IS_DIGIT(ch);
259 case SRE_CATEGORY_UNI_SPACE:
260 return SRE_IS_SPACE(ch);
261 case SRE_CATEGORY_UNI_NOT_SPACE:
262 return !SRE_IS_SPACE(ch);
263 case SRE_CATEGORY_UNI_WORD:
264 return SRE_LOC_IS_WORD(ch);
265 case SRE_CATEGORY_UNI_NOT_WORD:
266 return !SRE_LOC_IS_WORD(ch);
267 case SRE_CATEGORY_UNI_LINEBREAK:
268 return SRE_IS_LINEBREAK(ch);
269 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
270 return !SRE_IS_LINEBREAK(ch);
271 #endif
273 return 0;
276 /* helpers */
278 static void
279 mark_fini(SRE_STATE* state)
281 if (state->mark_stack) {
282 free(state->mark_stack);
283 state->mark_stack = NULL;
285 state->mark_stack_size = state->mark_stack_base = 0;
288 static int
289 mark_save(SRE_STATE* state, int lo, int hi)
291 void* stack;
292 int size;
293 int minsize, newsize;
295 if (hi <= lo)
296 return 0;
298 size = (hi - lo) + 1;
300 newsize = state->mark_stack_size;
301 minsize = state->mark_stack_base + size;
303 if (newsize < minsize) {
304 /* create new stack */
305 if (!newsize) {
306 newsize = 512;
307 if (newsize < minsize)
308 newsize = minsize;
309 TRACE(("allocate stack %d\n", newsize));
310 stack = malloc(sizeof(void*) * newsize);
311 } else {
312 /* grow the stack */
313 while (newsize < minsize)
314 newsize += newsize;
315 TRACE(("grow stack to %d\n", newsize));
316 stack = realloc(state->mark_stack, sizeof(void*) * newsize);
318 if (!stack) {
319 mark_fini(state);
320 return SRE_ERROR_MEMORY;
322 state->mark_stack = stack;
323 state->mark_stack_size = newsize;
326 TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
328 memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
329 size * sizeof(void*));
331 state->mark_stack_base += size;
333 return 0;
336 static int
337 mark_restore(SRE_STATE* state, int lo, int hi)
339 int size;
341 if (hi <= lo)
342 return 0;
344 size = (hi - lo) + 1;
346 state->mark_stack_base -= size;
348 TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
350 memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
351 size * sizeof(void*));
353 return 0;
356 /* generate 8-bit version */
358 #define SRE_CHAR unsigned char
359 #define SRE_AT sre_at
360 #define SRE_COUNT sre_count
361 #define SRE_CHARSET sre_charset
362 #define SRE_INFO sre_info
363 #define SRE_MATCH sre_match
364 #define SRE_SEARCH sre_search
365 #define SRE_LITERAL_TEMPLATE sre_literal_template
367 #if defined(HAVE_UNICODE)
369 #define SRE_RECURSIVE
370 #include "_sre.c"
371 #undef SRE_RECURSIVE
373 #undef SRE_LITERAL_TEMPLATE
374 #undef SRE_SEARCH
375 #undef SRE_MATCH
376 #undef SRE_INFO
377 #undef SRE_CHARSET
378 #undef SRE_COUNT
379 #undef SRE_AT
380 #undef SRE_CHAR
382 /* generate 16-bit unicode version */
384 #define SRE_CHAR Py_UNICODE
385 #define SRE_AT sre_uat
386 #define SRE_COUNT sre_ucount
387 #define SRE_CHARSET sre_ucharset
388 #define SRE_INFO sre_uinfo
389 #define SRE_MATCH sre_umatch
390 #define SRE_SEARCH sre_usearch
391 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
392 #endif
394 #endif /* SRE_RECURSIVE */
396 /* -------------------------------------------------------------------- */
397 /* String matching engine */
399 /* the following section is compiled twice, with different character
400 settings */
402 LOCAL(int)
403 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
405 /* check if pointer is at given position */
407 int this, that;
409 switch (at) {
411 case SRE_AT_BEGINNING:
412 case SRE_AT_BEGINNING_STRING:
413 return ((void*) ptr == state->beginning);
415 case SRE_AT_BEGINNING_LINE:
416 return ((void*) ptr == state->beginning ||
417 SRE_IS_LINEBREAK((int) ptr[-1]));
419 case SRE_AT_END:
420 return (((void*) (ptr+1) == state->end &&
421 SRE_IS_LINEBREAK((int) ptr[0])) ||
422 ((void*) ptr == state->end));
424 case SRE_AT_END_LINE:
425 return ((void*) ptr == state->end ||
426 SRE_IS_LINEBREAK((int) ptr[0]));
428 case SRE_AT_END_STRING:
429 return ((void*) ptr == state->end);
431 case SRE_AT_BOUNDARY:
432 if (state->beginning == state->end)
433 return 0;
434 that = ((void*) ptr > state->beginning) ?
435 SRE_IS_WORD((int) ptr[-1]) : 0;
436 this = ((void*) ptr < state->end) ?
437 SRE_IS_WORD((int) ptr[0]) : 0;
438 return this != that;
440 case SRE_AT_NON_BOUNDARY:
441 if (state->beginning == state->end)
442 return 0;
443 that = ((void*) ptr > state->beginning) ?
444 SRE_IS_WORD((int) ptr[-1]) : 0;
445 this = ((void*) ptr < state->end) ?
446 SRE_IS_WORD((int) ptr[0]) : 0;
447 return this == that;
449 case SRE_AT_LOC_BOUNDARY:
450 if (state->beginning == state->end)
451 return 0;
452 that = ((void*) ptr > state->beginning) ?
453 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
454 this = ((void*) ptr < state->end) ?
455 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
456 return this != that;
458 case SRE_AT_LOC_NON_BOUNDARY:
459 if (state->beginning == state->end)
460 return 0;
461 that = ((void*) ptr > state->beginning) ?
462 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
463 this = ((void*) ptr < state->end) ?
464 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
465 return this == that;
467 #if defined(HAVE_UNICODE)
468 case SRE_AT_UNI_BOUNDARY:
469 if (state->beginning == state->end)
470 return 0;
471 that = ((void*) ptr > state->beginning) ?
472 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
473 this = ((void*) ptr < state->end) ?
474 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
475 return this != that;
477 case SRE_AT_UNI_NON_BOUNDARY:
478 if (state->beginning == state->end)
479 return 0;
480 that = ((void*) ptr > state->beginning) ?
481 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
482 this = ((void*) ptr < state->end) ?
483 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
484 return this == that;
485 #endif
489 return 0;
492 LOCAL(int)
493 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
495 /* check if character is a member of the given set */
497 int ok = 1;
499 for (;;) {
500 switch (*set++) {
502 case SRE_OP_LITERAL:
503 /* <LITERAL> <code> */
504 if (ch == set[0])
505 return ok;
506 set++;
507 break;
509 case SRE_OP_RANGE:
510 /* <RANGE> <lower> <upper> */
511 if (set[0] <= ch && ch <= set[1])
512 return ok;
513 set += 2;
514 break;
516 case SRE_OP_CHARSET:
517 /* <CHARSET> <bitmap> (16 bits per code word) */
518 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
519 return ok;
520 set += 16;
521 break;
523 case SRE_OP_BIGCHARSET:
524 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
526 int count, block;
527 count = *(set++);
528 block = ((unsigned char*)set)[ch >> 8];
529 set += 128;
530 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
531 return ok;
532 set += count*16;
533 break;
536 case SRE_OP_CATEGORY:
537 /* <CATEGORY> <code> */
538 if (sre_category(set[0], (int) ch))
539 return ok;
540 set += 1;
541 break;
543 case SRE_OP_NEGATE:
544 ok = !ok;
545 break;
547 case SRE_OP_FAILURE:
548 return !ok;
550 default:
551 /* internal error -- there's not much we can do about it
552 here, so let's just pretend it didn't match... */
553 return 0;
558 LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level);
560 LOCAL(int)
561 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
563 SRE_CODE chr;
564 SRE_CHAR* ptr = state->ptr;
565 SRE_CHAR* end = state->end;
566 int i;
568 /* adjust end */
569 if (maxcount < end - ptr && maxcount != 65535)
570 end = ptr + maxcount;
572 switch (pattern[0]) {
574 case SRE_OP_ANY:
575 /* repeated dot wildcard. */
576 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
577 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
578 ptr++;
579 break;
581 case SRE_OP_ANY_ALL:
582 /* repeated dot wildcare. skip to the end of the target
583 string, and backtrack from there */
584 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
585 ptr = end;
586 break;
588 case SRE_OP_LITERAL:
589 /* repeated literal */
590 chr = pattern[1];
591 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
592 while (ptr < end && (SRE_CODE) *ptr == chr)
593 ptr++;
594 break;
596 case SRE_OP_LITERAL_IGNORE:
597 /* repeated literal */
598 chr = pattern[1];
599 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
600 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
601 ptr++;
602 break;
604 case SRE_OP_NOT_LITERAL:
605 /* repeated non-literal */
606 chr = pattern[1];
607 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
608 while (ptr < end && (SRE_CODE) *ptr != chr)
609 ptr++;
610 break;
612 case SRE_OP_NOT_LITERAL_IGNORE:
613 /* repeated non-literal */
614 chr = pattern[1];
615 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
616 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
617 ptr++;
618 break;
620 case SRE_OP_IN:
621 /* repeated set */
622 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
623 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
624 ptr++;
625 break;
627 default:
628 /* repeated single character pattern */
629 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
630 while ((SRE_CHAR*) state->ptr < end) {
631 i = SRE_MATCH(state, pattern, level);
632 if (i < 0)
633 return i;
634 if (!i)
635 break;
637 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
638 (SRE_CHAR*) state->ptr - ptr));
639 return (SRE_CHAR*) state->ptr - ptr;
642 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
643 return ptr - (SRE_CHAR*) state->ptr;
646 #if 0 /* not used in this release */
647 LOCAL(int)
648 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
650 /* check if an SRE_OP_INFO block matches at the current position.
651 returns the number of SRE_CODE objects to skip if successful, 0
652 if no match */
654 SRE_CHAR* end = state->end;
655 SRE_CHAR* ptr = state->ptr;
656 int i;
658 /* check minimal length */
659 if (pattern[3] && (end - ptr) < pattern[3])
660 return 0;
662 /* check known prefix */
663 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
664 /* <length> <skip> <prefix data> <overlap data> */
665 for (i = 0; i < pattern[5]; i++)
666 if ((SRE_CODE) ptr[i] != pattern[7 + i])
667 return 0;
668 return pattern[0] + 2 * pattern[6];
670 return pattern[0];
672 #endif
674 LOCAL(int)
675 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
677 /* check if string matches the given pattern. returns <0 for
678 error, 0 for failure, and 1 for success */
680 SRE_CHAR* end = state->end;
681 SRE_CHAR* ptr = state->ptr;
682 int i, count;
683 SRE_REPEAT* rp;
684 int lastmark;
685 SRE_CODE chr;
687 SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
689 TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level));
691 #if defined(USE_STACKCHECK)
692 if (level % 10 == 0 && PyOS_CheckStack())
693 return SRE_ERROR_RECURSION_LIMIT;
694 #endif
696 #if defined(USE_RECURSION_LIMIT)
697 if (level > USE_RECURSION_LIMIT)
698 return SRE_ERROR_RECURSION_LIMIT;
699 #endif
701 if (pattern[0] == SRE_OP_INFO) {
702 /* optimization info block */
703 /* <INFO> <1=skip> <2=flags> <3=min> ... */
704 if (pattern[3] && (end - ptr) < pattern[3]) {
705 TRACE(("reject (got %d chars, need %d)\n",
706 (end - ptr), pattern[3]));
707 return 0;
709 pattern += pattern[1] + 1;
712 for (;;) {
714 switch (*pattern++) {
716 case SRE_OP_FAILURE:
717 /* immediate failure */
718 TRACE(("|%p|%p|FAILURE\n", pattern, ptr));
719 return 0;
721 case SRE_OP_SUCCESS:
722 /* end of pattern */
723 TRACE(("|%p|%p|SUCCESS\n", pattern, ptr));
724 state->ptr = ptr;
725 return 1;
727 case SRE_OP_AT:
728 /* match at given position */
729 /* <AT> <code> */
730 TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern));
731 if (!SRE_AT(state, ptr, *pattern))
732 return 0;
733 pattern++;
734 break;
736 case SRE_OP_CATEGORY:
737 /* match at given category */
738 /* <CATEGORY> <code> */
739 TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern));
740 if (ptr >= end || !sre_category(pattern[0], ptr[0]))
741 return 0;
742 pattern++;
743 ptr++;
744 break;
746 case SRE_OP_LITERAL:
747 /* match literal string */
748 /* <LITERAL> <code> */
749 TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
750 if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
751 return 0;
752 pattern++;
753 ptr++;
754 break;
756 case SRE_OP_NOT_LITERAL:
757 /* match anything that is not literal character */
758 /* <NOT_LITERAL> <code> */
759 TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
760 if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
761 return 0;
762 pattern++;
763 ptr++;
764 break;
766 case SRE_OP_ANY:
767 /* match anything (except a newline) */
768 /* <ANY> */
769 TRACE(("|%p|%p|ANY\n", pattern, ptr));
770 if (ptr >= end || SRE_IS_LINEBREAK(ptr[0]))
771 return 0;
772 ptr++;
773 break;
775 case SRE_OP_ANY_ALL:
776 /* match anything */
777 /* <ANY_ALL> */
778 TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr));
779 if (ptr >= end)
780 return 0;
781 ptr++;
782 break;
784 case SRE_OP_IN:
785 /* match set member (or non_member) */
786 /* <IN> <skip> <set> */
787 TRACE(("|%p|%p|IN\n", pattern, ptr));
788 if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr))
789 return 0;
790 pattern += pattern[0];
791 ptr++;
792 break;
794 case SRE_OP_GROUPREF:
795 /* match backreference */
796 TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
797 i = pattern[0];
799 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
800 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
801 if (!p || !e || e < p)
802 return 0;
803 while (p < e) {
804 if (ptr >= end || *ptr != *p)
805 return 0;
806 p++; ptr++;
809 pattern++;
810 break;
812 case SRE_OP_GROUPREF_IGNORE:
813 /* match backreference */
814 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
815 i = pattern[0];
817 SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
818 SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
819 if (!p || !e || e < p)
820 return 0;
821 while (p < e) {
822 if (ptr >= end ||
823 state->lower(*ptr) != state->lower(*p))
824 return 0;
825 p++; ptr++;
828 pattern++;
829 break;
831 case SRE_OP_LITERAL_IGNORE:
832 TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0]));
833 if (ptr >= end ||
834 state->lower(*ptr) != state->lower(*pattern))
835 return 0;
836 pattern++;
837 ptr++;
838 break;
840 case SRE_OP_NOT_LITERAL_IGNORE:
841 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern));
842 if (ptr >= end ||
843 state->lower(*ptr) == state->lower(*pattern))
844 return 0;
845 pattern++;
846 ptr++;
847 break;
849 case SRE_OP_IN_IGNORE:
850 TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr));
851 if (ptr >= end
852 || !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr)))
853 return 0;
854 pattern += pattern[0];
855 ptr++;
856 break;
858 case SRE_OP_MARK:
859 /* set mark */
860 /* <MARK> <gid> */
861 TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
862 i = pattern[0];
863 if (i & 1)
864 state->lastindex = i/2 + 1;
865 if (i > state->lastmark)
866 state->lastmark = i;
867 state->mark[i] = ptr;
868 pattern++;
869 break;
871 case SRE_OP_JUMP:
872 case SRE_OP_INFO:
873 /* jump forward */
874 /* <JUMP> <offset> */
875 TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0]));
876 pattern += pattern[0];
877 break;
879 case SRE_OP_ASSERT:
880 /* assert subpattern */
881 /* <ASSERT> <skip> <back> <pattern> */
882 TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
883 state->ptr = ptr - pattern[1];
884 if (state->ptr < state->beginning)
885 return 0;
886 i = SRE_MATCH(state, pattern + 2, level + 1);
887 if (i <= 0)
888 return i;
889 pattern += pattern[0];
890 break;
892 case SRE_OP_ASSERT_NOT:
893 /* assert not subpattern */
894 /* <ASSERT_NOT> <skip> <back> <pattern> */
895 TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
896 state->ptr = ptr - pattern[1];
897 if (state->ptr >= state->beginning) {
898 i = SRE_MATCH(state, pattern + 2, level + 1);
899 if (i < 0)
900 return i;
901 if (i)
902 return 0;
904 pattern += pattern[0];
905 break;
907 case SRE_OP_BRANCH:
908 /* alternation */
909 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
910 TRACE(("|%p|%p|BRANCH\n", pattern, ptr));
911 lastmark = state->lastmark;
912 for (; pattern[0]; pattern += pattern[0]) {
913 if (pattern[1] == SRE_OP_LITERAL &&
914 (ptr >= end || (SRE_CODE) *ptr != pattern[2]))
915 continue;
916 if (pattern[1] == SRE_OP_IN &&
917 (ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr)))
918 continue;
919 state->ptr = ptr;
920 i = SRE_MATCH(state, pattern + 1, level + 1);
921 if (i)
922 return i;
923 if (state->lastmark > lastmark) {
924 memset(
925 state->mark + lastmark + 1, 0,
926 (state->lastmark - lastmark) * sizeof(void*)
928 state->lastmark = lastmark;
931 return 0;
933 case SRE_OP_REPEAT_ONE:
934 /* match repeated sequence (maximizing regexp) */
936 /* this operator only works if the repeated item is
937 exactly one character wide, and we're not already
938 collecting backtracking points. for other cases,
939 use the MAX_REPEAT operator */
941 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
943 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr,
944 pattern[1], pattern[2]));
946 if (ptr + pattern[1] > end)
947 return 0; /* cannot match */
949 state->ptr = ptr;
951 count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1);
952 if (count < 0)
953 return count;
955 ptr += count;
957 /* when we arrive here, count contains the number of
958 matches, and ptr points to the tail of the target
959 string. check if the rest of the pattern matches,
960 and backtrack if not. */
962 if (count < (int) pattern[1])
963 return 0;
965 if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
966 /* tail is empty. we're finished */
967 state->ptr = ptr;
968 return 1;
970 } else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
971 /* tail starts with a literal. skip positions where
972 the rest of the pattern cannot possibly match */
973 chr = pattern[pattern[0]+1];
974 for (;;) {
975 while (count >= (int) pattern[1] &&
976 (ptr >= end || *ptr != chr)) {
977 ptr--;
978 count--;
980 if (count < (int) pattern[1])
981 break;
982 state->ptr = ptr;
983 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
984 if (i)
985 return i;
986 ptr--;
987 count--;
990 } else {
991 /* general case */
992 lastmark = state->lastmark;
993 while (count >= (int) pattern[1]) {
994 state->ptr = ptr;
995 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
996 if (i)
997 return i;
998 ptr--;
999 count--;
1000 if (state->lastmark > lastmark) {
1001 memset(
1002 state->mark + lastmark + 1, 0,
1003 (state->lastmark - lastmark) * sizeof(void*)
1005 state->lastmark = lastmark;
1009 return 0;
1011 case SRE_OP_REPEAT:
1012 /* create repeat context. all the hard work is done
1013 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1014 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1015 TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr,
1016 pattern[1], pattern[2]));
1018 rep.count = -1;
1019 rep.pattern = pattern;
1021 /* install new repeat context */
1022 rep.prev = state->repeat;
1023 state->repeat = &rep;
1025 state->ptr = ptr;
1026 i = SRE_MATCH(state, pattern + pattern[0], level + 1);
1028 state->repeat = rep.prev;
1030 return i;
1032 case SRE_OP_MAX_UNTIL:
1033 /* maximizing repeat */
1034 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1036 /* FIXME: we probably need to deal with zero-width
1037 matches in here... */
1039 rp = state->repeat;
1040 if (!rp)
1041 return SRE_ERROR_STATE;
1043 state->ptr = ptr;
1045 count = rp->count + 1;
1047 TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count));
1049 if (count < rp->pattern[1]) {
1050 /* not enough matches */
1051 rp->count = count;
1052 /* RECURSIVE */
1053 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1054 if (i)
1055 return i;
1056 rp->count = count - 1;
1057 state->ptr = ptr;
1058 return 0;
1061 if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
1062 /* we may have enough matches, but if we can
1063 match another item, do so */
1064 rp->count = count;
1065 lastmark = state->lastmark;
1066 i = mark_save(state, 0, lastmark);
1067 if (i < 0)
1068 return i;
1069 /* RECURSIVE */
1070 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1071 if (i)
1072 return i;
1073 i = mark_restore(state, 0, lastmark);
1074 state->lastmark = lastmark;
1075 if (i < 0)
1076 return i;
1077 rp->count = count - 1;
1078 state->ptr = ptr;
1081 /* cannot match more repeated items here. make sure the
1082 tail matches */
1083 state->repeat = rp->prev;
1084 i = SRE_MATCH(state, pattern, level + 1);
1085 if (i)
1086 return i;
1087 state->repeat = rp;
1088 state->ptr = ptr;
1089 return 0;
1091 case SRE_OP_MIN_UNTIL:
1092 /* minimizing repeat */
1093 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1095 rp = state->repeat;
1096 if (!rp)
1097 return SRE_ERROR_STATE;
1099 count = rp->count + 1;
1101 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count,
1102 rp->pattern));
1104 state->ptr = ptr;
1106 if (count < rp->pattern[1]) {
1107 /* not enough matches */
1108 rp->count = count;
1109 /* RECURSIVE */
1110 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1111 if (i)
1112 return i;
1113 rp->count = count-1;
1114 state->ptr = ptr;
1115 return 0;
1118 /* see if the tail matches */
1119 state->repeat = rp->prev;
1120 i = SRE_MATCH(state, pattern, level + 1);
1121 if (i)
1122 return i;
1124 state->ptr = ptr;
1125 state->repeat = rp;
1127 if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
1128 return 0;
1130 rp->count = count;
1131 /* RECURSIVE */
1132 i = SRE_MATCH(state, rp->pattern + 3, level + 1);
1133 if (i)
1134 return i;
1135 rp->count = count - 1;
1136 state->ptr = ptr;
1137 return 0;
1139 default:
1140 TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1]));
1141 return SRE_ERROR_ILLEGAL;
1145 /* can't end up here */
1146 /* return SRE_ERROR_ILLEGAL; -- see python-dev discussion */
1149 LOCAL(int)
1150 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1152 SRE_CHAR* ptr = state->start;
1153 SRE_CHAR* end = state->end;
1154 int status = 0;
1155 int prefix_len = 0;
1156 int prefix_skip = 0;
1157 SRE_CODE* prefix = NULL;
1158 SRE_CODE* charset = NULL;
1159 SRE_CODE* overlap = NULL;
1160 int flags = 0;
1162 if (pattern[0] == SRE_OP_INFO) {
1163 /* optimization info block */
1164 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1166 flags = pattern[2];
1168 if (pattern[3] > 0) {
1169 /* adjust end point (but make sure we leave at least one
1170 character in there, so literal search will work) */
1171 end -= pattern[3]-1;
1172 if (end <= ptr)
1173 end = ptr+1;
1176 if (flags & SRE_INFO_PREFIX) {
1177 /* pattern starts with a known prefix */
1178 /* <length> <skip> <prefix data> <overlap data> */
1179 prefix_len = pattern[5];
1180 prefix_skip = pattern[6];
1181 prefix = pattern + 7;
1182 overlap = prefix + prefix_len - 1;
1183 } else if (flags & SRE_INFO_CHARSET)
1184 /* pattern starts with a character from a known set */
1185 /* <charset> */
1186 charset = pattern + 5;
1188 pattern += 1 + pattern[1];
1191 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1192 TRACE(("charset = %p\n", charset));
1194 #if defined(USE_FAST_SEARCH)
1195 if (prefix_len > 1) {
1196 /* pattern starts with a known prefix. use the overlap
1197 table to skip forward as fast as we possibly can */
1198 int i = 0;
1199 end = state->end;
1200 while (ptr < end) {
1201 for (;;) {
1202 if ((SRE_CODE) ptr[0] != prefix[i]) {
1203 if (!i)
1204 break;
1205 else
1206 i = overlap[i];
1207 } else {
1208 if (++i == prefix_len) {
1209 /* found a potential match */
1210 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1211 state->start = ptr + 1 - prefix_len;
1212 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1213 if (flags & SRE_INFO_LITERAL)
1214 return 1; /* we got all of it */
1215 status = SRE_MATCH(state, pattern + 2*prefix_skip, 1);
1216 if (status != 0)
1217 return status;
1218 /* close but no cigar -- try again */
1219 i = overlap[i];
1221 break;
1225 ptr++;
1227 return 0;
1229 #endif
1231 if (pattern[0] == SRE_OP_LITERAL) {
1232 /* pattern starts with a literal character. this is used
1233 for short prefixes, and if fast search is disabled */
1234 SRE_CODE chr = pattern[1];
1235 end = state->end;
1236 for (;;) {
1237 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1238 ptr++;
1239 if (ptr == end)
1240 return 0;
1241 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1242 state->start = ptr;
1243 state->ptr = ++ptr;
1244 if (flags & SRE_INFO_LITERAL)
1245 return 1; /* we got all of it */
1246 status = SRE_MATCH(state, pattern + 2, 1);
1247 if (status != 0)
1248 break;
1250 } else if (charset) {
1251 /* pattern starts with a character from a known set */
1252 end = state->end;
1253 for (;;) {
1254 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1255 ptr++;
1256 if (ptr == end)
1257 return 0;
1258 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1259 state->start = ptr;
1260 state->ptr = ptr;
1261 status = SRE_MATCH(state, pattern, 1);
1262 if (status != 0)
1263 break;
1264 ptr++;
1266 } else
1267 /* general case */
1268 while (ptr <= end) {
1269 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1270 state->start = state->ptr = ptr++;
1271 status = SRE_MATCH(state, pattern, 1);
1272 if (status != 0)
1273 break;
1276 return status;
1279 LOCAL(int)
1280 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1282 /* check if given string is a literal template (i.e. no escapes) */
1283 while (len-- > 0)
1284 if (*ptr++ == '\\')
1285 return 0;
1286 return 1;
1289 #if !defined(SRE_RECURSIVE)
1291 /* -------------------------------------------------------------------- */
1292 /* factories and destructors */
1294 /* see sre.h for object declarations */
1296 static PyTypeObject Pattern_Type;
1297 static PyTypeObject Match_Type;
1298 static PyTypeObject Scanner_Type;
1300 static PyObject *
1301 _compile(PyObject* self_, PyObject* args)
1303 /* "compile" pattern descriptor to pattern object */
1305 PatternObject* self;
1306 int i, n;
1308 PyObject* pattern;
1309 int flags = 0;
1310 PyObject* code;
1311 int groups = 0;
1312 PyObject* groupindex = NULL;
1313 PyObject* indexgroup = NULL;
1314 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1315 &PyList_Type, &code, &groups,
1316 &groupindex, &indexgroup))
1317 return NULL;
1319 n = PyList_GET_SIZE(code);
1321 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
1322 if (!self)
1323 return NULL;
1325 self->codesize = n;
1327 for (i = 0; i < n; i++) {
1328 PyObject *o = PyList_GET_ITEM(code, i);
1329 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1332 if (PyErr_Occurred()) {
1333 PyObject_DEL(self);
1334 return NULL;
1337 Py_INCREF(pattern);
1338 self->pattern = pattern;
1340 self->flags = flags;
1342 self->groups = groups;
1344 Py_XINCREF(groupindex);
1345 self->groupindex = groupindex;
1347 Py_XINCREF(indexgroup);
1348 self->indexgroup = indexgroup;
1350 return (PyObject*) self;
1353 static PyObject *
1354 sre_codesize(PyObject* self, PyObject* args)
1356 return Py_BuildValue("i", sizeof(SRE_CODE));
1359 static PyObject *
1360 sre_getlower(PyObject* self, PyObject* args)
1362 int character, flags;
1363 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1364 return NULL;
1365 if (flags & SRE_FLAG_LOCALE)
1366 return Py_BuildValue("i", sre_lower_locale(character));
1367 if (flags & SRE_FLAG_UNICODE)
1368 #if defined(HAVE_UNICODE)
1369 return Py_BuildValue("i", sre_lower_unicode(character));
1370 #else
1371 return Py_BuildValue("i", sre_lower_locale(character));
1372 #endif
1373 return Py_BuildValue("i", sre_lower(character));
1376 LOCAL(void)
1377 state_reset(SRE_STATE* state)
1379 state->lastmark = 0;
1381 /* FIXME: dynamic! */
1382 memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);
1384 state->lastindex = -1;
1386 state->repeat = NULL;
1388 mark_fini(state);
1391 static void*
1392 getstring(PyObject* string, int* p_length, int* p_charsize)
1394 /* given a python object, return a data pointer, a length (in
1395 characters), and a character size. return NULL if the object
1396 is not a string (or not compatible) */
1398 PyBufferProcs *buffer;
1399 int size, bytes, charsize;
1400 void* ptr;
1402 #if defined(HAVE_UNICODE)
1403 if (PyUnicode_Check(string)) {
1404 /* unicode strings doesn't always support the buffer interface */
1405 ptr = (void*) PyUnicode_AS_DATA(string);
1406 bytes = PyUnicode_GET_DATA_SIZE(string);
1407 size = PyUnicode_GET_SIZE(string);
1408 charsize = sizeof(Py_UNICODE);
1410 } else {
1411 #endif
1413 /* get pointer to string buffer */
1414 buffer = string->ob_type->tp_as_buffer;
1415 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1416 buffer->bf_getsegcount(string, NULL) != 1) {
1417 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1418 return NULL;
1421 /* determine buffer size */
1422 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1423 if (bytes < 0) {
1424 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1425 return NULL;
1428 /* determine character size */
1429 #if PY_VERSION_HEX >= 0x01060000
1430 size = PyObject_Size(string);
1431 #else
1432 size = PyObject_Length(string);
1433 #endif
1435 if (PyString_Check(string) || bytes == size)
1436 charsize = 1;
1437 #if defined(HAVE_UNICODE)
1438 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1439 charsize = sizeof(Py_UNICODE);
1440 #endif
1441 else {
1442 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1443 return NULL;
1446 #if defined(HAVE_UNICODE)
1448 #endif
1450 *p_length = size;
1451 *p_charsize = charsize;
1453 return ptr;
1456 LOCAL(PyObject*)
1457 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1458 int start, int end)
1460 /* prepare state object */
1462 int length;
1463 int charsize;
1464 void* ptr;
1466 memset(state, 0, sizeof(SRE_STATE));
1468 state->lastindex = -1;
1470 ptr = getstring(string, &length, &charsize);
1471 if (!ptr)
1472 return NULL;
1474 /* adjust boundaries */
1475 if (start < 0)
1476 start = 0;
1477 else if (start > length)
1478 start = length;
1480 if (end < 0)
1481 end = 0;
1482 else if (end > length)
1483 end = length;
1485 state->charsize = charsize;
1487 state->beginning = ptr;
1489 state->start = (void*) ((char*) ptr + start * state->charsize);
1490 state->end = (void*) ((char*) ptr + end * state->charsize);
1492 Py_INCREF(string);
1493 state->string = string;
1494 state->pos = start;
1495 state->endpos = end;
1497 if (pattern->flags & SRE_FLAG_LOCALE)
1498 state->lower = sre_lower_locale;
1499 else if (pattern->flags & SRE_FLAG_UNICODE)
1500 #if defined(HAVE_UNICODE)
1501 state->lower = sre_lower_unicode;
1502 #else
1503 state->lower = sre_lower_locale;
1504 #endif
1505 else
1506 state->lower = sre_lower;
1508 return string;
1511 LOCAL(void)
1512 state_fini(SRE_STATE* state)
1514 Py_XDECREF(state->string);
1515 mark_fini(state);
1518 /* calculate offset from start of string */
1519 #define STATE_OFFSET(state, member)\
1520 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1522 LOCAL(PyObject*)
1523 state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
1525 int i, j;
1527 index = (index - 1) * 2;
1529 if (string == Py_None || !state->mark[index] || !state->mark[index+1]) {
1530 if (empty)
1531 /* want empty string */
1532 i = j = 0;
1533 else {
1534 Py_INCREF(Py_None);
1535 return Py_None;
1537 } else {
1538 i = STATE_OFFSET(state, state->mark[index]);
1539 j = STATE_OFFSET(state, state->mark[index+1]);
1542 return PySequence_GetSlice(string, i, j);
1545 static void
1546 pattern_error(int status)
1548 switch (status) {
1549 case SRE_ERROR_RECURSION_LIMIT:
1550 PyErr_SetString(
1551 PyExc_RuntimeError,
1552 "maximum recursion limit exceeded"
1554 break;
1555 case SRE_ERROR_MEMORY:
1556 PyErr_NoMemory();
1557 break;
1558 default:
1559 /* other error codes indicate compiler/engine bugs */
1560 PyErr_SetString(
1561 PyExc_RuntimeError,
1562 "internal error in regular expression engine"
1567 static PyObject*
1568 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
1570 /* create match object (from state object) */
1572 MatchObject* match;
1573 int i, j;
1574 char* base;
1575 int n;
1577 if (status > 0) {
1579 /* create match object (with room for extra group marks) */
1580 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
1581 2*(pattern->groups+1));
1582 if (!match)
1583 return NULL;
1585 Py_INCREF(pattern);
1586 match->pattern = pattern;
1588 Py_INCREF(state->string);
1589 match->string = state->string;
1591 match->regs = NULL;
1592 match->groups = pattern->groups+1;
1594 /* fill in group slices */
1596 base = (char*) state->beginning;
1597 n = state->charsize;
1599 match->mark[0] = ((char*) state->start - base) / n;
1600 match->mark[1] = ((char*) state->ptr - base) / n;
1602 for (i = j = 0; i < pattern->groups; i++, j+=2)
1603 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1604 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1605 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1606 } else
1607 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1609 match->pos = state->pos;
1610 match->endpos = state->endpos;
1612 match->lastindex = state->lastindex;
1614 return (PyObject*) match;
1616 } else if (status == 0) {
1618 /* no match */
1619 Py_INCREF(Py_None);
1620 return Py_None;
1624 /* internal error */
1625 pattern_error(status);
1626 return NULL;
1629 static PyObject*
1630 pattern_scanner(PatternObject* pattern, PyObject* args)
1632 /* create search state object */
1634 ScannerObject* self;
1636 PyObject* string;
1637 int start = 0;
1638 int end = INT_MAX;
1639 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1640 return NULL;
1642 /* create scanner object */
1643 self = PyObject_NEW(ScannerObject, &Scanner_Type);
1644 if (!self)
1645 return NULL;
1647 string = state_init(&self->state, pattern, string, start, end);
1648 if (!string) {
1649 PyObject_DEL(self);
1650 return NULL;
1653 Py_INCREF(pattern);
1654 self->pattern = (PyObject*) pattern;
1656 return (PyObject*) self;
1659 static void
1660 pattern_dealloc(PatternObject* self)
1662 Py_XDECREF(self->pattern);
1663 Py_XDECREF(self->groupindex);
1664 Py_XDECREF(self->indexgroup);
1665 PyObject_DEL(self);
1668 static PyObject*
1669 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1671 SRE_STATE state;
1672 int status;
1674 PyObject* string;
1675 int start = 0;
1676 int end = INT_MAX;
1677 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1678 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
1679 &string, &start, &end))
1680 return NULL;
1682 string = state_init(&state, self, string, start, end);
1683 if (!string)
1684 return NULL;
1686 state.ptr = state.start;
1688 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1690 if (state.charsize == 1) {
1691 status = sre_match(&state, PatternObject_GetCode(self), 1);
1692 } else {
1693 #if defined(HAVE_UNICODE)
1694 status = sre_umatch(&state, PatternObject_GetCode(self), 1);
1695 #endif
1698 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1700 state_fini(&state);
1702 return pattern_new_match(self, &state, status);
1705 static PyObject*
1706 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1708 SRE_STATE state;
1709 int status;
1711 PyObject* string;
1712 int start = 0;
1713 int end = INT_MAX;
1714 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1715 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
1716 &string, &start, &end))
1717 return NULL;
1719 string = state_init(&state, self, string, start, end);
1720 if (!string)
1721 return NULL;
1723 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1725 if (state.charsize == 1) {
1726 status = sre_search(&state, PatternObject_GetCode(self));
1727 } else {
1728 #if defined(HAVE_UNICODE)
1729 status = sre_usearch(&state, PatternObject_GetCode(self));
1730 #endif
1733 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1735 state_fini(&state);
1737 return pattern_new_match(self, &state, status);
1740 static PyObject*
1741 call(char* module, char* function, PyObject* args)
1743 PyObject* name;
1744 PyObject* mod;
1745 PyObject* func;
1746 PyObject* result;
1748 if (!args)
1749 return NULL;
1750 name = PyString_FromString(module);
1751 if (!name)
1752 return NULL;
1753 mod = PyImport_Import(name);
1754 Py_DECREF(name);
1755 if (!mod)
1756 return NULL;
1757 func = PyObject_GetAttrString(mod, function);
1758 Py_DECREF(mod);
1759 if (!func)
1760 return NULL;
1761 result = PyObject_CallObject(func, args);
1762 Py_DECREF(func);
1763 Py_DECREF(args);
1764 return result;
1767 #ifdef USE_BUILTIN_COPY
1768 static int
1769 deepcopy(PyObject** object, PyObject* memo)
1771 PyObject* copy;
1773 copy = call(
1774 "copy", "deepcopy",
1775 Py_BuildValue("OO", *object, memo)
1777 if (!copy)
1778 return 0;
1780 Py_DECREF(*object);
1781 *object = copy;
1783 return 1; /* success */
1785 #endif
1787 static PyObject*
1788 join_list(PyObject* list, PyObject* pattern)
1790 /* join list elements */
1792 PyObject* joiner;
1793 #if PY_VERSION_HEX >= 0x01060000
1794 PyObject* function;
1795 PyObject* args;
1796 #endif
1797 PyObject* result;
1799 switch (PyList_GET_SIZE(list)) {
1800 case 0:
1801 Py_DECREF(list);
1802 return PyString_FromString("");
1803 case 1:
1804 result = PyList_GET_ITEM(list, 0);
1805 Py_INCREF(result);
1806 Py_DECREF(list);
1807 return result;
1810 /* two or more elements: slice out a suitable separator from the
1811 first member, and use that to join the entire list */
1813 joiner = PySequence_GetSlice(pattern, 0, 0);
1814 if (!joiner)
1815 return NULL;
1817 #if PY_VERSION_HEX >= 0x01060000
1818 function = PyObject_GetAttrString(joiner, "join");
1819 if (!function) {
1820 Py_DECREF(joiner);
1821 return NULL;
1823 args = PyTuple_New(1);
1824 if (!args) {
1825 Py_DECREF(function);
1826 Py_DECREF(joiner);
1827 return NULL;
1829 PyTuple_SET_ITEM(args, 0, list);
1830 result = PyObject_CallObject(function, args);
1831 Py_DECREF(args); /* also removes list */
1832 Py_DECREF(function);
1833 #else
1834 result = call(
1835 "string", "join",
1836 Py_BuildValue("OO", list, joiner)
1838 #endif
1839 Py_DECREF(joiner);
1841 return result;
1844 static PyObject*
1845 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
1847 SRE_STATE state;
1848 PyObject* list;
1849 int status;
1850 int i, b, e;
1852 PyObject* string;
1853 int start = 0;
1854 int end = INT_MAX;
1855 static char* kwlist[] = { "source", "pos", "endpos", NULL };
1856 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
1857 &string, &start, &end))
1858 return NULL;
1860 string = state_init(&state, self, string, start, end);
1861 if (!string)
1862 return NULL;
1864 list = PyList_New(0);
1865 if (!list) {
1866 state_fini(&state);
1867 return NULL;
1870 while (state.start <= state.end) {
1872 PyObject* item;
1874 state_reset(&state);
1876 state.ptr = state.start;
1878 if (state.charsize == 1) {
1879 status = sre_search(&state, PatternObject_GetCode(self));
1880 } else {
1881 #if defined(HAVE_UNICODE)
1882 status = sre_usearch(&state, PatternObject_GetCode(self));
1883 #endif
1886 if (status <= 0) {
1887 if (status == 0)
1888 break;
1889 pattern_error(status);
1890 goto error;
1893 /* don't bother to build a match object */
1894 switch (self->groups) {
1895 case 0:
1896 b = STATE_OFFSET(&state, state.start);
1897 e = STATE_OFFSET(&state, state.ptr);
1898 item = PySequence_GetSlice(string, b, e);
1899 if (!item)
1900 goto error;
1901 break;
1902 case 1:
1903 item = state_getslice(&state, 1, string, 1);
1904 if (!item)
1905 goto error;
1906 break;
1907 default:
1908 item = PyTuple_New(self->groups);
1909 if (!item)
1910 goto error;
1911 for (i = 0; i < self->groups; i++) {
1912 PyObject* o = state_getslice(&state, i+1, string, 1);
1913 if (!o) {
1914 Py_DECREF(item);
1915 goto error;
1917 PyTuple_SET_ITEM(item, i, o);
1919 break;
1922 status = PyList_Append(list, item);
1923 Py_DECREF(item);
1924 if (status < 0)
1925 goto error;
1927 if (state.ptr == state.start)
1928 state.start = (void*) ((char*) state.ptr + state.charsize);
1929 else
1930 state.start = state.ptr;
1934 state_fini(&state);
1935 return list;
1937 error:
1938 Py_DECREF(list);
1939 state_fini(&state);
1940 return NULL;
1944 #if PY_VERSION_HEX >= 0x02020000
1945 static PyObject*
1946 pattern_finditer(PatternObject* pattern, PyObject* args)
1948 PyObject* scanner;
1949 PyObject* search;
1950 PyObject* iterator;
1952 scanner = pattern_scanner(pattern, args);
1953 if (!scanner)
1954 return NULL;
1956 search = PyObject_GetAttrString(scanner, "search");
1957 Py_DECREF(scanner);
1958 if (!search)
1959 return NULL;
1961 iterator = PyCallIter_New(search, Py_None);
1962 Py_DECREF(search);
1964 return iterator;
1966 #endif
1968 static PyObject*
1969 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
1971 SRE_STATE state;
1972 PyObject* list;
1973 PyObject* item;
1974 int status;
1975 int n;
1976 int i;
1977 void* last;
1979 PyObject* string;
1980 int maxsplit = 0;
1981 static char* kwlist[] = { "source", "maxsplit", NULL };
1982 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
1983 &string, &maxsplit))
1984 return NULL;
1986 string = state_init(&state, self, string, 0, INT_MAX);
1987 if (!string)
1988 return NULL;
1990 list = PyList_New(0);
1991 if (!list) {
1992 state_fini(&state);
1993 return NULL;
1996 n = 0;
1997 last = state.start;
1999 while (!maxsplit || n < maxsplit) {
2001 state_reset(&state);
2003 state.ptr = state.start;
2005 if (state.charsize == 1) {
2006 status = sre_search(&state, PatternObject_GetCode(self));
2007 } else {
2008 #if defined(HAVE_UNICODE)
2009 status = sre_usearch(&state, PatternObject_GetCode(self));
2010 #endif
2013 if (status <= 0) {
2014 if (status == 0)
2015 break;
2016 pattern_error(status);
2017 goto error;
2020 if (state.start == state.ptr) {
2021 if (last == state.end)
2022 break;
2023 /* skip one character */
2024 state.start = (void*) ((char*) state.ptr + state.charsize);
2025 continue;
2028 /* get segment before this match */
2029 item = PySequence_GetSlice(
2030 string, STATE_OFFSET(&state, last),
2031 STATE_OFFSET(&state, state.start)
2033 if (!item)
2034 goto error;
2035 status = PyList_Append(list, item);
2036 Py_DECREF(item);
2037 if (status < 0)
2038 goto error;
2040 /* add groups (if any) */
2041 for (i = 0; i < self->groups; i++) {
2042 item = state_getslice(&state, i+1, string, 0);
2043 if (!item)
2044 goto error;
2045 status = PyList_Append(list, item);
2046 Py_DECREF(item);
2047 if (status < 0)
2048 goto error;
2051 n = n + 1;
2053 last = state.start = state.ptr;
2057 /* get segment following last match (even if empty) */
2058 item = PySequence_GetSlice(
2059 string, STATE_OFFSET(&state, last), state.endpos
2061 if (!item)
2062 goto error;
2063 status = PyList_Append(list, item);
2064 Py_DECREF(item);
2065 if (status < 0)
2066 goto error;
2068 state_fini(&state);
2069 return list;
2071 error:
2072 Py_DECREF(list);
2073 state_fini(&state);
2074 return NULL;
2078 static PyObject*
2079 pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2080 int count, int subn)
2082 SRE_STATE state;
2083 PyObject* list;
2084 PyObject* item;
2085 PyObject* filter;
2086 PyObject* args;
2087 PyObject* match;
2088 void* ptr;
2089 int status;
2090 int n;
2091 int i, b, e;
2092 int filter_is_callable;
2094 if (PyCallable_Check(template)) {
2095 /* sub/subn takes either a function or a template */
2096 filter = template;
2097 Py_INCREF(filter);
2098 filter_is_callable = 1;
2099 } else {
2100 /* if not callable, check if it's a literal string */
2101 int literal;
2102 ptr = getstring(template, &n, &b);
2103 if (ptr) {
2104 if (b == 1) {
2105 literal = sre_literal_template(ptr, n);
2106 } else {
2107 #if defined(HAVE_UNICODE)
2108 literal = sre_uliteral_template(ptr, n);
2109 #endif
2111 } else {
2112 PyErr_Clear();
2113 literal = 0;
2115 if (literal) {
2116 filter = template;
2117 Py_INCREF(filter);
2118 filter_is_callable = 0;
2119 } else {
2120 /* not a literal; hand it over to the template compiler */
2121 filter = call(
2122 SRE_MODULE, "_subx",
2123 Py_BuildValue("OO", self, template)
2125 if (!filter)
2126 return NULL;
2127 filter_is_callable = PyCallable_Check(filter);
2131 string = state_init(&state, self, string, 0, INT_MAX);
2132 if (!string) {
2133 Py_DECREF(filter);
2134 return NULL;
2137 list = PyList_New(0);
2138 if (!list) {
2139 Py_DECREF(filter);
2140 state_fini(&state);
2141 return NULL;
2144 n = i = 0;
2146 while (!count || n < count) {
2148 state_reset(&state);
2150 state.ptr = state.start;
2152 if (state.charsize == 1) {
2153 status = sre_search(&state, PatternObject_GetCode(self));
2154 } else {
2155 #if defined(HAVE_UNICODE)
2156 status = sre_usearch(&state, PatternObject_GetCode(self));
2157 #endif
2160 if (status <= 0) {
2161 if (status == 0)
2162 break;
2163 pattern_error(status);
2164 goto error;
2167 b = STATE_OFFSET(&state, state.start);
2168 e = STATE_OFFSET(&state, state.ptr);
2170 if (i < b) {
2171 /* get segment before this match */
2172 item = PySequence_GetSlice(string, i, b);
2173 if (!item)
2174 goto error;
2175 status = PyList_Append(list, item);
2176 Py_DECREF(item);
2177 if (status < 0)
2178 goto error;
2180 } else if (i == b && i == e && n > 0)
2181 /* ignore empty match on latest position */
2182 goto next;
2184 if (filter_is_callable) {
2185 /* pass match object through filter */
2186 match = pattern_new_match(self, &state, 1);
2187 if (!match)
2188 goto error;
2189 args = Py_BuildValue("(O)", match);
2190 if (!args) {
2191 Py_DECREF(match);
2192 goto error;
2194 item = PyObject_CallObject(filter, args);
2195 Py_DECREF(args);
2196 Py_DECREF(match);
2197 if (!item)
2198 goto error;
2199 } else {
2200 /* filter is literal string */
2201 item = filter;
2202 Py_INCREF(item);
2205 /* add to list */
2206 if (item != Py_None) {
2207 status = PyList_Append(list, item);
2208 Py_DECREF(item);
2209 if (status < 0)
2210 goto error;
2213 i = e;
2214 n = n + 1;
2216 next:
2217 /* move on */
2218 if (state.ptr == state.start)
2219 state.start = (void*) ((char*) state.ptr + state.charsize);
2220 else
2221 state.start = state.ptr;
2225 /* get segment following last match */
2226 if (i < state.endpos) {
2227 item = PySequence_GetSlice(string, i, state.endpos);
2228 if (!item)
2229 goto error;
2230 status = PyList_Append(list, item);
2231 Py_DECREF(item);
2232 if (status < 0)
2233 goto error;
2236 state_fini(&state);
2238 Py_DECREF(filter);
2240 /* convert list to single string (also removes list) */
2241 item = join_list(list, self->pattern);
2243 if (!item)
2244 return NULL;
2246 if (subn)
2247 return Py_BuildValue("Ni", item, n);
2249 return item;
2251 error:
2252 Py_DECREF(list);
2253 state_fini(&state);
2254 Py_DECREF(filter);
2255 return NULL;
2259 static PyObject*
2260 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2262 PyObject* template;
2263 PyObject* string;
2264 int count = 0;
2265 static char* kwlist[] = { "repl", "string", "count", NULL };
2266 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2267 &template, &string, &count))
2268 return NULL;
2270 return pattern_subx(self, template, string, count, 0);
2273 static PyObject*
2274 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2276 PyObject* template;
2277 PyObject* string;
2278 int count = 0;
2279 static char* kwlist[] = { "repl", "string", "count", NULL };
2280 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2281 &template, &string, &count))
2282 return NULL;
2284 return pattern_subx(self, template, string, count, 1);
2287 static PyObject*
2288 pattern_copy(PatternObject* self, PyObject* args)
2290 #ifdef USE_BUILTIN_COPY
2291 PatternObject* copy;
2292 int offset;
2294 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2295 return NULL;
2297 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2298 if (!copy)
2299 return NULL;
2301 offset = offsetof(PatternObject, groups);
2303 Py_XINCREF(self->groupindex);
2304 Py_XINCREF(self->indexgroup);
2305 Py_XINCREF(self->pattern);
2307 memcpy((char*) copy + offset, (char*) self + offset,
2308 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2310 return (PyObject*) copy;
2311 #else
2312 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2313 return NULL;
2314 #endif
2317 static PyObject*
2318 pattern_deepcopy(PatternObject* self, PyObject* args)
2320 #ifdef USE_BUILTIN_COPY
2321 PatternObject* copy;
2323 PyObject* memo;
2324 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2325 return NULL;
2327 copy = (PatternObject*) pattern_copy(self, Py_None);
2328 if (!copy)
2329 return NULL;
2331 if (!deepcopy(&copy->groupindex, memo) ||
2332 !deepcopy(&copy->indexgroup, memo) ||
2333 !deepcopy(&copy->pattern, memo)) {
2334 Py_DECREF(copy);
2335 return NULL;
2338 #else
2339 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2340 return NULL;
2341 #endif
2344 static PyMethodDef pattern_methods[] = {
2345 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
2346 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
2347 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
2348 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
2349 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
2350 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
2351 #if PY_VERSION_HEX >= 0x02020000
2352 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS},
2353 #endif
2354 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2355 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2356 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
2357 {NULL, NULL}
2360 static PyObject*
2361 pattern_getattr(PatternObject* self, char* name)
2363 PyObject* res;
2365 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2367 if (res)
2368 return res;
2370 PyErr_Clear();
2372 /* attributes */
2373 if (!strcmp(name, "pattern")) {
2374 Py_INCREF(self->pattern);
2375 return self->pattern;
2378 if (!strcmp(name, "flags"))
2379 return Py_BuildValue("i", self->flags);
2381 if (!strcmp(name, "groups"))
2382 return Py_BuildValue("i", self->groups);
2384 if (!strcmp(name, "groupindex") && self->groupindex) {
2385 Py_INCREF(self->groupindex);
2386 return self->groupindex;
2389 PyErr_SetString(PyExc_AttributeError, name);
2390 return NULL;
2393 statichere PyTypeObject Pattern_Type = {
2394 PyObject_HEAD_INIT(NULL)
2395 0, "_" SRE_MODULE ".SRE_Pattern",
2396 sizeof(PatternObject), sizeof(SRE_CODE),
2397 (destructor)pattern_dealloc, /*tp_dealloc*/
2398 0, /*tp_print*/
2399 (getattrfunc)pattern_getattr /*tp_getattr*/
2402 /* -------------------------------------------------------------------- */
2403 /* match methods */
2405 static void
2406 match_dealloc(MatchObject* self)
2408 Py_XDECREF(self->regs);
2409 Py_XDECREF(self->string);
2410 Py_DECREF(self->pattern);
2411 PyObject_DEL(self);
2414 static PyObject*
2415 match_getslice_by_index(MatchObject* self, int index, PyObject* def)
2417 if (index < 0 || index >= self->groups) {
2418 /* raise IndexError if we were given a bad group number */
2419 PyErr_SetString(
2420 PyExc_IndexError,
2421 "no such group"
2423 return NULL;
2426 index *= 2;
2428 if (self->string == Py_None || self->mark[index] < 0) {
2429 /* return default value if the string or group is undefined */
2430 Py_INCREF(def);
2431 return def;
2434 return PySequence_GetSlice(
2435 self->string, self->mark[index], self->mark[index+1]
2439 static int
2440 match_getindex(MatchObject* self, PyObject* index)
2442 int i;
2444 if (PyInt_Check(index))
2445 return (int) PyInt_AS_LONG(index);
2447 i = -1;
2449 if (self->pattern->groupindex) {
2450 index = PyObject_GetItem(self->pattern->groupindex, index);
2451 if (index) {
2452 if (PyInt_Check(index))
2453 i = (int) PyInt_AS_LONG(index);
2454 Py_DECREF(index);
2455 } else
2456 PyErr_Clear();
2459 return i;
2462 static PyObject*
2463 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2465 return match_getslice_by_index(self, match_getindex(self, index), def);
2468 static PyObject*
2469 match_expand(MatchObject* self, PyObject* args)
2471 PyObject* template;
2472 if (!PyArg_ParseTuple(args, "O:expand", &template))
2473 return NULL;
2475 /* delegate to Python code */
2476 return call(
2477 SRE_MODULE, "_expand",
2478 Py_BuildValue("OOO", self->pattern, self, template)
2482 static PyObject*
2483 match_group(MatchObject* self, PyObject* args)
2485 PyObject* result;
2486 int i, size;
2488 size = PyTuple_GET_SIZE(args);
2490 switch (size) {
2491 case 0:
2492 result = match_getslice(self, Py_False, Py_None);
2493 break;
2494 case 1:
2495 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2496 break;
2497 default:
2498 /* fetch multiple items */
2499 result = PyTuple_New(size);
2500 if (!result)
2501 return NULL;
2502 for (i = 0; i < size; i++) {
2503 PyObject* item = match_getslice(
2504 self, PyTuple_GET_ITEM(args, i), Py_None
2506 if (!item) {
2507 Py_DECREF(result);
2508 return NULL;
2510 PyTuple_SET_ITEM(result, i, item);
2512 break;
2514 return result;
2517 static PyObject*
2518 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2520 PyObject* result;
2521 int index;
2523 PyObject* def = Py_None;
2524 static char* kwlist[] = { "default", NULL };
2525 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2526 return NULL;
2528 result = PyTuple_New(self->groups-1);
2529 if (!result)
2530 return NULL;
2532 for (index = 1; index < self->groups; index++) {
2533 PyObject* item;
2534 item = match_getslice_by_index(self, index, def);
2535 if (!item) {
2536 Py_DECREF(result);
2537 return NULL;
2539 PyTuple_SET_ITEM(result, index-1, item);
2542 return result;
2545 static PyObject*
2546 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2548 PyObject* result;
2549 PyObject* keys;
2550 int index;
2552 PyObject* def = Py_None;
2553 static char* kwlist[] = { "default", NULL };
2554 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2555 return NULL;
2557 result = PyDict_New();
2558 if (!result || !self->pattern->groupindex)
2559 return result;
2561 keys = PyMapping_Keys(self->pattern->groupindex);
2562 if (!keys)
2563 goto failed;
2565 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2566 int status;
2567 PyObject* key;
2568 PyObject* value;
2569 key = PyList_GET_ITEM(keys, index);
2570 if (!key)
2571 goto failed;
2572 value = match_getslice(self, key, def);
2573 if (!value) {
2574 Py_DECREF(key);
2575 goto failed;
2577 status = PyDict_SetItem(result, key, value);
2578 Py_DECREF(value);
2579 if (status < 0)
2580 goto failed;
2583 Py_DECREF(keys);
2585 return result;
2587 failed:
2588 Py_DECREF(keys);
2589 Py_DECREF(result);
2590 return NULL;
2593 static PyObject*
2594 match_start(MatchObject* self, PyObject* args)
2596 int index;
2598 PyObject* index_ = Py_False; /* zero */
2599 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2600 return NULL;
2602 index = match_getindex(self, index_);
2604 if (index < 0 || index >= self->groups) {
2605 PyErr_SetString(
2606 PyExc_IndexError,
2607 "no such group"
2609 return NULL;
2612 /* mark is -1 if group is undefined */
2613 return Py_BuildValue("i", self->mark[index*2]);
2616 static PyObject*
2617 match_end(MatchObject* self, PyObject* args)
2619 int index;
2621 PyObject* index_ = Py_False; /* zero */
2622 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2623 return NULL;
2625 index = match_getindex(self, index_);
2627 if (index < 0 || index >= self->groups) {
2628 PyErr_SetString(
2629 PyExc_IndexError,
2630 "no such group"
2632 return NULL;
2635 /* mark is -1 if group is undefined */
2636 return Py_BuildValue("i", self->mark[index*2+1]);
2639 LOCAL(PyObject*)
2640 _pair(int i1, int i2)
2642 PyObject* pair;
2643 PyObject* item;
2645 pair = PyTuple_New(2);
2646 if (!pair)
2647 return NULL;
2649 item = PyInt_FromLong(i1);
2650 if (!item)
2651 goto error;
2652 PyTuple_SET_ITEM(pair, 0, item);
2654 item = PyInt_FromLong(i2);
2655 if (!item)
2656 goto error;
2657 PyTuple_SET_ITEM(pair, 1, item);
2659 return pair;
2661 error:
2662 Py_DECREF(pair);
2663 return NULL;
2666 static PyObject*
2667 match_span(MatchObject* self, PyObject* args)
2669 int index;
2671 PyObject* index_ = Py_False; /* zero */
2672 if (!PyArg_ParseTuple(args, "|O:span", &index_))
2673 return NULL;
2675 index = match_getindex(self, index_);
2677 if (index < 0 || index >= self->groups) {
2678 PyErr_SetString(
2679 PyExc_IndexError,
2680 "no such group"
2682 return NULL;
2685 /* marks are -1 if group is undefined */
2686 return _pair(self->mark[index*2], self->mark[index*2+1]);
2689 static PyObject*
2690 match_regs(MatchObject* self)
2692 PyObject* regs;
2693 PyObject* item;
2694 int index;
2696 regs = PyTuple_New(self->groups);
2697 if (!regs)
2698 return NULL;
2700 for (index = 0; index < self->groups; index++) {
2701 item = _pair(self->mark[index*2], self->mark[index*2+1]);
2702 if (!item) {
2703 Py_DECREF(regs);
2704 return NULL;
2706 PyTuple_SET_ITEM(regs, index, item);
2709 Py_INCREF(regs);
2710 self->regs = regs;
2712 return regs;
2715 static PyObject*
2716 match_copy(MatchObject* self, PyObject* args)
2718 #ifdef USE_BUILTIN_COPY
2719 MatchObject* copy;
2720 int slots, offset;
2722 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2723 return NULL;
2725 slots = 2 * (self->pattern->groups+1);
2727 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
2728 if (!copy)
2729 return NULL;
2731 /* this value a constant, but any compiler should be able to
2732 figure that out all by itself */
2733 offset = offsetof(MatchObject, string);
2735 Py_XINCREF(self->pattern);
2736 Py_XINCREF(self->string);
2737 Py_XINCREF(self->regs);
2739 memcpy((char*) copy + offset, (char*) self + offset,
2740 sizeof(MatchObject) + slots * sizeof(int) - offset);
2742 return (PyObject*) copy;
2743 #else
2744 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
2745 return NULL;
2746 #endif
2749 static PyObject*
2750 match_deepcopy(MatchObject* self, PyObject* args)
2752 #ifdef USE_BUILTIN_COPY
2753 MatchObject* copy;
2755 PyObject* memo;
2756 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2757 return NULL;
2759 copy = (MatchObject*) match_copy(self, Py_None);
2760 if (!copy)
2761 return NULL;
2763 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
2764 !deepcopy(&copy->string, memo) ||
2765 !deepcopy(&copy->regs, memo)) {
2766 Py_DECREF(copy);
2767 return NULL;
2770 #else
2771 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
2772 return NULL;
2773 #endif
2776 static PyMethodDef match_methods[] = {
2777 {"group", (PyCFunction) match_group, METH_VARARGS},
2778 {"start", (PyCFunction) match_start, METH_VARARGS},
2779 {"end", (PyCFunction) match_end, METH_VARARGS},
2780 {"span", (PyCFunction) match_span, METH_VARARGS},
2781 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
2782 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
2783 {"expand", (PyCFunction) match_expand, METH_VARARGS},
2784 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
2785 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
2786 {NULL, NULL}
2789 static PyObject*
2790 match_getattr(MatchObject* self, char* name)
2792 PyObject* res;
2794 res = Py_FindMethod(match_methods, (PyObject*) self, name);
2795 if (res)
2796 return res;
2798 PyErr_Clear();
2800 if (!strcmp(name, "lastindex")) {
2801 if (self->lastindex >= 0)
2802 return Py_BuildValue("i", self->lastindex);
2803 Py_INCREF(Py_None);
2804 return Py_None;
2807 if (!strcmp(name, "lastgroup")) {
2808 if (self->pattern->indexgroup && self->lastindex >= 0) {
2809 PyObject* result = PySequence_GetItem(
2810 self->pattern->indexgroup, self->lastindex
2812 if (result)
2813 return result;
2814 PyErr_Clear();
2816 Py_INCREF(Py_None);
2817 return Py_None;
2820 if (!strcmp(name, "string")) {
2821 if (self->string) {
2822 Py_INCREF(self->string);
2823 return self->string;
2824 } else {
2825 Py_INCREF(Py_None);
2826 return Py_None;
2830 if (!strcmp(name, "regs")) {
2831 if (self->regs) {
2832 Py_INCREF(self->regs);
2833 return self->regs;
2834 } else
2835 return match_regs(self);
2838 if (!strcmp(name, "re")) {
2839 Py_INCREF(self->pattern);
2840 return (PyObject*) self->pattern;
2843 if (!strcmp(name, "pos"))
2844 return Py_BuildValue("i", self->pos);
2846 if (!strcmp(name, "endpos"))
2847 return Py_BuildValue("i", self->endpos);
2849 PyErr_SetString(PyExc_AttributeError, name);
2850 return NULL;
2853 /* FIXME: implement setattr("string", None) as a special case (to
2854 detach the associated string, if any */
2856 statichere PyTypeObject Match_Type = {
2857 PyObject_HEAD_INIT(NULL)
2858 0, "_" SRE_MODULE ".SRE_Match",
2859 sizeof(MatchObject), sizeof(int),
2860 (destructor)match_dealloc, /*tp_dealloc*/
2861 0, /*tp_print*/
2862 (getattrfunc)match_getattr /*tp_getattr*/
2865 /* -------------------------------------------------------------------- */
2866 /* scanner methods (experimental) */
2868 static void
2869 scanner_dealloc(ScannerObject* self)
2871 state_fini(&self->state);
2872 Py_DECREF(self->pattern);
2873 PyObject_DEL(self);
2876 static PyObject*
2877 scanner_match(ScannerObject* self, PyObject* args)
2879 SRE_STATE* state = &self->state;
2880 PyObject* match;
2881 int status;
2883 state_reset(state);
2885 state->ptr = state->start;
2887 if (state->charsize == 1) {
2888 status = sre_match(state, PatternObject_GetCode(self->pattern), 1);
2889 } else {
2890 #if defined(HAVE_UNICODE)
2891 status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1);
2892 #endif
2895 match = pattern_new_match((PatternObject*) self->pattern,
2896 state, status);
2898 if (status == 0 || state->ptr == state->start)
2899 state->start = (void*) ((char*) state->ptr + state->charsize);
2900 else
2901 state->start = state->ptr;
2903 return match;
2907 static PyObject*
2908 scanner_search(ScannerObject* self, PyObject* args)
2910 SRE_STATE* state = &self->state;
2911 PyObject* match;
2912 int status;
2914 state_reset(state);
2916 state->ptr = state->start;
2918 if (state->charsize == 1) {
2919 status = sre_search(state, PatternObject_GetCode(self->pattern));
2920 } else {
2921 #if defined(HAVE_UNICODE)
2922 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
2923 #endif
2926 match = pattern_new_match((PatternObject*) self->pattern,
2927 state, status);
2929 if (status == 0 || state->ptr == state->start)
2930 state->start = (void*) ((char*) state->ptr + state->charsize);
2931 else
2932 state->start = state->ptr;
2934 return match;
2937 static PyMethodDef scanner_methods[] = {
2938 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
2939 /* METH_OLDARGS is not in Python 1.5.2 */
2940 {"match", (PyCFunction) scanner_match, 0},
2941 {"search", (PyCFunction) scanner_search, 0},
2942 {NULL, NULL}
2945 static PyObject*
2946 scanner_getattr(ScannerObject* self, char* name)
2948 PyObject* res;
2950 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
2951 if (res)
2952 return res;
2954 PyErr_Clear();
2956 /* attributes */
2957 if (!strcmp(name, "pattern")) {
2958 Py_INCREF(self->pattern);
2959 return self->pattern;
2962 PyErr_SetString(PyExc_AttributeError, name);
2963 return NULL;
2966 statichere PyTypeObject Scanner_Type = {
2967 PyObject_HEAD_INIT(NULL)
2968 0, "_" SRE_MODULE ".SRE_Scanner",
2969 sizeof(ScannerObject), 0,
2970 (destructor)scanner_dealloc, /*tp_dealloc*/
2971 0, /*tp_print*/
2972 (getattrfunc)scanner_getattr, /*tp_getattr*/
2975 static PyMethodDef _functions[] = {
2976 {"compile", _compile, METH_VARARGS},
2977 {"getcodesize", sre_codesize, METH_VARARGS},
2978 {"getlower", sre_getlower, METH_VARARGS},
2979 {NULL, NULL}
2982 PyMODINIT_FUNC init_sre(void)
2984 PyObject* m;
2985 PyObject* d;
2986 PyObject* x;
2988 /* Patch object types */
2989 Pattern_Type.ob_type = Match_Type.ob_type =
2990 Scanner_Type.ob_type = &PyType_Type;
2992 m = Py_InitModule("_" SRE_MODULE, _functions);
2993 d = PyModule_GetDict(m);
2995 x = PyInt_FromLong(SRE_MAGIC);
2996 if (x) {
2997 PyDict_SetItemString(d, "MAGIC", x);
2998 Py_DECREF(x);
3001 x = PyString_FromString(copyright);
3002 if (x) {
3003 PyDict_SetItemString(d, "copyright", x);
3004 Py_DECREF(x);
3008 #endif /* !defined(SRE_RECURSIVE) */