Whitespace normalization.
[python/dscho.git] / Modules / _sre.c
blob6ee0bb82d3b07338c335a4c083aa655f34a3c10f
1 /*
2 * Secret Labs' Regular Expression Engine
4 * regular expression matching engine
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
37 #ifdef __SUNPRO_C
38 #pragma error_messages (off,E_END_OF_LOOP_CODE_NOT_REACHED)
39 #endif
41 #ifndef SRE_RECURSIVE
43 static char copyright[] =
44 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
46 #include "Python.h"
47 #include "structmember.h" /* offsetof */
49 #include "sre.h"
51 #include <ctype.h>
53 /* name of this module, minus the leading underscore */
54 #if !defined(SRE_MODULE)
55 #define SRE_MODULE "sre"
56 #endif
58 /* defining this one enables tracing */
59 #undef VERBOSE
61 #if PY_VERSION_HEX >= 0x01060000
62 #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63 /* defining this enables unicode support (default under 1.6a1 and later) */
64 #define HAVE_UNICODE
65 #endif
66 #endif
68 /* -------------------------------------------------------------------- */
69 /* optional features */
71 /* enables fast searching */
72 #define USE_FAST_SEARCH
74 /* enables aggressive inlining (always on for Visual C) */
75 #undef USE_INLINE
77 /* enables copy/deepcopy handling (work in progress) */
78 #undef USE_BUILTIN_COPY
80 #if PY_VERSION_HEX < 0x01060000
81 #define PyObject_DEL(op) PyMem_DEL((op))
82 #endif
84 /* -------------------------------------------------------------------- */
86 #if defined(_MSC_VER)
87 #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88 #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89 /* fastest possible local call under MSVC */
90 #define LOCAL(type) static __inline type __fastcall
91 #elif defined(USE_INLINE)
92 #define LOCAL(type) static inline type
93 #else
94 #define LOCAL(type) static type
95 #endif
97 /* error codes */
98 #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99 #define SRE_ERROR_STATE -2 /* illegal state */
100 #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101 #define SRE_ERROR_MEMORY -9 /* out of memory */
103 #if defined(VERBOSE)
104 #define TRACE(v) printf v
105 #else
106 #define TRACE(v)
107 #endif
109 /* -------------------------------------------------------------------- */
110 /* search engine state */
112 /* default character predicates (run sre_chars.py to regenerate tables) */
114 #define SRE_DIGIT_MASK 1
115 #define SRE_SPACE_MASK 2
116 #define SRE_LINEBREAK_MASK 4
117 #define SRE_ALNUM_MASK 8
118 #define SRE_WORD_MASK 16
120 /* FIXME: this assumes ASCII. create tables in init_sre() instead */
122 static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
123 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
125 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
126 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
127 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
128 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130 static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
131 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
132 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
133 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
134 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
135 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
136 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
137 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
138 120, 121, 122, 123, 124, 125, 126, 127 };
140 #define SRE_IS_DIGIT(ch)\
141 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
142 #define SRE_IS_SPACE(ch)\
143 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
144 #define SRE_IS_LINEBREAK(ch)\
145 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
146 #define SRE_IS_ALNUM(ch)\
147 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
148 #define SRE_IS_WORD(ch)\
149 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151 static unsigned int sre_lower(unsigned int ch)
153 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
156 /* locale-specific character predicates */
157 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
158 * warnings when c's type supports only numbers < N+1 */
159 #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
160 #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
161 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
162 #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
163 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165 static unsigned int sre_lower_locale(unsigned int ch)
167 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
170 /* unicode-specific character predicates */
172 #if defined(HAVE_UNICODE)
174 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
175 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
176 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
177 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
178 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180 static unsigned int sre_lower_unicode(unsigned int ch)
182 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
185 #endif
187 LOCAL(int)
188 sre_category(SRE_CODE category, unsigned int ch)
190 switch (category) {
192 case SRE_CATEGORY_DIGIT:
193 return SRE_IS_DIGIT(ch);
194 case SRE_CATEGORY_NOT_DIGIT:
195 return !SRE_IS_DIGIT(ch);
196 case SRE_CATEGORY_SPACE:
197 return SRE_IS_SPACE(ch);
198 case SRE_CATEGORY_NOT_SPACE:
199 return !SRE_IS_SPACE(ch);
200 case SRE_CATEGORY_WORD:
201 return SRE_IS_WORD(ch);
202 case SRE_CATEGORY_NOT_WORD:
203 return !SRE_IS_WORD(ch);
204 case SRE_CATEGORY_LINEBREAK:
205 return SRE_IS_LINEBREAK(ch);
206 case SRE_CATEGORY_NOT_LINEBREAK:
207 return !SRE_IS_LINEBREAK(ch);
209 case SRE_CATEGORY_LOC_WORD:
210 return SRE_LOC_IS_WORD(ch);
211 case SRE_CATEGORY_LOC_NOT_WORD:
212 return !SRE_LOC_IS_WORD(ch);
214 #if defined(HAVE_UNICODE)
215 case SRE_CATEGORY_UNI_DIGIT:
216 return SRE_UNI_IS_DIGIT(ch);
217 case SRE_CATEGORY_UNI_NOT_DIGIT:
218 return !SRE_UNI_IS_DIGIT(ch);
219 case SRE_CATEGORY_UNI_SPACE:
220 return SRE_UNI_IS_SPACE(ch);
221 case SRE_CATEGORY_UNI_NOT_SPACE:
222 return !SRE_UNI_IS_SPACE(ch);
223 case SRE_CATEGORY_UNI_WORD:
224 return SRE_UNI_IS_WORD(ch);
225 case SRE_CATEGORY_UNI_NOT_WORD:
226 return !SRE_UNI_IS_WORD(ch);
227 case SRE_CATEGORY_UNI_LINEBREAK:
228 return SRE_UNI_IS_LINEBREAK(ch);
229 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
230 return !SRE_UNI_IS_LINEBREAK(ch);
231 #else
232 case SRE_CATEGORY_UNI_DIGIT:
233 return SRE_IS_DIGIT(ch);
234 case SRE_CATEGORY_UNI_NOT_DIGIT:
235 return !SRE_IS_DIGIT(ch);
236 case SRE_CATEGORY_UNI_SPACE:
237 return SRE_IS_SPACE(ch);
238 case SRE_CATEGORY_UNI_NOT_SPACE:
239 return !SRE_IS_SPACE(ch);
240 case SRE_CATEGORY_UNI_WORD:
241 return SRE_LOC_IS_WORD(ch);
242 case SRE_CATEGORY_UNI_NOT_WORD:
243 return !SRE_LOC_IS_WORD(ch);
244 case SRE_CATEGORY_UNI_LINEBREAK:
245 return SRE_IS_LINEBREAK(ch);
246 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
247 return !SRE_IS_LINEBREAK(ch);
248 #endif
250 return 0;
253 /* helpers */
255 static void
256 data_stack_dealloc(SRE_STATE* state)
258 if (state->data_stack) {
259 free(state->data_stack);
260 state->data_stack = NULL;
262 state->data_stack_size = state->data_stack_base = 0;
265 static int
266 data_stack_grow(SRE_STATE* state, int size)
268 int minsize, cursize;
269 minsize = state->data_stack_base+size;
270 cursize = state->data_stack_size;
271 if (cursize < minsize) {
272 void* stack;
273 cursize = minsize+minsize/4+1024;
274 TRACE(("allocate/grow stack %d\n", cursize));
275 stack = realloc(state->data_stack, cursize);
276 if (!stack) {
277 data_stack_dealloc(state);
278 return SRE_ERROR_MEMORY;
280 state->data_stack = stack;
281 state->data_stack_size = cursize;
283 return 0;
286 /* generate 8-bit version */
288 #define SRE_CHAR unsigned char
289 #define SRE_AT sre_at
290 #define SRE_COUNT sre_count
291 #define SRE_CHARSET sre_charset
292 #define SRE_INFO sre_info
293 #define SRE_MATCH sre_match
294 #define SRE_MATCH_CONTEXT sre_match_context
295 #define SRE_SEARCH sre_search
296 #define SRE_LITERAL_TEMPLATE sre_literal_template
298 #if defined(HAVE_UNICODE)
300 #define SRE_RECURSIVE
301 #include "_sre.c"
302 #undef SRE_RECURSIVE
304 #undef SRE_LITERAL_TEMPLATE
305 #undef SRE_SEARCH
306 #undef SRE_MATCH
307 #undef SRE_MATCH_CONTEXT
308 #undef SRE_INFO
309 #undef SRE_CHARSET
310 #undef SRE_COUNT
311 #undef SRE_AT
312 #undef SRE_CHAR
314 /* generate 16-bit unicode version */
316 #define SRE_CHAR Py_UNICODE
317 #define SRE_AT sre_uat
318 #define SRE_COUNT sre_ucount
319 #define SRE_CHARSET sre_ucharset
320 #define SRE_INFO sre_uinfo
321 #define SRE_MATCH sre_umatch
322 #define SRE_MATCH_CONTEXT sre_umatch_context
323 #define SRE_SEARCH sre_usearch
324 #define SRE_LITERAL_TEMPLATE sre_uliteral_template
325 #endif
327 #endif /* SRE_RECURSIVE */
329 /* -------------------------------------------------------------------- */
330 /* String matching engine */
332 /* the following section is compiled twice, with different character
333 settings */
335 LOCAL(int)
336 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338 /* check if pointer is at given position */
340 int this, that;
342 switch (at) {
344 case SRE_AT_BEGINNING:
345 case SRE_AT_BEGINNING_STRING:
346 return ((void*) ptr == state->beginning);
348 case SRE_AT_BEGINNING_LINE:
349 return ((void*) ptr == state->beginning ||
350 SRE_IS_LINEBREAK((int) ptr[-1]));
352 case SRE_AT_END:
353 return (((void*) (ptr+1) == state->end &&
354 SRE_IS_LINEBREAK((int) ptr[0])) ||
355 ((void*) ptr == state->end));
357 case SRE_AT_END_LINE:
358 return ((void*) ptr == state->end ||
359 SRE_IS_LINEBREAK((int) ptr[0]));
361 case SRE_AT_END_STRING:
362 return ((void*) ptr == state->end);
364 case SRE_AT_BOUNDARY:
365 if (state->beginning == state->end)
366 return 0;
367 that = ((void*) ptr > state->beginning) ?
368 SRE_IS_WORD((int) ptr[-1]) : 0;
369 this = ((void*) ptr < state->end) ?
370 SRE_IS_WORD((int) ptr[0]) : 0;
371 return this != that;
373 case SRE_AT_NON_BOUNDARY:
374 if (state->beginning == state->end)
375 return 0;
376 that = ((void*) ptr > state->beginning) ?
377 SRE_IS_WORD((int) ptr[-1]) : 0;
378 this = ((void*) ptr < state->end) ?
379 SRE_IS_WORD((int) ptr[0]) : 0;
380 return this == that;
382 case SRE_AT_LOC_BOUNDARY:
383 if (state->beginning == state->end)
384 return 0;
385 that = ((void*) ptr > state->beginning) ?
386 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
387 this = ((void*) ptr < state->end) ?
388 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
389 return this != that;
391 case SRE_AT_LOC_NON_BOUNDARY:
392 if (state->beginning == state->end)
393 return 0;
394 that = ((void*) ptr > state->beginning) ?
395 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
396 this = ((void*) ptr < state->end) ?
397 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
398 return this == that;
400 #if defined(HAVE_UNICODE)
401 case SRE_AT_UNI_BOUNDARY:
402 if (state->beginning == state->end)
403 return 0;
404 that = ((void*) ptr > state->beginning) ?
405 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
406 this = ((void*) ptr < state->end) ?
407 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
408 return this != that;
410 case SRE_AT_UNI_NON_BOUNDARY:
411 if (state->beginning == state->end)
412 return 0;
413 that = ((void*) ptr > state->beginning) ?
414 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
415 this = ((void*) ptr < state->end) ?
416 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
417 return this == that;
418 #endif
422 return 0;
425 LOCAL(int)
426 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428 /* check if character is a member of the given set */
430 int ok = 1;
432 for (;;) {
433 switch (*set++) {
435 case SRE_OP_FAILURE:
436 return !ok;
438 case SRE_OP_LITERAL:
439 /* <LITERAL> <code> */
440 if (ch == set[0])
441 return ok;
442 set++;
443 break;
445 case SRE_OP_CATEGORY:
446 /* <CATEGORY> <code> */
447 if (sre_category(set[0], (int) ch))
448 return ok;
449 set += 1;
450 break;
452 case SRE_OP_CHARSET:
453 if (sizeof(SRE_CODE) == 2) {
454 /* <CHARSET> <bitmap> (16 bits per code word) */
455 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
456 return ok;
457 set += 16;
459 else {
460 /* <CHARSET> <bitmap> (32 bits per code word) */
461 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
462 return ok;
463 set += 8;
465 break;
467 case SRE_OP_RANGE:
468 /* <RANGE> <lower> <upper> */
469 if (set[0] <= ch && ch <= set[1])
470 return ok;
471 set += 2;
472 break;
474 case SRE_OP_NEGATE:
475 ok = !ok;
476 break;
478 case SRE_OP_BIGCHARSET:
479 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 int count, block;
482 count = *(set++);
484 if (sizeof(SRE_CODE) == 2) {
485 block = ((unsigned char*)set)[ch >> 8];
486 set += 128;
487 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
488 return ok;
489 set += count*16;
491 else {
492 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
493 * warnings when c's type supports only numbers < N+1 */
494 if (!(ch & ~65535))
495 block = ((unsigned char*)set)[ch >> 8];
496 else
497 block = -1;
498 set += 64;
499 if (block >=0 &&
500 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
501 return ok;
502 set += count*8;
504 break;
507 default:
508 /* internal error -- there's not much we can do about it
509 here, so let's just pretend it didn't match... */
510 return 0;
515 LOCAL(int) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517 LOCAL(int)
518 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount)
520 SRE_CODE chr;
521 SRE_CHAR* ptr = state->ptr;
522 SRE_CHAR* end = state->end;
523 int i;
525 /* adjust end */
526 if (maxcount < end - ptr && maxcount != 65535)
527 end = ptr + maxcount;
529 switch (pattern[0]) {
531 case SRE_OP_IN:
532 /* repeated set */
533 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
534 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
535 ptr++;
536 break;
538 case SRE_OP_ANY:
539 /* repeated dot wildcard. */
540 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
541 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
542 ptr++;
543 break;
545 case SRE_OP_ANY_ALL:
546 /* repeated dot wildcare. skip to the end of the target
547 string, and backtrack from there */
548 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
549 ptr = end;
550 break;
552 case SRE_OP_LITERAL:
553 /* repeated literal */
554 chr = pattern[1];
555 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
556 while (ptr < end && (SRE_CODE) *ptr == chr)
557 ptr++;
558 break;
560 case SRE_OP_LITERAL_IGNORE:
561 /* repeated literal */
562 chr = pattern[1];
563 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
564 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
565 ptr++;
566 break;
568 case SRE_OP_NOT_LITERAL:
569 /* repeated non-literal */
570 chr = pattern[1];
571 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
572 while (ptr < end && (SRE_CODE) *ptr != chr)
573 ptr++;
574 break;
576 case SRE_OP_NOT_LITERAL_IGNORE:
577 /* repeated non-literal */
578 chr = pattern[1];
579 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
580 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
581 ptr++;
582 break;
584 default:
585 /* repeated single character pattern */
586 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
587 while ((SRE_CHAR*) state->ptr < end) {
588 i = SRE_MATCH(state, pattern);
589 if (i < 0)
590 return i;
591 if (!i)
592 break;
594 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
595 (SRE_CHAR*) state->ptr - ptr));
596 return (SRE_CHAR*) state->ptr - ptr;
599 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
600 return ptr - (SRE_CHAR*) state->ptr;
603 #if 0 /* not used in this release */
604 LOCAL(int)
605 SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
607 /* check if an SRE_OP_INFO block matches at the current position.
608 returns the number of SRE_CODE objects to skip if successful, 0
609 if no match */
611 SRE_CHAR* end = state->end;
612 SRE_CHAR* ptr = state->ptr;
613 int i;
615 /* check minimal length */
616 if (pattern[3] && (end - ptr) < pattern[3])
617 return 0;
619 /* check known prefix */
620 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
621 /* <length> <skip> <prefix data> <overlap data> */
622 for (i = 0; i < pattern[5]; i++)
623 if ((SRE_CODE) ptr[i] != pattern[7 + i])
624 return 0;
625 return pattern[0] + 2 * pattern[6];
627 return pattern[0];
629 #endif
631 /* The macros below should be used to protect recursive SRE_MATCH()
632 * calls that *failed* and do *not* return immediately (IOW, those
633 * that will backtrack). Explaining:
635 * - Recursive SRE_MATCH() returned true: that's usually a success
636 * (besides atypical cases like ASSERT_NOT), therefore there's no
637 * reason to restore lastmark;
639 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
640 * is returning to the caller: If the current SRE_MATCH() is the
641 * top function of the recursion, returning false will be a matching
642 * failure, and it doesn't matter where lastmark is pointing to.
643 * If it's *not* the top function, it will be a recursive SRE_MATCH()
644 * failure by itself, and the calling SRE_MATCH() will have to deal
645 * with the failure by the same rules explained here (it will restore
646 * lastmark by itself if necessary);
648 * - Recursive SRE_MATCH() returned false, and will continue the
649 * outside 'for' loop: must be protected when breaking, since the next
650 * OP could potentially depend on lastmark;
652 * - Recursive SRE_MATCH() returned false, and will be called again
653 * inside a local for/while loop: must be protected between each
654 * loop iteration, since the recursive SRE_MATCH() could do anything,
655 * and could potentially depend on lastmark.
657 * For more information, check the discussion at SF patch #712900.
659 #define LASTMARK_SAVE() \
660 do { \
661 ctx->lastmark = state->lastmark; \
662 ctx->lastindex = state->lastindex; \
663 } while (0)
664 #define LASTMARK_RESTORE() \
665 do { \
666 state->lastmark = ctx->lastmark; \
667 state->lastindex = ctx->lastindex; \
668 } while (0)
670 #define RETURN_ERROR(i) do { return i; } while(0)
671 #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
672 #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
674 #define RETURN_ON_ERROR(i) \
675 do { if (i < 0) RETURN_ERROR(i); } while (0)
676 #define RETURN_ON_SUCCESS(i) \
677 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
678 #define RETURN_ON_FAILURE(i) \
679 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
681 #define SFY(x) #x
683 #define DATA_STACK_ALLOC(state, type, ptr) \
684 do { \
685 alloc_pos = state->data_stack_base; \
686 TRACE(("allocating %s in %d (%d)\n", \
687 SFY(type), alloc_pos, sizeof(type))); \
688 if (state->data_stack_size < alloc_pos+sizeof(type)) { \
689 int j = data_stack_grow(state, sizeof(type)); \
690 if (j < 0) return j; \
691 if (ctx_pos != -1) \
692 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
694 ptr = (type*)(state->data_stack+alloc_pos); \
695 state->data_stack_base += sizeof(type); \
696 } while (0)
698 #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
699 do { \
700 TRACE(("looking up %s at %d\n", SFY(type), pos)); \
701 ptr = (type*)(state->data_stack+pos); \
702 } while (0)
704 #define DATA_STACK_PUSH(state, data, size) \
705 do { \
706 TRACE(("copy data in %p to %d (%d)\n", \
707 data, state->data_stack_base, size)); \
708 if (state->data_stack_size < state->data_stack_base+size) { \
709 int j = data_stack_grow(state, size); \
710 if (j < 0) return j; \
711 if (ctx_pos != -1) \
712 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
714 memcpy(state->data_stack+state->data_stack_base, data, size); \
715 state->data_stack_base += size; \
716 } while (0)
718 #define DATA_STACK_POP(state, data, size, discard) \
719 do { \
720 TRACE(("copy data to %p from %d (%d)\n", \
721 data, state->data_stack_base-size, size)); \
722 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
723 if (discard) \
724 state->data_stack_base -= size; \
725 } while (0)
727 #define DATA_STACK_POP_DISCARD(state, size) \
728 do { \
729 TRACE(("discard data from %d (%d)\n", \
730 state->data_stack_base-size, size)); \
731 state->data_stack_base -= size; \
732 } while(0)
734 #define DATA_PUSH(x) \
735 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
736 #define DATA_POP(x) \
737 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
738 #define DATA_POP_DISCARD(x) \
739 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
740 #define DATA_ALLOC(t,p) \
741 DATA_STACK_ALLOC(state, t, p)
742 #define DATA_LOOKUP_AT(t,p,pos) \
743 DATA_STACK_LOOKUP_AT(state,t,p,pos)
745 #define MARK_PUSH(lastmark) \
746 do if (lastmark > 0) { \
747 i = lastmark; /* ctx->lastmark may change if reallocated */ \
748 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
749 } while (0)
750 #define MARK_POP(lastmark) \
751 do if (lastmark > 0) { \
752 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
753 } while (0)
754 #define MARK_POP_KEEP(lastmark) \
755 do if (lastmark > 0) { \
756 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
757 } while (0)
758 #define MARK_POP_DISCARD(lastmark) \
759 do if (lastmark > 0) { \
760 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
761 } while (0)
763 #define JUMP_NONE 0
764 #define JUMP_MAX_UNTIL_1 1
765 #define JUMP_MAX_UNTIL_2 2
766 #define JUMP_MAX_UNTIL_3 3
767 #define JUMP_MIN_UNTIL_1 4
768 #define JUMP_MIN_UNTIL_2 5
769 #define JUMP_MIN_UNTIL_3 6
770 #define JUMP_REPEAT 7
771 #define JUMP_REPEAT_ONE_1 8
772 #define JUMP_REPEAT_ONE_2 9
773 #define JUMP_MIN_REPEAT_ONE 10
774 #define JUMP_BRANCH 11
775 #define JUMP_ASSERT 12
776 #define JUMP_ASSERT_NOT 13
778 #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
779 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
780 nextctx->last_ctx_pos = ctx_pos; \
781 nextctx->jump = jumpvalue; \
782 nextctx->pattern = nextpattern; \
783 ctx_pos = alloc_pos; \
784 ctx = nextctx; \
785 goto entrance; \
786 jumplabel: \
787 while (0) /* gcc doesn't like labels at end of scopes */ \
789 typedef struct {
790 int last_ctx_pos;
791 int jump;
792 SRE_CHAR* ptr;
793 SRE_CODE* pattern;
794 int count;
795 int lastmark;
796 int lastindex;
797 union {
798 SRE_CODE chr;
799 SRE_REPEAT* rep;
800 } u;
801 } SRE_MATCH_CONTEXT;
803 /* check if string matches the given pattern. returns <0 for
804 error, 0 for failure, and 1 for success */
805 LOCAL(int)
806 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
808 SRE_CHAR* end = state->end;
809 int alloc_pos, ctx_pos = -1;
810 int i, ret = 0;
811 int jump;
813 SRE_MATCH_CONTEXT* ctx;
814 SRE_MATCH_CONTEXT* nextctx;
816 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
818 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
819 ctx->last_ctx_pos = -1;
820 ctx->jump = JUMP_NONE;
821 ctx->pattern = pattern;
822 ctx_pos = alloc_pos;
824 entrance:
826 ctx->ptr = state->ptr;
828 if (ctx->pattern[0] == SRE_OP_INFO) {
829 /* optimization info block */
830 /* <INFO> <1=skip> <2=flags> <3=min> ... */
831 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
832 TRACE(("reject (got %d chars, need %d)\n",
833 (end - ctx->ptr), ctx->pattern[3]));
834 RETURN_FAILURE;
836 ctx->pattern += ctx->pattern[1] + 1;
839 for (;;) {
841 switch (*ctx->pattern++) {
843 case SRE_OP_MARK:
844 /* set mark */
845 /* <MARK> <gid> */
846 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
847 ctx->ptr, ctx->pattern[0]));
848 i = ctx->pattern[0];
849 if (i & 1)
850 state->lastindex = i/2 + 1;
851 if (i > state->lastmark) {
852 /* state->lastmark is the highest valid index in the
853 state->mark array. If it is increased by more than 1,
854 the intervening marks must be set to NULL to signal
855 that these marks have not been encountered. */
856 int j = state->lastmark + 1;
857 while (j < i)
858 state->mark[j++] = NULL;
859 state->lastmark = i;
861 state->mark[i] = ctx->ptr;
862 ctx->pattern++;
863 break;
865 case SRE_OP_LITERAL:
866 /* match literal string */
867 /* <LITERAL> <code> */
868 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
869 ctx->ptr, *ctx->pattern));
870 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
871 RETURN_FAILURE;
872 ctx->pattern++;
873 ctx->ptr++;
874 break;
876 case SRE_OP_NOT_LITERAL:
877 /* match anything that is not literal character */
878 /* <NOT_LITERAL> <code> */
879 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
881 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
882 RETURN_FAILURE;
883 ctx->pattern++;
884 ctx->ptr++;
885 break;
887 case SRE_OP_SUCCESS:
888 /* end of pattern */
889 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
890 state->ptr = ctx->ptr;
891 RETURN_SUCCESS;
893 case SRE_OP_AT:
894 /* match at given position */
895 /* <AT> <code> */
896 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
897 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
898 RETURN_FAILURE;
899 ctx->pattern++;
900 break;
902 case SRE_OP_CATEGORY:
903 /* match at given category */
904 /* <CATEGORY> <code> */
905 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
906 ctx->ptr, *ctx->pattern));
907 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
908 RETURN_FAILURE;
909 ctx->pattern++;
910 ctx->ptr++;
911 break;
913 case SRE_OP_ANY:
914 /* match anything (except a newline) */
915 /* <ANY> */
916 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
917 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
918 RETURN_FAILURE;
919 ctx->ptr++;
920 break;
922 case SRE_OP_ANY_ALL:
923 /* match anything */
924 /* <ANY_ALL> */
925 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
926 if (ctx->ptr >= end)
927 RETURN_FAILURE;
928 ctx->ptr++;
929 break;
931 case SRE_OP_IN:
932 /* match set member (or non_member) */
933 /* <IN> <skip> <set> */
934 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
935 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
936 RETURN_FAILURE;
937 ctx->pattern += ctx->pattern[0];
938 ctx->ptr++;
939 break;
941 case SRE_OP_LITERAL_IGNORE:
942 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
943 ctx->pattern, ctx->ptr, ctx->pattern[0]));
944 if (ctx->ptr >= end ||
945 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
946 RETURN_FAILURE;
947 ctx->pattern++;
948 ctx->ptr++;
949 break;
951 case SRE_OP_NOT_LITERAL_IGNORE:
952 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
953 ctx->pattern, ctx->ptr, *ctx->pattern));
954 if (ctx->ptr >= end ||
955 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
956 RETURN_FAILURE;
957 ctx->pattern++;
958 ctx->ptr++;
959 break;
961 case SRE_OP_IN_IGNORE:
962 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
963 if (ctx->ptr >= end
964 || !SRE_CHARSET(ctx->pattern+1,
965 (SRE_CODE)state->lower(*ctx->ptr)))
966 RETURN_FAILURE;
967 ctx->pattern += ctx->pattern[0];
968 ctx->ptr++;
969 break;
971 case SRE_OP_JUMP:
972 case SRE_OP_INFO:
973 /* jump forward */
974 /* <JUMP> <offset> */
975 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
976 ctx->ptr, ctx->pattern[0]));
977 ctx->pattern += ctx->pattern[0];
978 break;
980 case SRE_OP_BRANCH:
981 /* alternation */
982 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
983 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
984 LASTMARK_SAVE();
985 ctx->u.rep = state->repeat;
986 if (ctx->u.rep)
987 MARK_PUSH(ctx->lastmark);
988 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
989 if (ctx->pattern[1] == SRE_OP_LITERAL &&
990 (ctx->ptr >= end ||
991 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
992 continue;
993 if (ctx->pattern[1] == SRE_OP_IN &&
994 (ctx->ptr >= end ||
995 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
996 continue;
997 state->ptr = ctx->ptr;
998 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
999 if (ret) {
1000 if (ctx->u.rep)
1001 MARK_POP_DISCARD(ctx->lastmark);
1002 RETURN_ON_ERROR(ret);
1003 RETURN_SUCCESS;
1005 if (ctx->u.rep)
1006 MARK_POP_KEEP(ctx->lastmark);
1007 LASTMARK_RESTORE();
1009 if (ctx->u.rep)
1010 MARK_POP_DISCARD(ctx->lastmark);
1011 RETURN_FAILURE;
1013 case SRE_OP_REPEAT_ONE:
1014 /* match repeated sequence (maximizing regexp) */
1016 /* this operator only works if the repeated item is
1017 exactly one character wide, and we're not already
1018 collecting backtracking points. for other cases,
1019 use the MAX_REPEAT operator */
1021 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1023 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1024 ctx->pattern[1], ctx->pattern[2]));
1026 if (ctx->ptr + ctx->pattern[1] > end)
1027 RETURN_FAILURE; /* cannot match */
1029 state->ptr = ctx->ptr;
1031 ctx->count = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1032 RETURN_ON_ERROR(ctx->count);
1034 ctx->ptr += ctx->count;
1036 /* when we arrive here, count contains the number of
1037 matches, and ctx->ptr points to the tail of the target
1038 string. check if the rest of the pattern matches,
1039 and backtrack if not. */
1041 if (ctx->count < (int) ctx->pattern[1])
1042 RETURN_FAILURE;
1044 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1045 /* tail is empty. we're finished */
1046 state->ptr = ctx->ptr;
1047 RETURN_SUCCESS;
1050 LASTMARK_SAVE();
1052 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1053 /* tail starts with a literal. skip positions where
1054 the rest of the pattern cannot possibly match */
1055 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1056 for (;;) {
1057 while (ctx->count >= (int) ctx->pattern[1] &&
1058 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1059 ctx->ptr--;
1060 ctx->count--;
1062 if (ctx->count < (int) ctx->pattern[1])
1063 break;
1064 state->ptr = ctx->ptr;
1065 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1066 ctx->pattern+ctx->pattern[0]);
1067 if (ret) {
1068 RETURN_ON_ERROR(ret);
1069 RETURN_SUCCESS;
1072 LASTMARK_RESTORE();
1074 ctx->ptr--;
1075 ctx->count--;
1078 } else {
1079 /* general case */
1080 while (ctx->count >= (int) ctx->pattern[1]) {
1081 state->ptr = ctx->ptr;
1082 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1083 ctx->pattern+ctx->pattern[0]);
1084 if (ret) {
1085 RETURN_ON_ERROR(ret);
1086 RETURN_SUCCESS;
1088 ctx->ptr--;
1089 ctx->count--;
1090 LASTMARK_RESTORE();
1093 RETURN_FAILURE;
1095 case SRE_OP_MIN_REPEAT_ONE:
1096 /* match repeated sequence (minimizing regexp) */
1098 /* this operator only works if the repeated item is
1099 exactly one character wide, and we're not already
1100 collecting backtracking points. for other cases,
1101 use the MIN_REPEAT operator */
1103 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1105 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1106 ctx->pattern[1], ctx->pattern[2]));
1108 if (ctx->ptr + ctx->pattern[1] > end)
1109 RETURN_FAILURE; /* cannot match */
1111 state->ptr = ctx->ptr;
1113 if (ctx->pattern[1] == 0)
1114 ctx->count = 0;
1115 else {
1116 /* count using pattern min as the maximum */
1117 ctx->count = SRE_COUNT(state, ctx->pattern+3,
1118 ctx->pattern[1]);
1119 RETURN_ON_ERROR(ctx->count);
1120 if (ctx->count < (int) ctx->pattern[1])
1121 /* didn't match minimum number of times */
1122 RETURN_FAILURE;
1123 /* advance past minimum matches of repeat */
1124 ctx->ptr += ctx->count;
1127 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1128 /* tail is empty. we're finished */
1129 state->ptr = ctx->ptr;
1130 RETURN_SUCCESS;
1132 } else {
1133 /* general case */
1134 LASTMARK_SAVE();
1135 while ((int)ctx->pattern[2] == 65535
1136 || ctx->count <= (int)ctx->pattern[2]) {
1137 state->ptr = ctx->ptr;
1138 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1139 ctx->pattern+ctx->pattern[0]);
1140 if (ret) {
1141 RETURN_ON_ERROR(ret);
1142 RETURN_SUCCESS;
1144 state->ptr = ctx->ptr;
1145 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1146 RETURN_ON_ERROR(ret);
1147 if (ret == 0)
1148 break;
1149 assert(ret == 1);
1150 ctx->ptr++;
1151 ctx->count++;
1152 LASTMARK_RESTORE();
1155 RETURN_FAILURE;
1157 case SRE_OP_REPEAT:
1158 /* create repeat context. all the hard work is done
1159 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1160 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1161 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1162 ctx->pattern[1], ctx->pattern[2]));
1164 /* install new repeat context */
1165 ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
1166 ctx->u.rep->count = -1;
1167 ctx->u.rep->pattern = ctx->pattern;
1168 ctx->u.rep->prev = state->repeat;
1169 ctx->u.rep->last_ptr = NULL;
1170 state->repeat = ctx->u.rep;
1172 state->ptr = ctx->ptr;
1173 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1174 state->repeat = ctx->u.rep->prev;
1175 free(ctx->u.rep);
1177 if (ret) {
1178 RETURN_ON_ERROR(ret);
1179 RETURN_SUCCESS;
1181 RETURN_FAILURE;
1183 case SRE_OP_MAX_UNTIL:
1184 /* maximizing repeat */
1185 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1187 /* FIXME: we probably need to deal with zero-width
1188 matches in here... */
1190 ctx->u.rep = state->repeat;
1191 if (!ctx->u.rep)
1192 RETURN_ERROR(SRE_ERROR_STATE);
1194 state->ptr = ctx->ptr;
1196 ctx->count = ctx->u.rep->count+1;
1198 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1199 ctx->ptr, ctx->count));
1201 if (ctx->count < ctx->u.rep->pattern[1]) {
1202 /* not enough matches */
1203 ctx->u.rep->count = ctx->count;
1204 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1205 ctx->u.rep->pattern+3);
1206 if (ret) {
1207 RETURN_ON_ERROR(ret);
1208 RETURN_SUCCESS;
1210 ctx->u.rep->count = ctx->count-1;
1211 state->ptr = ctx->ptr;
1212 RETURN_FAILURE;
1215 if ((ctx->count < ctx->u.rep->pattern[2] ||
1216 ctx->u.rep->pattern[2] == 65535) &&
1217 state->ptr != ctx->u.rep->last_ptr) {
1218 /* we may have enough matches, but if we can
1219 match another item, do so */
1220 ctx->u.rep->count = ctx->count;
1221 LASTMARK_SAVE();
1222 MARK_PUSH(ctx->lastmark);
1223 /* zero-width match protection */
1224 DATA_PUSH(&ctx->u.rep->last_ptr);
1225 ctx->u.rep->last_ptr = state->ptr;
1226 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1227 ctx->u.rep->pattern+3);
1228 DATA_POP(&ctx->u.rep->last_ptr);
1229 if (ret) {
1230 MARK_POP_DISCARD(ctx->lastmark);
1231 RETURN_ON_ERROR(ret);
1232 RETURN_SUCCESS;
1234 MARK_POP(ctx->lastmark);
1235 LASTMARK_RESTORE();
1236 ctx->u.rep->count = ctx->count-1;
1237 state->ptr = ctx->ptr;
1240 /* cannot match more repeated items here. make sure the
1241 tail matches */
1242 state->repeat = ctx->u.rep->prev;
1243 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1244 RETURN_ON_SUCCESS(ret);
1245 state->repeat = ctx->u.rep;
1246 state->ptr = ctx->ptr;
1247 RETURN_FAILURE;
1249 case SRE_OP_MIN_UNTIL:
1250 /* minimizing repeat */
1251 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1253 ctx->u.rep = state->repeat;
1254 if (!ctx->u.rep)
1255 RETURN_ERROR(SRE_ERROR_STATE);
1257 state->ptr = ctx->ptr;
1259 ctx->count = ctx->u.rep->count+1;
1261 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1262 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1264 if (ctx->count < ctx->u.rep->pattern[1]) {
1265 /* not enough matches */
1266 ctx->u.rep->count = ctx->count;
1267 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1268 ctx->u.rep->pattern+3);
1269 if (ret) {
1270 RETURN_ON_ERROR(ret);
1271 RETURN_SUCCESS;
1273 ctx->u.rep->count = ctx->count-1;
1274 state->ptr = ctx->ptr;
1275 RETURN_FAILURE;
1278 LASTMARK_SAVE();
1280 /* see if the tail matches */
1281 state->repeat = ctx->u.rep->prev;
1282 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1283 if (ret) {
1284 RETURN_ON_ERROR(ret);
1285 RETURN_SUCCESS;
1288 state->repeat = ctx->u.rep;
1289 state->ptr = ctx->ptr;
1291 LASTMARK_RESTORE();
1293 if (ctx->count >= ctx->u.rep->pattern[2]
1294 && ctx->u.rep->pattern[2] != 65535)
1295 RETURN_FAILURE;
1297 ctx->u.rep->count = ctx->count;
1298 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1299 ctx->u.rep->pattern+3);
1300 if (ret) {
1301 RETURN_ON_ERROR(ret);
1302 RETURN_SUCCESS;
1304 ctx->u.rep->count = ctx->count-1;
1305 state->ptr = ctx->ptr;
1306 RETURN_FAILURE;
1308 case SRE_OP_GROUPREF:
1309 /* match backreference */
1310 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1311 ctx->ptr, ctx->pattern[0]));
1312 i = ctx->pattern[0];
1314 int groupref = i+i;
1315 if (groupref >= state->lastmark) {
1316 RETURN_FAILURE;
1317 } else {
1318 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1319 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1320 if (!p || !e || e < p)
1321 RETURN_FAILURE;
1322 while (p < e) {
1323 if (ctx->ptr >= end || *ctx->ptr != *p)
1324 RETURN_FAILURE;
1325 p++; ctx->ptr++;
1329 ctx->pattern++;
1330 break;
1332 case SRE_OP_GROUPREF_IGNORE:
1333 /* match backreference */
1334 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1335 ctx->ptr, ctx->pattern[0]));
1336 i = ctx->pattern[0];
1338 int groupref = i+i;
1339 if (groupref >= state->lastmark) {
1340 RETURN_FAILURE;
1341 } else {
1342 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1343 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1344 if (!p || !e || e < p)
1345 RETURN_FAILURE;
1346 while (p < e) {
1347 if (ctx->ptr >= end ||
1348 state->lower(*ctx->ptr) != state->lower(*p))
1349 RETURN_FAILURE;
1350 p++; ctx->ptr++;
1354 ctx->pattern++;
1355 break;
1357 case SRE_OP_GROUPREF_EXISTS:
1358 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1359 ctx->ptr, ctx->pattern[0]));
1360 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1361 i = ctx->pattern[0];
1363 int groupref = i+i;
1364 if (groupref >= state->lastmark) {
1365 ctx->pattern += ctx->pattern[1];
1366 break;
1367 } else {
1368 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1369 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1370 if (!p || !e || e < p) {
1371 ctx->pattern += ctx->pattern[1];
1372 break;
1376 ctx->pattern += 2;
1377 break;
1379 case SRE_OP_ASSERT:
1380 /* assert subpattern */
1381 /* <ASSERT> <skip> <back> <pattern> */
1382 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1383 ctx->ptr, ctx->pattern[1]));
1384 state->ptr = ctx->ptr - ctx->pattern[1];
1385 if (state->ptr < state->beginning)
1386 RETURN_FAILURE;
1387 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1388 RETURN_ON_FAILURE(ret);
1389 ctx->pattern += ctx->pattern[0];
1390 break;
1392 case SRE_OP_ASSERT_NOT:
1393 /* assert not subpattern */
1394 /* <ASSERT_NOT> <skip> <back> <pattern> */
1395 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1396 ctx->ptr, ctx->pattern[1]));
1397 state->ptr = ctx->ptr - ctx->pattern[1];
1398 if (state->ptr >= state->beginning) {
1399 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1400 if (ret) {
1401 RETURN_ON_ERROR(ret);
1402 RETURN_FAILURE;
1405 ctx->pattern += ctx->pattern[0];
1406 break;
1408 case SRE_OP_FAILURE:
1409 /* immediate failure */
1410 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1411 RETURN_FAILURE;
1413 default:
1414 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1415 ctx->pattern[-1]));
1416 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1420 exit:
1421 ctx_pos = ctx->last_ctx_pos;
1422 jump = ctx->jump;
1423 DATA_POP_DISCARD(ctx);
1424 if (ctx_pos == -1)
1425 return ret;
1426 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1428 switch (jump) {
1429 case JUMP_MAX_UNTIL_2:
1430 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1431 goto jump_max_until_2;
1432 case JUMP_MAX_UNTIL_3:
1433 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1434 goto jump_max_until_3;
1435 case JUMP_MIN_UNTIL_2:
1436 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1437 goto jump_min_until_2;
1438 case JUMP_MIN_UNTIL_3:
1439 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1440 goto jump_min_until_3;
1441 case JUMP_BRANCH:
1442 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1443 goto jump_branch;
1444 case JUMP_MAX_UNTIL_1:
1445 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1446 goto jump_max_until_1;
1447 case JUMP_MIN_UNTIL_1:
1448 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1449 goto jump_min_until_1;
1450 case JUMP_REPEAT:
1451 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1452 goto jump_repeat;
1453 case JUMP_REPEAT_ONE_1:
1454 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1455 goto jump_repeat_one_1;
1456 case JUMP_REPEAT_ONE_2:
1457 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1458 goto jump_repeat_one_2;
1459 case JUMP_MIN_REPEAT_ONE:
1460 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1461 goto jump_min_repeat_one;
1462 case JUMP_ASSERT:
1463 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1464 goto jump_assert;
1465 case JUMP_ASSERT_NOT:
1466 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1467 goto jump_assert_not;
1468 case JUMP_NONE:
1469 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1470 break;
1473 return ret; /* should never get here */
1476 LOCAL(int)
1477 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1479 SRE_CHAR* ptr = state->start;
1480 SRE_CHAR* end = state->end;
1481 int status = 0;
1482 int prefix_len = 0;
1483 int prefix_skip = 0;
1484 SRE_CODE* prefix = NULL;
1485 SRE_CODE* charset = NULL;
1486 SRE_CODE* overlap = NULL;
1487 int flags = 0;
1489 if (pattern[0] == SRE_OP_INFO) {
1490 /* optimization info block */
1491 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1493 flags = pattern[2];
1495 if (pattern[3] > 1) {
1496 /* adjust end point (but make sure we leave at least one
1497 character in there, so literal search will work) */
1498 end -= pattern[3]-1;
1499 if (end <= ptr)
1500 end = ptr+1;
1503 if (flags & SRE_INFO_PREFIX) {
1504 /* pattern starts with a known prefix */
1505 /* <length> <skip> <prefix data> <overlap data> */
1506 prefix_len = pattern[5];
1507 prefix_skip = pattern[6];
1508 prefix = pattern + 7;
1509 overlap = prefix + prefix_len - 1;
1510 } else if (flags & SRE_INFO_CHARSET)
1511 /* pattern starts with a character from a known set */
1512 /* <charset> */
1513 charset = pattern + 5;
1515 pattern += 1 + pattern[1];
1518 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1519 TRACE(("charset = %p\n", charset));
1521 #if defined(USE_FAST_SEARCH)
1522 if (prefix_len > 1) {
1523 /* pattern starts with a known prefix. use the overlap
1524 table to skip forward as fast as we possibly can */
1525 int i = 0;
1526 end = state->end;
1527 while (ptr < end) {
1528 for (;;) {
1529 if ((SRE_CODE) ptr[0] != prefix[i]) {
1530 if (!i)
1531 break;
1532 else
1533 i = overlap[i];
1534 } else {
1535 if (++i == prefix_len) {
1536 /* found a potential match */
1537 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1538 state->start = ptr + 1 - prefix_len;
1539 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1540 if (flags & SRE_INFO_LITERAL)
1541 return 1; /* we got all of it */
1542 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1543 if (status != 0)
1544 return status;
1545 /* close but no cigar -- try again */
1546 i = overlap[i];
1548 break;
1552 ptr++;
1554 return 0;
1556 #endif
1558 if (pattern[0] == SRE_OP_LITERAL) {
1559 /* pattern starts with a literal character. this is used
1560 for short prefixes, and if fast search is disabled */
1561 SRE_CODE chr = pattern[1];
1562 end = state->end;
1563 for (;;) {
1564 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1565 ptr++;
1566 if (ptr >= end)
1567 return 0;
1568 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1569 state->start = ptr;
1570 state->ptr = ++ptr;
1571 if (flags & SRE_INFO_LITERAL)
1572 return 1; /* we got all of it */
1573 status = SRE_MATCH(state, pattern + 2);
1574 if (status != 0)
1575 break;
1577 } else if (charset) {
1578 /* pattern starts with a character from a known set */
1579 end = state->end;
1580 for (;;) {
1581 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1582 ptr++;
1583 if (ptr >= end)
1584 return 0;
1585 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1586 state->start = ptr;
1587 state->ptr = ptr;
1588 status = SRE_MATCH(state, pattern);
1589 if (status != 0)
1590 break;
1591 ptr++;
1593 } else
1594 /* general case */
1595 while (ptr <= end) {
1596 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1597 state->start = state->ptr = ptr++;
1598 status = SRE_MATCH(state, pattern);
1599 if (status != 0)
1600 break;
1603 return status;
1606 LOCAL(int)
1607 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, int len)
1609 /* check if given string is a literal template (i.e. no escapes) */
1610 while (len-- > 0)
1611 if (*ptr++ == '\\')
1612 return 0;
1613 return 1;
1616 #if !defined(SRE_RECURSIVE)
1618 /* -------------------------------------------------------------------- */
1619 /* factories and destructors */
1621 /* see sre.h for object declarations */
1623 static PyTypeObject Pattern_Type;
1624 static PyTypeObject Match_Type;
1625 static PyTypeObject Scanner_Type;
1627 static PyObject *
1628 _compile(PyObject* self_, PyObject* args)
1630 /* "compile" pattern descriptor to pattern object */
1632 PatternObject* self;
1633 int i, n;
1635 PyObject* pattern;
1636 int flags = 0;
1637 PyObject* code;
1638 int groups = 0;
1639 PyObject* groupindex = NULL;
1640 PyObject* indexgroup = NULL;
1641 if (!PyArg_ParseTuple(args, "OiO!|iOO", &pattern, &flags,
1642 &PyList_Type, &code, &groups,
1643 &groupindex, &indexgroup))
1644 return NULL;
1646 n = PyList_GET_SIZE(code);
1648 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
1649 if (!self)
1650 return NULL;
1652 self->codesize = n;
1654 for (i = 0; i < n; i++) {
1655 PyObject *o = PyList_GET_ITEM(code, i);
1656 if (PyInt_Check(o))
1657 self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1658 else
1659 self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
1662 if (PyErr_Occurred()) {
1663 PyObject_DEL(self);
1664 return NULL;
1667 Py_INCREF(pattern);
1668 self->pattern = pattern;
1670 self->flags = flags;
1672 self->groups = groups;
1674 Py_XINCREF(groupindex);
1675 self->groupindex = groupindex;
1677 Py_XINCREF(indexgroup);
1678 self->indexgroup = indexgroup;
1680 self->weakreflist = NULL;
1682 return (PyObject*) self;
1685 static PyObject *
1686 sre_codesize(PyObject* self, PyObject* args)
1688 return Py_BuildValue("i", sizeof(SRE_CODE));
1691 static PyObject *
1692 sre_getlower(PyObject* self, PyObject* args)
1694 int character, flags;
1695 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1696 return NULL;
1697 if (flags & SRE_FLAG_LOCALE)
1698 return Py_BuildValue("i", sre_lower_locale(character));
1699 if (flags & SRE_FLAG_UNICODE)
1700 #if defined(HAVE_UNICODE)
1701 return Py_BuildValue("i", sre_lower_unicode(character));
1702 #else
1703 return Py_BuildValue("i", sre_lower_locale(character));
1704 #endif
1705 return Py_BuildValue("i", sre_lower(character));
1708 LOCAL(void)
1709 state_reset(SRE_STATE* state)
1711 /* FIXME: dynamic! */
1712 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1714 state->lastmark = -1;
1715 state->lastindex = -1;
1717 state->repeat = NULL;
1719 data_stack_dealloc(state);
1722 static void*
1723 getstring(PyObject* string, int* p_length, int* p_charsize)
1725 /* given a python object, return a data pointer, a length (in
1726 characters), and a character size. return NULL if the object
1727 is not a string (or not compatible) */
1729 PyBufferProcs *buffer;
1730 int size, bytes, charsize;
1731 void* ptr;
1733 #if defined(HAVE_UNICODE)
1734 if (PyUnicode_Check(string)) {
1735 /* unicode strings doesn't always support the buffer interface */
1736 ptr = (void*) PyUnicode_AS_DATA(string);
1737 bytes = PyUnicode_GET_DATA_SIZE(string);
1738 size = PyUnicode_GET_SIZE(string);
1739 charsize = sizeof(Py_UNICODE);
1741 } else {
1742 #endif
1744 /* get pointer to string buffer */
1745 buffer = string->ob_type->tp_as_buffer;
1746 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1747 buffer->bf_getsegcount(string, NULL) != 1) {
1748 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1749 return NULL;
1752 /* determine buffer size */
1753 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1754 if (bytes < 0) {
1755 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1756 return NULL;
1759 /* determine character size */
1760 #if PY_VERSION_HEX >= 0x01060000
1761 size = PyObject_Size(string);
1762 #else
1763 size = PyObject_Length(string);
1764 #endif
1766 if (PyString_Check(string) || bytes == size)
1767 charsize = 1;
1768 #if defined(HAVE_UNICODE)
1769 else if (bytes == (int) (size * sizeof(Py_UNICODE)))
1770 charsize = sizeof(Py_UNICODE);
1771 #endif
1772 else {
1773 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1774 return NULL;
1777 #if defined(HAVE_UNICODE)
1779 #endif
1781 *p_length = size;
1782 *p_charsize = charsize;
1784 return ptr;
1787 LOCAL(PyObject*)
1788 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1789 int start, int end)
1791 /* prepare state object */
1793 int length;
1794 int charsize;
1795 void* ptr;
1797 memset(state, 0, sizeof(SRE_STATE));
1799 state->lastmark = -1;
1800 state->lastindex = -1;
1802 ptr = getstring(string, &length, &charsize);
1803 if (!ptr)
1804 return NULL;
1806 /* adjust boundaries */
1807 if (start < 0)
1808 start = 0;
1809 else if (start > length)
1810 start = length;
1812 if (end < 0)
1813 end = 0;
1814 else if (end > length)
1815 end = length;
1817 state->charsize = charsize;
1819 state->beginning = ptr;
1821 state->start = (void*) ((char*) ptr + start * state->charsize);
1822 state->end = (void*) ((char*) ptr + end * state->charsize);
1824 Py_INCREF(string);
1825 state->string = string;
1826 state->pos = start;
1827 state->endpos = end;
1829 if (pattern->flags & SRE_FLAG_LOCALE)
1830 state->lower = sre_lower_locale;
1831 else if (pattern->flags & SRE_FLAG_UNICODE)
1832 #if defined(HAVE_UNICODE)
1833 state->lower = sre_lower_unicode;
1834 #else
1835 state->lower = sre_lower_locale;
1836 #endif
1837 else
1838 state->lower = sre_lower;
1840 return string;
1843 LOCAL(void)
1844 state_fini(SRE_STATE* state)
1846 Py_XDECREF(state->string);
1847 data_stack_dealloc(state);
1850 /* calculate offset from start of string */
1851 #define STATE_OFFSET(state, member)\
1852 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1854 LOCAL(PyObject*)
1855 state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
1857 int i, j;
1859 index = (index - 1) * 2;
1861 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1862 if (empty)
1863 /* want empty string */
1864 i = j = 0;
1865 else {
1866 Py_INCREF(Py_None);
1867 return Py_None;
1869 } else {
1870 i = STATE_OFFSET(state, state->mark[index]);
1871 j = STATE_OFFSET(state, state->mark[index+1]);
1874 return PySequence_GetSlice(string, i, j);
1877 static void
1878 pattern_error(int status)
1880 switch (status) {
1881 case SRE_ERROR_RECURSION_LIMIT:
1882 PyErr_SetString(
1883 PyExc_RuntimeError,
1884 "maximum recursion limit exceeded"
1886 break;
1887 case SRE_ERROR_MEMORY:
1888 PyErr_NoMemory();
1889 break;
1890 default:
1891 /* other error codes indicate compiler/engine bugs */
1892 PyErr_SetString(
1893 PyExc_RuntimeError,
1894 "internal error in regular expression engine"
1899 static PyObject*
1900 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
1902 /* create match object (from state object) */
1904 MatchObject* match;
1905 int i, j;
1906 char* base;
1907 int n;
1909 if (status > 0) {
1911 /* create match object (with room for extra group marks) */
1912 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
1913 2*(pattern->groups+1));
1914 if (!match)
1915 return NULL;
1917 Py_INCREF(pattern);
1918 match->pattern = pattern;
1920 Py_INCREF(state->string);
1921 match->string = state->string;
1923 match->regs = NULL;
1924 match->groups = pattern->groups+1;
1926 /* fill in group slices */
1928 base = (char*) state->beginning;
1929 n = state->charsize;
1931 match->mark[0] = ((char*) state->start - base) / n;
1932 match->mark[1] = ((char*) state->ptr - base) / n;
1934 for (i = j = 0; i < pattern->groups; i++, j+=2)
1935 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
1936 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
1937 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
1938 } else
1939 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
1941 match->pos = state->pos;
1942 match->endpos = state->endpos;
1944 match->lastindex = state->lastindex;
1946 return (PyObject*) match;
1948 } else if (status == 0) {
1950 /* no match */
1951 Py_INCREF(Py_None);
1952 return Py_None;
1956 /* internal error */
1957 pattern_error(status);
1958 return NULL;
1961 static PyObject*
1962 pattern_scanner(PatternObject* pattern, PyObject* args)
1964 /* create search state object */
1966 ScannerObject* self;
1968 PyObject* string;
1969 int start = 0;
1970 int end = INT_MAX;
1971 if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
1972 return NULL;
1974 /* create scanner object */
1975 self = PyObject_NEW(ScannerObject, &Scanner_Type);
1976 if (!self)
1977 return NULL;
1979 string = state_init(&self->state, pattern, string, start, end);
1980 if (!string) {
1981 PyObject_DEL(self);
1982 return NULL;
1985 Py_INCREF(pattern);
1986 self->pattern = (PyObject*) pattern;
1988 return (PyObject*) self;
1991 static void
1992 pattern_dealloc(PatternObject* self)
1994 if (self->weakreflist != NULL)
1995 PyObject_ClearWeakRefs((PyObject *) self);
1996 Py_XDECREF(self->pattern);
1997 Py_XDECREF(self->groupindex);
1998 Py_XDECREF(self->indexgroup);
1999 PyObject_DEL(self);
2002 static PyObject*
2003 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
2005 SRE_STATE state;
2006 int status;
2008 PyObject* string;
2009 int start = 0;
2010 int end = INT_MAX;
2011 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2012 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:match", kwlist,
2013 &string, &start, &end))
2014 return NULL;
2016 string = state_init(&state, self, string, start, end);
2017 if (!string)
2018 return NULL;
2020 state.ptr = state.start;
2022 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
2024 if (state.charsize == 1) {
2025 status = sre_match(&state, PatternObject_GetCode(self));
2026 } else {
2027 #if defined(HAVE_UNICODE)
2028 status = sre_umatch(&state, PatternObject_GetCode(self));
2029 #endif
2032 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2034 state_fini(&state);
2036 return pattern_new_match(self, &state, status);
2039 static PyObject*
2040 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
2042 SRE_STATE state;
2043 int status;
2045 PyObject* string;
2046 int start = 0;
2047 int end = INT_MAX;
2048 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
2049 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:search", kwlist,
2050 &string, &start, &end))
2051 return NULL;
2053 string = state_init(&state, self, string, start, end);
2054 if (!string)
2055 return NULL;
2057 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
2059 if (state.charsize == 1) {
2060 status = sre_search(&state, PatternObject_GetCode(self));
2061 } else {
2062 #if defined(HAVE_UNICODE)
2063 status = sre_usearch(&state, PatternObject_GetCode(self));
2064 #endif
2067 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
2069 state_fini(&state);
2071 return pattern_new_match(self, &state, status);
2074 static PyObject*
2075 call(char* module, char* function, PyObject* args)
2077 PyObject* name;
2078 PyObject* mod;
2079 PyObject* func;
2080 PyObject* result;
2082 if (!args)
2083 return NULL;
2084 name = PyString_FromString(module);
2085 if (!name)
2086 return NULL;
2087 mod = PyImport_Import(name);
2088 Py_DECREF(name);
2089 if (!mod)
2090 return NULL;
2091 func = PyObject_GetAttrString(mod, function);
2092 Py_DECREF(mod);
2093 if (!func)
2094 return NULL;
2095 result = PyObject_CallObject(func, args);
2096 Py_DECREF(func);
2097 Py_DECREF(args);
2098 return result;
2101 #ifdef USE_BUILTIN_COPY
2102 static int
2103 deepcopy(PyObject** object, PyObject* memo)
2105 PyObject* copy;
2107 copy = call(
2108 "copy", "deepcopy",
2109 PyTuple_Pack(2, *object, memo)
2111 if (!copy)
2112 return 0;
2114 Py_DECREF(*object);
2115 *object = copy;
2117 return 1; /* success */
2119 #endif
2121 static PyObject*
2122 join_list(PyObject* list, PyObject* pattern)
2124 /* join list elements */
2126 PyObject* joiner;
2127 #if PY_VERSION_HEX >= 0x01060000
2128 PyObject* function;
2129 PyObject* args;
2130 #endif
2131 PyObject* result;
2133 switch (PyList_GET_SIZE(list)) {
2134 case 0:
2135 Py_DECREF(list);
2136 return PySequence_GetSlice(pattern, 0, 0);
2137 case 1:
2138 result = PyList_GET_ITEM(list, 0);
2139 Py_INCREF(result);
2140 Py_DECREF(list);
2141 return result;
2144 /* two or more elements: slice out a suitable separator from the
2145 first member, and use that to join the entire list */
2147 joiner = PySequence_GetSlice(pattern, 0, 0);
2148 if (!joiner)
2149 return NULL;
2151 #if PY_VERSION_HEX >= 0x01060000
2152 function = PyObject_GetAttrString(joiner, "join");
2153 if (!function) {
2154 Py_DECREF(joiner);
2155 return NULL;
2157 args = PyTuple_New(1);
2158 if (!args) {
2159 Py_DECREF(function);
2160 Py_DECREF(joiner);
2161 return NULL;
2163 PyTuple_SET_ITEM(args, 0, list);
2164 result = PyObject_CallObject(function, args);
2165 Py_DECREF(args); /* also removes list */
2166 Py_DECREF(function);
2167 #else
2168 result = call(
2169 "string", "join",
2170 PyTuple_Pack(2, list, joiner)
2172 #endif
2173 Py_DECREF(joiner);
2175 return result;
2178 static PyObject*
2179 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2181 SRE_STATE state;
2182 PyObject* list;
2183 int status;
2184 int i, b, e;
2186 PyObject* string;
2187 int start = 0;
2188 int end = INT_MAX;
2189 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2190 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:findall", kwlist,
2191 &string, &start, &end))
2192 return NULL;
2194 string = state_init(&state, self, string, start, end);
2195 if (!string)
2196 return NULL;
2198 list = PyList_New(0);
2199 if (!list) {
2200 state_fini(&state);
2201 return NULL;
2204 while (state.start <= state.end) {
2206 PyObject* item;
2208 state_reset(&state);
2210 state.ptr = state.start;
2212 if (state.charsize == 1) {
2213 status = sre_search(&state, PatternObject_GetCode(self));
2214 } else {
2215 #if defined(HAVE_UNICODE)
2216 status = sre_usearch(&state, PatternObject_GetCode(self));
2217 #endif
2220 if (status <= 0) {
2221 if (status == 0)
2222 break;
2223 pattern_error(status);
2224 goto error;
2227 /* don't bother to build a match object */
2228 switch (self->groups) {
2229 case 0:
2230 b = STATE_OFFSET(&state, state.start);
2231 e = STATE_OFFSET(&state, state.ptr);
2232 item = PySequence_GetSlice(string, b, e);
2233 if (!item)
2234 goto error;
2235 break;
2236 case 1:
2237 item = state_getslice(&state, 1, string, 1);
2238 if (!item)
2239 goto error;
2240 break;
2241 default:
2242 item = PyTuple_New(self->groups);
2243 if (!item)
2244 goto error;
2245 for (i = 0; i < self->groups; i++) {
2246 PyObject* o = state_getslice(&state, i+1, string, 1);
2247 if (!o) {
2248 Py_DECREF(item);
2249 goto error;
2251 PyTuple_SET_ITEM(item, i, o);
2253 break;
2256 status = PyList_Append(list, item);
2257 Py_DECREF(item);
2258 if (status < 0)
2259 goto error;
2261 if (state.ptr == state.start)
2262 state.start = (void*) ((char*) state.ptr + state.charsize);
2263 else
2264 state.start = state.ptr;
2268 state_fini(&state);
2269 return list;
2271 error:
2272 Py_DECREF(list);
2273 state_fini(&state);
2274 return NULL;
2278 #if PY_VERSION_HEX >= 0x02020000
2279 static PyObject*
2280 pattern_finditer(PatternObject* pattern, PyObject* args)
2282 PyObject* scanner;
2283 PyObject* search;
2284 PyObject* iterator;
2286 scanner = pattern_scanner(pattern, args);
2287 if (!scanner)
2288 return NULL;
2290 search = PyObject_GetAttrString(scanner, "search");
2291 Py_DECREF(scanner);
2292 if (!search)
2293 return NULL;
2295 iterator = PyCallIter_New(search, Py_None);
2296 Py_DECREF(search);
2298 return iterator;
2300 #endif
2302 static PyObject*
2303 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2305 SRE_STATE state;
2306 PyObject* list;
2307 PyObject* item;
2308 int status;
2309 int n;
2310 int i;
2311 void* last;
2313 PyObject* string;
2314 int maxsplit = 0;
2315 static char* kwlist[] = { "source", "maxsplit", NULL };
2316 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
2317 &string, &maxsplit))
2318 return NULL;
2320 string = state_init(&state, self, string, 0, INT_MAX);
2321 if (!string)
2322 return NULL;
2324 list = PyList_New(0);
2325 if (!list) {
2326 state_fini(&state);
2327 return NULL;
2330 n = 0;
2331 last = state.start;
2333 while (!maxsplit || n < maxsplit) {
2335 state_reset(&state);
2337 state.ptr = state.start;
2339 if (state.charsize == 1) {
2340 status = sre_search(&state, PatternObject_GetCode(self));
2341 } else {
2342 #if defined(HAVE_UNICODE)
2343 status = sre_usearch(&state, PatternObject_GetCode(self));
2344 #endif
2347 if (status <= 0) {
2348 if (status == 0)
2349 break;
2350 pattern_error(status);
2351 goto error;
2354 if (state.start == state.ptr) {
2355 if (last == state.end)
2356 break;
2357 /* skip one character */
2358 state.start = (void*) ((char*) state.ptr + state.charsize);
2359 continue;
2362 /* get segment before this match */
2363 item = PySequence_GetSlice(
2364 string, STATE_OFFSET(&state, last),
2365 STATE_OFFSET(&state, state.start)
2367 if (!item)
2368 goto error;
2369 status = PyList_Append(list, item);
2370 Py_DECREF(item);
2371 if (status < 0)
2372 goto error;
2374 /* add groups (if any) */
2375 for (i = 0; i < self->groups; i++) {
2376 item = state_getslice(&state, i+1, string, 0);
2377 if (!item)
2378 goto error;
2379 status = PyList_Append(list, item);
2380 Py_DECREF(item);
2381 if (status < 0)
2382 goto error;
2385 n = n + 1;
2387 last = state.start = state.ptr;
2391 /* get segment following last match (even if empty) */
2392 item = PySequence_GetSlice(
2393 string, STATE_OFFSET(&state, last), state.endpos
2395 if (!item)
2396 goto error;
2397 status = PyList_Append(list, item);
2398 Py_DECREF(item);
2399 if (status < 0)
2400 goto error;
2402 state_fini(&state);
2403 return list;
2405 error:
2406 Py_DECREF(list);
2407 state_fini(&state);
2408 return NULL;
2412 static PyObject*
2413 pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
2414 int count, int subn)
2416 SRE_STATE state;
2417 PyObject* list;
2418 PyObject* item;
2419 PyObject* filter;
2420 PyObject* args;
2421 PyObject* match;
2422 void* ptr;
2423 int status;
2424 int n;
2425 int i, b, e;
2426 int filter_is_callable;
2428 if (PyCallable_Check(template)) {
2429 /* sub/subn takes either a function or a template */
2430 filter = template;
2431 Py_INCREF(filter);
2432 filter_is_callable = 1;
2433 } else {
2434 /* if not callable, check if it's a literal string */
2435 int literal;
2436 ptr = getstring(template, &n, &b);
2437 if (ptr) {
2438 if (b == 1) {
2439 literal = sre_literal_template(ptr, n);
2440 } else {
2441 #if defined(HAVE_UNICODE)
2442 literal = sre_uliteral_template(ptr, n);
2443 #endif
2445 } else {
2446 PyErr_Clear();
2447 literal = 0;
2449 if (literal) {
2450 filter = template;
2451 Py_INCREF(filter);
2452 filter_is_callable = 0;
2453 } else {
2454 /* not a literal; hand it over to the template compiler */
2455 filter = call(
2456 SRE_MODULE, "_subx",
2457 PyTuple_Pack(2, self, template)
2459 if (!filter)
2460 return NULL;
2461 filter_is_callable = PyCallable_Check(filter);
2465 string = state_init(&state, self, string, 0, INT_MAX);
2466 if (!string) {
2467 Py_DECREF(filter);
2468 return NULL;
2471 list = PyList_New(0);
2472 if (!list) {
2473 Py_DECREF(filter);
2474 state_fini(&state);
2475 return NULL;
2478 n = i = 0;
2480 while (!count || n < count) {
2482 state_reset(&state);
2484 state.ptr = state.start;
2486 if (state.charsize == 1) {
2487 status = sre_search(&state, PatternObject_GetCode(self));
2488 } else {
2489 #if defined(HAVE_UNICODE)
2490 status = sre_usearch(&state, PatternObject_GetCode(self));
2491 #endif
2494 if (status <= 0) {
2495 if (status == 0)
2496 break;
2497 pattern_error(status);
2498 goto error;
2501 b = STATE_OFFSET(&state, state.start);
2502 e = STATE_OFFSET(&state, state.ptr);
2504 if (i < b) {
2505 /* get segment before this match */
2506 item = PySequence_GetSlice(string, i, b);
2507 if (!item)
2508 goto error;
2509 status = PyList_Append(list, item);
2510 Py_DECREF(item);
2511 if (status < 0)
2512 goto error;
2514 } else if (i == b && i == e && n > 0)
2515 /* ignore empty match on latest position */
2516 goto next;
2518 if (filter_is_callable) {
2519 /* pass match object through filter */
2520 match = pattern_new_match(self, &state, 1);
2521 if (!match)
2522 goto error;
2523 args = PyTuple_Pack(1, match);
2524 if (!args) {
2525 Py_DECREF(match);
2526 goto error;
2528 item = PyObject_CallObject(filter, args);
2529 Py_DECREF(args);
2530 Py_DECREF(match);
2531 if (!item)
2532 goto error;
2533 } else {
2534 /* filter is literal string */
2535 item = filter;
2536 Py_INCREF(item);
2539 /* add to list */
2540 if (item != Py_None) {
2541 status = PyList_Append(list, item);
2542 Py_DECREF(item);
2543 if (status < 0)
2544 goto error;
2547 i = e;
2548 n = n + 1;
2550 next:
2551 /* move on */
2552 if (state.ptr == state.start)
2553 state.start = (void*) ((char*) state.ptr + state.charsize);
2554 else
2555 state.start = state.ptr;
2559 /* get segment following last match */
2560 if (i < state.endpos) {
2561 item = PySequence_GetSlice(string, i, state.endpos);
2562 if (!item)
2563 goto error;
2564 status = PyList_Append(list, item);
2565 Py_DECREF(item);
2566 if (status < 0)
2567 goto error;
2570 state_fini(&state);
2572 Py_DECREF(filter);
2574 /* convert list to single string (also removes list) */
2575 item = join_list(list, self->pattern);
2577 if (!item)
2578 return NULL;
2580 if (subn)
2581 return Py_BuildValue("Ni", item, n);
2583 return item;
2585 error:
2586 Py_DECREF(list);
2587 state_fini(&state);
2588 Py_DECREF(filter);
2589 return NULL;
2593 static PyObject*
2594 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2596 PyObject* template;
2597 PyObject* string;
2598 int count = 0;
2599 static char* kwlist[] = { "repl", "string", "count", NULL };
2600 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
2601 &template, &string, &count))
2602 return NULL;
2604 return pattern_subx(self, template, string, count, 0);
2607 static PyObject*
2608 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2610 PyObject* template;
2611 PyObject* string;
2612 int count = 0;
2613 static char* kwlist[] = { "repl", "string", "count", NULL };
2614 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
2615 &template, &string, &count))
2616 return NULL;
2618 return pattern_subx(self, template, string, count, 1);
2621 static PyObject*
2622 pattern_copy(PatternObject* self, PyObject* args)
2624 #ifdef USE_BUILTIN_COPY
2625 PatternObject* copy;
2626 int offset;
2628 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
2629 return NULL;
2631 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2632 if (!copy)
2633 return NULL;
2635 offset = offsetof(PatternObject, groups);
2637 Py_XINCREF(self->groupindex);
2638 Py_XINCREF(self->indexgroup);
2639 Py_XINCREF(self->pattern);
2641 memcpy((char*) copy + offset, (char*) self + offset,
2642 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2643 copy->weakreflist = NULL;
2645 return (PyObject*) copy;
2646 #else
2647 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2648 return NULL;
2649 #endif
2652 static PyObject*
2653 pattern_deepcopy(PatternObject* self, PyObject* args)
2655 #ifdef USE_BUILTIN_COPY
2656 PatternObject* copy;
2658 PyObject* memo;
2659 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
2660 return NULL;
2662 copy = (PatternObject*) pattern_copy(self, Py_None);
2663 if (!copy)
2664 return NULL;
2666 if (!deepcopy(&copy->groupindex, memo) ||
2667 !deepcopy(&copy->indexgroup, memo) ||
2668 !deepcopy(&copy->pattern, memo)) {
2669 Py_DECREF(copy);
2670 return NULL;
2673 #else
2674 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2675 return NULL;
2676 #endif
2679 static PyMethodDef pattern_methods[] = {
2680 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
2681 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS},
2682 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS},
2683 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS},
2684 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS},
2685 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS},
2686 #if PY_VERSION_HEX >= 0x02020000
2687 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS},
2688 #endif
2689 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2690 {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
2691 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
2692 {NULL, NULL}
2695 static PyObject*
2696 pattern_getattr(PatternObject* self, char* name)
2698 PyObject* res;
2700 res = Py_FindMethod(pattern_methods, (PyObject*) self, name);
2702 if (res)
2703 return res;
2705 PyErr_Clear();
2707 /* attributes */
2708 if (!strcmp(name, "pattern")) {
2709 Py_INCREF(self->pattern);
2710 return self->pattern;
2713 if (!strcmp(name, "flags"))
2714 return Py_BuildValue("i", self->flags);
2716 if (!strcmp(name, "groups"))
2717 return Py_BuildValue("i", self->groups);
2719 if (!strcmp(name, "groupindex") && self->groupindex) {
2720 Py_INCREF(self->groupindex);
2721 return self->groupindex;
2724 PyErr_SetString(PyExc_AttributeError, name);
2725 return NULL;
2728 statichere PyTypeObject Pattern_Type = {
2729 PyObject_HEAD_INIT(NULL)
2730 0, "_" SRE_MODULE ".SRE_Pattern",
2731 sizeof(PatternObject), sizeof(SRE_CODE),
2732 (destructor)pattern_dealloc, /*tp_dealloc*/
2733 0, /*tp_print*/
2734 (getattrfunc)pattern_getattr, /*tp_getattr*/
2735 0, /* tp_setattr */
2736 0, /* tp_compare */
2737 0, /* tp_repr */
2738 0, /* tp_as_number */
2739 0, /* tp_as_sequence */
2740 0, /* tp_as_mapping */
2741 0, /* tp_hash */
2742 0, /* tp_call */
2743 0, /* tp_str */
2744 0, /* tp_getattro */
2745 0, /* tp_setattro */
2746 0, /* tp_as_buffer */
2747 Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
2748 0, /* tp_doc */
2749 0, /* tp_traverse */
2750 0, /* tp_clear */
2751 0, /* tp_richcompare */
2752 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2755 /* -------------------------------------------------------------------- */
2756 /* match methods */
2758 static void
2759 match_dealloc(MatchObject* self)
2761 Py_XDECREF(self->regs);
2762 Py_XDECREF(self->string);
2763 Py_DECREF(self->pattern);
2764 PyObject_DEL(self);
2767 static PyObject*
2768 match_getslice_by_index(MatchObject* self, int index, PyObject* def)
2770 if (index < 0 || index >= self->groups) {
2771 /* raise IndexError if we were given a bad group number */
2772 PyErr_SetString(
2773 PyExc_IndexError,
2774 "no such group"
2776 return NULL;
2779 index *= 2;
2781 if (self->string == Py_None || self->mark[index] < 0) {
2782 /* return default value if the string or group is undefined */
2783 Py_INCREF(def);
2784 return def;
2787 return PySequence_GetSlice(
2788 self->string, self->mark[index], self->mark[index+1]
2792 static int
2793 match_getindex(MatchObject* self, PyObject* index)
2795 int i;
2797 if (PyInt_Check(index))
2798 return (int) PyInt_AS_LONG(index);
2800 i = -1;
2802 if (self->pattern->groupindex) {
2803 index = PyObject_GetItem(self->pattern->groupindex, index);
2804 if (index) {
2805 if (PyInt_Check(index))
2806 i = (int) PyInt_AS_LONG(index);
2807 Py_DECREF(index);
2808 } else
2809 PyErr_Clear();
2812 return i;
2815 static PyObject*
2816 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
2818 return match_getslice_by_index(self, match_getindex(self, index), def);
2821 static PyObject*
2822 match_expand(MatchObject* self, PyObject* args)
2824 PyObject* template;
2825 if (!PyArg_ParseTuple(args, "O:expand", &template))
2826 return NULL;
2828 /* delegate to Python code */
2829 return call(
2830 SRE_MODULE, "_expand",
2831 PyTuple_Pack(3, self->pattern, self, template)
2835 static PyObject*
2836 match_group(MatchObject* self, PyObject* args)
2838 PyObject* result;
2839 int i, size;
2841 size = PyTuple_GET_SIZE(args);
2843 switch (size) {
2844 case 0:
2845 result = match_getslice(self, Py_False, Py_None);
2846 break;
2847 case 1:
2848 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
2849 break;
2850 default:
2851 /* fetch multiple items */
2852 result = PyTuple_New(size);
2853 if (!result)
2854 return NULL;
2855 for (i = 0; i < size; i++) {
2856 PyObject* item = match_getslice(
2857 self, PyTuple_GET_ITEM(args, i), Py_None
2859 if (!item) {
2860 Py_DECREF(result);
2861 return NULL;
2863 PyTuple_SET_ITEM(result, i, item);
2865 break;
2867 return result;
2870 static PyObject*
2871 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
2873 PyObject* result;
2874 int index;
2876 PyObject* def = Py_None;
2877 static char* kwlist[] = { "default", NULL };
2878 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
2879 return NULL;
2881 result = PyTuple_New(self->groups-1);
2882 if (!result)
2883 return NULL;
2885 for (index = 1; index < self->groups; index++) {
2886 PyObject* item;
2887 item = match_getslice_by_index(self, index, def);
2888 if (!item) {
2889 Py_DECREF(result);
2890 return NULL;
2892 PyTuple_SET_ITEM(result, index-1, item);
2895 return result;
2898 static PyObject*
2899 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
2901 PyObject* result;
2902 PyObject* keys;
2903 int index;
2905 PyObject* def = Py_None;
2906 static char* kwlist[] = { "default", NULL };
2907 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
2908 return NULL;
2910 result = PyDict_New();
2911 if (!result || !self->pattern->groupindex)
2912 return result;
2914 keys = PyMapping_Keys(self->pattern->groupindex);
2915 if (!keys)
2916 goto failed;
2918 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
2919 int status;
2920 PyObject* key;
2921 PyObject* value;
2922 key = PyList_GET_ITEM(keys, index);
2923 if (!key)
2924 goto failed;
2925 value = match_getslice(self, key, def);
2926 if (!value) {
2927 Py_DECREF(key);
2928 goto failed;
2930 status = PyDict_SetItem(result, key, value);
2931 Py_DECREF(value);
2932 if (status < 0)
2933 goto failed;
2936 Py_DECREF(keys);
2938 return result;
2940 failed:
2941 Py_DECREF(keys);
2942 Py_DECREF(result);
2943 return NULL;
2946 static PyObject*
2947 match_start(MatchObject* self, PyObject* args)
2949 int index;
2951 PyObject* index_ = Py_False; /* zero */
2952 if (!PyArg_ParseTuple(args, "|O:start", &index_))
2953 return NULL;
2955 index = match_getindex(self, index_);
2957 if (index < 0 || index >= self->groups) {
2958 PyErr_SetString(
2959 PyExc_IndexError,
2960 "no such group"
2962 return NULL;
2965 /* mark is -1 if group is undefined */
2966 return Py_BuildValue("i", self->mark[index*2]);
2969 static PyObject*
2970 match_end(MatchObject* self, PyObject* args)
2972 int index;
2974 PyObject* index_ = Py_False; /* zero */
2975 if (!PyArg_ParseTuple(args, "|O:end", &index_))
2976 return NULL;
2978 index = match_getindex(self, index_);
2980 if (index < 0 || index >= self->groups) {
2981 PyErr_SetString(
2982 PyExc_IndexError,
2983 "no such group"
2985 return NULL;
2988 /* mark is -1 if group is undefined */
2989 return Py_BuildValue("i", self->mark[index*2+1]);
2992 LOCAL(PyObject*)
2993 _pair(int i1, int i2)
2995 PyObject* pair;
2996 PyObject* item;
2998 pair = PyTuple_New(2);
2999 if (!pair)
3000 return NULL;
3002 item = PyInt_FromLong(i1);
3003 if (!item)
3004 goto error;
3005 PyTuple_SET_ITEM(pair, 0, item);
3007 item = PyInt_FromLong(i2);
3008 if (!item)
3009 goto error;
3010 PyTuple_SET_ITEM(pair, 1, item);
3012 return pair;
3014 error:
3015 Py_DECREF(pair);
3016 return NULL;
3019 static PyObject*
3020 match_span(MatchObject* self, PyObject* args)
3022 int index;
3024 PyObject* index_ = Py_False; /* zero */
3025 if (!PyArg_ParseTuple(args, "|O:span", &index_))
3026 return NULL;
3028 index = match_getindex(self, index_);
3030 if (index < 0 || index >= self->groups) {
3031 PyErr_SetString(
3032 PyExc_IndexError,
3033 "no such group"
3035 return NULL;
3038 /* marks are -1 if group is undefined */
3039 return _pair(self->mark[index*2], self->mark[index*2+1]);
3042 static PyObject*
3043 match_regs(MatchObject* self)
3045 PyObject* regs;
3046 PyObject* item;
3047 int index;
3049 regs = PyTuple_New(self->groups);
3050 if (!regs)
3051 return NULL;
3053 for (index = 0; index < self->groups; index++) {
3054 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3055 if (!item) {
3056 Py_DECREF(regs);
3057 return NULL;
3059 PyTuple_SET_ITEM(regs, index, item);
3062 Py_INCREF(regs);
3063 self->regs = regs;
3065 return regs;
3068 static PyObject*
3069 match_copy(MatchObject* self, PyObject* args)
3071 #ifdef USE_BUILTIN_COPY
3072 MatchObject* copy;
3073 int slots, offset;
3075 if (args != Py_None && !PyArg_ParseTuple(args, ":__copy__"))
3076 return NULL;
3078 slots = 2 * (self->pattern->groups+1);
3080 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3081 if (!copy)
3082 return NULL;
3084 /* this value a constant, but any compiler should be able to
3085 figure that out all by itself */
3086 offset = offsetof(MatchObject, string);
3088 Py_XINCREF(self->pattern);
3089 Py_XINCREF(self->string);
3090 Py_XINCREF(self->regs);
3092 memcpy((char*) copy + offset, (char*) self + offset,
3093 sizeof(MatchObject) + slots * sizeof(int) - offset);
3095 return (PyObject*) copy;
3096 #else
3097 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3098 return NULL;
3099 #endif
3102 static PyObject*
3103 match_deepcopy(MatchObject* self, PyObject* args)
3105 #ifdef USE_BUILTIN_COPY
3106 MatchObject* copy;
3108 PyObject* memo;
3109 if (!PyArg_ParseTuple(args, "O:__deepcopy__", &memo))
3110 return NULL;
3112 copy = (MatchObject*) match_copy(self, Py_None);
3113 if (!copy)
3114 return NULL;
3116 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3117 !deepcopy(&copy->string, memo) ||
3118 !deepcopy(&copy->regs, memo)) {
3119 Py_DECREF(copy);
3120 return NULL;
3123 #else
3124 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3125 return NULL;
3126 #endif
3129 static PyMethodDef match_methods[] = {
3130 {"group", (PyCFunction) match_group, METH_VARARGS},
3131 {"start", (PyCFunction) match_start, METH_VARARGS},
3132 {"end", (PyCFunction) match_end, METH_VARARGS},
3133 {"span", (PyCFunction) match_span, METH_VARARGS},
3134 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3135 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3136 {"expand", (PyCFunction) match_expand, METH_VARARGS},
3137 {"__copy__", (PyCFunction) match_copy, METH_VARARGS},
3138 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_VARARGS},
3139 {NULL, NULL}
3142 static PyObject*
3143 match_getattr(MatchObject* self, char* name)
3145 PyObject* res;
3147 res = Py_FindMethod(match_methods, (PyObject*) self, name);
3148 if (res)
3149 return res;
3151 PyErr_Clear();
3153 if (!strcmp(name, "lastindex")) {
3154 if (self->lastindex >= 0)
3155 return Py_BuildValue("i", self->lastindex);
3156 Py_INCREF(Py_None);
3157 return Py_None;
3160 if (!strcmp(name, "lastgroup")) {
3161 if (self->pattern->indexgroup && self->lastindex >= 0) {
3162 PyObject* result = PySequence_GetItem(
3163 self->pattern->indexgroup, self->lastindex
3165 if (result)
3166 return result;
3167 PyErr_Clear();
3169 Py_INCREF(Py_None);
3170 return Py_None;
3173 if (!strcmp(name, "string")) {
3174 if (self->string) {
3175 Py_INCREF(self->string);
3176 return self->string;
3177 } else {
3178 Py_INCREF(Py_None);
3179 return Py_None;
3183 if (!strcmp(name, "regs")) {
3184 if (self->regs) {
3185 Py_INCREF(self->regs);
3186 return self->regs;
3187 } else
3188 return match_regs(self);
3191 if (!strcmp(name, "re")) {
3192 Py_INCREF(self->pattern);
3193 return (PyObject*) self->pattern;
3196 if (!strcmp(name, "pos"))
3197 return Py_BuildValue("i", self->pos);
3199 if (!strcmp(name, "endpos"))
3200 return Py_BuildValue("i", self->endpos);
3202 PyErr_SetString(PyExc_AttributeError, name);
3203 return NULL;
3206 /* FIXME: implement setattr("string", None) as a special case (to
3207 detach the associated string, if any */
3209 statichere PyTypeObject Match_Type = {
3210 PyObject_HEAD_INIT(NULL)
3211 0, "_" SRE_MODULE ".SRE_Match",
3212 sizeof(MatchObject), sizeof(int),
3213 (destructor)match_dealloc, /*tp_dealloc*/
3214 0, /*tp_print*/
3215 (getattrfunc)match_getattr /*tp_getattr*/
3218 /* -------------------------------------------------------------------- */
3219 /* scanner methods (experimental) */
3221 static void
3222 scanner_dealloc(ScannerObject* self)
3224 state_fini(&self->state);
3225 Py_DECREF(self->pattern);
3226 PyObject_DEL(self);
3229 static PyObject*
3230 scanner_match(ScannerObject* self, PyObject* args)
3232 SRE_STATE* state = &self->state;
3233 PyObject* match;
3234 int status;
3236 state_reset(state);
3238 state->ptr = state->start;
3240 if (state->charsize == 1) {
3241 status = sre_match(state, PatternObject_GetCode(self->pattern));
3242 } else {
3243 #if defined(HAVE_UNICODE)
3244 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3245 #endif
3248 match = pattern_new_match((PatternObject*) self->pattern,
3249 state, status);
3251 if ((status == 0 || state->ptr == state->start) &&
3252 state->ptr < state->end)
3253 state->start = (void*) ((char*) state->ptr + state->charsize);
3254 else
3255 state->start = state->ptr;
3257 return match;
3261 static PyObject*
3262 scanner_search(ScannerObject* self, PyObject* args)
3264 SRE_STATE* state = &self->state;
3265 PyObject* match;
3266 int status;
3268 state_reset(state);
3270 state->ptr = state->start;
3272 if (state->charsize == 1) {
3273 status = sre_search(state, PatternObject_GetCode(self->pattern));
3274 } else {
3275 #if defined(HAVE_UNICODE)
3276 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3277 #endif
3280 match = pattern_new_match((PatternObject*) self->pattern,
3281 state, status);
3283 if ((status == 0 || state->ptr == state->start) &&
3284 state->ptr < state->end)
3285 state->start = (void*) ((char*) state->ptr + state->charsize);
3286 else
3287 state->start = state->ptr;
3289 return match;
3292 static PyMethodDef scanner_methods[] = {
3293 /* FIXME: use METH_OLDARGS instead of 0 or fix to use METH_VARARGS */
3294 /* METH_OLDARGS is not in Python 1.5.2 */
3295 {"match", (PyCFunction) scanner_match, 0},
3296 {"search", (PyCFunction) scanner_search, 0},
3297 {NULL, NULL}
3300 static PyObject*
3301 scanner_getattr(ScannerObject* self, char* name)
3303 PyObject* res;
3305 res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
3306 if (res)
3307 return res;
3309 PyErr_Clear();
3311 /* attributes */
3312 if (!strcmp(name, "pattern")) {
3313 Py_INCREF(self->pattern);
3314 return self->pattern;
3317 PyErr_SetString(PyExc_AttributeError, name);
3318 return NULL;
3321 statichere PyTypeObject Scanner_Type = {
3322 PyObject_HEAD_INIT(NULL)
3323 0, "_" SRE_MODULE ".SRE_Scanner",
3324 sizeof(ScannerObject), 0,
3325 (destructor)scanner_dealloc, /*tp_dealloc*/
3326 0, /*tp_print*/
3327 (getattrfunc)scanner_getattr, /*tp_getattr*/
3330 static PyMethodDef _functions[] = {
3331 {"compile", _compile, METH_VARARGS},
3332 {"getcodesize", sre_codesize, METH_VARARGS},
3333 {"getlower", sre_getlower, METH_VARARGS},
3334 {NULL, NULL}
3337 #if PY_VERSION_HEX < 0x02030000
3338 DL_EXPORT(void) init_sre(void)
3339 #else
3340 PyMODINIT_FUNC init_sre(void)
3341 #endif
3343 PyObject* m;
3344 PyObject* d;
3345 PyObject* x;
3347 /* Patch object types */
3348 Pattern_Type.ob_type = Match_Type.ob_type =
3349 Scanner_Type.ob_type = &PyType_Type;
3351 m = Py_InitModule("_" SRE_MODULE, _functions);
3352 d = PyModule_GetDict(m);
3354 x = PyInt_FromLong(SRE_MAGIC);
3355 if (x) {
3356 PyDict_SetItemString(d, "MAGIC", x);
3357 Py_DECREF(x);
3360 x = PyInt_FromLong(sizeof(SRE_CODE));
3361 if (x) {
3362 PyDict_SetItemString(d, "CODESIZE", x);
3363 Py_DECREF(x);
3366 x = PyString_FromString(copyright);
3367 if (x) {
3368 PyDict_SetItemString(d, "copyright", x);
3369 Py_DECREF(x);
3373 #endif /* !defined(SRE_RECURSIVE) */
3375 /* vim:ts=4:sw=4:et