This commit was manufactured by cvs2svn to create tag 'r212c1'.
[python/dscho.git] / Modules / pcremodule.c
blobaf5b25456449828ce57c971da9e95e1563b4ff8d
1 /* Pcre objects */
#include "Python.h"

#include <assert.h>
#include <limits.h>
#ifndef Py_eval_input
/* For Python 1.4, graminit.h has to be explicitly included */
#include "graminit.h"
#define Py_eval_input eval_input
#endif

#ifndef FOR_PYTHON
#define FOR_PYTHON
#endif

#include "pcre.h"
#include "pcre-int.h"
19 static PyObject *ErrorObject;
21 typedef struct {
22 PyObject_HEAD
23 pcre *regex;
24 pcre_extra *regex_extra;
25 int num_groups;
26 } PcreObject;
28 staticforward PyTypeObject Pcre_Type;
30 #define PcreObject_Check(v) ((v)->ob_type == &Pcre_Type)
/* Mode constants.  NOTE(review): none of these three is referenced in the
   part of the file visible here; presumably kept for the pattern/replacement
   scanner -- confirm before removing. */
#define NORMAL                  0
#define CHARCLASS               1
#define REPLACEMENT             2

/* Escape classification codes.  PyPcre_expand_escape() reports CHAR,
   MEMORY_REFERENCE or STRING through its typeptr argument, and
   PyPcre_expand() switches on them; the remaining codes are not used in
   the code visible here. */
#define CHAR                    0
#define MEMORY_REFERENCE        1
#define SYNTAX                  2
#define NOT_SYNTAX              3
#define SET                     4
#define WORD_BOUNDARY           5
#define NOT_WORD_BOUNDARY       6
#define BEGINNING_OF_BUFFER     7
#define END_OF_BUFFER           8
#define STRING                  9
46 static PcreObject *
47 newPcreObject(PyObject *args)
49 PcreObject *self;
50 self = PyObject_New(PcreObject, &Pcre_Type);
51 if (self == NULL)
52 return NULL;
53 self->regex = NULL;
54 self->regex_extra = NULL;
55 return self;
58 /* Pcre methods */
60 static void
61 PyPcre_dealloc(PcreObject *self)
63 if (self->regex) (pcre_free)(self->regex);
64 if (self->regex_extra) (pcre_free)(self->regex_extra);
65 PyObject_Del(self);
69 static PyObject *
70 PyPcre_exec(PcreObject *self, PyObject *args)
72 char *string;
73 int stringlen, pos = 0, options=0, endpos = -1, i, count;
74 int offsets[100*2];
75 PyObject *list;
77 if (!PyArg_ParseTuple(args, "t#|iii:match", &string, &stringlen,
78 &pos, &endpos, &options))
79 return NULL;
80 if (endpos == -1) {endpos = stringlen;}
81 count = pcre_exec(self->regex, self->regex_extra,
82 string, endpos, pos, options,
83 offsets, sizeof(offsets)/sizeof(int) );
84 /* If an error occurred during the match, and an exception was raised,
85 just return NULL and leave the exception alone. The most likely
86 problem to cause this would be running out of memory for
87 the failure stack. */
88 if (PyErr_Occurred())
90 return NULL;
92 if (count==PCRE_ERROR_NOMATCH) {Py_INCREF(Py_None); return Py_None;}
93 if (count<0)
95 PyObject *errval = Py_BuildValue("si", "Regex execution error", count);
96 PyErr_SetObject(ErrorObject, errval);
97 Py_XDECREF(errval);
98 return NULL;
101 list=PyList_New(self->num_groups+1);
102 if (list==NULL) return NULL;
103 for(i=0; i<=self->num_groups; i++)
105 PyObject *v;
106 int start=offsets[i*2], end=offsets[i*2+1];
107 /* If the group wasn't affected by the match, return -1, -1 */
108 if (start<0 || count<=i)
109 {start=end=-1;}
110 v=Py_BuildValue("ii", start, end);
111 if (v==NULL) {Py_DECREF(list); return NULL;}
112 PyList_SetItem(list, i, v);
114 return list;
117 static PyMethodDef Pcre_methods[] = {
118 {"match", (PyCFunction)PyPcre_exec, 1},
119 {NULL, NULL} /* sentinel */
122 static PyObject *
123 PyPcre_getattr(PcreObject *self, char *name)
125 return Py_FindMethod(Pcre_methods, (PyObject *)self, name);
129 staticforward PyTypeObject Pcre_Type = {
130 PyObject_HEAD_INIT(NULL)
131 0, /*ob_size*/
132 "Pcre", /*tp_name*/
133 sizeof(PcreObject), /*tp_basicsize*/
134 0, /*tp_itemsize*/
135 /* methods */
136 (destructor)PyPcre_dealloc, /*tp_dealloc*/
137 0, /*tp_print*/
138 (getattrfunc)PyPcre_getattr, /*tp_getattr*/
139 0, /*tp_setattr*/
140 0, /*tp_compare*/
141 0, /*tp_repr*/
142 0, /*tp_as_number*/
143 0, /*tp_as_sequence*/
144 0, /*tp_as_mapping*/
145 0, /*tp_hash*/
147 /* --------------------------------------------------------------------- */
149 static PyObject *
150 PyPcre_compile(PyObject *self, PyObject *args)
152 PcreObject *rv;
153 PyObject *dictionary;
154 char *pattern;
155 const char *error;
157 int options, erroroffset;
158 if (!PyArg_ParseTuple(args, "siO!:pcre_compile", &pattern, &options,
159 &PyDict_Type, &dictionary))
160 return NULL;
161 rv = newPcreObject(args);
162 if ( rv == NULL )
163 return NULL;
165 rv->regex = pcre_compile((char*)pattern, options,
166 &error, &erroroffset, dictionary);
167 if (rv->regex==NULL)
169 Py_DECREF(rv);
170 if (!PyErr_Occurred())
172 PyObject *errval = Py_BuildValue("si", error, erroroffset);
173 PyErr_SetObject(ErrorObject, errval);
174 Py_XDECREF(errval);
176 return NULL;
178 rv->regex_extra=pcre_study(rv->regex, 0, &error);
179 if (rv->regex_extra==NULL && error!=NULL)
181 PyObject *errval = Py_BuildValue("si", error, 0);
182 Py_DECREF(rv);
183 PyErr_SetObject(ErrorObject, errval);
184 Py_XDECREF(errval);
185 return NULL;
187 rv->num_groups = pcre_info(rv->regex, NULL, NULL);
188 if (rv->num_groups<0)
190 PyObject *errval = Py_BuildValue("si", error, rv->num_groups);
191 PyErr_SetObject(ErrorObject, errval);
192 Py_XDECREF(errval);
193 Py_DECREF(rv);
194 return NULL;
196 return (PyObject *)rv;
199 static PyObject *
200 PyPcre_expand_escape(unsigned char *pattern, int pattern_len,
201 int *indexptr, int *typeptr)
203 unsigned char c;
204 int index = *indexptr;
206 if (pattern_len<=index)
208 PyErr_SetString(ErrorObject, "escape ends too soon");
209 return NULL;
211 c=pattern[index]; index++;
212 *typeptr=CHAR;
214 switch (c)
216 case('t'):
217 *indexptr=index;
218 return Py_BuildValue("c", (char)9);
219 case('n'):
220 *indexptr = index;
221 return Py_BuildValue("c", (char)10);
222 case('v'):
223 *indexptr = index;
224 return Py_BuildValue("c", (char)11);
225 case('r'):
226 *indexptr = index;
227 return Py_BuildValue("c", (char)13);
228 case('f'):
229 *indexptr = index;
230 return Py_BuildValue("c", (char)12);
231 case('a'):
232 *indexptr = index;
233 return Py_BuildValue("c", (char)7);
234 case('b'):
235 *indexptr=index;
236 return Py_BuildValue("c", (char)8);
237 case('\\'):
238 *indexptr=index;
239 return Py_BuildValue("c", '\\');
241 case('x'):
243 int x, ch, end;
245 x = 0; end = index;
246 while ( (end<pattern_len && pcre_ctypes[ pattern[end] ] & ctype_xdigit) != 0)
248 ch = pattern[end];
249 x = x * 16 + pcre_lcc[ch] -
250 (((pcre_ctypes[ch] & ctype_digit) != 0)? '0' : 'W');
251 x &= 255;
252 end++;
254 if (end==index)
256 PyErr_SetString(ErrorObject, "\\x must be followed by hex digits");
257 return NULL;
259 *indexptr = end;
260 return Py_BuildValue("c", (char)x);
262 break;
264 case('E'): case('G'): case('L'): case('Q'):
265 case('U'): case('l'): case('u'):
267 char message[50];
268 sprintf(message, "\\%c is not allowed", c);
269 PyErr_SetString(ErrorObject, message);
270 return NULL;
273 case('g'):
275 int end, i;
276 int group_num = 0, is_number=0;
278 if (pattern_len<=index)
280 PyErr_SetString(ErrorObject, "unfinished symbolic reference");
281 return NULL;
283 if (pattern[index]!='<')
285 PyErr_SetString(ErrorObject, "missing < in symbolic reference");
286 return NULL;
288 index++;
289 end=index;
290 while (end<pattern_len && pattern[end]!='>')
291 end++;
292 if (end==pattern_len)
294 PyErr_SetString(ErrorObject, "unfinished symbolic reference");
295 return NULL;
298 if (index==end) /* Zero-length name */
300 /* XXX should include the text of the reference */
301 PyErr_SetString(ErrorObject, "zero-length symbolic reference");
302 return NULL;
304 if ((pcre_ctypes[pattern[index]] & ctype_digit)) /* First char. a digit */
306 is_number = 1;
307 group_num = pattern[index] - '0';
310 for(i=index+1; i<end; i++)
312 if (is_number &&
313 !(pcre_ctypes[pattern[i]] & ctype_digit) )
315 /* XXX should include the text of the reference */
316 PyErr_SetString(ErrorObject, "illegal non-digit character in \\g<...> starting with digit");
317 return NULL;
319 else {group_num = group_num * 10 + pattern[i] - '0';}
320 if (!(pcre_ctypes[pattern[i]] & ctype_word) )
322 /* XXX should include the text of the reference */
323 PyErr_SetString(ErrorObject, "illegal symbolic reference");
324 return NULL;
328 *typeptr = MEMORY_REFERENCE;
329 *indexptr = end+1;
330 /* If it's a number, return the integer value of the group */
331 if (is_number) return Py_BuildValue("i", group_num);
332 /* Otherwise, return a string containing the group name */
333 return Py_BuildValue("s#", pattern+index, end-index);
335 break;
337 case('0'):
339 /* \0 always indicates an octal escape, so we consume up to 3
340 characters, as long as they're all octal digits */
341 int octval=0, i;
342 index--;
343 for(i=index;
344 i<=index+2 && i<pattern_len
345 && (pcre_ctypes[ pattern[i] ] & ctype_odigit );
346 i++)
348 octval = octval * 8 + pattern[i] - '0';
350 if (octval>255)
352 PyErr_SetString(ErrorObject, "octal value out of range");
353 return NULL;
355 *indexptr = i;
356 return Py_BuildValue("c", (unsigned char)octval);
358 break;
359 case('1'): case('2'): case('3'): case('4'):
360 case('5'): case('6'): case('7'): case('8'):
361 case('9'):
363 /* Handle \?, where ? is from 1 through 9 */
364 int value=0;
365 index--;
366 /* If it's at least a two-digit reference, like \34, it might
367 either be a 3-digit octal escape (\123) or a 2-digit
368 decimal memory reference (\34) */
370 if ( (index+1) <pattern_len &&
371 (pcre_ctypes[ pattern[index+1] ] & ctype_digit) )
373 if ( (index+2) <pattern_len &&
374 (pcre_ctypes[ pattern[index+2] ] & ctype_odigit) &&
375 (pcre_ctypes[ pattern[index+1] ] & ctype_odigit) &&
376 (pcre_ctypes[ pattern[index ] ] & ctype_odigit)
379 /* 3 octal digits */
380 value= 8*8*(pattern[index ]-'0') +
381 8*(pattern[index+1]-'0') +
382 (pattern[index+2]-'0');
383 if (value>255)
385 PyErr_SetString(ErrorObject, "octal value out of range");
386 return NULL;
388 *indexptr = index+3;
389 return Py_BuildValue("c", (unsigned char)value);
391 else
393 /* 2-digit form, so it's a memory reference */
394 value= 10*(pattern[index ]-'0') +
395 (pattern[index+1]-'0');
396 if (value<1 || EXTRACT_MAX<=value)
398 PyErr_SetString(ErrorObject, "memory reference out of range");
399 return NULL;
401 *typeptr = MEMORY_REFERENCE;
402 *indexptr = index+2;
403 return Py_BuildValue("i", value);
406 else
408 /* Single-digit form, like \2, so it's a memory reference */
409 *typeptr = MEMORY_REFERENCE;
410 *indexptr = index+1;
411 return Py_BuildValue("i", pattern[index]-'0');
414 break;
416 default:
417 /* It's some unknown escape like \s, so return a string containing
418 \s */
419 *typeptr = STRING;
420 *indexptr = index;
421 return Py_BuildValue("s#", pattern+index-2, 2);
425 static PyObject *
426 PyPcre_expand(PyObject *self, PyObject *args)
428 PyObject *results, *match_obj;
429 PyObject *repl_obj, *newstring;
430 unsigned char *repl;
431 int size, total_len, i, start, pos;
433 if (!PyArg_ParseTuple(args, "OS:pcre_expand", &match_obj, &repl_obj))
434 return NULL;
436 repl=(unsigned char *)PyString_AsString(repl_obj);
437 size=PyString_Size(repl_obj);
438 results=PyList_New(0);
439 if (results==NULL) return NULL;
440 for(start=total_len=i=0; i<size; i++)
442 if (repl[i]=='\\')
444 PyObject *value;
445 int escape_type;
447 if (start!=i)
449 int status;
450 PyObject *s = PyString_FromStringAndSize(
451 (char *)repl+start, i-start);
452 if (s == NULL) {
453 Py_DECREF(results);
454 return NULL;
456 status = PyList_Append(results, s);
457 Py_DECREF(s);
458 if (status < 0) {
459 Py_DECREF(results);
460 return NULL;
462 total_len += i-start;
464 i++;
465 value=PyPcre_expand_escape(repl, size, &i, &escape_type);
466 if (value==NULL)
468 /* PyPcre_expand_escape triggered an exception of some sort,
469 so just return */
470 Py_DECREF(results);
471 return NULL;
473 switch (escape_type)
475 case (CHAR):
476 PyList_Append(results, value);
477 total_len += PyString_Size(value);
478 break;
479 case(MEMORY_REFERENCE):
481 PyObject *r, *tuple, *result;
482 r=PyObject_GetAttrString(match_obj, "group");
483 if (r == NULL) {
484 Py_DECREF(results);
485 return NULL;
487 tuple=PyTuple_New(1);
488 Py_INCREF(value);
489 PyTuple_SetItem(tuple, 0, value);
490 result=PyEval_CallObject(r, tuple);
491 Py_DECREF(r); Py_DECREF(tuple);
492 if (result==NULL)
494 /* The group() method triggered an exception of some sort */
495 Py_DECREF(results);
496 Py_DECREF(value);
497 return NULL;
499 if (result==Py_None)
501 char message[50];
502 sprintf(message,
503 "group did not contribute to the match");
504 PyErr_SetString(ErrorObject,
505 message);
506 Py_DECREF(result);
507 Py_DECREF(value);
508 Py_DECREF(results);
509 return NULL;
511 /* typecheck that it's a string! */
512 if (!PyString_Check(result))
514 Py_DECREF(results);
515 Py_DECREF(result);
516 PyErr_SetString(ErrorObject,
517 "group() must return a string value for replacement");
518 return NULL;
520 PyList_Append(results, result);
521 total_len += PyString_Size(result);
522 Py_DECREF(result);
524 break;
525 case(STRING):
527 PyList_Append(results, value);
528 total_len += PyString_Size(value);
529 break;
531 default:
532 Py_DECREF(results);
533 PyErr_SetString(ErrorObject,
534 "bad escape in replacement");
535 return NULL;
537 Py_DECREF(value);
538 start=i;
539 i--; /* Decrement now, because the 'for' loop will increment it */
541 } /* endif repl[i]!='\\' */
543 if (start!=i)
545 int status;
546 PyObject *s = PyString_FromStringAndSize((char *)repl+start,
547 i-start);
548 if (s == NULL) {
549 Py_DECREF(results);
550 return NULL;
552 status = PyList_Append(results, s);
553 Py_DECREF(s);
554 if (status < 0) {
555 Py_DECREF(results);
556 return NULL;
558 total_len += i-start;
561 /* Whew! Now we've constructed a list containing various pieces of
562 strings that will make up our final result. So, iterate over
563 the list concatenating them. A new string measuring total_len
564 bytes is allocated and filled in. */
566 newstring=PyString_FromStringAndSize(NULL, total_len);
567 if (newstring==NULL)
569 Py_DECREF(results);
570 return NULL;
573 repl=(unsigned char *)PyString_AsString(newstring);
574 for (pos=i=0; i<PyList_Size(results); i++)
576 PyObject *item=PyList_GetItem(results, i);
577 memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
578 pos += PyString_Size(item);
580 Py_DECREF(results);
581 return newstring;
585 /* List of functions defined in the module */
587 static PyMethodDef pcre_methods[] = {
588 {"pcre_compile", PyPcre_compile, 1},
589 {"pcre_expand", PyPcre_expand, 1},
590 {NULL, NULL} /* sentinel */
595 * Convenience routine to export an integer value.
596 * For simplicity, errors (which are unlikely anyway) are ignored.
599 static void
600 insint(PyObject *d, char *name, int value)
602 PyObject *v = PyInt_FromLong((long) value);
603 if (v == NULL) {
604 /* Don't bother reporting this error */
605 PyErr_Clear();
607 else {
608 PyDict_SetItemString(d, name, v);
609 Py_DECREF(v);
614 /* Initialization function for the module (*must* be called initpcre) */
616 DL_EXPORT(void)
617 initpcre(void)
619 PyObject *m, *d;
621 Pcre_Type.ob_type = &PyType_Type;
623 /* Create the module and add the functions */
624 m = Py_InitModule("pcre", pcre_methods);
626 /* Add some symbolic constants to the module */
627 d = PyModule_GetDict(m);
628 ErrorObject = PyErr_NewException("pcre.error", NULL, NULL);
629 PyDict_SetItemString(d, "error", ErrorObject);
631 /* Insert the flags */
632 insint(d, "IGNORECASE", PCRE_CASELESS);
633 insint(d, "ANCHORED", PCRE_ANCHORED);
634 insint(d, "MULTILINE", PCRE_MULTILINE);
635 insint(d, "DOTALL", PCRE_DOTALL);
636 insint(d, "VERBOSE", PCRE_EXTENDED);
637 insint(d, "LOCALE", PCRE_LOCALE);