This commit was manufactured by cvs2svn to create tag 'r23a1-fork'.
[python/dscho.git] / Modules / pcremodule.c
blob656539a59172ff3d64856a34a79b2444b4627552
1 /* Pcre objects */
3 #include "Python.h"
5 #ifndef Py_eval_input
6 /* For Python 1.4, graminit.h has to be explicitly included */
7 #include "graminit.h"
8 #define Py_eval_input eval_input
9 #endif
11 #ifndef FOR_PYTHON
12 #define FOR_PYTHON
13 #endif
15 #include "pcre.h"
16 #include "pcre-int.h"
18 static PyObject *ErrorObject;
20 typedef struct {
21 PyObject_HEAD
22 pcre *regex;
23 pcre_extra *regex_extra;
24 int num_groups;
25 } PcreObject;
27 static PyTypeObject Pcre_Type;
/* Exact-type test: true only when v's type is Pcre_Type itself. */
#define PcreObject_Check(v)     ((v)->ob_type == &Pcre_Type)

/* Parsing contexts. */
#define NORMAL                  0
#define CHARCLASS               1
#define REPLACEMENT             2

/* Escape categories.  PyPcre_expand_escape reports CHAR, MEMORY_REFERENCE
   or STRING through its typeptr argument. */
#define CHAR                    0
#define MEMORY_REFERENCE        1
#define SYNTAX                  2
#define NOT_SYNTAX              3
#define SET                     4
#define WORD_BOUNDARY           5
#define NOT_WORD_BOUNDARY       6
#define BEGINNING_OF_BUFFER     7
#define END_OF_BUFFER           8
#define STRING                  9
45 static PcreObject *
46 newPcreObject(PyObject *args)
48 PcreObject *self;
49 self = PyObject_New(PcreObject, &Pcre_Type);
50 if (self == NULL)
51 return NULL;
52 self->regex = NULL;
53 self->regex_extra = NULL;
54 return self;
57 /* Pcre methods */
59 static void
60 PyPcre_dealloc(PcreObject *self)
62 if (self->regex) (pcre_free)(self->regex);
63 if (self->regex_extra) (pcre_free)(self->regex_extra);
64 PyObject_Del(self);
68 static PyObject *
69 PyPcre_exec(PcreObject *self, PyObject *args)
71 char *string;
72 int stringlen, pos = 0, options=0, endpos = -1, i, count;
73 int offsets[100*2];
74 PyObject *list;
76 if (!PyArg_ParseTuple(args, "t#|iii:match", &string, &stringlen,
77 &pos, &endpos, &options))
78 return NULL;
79 if (endpos == -1) {endpos = stringlen;}
80 count = pcre_exec(self->regex, self->regex_extra,
81 string, endpos, pos, options,
82 offsets, sizeof(offsets)/sizeof(int) );
83 /* If an error occurred during the match, and an exception was raised,
84 just return NULL and leave the exception alone. The most likely
85 problem to cause this would be running out of memory for
86 the failure stack. */
87 if (PyErr_Occurred())
89 return NULL;
91 if (count==PCRE_ERROR_NOMATCH) {Py_INCREF(Py_None); return Py_None;}
92 if (count<0)
94 PyObject *errval = Py_BuildValue("si", "Regex execution error", count);
95 PyErr_SetObject(ErrorObject, errval);
96 Py_XDECREF(errval);
97 return NULL;
100 list=PyList_New(self->num_groups+1);
101 if (list==NULL) return NULL;
102 for(i=0; i<=self->num_groups; i++)
104 PyObject *v;
105 int start=offsets[i*2], end=offsets[i*2+1];
106 /* If the group wasn't affected by the match, return -1, -1 */
107 if (start<0 || count<=i)
108 {start=end=-1;}
109 v=Py_BuildValue("ii", start, end);
110 if (v==NULL) {Py_DECREF(list); return NULL;}
111 PyList_SetItem(list, i, v);
113 return list;
116 static PyMethodDef Pcre_methods[] = {
117 {"match", (PyCFunction)PyPcre_exec, METH_VARARGS},
118 {NULL, NULL} /* sentinel */
121 static PyObject *
122 PyPcre_getattr(PcreObject *self, char *name)
124 return Py_FindMethod(Pcre_methods, (PyObject *)self, name);
128 static PyTypeObject Pcre_Type = {
129 PyObject_HEAD_INIT(NULL)
130 0, /*ob_size*/
131 "pcre.Pcre", /*tp_name*/
132 sizeof(PcreObject), /*tp_basicsize*/
133 0, /*tp_itemsize*/
134 /* methods */
135 (destructor)PyPcre_dealloc, /*tp_dealloc*/
136 0, /*tp_print*/
137 (getattrfunc)PyPcre_getattr, /*tp_getattr*/
138 0, /*tp_setattr*/
139 0, /*tp_compare*/
140 0, /*tp_repr*/
141 0, /*tp_as_number*/
142 0, /*tp_as_sequence*/
143 0, /*tp_as_mapping*/
144 0, /*tp_hash*/
146 /* --------------------------------------------------------------------- */
148 static PyObject *
149 PyPcre_compile(PyObject *self, PyObject *args)
151 PcreObject *rv;
152 PyObject *dictionary;
153 char *pattern;
154 const char *error;
156 int options, erroroffset;
157 if (!PyArg_ParseTuple(args, "siO!:pcre_compile", &pattern, &options,
158 &PyDict_Type, &dictionary))
159 return NULL;
160 rv = newPcreObject(args);
161 if ( rv == NULL )
162 return NULL;
164 rv->regex = pcre_compile((char*)pattern, options,
165 &error, &erroroffset, dictionary);
166 if (rv->regex==NULL)
168 Py_DECREF(rv);
169 if (!PyErr_Occurred())
171 PyObject *errval = Py_BuildValue("si", error, erroroffset);
172 PyErr_SetObject(ErrorObject, errval);
173 Py_XDECREF(errval);
175 return NULL;
177 rv->regex_extra=pcre_study(rv->regex, 0, &error);
178 if (rv->regex_extra==NULL && error!=NULL)
180 PyObject *errval = Py_BuildValue("si", error, 0);
181 Py_DECREF(rv);
182 PyErr_SetObject(ErrorObject, errval);
183 Py_XDECREF(errval);
184 return NULL;
186 rv->num_groups = pcre_info(rv->regex, NULL, NULL);
187 if (rv->num_groups<0)
189 PyObject *errval = Py_BuildValue("si", error, rv->num_groups);
190 PyErr_SetObject(ErrorObject, errval);
191 Py_XDECREF(errval);
192 Py_DECREF(rv);
193 return NULL;
195 return (PyObject *)rv;
198 static PyObject *
199 PyPcre_expand_escape(unsigned char *pattern, int pattern_len,
200 int *indexptr, int *typeptr)
202 unsigned char c;
203 int index = *indexptr;
205 if (pattern_len<=index)
207 PyErr_SetString(ErrorObject, "escape ends too soon");
208 return NULL;
210 c=pattern[index]; index++;
211 *typeptr=CHAR;
213 switch (c)
215 case('t'):
216 *indexptr=index;
217 return Py_BuildValue("c", (char)9);
218 case('n'):
219 *indexptr = index;
220 return Py_BuildValue("c", (char)10);
221 case('v'):
222 *indexptr = index;
223 return Py_BuildValue("c", (char)11);
224 case('r'):
225 *indexptr = index;
226 return Py_BuildValue("c", (char)13);
227 case('f'):
228 *indexptr = index;
229 return Py_BuildValue("c", (char)12);
230 case('a'):
231 *indexptr = index;
232 return Py_BuildValue("c", (char)7);
233 case('b'):
234 *indexptr=index;
235 return Py_BuildValue("c", (char)8);
236 case('\\'):
237 *indexptr=index;
238 return Py_BuildValue("c", '\\');
240 case('x'):
242 int x, ch, end;
244 x = 0; end = index;
245 while ( (end<pattern_len && pcre_ctypes[ pattern[end] ] & ctype_xdigit) != 0)
247 ch = pattern[end];
248 x = x * 16 + pcre_lcc[ch] -
249 (((pcre_ctypes[ch] & ctype_digit) != 0)? '0' : 'W');
250 x &= 255;
251 end++;
253 if (end==index)
255 PyErr_SetString(ErrorObject, "\\x must be followed by hex digits");
256 return NULL;
258 *indexptr = end;
259 return Py_BuildValue("c", (char)x);
262 case('E'): case('G'): case('L'): case('Q'):
263 case('U'): case('l'): case('u'):
265 char message[50];
266 PyOS_snprintf(message, sizeof(message),
267 "\\%c is not allowed", c);
268 PyErr_SetString(ErrorObject, message);
269 return NULL;
272 case('g'):
274 int end, i;
275 int group_num = 0, is_number=0;
277 if (pattern_len<=index)
279 PyErr_SetString(ErrorObject, "unfinished symbolic reference");
280 return NULL;
282 if (pattern[index]!='<')
284 PyErr_SetString(ErrorObject, "missing < in symbolic reference");
285 return NULL;
287 index++;
288 end=index;
289 while (end<pattern_len && pattern[end]!='>')
290 end++;
291 if (end==pattern_len)
293 PyErr_SetString(ErrorObject, "unfinished symbolic reference");
294 return NULL;
297 if (index==end) /* Zero-length name */
299 /* XXX should include the text of the reference */
300 PyErr_SetString(ErrorObject, "zero-length symbolic reference");
301 return NULL;
303 if ((pcre_ctypes[pattern[index]] & ctype_digit)) /* First char. a digit */
305 is_number = 1;
306 group_num = pattern[index] - '0';
309 for(i=index+1; i<end; i++)
311 if (is_number &&
312 !(pcre_ctypes[pattern[i]] & ctype_digit) )
314 /* XXX should include the text of the reference */
315 PyErr_SetString(ErrorObject, "illegal non-digit character in \\g<...> starting with digit");
316 return NULL;
318 else {group_num = group_num * 10 + pattern[i] - '0';}
319 if (!(pcre_ctypes[pattern[i]] & ctype_word) )
321 /* XXX should include the text of the reference */
322 PyErr_SetString(ErrorObject, "illegal symbolic reference");
323 return NULL;
327 *typeptr = MEMORY_REFERENCE;
328 *indexptr = end+1;
329 /* If it's a number, return the integer value of the group */
330 if (is_number) return Py_BuildValue("i", group_num);
331 /* Otherwise, return a string containing the group name */
332 return Py_BuildValue("s#", pattern+index, end-index);
335 case('0'):
337 /* \0 always indicates an octal escape, so we consume up to 3
338 characters, as long as they're all octal digits */
339 int octval=0, i;
340 index--;
341 for(i=index;
342 i<=index+2 && i<pattern_len
343 && (pcre_ctypes[ pattern[i] ] & ctype_odigit );
344 i++)
346 octval = octval * 8 + pattern[i] - '0';
348 if (octval>255)
350 PyErr_SetString(ErrorObject, "octal value out of range");
351 return NULL;
353 *indexptr = i;
354 return Py_BuildValue("c", (unsigned char)octval);
357 case('1'): case('2'): case('3'): case('4'):
358 case('5'): case('6'): case('7'): case('8'):
359 case('9'):
361 /* Handle \?, where ? is from 1 through 9 */
362 int value=0;
363 index--;
364 /* If it's at least a two-digit reference, like \34, it might
365 either be a 3-digit octal escape (\123) or a 2-digit
366 decimal memory reference (\34) */
368 if ( (index+1) <pattern_len &&
369 (pcre_ctypes[ pattern[index+1] ] & ctype_digit) )
371 if ( (index+2) <pattern_len &&
372 (pcre_ctypes[ pattern[index+2] ] & ctype_odigit) &&
373 (pcre_ctypes[ pattern[index+1] ] & ctype_odigit) &&
374 (pcre_ctypes[ pattern[index ] ] & ctype_odigit)
377 /* 3 octal digits */
378 value= 8*8*(pattern[index ]-'0') +
379 8*(pattern[index+1]-'0') +
380 (pattern[index+2]-'0');
381 if (value>255)
383 PyErr_SetString(ErrorObject, "octal value out of range");
384 return NULL;
386 *indexptr = index+3;
387 return Py_BuildValue("c", (unsigned char)value);
389 else
391 /* 2-digit form, so it's a memory reference */
392 value= 10*(pattern[index ]-'0') +
393 (pattern[index+1]-'0');
394 if (value<1 || EXTRACT_MAX<=value)
396 PyErr_SetString(ErrorObject, "memory reference out of range");
397 return NULL;
399 *typeptr = MEMORY_REFERENCE;
400 *indexptr = index+2;
401 return Py_BuildValue("i", value);
404 else
406 /* Single-digit form, like \2, so it's a memory reference */
407 *typeptr = MEMORY_REFERENCE;
408 *indexptr = index+1;
409 return Py_BuildValue("i", pattern[index]-'0');
413 default:
414 /* It's some unknown escape like \s, so return a string containing
415 \s */
416 *typeptr = STRING;
417 *indexptr = index;
418 return Py_BuildValue("s#", pattern+index-2, 2);
422 static PyObject *
423 PyPcre_expand(PyObject *self, PyObject *args)
425 PyObject *results, *match_obj;
426 PyObject *repl_obj, *newstring;
427 unsigned char *repl;
428 int size, total_len, i, start, pos;
430 if (!PyArg_ParseTuple(args, "OS:pcre_expand", &match_obj, &repl_obj))
431 return NULL;
433 repl=(unsigned char *)PyString_AsString(repl_obj);
434 size=PyString_Size(repl_obj);
435 results=PyList_New(0);
436 if (results==NULL) return NULL;
437 for(start=total_len=i=0; i<size; i++)
439 if (repl[i]=='\\')
441 PyObject *value;
442 int escape_type;
444 if (start!=i)
446 int status;
447 PyObject *s = PyString_FromStringAndSize(
448 (char *)repl+start, i-start);
449 if (s == NULL) {
450 Py_DECREF(results);
451 return NULL;
453 status = PyList_Append(results, s);
454 Py_DECREF(s);
455 if (status < 0) {
456 Py_DECREF(results);
457 return NULL;
459 total_len += i-start;
461 i++;
462 value=PyPcre_expand_escape(repl, size, &i, &escape_type);
463 if (value==NULL)
465 /* PyPcre_expand_escape triggered an exception of some sort,
466 so just return */
467 Py_DECREF(results);
468 return NULL;
470 switch (escape_type)
472 case (CHAR):
473 PyList_Append(results, value);
474 total_len += PyString_Size(value);
475 break;
476 case(MEMORY_REFERENCE):
478 PyObject *r, *tuple, *result;
479 r=PyObject_GetAttrString(match_obj, "group");
480 if (r == NULL) {
481 Py_DECREF(results);
482 return NULL;
484 tuple=PyTuple_New(1);
485 Py_INCREF(value);
486 PyTuple_SetItem(tuple, 0, value);
487 result=PyEval_CallObject(r, tuple);
488 Py_DECREF(r); Py_DECREF(tuple);
489 if (result==NULL)
491 /* The group() method triggered an exception of some sort */
492 Py_DECREF(results);
493 Py_DECREF(value);
494 return NULL;
496 if (result==Py_None)
498 char message[50];
499 PyOS_snprintf(message, sizeof(message),
500 "group did not contribute to the match");
501 PyErr_SetString(ErrorObject,
502 message);
503 Py_DECREF(result);
504 Py_DECREF(value);
505 Py_DECREF(results);
506 return NULL;
508 /* typecheck that it's a string! */
509 if (!PyString_Check(result))
511 Py_DECREF(results);
512 Py_DECREF(result);
513 PyErr_SetString(ErrorObject,
514 "group() must return a string value for replacement");
515 return NULL;
517 PyList_Append(results, result);
518 total_len += PyString_Size(result);
519 Py_DECREF(result);
521 break;
522 case(STRING):
524 PyList_Append(results, value);
525 total_len += PyString_Size(value);
526 break;
528 default:
529 Py_DECREF(results);
530 PyErr_SetString(ErrorObject,
531 "bad escape in replacement");
532 return NULL;
534 Py_DECREF(value);
535 start=i;
536 i--; /* Decrement now, because the 'for' loop will increment it */
538 } /* endif repl[i]!='\\' */
540 if (start!=i)
542 int status;
543 PyObject *s = PyString_FromStringAndSize((char *)repl+start,
544 i-start);
545 if (s == NULL) {
546 Py_DECREF(results);
547 return NULL;
549 status = PyList_Append(results, s);
550 Py_DECREF(s);
551 if (status < 0) {
552 Py_DECREF(results);
553 return NULL;
555 total_len += i-start;
558 /* Whew! Now we've constructed a list containing various pieces of
559 strings that will make up our final result. So, iterate over
560 the list concatenating them. A new string measuring total_len
561 bytes is allocated and filled in. */
563 newstring=PyString_FromStringAndSize(NULL, total_len);
564 if (newstring==NULL)
566 Py_DECREF(results);
567 return NULL;
570 repl=(unsigned char *)PyString_AsString(newstring);
571 for (pos=i=0; i<PyList_Size(results); i++)
573 PyObject *item=PyList_GetItem(results, i);
574 memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
575 pos += PyString_Size(item);
577 Py_DECREF(results);
578 return newstring;
582 /* List of functions defined in the module */
584 static PyMethodDef pcre_methods[] = {
585 {"pcre_compile", PyPcre_compile, METH_VARARGS},
586 {"pcre_expand", PyPcre_expand, METH_VARARGS},
587 {NULL, NULL} /* sentinel */
592 * Convenience routine to export an integer value.
593 * For simplicity, errors (which are unlikely anyway) are ignored.
596 static void
597 insint(PyObject *d, char *name, int value)
599 PyObject *v = PyInt_FromLong((long) value);
600 if (v == NULL) {
601 /* Don't bother reporting this error */
602 PyErr_Clear();
604 else {
605 PyDict_SetItemString(d, name, v);
606 Py_DECREF(v);
611 /* Initialization function for the module (*must* be called initpcre) */
613 PyMODINIT_FUNC
614 initpcre(void)
616 PyObject *m, *d;
618 Pcre_Type.ob_type = &PyType_Type;
620 /* Create the module and add the functions */
621 m = Py_InitModule("pcre", pcre_methods);
623 /* Add some symbolic constants to the module */
624 d = PyModule_GetDict(m);
625 ErrorObject = PyErr_NewException("pcre.error", NULL, NULL);
626 PyDict_SetItemString(d, "error", ErrorObject);
628 /* Insert the flags */
629 insint(d, "IGNORECASE", PCRE_CASELESS);
630 insint(d, "ANCHORED", PCRE_ANCHORED);
631 insint(d, "MULTILINE", PCRE_MULTILINE);
632 insint(d, "DOTALL", PCRE_DOTALL);
633 insint(d, "VERBOSE", PCRE_EXTENDED);
634 insint(d, "LOCALE", PCRE_LOCALE);