Ditched '_find_SET()', since it was a no-value-added wrapper around
[python/dscho.git] / Modules / pcremodule.c
blob9ae9226ba3d16c3b383003fdba028c112cbd5e27
1 /***********************************************************
2 Copyright 1997 by Stichting Mathematisch Centrum, Amsterdam,
3 The Netherlands.
5 All Rights Reserved
7 Permission to use, copy, modify, and distribute this software and its
8 documentation for any purpose and without fee is hereby granted,
9 provided that the above copyright notice appear in all copies and that
10 both that copyright notice and this permission notice appear in
11 supporting documentation, and that the names of Stichting Mathematisch
12 Centrum or CWI or Corporation for National Research Initiatives or
13 CNRI not be used in advertising or publicity pertaining to
14 distribution of the software without specific, written prior
15 permission.
17 While CWI is the initial source for this software, a modified version
18 is made available by the Corporation for National Research Initiatives
19 (CNRI) at the Internet address ftp://ftp.python.org.
21 STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22 REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23 MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24 CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25 DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26 PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28 PERFORMANCE OF THIS SOFTWARE.
30 ******************************************************************/
32 /* Pcre objects */
34 #include "Python.h"
36 #include <assert.h>
37 #ifndef Py_eval_input
38 /* For Python 1.4, graminit.h has to be explicitly included */
39 #include "graminit.h"
40 #define Py_eval_input eval_input
41 #endif
43 #ifndef FOR_PYTHON
44 #define FOR_PYTHON
45 #endif
47 #include "pcre.h"
48 #include "pcre-int.h"
/* Exception object used for every error this module raises itself; it is
   created in initpcre() and published in the module dict as "error". */
50 static PyObject *ErrorObject;
/* Instance layout of a Pcre object.
     regex        compiled pattern returned by pcre_compile()
     regex_extra  result of pcre_study() on that pattern (may be NULL)
     num_groups   value returned by pcre_info(); match() reports
                  num_groups+1 spans (index 0 plus one per group) */
52 typedef struct {
53 PyObject_HEAD
54 pcre *regex;
55 pcre_extra *regex_extra;
56 int num_groups;
57 } PcreObject;
/* Forward declaration; the type object is defined further down. */
59 staticforward PyTypeObject Pcre_Type;
/* Exact type test (no subclass handling). */
61 #define PcreObject_Check(v) ((v)->ob_type == &Pcre_Type)
/* NOTE(review): NORMAL/CHARCLASS/REPLACEMENT are not referenced in the
   visible part of this file -- presumably parsing contexts; confirm
   before removing. */
62 #define NORMAL 0
63 #define CHARCLASS 1
64 #define REPLACEMENT 2
/* Escape classifications stored through the `typeptr` argument of
   PyPcre_expand_escape().  The visible code only ever produces CHAR,
   MEMORY_REFERENCE and STRING; the remaining codes are unused here. */
66 #define CHAR 0
67 #define MEMORY_REFERENCE 1
68 #define SYNTAX 2
69 #define NOT_SYNTAX 3
70 #define SET 4
71 #define WORD_BOUNDARY 5
72 #define NOT_WORD_BOUNDARY 6
73 #define BEGINNING_OF_BUFFER 7
74 #define END_OF_BUFFER 8
75 #define STRING 9
77 static PcreObject *
78 newPcreObject(arg)
79 PyObject *arg;
81 PcreObject *self;
82 self = PyObject_NEW(PcreObject, &Pcre_Type);
83 if (self == NULL)
84 return NULL;
85 self->regex = NULL;
86 self->regex_extra = NULL;
87 return self;
90 /* Pcre methods */
92 static void
93 PyPcre_dealloc(self)
94 PcreObject *self;
96 if (self->regex) free(self->regex);
97 if (self->regex_extra) free(self->regex_extra);
98 self->regex=NULL;
99 self->regex_extra=NULL;
100 PyMem_DEL(self);
104 static PyObject *
105 PyPcre_exec(self, args)
106 PcreObject *self;
107 PyObject *args;
109 char *string;
110 int stringlen, pos = 0, options=0, endpos = -1, i, count;
111 int offsets[100*2];
112 PyObject *list;
114 if (!PyArg_ParseTuple(args, "t#|iiii", &string, &stringlen, &pos, &endpos, &options))
115 return NULL;
116 if (endpos == -1) {endpos = stringlen;}
117 count = pcre_exec(self->regex, self->regex_extra,
118 string, endpos, pos, options,
119 offsets, sizeof(offsets)/sizeof(int) );
120 /* If an error occurred during the match, and an exception was raised,
121 just return NULL and leave the exception alone. The most likely
122 problem to cause this would be running out of memory for
123 the failure stack. */
124 if (PyErr_Occurred())
126 return NULL;
128 if (count==PCRE_ERROR_NOMATCH) {Py_INCREF(Py_None); return Py_None;}
129 if (count<0)
131 PyObject *errval = Py_BuildValue("si", "Regex execution error", count);
132 PyErr_SetObject(ErrorObject, errval);
133 Py_XDECREF(errval);
134 return NULL;
137 list=PyList_New(self->num_groups+1);
138 if (list==NULL) return NULL;
139 for(i=0; i<=self->num_groups; i++)
141 PyObject *v;
142 int start=offsets[i*2], end=offsets[i*2+1];
143 /* If the group wasn't affected by the match, return -1, -1 */
144 if (start<0 || count<=i)
145 {start=end=-1;}
146 v=Py_BuildValue("ii", start, end);
147 if (v==NULL) {Py_DECREF(list); return NULL;}
148 PyList_SetItem(list, i, v);
150 return list;
153 static PyMethodDef Pcre_methods[] = {
154 {"match", (PyCFunction)PyPcre_exec, 1},
155 {NULL, NULL} /* sentinel */
158 static PyObject *
159 PyPcre_getattr(self, name)
160 PcreObject *self;
161 char *name;
163 return Py_FindMethod(Pcre_methods, (PyObject *)self, name);
167 staticforward PyTypeObject Pcre_Type = {
168 PyObject_HEAD_INIT(NULL)
169 0, /*ob_size*/
170 "Pcre", /*tp_name*/
171 sizeof(PcreObject), /*tp_basicsize*/
172 0, /*tp_itemsize*/
173 /* methods */
174 (destructor)PyPcre_dealloc, /*tp_dealloc*/
175 0, /*tp_print*/
176 (getattrfunc)PyPcre_getattr, /*tp_getattr*/
177 0, /*tp_setattr*/
178 0, /*tp_compare*/
179 0, /*tp_repr*/
180 0, /*tp_as_number*/
181 0, /*tp_as_sequence*/
182 0, /*tp_as_mapping*/
183 0, /*tp_hash*/
185 /* --------------------------------------------------------------------- */
187 static PyObject *
188 PyPcre_compile(self, args)
189 PyObject *self; /* Not used */
190 PyObject *args;
192 PcreObject *rv;
193 PyObject *dictionary;
194 char *pattern;
195 const char *error;
197 int options, erroroffset;
198 if (!PyArg_ParseTuple(args, "siO!", &pattern, &options,
199 &PyDict_Type, &dictionary))
200 return NULL;
201 rv = newPcreObject(args);
202 if ( rv == NULL )
203 return NULL;
205 rv->regex = pcre_compile((char*)pattern, options,
206 &error, &erroroffset, dictionary);
207 if (rv->regex==NULL)
209 PyMem_DEL(rv);
210 if (!PyErr_Occurred())
212 PyObject *errval = Py_BuildValue("si", error, erroroffset);
213 PyErr_SetObject(ErrorObject, errval);
214 Py_XDECREF(errval);
216 return NULL;
218 rv->regex_extra=pcre_study(rv->regex, 0, &error);
219 if (rv->regex_extra==NULL && error!=NULL)
221 PyObject *errval = Py_BuildValue("si", error, 0);
222 PyMem_DEL(rv);
223 PyErr_SetObject(ErrorObject, errval);
224 Py_XDECREF(errval);
225 return NULL;
227 rv->num_groups = pcre_info(rv->regex, NULL, NULL);
228 if (rv->num_groups<0)
230 PyObject *errval = Py_BuildValue("si", error, rv->num_groups);
231 PyErr_SetObject(ErrorObject, errval);
232 Py_XDECREF(errval);
233 PyMem_DEL(rv);
234 return NULL;
236 return (PyObject *)rv;
239 static PyObject *
240 PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
241 unsigned char *pattern;
242 int pattern_len, *indexptr, *typeptr;
244 unsigned char c;
245 int index = *indexptr;
247 if (pattern_len<=index)
249 PyErr_SetString(ErrorObject, "escape ends too soon");
250 return NULL;
252 c=pattern[index]; index++;
253 *typeptr=CHAR;
255 switch (c)
257 case('t'):
258 *indexptr=index;
259 return Py_BuildValue("c", (char)9);
260 case('n'):
261 *indexptr = index;
262 return Py_BuildValue("c", (char)10);
263 case('v'):
264 *indexptr = index;
265 return Py_BuildValue("c", (char)11);
266 case('r'):
267 *indexptr = index;
268 return Py_BuildValue("c", (char)13);
269 case('f'):
270 *indexptr = index;
271 return Py_BuildValue("c", (char)12);
272 case('a'):
273 *indexptr = index;
274 return Py_BuildValue("c", (char)7);
275 case('b'):
276 *indexptr=index;
277 return Py_BuildValue("c", (char)8);
278 case('\\'):
279 *indexptr=index;
280 return Py_BuildValue("c", '\\');
282 case('x'):
284 int x, ch, end;
286 x = 0; end = index;
287 while ( (end<pattern_len && pcre_ctypes[ pattern[end] ] & ctype_xdigit) != 0)
289 ch = pattern[end];
290 x = x * 16 + pcre_lcc[ch] -
291 (((pcre_ctypes[ch] & ctype_digit) != 0)? '0' : 'W');
292 x &= 255;
293 end++;
295 if (end==index)
297 PyErr_SetString(ErrorObject, "\\x must be followed by hex digits");
298 return NULL;
300 *indexptr = end;
301 return Py_BuildValue("c", (char)x);
303 break;
305 case('E'): case('G'): case('L'): case('Q'):
306 case('U'): case('l'): case('u'):
308 char message[50];
309 sprintf(message, "\\%c is not allowed", c);
310 PyErr_SetString(ErrorObject, message);
311 return NULL;
314 case('g'):
316 int end, i;
317 int group_num = 0, is_number=0;
319 if (pattern_len<=index)
321 PyErr_SetString(ErrorObject, "unfinished symbolic reference");
322 return NULL;
324 if (pattern[index]!='<')
326 PyErr_SetString(ErrorObject, "missing < in symbolic reference");
327 return NULL;
329 index++;
330 end=index;
331 while (end<pattern_len && pattern[end]!='>')
332 end++;
333 if (end==pattern_len)
335 PyErr_SetString(ErrorObject, "unfinished symbolic reference");
336 return NULL;
339 if (index==end) /* Zero-length name */
341 /* XXX should include the text of the reference */
342 PyErr_SetString(ErrorObject, "zero-length symbolic reference");
343 return NULL;
345 if ((pcre_ctypes[pattern[index]] & ctype_digit)) /* First char. a digit */
347 is_number = 1;
348 group_num = pattern[index] - '0';
351 for(i=index+1; i<end; i++)
353 if (is_number &&
354 !(pcre_ctypes[pattern[i]] & ctype_digit) )
356 /* XXX should include the text of the reference */
357 PyErr_SetString(ErrorObject, "illegal non-digit character in \\g<...> starting with digit");
358 return NULL;
360 else {group_num = group_num * 10 + pattern[i] - '0';}
361 if (!(pcre_ctypes[pattern[i]] & ctype_word) )
363 /* XXX should include the text of the reference */
364 PyErr_SetString(ErrorObject, "illegal symbolic reference");
365 return NULL;
369 *typeptr = MEMORY_REFERENCE;
370 *indexptr = end+1;
371 /* If it's a number, return the integer value of the group */
372 if (is_number) return Py_BuildValue("i", group_num);
373 /* Otherwise, return a string containing the group name */
374 return Py_BuildValue("s#", pattern+index, end-index);
376 break;
378 case('0'):
380 /* \0 always indicates an octal escape, so we consume up to 3
381 characters, as long as they're all octal digits */
382 int octval=0, i;
383 index--;
384 for(i=index;
385 i<=index+2 && i<pattern_len
386 && (pcre_ctypes[ pattern[i] ] & ctype_odigit );
387 i++)
389 octval = octval * 8 + pattern[i] - '0';
391 if (octval>255)
393 PyErr_SetString(ErrorObject, "octal value out of range");
394 return NULL;
396 *indexptr = i;
397 return Py_BuildValue("c", (unsigned char)octval);
399 break;
400 case('1'): case('2'): case('3'): case('4'):
401 case('5'): case('6'): case('7'): case('8'):
402 case('9'):
404 /* Handle \?, where ? is from 1 through 9 */
405 int value=0;
406 index--;
407 /* If it's at least a two-digit reference, like \34, it might
408 either be a 3-digit octal escape (\123) or a 2-digit
409 decimal memory reference (\34) */
411 if ( (index+1) <pattern_len &&
412 (pcre_ctypes[ pattern[index+1] ] & ctype_digit) )
414 if ( (index+2) <pattern_len &&
415 (pcre_ctypes[ pattern[index+2] ] & ctype_odigit) &&
416 (pcre_ctypes[ pattern[index+1] ] & ctype_odigit) &&
417 (pcre_ctypes[ pattern[index ] ] & ctype_odigit)
420 /* 3 octal digits */
421 value= 8*8*(pattern[index ]-'0') +
422 8*(pattern[index+1]-'0') +
423 (pattern[index+2]-'0');
424 if (value>255)
426 PyErr_SetString(ErrorObject, "octal value out of range");
427 return NULL;
429 *indexptr = index+3;
430 return Py_BuildValue("c", (unsigned char)value);
432 else
434 /* 2-digit form, so it's a memory reference */
435 value= 10*(pattern[index ]-'0') +
436 (pattern[index+1]-'0');
437 if (value<1 || EXTRACT_MAX<=value)
439 PyErr_SetString(ErrorObject, "memory reference out of range");
440 return NULL;
442 *typeptr = MEMORY_REFERENCE;
443 *indexptr = index+2;
444 return Py_BuildValue("i", value);
447 else
449 /* Single-digit form, like \2, so it's a memory reference */
450 *typeptr = MEMORY_REFERENCE;
451 *indexptr = index+1;
452 return Py_BuildValue("i", pattern[index]-'0');
455 break;
457 default:
458 /* It's some unknown escape like \s, so return a string containing
459 \s */
460 *typeptr = STRING;
461 *indexptr = index;
462 return Py_BuildValue("s#", pattern+index-2, 2);
466 static PyObject *
467 PyPcre_expand(self, args)
468 PyObject *self;
469 PyObject *args;
471 PyObject *results, *match_obj;
472 PyObject *repl_obj, *newstring;
473 unsigned char *repl;
474 int size, total_len, i, start, pos;
476 if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj))
477 return NULL;
479 repl=(unsigned char *)PyString_AsString(repl_obj);
480 size=PyString_Size(repl_obj);
481 results=PyList_New(0);
482 if (results==NULL) return NULL;
483 for(start=total_len=i=0; i<size; i++)
485 if (repl[i]=='\\')
487 PyObject *value;
488 int escape_type;
490 if (start!=i)
492 int status;
493 PyObject *s = PyString_FromStringAndSize(
494 (char *)repl+start, i-start);
495 if (s == NULL) {
496 Py_DECREF(results);
497 return NULL;
499 status = PyList_Append(results, s);
500 Py_DECREF(s);
501 if (status < 0) {
502 Py_DECREF(results);
503 return NULL;
505 total_len += i-start;
507 i++;
508 value=PyPcre_expand_escape(repl, size, &i, &escape_type);
509 if (value==NULL)
511 /* PyPcre_expand_escape triggered an exception of some sort,
512 so just return */
513 Py_DECREF(results);
514 return NULL;
516 switch (escape_type)
518 case (CHAR):
519 PyList_Append(results, value);
520 total_len += PyString_Size(value);
521 break;
522 case(MEMORY_REFERENCE):
524 PyObject *r, *tuple, *result;
525 r=PyObject_GetAttrString(match_obj, "group");
526 if (r == NULL) {
527 Py_DECREF(results);
528 return NULL;
530 tuple=PyTuple_New(1);
531 Py_INCREF(value);
532 PyTuple_SetItem(tuple, 0, value);
533 result=PyEval_CallObject(r, tuple);
534 Py_DECREF(r); Py_DECREF(tuple);
535 if (result==NULL)
537 /* The group() method triggered an exception of some sort */
538 Py_DECREF(results);
539 Py_DECREF(value);
540 return NULL;
542 if (result==Py_None)
544 char message[50];
545 sprintf(message,
546 "group did not contribute to the match");
547 PyErr_SetString(ErrorObject,
548 message);
549 Py_DECREF(result);
550 Py_DECREF(value);
551 Py_DECREF(results);
552 return NULL;
554 /* typecheck that it's a string! */
555 if (!PyString_Check(result))
557 Py_DECREF(results);
558 Py_DECREF(result);
559 PyErr_SetString(ErrorObject,
560 "group() must return a string value for replacement");
561 return NULL;
563 PyList_Append(results, result);
564 total_len += PyString_Size(result);
565 Py_DECREF(result);
567 break;
568 case(STRING):
570 PyList_Append(results, value);
571 total_len += PyString_Size(value);
572 break;
574 default:
575 Py_DECREF(results);
576 PyErr_SetString(ErrorObject,
577 "bad escape in replacement");
578 return NULL;
580 Py_DECREF(value);
581 start=i;
582 i--; /* Decrement now, because the 'for' loop will increment it */
584 } /* endif repl[i]!='\\' */
586 if (start!=i)
588 int status;
589 PyObject *s = PyString_FromStringAndSize((char *)repl+start,
590 i-start);
591 if (s == NULL) {
592 Py_DECREF(results);
593 return NULL;
595 status = PyList_Append(results, s);
596 Py_DECREF(s);
597 if (status < 0) {
598 Py_DECREF(results);
599 return NULL;
601 total_len += i-start;
604 /* Whew! Now we've constructed a list containing various pieces of
605 strings that will make up our final result. So, iterate over
606 the list concatenating them. A new string measuring total_len
607 bytes is allocated and filled in. */
609 newstring=PyString_FromStringAndSize(NULL, total_len);
610 if (newstring==NULL)
612 Py_DECREF(results);
613 return NULL;
616 repl=(unsigned char *)PyString_AsString(newstring);
617 for (pos=i=0; i<PyList_Size(results); i++)
619 PyObject *item=PyList_GetItem(results, i);
620 memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
621 pos += PyString_Size(item);
623 Py_DECREF(results);
624 return newstring;
628 /* List of functions defined in the module */
630 static PyMethodDef pcre_methods[] = {
631 {"pcre_compile", PyPcre_compile, 1},
632 {"pcre_expand", PyPcre_expand, 1},
633 {NULL, NULL} /* sentinel */
638 * Convenience routine to export an integer value.
639 * For simplicity, errors (which are unlikely anyway) are ignored.
642 static void
643 insint(d, name, value)
644 PyObject * d;
645 char * name;
646 int value;
648 PyObject *v = PyInt_FromLong((long) value);
649 if (v == NULL) {
650 /* Don't bother reporting this error */
651 PyErr_Clear();
653 else {
654 PyDict_SetItemString(d, name, v);
655 Py_DECREF(v);
660 /* Initialization function for the module (*must* be called initpcre) */
662 DL_EXPORT(void)
663 initpcre()
665 PyObject *m, *d;
667 Pcre_Type.ob_type = &PyType_Type;
669 /* Create the module and add the functions */
670 m = Py_InitModule("pcre", pcre_methods);
672 /* Add some symbolic constants to the module */
673 d = PyModule_GetDict(m);
674 ErrorObject = PyString_FromString("pcre.error");
675 PyDict_SetItemString(d, "error", ErrorObject);
677 /* Insert the flags */
678 insint(d, "IGNORECASE", PCRE_CASELESS);
679 insint(d, "ANCHORED", PCRE_ANCHORED);
680 insint(d, "MULTILINE", PCRE_MULTILINE);
681 insint(d, "DOTALL", PCRE_DOTALL);
682 insint(d, "VERBOSE", PCRE_EXTENDED);
683 insint(d, "LOCALE", PCRE_LOCALE);
685 /* Check for errors */
686 if (PyErr_Occurred())
687 Py_FatalError("can't initialize module pcre");