Last set of CW Pro 5 projects (probably)
[python/dscho.git] / Modules / regexmodule.c
blobff3b8950dd47916d7beced1b27088b2ee9a3d432
1 /*
2 XXX support range parameter on search
3 XXX support mstop parameter on search
4 */
6 /***********************************************************
7 Copyright (c) 2000, BeOpen.com.
8 Copyright (c) 1995-2000, Corporation for National Research Initiatives.
9 Copyright (c) 1990-1995, Stichting Mathematisch Centrum.
10 All rights reserved.
12 See the file "Misc/COPYRIGHT" for information on usage and
13 redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.
14 ******************************************************************/
16 /* Regular expression objects */
17 /* This uses Tatu Ylonen's copyleft-free reimplementation of
18 GNU regular expressions */
20 #include "Python.h"
22 #include <ctype.h>
24 #include "regexpr.h"
26 static PyObject *RegexError; /* Exception */
28 typedef struct {
29 PyObject_HEAD
30 struct re_pattern_buffer re_patbuf; /* The compiled expression */
31 struct re_registers re_regs; /* The registers from the last match */
32 char re_fastmap[256]; /* Storage for fastmap */
33 PyObject *re_translate; /* String object for translate table */
34 PyObject *re_lastok; /* String object last matched/searched */
35 PyObject *re_groupindex; /* Group name to index dictionary */
36 PyObject *re_givenpat; /* Pattern with symbolic groups */
37 PyObject *re_realpat; /* Pattern without symbolic groups */
38 } regexobject;
40 /* Regex object methods */
42 static void
43 reg_dealloc(regexobject *re)
45 if (re->re_patbuf.buffer)
46 free(re->re_patbuf.buffer);
47 Py_XDECREF(re->re_translate);
48 Py_XDECREF(re->re_lastok);
49 Py_XDECREF(re->re_groupindex);
50 Py_XDECREF(re->re_givenpat);
51 Py_XDECREF(re->re_realpat);
52 PyObject_Del(re);
55 static PyObject *
56 makeresult(struct re_registers *regs)
58 PyObject *v;
59 int i;
60 static PyObject *filler = NULL;
62 if (filler == NULL) {
63 filler = Py_BuildValue("(ii)", -1, -1);
64 if (filler == NULL)
65 return NULL;
67 v = PyTuple_New(RE_NREGS);
68 if (v == NULL)
69 return NULL;
71 for (i = 0; i < RE_NREGS; i++) {
72 int lo = regs->start[i];
73 int hi = regs->end[i];
74 PyObject *w;
75 if (lo == -1 && hi == -1) {
76 w = filler;
77 Py_INCREF(w);
79 else
80 w = Py_BuildValue("(ii)", lo, hi);
81 if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
82 Py_DECREF(v);
83 return NULL;
86 return v;
89 static PyObject *
90 regobj_match(regexobject *re, PyObject *args)
92 PyObject *argstring;
93 char *buffer;
94 int size;
95 int offset = 0;
96 int result;
98 if (!PyArg_ParseTuple(args, "O|i:match", &argstring, &offset))
99 return NULL;
100 if (!PyArg_Parse(argstring, "t#", &buffer, &size))
101 return NULL;
103 if (offset < 0 || offset > size) {
104 PyErr_SetString(RegexError, "match offset out of range");
105 return NULL;
107 Py_XDECREF(re->re_lastok);
108 re->re_lastok = NULL;
109 result = _Py_re_match(&re->re_patbuf, (unsigned char *)buffer, size, offset,
110 &re->re_regs);
111 if (result < -1) {
112 /* Serious failure of some sort; if re_match didn't
113 set an exception, raise a generic error */
114 if (!PyErr_Occurred())
115 PyErr_SetString(RegexError, "match failure");
116 return NULL;
118 if (result >= 0) {
119 Py_INCREF(argstring);
120 re->re_lastok = argstring;
122 return PyInt_FromLong((long)result); /* Length of the match or -1 */
125 static PyObject *
126 regobj_search(regexobject *re, PyObject *args)
128 PyObject *argstring;
129 char *buffer;
130 int size;
131 int offset = 0;
132 int range;
133 int result;
135 if (!PyArg_ParseTuple(args, "O|i:search", &argstring, &offset))
136 return NULL;
137 if (!PyArg_Parse(argstring, "t#:search", &buffer, &size))
138 return NULL;
140 if (offset < 0 || offset > size) {
141 PyErr_SetString(RegexError, "search offset out of range");
142 return NULL;
144 /* NB: In Emacs 18.57, the documentation for re_search[_2] and
145 the implementation don't match: the documentation states that
146 |range| positions are tried, while the code tries |range|+1
147 positions. It seems more productive to believe the code! */
148 range = size - offset;
149 Py_XDECREF(re->re_lastok);
150 re->re_lastok = NULL;
151 result = _Py_re_search(&re->re_patbuf, (unsigned char *)buffer, size, offset, range,
152 &re->re_regs);
153 if (result < -1) {
154 /* Serious failure of some sort; if re_match didn't
155 set an exception, raise a generic error */
156 if (!PyErr_Occurred())
157 PyErr_SetString(RegexError, "match failure");
158 return NULL;
160 if (result >= 0) {
161 Py_INCREF(argstring);
162 re->re_lastok = argstring;
164 return PyInt_FromLong((long)result); /* Position of the match or -1 */
167 /* get the group from the regex where index can be a string (group name) or
168 an integer index [0 .. 99]
170 static PyObject*
171 group_from_index(regexobject *re, PyObject *index)
173 int i, a, b;
174 char *v;
176 if (PyString_Check(index))
177 if (re->re_groupindex == NULL ||
178 !(index = PyDict_GetItem(re->re_groupindex, index)))
180 PyErr_SetString(RegexError,
181 "group() group name doesn't exist");
182 return NULL;
185 i = PyInt_AsLong(index);
186 if (i == -1 && PyErr_Occurred())
187 return NULL;
189 if (i < 0 || i >= RE_NREGS) {
190 PyErr_SetString(RegexError, "group() index out of range");
191 return NULL;
193 if (re->re_lastok == NULL) {
194 PyErr_SetString(RegexError,
195 "group() only valid after successful match/search");
196 return NULL;
198 a = re->re_regs.start[i];
199 b = re->re_regs.end[i];
200 if (a < 0 || b < 0) {
201 Py_INCREF(Py_None);
202 return Py_None;
205 if (!(v = PyString_AsString(re->re_lastok)))
206 return NULL;
208 return PyString_FromStringAndSize(v+a, b-a);
212 static PyObject *
213 regobj_group(regexobject *re, PyObject *args)
215 int n = PyTuple_Size(args);
216 int i;
217 PyObject *res = NULL;
219 if (n < 0)
220 return NULL;
221 if (n == 0) {
222 PyErr_SetString(PyExc_TypeError, "not enough arguments");
223 return NULL;
225 if (n == 1) {
226 /* return value is a single string */
227 PyObject *index = PyTuple_GetItem(args, 0);
228 if (!index)
229 return NULL;
231 return group_from_index(re, index);
234 /* return value is a tuple */
235 if (!(res = PyTuple_New(n)))
236 return NULL;
238 for (i = 0; i < n; i++) {
239 PyObject *index = PyTuple_GetItem(args, i);
240 PyObject *group = NULL;
242 if (!index)
243 goto finally;
244 if (!(group = group_from_index(re, index)))
245 goto finally;
246 if (PyTuple_SetItem(res, i, group) < 0)
247 goto finally;
249 return res;
251 finally:
252 Py_DECREF(res);
253 return NULL;
257 static struct PyMethodDef reg_methods[] = {
258 {"match", (PyCFunction)regobj_match, 1},
259 {"search", (PyCFunction)regobj_search, 1},
260 {"group", (PyCFunction)regobj_group, 1},
261 {NULL, NULL} /* sentinel */
266 static char* members[] = {
267 "last", "regs", "translate",
268 "groupindex", "realpat", "givenpat",
269 NULL
273 static PyObject *
274 regobj_getattr(regexobject *re, char *name)
276 if (strcmp(name, "regs") == 0) {
277 if (re->re_lastok == NULL) {
278 Py_INCREF(Py_None);
279 return Py_None;
281 return makeresult(&re->re_regs);
283 if (strcmp(name, "last") == 0) {
284 if (re->re_lastok == NULL) {
285 Py_INCREF(Py_None);
286 return Py_None;
288 Py_INCREF(re->re_lastok);
289 return re->re_lastok;
291 if (strcmp(name, "translate") == 0) {
292 if (re->re_translate == NULL) {
293 Py_INCREF(Py_None);
294 return Py_None;
296 Py_INCREF(re->re_translate);
297 return re->re_translate;
299 if (strcmp(name, "groupindex") == 0) {
300 if (re->re_groupindex == NULL) {
301 Py_INCREF(Py_None);
302 return Py_None;
304 Py_INCREF(re->re_groupindex);
305 return re->re_groupindex;
307 if (strcmp(name, "realpat") == 0) {
308 if (re->re_realpat == NULL) {
309 Py_INCREF(Py_None);
310 return Py_None;
312 Py_INCREF(re->re_realpat);
313 return re->re_realpat;
315 if (strcmp(name, "givenpat") == 0) {
316 if (re->re_givenpat == NULL) {
317 Py_INCREF(Py_None);
318 return Py_None;
320 Py_INCREF(re->re_givenpat);
321 return re->re_givenpat;
323 if (strcmp(name, "__members__") == 0) {
324 int i = 0;
325 PyObject *list = NULL;
327 /* okay, so it's unlikely this list will change that often.
328 still, it's easier to change it in just one place.
330 while (members[i])
331 i++;
332 if (!(list = PyList_New(i)))
333 return NULL;
335 i = 0;
336 while (members[i]) {
337 PyObject* v = PyString_FromString(members[i]);
338 if (!v || PyList_SetItem(list, i, v) < 0) {
339 Py_DECREF(list);
340 return NULL;
342 i++;
344 return list;
346 return Py_FindMethod(reg_methods, (PyObject *)re, name);
349 static PyTypeObject Regextype = {
350 PyObject_HEAD_INIT(&PyType_Type)
351 0, /*ob_size*/
352 "regex", /*tp_name*/
353 sizeof(regexobject), /*tp_size*/
354 0, /*tp_itemsize*/
355 /* methods */
356 (destructor)reg_dealloc, /*tp_dealloc*/
357 0, /*tp_print*/
358 (getattrfunc)regobj_getattr, /*tp_getattr*/
359 0, /*tp_setattr*/
360 0, /*tp_compare*/
361 0, /*tp_repr*/
364 /* reference counting invariants:
365 pattern: borrowed
366 translate: borrowed
367 givenpat: borrowed
368 groupindex: transferred
370 static PyObject *
371 newregexobject(PyObject *pattern, PyObject *translate, PyObject *givenpat, PyObject *groupindex)
373 regexobject *re;
374 char *pat;
375 int size;
377 if (!PyArg_Parse(pattern, "t#", &pat, &size))
378 return NULL;
380 if (translate != NULL && PyString_Size(translate) != 256) {
381 PyErr_SetString(RegexError,
382 "translation table must be 256 bytes");
383 return NULL;
385 re = PyObject_New(regexobject, &Regextype);
386 if (re != NULL) {
387 char *error;
388 re->re_patbuf.buffer = NULL;
389 re->re_patbuf.allocated = 0;
390 re->re_patbuf.fastmap = (unsigned char *)re->re_fastmap;
391 if (translate) {
392 re->re_patbuf.translate = (unsigned char *)PyString_AsString(translate);
393 if (!re->re_patbuf.translate)
394 goto finally;
395 Py_INCREF(translate);
397 else
398 re->re_patbuf.translate = NULL;
399 re->re_translate = translate;
400 re->re_lastok = NULL;
401 re->re_groupindex = groupindex;
402 Py_INCREF(pattern);
403 re->re_realpat = pattern;
404 Py_INCREF(givenpat);
405 re->re_givenpat = givenpat;
406 error = _Py_re_compile_pattern((unsigned char *)pat, size, &re->re_patbuf);
407 if (error != NULL) {
408 PyErr_SetString(RegexError, error);
409 goto finally;
412 return (PyObject *)re;
413 finally:
414 Py_DECREF(re);
415 return NULL;
418 static PyObject *
419 regex_compile(PyObject *self, PyObject *args)
421 PyObject *pat = NULL;
422 PyObject *tran = NULL;
424 if (!PyArg_ParseTuple(args, "S|S:compile", &pat, &tran))
425 return NULL;
426 return newregexobject(pat, tran, pat, NULL);
429 static PyObject *
430 symcomp(PyObject *pattern, PyObject *gdict)
432 char *opat, *oend, *o, *n, *g, *v;
433 int group_count = 0;
434 int sz;
435 int escaped = 0;
436 char name_buf[128];
437 PyObject *npattern;
438 int require_escape = re_syntax & RE_NO_BK_PARENS ? 0 : 1;
440 if (!(opat = PyString_AsString(pattern)))
441 return NULL;
443 if ((sz = PyString_Size(pattern)) < 0)
444 return NULL;
446 oend = opat + sz;
447 o = opat;
449 if (oend == opat) {
450 Py_INCREF(pattern);
451 return pattern;
454 if (!(npattern = PyString_FromStringAndSize((char*)NULL, sz)) ||
455 !(n = PyString_AsString(npattern)))
456 return NULL;
458 while (o < oend) {
459 if (*o == '(' && escaped == require_escape) {
460 char *backtrack;
461 escaped = 0;
462 ++group_count;
463 *n++ = *o;
464 if (++o >= oend || *o != '<')
465 continue;
466 /* *o == '<' */
467 if (o+1 < oend && *(o+1) == '>')
468 continue;
469 backtrack = o;
470 g = name_buf;
471 for (++o; o < oend;) {
472 if (*o == '>') {
473 PyObject *group_name = NULL;
474 PyObject *group_index = NULL;
475 *g++ = '\0';
476 group_name = PyString_FromString(name_buf);
477 group_index = PyInt_FromLong(group_count);
478 if (group_name == NULL ||
479 group_index == NULL ||
480 PyDict_SetItem(gdict, group_name,
481 group_index) != 0)
483 Py_XDECREF(group_name);
484 Py_XDECREF(group_index);
485 Py_XDECREF(npattern);
486 return NULL;
488 Py_DECREF(group_name);
489 Py_DECREF(group_index);
490 ++o; /* eat the '>' */
491 break;
493 if (!isalnum(Py_CHARMASK(*o)) && *o != '_') {
494 o = backtrack;
495 break;
497 *g++ = *o++;
500 else if (*o == '[' && !escaped) {
501 *n++ = *o;
502 ++o; /* eat the char following '[' */
503 *n++ = *o;
504 while (o < oend && *o != ']') {
505 ++o;
506 *n++ = *o;
508 if (o < oend)
509 ++o;
511 else if (*o == '\\') {
512 escaped = 1;
513 *n++ = *o;
514 ++o;
516 else {
517 escaped = 0;
518 *n++ = *o;
519 ++o;
523 if (!(v = PyString_AsString(npattern))) {
524 Py_DECREF(npattern);
525 return NULL;
527 /* _PyString_Resize() decrements npattern on failure */
528 if (_PyString_Resize(&npattern, n - v) == 0)
529 return npattern;
530 else {
531 return NULL;
536 static PyObject *
537 regex_symcomp(PyObject *self, PyObject *args)
539 PyObject *pattern;
540 PyObject *tran = NULL;
541 PyObject *gdict = NULL;
542 PyObject *npattern;
543 PyObject *retval = NULL;
545 if (!PyArg_ParseTuple(args, "S|S:symcomp", &pattern, &tran))
546 return NULL;
548 gdict = PyDict_New();
549 if (gdict == NULL || (npattern = symcomp(pattern, gdict)) == NULL) {
550 Py_DECREF(gdict);
551 Py_DECREF(pattern);
552 return NULL;
554 retval = newregexobject(npattern, tran, pattern, gdict);
555 Py_DECREF(npattern);
556 return retval;
560 static PyObject *cache_pat;
561 static PyObject *cache_prog;
563 static int
564 update_cache(PyObject *pat)
566 PyObject *tuple = Py_BuildValue("(O)", pat);
567 int status = 0;
569 if (!tuple)
570 return -1;
572 if (pat != cache_pat) {
573 Py_XDECREF(cache_pat);
574 cache_pat = NULL;
575 Py_XDECREF(cache_prog);
576 cache_prog = regex_compile((PyObject *)NULL, tuple);
577 if (cache_prog == NULL) {
578 status = -1;
579 goto finally;
581 cache_pat = pat;
582 Py_INCREF(cache_pat);
584 finally:
585 Py_DECREF(tuple);
586 return status;
589 static PyObject *
590 regex_match(PyObject *self, PyObject *args)
592 PyObject *pat, *string;
593 PyObject *tuple, *v;
595 if (!PyArg_Parse(args, "(SS)", &pat, &string))
596 return NULL;
597 if (update_cache(pat) < 0)
598 return NULL;
600 if (!(tuple = Py_BuildValue("(S)", string)))
601 return NULL;
602 v = regobj_match((regexobject *)cache_prog, tuple);
603 Py_DECREF(tuple);
604 return v;
607 static PyObject *
608 regex_search(PyObject *self, PyObject *args)
610 PyObject *pat, *string;
611 PyObject *tuple, *v;
613 if (!PyArg_Parse(args, "(SS)", &pat, &string))
614 return NULL;
615 if (update_cache(pat) < 0)
616 return NULL;
618 if (!(tuple = Py_BuildValue("(S)", string)))
619 return NULL;
620 v = regobj_search((regexobject *)cache_prog, tuple);
621 Py_DECREF(tuple);
622 return v;
625 static PyObject *
626 regex_set_syntax(PyObject *self, PyObject *args)
628 int syntax;
629 if (!PyArg_Parse(args, "i", &syntax))
630 return NULL;
631 syntax = re_set_syntax(syntax);
632 /* wipe the global pattern cache */
633 Py_XDECREF(cache_pat);
634 cache_pat = NULL;
635 Py_XDECREF(cache_prog);
636 cache_prog = NULL;
637 return PyInt_FromLong((long)syntax);
640 static PyObject *
641 regex_get_syntax(PyObject *self, PyObject *args)
643 if (!PyArg_Parse(args, ""))
644 return NULL;
645 return PyInt_FromLong((long)re_syntax);
649 static struct PyMethodDef regex_global_methods[] = {
650 {"compile", regex_compile, 1},
651 {"symcomp", regex_symcomp, 1},
652 {"match", regex_match, 0},
653 {"search", regex_search, 0},
654 {"set_syntax", regex_set_syntax, 0},
655 {"get_syntax", regex_get_syntax, 0},
656 {NULL, NULL} /* sentinel */
659 DL_EXPORT(void)
660 initregex(void)
662 PyObject *m, *d, *v;
663 int i;
664 char *s;
666 m = Py_InitModule("regex", regex_global_methods);
667 d = PyModule_GetDict(m);
669 /* Initialize regex.error exception */
670 v = RegexError = PyErr_NewException("regex.error", NULL, NULL);
671 if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
672 goto finally;
674 /* Initialize regex.casefold constant */
675 if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
676 goto finally;
678 if (!(s = PyString_AsString(v)))
679 goto finally;
681 for (i = 0; i < 256; i++) {
682 if (isupper(i))
683 s[i] = tolower(i);
684 else
685 s[i] = i;
687 if (PyDict_SetItemString(d, "casefold", v) < 0)
688 goto finally;
689 Py_DECREF(v);
691 if (!PyErr_Occurred())
692 return;
693 finally:
694 /* Nothing */ ;