Fix an amazing number of typos & malformed sentences reported by Detlef
[python/dscho.git] / Modules / regexmodule.c
blob1f64f61467da2e57c42070244b5613ac1754066c
1 /*
2 XXX support range parameter on search
3 XXX support mstop parameter on search
4 */
6 /***********************************************************
7 Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
8 The Netherlands.
10 All Rights Reserved
12 Permission to use, copy, modify, and distribute this software and its
13 documentation for any purpose and without fee is hereby granted,
14 provided that the above copyright notice appear in all copies and that
15 both that copyright notice and this permission notice appear in
16 supporting documentation, and that the names of Stichting Mathematisch
17 Centrum or CWI or Corporation for National Research Initiatives or
18 CNRI not be used in advertising or publicity pertaining to
19 distribution of the software without specific, written prior
20 permission.
22 While CWI is the initial source for this software, a modified version
23 is made available by the Corporation for National Research Initiatives
24 (CNRI) at the Internet address ftp://ftp.python.org.
26 STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
27 REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
28 MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
29 CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
30 DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
31 PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
32 TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
33 PERFORMANCE OF THIS SOFTWARE.
35 ******************************************************************/
37 /* Regular expression objects */
38 /* This uses Tatu Ylonen's copyleft-free reimplementation of
39 GNU regular expressions */
41 #include "Python.h"
43 #include <ctype.h>
45 #include "regexpr.h"
47 static PyObject *RegexError; /* Exception */
49 typedef struct {
50 PyObject_HEAD
51 struct re_pattern_buffer re_patbuf; /* The compiled expression */
52 struct re_registers re_regs; /* The registers from the last match */
53 char re_fastmap[256]; /* Storage for fastmap */
54 PyObject *re_translate; /* String object for translate table */
55 PyObject *re_lastok; /* String object last matched/searched */
56 PyObject *re_groupindex; /* Group name to index dictionary */
57 PyObject *re_givenpat; /* Pattern with symbolic groups */
58 PyObject *re_realpat; /* Pattern without symbolic groups */
59 } regexobject;
61 /* Regex object methods */
63 static void
64 reg_dealloc(re)
65 regexobject *re;
67 PyMem_XDEL(re->re_patbuf.buffer);
68 Py_XDECREF(re->re_translate);
69 Py_XDECREF(re->re_lastok);
70 Py_XDECREF(re->re_groupindex);
71 Py_XDECREF(re->re_givenpat);
72 Py_XDECREF(re->re_realpat);
73 PyMem_DEL(re);
76 static PyObject *
77 makeresult(regs)
78 struct re_registers *regs;
80 PyObject *v;
81 int i;
82 static PyObject *filler = NULL;
84 if (filler == NULL) {
85 filler = Py_BuildValue("(ii)", -1, -1);
86 if (filler == NULL)
87 return NULL;
89 v = PyTuple_New(RE_NREGS);
90 if (v == NULL)
91 return NULL;
93 for (i = 0; i < RE_NREGS; i++) {
94 int lo = regs->start[i];
95 int hi = regs->end[i];
96 PyObject *w;
97 if (lo == -1 && hi == -1) {
98 w = filler;
99 Py_INCREF(w);
101 else
102 w = Py_BuildValue("(ii)", lo, hi);
103 if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
104 Py_DECREF(v);
105 return NULL;
108 return v;
111 static PyObject *
112 regobj_match(re, args)
113 regexobject *re;
114 PyObject *args;
116 PyObject *argstring;
117 char *buffer;
118 int size;
119 int offset = 0;
120 int result;
122 if (!PyArg_ParseTuple(args, "O|i", &argstring, &offset))
123 return NULL;
124 if (!PyArg_Parse(argstring, "t#", &buffer, &size))
125 return NULL;
127 if (offset < 0 || offset > size) {
128 PyErr_SetString(RegexError, "match offset out of range");
129 return NULL;
131 Py_XDECREF(re->re_lastok);
132 re->re_lastok = NULL;
133 result = _Py_re_match(&re->re_patbuf, (unsigned char *)buffer, size, offset,
134 &re->re_regs);
135 if (result < -1) {
136 /* Serious failure of some sort; if re_match didn't
137 set an exception, raise a generic error */
138 if (!PyErr_Occurred())
139 PyErr_SetString(RegexError, "match failure");
140 return NULL;
142 if (result >= 0) {
143 Py_INCREF(argstring);
144 re->re_lastok = argstring;
146 return PyInt_FromLong((long)result); /* Length of the match or -1 */
149 static PyObject *
150 regobj_search(re, args)
151 regexobject *re;
152 PyObject *args;
154 PyObject *argstring;
155 char *buffer;
156 int size;
157 int offset = 0;
158 int range;
159 int result;
161 if (!PyArg_ParseTuple(args, "O|i", &argstring, &offset))
162 return NULL;
163 if (!PyArg_Parse(argstring, "t#", &buffer, &size))
164 return NULL;
166 if (offset < 0 || offset > size) {
167 PyErr_SetString(RegexError, "search offset out of range");
168 return NULL;
170 /* NB: In Emacs 18.57, the documentation for re_search[_2] and
171 the implementation don't match: the documentation states that
172 |range| positions are tried, while the code tries |range|+1
173 positions. It seems more productive to believe the code! */
174 range = size - offset;
175 Py_XDECREF(re->re_lastok);
176 re->re_lastok = NULL;
177 result = _Py_re_search(&re->re_patbuf, (unsigned char *)buffer, size, offset, range,
178 &re->re_regs);
179 if (result < -1) {
180 /* Serious failure of some sort; if re_match didn't
181 set an exception, raise a generic error */
182 if (!PyErr_Occurred())
183 PyErr_SetString(RegexError, "match failure");
184 return NULL;
186 if (result >= 0) {
187 Py_INCREF(argstring);
188 re->re_lastok = argstring;
190 return PyInt_FromLong((long)result); /* Position of the match or -1 */
193 /* get the group from the regex where index can be a string (group name) or
194 an integer index [0 .. 99]
196 static PyObject*
197 group_from_index(re, index)
198 regexobject *re;
199 PyObject *index;
201 int i, a, b;
202 char *v;
204 if (PyString_Check(index))
205 if (re->re_groupindex == NULL ||
206 !(index = PyDict_GetItem(re->re_groupindex, index)))
208 PyErr_SetString(RegexError,
209 "group() group name doesn't exist");
210 return NULL;
213 i = PyInt_AsLong(index);
214 if (i == -1 && PyErr_Occurred())
215 return NULL;
217 if (i < 0 || i >= RE_NREGS) {
218 PyErr_SetString(RegexError, "group() index out of range");
219 return NULL;
221 if (re->re_lastok == NULL) {
222 PyErr_SetString(RegexError,
223 "group() only valid after successful match/search");
224 return NULL;
226 a = re->re_regs.start[i];
227 b = re->re_regs.end[i];
228 if (a < 0 || b < 0) {
229 Py_INCREF(Py_None);
230 return Py_None;
233 if (!(v = PyString_AsString(re->re_lastok)))
234 return NULL;
236 return PyString_FromStringAndSize(v+a, b-a);
240 static PyObject *
241 regobj_group(re, args)
242 regexobject *re;
243 PyObject *args;
245 int n = PyTuple_Size(args);
246 int i;
247 PyObject *res = NULL;
249 if (n < 0)
250 return NULL;
251 if (n == 0) {
252 PyErr_SetString(PyExc_TypeError, "not enough arguments");
253 return NULL;
255 if (n == 1) {
256 /* return value is a single string */
257 PyObject *index = PyTuple_GetItem(args, 0);
258 if (!index)
259 return NULL;
261 return group_from_index(re, index);
264 /* return value is a tuple */
265 if (!(res = PyTuple_New(n)))
266 return NULL;
268 for (i = 0; i < n; i++) {
269 PyObject *index = PyTuple_GetItem(args, i);
270 PyObject *group = NULL;
272 if (!index)
273 goto finally;
274 if (!(group = group_from_index(re, index)))
275 goto finally;
276 if (PyTuple_SetItem(res, i, group) < 0)
277 goto finally;
279 return res;
281 finally:
282 Py_DECREF(res);
283 return NULL;
287 static struct PyMethodDef reg_methods[] = {
288 {"match", (PyCFunction)regobj_match, 1},
289 {"search", (PyCFunction)regobj_search, 1},
290 {"group", (PyCFunction)regobj_group, 1},
291 {NULL, NULL} /* sentinel */
296 static char* members[] = {
297 "last", "regs", "translate",
298 "groupindex", "realpat", "givenpat",
299 NULL
303 static PyObject *
304 regobj_getattr(re, name)
305 regexobject *re;
306 char *name;
308 if (strcmp(name, "regs") == 0) {
309 if (re->re_lastok == NULL) {
310 Py_INCREF(Py_None);
311 return Py_None;
313 return makeresult(&re->re_regs);
315 if (strcmp(name, "last") == 0) {
316 if (re->re_lastok == NULL) {
317 Py_INCREF(Py_None);
318 return Py_None;
320 Py_INCREF(re->re_lastok);
321 return re->re_lastok;
323 if (strcmp(name, "translate") == 0) {
324 if (re->re_translate == NULL) {
325 Py_INCREF(Py_None);
326 return Py_None;
328 Py_INCREF(re->re_translate);
329 return re->re_translate;
331 if (strcmp(name, "groupindex") == 0) {
332 if (re->re_groupindex == NULL) {
333 Py_INCREF(Py_None);
334 return Py_None;
336 Py_INCREF(re->re_groupindex);
337 return re->re_groupindex;
339 if (strcmp(name, "realpat") == 0) {
340 if (re->re_realpat == NULL) {
341 Py_INCREF(Py_None);
342 return Py_None;
344 Py_INCREF(re->re_realpat);
345 return re->re_realpat;
347 if (strcmp(name, "givenpat") == 0) {
348 if (re->re_givenpat == NULL) {
349 Py_INCREF(Py_None);
350 return Py_None;
352 Py_INCREF(re->re_givenpat);
353 return re->re_givenpat;
355 if (strcmp(name, "__members__") == 0) {
356 int i = 0;
357 PyObject *list = NULL;
359 /* okay, so it's unlikely this list will change that often.
360 still, it's easier to change it in just one place.
362 while (members[i])
363 i++;
364 if (!(list = PyList_New(i)))
365 return NULL;
367 i = 0;
368 while (members[i]) {
369 PyObject* v = PyString_FromString(members[i]);
370 if (!v || PyList_SetItem(list, i, v) < 0) {
371 Py_DECREF(list);
372 return NULL;
374 i++;
376 return list;
378 return Py_FindMethod(reg_methods, (PyObject *)re, name);
381 static PyTypeObject Regextype = {
382 PyObject_HEAD_INIT(&PyType_Type)
383 0, /*ob_size*/
384 "regex", /*tp_name*/
385 sizeof(regexobject), /*tp_size*/
386 0, /*tp_itemsize*/
387 /* methods */
388 (destructor)reg_dealloc, /*tp_dealloc*/
389 0, /*tp_print*/
390 (getattrfunc)regobj_getattr, /*tp_getattr*/
391 0, /*tp_setattr*/
392 0, /*tp_compare*/
393 0, /*tp_repr*/
396 /* reference counting invariants:
397 pattern: borrowed
398 translate: borrowed
399 givenpat: borrowed
400 groupindex: transferred
402 static PyObject *
403 newregexobject(pattern, translate, givenpat, groupindex)
404 PyObject *pattern;
405 PyObject *translate;
406 PyObject *givenpat;
407 PyObject *groupindex;
409 regexobject *re;
410 char *pat;
411 int size;
413 if (!PyArg_Parse(pattern, "t#", &pat, &size))
414 return NULL;
416 if (translate != NULL && PyString_Size(translate) != 256) {
417 PyErr_SetString(RegexError,
418 "translation table must be 256 bytes");
419 return NULL;
421 re = PyObject_NEW(regexobject, &Regextype);
422 if (re != NULL) {
423 char *error;
424 re->re_patbuf.buffer = NULL;
425 re->re_patbuf.allocated = 0;
426 re->re_patbuf.fastmap = (unsigned char *)re->re_fastmap;
427 if (translate) {
428 re->re_patbuf.translate = (unsigned char *)PyString_AsString(translate);
429 if (!re->re_patbuf.translate)
430 goto finally;
431 Py_INCREF(translate);
433 else
434 re->re_patbuf.translate = NULL;
435 re->re_translate = translate;
436 re->re_lastok = NULL;
437 re->re_groupindex = groupindex;
438 Py_INCREF(pattern);
439 re->re_realpat = pattern;
440 Py_INCREF(givenpat);
441 re->re_givenpat = givenpat;
442 error = _Py_re_compile_pattern((unsigned char *)pat, size, &re->re_patbuf);
443 if (error != NULL) {
444 PyErr_SetString(RegexError, error);
445 goto finally;
448 return (PyObject *)re;
449 finally:
450 Py_DECREF(re);
451 return NULL;
454 static PyObject *
455 regex_compile(self, args)
456 PyObject *self;
457 PyObject *args;
459 PyObject *pat = NULL;
460 PyObject *tran = NULL;
462 if (!PyArg_ParseTuple(args, "S|S", &pat, &tran))
463 return NULL;
464 return newregexobject(pat, tran, pat, NULL);
467 static PyObject *
468 symcomp(pattern, gdict)
469 PyObject *pattern;
470 PyObject *gdict;
472 char *opat, *oend, *o, *n, *g, *v;
473 int group_count = 0;
474 int sz;
475 int escaped = 0;
476 char name_buf[128];
477 PyObject *npattern;
478 int require_escape = re_syntax & RE_NO_BK_PARENS ? 0 : 1;
480 if (!(opat = PyString_AsString(pattern)))
481 return NULL;
483 if ((sz = PyString_Size(pattern)) < 0)
484 return NULL;
486 oend = opat + sz;
487 o = opat;
489 if (oend == opat) {
490 Py_INCREF(pattern);
491 return pattern;
494 if (!(npattern = PyString_FromStringAndSize((char*)NULL, sz)) ||
495 !(n = PyString_AsString(npattern)))
496 return NULL;
498 while (o < oend) {
499 if (*o == '(' && escaped == require_escape) {
500 char *backtrack;
501 escaped = 0;
502 ++group_count;
503 *n++ = *o;
504 if (++o >= oend || *o != '<')
505 continue;
506 /* *o == '<' */
507 if (o+1 < oend && *(o+1) == '>')
508 continue;
509 backtrack = o;
510 g = name_buf;
511 for (++o; o < oend;) {
512 if (*o == '>') {
513 PyObject *group_name = NULL;
514 PyObject *group_index = NULL;
515 *g++ = '\0';
516 group_name = PyString_FromString(name_buf);
517 group_index = PyInt_FromLong(group_count);
518 if (group_name == NULL ||
519 group_index == NULL ||
520 PyDict_SetItem(gdict, group_name,
521 group_index) != 0)
523 Py_XDECREF(group_name);
524 Py_XDECREF(group_index);
525 Py_XDECREF(npattern);
526 return NULL;
528 Py_DECREF(group_name);
529 Py_DECREF(group_index);
530 ++o; /* eat the '>' */
531 break;
533 if (!isalnum(Py_CHARMASK(*o)) && *o != '_') {
534 o = backtrack;
535 break;
537 *g++ = *o++;
540 else if (*o == '[' && !escaped) {
541 *n++ = *o;
542 ++o; /* eat the char following '[' */
543 *n++ = *o;
544 while (o < oend && *o != ']') {
545 ++o;
546 *n++ = *o;
548 if (o < oend)
549 ++o;
551 else if (*o == '\\') {
552 escaped = 1;
553 *n++ = *o;
554 ++o;
556 else {
557 escaped = 0;
558 *n++ = *o;
559 ++o;
563 if (!(v = PyString_AsString(npattern))) {
564 Py_DECREF(npattern);
565 return NULL;
567 /* _PyString_Resize() decrements npattern on failure */
568 if (_PyString_Resize(&npattern, n - v) == 0)
569 return npattern;
570 else {
571 return NULL;
576 static PyObject *
577 regex_symcomp(self, args)
578 PyObject *self;
579 PyObject *args;
581 PyObject *pattern;
582 PyObject *tran = NULL;
583 PyObject *gdict = NULL;
584 PyObject *npattern;
585 PyObject *retval = NULL;
587 if (!PyArg_ParseTuple(args, "S|S", &pattern, &tran))
588 return NULL;
590 gdict = PyDict_New();
591 if (gdict == NULL || (npattern = symcomp(pattern, gdict)) == NULL) {
592 Py_DECREF(gdict);
593 Py_DECREF(pattern);
594 return NULL;
596 retval = newregexobject(npattern, tran, pattern, gdict);
597 Py_DECREF(npattern);
598 return retval;
602 static PyObject *cache_pat;
603 static PyObject *cache_prog;
605 static int
606 update_cache(pat)
607 PyObject *pat;
609 PyObject *tuple = Py_BuildValue("(O)", pat);
610 int status = 0;
612 if (!tuple)
613 return -1;
615 if (pat != cache_pat) {
616 Py_XDECREF(cache_pat);
617 cache_pat = NULL;
618 Py_XDECREF(cache_prog);
619 cache_prog = regex_compile((PyObject *)NULL, tuple);
620 if (cache_prog == NULL) {
621 status = -1;
622 goto finally;
624 cache_pat = pat;
625 Py_INCREF(cache_pat);
627 finally:
628 Py_DECREF(tuple);
629 return status;
632 static PyObject *
633 regex_match(self, args)
634 PyObject *self;
635 PyObject *args;
637 PyObject *pat, *string;
638 PyObject *tuple, *v;
640 if (!PyArg_Parse(args, "(SS)", &pat, &string))
641 return NULL;
642 if (update_cache(pat) < 0)
643 return NULL;
645 if (!(tuple = Py_BuildValue("(S)", string)))
646 return NULL;
647 v = regobj_match((regexobject *)cache_prog, tuple);
648 Py_DECREF(tuple);
649 return v;
652 static PyObject *
653 regex_search(self, args)
654 PyObject *self;
655 PyObject *args;
657 PyObject *pat, *string;
658 PyObject *tuple, *v;
660 if (!PyArg_Parse(args, "(SS)", &pat, &string))
661 return NULL;
662 if (update_cache(pat) < 0)
663 return NULL;
665 if (!(tuple = Py_BuildValue("(S)", string)))
666 return NULL;
667 v = regobj_search((regexobject *)cache_prog, tuple);
668 Py_DECREF(tuple);
669 return v;
672 static PyObject *
673 regex_set_syntax(self, args)
674 PyObject *self;
675 PyObject *args;
677 int syntax;
678 if (!PyArg_Parse(args, "i", &syntax))
679 return NULL;
680 syntax = re_set_syntax(syntax);
681 /* wipe the global pattern cache */
682 Py_XDECREF(cache_pat);
683 cache_pat = NULL;
684 Py_XDECREF(cache_prog);
685 cache_prog = NULL;
686 return PyInt_FromLong((long)syntax);
689 static PyObject *
690 regex_get_syntax(self, args)
691 PyObject *self;
692 PyObject *args;
694 if (!PyArg_Parse(args, ""))
695 return NULL;
696 return PyInt_FromLong((long)re_syntax);
700 static struct PyMethodDef regex_global_methods[] = {
701 {"compile", regex_compile, 1},
702 {"symcomp", regex_symcomp, 1},
703 {"match", regex_match, 0},
704 {"search", regex_search, 0},
705 {"set_syntax", regex_set_syntax, 0},
706 {"get_syntax", regex_get_syntax, 0},
707 {NULL, NULL} /* sentinel */
710 DL_EXPORT(void)
711 initregex()
713 PyObject *m, *d, *v;
714 int i;
715 char *s;
717 m = Py_InitModule("regex", regex_global_methods);
718 d = PyModule_GetDict(m);
720 /* Initialize regex.error exception */
721 v = RegexError = PyErr_NewException("regex.error", NULL, NULL);
722 if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
723 goto finally;
725 /* Initialize regex.casefold constant */
726 if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
727 goto finally;
729 if (!(s = PyString_AsString(v)))
730 goto finally;
732 for (i = 0; i < 256; i++) {
733 if (isupper(i))
734 s[i] = tolower(i);
735 else
736 s[i] = i;
738 if (PyDict_SetItemString(d, "casefold", v) < 0)
739 goto finally;
740 Py_DECREF(v);
742 if (!PyErr_Occurred())
743 return;
744 finally:
745 /* Nothing */ ;