improve treatment of multi-line replies, ignore empty lines
[python/dscho.git] / Modules / regexmodule.c
blob41ae9caf88aaab1a9592ee17d58af8bc785ad75a
1 /*
2 XXX support range parameter on search
3 XXX support mstop parameter on search
4 */
6 /***********************************************************
7 Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
8 The Netherlands.
10 All Rights Reserved
12 Permission to use, copy, modify, and distribute this software and its
13 documentation for any purpose and without fee is hereby granted,
14 provided that the above copyright notice appear in all copies and that
15 both that copyright notice and this permission notice appear in
16 supporting documentation, and that the names of Stichting Mathematisch
17 Centrum or CWI not be used in advertising or publicity pertaining to
18 distribution of the software without specific, written prior permission.
20 STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
21 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
22 FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
23 FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
24 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
25 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
26 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
28 ******************************************************************/
30 /* Regular expression objects */
31 /* This uses Tatu Ylonen's copyleft-free reimplementation of
32 GNU regular expressions */
34 #include "allobjects.h"
35 #include "modsupport.h"
37 #include "regexpr.h"
38 #include "ctype.h"
40 static object *RegexError; /* Exception */
42 typedef struct {
43 OB_HEAD
44 struct re_pattern_buffer re_patbuf; /* The compiled expression */
45 struct re_registers re_regs; /* The registers from the last match */
46 char re_fastmap[256]; /* Storage for fastmap */
47 object *re_translate; /* String object for translate table */
48 object *re_lastok; /* String object last matched/searched */
49 object *re_groupindex; /* Group name to index dictionary */
50 object *re_givenpat; /* Pattern with symbolic groups */
51 object *re_realpat; /* Pattern without symbolic groups */
52 } regexobject;
54 /* Regex object methods */
56 static void
57 reg_dealloc(re)
58 regexobject *re;
60 XDECREF(re->re_translate);
61 XDECREF(re->re_lastok);
62 XDECREF(re->re_groupindex);
63 XDECREF(re->re_givenpat);
64 XDECREF(re->re_realpat);
65 DEL(re);
68 static object *
69 makeresult(regs)
70 struct re_registers *regs;
72 object *v = newtupleobject(RE_NREGS);
73 if (v != NULL) {
74 int i;
75 for (i = 0; i < RE_NREGS; i++) {
76 object *w;
77 w = mkvalue("(ii)", regs->start[i], regs->end[i]);
78 if (w == NULL) {
79 XDECREF(v);
80 v = NULL;
81 break;
83 settupleitem(v, i, w);
86 return v;
89 static object *
90 reg_match(re, args)
91 regexobject *re;
92 object *args;
94 object *argstring;
95 char *buffer;
96 int size;
97 int offset;
98 int result;
99 if (getargs(args, "S", &argstring)) {
100 offset = 0;
102 else {
103 err_clear();
104 if (!getargs(args, "(Si)", &argstring, &offset))
105 return NULL;
107 buffer = getstringvalue(argstring);
108 size = getstringsize(argstring);
109 if (offset < 0 || offset > size) {
110 err_setstr(RegexError, "match offset out of range");
111 return NULL;
113 XDECREF(re->re_lastok);
114 re->re_lastok = NULL;
115 result = re_match(&re->re_patbuf, buffer, size, offset, &re->re_regs);
116 if (result < -1) {
117 /* Failure like stack overflow */
118 err_setstr(RegexError, "match failure");
119 return NULL;
121 if (result >= 0) {
122 INCREF(argstring);
123 re->re_lastok = argstring;
125 return newintobject((long)result); /* Length of the match or -1 */
128 static object *
129 reg_search(re, args)
130 regexobject *re;
131 object *args;
133 object *argstring;
134 char *buffer;
135 int size;
136 int offset;
137 int range;
138 int result;
140 if (getargs(args, "S", &argstring)) {
141 offset = 0;
143 else {
144 err_clear();
145 if (!getargs(args, "(Si)", &argstring, &offset))
146 return NULL;
148 buffer = getstringvalue(argstring);
149 size = getstringsize(argstring);
150 if (offset < 0 || offset > size) {
151 err_setstr(RegexError, "search offset out of range");
152 return NULL;
154 /* NB: In Emacs 18.57, the documentation for re_search[_2] and
155 the implementation don't match: the documentation states that
156 |range| positions are tried, while the code tries |range|+1
157 positions. It seems more productive to believe the code! */
158 range = size - offset;
159 XDECREF(re->re_lastok);
160 re->re_lastok = NULL;
161 result = re_search(&re->re_patbuf, buffer, size, offset, range,
162 &re->re_regs);
163 if (result < -1) {
164 /* Failure like stack overflow */
165 err_setstr(RegexError, "match failure");
166 return NULL;
168 if (result >= 0) {
169 INCREF(argstring);
170 re->re_lastok = argstring;
172 return newintobject((long)result); /* Position of the match or -1 */
175 static object *
176 reg_group(re, args)
177 regexobject *re;
178 object *args;
180 int i, a, b;
181 if (args != NULL && is_tupleobject(args)) {
182 int n = gettuplesize(args);
183 object *res = newtupleobject(n);
184 if (res == NULL)
185 return NULL;
186 for (i = 0; i < n; i++) {
187 object *v = reg_group(re, gettupleitem(args, i));
188 if (v == NULL) {
189 DECREF(res);
190 return NULL;
192 settupleitem(res, i, v);
194 return res;
196 if (!getargs(args, "i", &i)) {
197 object *n;
198 err_clear();
199 if (!getargs(args, "S", &n))
200 return NULL;
201 else {
202 object *index;
203 if (re->re_groupindex == NULL)
204 index = NULL;
205 else
206 index = mappinglookup(re->re_groupindex, n);
207 if (index == NULL) {
208 err_setstr(RegexError, "group() group name doesn't exist");
209 return NULL;
211 i = getintvalue(index);
214 if (i < 0 || i >= RE_NREGS) {
215 err_setstr(RegexError, "group() index out of range");
216 return NULL;
218 if (re->re_lastok == NULL) {
219 err_setstr(RegexError,
220 "group() only valid after successful match/search");
221 return NULL;
223 a = re->re_regs.start[i];
224 b = re->re_regs.end[i];
225 if (a < 0 || b < 0) {
226 INCREF(None);
227 return None;
229 return newsizedstringobject(getstringvalue(re->re_lastok)+a, b-a);
/* Method table for regex objects: maps the names "match", "search" and
   "group" to reg_match, reg_search and reg_group.  reg_getattr() passes
   this table to findmethod() for names it does not handle itself. */
232 static struct methodlist reg_methods[] = {
233 {"match", (method)reg_match},
234 {"search", (method)reg_search},
235 {"group", (method)reg_group},
236 {NULL, NULL} /* sentinel */
239 static object *
240 reg_getattr(re, name)
241 regexobject *re;
242 char *name;
244 if (strcmp(name, "regs") == 0) {
245 if (re->re_lastok == NULL) {
246 INCREF(None);
247 return None;
249 return makeresult(&re->re_regs);
251 if (strcmp(name, "last") == 0) {
252 if (re->re_lastok == NULL) {
253 INCREF(None);
254 return None;
256 INCREF(re->re_lastok);
257 return re->re_lastok;
259 if (strcmp(name, "translate") == 0) {
260 if (re->re_translate == NULL) {
261 INCREF(None);
262 return None;
264 INCREF(re->re_translate);
265 return re->re_translate;
267 if (strcmp(name, "groupindex") == 0) {
268 if (re->re_groupindex == NULL) {
269 INCREF(None);
270 return None;
272 INCREF(re->re_groupindex);
273 return re->re_groupindex;
275 if (strcmp(name, "realpat") == 0) {
276 if (re->re_realpat == NULL) {
277 INCREF(None);
278 return None;
280 INCREF(re->re_realpat);
281 return re->re_realpat;
283 if (strcmp(name, "givenpat") == 0) {
284 if (re->re_givenpat == NULL) {
285 INCREF(None);
286 return None;
288 INCREF(re->re_givenpat);
289 return re->re_givenpat;
291 if (strcmp(name, "__members__") == 0) {
292 object *list = newlistobject(6);
293 if (list) {
294 setlistitem(list, 0, newstringobject("last"));
295 setlistitem(list, 1, newstringobject("regs"));
296 setlistitem(list, 2, newstringobject("translate"));
297 setlistitem(list, 3, newstringobject("groupindex"));
298 setlistitem(list, 4, newstringobject("realpat"));
299 setlistitem(list, 5, newstringobject("givenpat"));
300 if (err_occurred()) {
301 DECREF(list);
302 list = NULL;
305 return list;
307 return findmethod(reg_methods, (object *)re, name);
/* Type object for regex instances (tp_name "regex").  Only the
   destructor (reg_dealloc) and attribute getter (reg_getattr) slots are
   filled in; the print, setattr, compare and repr slots are left 0. */
310 static typeobject Regextype = {
311 OB_HEAD_INIT(&Typetype)
312 0, /*ob_size*/
313 "regex", /*tp_name*/
314 sizeof(regexobject), /*tp_size*/
315 0, /*tp_itemsize*/
316 /* methods */
317 (destructor)reg_dealloc, /*tp_dealloc*/
318 0, /*tp_print*/
319 (getattrfunc)reg_getattr, /*tp_getattr*/
320 0, /*tp_setattr*/
321 0, /*tp_compare*/
322 0, /*tp_repr*/
325 static object *
326 newregexobject(pattern, translate, givenpat, groupindex)
327 object *pattern;
328 object *translate;
329 object *givenpat;
330 object *groupindex;
332 regexobject *re;
333 char *pat = getstringvalue(pattern);
334 int size = getstringsize(pattern);
336 if (translate != NULL && getstringsize(translate) != 256) {
337 err_setstr(RegexError,
338 "translation table must be 256 bytes");
339 return NULL;
341 re = NEWOBJ(regexobject, &Regextype);
342 if (re != NULL) {
343 char *error;
344 re->re_patbuf.buffer = NULL;
345 re->re_patbuf.allocated = 0;
346 re->re_patbuf.fastmap = re->re_fastmap;
347 if (translate)
348 re->re_patbuf.translate = getstringvalue(translate);
349 else
350 re->re_patbuf.translate = NULL;
351 XINCREF(translate);
352 re->re_translate = translate;
353 re->re_lastok = NULL;
354 re->re_groupindex = groupindex;
355 INCREF(pattern);
356 re->re_realpat = pattern;
357 INCREF(givenpat);
358 re->re_givenpat = givenpat;
359 error = re_compile_pattern(pat, size, &re->re_patbuf);
360 if (error != NULL) {
361 err_setstr(RegexError, error);
362 DECREF(re);
363 re = NULL;
366 return (object *)re;
369 static object *
370 regex_compile(self, args)
371 object *self;
372 object *args;
374 object *pat = NULL;
375 object *tran = NULL;
376 if (!getargs(args, "S", &pat)) {
377 err_clear();
378 if (!getargs(args, "(SS)", &pat, &tran))
379 return NULL;
381 return newregexobject(pat, tran, pat, NULL);
384 static object *
385 symcomp(pattern, gdict)
386 object *pattern;
387 object *gdict;
389 char *opat = getstringvalue(pattern);
390 char *oend = opat + getstringsize(pattern);
391 int group_count = 0;
392 int escaped = 0;
393 char *o = opat;
394 char *n;
395 char name_buf[128];
396 char *g;
397 object *npattern;
398 int require_escape = re_syntax & RE_NO_BK_PARENS ? 0 : 1;
400 npattern = newsizedstringobject((char*)NULL, getstringsize(pattern));
401 if (npattern == NULL)
402 return NULL;
403 n = getstringvalue(npattern);
405 while (o < oend) {
406 if (*o == '(' && escaped == require_escape) {
407 char *backtrack;
408 escaped = 0;
409 ++group_count;
410 *n++ = *o;
411 if (++o >= oend || *o != '<')
412 continue;
413 /* *o == '<' */
414 if (o+1 < oend && *(o+1) == '>')
415 continue;
416 backtrack = o;
417 g = name_buf;
418 for (++o; o < oend;) {
419 if (*o == '>') {
420 object *group_name = NULL;
421 object *group_index = NULL;
422 *g++ = '\0';
423 group_name = newstringobject(name_buf);
424 group_index = newintobject(group_count);
425 if (group_name == NULL || group_index == NULL
426 || mappinginsert(gdict, group_name, group_index) != 0) {
427 XDECREF(group_name);
428 XDECREF(group_index);
429 XDECREF(npattern);
430 return NULL;
432 ++o; /* eat the '>' */
433 break;
435 if (!isalnum(*o) && *o != '_') {
436 o = backtrack;
437 break;
439 *g++ = *o++;
442 if (*o == '[' && !escaped) {
443 *n++ = *o;
444 ++o; /* eat the char following '[' */
445 *n++ = *o;
446 while (o < oend && *o != ']') {
447 ++o;
448 *n++ = *o;
450 if (o < oend)
451 ++o;
453 else if (*o == '\\') {
454 escaped = 1;
455 *n++ = *o;
456 ++o;
458 else {
459 escaped = 0;
460 *n++ = *o;
461 ++o;
465 if (resizestring(&npattern, n - getstringvalue(npattern)) == 0)
466 return npattern;
467 else {
468 DECREF(npattern);
469 return NULL;
474 static object *
475 regex_symcomp(self, args)
476 object *self;
477 object *args;
479 object *pattern;
480 object *tran = NULL;
481 object *gdict = NULL;
482 object *npattern;
483 if (!getargs(args, "S", &pattern)) {
484 err_clear();
485 if (!getargs(args, "(SS)", &pattern, &tran))
486 return NULL;
488 gdict = newmappingobject();
489 if (gdict == NULL
490 || (npattern = symcomp(pattern, gdict)) == NULL) {
491 DECREF(gdict);
492 DECREF(pattern);
493 return NULL;
495 return newregexobject(npattern, tran, pattern, gdict);
499 static object *cache_pat;
500 static object *cache_prog;
502 static int
503 update_cache(pat)
504 object *pat;
506 if (pat != cache_pat) {
507 XDECREF(cache_pat);
508 cache_pat = NULL;
509 XDECREF(cache_prog);
510 cache_prog = regex_compile((object *)NULL, pat);
511 if (cache_prog == NULL)
512 return -1;
513 cache_pat = pat;
514 INCREF(cache_pat);
516 return 0;
519 static object *
520 regex_match(self, args)
521 object *self;
522 object *args;
524 object *pat, *string;
525 if (!getargs(args, "(SS)", &pat, &string))
526 return NULL;
527 if (update_cache(pat) < 0)
528 return NULL;
529 return reg_match((regexobject *)cache_prog, string);
532 static object *
533 regex_search(self, args)
534 object *self;
535 object *args;
537 object *pat, *string;
538 if (!getargs(args, "(SS)", &pat, &string))
539 return NULL;
540 if (update_cache(pat) < 0)
541 return NULL;
542 return reg_search((regexobject *)cache_prog, string);
545 static object *
546 regex_set_syntax(self, args)
547 object *self, *args;
549 int syntax;
550 if (!getintarg(args, &syntax))
551 return NULL;
552 syntax = re_set_syntax(syntax);
553 return newintobject((long)syntax);
/* Module-level function table: maps "compile", "symcomp", "match",
   "search" and "set_syntax" to their implementations.  initregex()
   passes this table to initmodule() when creating the "regex" module. */
556 static struct methodlist regex_global_methods[] = {
557 {"compile", regex_compile},
558 {"symcomp", regex_symcomp},
559 {"match", regex_match},
560 {"search", regex_search},
561 {"set_syntax", regex_set_syntax},
562 {NULL, NULL} /* sentinel */
565 initregex()
567 object *m, *d, *v;
569 m = initmodule("regex", regex_global_methods);
570 d = getmoduledict(m);
572 /* Initialize regex.error exception */
573 RegexError = newstringobject("regex.error");
574 if (RegexError == NULL || dictinsert(d, "error", RegexError) != 0)
575 fatal("can't define regex.error");
577 /* Initialize regex.casefold constant */
578 v = newsizedstringobject((char *)NULL, 256);
579 if (v != NULL) {
580 int i;
581 char *s = getstringvalue(v);
582 for (i = 0; i < 256; i++) {
583 if (isupper(i))
584 s[i] = tolower(i);
585 else
586 s[i] = i;
588 dictinsert(d, "casefold", v);
589 DECREF(v);