added re9_subst() and sample for it
[libre9.git] / src / libre9 / re9.c
blob7817c66d3f2ce335ff20c17db24db2b2fef2f2aa
1 /*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
15 * heavily modified by Ketmar // Vampire Avalon
17 #include "re9.h"
19 #include <ctype.h>
20 #include <limits.h>
21 #include <setjmp.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
29 /******************************************************************************/
30 #if defined(__GNUC__) && __GNUC__ >= 3
31 # define PACKED __attribute__((packed))
32 # define NORETURN __attribute__((noreturn))
33 #else
34 # define PACKED
35 # define NORETURN
36 #endif
38 #define ARRAYLEN(_a) ((sizeof((_a)))/sizeof((_a)[0]))
40 #ifdef RE9_DEBUG
41 # define dlogf(...) fprintf(stderr, __VA_ARGS__)
42 #else
43 # define dlogf(...)
44 #endif
46 //#define RE9_DEBUG_PRG
47 //#define RE9_CALC_MAXT
50 /******************************************************************************/
51 typedef uint32_t re9_rune;
54 enum {
55 PRG_STARTS_WITH_ANY = 0x00,
56 PRG_STARTS_WITH_BOL = 0x01,
57 PRG_STARTS_WITH_EOL = 0x02,
58 PRG_STARTS_WITH_RUNE = 0x04,
59 PRG_STARTS_WITH_CLASS = 0x08
63 struct re9_prog_s {
64 uint8_t *code; /* vm code */
65 uint32_t code_used;
66 uint32_t code_alloted;
67 uint32_t startinst; /* start pc */
68 #ifndef RE9_DISABLE_LEARNING
69 uint8_t startflags; /* PRG_START_WITH_xxx */
70 union {
71 uint32_t startrange; /* offset of (vmop_class_t *) with starting range if PRG_STARTS_WITH_CLASS set or re9_rune if PRG_STARTS_WITH_RUNE set */
72 re9_rune startrune;
74 #endif
75 int flags; /* RE9_FLAG_CASEINSENS, RE9_FLAG_NONGREEDY will be set if there were some non-greedy splits */
76 int nsub;
80 /* note that CLASS opcode is variable-length */
81 typedef struct PACKED {
82 uint32_t opcode:8; /* 0..126: literal char; 127: RUNE; 0xc0..0xff (0300..0377): opcodes */
83 uint32_t next:24; /* offset of next opcode */
84 } vmopcode_t;
87 /* RUNE */
88 typedef struct PACKED {
89 vmopcode_t opc;
90 re9_rune r;
91 } vmop_rune_t;
94 /* SPLIT */
95 typedef struct PACKED {
96 vmopcode_t opc;
97 uint32_t alt; /* offset of 'alternate' opcode */
98 } vmop_split_t;
101 /* MARK */
102 typedef struct PACKED {
103 vmopcode_t opc;
104 uint8_t subid; /* bit 7 is RESERVED! 127 subgroups should be enough for everyone */
105 } vmop_mark_t;
108 /* CLASS */
109 typedef struct PACKED {
110 vmopcode_t opc;
111 uint16_t spi_count; /* number of items in spi */
112 re9_rune spi[]; /* span data: spi_count*sizeof(spi[0]); sorted in ascending order */
113 } vmop_class_t;
116 /******************************************************************************/
118 * Actions and Tokens (re9_inst_t types)
120 * 02xx are operators, value == precedence
121 * 03xx are tokens, i.e. operands for operators
123 #define RUNE 0177 /* literal rune */
125 #define OPERATOR 0200 /* bitmask of all operators */
126 #define START 0200 /* start, used for marker on stack */
127 #define RBRA 0201 /* right bracket */
128 #define LBRA 0202 /* left bracket */
129 #define ALT 0203 /* alternation */
130 #define CAT 0204 /* concatentation, implicit operator */
131 #define STAR 0205 /* closure, *; bit 8 set: non-greedy */
132 #define PLUS 0206 /* a+ == aa*; bit 8 set: non-greedy */
133 #define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's; bit 8 set: non-greedy */
135 #define NOP 0300 /* no operation, internal use only (will be skipped by optimizer) */
136 #define ANY 0301 /* (and token); any character except newline */
137 #define ANYNL 0302 /* (and token); any character including newline */
138 #define BOL 0303 /* (and token); beginning of line */
139 #define EOL 0304 /* (and token); end of line */
140 #define CCLASS 0305 /* (and token); character class */
141 #define NCCLASS 0306 /* (and token); negated character class */
142 #define WBOUND 0307 /* word boundary assertion (generated by optimizer from rune) */
143 #define WNBOUND 0310 /* word non-boundary assertion (generated by optimizer from rune) */
144 #define SPLIT_G 0311 /* greedy split */
145 #define SPLIT_NG 0312 /* vmop; non-greedy split */
146 #define SMARK 0313 /* vmop; group start mark */
147 #define EMARK 0314 /* vmop; group end mark */
148 /* special splits for '*' -- should be ignored by learn_start() and replaced by optimize() */
149 #define STAR_G 0315
150 #define STAR_NG 0316
151 /* */
152 #define END 0377 /* vmop; terminate: match found */
155 /******************************************************************************/
156 static inline uint32_t vmop_size (const void *op) {
157 switch (((const vmopcode_t *)op)->opcode) {
158 case RUNE:
159 return sizeof(vmop_rune_t);
160 case SPLIT_G: case SPLIT_NG:
161 return sizeof(vmop_split_t);
162 case SMARK: case EMARK:
163 return sizeof(vmop_mark_t);
164 case CCLASS: case NCCLASS:
165 return sizeof(vmop_class_t)+((const vmop_class_t *)op)->spi_count*sizeof(((const vmop_class_t *)op)->spi[0]);
166 /*default: ;*/
168 return sizeof(vmopcode_t);
172 /******************************************************************************/
173 #if defined(RE9_DEBUG_PRG) || defined(RE9_DEBUG)
174 /* return 0 if pc is out of range or next opcode pc (NOT next pointer from vmopcode_t!) */
175 static void dump_char (FILE *fo, re9_rune r) {
176 if (r >= 32 && r < 127 && r != '\'' && r != '\\') {
177 fprintf(fo, "%c", r);
178 } else {
179 fprintf(fo, "\\u%04x", r);
183 static uint32_t dump_opcode (FILE *fo, const re9_prog_t *pp, uint32_t pc) {
184 const vmopcode_t *op;
185 const vmop_class_t *opc;
186 uint32_t f;
187 if (pc >= pp->code_used) return 0;
188 fprintf(fo, "%05u: 0%03o ", pc, pp->code[pc]);
189 /* END */
190 if (pp->code[pc] == END) {
191 fprintf(fo, " END\n");
192 return pc+vmop_size((void *)(pp->code+pc));
194 if (pc+sizeof(vmopcode_t) > pp->code_used) {
195 fprintf(fo, "???: INVALID CODE!\n");
196 return 0;
198 op = (const vmopcode_t *)(pp->code+pc);
199 if (pc+vmop_size(op) > pp->code_used) {
200 fprintf(fo, "???: INVALID CODE!\n");
201 return 0;
203 fprintf(fo, "%05u ", op->next);
204 /* other */
205 switch (op->opcode) {
206 case NOP: fprintf(fo, "NOP\n"); break;
207 case ANY: fprintf(fo, "ANY\n"); break;
208 case ANYNL: fprintf(fo, "ANYNL\n"); break;
209 case BOL: fprintf(fo, "BOL\n"); break;
210 case EOL: fprintf(fo, "EOL\n"); break;
211 case WBOUND: fprintf(fo, "WBOUND\n"); break;
212 case WNBOUND: fprintf(fo, "WNBOUND\n"); break;
213 case RUNE:
214 fprintf(fo, "RUNE '");
215 dump_char(fo, ((const vmop_rune_t *)op)->r);
216 fputs("'\n", fo);
217 break;
218 case CCLASS: case NCCLASS:
219 opc = (const vmop_class_t *)op;
220 fprintf(fo, "%-8s [", (op->opcode == CCLASS ? "CCLASS" : "NCCLASS"));
221 for (f = 0; f < opc->spi_count; f += 2) {
222 dump_char(fo, opc->spi[f]);
223 if (opc->spi[f] != opc->spi[f+1]) {
224 fputc('-', fo);
225 dump_char(fo, opc->spi[f+1]);
228 fputs("]\n", fo);
229 break;
230 case SPLIT_G: case SPLIT_NG:
231 fprintf(fo, "%-8s %05u\n", (op->opcode == SPLIT_G ? "SPLIT_G" : "SPLIT_NG"), ((const vmop_split_t *)op)->alt);
232 break;
233 case STAR_G: case STAR_NG:
234 fprintf(fo, "%-8s %05u\n", (op->opcode == SPLIT_G ? "STAR_G" : "STAR_NG"), ((const vmop_split_t *)op)->alt);
235 break;
236 case SMARK: case EMARK:
237 fprintf(fo, "%-8s %u\n", (op->opcode == SMARK ? "SMARK" : "EMARK"), ((const vmop_mark_t *)op)->subid);
238 break;
239 default:
240 if (op->opcode < RUNE) {
241 fprintf(fo, "RUNE '");
242 dump_char(fo, op->opcode);
243 fputs("'\n", fo);
244 break;
246 fprintf(fo, "INVALID OPCODE!\n");
247 return 0;
249 return pc+vmop_size(op);
253 static void dump (const re9_prog_t *pp) {
254 uint32_t pc = 0;
255 fprintf(stderr, "code size: %u\n", pp->code_used);
256 while (pc < pp->code_used) if ((pc = dump_opcode(stderr, pp, pc)) == 0) break;
258 #endif
261 /******************************************************************************/
262 /* UTF-8 support */
264 #ifdef RE9_UNICODE_CASE
266 struct casemap {
267 unsigned short code; /* code point */
268 unsigned short lower; /* code */
269 unsigned short upper; /* code */
272 #include "re9_unicode_mapping.c"
275 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
277 static int cmp_casemap (const void *key, const void *cm) {
278 return *(const int *)key-(int)((const struct casemap *)cm)->code;
282 static inline int utf8_map_case (re9_rune uc, int upper) {
283 const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
284 return (cm != NULL ? (upper ? cm->upper : cm->lower) : uc);
289 * returns the upper-case variant of the given unicode codepoint
290 * does not support unicode code points > \uffff
292 static inline re9_rune utf8_upper (re9_rune uc) {
293 return (uc < 256 ? (uc < 128 ? toupper(uc) : uc) : utf8_map_case(uc, 1));
298 * returns the lower-case variant of the given unicode codepoint.
299 * does not support unicode code points > \uffff
302 static inline int utf8_lower (int uc) {
303 return (uc < 256 tolower(uc) : utf8_map_case(uc, 0));
306 #define UPPER(_r) utf8_upper(_r)
308 #else
310 #define UPPER(_r) ((_r) < 128 ? toupper(_r) : (_r))
312 #endif
315 enum {
316 RE9_UTF_MAX_BYTES = 4, /* maximum bytes per rune */
317 RE9_RUNE_SYNC = 0x80, /* cannot represent part of a UTF sequence (<) */
318 RE9_RUNE_SELF = 0x80, /* rune and UTF sequences are the same (<) */
319 RE9_RUNE_ERROR = 0xFFFD, /* decoding error in UTF */
320 RE9_RUNE_MAX = 0x10FFFF, /* maximum rune value */
321 /* */
322 RE9_RUNE_SPEC_EOL = RE9_RUNE_MAX+1,
323 RE9_RUNE_SPEC_NOTHING = RE9_RUNE_MAX+2,
324 RE9_RUNE_SPEC_WBOUND = RE9_RUNE_MAX+3,
325 RE9_RUNE_SPEC_WNBOUND = RE9_RUNE_MAX+4,
326 RE9_RUNE_SPEC_BOL = RE9_RUNE_MAX+5,
327 RE9_RUNE_SPEC_INVALID = RE9_RUNE_MAX+6
331 static const uint8_t utf8_clen[128] = {
332 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x00-0x0f */
333 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x10-0x1f */
334 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x20-0x2f */
335 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x30-0x3f */
336 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x40-0x4f */
337 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x50-0x5f */
338 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x60-0x6f */
339 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x70-0x7f */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8f */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9f */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xa0-0xaf */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xb0-0xbf */
344 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xc0-0xcf c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127 */
345 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xd0-0xdf */
346 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33, /* 0xe0-0xef */
347 0x44,0x44,0x44,0x44,0x00,0x00,0x00,0x00, /* 0xf0-0xff */
351 /* returns length */
352 static inline int re9_char2rune (re9_rune *rune, const char *str, const char *eol) {
353 if ((intptr_t)(eol-str) > 0) {
354 re9_rune r;
355 uint8_t c = (unsigned char)(*str++), len = (utf8_clen[c>>1]>>((c&0x01)*4))&0x0f, res = len, oc = c;
356 if (len == 0 || eol-str+1 < len) { *rune = RE9_RUNE_ERROR; return 1; }
357 else if (len == 1) { *rune = c; return 1; }
358 r = c&(0x7c>>len);
359 while (--len > 0) {
360 if (((c = (unsigned char)(*str++))&0xc0) != 0x80) { *rune = RE9_RUNE_ERROR; return 1; }
361 r = (r<<6)|(c&0x3f);
363 if ((oc == 0xc0 || oc == 0xc1) && r != 0) r = RE9_RUNE_ERROR; /* overlong encoding: only 'modified utf-8' zero rune allowed here */
364 if ((r >= 0xd800 && r <= 0xdfff) || // utf16/utf32 surrogates
365 (r >= 0xfdd0 && r <= 0xfdef) || // just for fun
366 (r >= 0xfffe && r <= 0xffff) || // fuck BOMs
367 r > 0x10ffff) { r = RE9_RUNE_ERROR; /*res = 1;*/ } // bad unicode
368 *rune = r;
369 return res;
370 } else {
371 *rune = RE9_RUNE_SPEC_EOL;
372 return 0;
377 static inline const char *re8_rune_strchr (const char *s, re9_rune c, const char *eol) {
378 if (s >= eol) return NULL;
379 /* not part of utf sequence? */
380 if (c < RE9_RUNE_SYNC) return memchr(s, c, eol-s);
381 while (s < eol) {
382 re9_rune r;
383 int len = re9_char2rune(&r, s, eol);
384 if (r == c) return s;
385 s += len;
387 return NULL;
391 /******************************************************************************/
392 /* compiler */
394 enum {
395 RE9_FALSE = 0,
396 RE9_TRUE = 1
400 /* parser information */
401 typedef struct {
402 /*re9_inst_t*/uint32_t first;
403 /*re9_inst_t*/uint32_t last;
404 } re9_node_t;
407 #define NSTACK (32)
409 /* compiler data */
410 typedef struct {
411 re9_prog_t *pp;
412 /* operand stack */
413 re9_node_t andstack[NSTACK];
414 re9_node_t *andp;
415 /* operator stack */
416 int atorstack[NSTACK];
417 int *atorp;
418 /* */
419 int cursubid; /* id of current subexpression */
420 int subidstack[NSTACK]; /* parallel to atorstack */
421 int *subidp;
422 /* */
423 int last_was_operand; /* last token was operand */
424 int nbra; /* number of opened parens */
425 const char *expr; /* pointer to next character in source expression */
426 const char *expr_eol;
427 int flags; /* parser flags */
428 /* */
429 int comment_level; /* >0: in comment; counts parens */
430 int return_rune_nothing; /* return 'nothing' rune from next lex()? will be reset by lex() */
431 re9_rune yyrune; /* last lex'd rune */
432 /* last lex'd class */
433 re9_rune *yyclass;
434 uint32_t yyc_used;
435 uint32_t yyc_alloted;
436 /* */
437 jmp_buf regkaboom;
438 const char *errormsg;
439 } re9_compiler_t;
442 /******************************************************************************/
443 static NORETURN void rcerror (re9_compiler_t *ci, const char *s) {
444 ci->errormsg = s;
445 longjmp(ci->regkaboom, 1);
449 /******************************************************************************/
450 static void add_to_class (re9_compiler_t *ci, re9_rune r) {
451 if (ci->comment_level == 0) {
452 if (ci->yyc_used+1 > ci->yyc_alloted) {
453 uint32_t newsz = ((ci->yyc_used+1)|0x1f)+1;
454 re9_rune *newr = realloc(ci->yyclass, newsz*sizeof(ci->yyclass[0]));
455 if (newr == NULL) rcerror(ci, "out of memory");
456 ci->yyclass = newr;
457 ci->yyc_alloted = newsz;
459 ci->yyclass[ci->yyc_used++] = r;
464 static void add_to_class_two (re9_compiler_t *ci, re9_rune r0, re9_rune r1) {
465 if (ci->comment_level == 0) {
466 if (ci->yyc_used+2 > ci->yyc_alloted) {
467 uint32_t newsz = ((ci->yyc_used+2)|0x1f)+1;
468 re9_rune *newr = realloc(ci->yyclass, newsz*sizeof(ci->yyclass[0]));
469 if (newr == NULL) rcerror(ci, "out of memory");
470 ci->yyclass = newr;
471 ci->yyc_alloted = newsz;
473 ci->yyclass[ci->yyc_used++] = r0;
474 ci->yyclass[ci->yyc_used++] = r1;
479 /******************************************************************************/
480 /* return 1 if char is quoted */
481 static int nextc (re9_compiler_t *ci, re9_rune *rp) {
482 if (ci->expr >= ci->expr_eol) { *rp = RE9_RUNE_SPEC_EOL; return 1; }
483 if ((ci->flags&RE9_FLAG_NONUTF8) == 0) {
484 if (ci->expr[0] == '\\') {
485 if (ci->expr+1 < ci->expr_eol) {
486 ci->expr += re9_char2rune(rp, ci->expr+1, ci->expr_eol)+1;
487 } else {
488 ++ci->expr;
489 *rp = '\\';
491 return 1;
492 } else {
493 ci->expr += re9_char2rune(rp, ci->expr, ci->expr_eol);
495 } else {
496 if (ci->expr[0] == '\\') {
497 if (ci->expr+1 < ci->expr_eol) {
498 *rp = (unsigned char)(ci->expr[1]);
499 ci->expr += 2;
500 } else {
501 ++ci->expr;
502 *rp = '\\';
504 return 1;
505 } else {
506 *rp = (unsigned char)(*ci->expr++);
507 if (ci->expr > ci->expr_eol) ci->expr = ci->expr_eol;
510 if (ci->flags&RE9_FLAG_CASEINSENS) *rp = UPPER(*rp);
511 return 0;
515 static void addmeta_space (re9_compiler_t *ci, int neg) {
516 if (!neg) {
517 add_to_class_two(ci, '\x9', '\xd');
518 add_to_class_two(ci, ' ', ' ');
519 } else {
520 add_to_class_two(ci, '\1', '\x9'-1);
521 add_to_class_two(ci, '\xd'+1, ' '-1);
522 add_to_class_two(ci, ' '+1, RE9_RUNE_MAX);
527 static void addmeta_digit (re9_compiler_t *ci, int neg) {
528 if (!neg) {
529 add_to_class_two(ci, '0', '9');
530 } else {
531 add_to_class_two(ci, '\1', '0'-1);
532 add_to_class_two(ci, '9'+1, RE9_RUNE_MAX);
537 static void addmeta_word (re9_compiler_t *ci, int neg) {
538 if (!neg) {
539 add_to_class_two(ci, '0', '9');
540 add_to_class_two(ci, 'A', 'Z');
541 add_to_class_two(ci, '_', '_');
542 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) add_to_class_two(ci, 'a', 'z');
543 } else {
544 add_to_class_two(ci, '\0', '0'-1);
545 add_to_class_two(ci, '9'+1, 'A'-1);
546 add_to_class_two(ci, 'Z'+1, '_'-1);
547 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) {
548 add_to_class_two(ci, '_'+1, 'a'-1);
549 add_to_class_two(ci, 'z'+1, RE9_RUNE_MAX);
550 } else {
551 add_to_class_two(ci, '_'+1, RE9_RUNE_MAX);
557 #ifndef RE9_DISABLE_POSIX_CLASSES
558 static void addmeta_upper (re9_compiler_t *ci) {
559 add_to_class_two(ci, 'A', 'Z');
563 static void addmeta_lower (re9_compiler_t *ci) {
564 if (ci->flags&RE9_FLAG_CASEINSENS) {
565 add_to_class_two(ci, 'A', 'Z');
566 } else {
567 add_to_class_two(ci, 'a', 'z');
572 static void addmeta_alpha (re9_compiler_t *ci) {
573 add_to_class_two(ci, 'A', 'Z');
574 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) add_to_class_two(ci, 'a', 'z');
578 static void addmeta_xdigit (re9_compiler_t *ci) {
579 add_to_class_two(ci, '0', '9');
580 add_to_class_two(ci, 'A', 'F');
581 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) add_to_class_two(ci, 'a', 'f');
585 static void addmeta_punct (re9_compiler_t *ci) {
586 add_to_class_two(ci, 33, 47); // !"#$%&'()*+,-./
587 add_to_class_two(ci, 58, 64); // :;<=>?@
588 add_to_class_two(ci, 91, 96); // [\]^_`
589 add_to_class_two(ci, 123, 126); // {|}~
593 /*FIXME:??? this should include '\z'! */
594 static void addmeta_ascii (re9_compiler_t *ci) {
595 add_to_class_two(ci, '\1', '\x7f');
599 static void addmeta_blank (re9_compiler_t *ci) {
600 add_to_class_two(ci, '\t', '\t');
601 add_to_class_two(ci, ' ', ' ');
605 /*FIXME:??? this should include '\z'! */
606 static void addmeta_ctrl (re9_compiler_t *ci) {
607 add_to_class_two(ci, '\1', '\x1f');
608 add_to_class_two(ci, '\x7f', '\x7f');
612 static void addmeta_graph (re9_compiler_t *ci) {
613 add_to_class_two(ci, '!', '\x7e');
617 static void addmeta_print (re9_compiler_t *ci) {
618 add_to_class_two(ci, ' ', '\x7e');
620 #endif
623 static void normalize_spans (re9_compiler_t *ci) {
624 if (ci->yyc_used > 2) {
625 /* we have at least two spans */
626 uint32_t cp;
627 re9_rune *ep = ci->yyclass+ci->yyc_used;
628 re9_rune *p, *np;
629 /* sort on span start */
630 /* yeah, old good bubble sorting */
631 for (p = ci->yyclass; p < ep; p += 2) {
632 for (np = p; np < ep; np += 2) {
633 if (*np < *p) {
634 re9_rune rune;
635 rune = np[0];
636 np[0] = p[0];
637 p[0] = rune;
638 rune = np[1];
639 np[1] = p[1];
640 p[1] = rune;
644 /* merge spans */
645 for (cp = 2; cp < ci->yyc_used; ) {
646 uint32_t f;
647 /* if next span start is inside previous span, merge both spans */
648 //fprintf(stderr, "prev:(%u,%u); curr:(%u,%u)\n", ci->yyclass[cp-2], ci->yyclass[cp-1], ci->yyclass[cp], ci->yyclass[cp+1]);
649 if (ci->yyclass[cp] <= ci->yyclass[cp-1]) {
650 /* inside, merge */
651 if (ci->yyclass[cp+1] > ci->yyclass[cp-1]) ci->yyclass[cp-1] = ci->yyclass[cp+1];
652 /* remove this span */
653 for (f = cp+2; f < ci->yyc_used; ++f) ci->yyclass[f-2] = ci->yyclass[f];
654 ci->yyc_used -= 2;
655 } else if (ci->yyclass[cp-1]+1 == ci->yyclass[cp]) {
656 /* we can join this spans */
657 ci->yyclass[cp-1] = ci->yyclass[cp+1];
658 /* remove this span */
659 for (f = cp+2; f < ci->yyc_used; ++f) ci->yyclass[f-2] = ci->yyclass[f];
660 ci->yyc_used -= 2;
661 } else {
662 /* outside, skip */
663 cp += 2;
670 /* parse character range */
671 static int bldcclass (re9_compiler_t *ci) {
672 int type;
673 re9_rune rune;
674 int quoted;
675 /* we have already seen the '[' */
676 type = CCLASS;
677 ci->yyc_used = 0;
678 /* look ahead for negation */
679 /* SPECIAL CASE!!! negated classes don't match \n */
680 quoted = nextc(ci, &rune);
681 if (!quoted && rune == '^') {
682 type = NCCLASS;
683 quoted = nextc(ci, &rune);
684 add_to_class_two(ci, '\n', '\n');
686 /* parse class into a set of spans */
687 for (;; quoted = nextc(ci, &rune)) {
688 if (rune == RE9_RUNE_SPEC_EOL) rcerror(ci, "malformed '[]'");
689 if (!quoted && rune == ']' && (ci->yyc_used&0x01) == 0) break;
690 if (quoted && ((rune >= '0' && rune <= '9') || (rune >= 'a' && rune <= 'z') || (rune >= 'A' && rune <= 'Z'))) {
691 /* metacharacters */
692 if ((ci->yyc_used&0x01) != 0 || (ci->expr < ci->expr_eol && ci->expr[0] == '-')) rcerror(ci, "malformed '[]'"); /* metacharacter can't be used in rangedef */
693 switch (rune) {
694 case 'd': case 'D': addmeta_digit(ci, (rune <= 'Z')); break;
695 case 's': case 'S': addmeta_space(ci, (rune <= 'Z')); break;
696 case 'w': case 'W': addmeta_word(ci, (rune <= 'Z')); break;
697 case 'z': add_to_class_two(ci, '\0', '\0'); break;
698 case 'Z': add_to_class_two(ci, '\1', RE9_RUNE_MAX); break;
699 default:
700 rcerror(ci, "invalid metacharacter in '[]'");
701 break;
703 #ifndef RE9_DISABLE_POSIX_CLASSES
704 } else if ((ci->yyc_used&0x01) == 0 && !quoted && rune == '[' && ci->expr+7 < ci->expr_eol && ci->expr[0] == ':') {
705 if (strncmp(ci->expr, ":alnum:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); addmeta_digit(ci, 0); }
706 else if (strncmp(ci->expr, ":alpha:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); }
707 else if (strncmp(ci->expr, ":ascii:]", 8) == 0) { ci->expr += 8; addmeta_ascii(ci); }
708 else if (strncmp(ci->expr, ":blank:]", 8) == 0) { ci->expr += 8; addmeta_blank(ci); }
709 else if (strncmp(ci->expr, ":cntrl:]", 8) == 0) { ci->expr += 8; addmeta_ctrl(ci); }
710 else if (strncmp(ci->expr, ":digit:]", 8) == 0) { ci->expr += 8; addmeta_digit(ci, 0); }
711 else if (strncmp(ci->expr, ":graph:]", 8) == 0) { ci->expr += 8; addmeta_graph(ci); }
712 else if (strncmp(ci->expr, ":lower:]", 8) == 0) { ci->expr += 8; addmeta_lower(ci); }
713 else if (strncmp(ci->expr, ":print:]", 8) == 0) { ci->expr += 8; addmeta_print(ci); }
714 else if (strncmp(ci->expr, ":punct:]", 8) == 0) { ci->expr += 8; addmeta_punct(ci); }
715 else if (strncmp(ci->expr, ":space:]", 8) == 0) { ci->expr += 8; addmeta_space(ci, 0); }
716 else if (strncmp(ci->expr, ":upper:]", 8) == 0) { ci->expr += 8; addmeta_upper(ci); }
717 else if (strncmp(ci->expr, ":wordc:]", 8) == 0) { ci->expr += 7; addmeta_word(ci, 0); } /*non-standard!*/
718 else if (ci->expr+8 < ci->expr_eol && strncmp(ci->expr, ":xdigit:]", 9) == 0) { ci->expr += 9; addmeta_xdigit(ci); }
719 else rcerror(ci, "invalid POSIX range");
720 #endif
721 } else {
722 if (ci->yyc_used&0x01 && rune < ci->yyclass[ci->yyc_used-1]) rcerror(ci, "invalid range in '[]'");
723 add_to_class(ci, rune);
724 if (ci->yyc_used&0x01) {
725 /* this was first char of possible range */
726 if (ci->expr < ci->expr_eol && ci->expr[0] == '-') {
727 /* rangedef; skip '-' */
728 ++ci->expr;
729 } else {
730 /* one char: convert it to range */
731 add_to_class(ci, rune);
736 /* sort and store only if we are not in comment */
737 if (ci->comment_level == 0) {
738 if (ci->yyc_used&0x01) rcerror(ci, "THE THING THAT SHOULD NOT BE: character class parsing error");
739 if (ci->yyc_used == 0) rcerror(ci, "empty '[]'");
740 normalize_spans(ci);
741 /* if we have only one char in range, we can use RUNE instead */
742 if (type == CCLASS && ci->yyc_used == 2 && ci->yyclass[0] == ci->yyclass[1]) {
743 /* yeah, one char! */
744 ci->yyrune = ci->yyclass[0];
745 return RUNE;
748 return type;
752 /* parse metacharacter */
753 static int bldmeta (re9_compiler_t *ci, re9_rune rune) {
754 int rangeop = (rune >= 'a' && rune <= 'z' ? CCLASS : NCCLASS);
755 ci->yyc_used = 0;
756 switch (rune) {
757 case 'd': case 'D': addmeta_digit(ci, 0); break;
758 case 's': case 'S': addmeta_space(ci, 0); break;
759 case 'w': case 'W': addmeta_word(ci, 0); break;
760 case 'Z': add_to_class_two(ci, '\0', '\0'); break;
761 case 'z': ci->yyrune = 0; return RUNE;
762 case 'b': ci->yyrune = RE9_RUNE_SPEC_WBOUND; return RUNE;
763 case 'B': ci->yyrune = RE9_RUNE_SPEC_WNBOUND; return RUNE;
764 default: rcerror(ci, "invalid metacharacter");
766 return rangeop;
770 /* get token */
771 static int lex (re9_compiler_t *ci) {
772 if (ci->return_rune_nothing) {
773 ci->return_rune_nothing = 0;
774 ci->yyrune = RE9_RUNE_SPEC_NOTHING;
775 } else if (ci->expr >= ci->expr_eol) {
776 ci->yyrune = RE9_RUNE_SPEC_EOL;
777 } else if (ci->flags&RE9_FLAG_LITERAL) {
778 if (ci->expr < ci->expr_eol && ci->expr[0] == '\\') {
779 ++ci->expr;
780 ci->yyrune = '\\';
781 } else {
782 /* can't be quoted */
783 nextc(ci, &ci->yyrune);
785 } else {
786 int quoted = nextc(ci, &ci->yyrune);
787 if (quoted) {
788 /* check for metacharacter */
789 if ((ci->yyrune >= '0' && ci->yyrune <= '9') || (ci->yyrune >= 'a' && ci->yyrune <= 'z') || (ci->yyrune >= 'A' && ci->yyrune <= 'Z')) return bldmeta(ci, ci->yyrune);
790 } else {
791 switch (ci->yyrune) {
792 case '*': quoted = STAR; goto closure;
793 case '?': quoted = QUEST; goto closure;
794 case '+': quoted = PLUS; /* fallthru */
795 closure: if (ci->expr < ci->expr_eol && ci->expr[0] == '?') {
796 #ifndef RE9_DISABLE_NONGREEDY
797 ++ci->expr;
798 quoted |= 0x100;
799 #else
800 if ((ci->flags&RE9_FLAG_NONGREEDY) == 0) rcerror(ci, "non-greedy closure support was disabled at compile time");
801 #endif
803 return quoted;
804 case '|': return ALT;
805 case '.': return (ci->flags&RE9_FLAG_ANYDOT ? ANYNL : ANY);
806 case '(': return LBRA;
807 case ')': return RBRA;
808 case '^': return BOL;
809 case '$': return EOL;
810 case '[': return bldcclass(ci);
814 return (ci->yyrune != RE9_RUNE_SPEC_EOL ? RUNE : END);
818 /******************************************************************************/
819 static inline uint32_t instofs (const re9_compiler_t *ci, const vmopcode_t *op) {
820 return (int32_t)(((const uint8_t *)op)-ci->pp->code);
824 static inline vmopcode_t *instptr (re9_compiler_t *ci, uint32_t ofs) {
825 return (vmopcode_t *)(ci->pp->code+ofs);
829 /* push instructions to 'operand stack' */
830 static void push_operand (re9_compiler_t *ci, vmopcode_t *f, vmopcode_t *l) {
831 if (ci->andp >= &ci->andstack[NSTACK]) rcerror(ci, "THE THING THAT SHOULD NOT BE: operand stack overflow");
832 ci->andp->first = instofs(ci, f);
833 ci->andp->last = instofs(ci, l);
834 ++ci->andp;
838 /* push token and group id to 'operator stack' */
839 static void push_operator (re9_compiler_t *ci, int t, int csi) {
840 if (ci->atorp >= &ci->atorstack[NSTACK]) rcerror(ci, "THE THING THAT SHOULD NOT BE: operator stack overflow");
841 *ci->atorp++ = t;
842 *ci->subidp++ = csi;
846 /* pop instriction from 'operand stack' */
847 static re9_node_t *pop_operand (re9_compiler_t *ci) {
848 if (ci->andp <= &ci->andstack[0]) rcerror(ci, "missing operand");
849 return --ci->andp;
853 /* pop token from 'operator stack' */
854 static int pop_operator (re9_compiler_t *ci) {
855 if (ci->atorp <= &ci->atorstack[0]) rcerror(ci, "THE THING THAT SHOULD NOT BE: operator stack underflow");
856 --ci->subidp;
857 return *--ci->atorp;
861 /******************************************************************************/
862 static void *allot (re9_compiler_t *ci, size_t size) {
863 void *res;
864 if (ci->pp->code_used+size > ci->pp->code_alloted) {
865 uint32_t newsz = ((ci->pp->code_used+size)|0xfff)+1;
866 uint8_t *newc = realloc(ci->pp->code, newsz);
867 if (newc == NULL) rcerror(ci, "out of memory");
868 memset(newc+ci->pp->code_used, 0, newsz-ci->pp->code_used);
869 ci->pp->code = newc;
870 ci->pp->code_alloted = newsz;
872 res = ci->pp->code+ci->pp->code_used;
873 ci->pp->code_used += size;
874 return res;
878 /* allocate new instruction */
879 static inline vmopcode_t *newinst (re9_compiler_t *ci, int t) {
880 vmopcode_t *op = allot(ci, sizeof(*op));
881 op->opcode = t;
882 return op;
886 static vmopcode_t *newinst_class (re9_compiler_t *ci, int t) {
887 vmop_class_t *op = allot(ci, sizeof(*op)+ci->yyc_used*sizeof(op->spi[0]));
888 op->opc.opcode = t;
889 if (ci->yyc_used > 0) memcpy(op->spi, ci->yyclass, ci->yyc_used*sizeof(op->spi[0]));
890 op->spi_count = ci->yyc_used;
891 if (ci->yyc_used&0x01) rcerror(ci, "THE THING THAT SHOULD NOT BE: newinst_class() fucked!");
892 return (vmopcode_t *)op;
896 static vmopcode_t *newinst_rune (re9_compiler_t *ci, re9_rune r) {
897 if (r < RUNE || r > RE9_RUNE_MAX) {
898 vmopcode_t *op = allot(ci, sizeof(*op));
899 switch (r) {
900 case RE9_RUNE_SPEC_NOTHING: op->opcode = NOP; break;
901 case RE9_RUNE_SPEC_WBOUND: op->opcode = WBOUND; break;
902 case RE9_RUNE_SPEC_WNBOUND: op->opcode = WNBOUND; break;
903 default:
904 if (r < RUNE) {
905 op->opcode = r;
906 break;
908 rcerror(ci, "THE THING THAT SHOULD NOT BE: newinst_rune() fucked!");
910 return op;
911 } else {
912 vmop_rune_t *op = allot(ci, sizeof(*op));
913 op->opc.opcode = RUNE;
914 op->r = r;
915 return (vmopcode_t *)op;
920 static vmopcode_t *newinst_mark (re9_compiler_t *ci, int t, int id) {
921 if (id >= 0) {
922 vmop_mark_t *op = allot(ci, sizeof(*op));
923 if (id > 127) rcerror(ci, "too many captures");
924 op->opc.opcode = t;
925 op->subid = id;
926 return (vmopcode_t *)op;
927 } else {
928 vmopcode_t *op = allot(ci, sizeof(*op));
929 op->opcode = NOP;
930 return op;
935 static inline vmop_split_t *newinst_split (re9_compiler_t *ci, int t) {
936 vmop_split_t *op = allot(ci, sizeof(*op));
937 #ifndef RE9_DISABLE_NONGREEDY
938 op->opc.opcode = ((ci->flags&RE9_FLAG_NONGREEDY) == 0 ? (t&0x100 ? SPLIT_NG : SPLIT_G) : (t&0x100 ? SPLIT_G : SPLIT_NG));
939 if ((t&0xff) == STAR) op->opc.opcode = (op->opc.opcode == SPLIT_G ? STAR_G : STAR_NG);
940 #else
941 op->opc.opcode = ((t&0xff) == STAR ? STAR_G : SPLIT_G);
942 #endif
943 return op;
947 static inline vmop_split_t *newinst_split1 (re9_compiler_t *ci, int t) {
948 vmop_split_t *op = allot(ci, sizeof(*op));
949 op->opc.opcode = t;
950 return op;
954 /* process instructions with higher current priority levels */
955 static void evaluntil (re9_compiler_t *ci, int pri) {
956 while (pri == RBRA || (ci->atorp[-1]&0xff) >= (pri&0xff)) {
957 re9_node_t *op1, *op2;
958 vmopcode_t *inst1, *inst2;
959 vmop_split_t *si1;
960 int op = pop_operator(ci);
961 switch (op) {
962 case LBRA: /* must have been RBRA */
963 op1 = pop_operand(ci);
964 inst2 = newinst_mark(ci, EMARK, *ci->subidp);
965 instptr(ci, op1->last)->next = instofs(ci, inst2);
966 inst1 = newinst_mark(ci, SMARK, *ci->subidp);
967 inst1->next = op1->first;
968 push_operand(ci, inst1, inst2);
969 return;
970 case ALT:
971 op2 = pop_operand(ci);
972 op1 = pop_operand(ci);
973 inst2 = newinst(ci, NOP);
974 instptr(ci, op2->last)->next = instptr(ci, op1->last)->next = instofs(ci, inst2);
975 si1 = newinst_split1(ci, SPLIT_G);
976 si1->alt = op1->first;
977 si1->opc.next = op2->first;
978 push_operand(ci, (void *)si1, inst2);
979 break;
980 case CAT:
981 op2 = pop_operand(ci);
982 op1 = pop_operand(ci);
983 instptr(ci, op1->last)->next = op2->first;
984 push_operand(ci, instptr(ci, op1->first), instptr(ci, op2->last));
985 break;
986 case STAR: case STAR|0x100: /* greedy and non-greedy */
987 op2 = pop_operand(ci);
988 si1 = newinst_split(ci, op);
989 instptr(ci, op2->last)->next = instofs(ci, (void *)si1);
990 si1->alt = op2->first;
991 push_operand(ci, (void *)si1, (void *)si1);
992 break;
993 case PLUS: case PLUS|0x100: /* greedy and non-greedy */
994 op2 = pop_operand(ci);
995 si1 = newinst_split(ci, op);
996 instptr(ci, op2->last)->next = instofs(ci, (void *)si1);
997 si1->alt = op2->first;
998 push_operand(ci, instptr(ci, op2->first), (void *)si1);
999 break;
1000 case QUEST: case QUEST|0x100: /* greedy and non-greedy */
1001 op2 = pop_operand(ci);
1002 si1 = newinst_split(ci, op);
1003 inst2 = newinst(ci, NOP);
1004 si1->opc.next = instofs(ci, inst2);
1005 si1->alt = op2->first;
1006 instptr(ci, op2->last)->next = instofs(ci, inst2);
1007 push_operand(ci, (void *)si1, inst2);
1008 break;
1009 default:
1010 rcerror(ci, "unknown operator in evaluntil");
1011 break;
1017 /******************************************************************************/
1018 /* process operator */
1019 static void operator (re9_compiler_t *ci, int t) {
1020 int csi = ci->cursubid;
1021 if (ci->comment_level) {
1022 /* in comment */
1023 if (t == RBRA || t == LBRA) {
1024 ci->nbra += (t == LBRA ? 1 : -1);
1025 ci->comment_level += (t == LBRA ? 1 : -1);
1027 } else {
1028 /* not in comment */
1029 if (t == RBRA && --ci->nbra < 0) rcerror(ci, "unmatched right paren");
1030 if (t == LBRA) {
1031 ++ci->nbra;
1032 csi = -1;
1033 if (ci->expr < ci->expr_eol && ci->expr[0] == '?') {
1034 if (ci->expr+1 < ci->expr_eol && ci->expr[1] == ':') {
1035 ci->expr += 2;
1036 } else if (ci->expr+1 < ci->expr_eol && ci->expr[1] == '#') {
1037 ci->expr += 2;
1038 ci->comment_level = 1;
1039 ci->last_was_operand = RE9_TRUE;
1040 return;
1041 } else {
1042 rcerror(ci, "invalid paren flags");
1044 } else {
1045 if ((csi = ++ci->cursubid) >= RE9_SUBEXP_MAX) rcerror(ci, "too many subexpressions");
1046 ci->pp->nsub = csi;
1048 if (ci->last_was_operand) operator(ci, CAT);
1049 /* process empty parens */
1050 if (ci->expr < ci->expr_eol && ci->expr[0] == ')') ci->return_rune_nothing = 1; /* hack: next lex() will return 'nothing' rune */
1051 } else {
1052 evaluntil(ci, t);
1054 if (t != RBRA) push_operator(ci, t, csi);
1056 switch (t&0xff) {
1057 case STAR: case QUEST: case PLUS: case RBRA:
1058 ci->last_was_operand = RE9_TRUE; /* these look like operands */
1059 break;
1060 default:
1061 ci->last_was_operand = RE9_FALSE;
1062 break;
1067 /* process operand: rune or character class */
1068 static void operand (re9_compiler_t *ci, int t) {
1069 if (ci->comment_level == 0) {
1070 vmopcode_t *i;
1071 if (ci->last_was_operand) operator(ci, CAT); /* concatenate is implicit */
1072 switch (t) {
1073 case CCLASS: case NCCLASS:
1074 i = newinst_class(ci, t);
1075 break;
1076 case RUNE:
1077 i = newinst_rune(ci, ci->yyrune);
1078 break;
1079 default:
1080 i = newinst(ci, t);
1081 break;
1083 push_operand(ci, i, i);
1085 ci->last_was_operand = RE9_TRUE;
1089 #ifndef RE9_DISABLE_LEARNING
1090 /* trace VM code and build 'starting range' */
1091 /* return !0 if we hit 'ANY*' */
1092 static int learn_start_tracer (re9_compiler_t *ci, uint32_t pc) {
1093 for (;;) {
1094 uint8_t opcode = ci->pp->code[pc];
1095 const vmop_class_t *opc;
1096 re9_rune pr;
1097 uint32_t f;
1098 if (opcode < RUNE) {
1099 /* wow, a solid hit */
1100 add_to_class_two(ci, opcode, opcode);
1101 return 0;
1103 switch (opcode) {
1104 case RUNE: /* another solid hit */
1105 add_to_class_two(ci, ((const vmop_rune_t *)(ci->pp->code+pc))->r, ((const vmop_rune_t *)(ci->pp->code+pc))->r);
1106 return 0;
1107 case ANY: case ANYNL: /* shit... */
1108 ci->yyc_used = 0;
1109 add_to_class_two(ci, 0, RE9_RUNE_MAX);
1110 return 1;
1111 case SPLIT_G: case SPLIT_NG: /* choose your way to die */
1112 case STAR_G: case STAR_NG: /* i can do better analyzis here, but i don't care for now */
1113 if (learn_start_tracer(ci, ((const vmop_split_t *)(ci->pp->code+pc))->alt)) return 1; /* no way to go */
1114 break;
1115 case BOL:
1116 ci->pp->startflags |= PRG_STARTS_WITH_BOL;
1117 return 0;
1118 case EOL:
1119 ci->pp->startflags |= PRG_STARTS_WITH_EOL;
1120 return 0;
1121 case CCLASS:
1122 opc = ((const vmop_class_t *)(ci->pp->code+pc));
1123 for (f = 0; f < opc->spi_count; ++f) add_to_class(ci, opc->spi[f]);
1124 return 0;
1125 case NCCLASS:
1126 /* oh, this is harder than previous one */
1127 /* we realy on the fact that range was sorted and there are gaps between ranges */
1128 opc = ((const vmop_class_t *)(ci->pp->code+pc));
1129 if (opc->spi[0] > 0) add_to_class_two(ci, 0, opc->spi[0]-1);
1130 pr = opc->spi[1]+1;
1131 for (f = 2; f < opc->spi_count && pr <= RE9_RUNE_MAX; f += 2) {
1132 add_to_class_two(ci, pr, opc->spi[f]-1);
1133 pr = opc->spi[f+1]+1;
1135 if (pr <= RE9_RUNE_MAX) add_to_class_two(ci, pr, RE9_RUNE_MAX);
1136 return 0;
1137 case END:
1138 /*return 0;*/
1139 ci->yyc_used = 0;
1140 add_to_class_two(ci, 0, RE9_RUNE_MAX);
1141 return 1;
1142 /* ignore all other shit */
1144 pc = instptr(ci, pc)->next;
1149 static void learn_start (re9_compiler_t *ci) {
1150 int any;
1151 ci->yyc_used = 0;
1152 ci->pp->startflags = 0;
1153 ci->pp->startrange = 0;
1154 ci->pp->startrune = 0;
1155 any = learn_start_tracer(ci, ci->pp->startinst);
1156 if (any || ci->yyc_used == 0) {
1157 ci->pp->startflags = PRG_STARTS_WITH_ANY;
1158 //fprintf(stderr, "PRG_STARTS_WITH_ANY\n");
1159 } else {
1160 /* generate CCLASS */
1161 normalize_spans(ci);
1162 if (ci->yyc_used == 2 && ci->yyclass[0] == ci->yyclass[1]) {
1163 /* simple RUNE will be enough */
1164 ci->pp->startflags |= PRG_STARTS_WITH_RUNE;
1165 ci->pp->startrune = ci->yyclass[0];
1166 //fprintf(stderr, "PRG_STARTS_WITH_RUNE: %u\n", ci->pp->startrune);
1167 } else {
1168 ci->pp->startflags |= PRG_STARTS_WITH_CLASS;
1169 ci->pp->startrange = instofs(ci, newinst_class(ci, CCLASS));
1170 //fprintf(stderr, "PRG_STARTS_WITH_CLASS\n ");
1171 #ifdef RE9_DEBUG_PRG
1172 //dump_opcode(stderr, ci->pp, ci->pp->startrange);
1173 #endif
1177 #endif
1180 /*TODO:??? collapse some things, get rid of nops? */
1181 static void optimize (re9_compiler_t *ci) {
1182 re9_prog_t *pp = ci->pp;
1183 uint32_t pc;
1184 /* */
1185 #ifndef RE9_DISABLE_LEARNING
1186 learn_start(ci);
1187 #endif
1188 /* get rid of NOOP chains */
1189 for (pc = 0; pc < pp->code_used; pc += vmop_size(pp->code+pc)) {
1190 vmopcode_t *inst = instptr(ci, pc), *target;
1191 target = instptr(ci, inst->next);
1192 while (target->opcode == NOP) target = instptr(ci, target->next);
1193 if (inst->opcode == STAR_G) inst->opcode = SPLIT_G;
1194 else if (inst->opcode == STAR_NG) inst->opcode = SPLIT_NG;
1195 inst->next = instofs(ci, target);
1196 if (inst->opcode == SPLIT_NG || inst->opcode == SPLIT_G) {
1197 vmop_split_t *op = (vmop_split_t *)inst;
1198 target = instptr(ci, op->alt);
1199 while (target->opcode == NOP) target = instptr(ci, target->next);
1200 op->alt = instofs(ci, target);
1201 if (inst->opcode == SPLIT_NG) ci->pp->flags |= RE9_FLAG_NONGREEDY;
1204 /* move startinst on non-NOOP */
1205 while (instptr(ci, pp->startinst)->opcode == NOP) pp->startinst = instptr(ci, pp->startinst)->next;
1206 /* shrink code area */
1207 if (pc > pp->code_alloted) {
1208 uint8_t *newc;
1209 pp->code_used = pp->code_alloted = pc;
1210 newc = realloc(pp->code, pc);
1211 if (newc != NULL) pp->code = newc;
1216 re9_prog_t *re9_compile_ex (const char *s, const char *eol, int flags, const char **errmsg) {
1217 int token;
1218 re9_compiler_t * volatile ci;
1219 re9_prog_t *pres;
1220 if (s == NULL) {
1221 if (errmsg != NULL) *errmsg = "can't parse NULL regexp";
1222 return NULL;
1224 if (eol == NULL) eol = s+strlen(s); else if (eol < s) eol = s;
1225 if ((ci = malloc(sizeof(*ci))) == NULL) {
1226 if (errmsg != NULL) *errmsg = "out of memory";
1227 return NULL;
1229 memset(ci, 0, sizeof(*ci));
1230 /* get memory for the program */
1231 ci->pp = malloc(sizeof(*ci->pp));
1232 if (ci->pp == NULL) {
1233 free(ci);
1234 if (errmsg != NULL) *errmsg = "out of memory";
1235 return NULL;
1237 memset(ci->pp, 0, sizeof(*ci->pp));
1238 //ci->pp->nsub = 0;
1239 ci->pp->flags = (flags&RE9_FLAG_CASEINSENS);
1240 /* */
1241 ci->flags = flags;
1242 if (errmsg != NULL) *errmsg = NULL;
1243 if (setjmp(ci->regkaboom)) {
1244 if (errmsg != NULL) *errmsg = ci->errormsg;
1245 if (ci->yyclass != NULL) free(ci->yyclass);
1246 if (ci->pp->code != NULL) free(ci->pp->code);
1247 free(ci->pp);
1248 free(ci);
1249 return NULL;
1251 /* go compile the sucker */
1252 ci->expr = s;
1253 ci->expr_eol = eol;
1254 ci->nbra = 0;
1255 ci->atorp = ci->atorstack;
1256 ci->andp = ci->andstack;
1257 ci->subidp = ci->subidstack;
1258 ci->last_was_operand = RE9_FALSE;
1259 ci->cursubid = 0;
1260 /* start with a low priority operator to prime parser */
1261 push_operator(ci, START-1, ci->cursubid);
1262 /* offset 0 should be NOOP */
1264 vmopcode_t *op = newinst(ci, NOP);
1265 op->next = ci->pp->code_used;
1267 while ((token = lex(ci)) != END) {
1268 if ((token&0300) == OPERATOR) operator(ci, token); else operand(ci, token);
1270 if (ci->comment_level == 0) {
1271 /* close with a low priority operator */
1272 evaluntil(ci, START);
1273 /* force END */
1274 operand(ci, END);
1275 evaluntil(ci, START);
1277 #ifdef RE9_DEBUG_PRG
1278 //if (ci->flags&RE9_FLAG_DUMPPRG) dumpstack(ci);
1279 #endif
1280 if (ci->nbra) rcerror(ci, "unmatched left paren");
1281 --ci->andp; /* points to first and only operand */
1282 ci->pp->startinst = ci->andp->first;
1283 #ifdef RE9_DEBUG_PRG
1284 if (ci->flags&RE9_FLAG_DUMPPRG) {
1285 fprintf(stderr, "=== unoptimized ===\n");
1286 fprintf(stderr, "start: %u\n", ci->pp->startinst);
1287 dump(ci->pp);
1289 #endif
1290 optimize(ci);
1291 #ifdef RE9_DEBUG_PRG
1292 if (ci->flags&RE9_FLAG_DUMPPRG) {
1293 fprintf(stderr, "=== optimized ===\n");
1294 fprintf(stderr, "start: %u\n", ci->pp->startinst);
1295 dump(ci->pp);
1296 fprintf(stderr, "------\n");
1298 #endif
1299 pres = ci->pp;
1300 if (ci->yyclass != NULL) free(ci->yyclass);
1301 free(ci);
1302 ++pres->nsub;
1303 return pres;
1307 re9_prog_t *re9_compile (const char *s, int flags, const char **errmsg) {
1308 return re9_compile_ex(s, NULL, flags, errmsg);
1312 int re9_nsub (const re9_prog_t *p) {
1313 return (p != NULL ? p->nsub : 0);
1317 void re9_free (re9_prog_t *p) {
1318 if (p != NULL) {
1319 if (p->code != NULL) free(p->code);
1320 free(p);
1325 /******************************************************************************/
1326 /* executor */
1327 #define LASTINST (0)
1330 /* substitution list */
1331 typedef struct {
1332 re9_sub_t m[RE9_SUBEXP_MAX];
1333 } re9_sublist_t;
1337 * regexec execution lists
1339 #define LISTSIZE (16)
1340 #define BIGLISTSIZE (64*LISTSIZE)
1342 typedef struct {
1343 uint32_t inst; /* instruction of the thread */
1344 re9_sublist_t se; /* matched subexpressions in this thread */
1345 } re9_listitem_t;
1348 typedef struct {
1349 re9_listitem_t *start;
1350 re9_listitem_t *free;
1351 re9_listitem_t *end;
1352 } re9_list_t;
1355 typedef struct {
1356 re9_listitem_t *relist[2], *relistf[2];
1357 int listsize;
1358 const char *bol;
1359 const char *eol;
1360 int flags;
1361 } re9_ljunk_t;
1364 /* save a new match in mp */
1365 static inline void re9l_newmatch (re9_sub_t *mp, int ms, re9_sublist_t *sp) {
1366 if (mp == NULL || ms <= 0) return;
1367 if (mp[0].sp == NULL || sp->m[0].sp < mp[0].sp || (sp->m[0].sp == mp[0].sp && sp->m[0].ep > mp[0].ep)) {
1368 int i;
1369 for (i = 0; i < ms && i < RE9_SUBEXP_MAX; ++i) mp[i] = sp->m[i];
1370 for (; i < ms; ++i) mp[i].sp = mp[i].ep = 0;
1376 * Note optimization in re9l_newthread:
1377 * *lp must be pending when re9l_newthread called; if *lp has been looked at already, the optimization is a bug
1380 * lp: list to add to
1381 * ip: instruction to add
1382 * ms: number of elements at mp
1383 * sep: pointers to subexpressions
1385 static inline re9_listitem_t *re9l_newthread (re9_list_t *lp, uint32_t ip, int ms, re9_sublist_t *sep, int nocheck) {
1386 re9_listitem_t *p;
1387 if (!nocheck) {
1388 for (p = lp->start; p->inst != LASTINST; ++p) {
1389 if (p->inst == ip) {
1390 dlogf("newthread:%p; found at %u; sep->m[0].sp=%p; p->se.m[0].sp=%p\n", lp, p-lp->start, sep->m[0].sp, p->se.m[0].sp);
1391 if (sep->m[0].sp < p->se.m[0].sp) {
1392 if (ms > 1) p->se = *sep; else p->se.m[0] = sep->m[0];
1394 return p;
1398 p = lp->free++;
1399 if (p >= lp->end-1) return NULL;
1400 dlogf("newthread:%p; new at %u(%u)\n", lp->start, p-lp->start, ip);
1401 p->inst = ip;
1402 if (ms > 1) p->se = *sep; else p->se.m[0] = sep->m[0];
1403 p[1].inst = LASTINST;
1404 return p;
1409 * same as renewthread, but called with initial empty start pointer
1412 * lp: list to add to
1413 * ip: instruction to add
1414 * ms: number of elements at mp
1415 * sp: pointers to subexpressions
1417 static inline re9_listitem_t *re9l_newemptythread (re9_list_t *lp, uint32_t ip, int ms, const char *sp, int nocheck) {
1418 re9_listitem_t *p;
1419 if (!nocheck) {
1420 for (p = lp->start; p->inst != LASTINST; ++p) {
1421 if (p->inst == ip) {
1422 dlogf("newemptythread:%p; found at %u; sp=%p; p->se.m[0].sp=%p\n", lp, p-lp->start, sp, p->se.m[0].sp);
1423 if (sp < p->se.m[0].sp) {
1424 if (ms > 1) memset(&p->se, 0, sizeof(p->se));
1425 p->se.m[0].sp = sp;
1427 return p;
1431 p = lp->free++;
1432 if (p >= lp->end-1) return NULL;
1433 dlogf("newemptythread:%p; new at %u(%u)\n", lp->start, p-lp->start, ip);
1434 p->inst = ip;
1435 if (ms > 1) memset(&p->se, 0, sizeof(p->se));
1436 p->se.m[0].sp = sp;
1437 p[1].inst = LASTINST;
1438 return p;
1442 static inline int is_word_rune (re9_rune r) {
1443 return
1444 (r >= '0' && r <= '9') ||
1445 (r >= 'A' && r <= 'Z') ||
1446 (r >= 'a' && r <= 'z') ||
1447 (r >= 128 && r <= RE9_RUNE_MAX) ||
1448 r == '_';
1453 * return 0 if no match
1454 * >0 if a match
1455 * <0 if we ran out of list space
1458 * progp: program to run
1459 * bol: string to run machine on
1460 * mp: subexpression elements
1461 * ms: number of elements at mp
1464 static int regexec1 (const re9_prog_t *progp, re9_sub_t *mp, int ms, re9_ljunk_t *j) {
1465 int flag = 0;
1466 #ifdef RE9_CALC_MAXT
1467 int maxt = 0;
1468 #endif
1469 const char *s;
1470 #ifndef RE9_DISABLE_LEARNING
1471 const char *p;
1472 int checkstart;
1473 #endif
1474 re9_rune rprev, r;
1475 int rune_size;
1476 re9_list_t tl, nl; /* this list, next list */
1477 re9_listitem_t *tlp, *tlxn;
1478 int match, i, flags;
1479 uint32_t f;
1480 const vmop_class_t *opc;
1481 int dont_collapse = 0;
1482 /* */
1483 flags = (progp->flags&(RE9_FLAG_CASEINSENS|RE9_FLAG_NONGREEDY))|(j->flags&(RE9_FLAG_NONUTF8|RE9_FLAG_ANYDOT));
1484 //dont_collapse = (flags&RE9_FLAG_NONGREEDY);
1485 match = 0;
1486 #ifndef RE9_DISABLE_LEARNING
1487 checkstart = (progp->startflags != PRG_STARTS_WITH_ANY);
1488 #endif
1489 if (mp != NULL) {
1490 for (i = 0; i < ms; ++i) mp[i].sp = mp[i].ep = NULL;
1491 } else {
1492 ms = 0;
1494 if (ms < 0) ms = 0; else if (ms > RE9_SUBEXP_MAX) ms = RE9_SUBEXP_MAX;
1495 j->relist[0][0].inst = j->relist[1][0].inst = LASTINST;
1496 j->relistf[0] = j->relist[0];
1497 j->relistf[1] = j->relist[1];
1498 /* execute machine once for each character, including terminal NUL */
1499 s = j->bol;
1500 rprev = RE9_RUNE_SPEC_BOL;
1501 for (;;) {
1502 /* fast check for first char */
1503 #ifndef RE9_DISABLE_LEARNING
1504 if (checkstart) {
1505 dlogf("checkstart! 0x%02x\n", progp->startflags);
1506 switch (progp->startflags) {
1507 /* simple cases */
1508 case PRG_STARTS_WITH_BOL:
1509 if (s == j->bol) break;
1510 if (s == j->eol) goto mt_quit;
1511 p = memchr(s, '\n', j->eol-s);
1512 if (p == NULL || p+1 > j->eol) goto mt_quit;
1513 s = p+1;
1514 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1515 break;
1516 case PRG_STARTS_WITH_EOL:
1517 if (s == j->eol) goto mt_quit;
1518 p = memchr(s, '\n', j->eol-s);
1519 if (p == NULL) p = j->eol;
1520 s = p;
1521 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1522 break;
1523 case PRG_STARTS_WITH_RUNE:
1524 if ((flags&RE9_FLAG_CASEINSENS) == 0) {
1525 /* case-sensitive */
1526 if (progp->startrune < 128 || (flags&RE9_FLAG_NONUTF8)) {
1527 if (progp->startrune > 255) goto mt_quit; /* can't be found */
1528 p = memchr(s, progp->startrune, j->eol-s);
1529 } else {
1530 p = re8_rune_strchr(s, progp->startrune, j->eol);
1532 } else {
1533 /* case-insensitive */
1534 if (progp->startrune < 128 || (flags&RE9_FLAG_NONUTF8)) {
1535 if (progp->startrune > 255) goto mt_quit; /* can't be found */
1536 for (p = s; p < j->eol; ++p) if (UPPER(((unsigned char)*p)) == progp->startrune) break;
1537 } else {
1538 p = s;
1539 while (p < j->eol) {
1540 rune_size = re9_char2rune(&r, p, j->eol);
1541 if (UPPER(r) == progp->startrune) break;
1542 p += rune_size;
1546 if (p == NULL || p > j->eol) goto mt_quit;
1547 s = p;
1548 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1549 break;
1550 case PRG_STARTS_WITH_CLASS:
1551 opc = (const vmop_class_t *)(progp->code+progp->startrange);
1552 while (s < j->eol) {
1553 r = *(const unsigned char *)s;
1554 rune_size = (r < RE9_RUNE_SELF || (flags&RE9_FLAG_NONUTF8) ? 1 : re9_char2rune(&r, s, j->eol));
1555 if (flags&RE9_FLAG_CASEINSENS) r = UPPER(r);
1556 for (f = 0; f < opc->spi_count; f += 2) if (r >= opc->spi[f] && r <= opc->spi[f+1]) break;
1557 if (f < opc->spi_count) break;
1558 s += rune_size;
1560 if (s >= j->eol) goto mt_quit;
1561 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1562 break;
1563 default:
1564 /* difficult cases */
1565 if ((progp->startflags&PRG_STARTS_WITH_BOL) && s == j->bol) goto difficult_found;
1566 if ((progp->startflags&PRG_STARTS_WITH_EOL) && s == j->eol) goto difficult_found;
1567 opc = (const vmop_class_t *)(progp->code+progp->startrange);
1568 while (s < j->eol) {
1569 if (*s == '\n') {
1570 if (progp->startflags&PRG_STARTS_WITH_BOL) { ++s; break; }
1571 if (progp->startflags&PRG_STARTS_WITH_EOL) break;
1573 if (progp->startflags&(PRG_STARTS_WITH_RUNE|PRG_STARTS_WITH_CLASS)) {
1574 r = *(const unsigned char *)s;
1575 rune_size = (r < RE9_RUNE_SELF || (flags&RE9_FLAG_NONUTF8) ? 1 : re9_char2rune(&r, s, j->eol));
1576 if (flags&RE9_FLAG_CASEINSENS) r = UPPER(r);
1577 if (progp->startflags&PRG_STARTS_WITH_RUNE) {
1578 if (r == progp->startrune) break;
1579 } else {
1580 for (f = 0; f < opc->spi_count; f += 2) if (r >= opc->spi[f] && r <= opc->spi[f+1]) break;
1581 if (f < opc->spi_count) break;
1583 s += rune_size;
1584 } else {
1585 ++s;
1588 if (s > j->eol) goto mt_quit;
1589 difficult_found:
1590 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1591 break;
1594 #endif
1595 if (s < j->eol) {
1596 r = *(const unsigned char *)s;
1597 rune_size = (r < RE9_RUNE_SELF || (flags&RE9_FLAG_NONUTF8) ? 1 : re9_char2rune(&r, s, j->eol));
1598 if (flags&RE9_FLAG_CASEINSENS) r = UPPER(r);
1599 } else {
1600 r = RE9_RUNE_SPEC_EOL;
1601 rune_size = 0;
1603 /* switch run lists */
1604 tl.start = j->relist[flag];
1605 tl.end = j->relist[flag]+j->listsize;
1606 tl.free = j->relistf[flag];
1607 nl.start = j->relist[flag^=1];
1608 nl.end = j->relist[flag]+j->listsize;
1609 nl.free = nl.start;
1610 nl.start->inst = LASTINST;
1611 dlogf("*** new loop (flag=%d; mt=%d) %p\n", flag, match, tl.start);
1612 /* add first instruction to current list */
1613 if (match == 0) {
1614 if ((tlxn = re9l_newemptythread(&tl, progp->startinst, ms, s, dont_collapse)) == NULL) return -1;
1616 tlxn = tl.free;
1617 #ifdef RE9_DEBUG
1618 dlogf(" tlxn=%u\n", tlxn-tl.start);
1619 for (tlp = tl.start; tlp < tl.free; ++tlp) dlogf("%d: %u\n", tlp-tl.start, tlp->inst);
1620 #endif
1621 /* execute machine until current list is empty */
1622 if (tl.start->inst == LASTINST) break; /* nothing to do */
1623 #ifdef RE9_CALC_MAXT
1624 if (tl.free-tl.start > maxt) maxt = tl.free-tl.start;
1625 #endif
1626 for (tlp = tl.start; tlp->inst != LASTINST; ++tlp) {
1627 uint32_t ip;
1628 dlogf("* new thread (%d) at %u\n", flag, tlp-tl.start);
1629 #ifdef RE9_CALC_MAXT
1630 if (tlp-tl.start > maxt) maxt = tlp-tl.start;
1631 #endif
1632 for (ip = tlp->inst; ip != LASTINST; ip = ((const vmopcode_t *)(progp->code+ip))->next) {
1633 const vmopcode_t *op = (const vmopcode_t *)(progp->code+ip);
1634 #ifdef RE9_DEBUG
1635 dump_opcode(stderr, progp, ip);
1636 #endif
1637 if (op->opcode < RUNE) {
1638 /* ascii character */
1639 if (op->opcode == r) {
1640 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1642 } else {
1643 switch (op->opcode) {
1644 case RUNE: /* regular character */
1645 if (((const vmop_rune_t *)op)->r == r) {
1646 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1648 break;
1649 case SMARK:
1650 tlp->se.m[((const vmop_mark_t *)op)->subid].sp = s;
1651 continue;
1652 case EMARK:
1653 tlp->se.m[((const vmop_mark_t *)op)->subid].ep = s;
1654 continue;
1655 case ANY:
1656 if ((flags&RE9_FLAG_ANYDOT) == 0 && r == '\n') break;
1657 /* fallthru */
1658 case ANYNL:
1659 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1660 break;
1661 case BOL:
1662 if (s == j->bol || rprev == RE9_RUNE_SPEC_BOL || rprev == '\n') continue;
1663 break;
1664 case EOL:
1665 if (s == j->eol || r == '\n') continue;
1666 break;
1667 case WBOUND:
1668 if (is_word_rune(rprev) != is_word_rune(r)) continue;
1669 break;
1670 case WNBOUND:
1671 if (is_word_rune(rprev) == is_word_rune(r)) continue;
1672 break;
1673 case CCLASS:
1674 opc = (const vmop_class_t *)op;
1675 for (f = 0; f < opc->spi_count; f += 2) {
1676 if (r >= opc->spi[f] && r <= opc->spi[f+1]) {
1677 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1678 break;
1681 break;
1682 case NCCLASS:
1683 opc = (const vmop_class_t *)op;
1684 for (f = 0; f < opc->spi_count; f += 2) if (r >= opc->spi[f] && r <= opc->spi[f+1]) break;
1685 if (f >= opc->spi_count) {
1686 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1688 break;
1689 #ifndef RE9_DISABLE_NONGREEDY
1690 case SPLIT_NG:
1691 if (re9l_newthread(&tl, op->next, ms, &tlp->se, 1) == NULL) return -1;
1692 if (re9l_newthread(&tl, ((const vmop_split_t *)op)->alt, ms, &tlp->se, 1) == NULL) return -1;
1693 break;
1694 case SPLIT_G:
1695 if (re9l_newthread(&tl, ((const vmop_split_t *)op)->alt, ms, &tlp->se, 1) == NULL) return -1;
1696 if (flags&RE9_FLAG_NONGREEDY) {
1697 if (re9l_newthread(&tl, op->next, ms, &tlp->se, 1) == NULL) return -1;
1698 } else {
1699 continue;
1701 break;
1702 #else
1703 case SPLIT_G:
1704 if (re9l_newthread(&tl, ((const vmop_split_t *)op)->alt, ms, &tlp->se, 1) == NULL) return -1;
1705 continue;
1706 break;
1707 #endif
1708 case END: /* match! */
1709 match = 1;
1710 tlp->se.m[0].ep = s;
1711 if (mp != 0) re9l_newmatch(mp, ms, &tlp->se);
1712 dlogf("HIT at %u! (%d,%d) (%s)\n", tlp-tl.start, tlp->se.m[0].sp-j->bol, tlp->se.m[0].ep-j->bol, (tlp < tlxn ? "old" : "new"));
1713 #ifndef RE9_DISABLE_NONGREEDY
1714 if (flags&RE9_FLAG_NONGREEDY) {
1715 /* fuck off all 'old' instructions */
1716 if ( tlp < tlxn) {
1717 /* we are in 'old' area, skip it all */
1718 tlp = tlxn-1; /* next one will be 'new' */
1719 } else {
1720 /* we are in 'new' area, stop right here */
1721 tlp[1].inst = LASTINST;
1724 #endif
1725 break;
1726 #ifdef RE9_DEBUG
1727 default:
1728 fprintf(stderr, "lebre9: INTRNAL ERROR: invalid command: 0%03o\n", op->opcode);
1729 #endif
1730 } /* switch */
1731 } /* if */
1732 break;
1735 j->relistf[flag] = nl.free; /* update 'free' pointer */
1736 dlogf("+ free=%u; sz=%d\n", nl.free-nl.start, j->listsize);
1737 if (s == j->eol) break;
1738 #ifndef RE9_DISABLE_LEARNING
1739 checkstart = (progp->startflags != PRG_STARTS_WITH_ANY && nl.start->inst == LASTINST && !match);
1740 #endif
1741 rprev = r;
1742 s += rune_size;
1743 dlogf("+++ (%d) eol=%d\n", r, s == j->eol);
1745 #ifndef RE9_DISABLE_LEARNING
1746 mt_quit:
1747 #endif
1748 #ifdef RE9_CALC_MAXT
1749 fprintf(stderr, "MAXT=%d\n", maxt);
1750 #endif
1751 return match;
1755 static int re9_exec_internal (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms, re9_listitem_t *rl0, re9_listitem_t *rl1, size_t listsz) {
1756 re9_ljunk_t j;
1757 /* use user-specified starting/ending location if specified */
1758 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) {
1759 j.bol = (mp->sp != NULL ? mp->sp : bol);
1760 j.eol = (mp->ep != NULL ? mp->ep : j.bol+strlen(j.bol));
1761 } else {
1762 j.eol = (j.bol = bol)+strlen(bol);
1764 j.flags = flags;
1765 j.listsize = listsz;
1766 j.relist[0] = rl0;
1767 j.relist[1] = rl1;
1768 return regexec1(progp, mp, ms, &j);
1772 #ifdef REGEXP9_DEBUG_MEMSIZE
1773 int re9_memused;
1774 #endif
1778 * progp: program to run
1779 * bol: string to run machine on
1780 * mp: subexpression elements
1781 * ms: number of elements at mp
1783 int re9_execute_ex (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms, size_t maxmem) {
1784 re9_listitem_t relist[2][LISTSIZE];
1785 const char *b, *e;
1786 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { b = mp[0].sp; e = mp[0].ep; } else { b = e = NULL; }
1787 #ifdef REGEXP9_DEBUG_MEMSIZE
1788 re9_memused = LISTSIZE;
1789 #endif
1790 int rv = re9_exec_internal(progp, flags, bol, mp, ms, relist[0], relist[1], LISTSIZE);
1791 if (rv < 0) {
1792 re9_listitem_t *r[2];
1793 int listsize = BIGLISTSIZE, rv;
1794 if (maxmem == 0) maxmem = 1024*1024*32; /* up to 32MB */
1795 for (;;) {
1796 int f;
1797 for (f = 0; f < 2; ++f) {
1798 if ((r[f] = malloc(listsize*sizeof(re9_listitem_t))) == NULL) {
1799 while (--f >= 0) free(r[f]);
1800 return -1;
1803 #ifdef REGEXP9_DEBUG_MEMSIZE
1804 re9_memused = listsize;
1805 #endif
1806 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { mp[0].sp = b; mp[0].ep = e; }
1807 rv = re9_exec_internal(progp, flags, bol, mp, ms, r[0], r[1], listsize);
1808 while (--f >= 0) free(r[f]);
1809 if (rv >= 0) break;
1810 if (listsize*2*sizeof(re9_listitem_t)*2 > maxmem) {
1811 size_t nsz = maxmem/(sizeof(re9_listitem_t)*2);
1812 if (nsz <= listsize) break;
1813 listsize = nsz;
1814 } else {
1815 listsize *= 2;
1818 #ifdef RE9_CALC_MAXT
1819 if (rv < 0) fprintf(stderr, "OUT OF MEMORY! listsize=%d\n", listsize/2);
1820 #endif
1822 return rv;
1826 int re9_execute (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms) {
1827 return re9_execute_ex(progp, flags, bol, mp, ms, 0);
1831 /******************************************************************************/
1832 struct re9_prog_prepared_s {
1833 const re9_prog_t *progp;
1834 re9_listitem_t *relist[2];
1835 size_t listsize;
1836 size_t maxmem;
1840 re9_prog_prepared_t *re9_prepare_ex (const re9_prog_t *progp, size_t maxmem) {
1841 re9_prog_prepared_t *res;
1842 int f;
1843 if (progp == NULL) return NULL;
1844 if ((res = malloc(sizeof(*res))) == NULL) return NULL;
1845 if (maxmem == 0) maxmem = 1024*1024*32; /* up to 32MB */
1846 res->progp = progp;
1847 res->listsize = BIGLISTSIZE;
1848 res->maxmem = maxmem;
1849 for (f = 0; f < 2; ++f) {
1850 if ((res->relist[f] = malloc(res->listsize*sizeof(re9_listitem_t))) == NULL) {
1851 while (--f >= 0) free(res->relist[f]);
1852 free(res);
1853 return NULL;
1856 return res;
1860 re9_prog_prepared_t *re9_prepare (const re9_prog_t *progp) {
1861 return re9_prepare_ex(progp, 0);
1865 int re9_prepared_execute (re9_prog_prepared_t *pp, int flags, const char *bol, re9_sub_t *mp, int ms) {
1866 if (pp != NULL) {
1867 int rv, f;
1868 const char *b, *e;
1869 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { b = mp[0].sp; e = mp[0].ep; } else { b = e = NULL; }
1870 for (;;) {
1871 rv = re9_exec_internal(pp->progp, flags, bol, mp, ms, pp->relist[0], pp->relist[1], pp->listsize);
1872 if (rv >= 0 || pp->listsize*sizeof(re9_listitem_t)*2 >= pp->maxmem) break;
1873 if (pp->listsize*2*sizeof(re9_listitem_t)*2 > pp->maxmem) {
1874 size_t nsz = pp->maxmem/(sizeof(re9_listitem_t)*2);
1875 if (nsz <= pp->listsize) break;
1876 pp->listsize = nsz;
1877 } else {
1878 pp->listsize *= 2;
1880 for (f = 0; f < 2; ++f) {
1881 re9_listitem_t *r = realloc(pp->relist[f], pp->listsize*sizeof(re9_listitem_t));
1882 if (r == NULL) break;
1883 pp->relist[f] = r;
1885 if (f < 2) { pp->listsize /= 2; rv = -1; break; }
1886 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { mp[0].sp = b; mp[0].ep = e; }
1888 #ifdef RE9_CALC_MAXT
1889 if (rv < 0) fprintf(stderr, "OUT OF MEMORY! listsize=%u\n", pp->listsize);
1890 #endif
1891 if (pp->listsize > 32767) {
1892 int f;
1893 #ifdef RE9_CALC_MAXT
1894 fprintf(stderr, "*SHRINK: %u --> 32768\n", pp->listsize);
1895 #endif
1896 pp->listsize = 32768;
1897 for (f = 0; f < 2; ++f) {
1898 re9_listitem_t *r = realloc(pp->relist[f], pp->listsize*sizeof(re9_listitem_t));
1899 if (r == NULL) break;
1900 pp->relist[f] = r;
1903 return rv;
1905 return -1;
1909 void re9_prepared_free (re9_prog_prepared_t *pp) {
1910 if (pp != NULL) {
1911 if (pp->relist[1] != NULL) free(pp->relist[1]);
1912 if (pp->relist[0] != NULL) free(pp->relist[0]);
1913 free(pp);
1918 /******************************************************************************/
1919 /* substitute into one string using the matches from the last re9_execute() */
1920 int re9_sub (char *dp, size_t dlen, const char *sp, const re9_sub_t *mp, int ms) {
1921 int res = 0;
1922 char *dlast;
1923 if (mp == NULL || ms <= 0) ms = 0;
1924 if (dp == NULL) dlen = 0;
1925 dlast = (dlen > 0 ? dp+dlen-1 : NULL);
1926 while (*sp) {
1927 if (*sp == '\\' && (sp[1] == '{' || (sp[1] >= '0' && sp[1] <= '9'))) {
1928 int i = sp[1]-'0';
1929 if (i > 9) {
1930 /* parse {...} */
1931 i = 0;
1932 sp += 2;
1933 while (*sp && *sp != '}') {
1934 if (*sp < '0' || *sp > '9') break;
1935 i = i*10+sp[0]-'0';
1936 ++sp;
1937 if (i > ms) break;
1939 while (*sp && *sp != '}') ++sp;
1940 if (*sp == '}') ++sp;
1941 } else {
1942 sp += 2;
1944 if (i < ms && mp[i].sp != NULL) {
1945 const char *ssp = mp[i].sp;
1946 int len = (long)(mp[i].ep-ssp); /*FIXME: 2GB strings?*/
1947 if (len > 0) {
1948 res += len;
1949 if (len > dlen) len = dlen;
1950 if (len > 0) memcpy(dp, ssp, len);
1951 dlen -= len;
1952 dp += len;
1955 } else {
1956 if (*sp == '\\') {
1957 if (dlen > 0) {
1958 *dp++ = (sp[1] == 'z' ? 0 : sp[1]);
1959 --dlen;
1961 sp += 2;
1962 } else {
1963 if (dlen > 0) { *dp++ = *sp++; --dlen; } else ++sp;
1965 ++res;
1968 if (dlen > 0) *dp = '\0'; else if (dlast != NULL) *dlast = '\0';
1969 return ++res;
1973 /******************************************************************************/
1974 static inline int fromhex (char ch) {
1975 return
1976 (ch >= '0' && ch <= '9' ? ch-'0' :
1977 ch >= 'A' && ch <= 'F' ? ch-'A'+10 :
1978 ch >= 'a' && ch <= 'f' ? ch-'a'+10 :
1979 -1);
1983 /* substitute into one string using the matches from the last re9_execute() */
1984 int re9_subst (char *dp, size_t dlen, const char *sp, const re9_sub_t *mp, int ms) {
1985 int res = 0;
1986 char *dlast;
1987 if (mp == NULL || ms <= 0) ms = 0;
1988 if (dp == NULL) dlen = 0;
1989 dlast = (dlen > 0 ? dp+dlen-1 : NULL);
1990 while (*sp) {
1991 if (*sp == '$' && (sp[1] == '{' || (sp[1] >= '0' && sp[1] <= '9'))) {
1992 int i = sp[1]-'0';
1993 if (i > 9) {
1994 /* parse {...} */
1995 i = 0;
1996 sp += 2;
1997 while (*sp && *sp != '}') {
1998 if (*sp < '0' || *sp > '9') break;
1999 i = i*10+sp[0]-'0';
2000 ++sp;
2001 if (i > ms) break;
2003 while (*sp && *sp != '}') ++sp;
2004 if (*sp == '}') ++sp;
2005 } else {
2006 sp += 2;
2008 if (i < ms && mp[i].sp != NULL) {
2009 const char *ssp = mp[i].sp;
2010 int len = (long)(mp[i].ep-ssp); /*FIXME: 2GB strings?*/
2011 if (len > 0) {
2012 res += len;
2013 if (len > dlen) len = dlen;
2014 if (len > 0) memcpy(dp, ssp, len);
2015 dlen -= len;
2016 dp += len;
2019 } else {
2020 int ch = *sp++;
2021 if (ch == '\\') {
2022 switch (*sp++) {
2023 case 0: --sp; ch = '\\'; break;
2024 case 'e': ch = 27; break;
2025 case 'n': ch = '\n'; break;
2026 case 'r': ch = '\r'; break;
2027 case 't': ch = '\t'; break;
2028 case 'z': ch = 0; break;
2029 case 'x': /* hex code */
2030 ch = fromhex(*sp);
2031 if (ch >= 0) {
2032 int n;
2033 ++sp;
2034 n = fromhex(*sp);
2035 if (n >= 0) { ch = ch*16+n; ++sp; }
2036 } else {
2037 ch = 'x';
2039 break;
2040 default: ch = sp[-1]; break;
2043 if (dlen > 0) { *dp++ = ch; --dlen; }
2044 ++res;
2047 if (dlen > 0) *dp = '\0'; else if (dlast != NULL) *dlast = '\0';
2048 return ++res;