added API to compile non-asciiz regular expressions
[libre9.git] / src / libre9 / re9.c
blob10b976f3e4ef37d98bdcec99069f54d799a486e6
1 /*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
15 * heavily modified by Ketmar // Vampire Avalon
17 #include "re9.h"
19 #include <ctype.h>
20 #include <limits.h>
21 #include <setjmp.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
29 /******************************************************************************/
30 #define PACKED __attribute__((packed))
32 #define ARRAYLEN(_a) ((sizeof((_a)))/sizeof((_a)[0]))
34 #ifdef RE9_DEBUG
35 # define dlogf(...) fprintf(stderr, __VA_ARGS__)
36 #else
37 # define dlogf(...)
38 #endif
40 //#define RE9_DEBUG_PRG
41 //#define RE9_CALC_MAXT
44 /******************************************************************************/
45 typedef uint32_t re9_rune;
48 enum {
49 PRG_STARTS_WITH_ANY = 0x00,
50 PRG_STARTS_WITH_BOL = 0x01,
51 PRG_STARTS_WITH_EOL = 0x02,
52 PRG_STARTS_WITH_RUNE = 0x04,
53 PRG_STARTS_WITH_CLASS = 0x08
57 struct re9_prog_s {
58 uint8_t *code; /* vm code */
59 uint32_t code_used;
60 uint32_t code_alloted;
61 uint32_t startinst; /* start pc */
62 #ifndef RE9_DISABLE_LEARNING
63 uint8_t startflags; /* PRG_START_WITH_xxx */
64 union {
65 uint32_t startrange; /* offset of (vmop_class_t *) with starting range if PRG_STARTS_WITH_CLASS set or re9_rune if PRG_STARTS_WITH_RUNE set */
66 re9_rune startrune;
68 #endif
69 int flags; /* RE9_FLAG_CASEINSENS, RE9_FLAG_NONGREEDY will be set if there were some non-greedy splits */
70 int nsub;
74 /* note that CLASS opcode is variable-length */
75 typedef struct PACKED {
76 uint32_t opcode:8; /* 0..126: literal char; 127: RUNE; 0xc0..0xff (0300..0377): opcodes */
77 uint32_t next:24; /* offset of next opcode */
78 } vmopcode_t;
81 /* RUNE */
82 typedef struct PACKED {
83 vmopcode_t opc;
84 re9_rune r;
85 } vmop_rune_t;
88 /* SPLIT */
89 typedef struct PACKED {
90 vmopcode_t opc;
91 uint32_t alt; /* offset of 'alternate' opcode */
92 } vmop_split_t;
95 /* MARK */
96 typedef struct PACKED {
97 vmopcode_t opc;
98 uint8_t subid; /* bit 7 is RESERVED! 127 subgroups should be enough for everyone */
99 } vmop_mark_t;
102 /* CLASS */
103 typedef struct PACKED {
104 vmopcode_t opc;
105 uint16_t spi_count; /* number of items in spi */
106 re9_rune spi[]; /* span data: spi_count*sizeof(spi[0]); sorted in ascending order */
107 } vmop_class_t;
110 /******************************************************************************/
112 * Actions and Tokens (re9_inst_t types)
114 * 02xx are operators, value == precedence
115 * 03xx are tokens, i.e. operands for operators
117 #define RUNE 0177 /* literal rune */
119 #define OPERATOR 0200 /* bitmask of all operators */
120 #define START 0200 /* start, used for marker on stack */
121 #define RBRA 0201 /* right bracket */
122 #define LBRA 0202 /* left bracket */
123 #define ALT 0203 /* alternation */
124 #define CAT 0204 /* concatentation, implicit operator */
125 #define STAR 0205 /* closure, *; bit 8 set: non-greedy */
126 #define PLUS 0206 /* a+ == aa*; bit 8 set: non-greedy */
127 #define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's; bit 8 set: non-greedy */
129 #define NOP 0300 /* no operation, internal use only (will be skipped by optimizer) */
130 #define ANY 0301 /* (and token); any character except newline */
131 #define ANYNL 0302 /* (and token); any character including newline */
132 #define BOL 0303 /* (and token); beginning of line */
133 #define EOL 0304 /* (and token); end of line */
134 #define CCLASS 0305 /* (and token); character class */
135 #define NCCLASS 0306 /* (and token); negated character class */
136 #define WBOUND 0307 /* word boundary assertion (generated by optimizer from rune) */
137 #define WNBOUND 0310 /* word non-boundary assertion (generated by optimizer from rune) */
138 #define SPLIT_G 0311 /* greedy split */
139 #define SPLIT_NG 0312 /* vmop; non-greedy split */
140 #define SMARK 0313 /* vmop; group start mark */
141 #define EMARK 0314 /* vmop; group end mark */
142 /* special splits for '*' -- should be ignored by learn_start() and replaced by optimize() */
143 #define STAR_G 0315
144 #define STAR_NG 0316
145 /* */
146 #define END 0377 /* vmop; terminate: match found */
149 /******************************************************************************/
150 static inline uint32_t vmop_size (const void *op) {
151 switch (((const vmopcode_t *)op)->opcode) {
152 case RUNE:
153 return sizeof(vmop_rune_t);
154 case SPLIT_G: case SPLIT_NG:
155 return sizeof(vmop_split_t);
156 case SMARK: case EMARK:
157 return sizeof(vmop_mark_t);
158 case CCLASS: case NCCLASS:
159 return sizeof(vmop_class_t)+((const vmop_class_t *)op)->spi_count*sizeof(((const vmop_class_t *)op)->spi[0]);
160 /*default: ;*/
162 return sizeof(vmopcode_t);
166 /******************************************************************************/
167 #if defined(RE9_DEBUG_PRG) || defined(RE9_DEBUG)
168 /* return 0 if pc is out of range or next opcode pc (NOT next pointer from vmopcode_t!) */
169 static void dump_char (FILE *fo, re9_rune r) {
170 if (r >= 32 && r < 127 && r != '\'' && r != '\\') {
171 fprintf(fo, "%c", r);
172 } else {
173 fprintf(fo, "\\u%04x", r);
177 static uint32_t dump_opcode (FILE *fo, const re9_prog_t *pp, uint32_t pc) {
178 const vmopcode_t *op;
179 const vmop_class_t *opc;
180 uint32_t f;
181 if (pc >= pp->code_used) return 0;
182 fprintf(fo, "%05u: 0%03o ", pc, pp->code[pc]);
183 /* END */
184 if (pp->code[pc] == END) {
185 fprintf(fo, " END\n");
186 return pc+vmop_size((void *)(pp->code+pc));
188 if (pc+sizeof(vmopcode_t) > pp->code_used) {
189 fprintf(fo, "???: INVALID CODE!\n");
190 return 0;
192 op = (const vmopcode_t *)(pp->code+pc);
193 if (pc+vmop_size(op) > pp->code_used) {
194 fprintf(fo, "???: INVALID CODE!\n");
195 return 0;
197 fprintf(fo, "%05u ", op->next);
198 /* other */
199 switch (op->opcode) {
200 case NOP: fprintf(fo, "NOP\n"); break;
201 case ANY: fprintf(fo, "ANY\n"); break;
202 case ANYNL: fprintf(fo, "ANYNL\n"); break;
203 case BOL: fprintf(fo, "BOL\n"); break;
204 case EOL: fprintf(fo, "EOL\n"); break;
205 case WBOUND: fprintf(fo, "WBOUND\n"); break;
206 case WNBOUND: fprintf(fo, "WNBOUND\n"); break;
207 case RUNE:
208 fprintf(fo, "RUNE '");
209 dump_char(fo, ((const vmop_rune_t *)op)->r);
210 fputs("'\n", fo);
211 break;
212 case CCLASS: case NCCLASS:
213 opc = (const vmop_class_t *)op;
214 fprintf(fo, "%-8s [", (op->opcode == CCLASS ? "CCLASS" : "NCCLASS"));
215 for (f = 0; f < opc->spi_count; f += 2) {
216 dump_char(fo, opc->spi[f]);
217 if (opc->spi[f] != opc->spi[f+1]) {
218 fputc('-', fo);
219 dump_char(fo, opc->spi[f+1]);
222 fputs("]\n", fo);
223 break;
224 case SPLIT_G: case SPLIT_NG:
225 fprintf(fo, "%-8s %05u\n", (op->opcode == SPLIT_G ? "SPLIT_G" : "SPLIT_NG"), ((const vmop_split_t *)op)->alt);
226 break;
227 case STAR_G: case STAR_NG:
228 fprintf(fo, "%-8s %05u\n", (op->opcode == SPLIT_G ? "STAR_G" : "STAR_NG"), ((const vmop_split_t *)op)->alt);
229 break;
230 case SMARK: case EMARK:
231 fprintf(fo, "%-8s %u\n", (op->opcode == SMARK ? "SMARK" : "EMARK"), ((const vmop_mark_t *)op)->subid);
232 break;
233 default:
234 if (op->opcode < RUNE) {
235 fprintf(fo, "RUNE '");
236 dump_char(fo, op->opcode);
237 fputs("'\n", fo);
238 break;
240 fprintf(fo, "INVALID OPCODE!\n");
241 return 0;
243 return pc+vmop_size(op);
247 static void dump (const re9_prog_t *pp) {
248 uint32_t pc = 0;
249 fprintf(stderr, "code size: %u\n", pp->code_used);
250 while (pc < pp->code_used) if ((pc = dump_opcode(stderr, pp, pc)) == 0) break;
252 #endif
255 /******************************************************************************/
256 /* UTF-8 support */
258 #ifdef RE9_UNICODE_CASE
260 struct casemap {
261 unsigned short code; /* code point */
262 unsigned short lower; /* code */
263 unsigned short upper; /* code */
266 #include "re9_unicode_mapping.c"
269 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
271 static int cmp_casemap (const void *key, const void *cm) {
272 return *(const int *)key-(int)((const struct casemap *)cm)->code;
276 static inline int utf8_map_case (re9_rune uc, int upper) {
277 const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
278 return (cm != NULL ? (upper ? cm->upper : cm->lower) : uc);
283 * returns the upper-case variant of the given unicode codepoint
284 * does not support unicode code points > \uffff
286 static inline re9_rune utf8_upper (re9_rune uc) {
287 return (uc < 256 ? (uc < 128 ? toupper(uc) : uc) : utf8_map_case(uc, 1));
292 * returns the lower-case variant of the given unicode codepoint.
293 * does not support unicode code points > \uffff
296 static inline int utf8_lower (int uc) {
297 return (uc < 256 tolower(uc) : utf8_map_case(uc, 0));
300 #define UPPER(_r) utf8_upper(_r)
302 #else
304 #define UPPER(_r) ((_r) < 128 ? toupper(_r) : (_r))
306 #endif
309 enum {
310 RE9_UTF_MAX_BYTES = 4, /* maximum bytes per rune */
311 RE9_RUNE_SYNC = 0x80, /* cannot represent part of a UTF sequence (<) */
312 RE9_RUNE_SELF = 0x80, /* rune and UTF sequences are the same (<) */
313 RE9_RUNE_ERROR = 0xFFFD, /* decoding error in UTF */
314 RE9_RUNE_MAX = 0x10FFFF, /* maximum rune value */
315 /* */
316 RE9_RUNE_SPEC_EOL = RE9_RUNE_MAX+1,
317 RE9_RUNE_SPEC_NOTHING = RE9_RUNE_MAX+2,
318 RE9_RUNE_SPEC_WBOUND = RE9_RUNE_MAX+3,
319 RE9_RUNE_SPEC_WNBOUND = RE9_RUNE_MAX+4,
320 RE9_RUNE_SPEC_BOL = RE9_RUNE_MAX+5,
321 RE9_RUNE_SPEC_INVALID = RE9_RUNE_MAX+6
325 static const uint8_t utf8_clen[128] = {
326 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x00-0x0f */
327 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x10-0x1f */
328 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x20-0x2f */
329 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x30-0x3f */
330 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x40-0x4f */
331 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x50-0x5f */
332 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x60-0x6f */
333 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x70-0x7f */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8f */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9f */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xa0-0xaf */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xb0-0xbf */
338 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xc0-0xcf c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127 */
339 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xd0-0xdf */
340 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33, /* 0xe0-0xef */
341 0x44,0x44,0x44,0x44,0x00,0x00,0x00,0x00, /* 0xf0-0xff */
345 /* returns length */
346 static inline int re9_char2rune (re9_rune *rune, const char *str, const char *eol) {
347 if ((intptr_t)(eol-str) > 0) {
348 re9_rune r;
349 uint8_t c = (unsigned char)(*str++), len = (utf8_clen[c>>1]>>((c&0x01)*4))&0x0f, res = len, oc = c;
350 if (len == 0 || eol-str+1 < len) { *rune = RE9_RUNE_ERROR; return 1; }
351 else if (len == 1) { *rune = c; return 1; }
352 r = c&(0x7c>>len);
353 while (--len > 0) {
354 if (((c = (unsigned char)(*str++))&0xc0) != 0x80) { *rune = RE9_RUNE_ERROR; return 1; }
355 r = (r<<6)|(c&0x3f);
357 if ((oc == 0xc0 || oc == 0xc1) && r != 0) r = RE9_RUNE_ERROR; /* overlong encoding: only 'modified utf-8' zero rune allowed here */
358 if ((r >= 0xd800 && r <= 0xdfff) || // utf16/utf32 surrogates
359 (r >= 0xfdd0 && r <= 0xfdef) || // just for fun
360 (r >= 0xfffe && r <= 0xffff) || // fuck BOMs
361 r > 0x10ffff) { r = RE9_RUNE_ERROR; /*res = 1;*/ } // bad unicode
362 *rune = r;
363 return res;
364 } else {
365 *rune = RE9_RUNE_SPEC_EOL;
366 return 0;
371 static inline const char *re8_rune_strchr (const char *s, re9_rune c, const char *eol) {
372 if (s >= eol) return NULL;
373 /* not part of utf sequence? */
374 if (c < RE9_RUNE_SYNC) return memchr(s, c, eol-s);
375 while (s < eol) {
376 re9_rune r;
377 int len = re9_char2rune(&r, s, eol);
378 if (r == c) return s;
379 s += len;
381 return NULL;
385 /******************************************************************************/
386 /* compiler */
388 enum {
389 RE9_FALSE = 0,
390 RE9_TRUE = 1
394 /* parser information */
395 typedef struct {
396 /*re9_inst_t*/uint32_t first;
397 /*re9_inst_t*/uint32_t last;
398 } re9_node_t;
401 #define NSTACK (32)
403 /* compiler data */
404 typedef struct {
405 re9_prog_t *pp;
406 /* operand stack */
407 re9_node_t andstack[NSTACK];
408 re9_node_t *andp;
409 /* operator stack */
410 int atorstack[NSTACK];
411 int *atorp;
412 /* */
413 int cursubid; /* id of current subexpression */
414 int subidstack[NSTACK]; /* parallel to atorstack */
415 int *subidp;
416 /* */
417 int last_was_operand; /* last token was operand */
418 int nbra; /* number of opened parens */
419 const char *expr; /* pointer to next character in source expression */
420 const char *expr_eol;
421 int flags; /* parser flags */
422 /* */
423 int comment_level; /* >0: in comment; counts parens */
424 int return_rune_nothing; /* return 'nothing' rune from next lex()? will be reset by lex() */
425 re9_rune yyrune; /* last lex'd rune */
426 /* last lex'd class */
427 re9_rune *yyclass;
428 uint32_t yyc_used;
429 uint32_t yyc_alloted;
430 /* */
431 jmp_buf regkaboom;
432 const char *errormsg;
433 } re9_compiler_t;
436 /******************************************************************************/
437 static __attribute__((noreturn)) void rcerror (re9_compiler_t *ci, const char *s) {
438 ci->errormsg = s;
439 longjmp(ci->regkaboom, 1);
443 /******************************************************************************/
444 static void add_to_class (re9_compiler_t *ci, re9_rune r) {
445 if (ci->comment_level == 0) {
446 if (ci->yyc_used+1 > ci->yyc_alloted) {
447 uint32_t newsz = ((ci->yyc_used+1)|0x1f)+1;
448 re9_rune *newr = realloc(ci->yyclass, newsz*sizeof(ci->yyclass[0]));
449 if (newr == NULL) rcerror(ci, "out of memory");
450 ci->yyclass = newr;
451 ci->yyc_alloted = newsz;
453 ci->yyclass[ci->yyc_used++] = r;
458 static void add_to_class_two (re9_compiler_t *ci, re9_rune r0, re9_rune r1) {
459 if (ci->comment_level == 0) {
460 if (ci->yyc_used+2 > ci->yyc_alloted) {
461 uint32_t newsz = ((ci->yyc_used+2)|0x1f)+1;
462 re9_rune *newr = realloc(ci->yyclass, newsz*sizeof(ci->yyclass[0]));
463 if (newr == NULL) rcerror(ci, "out of memory");
464 ci->yyclass = newr;
465 ci->yyc_alloted = newsz;
467 ci->yyclass[ci->yyc_used++] = r0;
468 ci->yyclass[ci->yyc_used++] = r1;
473 /******************************************************************************/
474 /* return 1 if char is quoted */
475 static int nextc (re9_compiler_t *ci, re9_rune *rp) {
476 if (ci->expr >= ci->expr_eol) { *rp = RE9_RUNE_SPEC_EOL; return 1; }
477 if ((ci->flags&RE9_FLAG_NONUTF8) == 0) {
478 if (ci->expr[0] == '\\' && ci->expr+1 < ci->expr_eol) {
479 ci->expr += re9_char2rune(rp, ci->expr+1, ci->expr_eol)+1;
480 return 1;
481 } else {
482 ci->expr += re9_char2rune(rp, ci->expr, ci->expr_eol);
484 } else {
485 if (ci->expr[0] == '\\' && ci->expr+1 < ci->expr_eol) {
486 *rp = (unsigned char)(ci->expr[1]);
487 ci->expr += 2;
488 return 1;
489 } else {
490 *rp = (uint8_t)(*ci->expr++);
491 if (ci->expr > ci->expr_eol) ci->expr = ci->expr_eol;
494 if (ci->flags&RE9_FLAG_CASEINSENS) *rp = UPPER(*rp);
495 return 0;
499 static void addmeta_space (re9_compiler_t *ci, int neg) {
500 if (!neg) {
501 add_to_class_two(ci, '\x9', '\xd');
502 add_to_class_two(ci, ' ', ' ');
503 } else {
504 add_to_class_two(ci, '\1', '\x9'-1);
505 add_to_class_two(ci, '\xd'+1, ' '-1);
506 add_to_class_two(ci, ' '+1, RE9_RUNE_MAX);
511 static void addmeta_digit (re9_compiler_t *ci, int neg) {
512 if (!neg) {
513 add_to_class_two(ci, '0', '9');
514 } else {
515 add_to_class_two(ci, '\1', '0'-1);
516 add_to_class_two(ci, '9'+1, RE9_RUNE_MAX);
521 static void addmeta_word (re9_compiler_t *ci, int neg) {
522 if (!neg) {
523 add_to_class_two(ci, '0', '9');
524 add_to_class_two(ci, 'A', 'Z');
525 add_to_class_two(ci, '_', '_');
526 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) add_to_class_two(ci, 'a', 'z');
527 } else {
528 add_to_class_two(ci, '\0', '0'-1);
529 add_to_class_two(ci, '9'+1, 'A'-1);
530 add_to_class_two(ci, 'Z'+1, '_'-1);
531 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) {
532 add_to_class_two(ci, '_'+1, 'a'-1);
533 add_to_class_two(ci, 'z'+1, RE9_RUNE_MAX);
534 } else {
535 add_to_class_two(ci, '_'+1, RE9_RUNE_MAX);
541 #ifndef RE9_DISABLE_POSIX_CLASSES
542 static void addmeta_upper (re9_compiler_t *ci) {
543 add_to_class_two(ci, 'A', 'Z');
547 static void addmeta_lower (re9_compiler_t *ci) {
548 if (ci->flags&RE9_FLAG_CASEINSENS) {
549 add_to_class_two(ci, 'A', 'Z');
550 } else {
551 add_to_class_two(ci, 'a', 'z');
556 static void addmeta_alpha (re9_compiler_t *ci) {
557 add_to_class_two(ci, 'A', 'Z');
558 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) add_to_class_two(ci, 'a', 'z');
562 static void addmeta_xdigit (re9_compiler_t *ci) {
563 add_to_class_two(ci, '0', '9');
564 add_to_class_two(ci, 'A', 'F');
565 if ((ci->flags&RE9_FLAG_CASEINSENS) == 0) add_to_class_two(ci, 'a', 'f');
569 static void addmeta_punct (re9_compiler_t *ci) {
570 add_to_class_two(ci, 33, 47); // !"#$%&'()*+,-./
571 add_to_class_two(ci, 58, 64); // :;<=>?@
572 add_to_class_two(ci, 91, 96); // [\]^_`
573 add_to_class_two(ci, 123, 126); // {|}~
577 /*FIXME:??? this should include '\z'! */
578 static void addmeta_ascii (re9_compiler_t *ci) {
579 add_to_class_two(ci, '\1', '\x7f');
583 static void addmeta_blank (re9_compiler_t *ci) {
584 add_to_class_two(ci, '\t', '\t');
585 add_to_class_two(ci, ' ', ' ');
589 /*FIXME:??? this should include '\z'! */
590 static void addmeta_ctrl (re9_compiler_t *ci) {
591 add_to_class_two(ci, '\1', '\x1f');
592 add_to_class_two(ci, '\x7f', '\x7f');
596 static void addmeta_graph (re9_compiler_t *ci) {
597 add_to_class_two(ci, '!', '\x7e');
601 static void addmeta_print (re9_compiler_t *ci) {
602 add_to_class_two(ci, ' ', '\x7e');
604 #endif
607 static void normalize_spans (re9_compiler_t *ci) {
608 if (ci->yyc_used > 2) {
609 /* we have at least two spans */
610 uint32_t cp;
611 re9_rune *ep = ci->yyclass+ci->yyc_used;
612 re9_rune *p, *np;
613 /* sort on span start */
614 /* yeah, old good bubble sorting */
615 for (p = ci->yyclass; p < ep; p += 2) {
616 for (np = p; np < ep; np += 2) {
617 if (*np < *p) {
618 re9_rune rune;
619 rune = np[0];
620 np[0] = p[0];
621 p[0] = rune;
622 rune = np[1];
623 np[1] = p[1];
624 p[1] = rune;
628 /* merge spans */
629 for (cp = 2; cp < ci->yyc_used; ) {
630 uint32_t f;
631 /* if next span start is inside previous span, merge both spans */
632 //fprintf(stderr, "prev:(%u,%u); curr:(%u,%u)\n", ci->yyclass[cp-2], ci->yyclass[cp-1], ci->yyclass[cp], ci->yyclass[cp+1]);
633 if (ci->yyclass[cp] <= ci->yyclass[cp-1]) {
634 /* inside, merge */
635 if (ci->yyclass[cp+1] > ci->yyclass[cp-1]) ci->yyclass[cp-1] = ci->yyclass[cp+1];
636 /* remove this span */
637 for (f = cp+2; f < ci->yyc_used; ++f) ci->yyclass[f-2] = ci->yyclass[f];
638 ci->yyc_used -= 2;
639 } else if (ci->yyclass[cp-1]+1 == ci->yyclass[cp]) {
640 /* we can join this spans */
641 ci->yyclass[cp-1] = ci->yyclass[cp+1];
642 /* remove this span */
643 for (f = cp+2; f < ci->yyc_used; ++f) ci->yyclass[f-2] = ci->yyclass[f];
644 ci->yyc_used -= 2;
645 } else {
646 /* outside, skip */
647 cp += 2;
654 /* parse character range */
655 static int bldcclass (re9_compiler_t *ci) {
656 int type;
657 re9_rune rune;
658 int quoted;
659 /* we have already seen the '[' */
660 type = CCLASS;
661 ci->yyc_used = 0;
662 /* look ahead for negation */
663 /* SPECIAL CASE!!! negated classes don't match \n */
664 quoted = nextc(ci, &rune);
665 if (!quoted && rune == '^') {
666 type = NCCLASS;
667 quoted = nextc(ci, &rune);
668 add_to_class_two(ci, '\n', '\n');
670 /* parse class into a set of spans */
671 for (;; quoted = nextc(ci, &rune)) {
672 if (rune == RE9_RUNE_SPEC_EOL) rcerror(ci, "malformed '[]'");
673 if (!quoted && rune == ']' && (ci->yyc_used&0x01) == 0) break;
674 if (quoted && ((rune >= '0' && rune <= '9') || (rune >= 'a' && rune <= 'z') || (rune >= 'A' && rune <= 'Z'))) {
675 /* metacharacters */
676 if ((ci->yyc_used&0x01) != 0 || (ci->expr < ci->expr_eol && ci->expr[0] == '-')) rcerror(ci, "malformed '[]'"); /* metacharacter can't be used in rangedef */
677 switch (rune) {
678 case 'd': case 'D': addmeta_digit(ci, (rune <= 'Z')); break;
679 case 's': case 'S': addmeta_space(ci, (rune <= 'Z')); break;
680 case 'w': case 'W': addmeta_word(ci, (rune <= 'Z')); break;
681 case 'z': add_to_class_two(ci, '\0', '\0'); break;
682 case 'Z': add_to_class_two(ci, '\1', RE9_RUNE_MAX); break;
683 default:
684 rcerror(ci, "invalid metacharacter in '[]'");
685 break;
687 #ifndef RE9_DISABLE_POSIX_CLASSES
688 } else if ((ci->yyc_used&0x01) == 0 && !quoted && rune == '[' && ci->expr+6 < ci->expr_eol && ci->expr[0] == ':') {
689 if (ci->expr+7 < ci->expr_eol) {
690 if (strncmp(ci->expr, ":alnum:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); addmeta_digit(ci, 0); }
691 else if (strncmp(ci->expr, ":alpha:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); }
692 else if (strncmp(ci->expr, ":ascii:]", 8) == 0) { ci->expr += 8; addmeta_ascii(ci); }
693 else if (strncmp(ci->expr, ":blank:]", 8) == 0) { ci->expr += 8; addmeta_blank(ci); }
694 else if (strncmp(ci->expr, ":cntrl:]", 8) == 0) { ci->expr += 8; addmeta_ctrl(ci); }
695 else if (strncmp(ci->expr, ":digit:]", 8) == 0) { ci->expr += 8; addmeta_digit(ci, 0); }
696 else if (strncmp(ci->expr, ":graph:]", 8) == 0) { ci->expr += 8; addmeta_graph(ci); }
697 else if (strncmp(ci->expr, ":lower:]", 8) == 0) { ci->expr += 8; addmeta_lower(ci); }
698 else if (strncmp(ci->expr, ":print:]", 8) == 0) { ci->expr += 8; addmeta_print(ci); }
699 else if (strncmp(ci->expr, ":punct:]", 8) == 0) { ci->expr += 8; addmeta_punct(ci); }
700 else if (strncmp(ci->expr, ":space:]", 8) == 0) { ci->expr += 8; addmeta_space(ci, 0); }
701 else if (strncmp(ci->expr, ":upper:]", 8) == 0) { ci->expr += 8; addmeta_upper(ci); }
702 } else if (strncmp(ci->expr, ":word:]", 7) == 0) { ci->expr += 7; addmeta_word(ci, 0); }
703 else if (ci->expr+8 < ci->expr_eol && strncmp(ci->expr, ":xdigit:]", 9) == 0) { ci->expr += 9; addmeta_xdigit(ci); }
704 #endif
705 } else {
706 if (ci->yyc_used&0x01 && rune < ci->yyclass[ci->yyc_used-1]) rcerror(ci, "invalid range in '[]'");
707 add_to_class(ci, rune);
708 if (ci->yyc_used&0x01) {
709 /* this was first char of possible range */
710 if (ci->expr < ci->expr_eol && ci->expr[0] == '-') {
711 /* rangedef; skip '-' */
712 ++ci->expr;
713 } else {
714 /* one char: convert it to range */
715 add_to_class(ci, rune);
720 /* sort and store only if we are not in comment */
721 if (ci->comment_level == 0) {
722 if (ci->yyc_used&0x01) rcerror(ci, "THE THING THAT SHOULD NOT BE: character class parsing error");
723 if (ci->yyc_used == 0) rcerror(ci, "empty '[]'");
724 normalize_spans(ci);
725 /* if we have only one char in range, we can use RUNE instead */
726 if (type == CCLASS && ci->yyc_used == 2 && ci->yyclass[0] == ci->yyclass[1]) {
727 /* yeah, one char! */
728 ci->yyrune = ci->yyclass[0];
729 return RUNE;
732 return type;
736 /* parse metacharacter */
737 static int bldmeta (re9_compiler_t *ci, re9_rune rune) {
738 int rangeop = (rune >= 'a' && rune <= 'z' ? CCLASS : NCCLASS);
739 ci->yyc_used = 0;
740 switch (rune) {
741 case 'd': case 'D': addmeta_digit(ci, 0); break;
742 case 's': case 'S': addmeta_space(ci, 0); break;
743 case 'w': case 'W': addmeta_word(ci, 0); break;
744 case 'Z': add_to_class_two(ci, '\0', '\0'); break;
745 case 'z': ci->yyrune = 0; return RUNE;
746 case 'b': ci->yyrune = RE9_RUNE_SPEC_WBOUND; return RUNE;
747 case 'B': ci->yyrune = RE9_RUNE_SPEC_WNBOUND; return RUNE;
748 default: rcerror(ci, "invalid metacharacter");
750 return rangeop;
754 /* get token */
755 static int lex (re9_compiler_t *ci) {
756 if (ci->return_rune_nothing) {
757 ci->return_rune_nothing = 0;
758 ci->yyrune = RE9_RUNE_SPEC_NOTHING;
759 } else if (ci->expr >= ci->expr_eol) {
760 ci->yyrune = RE9_RUNE_SPEC_EOL;
761 } else if (ci->flags&RE9_FLAG_LITERAL) {
762 if (ci->expr < ci->expr_eol && ci->expr[0] == '\\') {
763 ++ci->expr;
764 ci->yyrune = '\\';
765 } else {
766 /* can't be quoted */
767 nextc(ci, &ci->yyrune);
769 } else {
770 int quoted = nextc(ci, &ci->yyrune);
771 if (quoted) {
772 /* check for metacharacter */
773 if ((ci->yyrune >= '0' && ci->yyrune <= '9') || (ci->yyrune >= 'a' && ci->yyrune <= 'z') || (ci->yyrune >= 'A' && ci->yyrune <= 'Z')) return bldmeta(ci, ci->yyrune);
774 } else {
775 switch (ci->yyrune) {
776 case '*': quoted = STAR; goto closure;
777 case '?': quoted = QUEST; goto closure;
778 case '+': quoted = PLUS; /* fallthru */
779 closure: if (ci->expr < ci->expr_eol && ci->expr[0] == '?') {
780 #ifndef RE9_DISABLE_NONGREEDY
781 ++ci->expr;
782 quoted |= 0x100;
783 #else
784 if ((ci->flags&RE9_FLAG_NONGREEDY) == 0) rcerror(ci, "non-greedy closure support was disabled at compile time");
785 #endif
787 return quoted;
788 case '|': return ALT;
789 case '.': return (ci->flags&RE9_FLAG_ANYDOT ? ANYNL : ANY);
790 case '(': return LBRA;
791 case ')': return RBRA;
792 case '^': return BOL;
793 case '$': return EOL;
794 case '[': return bldcclass(ci);
798 return (ci->yyrune != RE9_RUNE_SPEC_EOL ? RUNE : END);
802 /******************************************************************************/
803 static inline uint32_t instofs (const re9_compiler_t *ci, const vmopcode_t *op) {
804 return (int32_t)(((const uint8_t *)op)-ci->pp->code);
808 static inline vmopcode_t *instptr (re9_compiler_t *ci, uint32_t ofs) {
809 return (vmopcode_t *)(ci->pp->code+ofs);
813 /* push instructions to 'operand stack' */
814 static void push_operand (re9_compiler_t *ci, vmopcode_t *f, vmopcode_t *l) {
815 if (ci->andp >= &ci->andstack[NSTACK]) rcerror(ci, "THE THING THAT SHOULD NOT BE: operand stack overflow");
816 ci->andp->first = instofs(ci, f);
817 ci->andp->last = instofs(ci, l);
818 ++ci->andp;
822 /* push token and group id to 'operator stack' */
823 static void push_operator (re9_compiler_t *ci, int t, int csi) {
824 if (ci->atorp >= &ci->atorstack[NSTACK]) rcerror(ci, "THE THING THAT SHOULD NOT BE: operator stack overflow");
825 *ci->atorp++ = t;
826 *ci->subidp++ = csi;
830 /* pop instriction from 'operand stack' */
831 static re9_node_t *pop_operand (re9_compiler_t *ci) {
832 if (ci->andp <= &ci->andstack[0]) rcerror(ci, "missing operand");
833 return --ci->andp;
837 /* pop token from 'operator stack' */
838 static int pop_operator (re9_compiler_t *ci) {
839 if (ci->atorp <= &ci->atorstack[0]) rcerror(ci, "THE THING THAT SHOULD NOT BE: operator stack underflow");
840 --ci->subidp;
841 return *--ci->atorp;
845 /******************************************************************************/
846 static void *allot (re9_compiler_t *ci, size_t size) {
847 void *res;
848 if (ci->pp->code_used+size > ci->pp->code_alloted) {
849 uint32_t newsz = ((ci->pp->code_used+size)|0xfff)+1;
850 uint8_t *newc = realloc(ci->pp->code, newsz);
851 if (newc == NULL) rcerror(ci, "out of memory");
852 memset(newc+ci->pp->code_used, 0, newsz-ci->pp->code_used);
853 ci->pp->code = newc;
854 ci->pp->code_alloted = newsz;
856 res = ci->pp->code+ci->pp->code_used;
857 ci->pp->code_used += size;
858 return res;
862 /* allocate new instruction */
863 static inline vmopcode_t *newinst (re9_compiler_t *ci, int t) {
864 vmopcode_t *op = allot(ci, sizeof(*op));
865 op->opcode = t;
866 return op;
870 static vmopcode_t *newinst_class (re9_compiler_t *ci, int t) {
871 vmop_class_t *op = allot(ci, sizeof(*op)+ci->yyc_used*sizeof(op->spi[0]));
872 op->opc.opcode = t;
873 if (ci->yyc_used > 0) memcpy(op->spi, ci->yyclass, ci->yyc_used*sizeof(op->spi[0]));
874 op->spi_count = ci->yyc_used;
875 if (ci->yyc_used&0x01) rcerror(ci, "THE THING THAT SHOULD NOT BE: newinst_class() fucked!");
876 return (vmopcode_t *)op;
880 static vmopcode_t *newinst_rune (re9_compiler_t *ci, re9_rune r) {
881 if (r < RUNE || r > RE9_RUNE_MAX) {
882 vmopcode_t *op = allot(ci, sizeof(*op));
883 switch (r) {
884 case RE9_RUNE_SPEC_NOTHING: op->opcode = NOP; break;
885 case RE9_RUNE_SPEC_WBOUND: op->opcode = WBOUND; break;
886 case RE9_RUNE_SPEC_WNBOUND: op->opcode = WNBOUND; break;
887 default:
888 if (r < RUNE) {
889 op->opcode = r;
890 break;
892 rcerror(ci, "THE THING THAT SHOULD NOT BE: newinst_rune() fucked!");
894 return op;
895 } else {
896 vmop_rune_t *op = allot(ci, sizeof(*op));
897 op->opc.opcode = RUNE;
898 op->r = r;
899 return (vmopcode_t *)op;
904 static vmopcode_t *newinst_mark (re9_compiler_t *ci, int t, int id) {
905 if (id >= 0) {
906 vmop_mark_t *op = allot(ci, sizeof(*op));
907 if (id > 127) rcerror(ci, "too many captures");
908 op->opc.opcode = t;
909 op->subid = id;
910 return (vmopcode_t *)op;
911 } else {
912 vmopcode_t *op = allot(ci, sizeof(*op));
913 op->opcode = NOP;
914 return op;
919 static inline vmop_split_t *newinst_split (re9_compiler_t *ci, int t) {
920 vmop_split_t *op = allot(ci, sizeof(*op));
921 #ifndef RE9_DISABLE_NONGREEDY
922 op->opc.opcode = ((ci->flags&RE9_FLAG_NONGREEDY) == 0 ? (t&0x100 ? SPLIT_NG : SPLIT_G) : (t&0x100 ? SPLIT_G : SPLIT_NG));
923 if ((t&0xff) == STAR) op->opc.opcode = (op->opc.opcode == SPLIT_G ? STAR_G : STAR_NG);
924 #else
925 op->opc.opcode = ((t&0xff) == STAR ? STAR_G : SPLIT_G);
926 #endif
927 return op;
931 static inline vmop_split_t *newinst_split1 (re9_compiler_t *ci, int t) {
932 vmop_split_t *op = allot(ci, sizeof(*op));
933 op->opc.opcode = t;
934 return op;
938 /* process instructions with higher current priority levels */
939 static void evaluntil (re9_compiler_t *ci, int pri) {
940 while (pri == RBRA || (ci->atorp[-1]&0xff) >= (pri&0xff)) {
941 re9_node_t *op1, *op2;
942 vmopcode_t *inst1, *inst2;
943 vmop_split_t *si1;
944 int op = pop_operator(ci);
945 switch (op) {
946 case LBRA: /* must have been RBRA */
947 op1 = pop_operand(ci);
948 inst2 = newinst_mark(ci, EMARK, *ci->subidp);
949 instptr(ci, op1->last)->next = instofs(ci, inst2);
950 inst1 = newinst_mark(ci, SMARK, *ci->subidp);
951 inst1->next = op1->first;
952 push_operand(ci, inst1, inst2);
953 return;
954 case ALT:
955 op2 = pop_operand(ci);
956 op1 = pop_operand(ci);
957 inst2 = newinst(ci, NOP);
958 instptr(ci, op2->last)->next = instptr(ci, op1->last)->next = instofs(ci, inst2);
959 si1 = newinst_split1(ci, SPLIT_G);
960 si1->alt = op1->first;
961 si1->opc.next = op2->first;
962 push_operand(ci, (void *)si1, inst2);
963 break;
964 case CAT:
965 op2 = pop_operand(ci);
966 op1 = pop_operand(ci);
967 instptr(ci, op1->last)->next = op2->first;
968 push_operand(ci, instptr(ci, op1->first), instptr(ci, op2->last));
969 break;
970 case STAR: case STAR|0x100: /* greedy and non-greedy */
971 op2 = pop_operand(ci);
972 si1 = newinst_split(ci, op);
973 instptr(ci, op2->last)->next = instofs(ci, (void *)si1);
974 si1->alt = op2->first;
975 push_operand(ci, (void *)si1, (void *)si1);
976 break;
977 case PLUS: case PLUS|0x100: /* greedy and non-greedy */
978 op2 = pop_operand(ci);
979 si1 = newinst_split(ci, op);
980 instptr(ci, op2->last)->next = instofs(ci, (void *)si1);
981 si1->alt = op2->first;
982 push_operand(ci, instptr(ci, op2->first), (void *)si1);
983 break;
984 case QUEST: case QUEST|0x100: /* greedy and non-greedy */
985 op2 = pop_operand(ci);
986 si1 = newinst_split(ci, op);
987 inst2 = newinst(ci, NOP);
988 si1->opc.next = instofs(ci, inst2);
989 si1->alt = op2->first;
990 instptr(ci, op2->last)->next = instofs(ci, inst2);
991 push_operand(ci, (void *)si1, inst2);
992 break;
993 default:
994 rcerror(ci, "unknown operator in evaluntil");
995 break;
1001 /******************************************************************************/
1002 /* process operator */
1003 static void operator (re9_compiler_t *ci, int t) {
1004 int csi = ci->cursubid;
1005 if (ci->comment_level) {
1006 /* in comment */
1007 if (t == RBRA || t == LBRA) {
1008 ci->nbra += (t == LBRA ? 1 : -1);
1009 ci->comment_level += (t == LBRA ? 1 : -1);
1011 } else {
1012 /* not in comment */
1013 if (t == RBRA && --ci->nbra < 0) rcerror(ci, "unmatched right paren");
1014 if (t == LBRA) {
1015 ++ci->nbra;
1016 csi = -1;
1017 if (ci->expr < ci->expr_eol && ci->expr[0] == '?') {
1018 if (ci->expr+1 < ci->expr_eol && ci->expr[1] == ':') {
1019 ci->expr += 2;
1020 } else if (ci->expr+1 < ci->expr_eol && ci->expr[1] == '#') {
1021 ci->expr += 2;
1022 ci->comment_level = 1;
1023 ci->last_was_operand = RE9_TRUE;
1024 return;
1025 } else {
1026 rcerror(ci, "invalid paren flags");
1028 } else {
1029 if ((csi = ++ci->cursubid) >= RE9_SUBEXP_MAX) rcerror(ci, "too many subexpressions");
1030 ci->pp->nsub = csi;
1032 if (ci->last_was_operand) operator(ci, CAT);
1033 /* process empty parens */
1034 if (ci->expr < ci->expr_eol && ci->expr[0] == ')') ci->return_rune_nothing = 1; /* hack: next lex() will return 'nothing' rune */
1035 } else {
1036 evaluntil(ci, t);
1038 if (t != RBRA) push_operator(ci, t, csi);
1040 switch (t&0xff) {
1041 case STAR: case QUEST: case PLUS: case RBRA:
1042 ci->last_was_operand = RE9_TRUE; /* these look like operands */
1043 break;
1044 default:
1045 ci->last_was_operand = RE9_FALSE;
1046 break;
1051 /* process operand: rune or character class */
1052 static void operand (re9_compiler_t *ci, int t) {
1053 if (ci->comment_level == 0) {
1054 vmopcode_t *i;
1055 if (ci->last_was_operand) operator(ci, CAT); /* concatenate is implicit */
1056 switch (t) {
1057 case CCLASS: case NCCLASS:
1058 i = newinst_class(ci, t);
1059 break;
1060 case RUNE:
1061 i = newinst_rune(ci, ci->yyrune);
1062 break;
1063 default:
1064 i = newinst(ci, t);
1065 break;
1067 push_operand(ci, i, i);
1069 ci->last_was_operand = RE9_TRUE;
1073 #ifndef RE9_DISABLE_LEARNING
1074 /* trace VM code and build 'starting range' */
1075 /* return !0 if we hit 'ANY*' */
1076 static int learn_start_tracer (re9_compiler_t *ci, uint32_t pc) {
1077 for (;;) {
1078 uint8_t opcode = ci->pp->code[pc];
1079 const vmop_class_t *opc;
1080 re9_rune pr;
1081 uint32_t f;
1082 if (opcode < RUNE) {
1083 /* wow, a solid hit */
1084 add_to_class_two(ci, opcode, opcode);
1085 return 0;
1087 switch (opcode) {
1088 case RUNE: /* another solid hit */
1089 add_to_class_two(ci, ((const vmop_rune_t *)(ci->pp->code+pc))->r, ((const vmop_rune_t *)(ci->pp->code+pc))->r);
1090 return 0;
1091 case ANY: case ANYNL: /* shit... */
1092 ci->yyc_used = 0;
1093 add_to_class_two(ci, 0, RE9_RUNE_MAX);
1094 return 1;
1095 case SPLIT_G: case SPLIT_NG: /* choose your way to die */
1096 case STAR_G: case STAR_NG: /* i can do better analyzis here, but i don't care for now */
1097 if (learn_start_tracer(ci, ((const vmop_split_t *)(ci->pp->code+pc))->alt)) return 1; /* no way to go */
1098 break;
1099 case BOL:
1100 ci->pp->startflags |= PRG_STARTS_WITH_BOL;
1101 return 0;
1102 case EOL:
1103 ci->pp->startflags |= PRG_STARTS_WITH_EOL;
1104 return 0;
1105 case CCLASS:
1106 opc = ((const vmop_class_t *)(ci->pp->code+pc));
1107 for (f = 0; f < opc->spi_count; ++f) add_to_class(ci, opc->spi[f]);
1108 return 0;
1109 case NCCLASS:
1110 /* oh, this is harder than previous one */
1111 /* we realy on the fact that range was sorted and there are gaps between ranges */
1112 opc = ((const vmop_class_t *)(ci->pp->code+pc));
1113 if (opc->spi[0] > 0) add_to_class_two(ci, 0, opc->spi[0]-1);
1114 pr = opc->spi[1]+1;
1115 for (f = 2; f < opc->spi_count && pr <= RE9_RUNE_MAX; f += 2) {
1116 add_to_class_two(ci, pr, opc->spi[f]-1);
1117 pr = opc->spi[f+1]+1;
1119 if (pr <= RE9_RUNE_MAX) add_to_class_two(ci, pr, RE9_RUNE_MAX);
1120 return 0;
1121 case END:
1122 /*return 0;*/
1123 ci->yyc_used = 0;
1124 add_to_class_two(ci, 0, RE9_RUNE_MAX);
1125 return 1;
1126 /* ignore all other shit */
1128 pc = instptr(ci, pc)->next;
1133 static void learn_start (re9_compiler_t *ci) {
1134 int any;
1135 ci->yyc_used = 0;
1136 ci->pp->startflags = 0;
1137 ci->pp->startrange = 0;
1138 ci->pp->startrune = 0;
1139 any = learn_start_tracer(ci, ci->pp->startinst);
1140 if (any || ci->yyc_used == 0) {
1141 ci->pp->startflags = PRG_STARTS_WITH_ANY;
1142 //fprintf(stderr, "PRG_STARTS_WITH_ANY\n");
1143 } else {
1144 /* generate CCLASS */
1145 normalize_spans(ci);
1146 if (ci->yyc_used == 2 && ci->yyclass[0] == ci->yyclass[1]) {
1147 /* simple RUNE will be enough */
1148 ci->pp->startflags |= PRG_STARTS_WITH_RUNE;
1149 ci->pp->startrune = ci->yyclass[0];
1150 //fprintf(stderr, "PRG_STARTS_WITH_RUNE: %u\n", ci->pp->startrune);
1151 } else {
1152 ci->pp->startflags |= PRG_STARTS_WITH_CLASS;
1153 ci->pp->startrange = instofs(ci, newinst_class(ci, CCLASS));
1154 //fprintf(stderr, "PRG_STARTS_WITH_CLASS\n ");
1155 #ifdef RE9_DEBUG_PRG
1156 //dump_opcode(stderr, ci->pp, ci->pp->startrange);
1157 #endif
1161 #endif
1164 /*TODO:??? collapse some things, get rid of nops? */
1165 static void optimize (re9_compiler_t *ci) {
1166 re9_prog_t *pp = ci->pp;
1167 uint32_t pc;
1168 /* */
1169 #ifndef RE9_DISABLE_LEARNING
1170 learn_start(ci);
1171 #endif
1172 /* get rid of NOOP chains */
1173 for (pc = 0; pc < pp->code_used; pc += vmop_size(pp->code+pc)) {
1174 vmopcode_t *inst = instptr(ci, pc), *target;
1175 target = instptr(ci, inst->next);
1176 while (target->opcode == NOP) target = instptr(ci, target->next);
1177 if (inst->opcode == STAR_G) inst->opcode = SPLIT_G;
1178 else if (inst->opcode == STAR_NG) inst->opcode = SPLIT_NG;
1179 inst->next = instofs(ci, target);
1180 if (inst->opcode == SPLIT_NG || inst->opcode == SPLIT_G) {
1181 vmop_split_t *op = (vmop_split_t *)inst;
1182 target = instptr(ci, op->alt);
1183 while (target->opcode == NOP) target = instptr(ci, target->next);
1184 op->alt = instofs(ci, target);
1185 if (inst->opcode == SPLIT_NG) ci->pp->flags |= RE9_FLAG_NONGREEDY;
1188 /* move startinst on non-NOOP */
1189 while (instptr(ci, pp->startinst)->opcode == NOP) pp->startinst = instptr(ci, pp->startinst)->next;
1190 /* shrink code area */
1191 if (pc > pp->code_alloted) {
1192 uint8_t *newc;
1193 pp->code_used = pp->code_alloted = pc;
1194 newc = realloc(pp->code, pc);
1195 if (newc != NULL) pp->code = newc;
1200 re9_prog_t *re9_compile_ex (const char *s, const char *eol, int flags, const char **errmsg) {
1201 int token;
1202 re9_compiler_t * volatile ci;
1203 re9_prog_t *pres;
1204 if (s == NULL) {
1205 if (errmsg != NULL) *errmsg = "can't parse NULL regexp";
1206 return NULL;
1208 if (eol == NULL) eol = s+strlen(s); else if (eol < s) eol = s;
1209 if ((ci = malloc(sizeof(*ci))) == NULL) {
1210 if (errmsg != NULL) *errmsg = "out of memory";
1211 return NULL;
1213 memset(ci, 0, sizeof(*ci));
1214 /* get memory for the program */
1215 ci->pp = malloc(sizeof(*ci->pp));
1216 if (ci->pp == NULL) {
1217 free(ci);
1218 if (errmsg != NULL) *errmsg = "out of memory";
1219 return NULL;
1221 memset(ci->pp, 0, sizeof(*ci->pp));
1222 //ci->pp->nsub = 0;
1223 ci->pp->flags = (flags&RE9_FLAG_CASEINSENS);
1224 /* */
1225 ci->flags = flags;
1226 if (errmsg != NULL) *errmsg = NULL;
1227 if (setjmp(ci->regkaboom)) {
1228 if (errmsg != NULL) *errmsg = ci->errormsg;
1229 if (ci->yyclass != NULL) free(ci->yyclass);
1230 if (ci->pp->code != NULL) free(ci->pp->code);
1231 free(ci->pp);
1232 free(ci);
1233 return NULL;
1235 /* go compile the sucker */
1236 ci->expr = s;
1237 ci->expr_eol = eol;
1238 ci->nbra = 0;
1239 ci->atorp = ci->atorstack;
1240 ci->andp = ci->andstack;
1241 ci->subidp = ci->subidstack;
1242 ci->last_was_operand = RE9_FALSE;
1243 ci->cursubid = 0;
1244 /* start with a low priority operator to prime parser */
1245 push_operator(ci, START-1, ci->cursubid);
1246 /* offset 0 should be NOOP */
1247 newinst(ci, NOP)->next = ci->pp->code_used;
1248 while ((token = lex(ci)) != END) {
1249 if ((token&0300) == OPERATOR) operator(ci, token); else operand(ci, token);
1251 if (ci->comment_level == 0) {
1252 /* close with a low priority operator */
1253 evaluntil(ci, START);
1254 /* force END */
1255 operand(ci, END);
1256 evaluntil(ci, START);
1258 #ifdef RE9_DEBUG_PRG
1259 //if (ci->flags&RE9_FLAG_DUMPPRG) dumpstack(ci);
1260 #endif
1261 if (ci->nbra) rcerror(ci, "unmatched left paren");
1262 --ci->andp; /* points to first and only operand */
1263 ci->pp->startinst = ci->andp->first;
1264 #ifdef RE9_DEBUG_PRG
1265 if (ci->flags&RE9_FLAG_DUMPPRG) {
1266 fprintf(stderr, "=== unoptimized ===\n");
1267 fprintf(stderr, "start: %u\n", ci->pp->startinst);
1268 dump(ci->pp);
1270 #endif
1271 optimize(ci);
1272 #ifdef RE9_DEBUG_PRG
1273 if (ci->flags&RE9_FLAG_DUMPPRG) {
1274 fprintf(stderr, "=== optimized ===\n");
1275 fprintf(stderr, "start: %u\n", ci->pp->startinst);
1276 dump(ci->pp);
1277 fprintf(stderr, "------\n");
1279 #endif
1280 pres = ci->pp;
1281 if (ci->yyclass != NULL) free(ci->yyclass);
1282 free(ci);
1283 ++pres->nsub;
1284 return pres;
1288 re9_prog_t *re9_compile (const char *s, int flags, const char **errmsg) {
1289 return re9_compile_ex(s, NULL, flags, errmsg);
1293 int re9_nsub (const re9_prog_t *p) {
1294 return (p != NULL ? p->nsub : 0);
1298 void re9_free (re9_prog_t *p) {
1299 if (p != NULL) {
1300 if (p->code != NULL) free(p->code);
1301 free(p);
1306 /******************************************************************************/
1307 /* executor */
1308 #define LASTINST (0)
1311 /* substitution list */
1312 typedef struct {
1313 re9_sub_t m[RE9_SUBEXP_MAX];
1314 } re9_sublist_t;
1318 * regexec execution lists
1320 #define LISTSIZE (16)
1321 #define BIGLISTSIZE (64*LISTSIZE)
1323 typedef struct {
1324 uint32_t inst; /* instruction of the thread */
1325 re9_sublist_t se; /* matched subexpressions in this thread */
1326 } re9_listitem_t;
1329 typedef struct {
1330 re9_listitem_t *start;
1331 re9_listitem_t *free;
1332 re9_listitem_t *end;
1333 } re9_list_t;
1336 typedef struct {
1337 re9_listitem_t *relist[2], *relistf[2];
1338 int listsize;
1339 const char *bol;
1340 const char *eol;
1341 int flags;
1342 } re9_ljunk_t;
1345 /* save a new match in mp */
1346 static inline void re9l_newmatch (re9_sub_t *mp, int ms, re9_sublist_t *sp) {
1347 if (mp == NULL || ms <= 0) return;
1348 if (mp[0].sp == NULL || sp->m[0].sp < mp[0].sp || (sp->m[0].sp == mp[0].sp && sp->m[0].ep > mp[0].ep)) {
1349 int i;
1350 for (i = 0; i < ms && i < RE9_SUBEXP_MAX; ++i) mp[i] = sp->m[i];
1351 for (; i < ms; ++i) mp[i].sp = mp[i].ep = 0;
1357 * Note optimization in re9l_newthread:
1358 * *lp must be pending when re9l_newthread called; if *lp has been looked at already, the optimization is a bug
1361 * lp: list to add to
1362 * ip: instruction to add
1363 * ms: number of elements at mp
1364 * sep: pointers to subexpressions
1366 static inline re9_listitem_t *re9l_newthread (re9_list_t *lp, uint32_t ip, int ms, re9_sublist_t *sep, int nocheck) {
1367 re9_listitem_t *p;
1368 if (!nocheck) {
1369 for (p = lp->start; p->inst != LASTINST; ++p) {
1370 if (p->inst == ip) {
1371 dlogf("newthread:%p; found at %u; sep->m[0].sp=%p; p->se.m[0].sp=%p\n", lp, p-lp->start, sep->m[0].sp, p->se.m[0].sp);
1372 if (sep->m[0].sp < p->se.m[0].sp) {
1373 if (ms > 1) p->se = *sep; else p->se.m[0] = sep->m[0];
1375 return p;
1379 p = lp->free++;
1380 if (p >= lp->end-1) return NULL;
1381 dlogf("newthread:%p; new at %u(%u)\n", lp->start, p-lp->start, ip);
1382 p->inst = ip;
1383 if (ms > 1) p->se = *sep; else p->se.m[0] = sep->m[0];
1384 p[1].inst = LASTINST;
1385 return p;
1390 * same as renewthread, but called with initial empty start pointer
1393 * lp: list to add to
1394 * ip: instruction to add
1395 * ms: number of elements at mp
1396 * sp: pointers to subexpressions
1398 static inline re9_listitem_t *re9l_newemptythread (re9_list_t *lp, uint32_t ip, int ms, const char *sp, int nocheck) {
1399 re9_listitem_t *p;
1400 if (!nocheck) {
1401 for (p = lp->start; p->inst != LASTINST; ++p) {
1402 if (p->inst == ip) {
1403 dlogf("newemptythread:%p; found at %u; sp=%p; p->se.m[0].sp=%p\n", lp, p-lp->start, sp, p->se.m[0].sp);
1404 if (sp < p->se.m[0].sp) {
1405 if (ms > 1) memset(&p->se, 0, sizeof(p->se));
1406 p->se.m[0].sp = sp;
1408 return p;
1412 p = lp->free++;
1413 if (p >= lp->end-1) return NULL;
1414 dlogf("newemptythread:%p; new at %u(%u)\n", lp->start, p-lp->start, ip);
1415 p->inst = ip;
1416 if (ms > 1) memset(&p->se, 0, sizeof(p->se));
1417 p->se.m[0].sp = sp;
1418 p[1].inst = LASTINST;
1419 return p;
1423 static inline int is_word_rune (re9_rune r) {
1424 return
1425 (r >= '0' && r <= '9') ||
1426 (r >= 'A' && r <= 'Z') ||
1427 (r >= 'a' && r <= 'z') ||
1428 (r >= 128 && r <= RE9_RUNE_MAX) ||
1429 r == '_';
1434 * return 0 if no match
1435 * >0 if a match
1436 * <0 if we ran out of list space
1439 * progp: program to run
1440 * bol: string to run machine on
1441 * mp: subexpression elements
1442 * ms: number of elements at mp
1445 static int regexec1 (const re9_prog_t *progp, re9_sub_t *mp, int ms, re9_ljunk_t *j) {
1446 int flag = 0;
1447 #ifdef RE9_CALC_MAXT
1448 int maxt = 0;
1449 #endif
1450 const char *s;
1451 #ifndef RE9_DISABLE_LEARNING
1452 const char *p;
1453 int checkstart;
1454 #endif
1455 re9_rune rprev, r;
1456 int rune_size;
1457 re9_list_t tl, nl; /* this list, next list */
1458 re9_listitem_t *tlp, *tlxn;
1459 int match, i, flags;
1460 uint32_t f;
1461 const vmop_class_t *opc;
1462 int dont_collapse = 0;
1463 /* */
1464 flags = (progp->flags&(RE9_FLAG_CASEINSENS|RE9_FLAG_NONGREEDY))|(j->flags&(RE9_FLAG_NONUTF8|RE9_FLAG_ANYDOT));
1465 //dont_collapse = (flags&RE9_FLAG_NONGREEDY);
1466 match = 0;
1467 #ifndef RE9_DISABLE_LEARNING
1468 checkstart = (progp->startflags != PRG_STARTS_WITH_ANY);
1469 #endif
1470 if (mp != NULL) {
1471 for (i = 0; i < ms; ++i) mp[i].sp = mp[i].ep = NULL;
1472 } else {
1473 ms = 0;
1475 if (ms < 0) ms = 0; else if (ms > RE9_SUBEXP_MAX) ms = RE9_SUBEXP_MAX;
1476 j->relist[0][0].inst = j->relist[1][0].inst = LASTINST;
1477 j->relistf[0] = j->relist[0];
1478 j->relistf[1] = j->relist[1];
1479 /* execute machine once for each character, including terminal NUL */
1480 s = j->bol;
1481 rprev = RE9_RUNE_SPEC_BOL;
1482 for (;;) {
1483 /* fast check for first char */
1484 #ifndef RE9_DISABLE_LEARNING
1485 if (checkstart) {
1486 dlogf("checkstart! 0x%02x\n", progp->startflags);
1487 switch (progp->startflags) {
1488 /* simple cases */
1489 case PRG_STARTS_WITH_BOL:
1490 if (s == j->bol) break;
1491 if (s == j->eol) goto mt_quit;
1492 p = memchr(s, '\n', j->eol-s);
1493 if (p == NULL || p+1 > j->eol) goto mt_quit;
1494 s = p+1;
1495 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1496 break;
1497 case PRG_STARTS_WITH_EOL:
1498 if (s == j->eol) goto mt_quit;
1499 p = memchr(s, '\n', j->eol-s);
1500 if (p == NULL) p = j->eol;
1501 s = p;
1502 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1503 break;
1504 case PRG_STARTS_WITH_RUNE:
1505 if ((flags&RE9_FLAG_CASEINSENS) == 0) {
1506 /* case-sensitive */
1507 if (progp->startrune < 128 || (flags&RE9_FLAG_NONUTF8)) {
1508 if (progp->startrune > 255) goto mt_quit; /* can't be found */
1509 p = memchr(s, progp->startrune, j->eol-s);
1510 } else {
1511 p = re8_rune_strchr(s, progp->startrune, j->eol);
1513 } else {
1514 /* case-insensitive */
1515 if (progp->startrune < 128 || (flags&RE9_FLAG_NONUTF8)) {
1516 if (progp->startrune > 255) goto mt_quit; /* can't be found */
1517 for (p = s; p < j->eol; ++p) if (UPPER(((unsigned char)*p)) == progp->startrune) break;
1518 } else {
1519 p = s;
1520 while (p < j->eol) {
1521 rune_size = re9_char2rune(&r, p, j->eol);
1522 if (UPPER(r) == progp->startrune) break;
1523 p += rune_size;
1527 if (p == NULL || p > j->eol) goto mt_quit;
1528 s = p;
1529 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1530 break;
1531 case PRG_STARTS_WITH_CLASS:
1532 opc = (const vmop_class_t *)(progp->code+progp->startrange);
1533 while (s < j->eol) {
1534 r = *(const uint8_t *)s;
1535 rune_size = (r < RE9_RUNE_SELF || (flags&RE9_FLAG_NONUTF8) ? 1 : re9_char2rune(&r, s, j->eol));
1536 if (flags&RE9_FLAG_CASEINSENS) r = UPPER(r);
1537 for (f = 0; f < opc->spi_count; f += 2) if (r >= opc->spi[f] && r <= opc->spi[f+1]) break;
1538 if (f < opc->spi_count) break;
1539 s += rune_size;
1541 if (s >= j->eol) goto mt_quit;
1542 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1543 break;
1544 default:
1545 /* difficult cases */
1546 if ((progp->startflags&PRG_STARTS_WITH_BOL) && s == j->bol) goto difficult_found;
1547 if ((progp->startflags&PRG_STARTS_WITH_EOL) && s == j->eol) goto difficult_found;
1548 opc = (const vmop_class_t *)(progp->code+progp->startrange);
1549 while (s < j->eol) {
1550 if (*s == '\n') {
1551 if (progp->startflags&PRG_STARTS_WITH_BOL) { ++s; break; }
1552 if (progp->startflags&PRG_STARTS_WITH_EOL) break;
1554 if (progp->startflags&(PRG_STARTS_WITH_RUNE|PRG_STARTS_WITH_CLASS)) {
1555 r = *(const uint8_t *)s;
1556 rune_size = (r < RE9_RUNE_SELF || (flags&RE9_FLAG_NONUTF8) ? 1 : re9_char2rune(&r, s, j->eol));
1557 if (flags&RE9_FLAG_CASEINSENS) r = UPPER(r);
1558 if (progp->startflags&PRG_STARTS_WITH_RUNE) {
1559 if (r == progp->startrune) break;
1560 } else {
1561 for (f = 0; f < opc->spi_count; f += 2) if (r >= opc->spi[f] && r <= opc->spi[f+1]) break;
1562 if (f < opc->spi_count) break;
1564 s += rune_size;
1565 } else {
1566 ++s;
1569 if (s > j->eol) goto mt_quit;
1570 difficult_found:
1571 rprev = (s == j->bol ? RE9_RUNE_SPEC_BOL : s[-1]); /*FIXME: previous rune can be UTF-8 one*/
1572 break;
1575 #endif
1576 if (s < j->eol) {
1577 r = *(const uint8_t *)s;
1578 rune_size = (r < RE9_RUNE_SELF || (flags&RE9_FLAG_NONUTF8) ? 1 : re9_char2rune(&r, s, j->eol));
1579 if (flags&RE9_FLAG_CASEINSENS) r = UPPER(r);
1580 } else {
1581 r = RE9_RUNE_SPEC_EOL;
1582 rune_size = 0;
1584 /* switch run lists */
1585 tl.start = j->relist[flag];
1586 tl.end = j->relist[flag]+j->listsize;
1587 tl.free = j->relistf[flag];
1588 nl.start = j->relist[flag^=1];
1589 nl.end = j->relist[flag]+j->listsize;
1590 nl.free = nl.start;
1591 nl.start->inst = LASTINST;
1592 dlogf("*** new loop (flag=%d; mt=%d) %p\n", flag, match, tl.start);
1593 /* add first instruction to current list */
1594 if (match == 0) {
1595 if ((tlxn = re9l_newemptythread(&tl, progp->startinst, ms, s, dont_collapse)) == NULL) return -1;
1597 tlxn = tl.free;
1598 #ifdef RE9_DEBUG
1599 dlogf(" tlxn=%u\n", tlxn-tl.start);
1600 for (tlp = tl.start; tlp < tl.free; ++tlp) dlogf("%d: %u\n", tlp-tl.start, tlp->inst);
1601 #endif
1602 /* execute machine until current list is empty */
1603 if (tl.start->inst == LASTINST) break; /* nothing to do */
1604 #ifdef RE9_CALC_MAXT
1605 if (tl.free-tl.start > maxt) maxt = tl.free-tl.start;
1606 #endif
1607 for (tlp = tl.start; tlp->inst != LASTINST; ++tlp) {
1608 uint32_t ip;
1609 dlogf("* new thread (%d) at %u\n", flag, tlp-tl.start);
1610 #ifdef RE9_CALC_MAXT
1611 if (tlp-tl.start > maxt) maxt = tlp-tl.start;
1612 #endif
1613 for (ip = tlp->inst; ip != LASTINST; ip = ((const vmopcode_t *)(progp->code+ip))->next) {
1614 const vmopcode_t *op = (const vmopcode_t *)(progp->code+ip);
1615 #ifdef RE9_DEBUG
1616 dump_opcode(stderr, progp, ip);
1617 #endif
1618 if (op->opcode < RUNE) {
1619 /* ascii character */
1620 if (op->opcode == r) {
1621 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1623 } else {
1624 switch (op->opcode) {
1625 case RUNE: /* regular character */
1626 if (((const vmop_rune_t *)op)->r == r) {
1627 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1629 break;
1630 case SMARK:
1631 tlp->se.m[((const vmop_mark_t *)op)->subid].sp = s;
1632 continue;
1633 case EMARK:
1634 tlp->se.m[((const vmop_mark_t *)op)->subid].ep = s;
1635 continue;
1636 case ANY:
1637 if ((flags&RE9_FLAG_ANYDOT) == 0 && r == '\n') break;
1638 /* fallthru */
1639 case ANYNL:
1640 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1641 break;
1642 case BOL:
1643 if (s == j->bol || rprev == RE9_RUNE_SPEC_BOL || rprev == '\n') continue;
1644 break;
1645 case EOL:
1646 if (s == j->eol || r == '\n') continue;
1647 break;
1648 case WBOUND:
1649 if (is_word_rune(rprev) != is_word_rune(r)) continue;
1650 break;
1651 case WNBOUND:
1652 if (is_word_rune(rprev) == is_word_rune(r)) continue;
1653 break;
1654 case CCLASS:
1655 opc = (const vmop_class_t *)op;
1656 for (f = 0; f < opc->spi_count; f += 2) {
1657 if (r >= opc->spi[f] && r <= opc->spi[f+1]) {
1658 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1659 break;
1662 break;
1663 case NCCLASS:
1664 opc = (const vmop_class_t *)op;
1665 for (f = 0; f < opc->spi_count; f += 2) if (r >= opc->spi[f] && r <= opc->spi[f+1]) break;
1666 if (f >= opc->spi_count) {
1667 if (re9l_newthread(&nl, op->next, ms, &tlp->se, dont_collapse) == NULL) return -1;
1669 break;
1670 #ifndef RE9_DISABLE_NONGREEDY
1671 case SPLIT_NG:
1672 if (re9l_newthread(&tl, op->next, ms, &tlp->se, 1) == NULL) return -1;
1673 if (re9l_newthread(&tl, ((const vmop_split_t *)op)->alt, ms, &tlp->se, 1) == NULL) return -1;
1674 break;
1675 case SPLIT_G:
1676 if (re9l_newthread(&tl, ((const vmop_split_t *)op)->alt, ms, &tlp->se, 1) == NULL) return -1;
1677 if (flags&RE9_FLAG_NONGREEDY) {
1678 if (re9l_newthread(&tl, op->next, ms, &tlp->se, 1) == NULL) return -1;
1679 } else {
1680 continue;
1682 break;
1683 #else
1684 case SPLIT_G:
1685 if (re9l_newthread(&tl, ((const vmop_split_t *)op)->alt, ms, &tlp->se, 1) == NULL) return -1;
1686 continue;
1687 break;
1688 #endif
1689 case END: /* match! */
1690 match = 1;
1691 tlp->se.m[0].ep = s;
1692 if (mp != 0) re9l_newmatch(mp, ms, &tlp->se);
1693 dlogf("HIT at %u! (%d,%d) (%s)\n", tlp-tl.start, tlp->se.m[0].sp-j->bol, tlp->se.m[0].ep-j->bol, (tlp < tlxn ? "old" : "new"));
1694 #ifndef RE9_DISABLE_NONGREEDY
1695 if (flags&RE9_FLAG_NONGREEDY) {
1696 /* fuck off all 'old' instructions */
1697 if ( tlp < tlxn) {
1698 /* we are in 'old' area, skip it all */
1699 tlp = tlxn-1; /* next one will be 'new' */
1700 } else {
1701 /* we are in 'new' area, stop right here */
1702 tlp[1].inst = LASTINST;
1705 #endif
1706 break;
1707 #ifdef RE9_DEBUG
1708 default:
1709 fprintf(stderr, "lebre9: INTRNAL ERROR: invalid command: 0%03o\n", op->opcode);
1710 #endif
1711 } /* switch */
1712 } /* if */
1713 break;
1716 j->relistf[flag] = nl.free; /* update 'free' pointer */
1717 dlogf("+ free=%u; sz=%d\n", nl.free-nl.start, j->listsize);
1718 if (s == j->eol) break;
1719 #ifndef RE9_DISABLE_LEARNING
1720 checkstart = (progp->startflags != PRG_STARTS_WITH_ANY && nl.start->inst == LASTINST && !match);
1721 #endif
1722 rprev = r;
1723 s += rune_size;
1724 dlogf("+++ (%d) eol=%d\n", r, s == j->eol);
1726 #ifndef RE9_DISABLE_LEARNING
1727 mt_quit:
1728 #endif
1729 #ifdef RE9_CALC_MAXT
1730 fprintf(stderr, "MAXT=%d\n", maxt);
1731 #endif
1732 return match;
1736 static int re9_exec_internal (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms, re9_listitem_t *rl0, re9_listitem_t *rl1, size_t listsz) {
1737 re9_ljunk_t j;
1738 /* use user-specified starting/ending location if specified */
1739 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) {
1740 j.bol = (mp->sp != NULL ? mp->sp : bol);
1741 j.eol = (mp->ep != NULL ? mp->ep : j.bol+strlen(j.bol));
1742 } else {
1743 j.eol = (j.bol = bol)+strlen(bol);
1745 j.flags = flags;
1746 j.listsize = listsz;
1747 j.relist[0] = rl0;
1748 j.relist[1] = rl1;
1749 return regexec1(progp, mp, ms, &j);
1754 * progp: program to run
1755 * bol: string to run machine on
1756 * mp: subexpression elements
1757 * ms: number of elements at mp
1759 int re9_execute_ex (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms, size_t maxmem) {
1760 re9_listitem_t relist[2][LISTSIZE];
1761 const char *b, *e;
1762 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { b = mp[0].sp; e = mp[0].ep; } else { b = e = NULL; }
1763 int rv = re9_exec_internal(progp, flags, bol, mp, ms, relist[0], relist[1], LISTSIZE);
1764 if (rv < 0) {
1765 re9_listitem_t *r[2];
1766 int listsize = BIGLISTSIZE, rv;
1767 if (maxmem == 0) maxmem = 1024*1024*32; /* up to 32MB */
1768 for (;;) {
1769 int f;
1770 for (f = 0; f < 2; ++f) {
1771 if ((r[f] = malloc(listsize*sizeof(re9_listitem_t))) == NULL) {
1772 while (--f >= 0) free(r[f]);
1773 return -1;
1776 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { mp[0].sp = b; mp[0].ep = e; }
1777 rv = re9_exec_internal(progp, flags, bol, mp, ms, r[0], r[1], listsize);
1778 while (--f >= 0) free(r[f]);
1779 if (rv >= 0) break;
1780 if (listsize*2*sizeof(re9_listitem_t)*2 > maxmem) {
1781 size_t nsz = maxmem/(sizeof(re9_listitem_t)*2);
1782 if (nsz <= listsize) break;
1783 listsize = nsz;
1784 } else {
1785 listsize *= 2;
1788 #ifdef RE9_CALC_MAXT
1789 if (rv < 0) fprintf(stderr, "OUT OF MEMORY! listsize=%d\n", listsize/2);
1790 #endif
1792 return rv;
1796 int re9_execute (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms) {
1797 return re9_execute_ex(progp, flags, bol, mp, ms, 0);
1801 /******************************************************************************/
1802 struct re9_prog_prepared_s {
1803 const re9_prog_t *progp;
1804 re9_listitem_t *relist[2];
1805 size_t listsize;
1806 size_t maxmem;
1810 re9_prog_prepared_t *re9_prepare_ex (const re9_prog_t *progp, size_t maxmem) {
1811 re9_prog_prepared_t *res;
1812 int f;
1813 if (progp == NULL) return NULL;
1814 if ((res = malloc(sizeof(*res))) == NULL) return NULL;
1815 if (maxmem == 0) maxmem = 1024*1024*32; /* up to 32MB */
1816 res->progp = progp;
1817 res->listsize = BIGLISTSIZE;
1818 res->maxmem = maxmem;
1819 for (f = 0; f < 2; ++f) {
1820 if ((res->relist[f] = malloc(res->listsize*sizeof(re9_listitem_t))) == NULL) {
1821 while (--f >= 0) free(res->relist[f]);
1822 free(res);
1823 return NULL;
1826 return res;
1830 re9_prog_prepared_t *re9_prepare (const re9_prog_t *progp) {
1831 return re9_prepare_ex(progp, 0);
1835 int re9_prepared_execute (re9_prog_prepared_t *pp, int flags, const char *bol, re9_sub_t *mp, int ms) {
1836 if (pp != NULL) {
1837 int rv, f;
1838 const char *b, *e;
1839 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { b = mp[0].sp; e = mp[0].ep; } else { b = e = NULL; }
1840 for (;;) {
1841 rv = re9_exec_internal(pp->progp, flags, bol, mp, ms, pp->relist[0], pp->relist[1], pp->listsize);
1842 if (rv >= 0 || pp->listsize*sizeof(re9_listitem_t)*2 >= pp->maxmem) break;
1843 if (pp->listsize*2*sizeof(re9_listitem_t)*2 > pp->maxmem) {
1844 size_t nsz = pp->maxmem/(sizeof(re9_listitem_t)*2);
1845 if (nsz <= pp->listsize) break;
1846 pp->listsize = nsz;
1847 } else {
1848 pp->listsize *= 2;
1850 for (f = 0; f < 2; ++f) {
1851 re9_listitem_t *r = realloc(pp->relist[f], pp->listsize*sizeof(re9_listitem_t));
1852 if (r == NULL) break;
1853 pp->relist[f] = r;
1855 if (f < 2) { pp->listsize /= 2; rv = -1; break; }
1856 if ((flags&RE9_FLAG_MT0_RANGE) && mp != NULL) { mp[0].sp = b; mp[0].ep = e; }
1858 #ifdef RE9_CALC_MAXT
1859 if (rv < 0) fprintf(stderr, "OUT OF MEMORY! listsize=%u\n", pp->listsize);
1860 #endif
1861 if (pp->listsize > 32767) {
1862 int f;
1863 #ifdef RE9_CALC_MAXT
1864 fprintf(stderr, "*SHRINK: %u --> 32768\n", pp->listsize);
1865 #endif
1866 pp->listsize = 32768;
1867 for (f = 0; f < 2; ++f) {
1868 re9_listitem_t *r = realloc(pp->relist[f], pp->listsize*sizeof(re9_listitem_t));
1869 if (r == NULL) break;
1870 pp->relist[f] = r;
1873 return rv;
1875 return -1;
1879 void re9_prepared_free (re9_prog_prepared_t *pp) {
1880 if (pp != NULL) {
1881 if (pp->relist[1] != NULL) free(pp->relist[1]);
1882 if (pp->relist[0] != NULL) free(pp->relist[0]);
1883 free(pp);
1888 /******************************************************************************/
1889 /* substitute into one string using the matches from the last re9_execute() */
1890 int re9_sub (char *dp, size_t dlen, const char *sp, const re9_sub_t *mp, int ms) {
1891 int res = 0;
1892 char *dlast;
1893 if (mp == NULL || ms <= 0) ms = 0;
1894 if (dp == NULL) dlen = 0;
1895 dlast = (dlen > 0 ? dp+dlen-1 : NULL);
1896 while (*sp) {
1897 if (*sp == '\\' && (sp[1] == '{' || (sp[1] >= '0' && sp[1] <= '9'))) {
1898 int i = sp[1]-'0';
1899 if (i > 9) {
1900 /* parse {...} */
1901 i = 0;
1902 sp += 2;
1903 while (*sp && *sp != '}') {
1904 if (*sp < '0' || *sp > '9') break;
1905 i = i*10+sp[0]-'0';
1906 ++sp;
1907 if (i > ms) break;
1909 while (*sp && *sp != '}') ++sp;
1910 if (*sp == '}') ++sp;
1911 } else {
1912 sp += 2;
1914 if (i < ms && mp[i].sp != NULL) {
1915 const char *ssp = mp[i].sp;
1916 int len = (int)(mp[i].ep-ssp); /*FIXME: 2GB strings?*/
1917 if (len > 0) {
1918 res += len;
1919 if (len > dlen) len = dlen;
1920 if (len > 0) memcpy(dp, ssp, len);
1921 dlen -= len;
1922 dp += len;
1925 } else {
1926 if (*sp == '\\') {
1927 if (dlen > 0) {
1928 *dp++ = (sp[1] == 'z' ? 0 : sp[1]);
1929 --dlen;
1931 sp += 2;
1932 } else {
1933 if (dlen > 0) { *dp++ = *sp++; --dlen; } else ++sp;
1935 ++res;
1938 if (dlen > 0) *dp = '\0'; else if (dlast != NULL) *dlast = '\0';
1939 return ++res;