2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
15 * heavily modified by Ketmar // Vampire Avalon
29 /******************************************************************************/
30 #define PACKED __attribute__((packed))
32 #define ARRAYLEN(_a) ((sizeof((_a)))/sizeof((_a)[0]))
35 # define dlogf(...) fprintf(stderr, __VA_ARGS__)
40 //#define RE9_DEBUG_PRG
41 //#define RE9_CALC_MAXT
44 /******************************************************************************/
45 typedef uint32_t re9_rune
;
49 PRG_STARTS_WITH_ANY
= 0x00,
50 PRG_STARTS_WITH_BOL
= 0x01,
51 PRG_STARTS_WITH_EOL
= 0x02,
52 PRG_STARTS_WITH_RUNE
= 0x04,
53 PRG_STARTS_WITH_CLASS
= 0x08
58 uint8_t *code
; /* vm code */
60 uint32_t code_alloted
;
61 uint32_t startinst
; /* start pc */
62 #ifndef RE9_DISABLE_LEARNING
63 uint8_t startflags
; /* PRG_START_WITH_xxx */
65 uint32_t startrange
; /* offset of (vmop_class_t *) with starting range if PRG_STARTS_WITH_CLASS set or re9_rune if PRG_STARTS_WITH_RUNE set */
69 int flags
; /* RE9_FLAG_CASEINSENS, RE9_FLAG_NONGREEDY will be set if there were some non-greedy splits */
74 /* note that CLASS opcode is variable-length */
75 typedef struct PACKED
{
76 uint32_t opcode
:8; /* 0..126: literal char; 127: RUNE; 0xc0..0xff (0300..0377): opcodes */
77 uint32_t next
:24; /* offset of next opcode */
82 typedef struct PACKED
{
89 typedef struct PACKED
{
91 uint32_t alt
; /* offset of 'alternate' opcode */
96 typedef struct PACKED
{
98 uint8_t subid
; /* bit 7 is RESERVED! 127 subgroups should be enough for everyone */
103 typedef struct PACKED
{
105 uint16_t spi_count
; /* number of items in spi */
106 re9_rune spi
[]; /* span data: spi_count*sizeof(spi[0]); sorted in ascending order */
110 /******************************************************************************/
112 * Actions and Tokens (re9_inst_t types)
114 * 02xx are operators, value == precedence
115 * 03xx are tokens, i.e. operands for operators
117 #define RUNE 0177 /* literal rune */
119 #define OPERATOR 0200 /* bitmask of all operators */
120 #define START 0200 /* start, used for marker on stack */
121 #define RBRA 0201 /* right bracket */
122 #define LBRA 0202 /* left bracket */
123 #define ALT 0203 /* alternation */
124 #define CAT 0204 /* concatentation, implicit operator */
125 #define STAR 0205 /* closure, *; bit 8 set: non-greedy */
126 #define PLUS 0206 /* a+ == aa*; bit 8 set: non-greedy */
127 #define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's; bit 8 set: non-greedy */
129 #define NOP 0300 /* no operation, internal use only (will be skipped by optimizer) */
130 #define ANY 0301 /* (and token); any character except newline */
131 #define ANYNL 0302 /* (and token); any character including newline */
132 #define BOL 0303 /* (and token); beginning of line */
133 #define EOL 0304 /* (and token); end of line */
134 #define CCLASS 0305 /* (and token); character class */
135 #define NCCLASS 0306 /* (and token); negated character class */
136 #define WBOUND 0307 /* word boundary assertion (generated by optimizer from rune) */
137 #define WNBOUND 0310 /* word non-boundary assertion (generated by optimizer from rune) */
138 #define SPLIT_G 0311 /* greedy split */
139 #define SPLIT_NG 0312 /* vmop; non-greedy split */
140 #define SMARK 0313 /* vmop; group start mark */
141 #define EMARK 0314 /* vmop; group end mark */
142 /* special splits for '*' -- should be ignored by learn_start() and replaced by optimize() */
146 #define END 0377 /* vmop; terminate: match found */
149 /******************************************************************************/
150 static inline uint32_t vmop_size (const void *op
) {
151 switch (((const vmopcode_t
*)op
)->opcode
) {
153 return sizeof(vmop_rune_t
);
154 case SPLIT_G
: case SPLIT_NG
:
155 return sizeof(vmop_split_t
);
156 case SMARK
: case EMARK
:
157 return sizeof(vmop_mark_t
);
158 case CCLASS
: case NCCLASS
:
159 return sizeof(vmop_class_t
)+((const vmop_class_t
*)op
)->spi_count
*sizeof(((const vmop_class_t
*)op
)->spi
[0]);
162 return sizeof(vmopcode_t
);
166 /******************************************************************************/
167 #if defined(RE9_DEBUG_PRG) || defined(RE9_DEBUG)
168 /* return 0 if pc is out of range or next opcode pc (NOT next pointer from vmopcode_t!) */
169 static void dump_char (FILE *fo
, re9_rune r
) {
170 if (r
>= 32 && r
< 127 && r
!= '\'' && r
!= '\\') {
171 fprintf(fo
, "%c", r
);
173 fprintf(fo
, "\\u%04x", r
);
177 static uint32_t dump_opcode (FILE *fo
, const re9_prog_t
*pp
, uint32_t pc
) {
178 const vmopcode_t
*op
;
179 const vmop_class_t
*opc
;
181 if (pc
>= pp
->code_used
) return 0;
182 fprintf(fo
, "%05u: 0%03o ", pc
, pp
->code
[pc
]);
184 if (pp
->code
[pc
] == END
) {
185 fprintf(fo
, " END\n");
186 return pc
+vmop_size((void *)(pp
->code
+pc
));
188 if (pc
+sizeof(vmopcode_t
) > pp
->code_used
) {
189 fprintf(fo
, "???: INVALID CODE!\n");
192 op
= (const vmopcode_t
*)(pp
->code
+pc
);
193 if (pc
+vmop_size(op
) > pp
->code_used
) {
194 fprintf(fo
, "???: INVALID CODE!\n");
197 fprintf(fo
, "%05u ", op
->next
);
199 switch (op
->opcode
) {
200 case NOP
: fprintf(fo
, "NOP\n"); break;
201 case ANY
: fprintf(fo
, "ANY\n"); break;
202 case ANYNL
: fprintf(fo
, "ANYNL\n"); break;
203 case BOL
: fprintf(fo
, "BOL\n"); break;
204 case EOL
: fprintf(fo
, "EOL\n"); break;
205 case WBOUND
: fprintf(fo
, "WBOUND\n"); break;
206 case WNBOUND
: fprintf(fo
, "WNBOUND\n"); break;
208 fprintf(fo
, "RUNE '");
209 dump_char(fo
, ((const vmop_rune_t
*)op
)->r
);
212 case CCLASS
: case NCCLASS
:
213 opc
= (const vmop_class_t
*)op
;
214 fprintf(fo
, "%-8s [", (op
->opcode
== CCLASS
? "CCLASS" : "NCCLASS"));
215 for (f
= 0; f
< opc
->spi_count
; f
+= 2) {
216 dump_char(fo
, opc
->spi
[f
]);
217 if (opc
->spi
[f
] != opc
->spi
[f
+1]) {
219 dump_char(fo
, opc
->spi
[f
+1]);
224 case SPLIT_G
: case SPLIT_NG
:
225 fprintf(fo
, "%-8s %05u\n", (op
->opcode
== SPLIT_G
? "SPLIT_G" : "SPLIT_NG"), ((const vmop_split_t
*)op
)->alt
);
227 case STAR_G
: case STAR_NG
:
228 fprintf(fo
, "%-8s %05u\n", (op
->opcode
== SPLIT_G
? "STAR_G" : "STAR_NG"), ((const vmop_split_t
*)op
)->alt
);
230 case SMARK
: case EMARK
:
231 fprintf(fo
, "%-8s %u\n", (op
->opcode
== SMARK
? "SMARK" : "EMARK"), ((const vmop_mark_t
*)op
)->subid
);
234 if (op
->opcode
< RUNE
) {
235 fprintf(fo
, "RUNE '");
236 dump_char(fo
, op
->opcode
);
240 fprintf(fo
, "INVALID OPCODE!\n");
243 return pc
+vmop_size(op
);
247 static void dump (const re9_prog_t
*pp
) {
249 fprintf(stderr
, "code size: %u\n", pp
->code_used
);
250 while (pc
< pp
->code_used
) if ((pc
= dump_opcode(stderr
, pp
, pc
)) == 0) break;
255 /******************************************************************************/
258 #ifdef RE9_UNICODE_CASE
261 unsigned short code
; /* code point */
262 unsigned short lower
; /* code */
263 unsigned short upper
; /* code */
266 #include "re9_unicode_mapping.c"
269 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
271 static int cmp_casemap (const void *key
, const void *cm
) {
272 return *(const int *)key
-(int)((const struct casemap
*)cm
)->code
;
276 static inline int utf8_map_case (re9_rune uc
, int upper
) {
277 const struct casemap
*cm
= bsearch(&uc
, unicode_case_mapping
, NUMCASEMAP
, sizeof(*unicode_case_mapping
), cmp_casemap
);
278 return (cm
!= NULL
? (upper
? cm
->upper
: cm
->lower
) : uc
);
283 * returns the upper-case variant of the given unicode codepoint
284 * does not support unicode code points > \uffff
286 static inline re9_rune
utf8_upper (re9_rune uc
) {
287 return (uc
< 256 ? (uc
< 128 ? toupper(uc
) : uc
) : utf8_map_case(uc
, 1));
292 * returns the lower-case variant of the given unicode codepoint.
293 * does not support unicode code points > \uffff
296 static inline int utf8_lower (int uc) {
297 return (uc < 256 tolower(uc) : utf8_map_case(uc, 0));
300 #define UPPER(_r) utf8_upper(_r)
304 #define UPPER(_r) ((_r) < 128 ? toupper(_r) : (_r))
310 RE9_UTF_MAX_BYTES
= 4, /* maximum bytes per rune */
311 RE9_RUNE_SYNC
= 0x80, /* cannot represent part of a UTF sequence (<) */
312 RE9_RUNE_SELF
= 0x80, /* rune and UTF sequences are the same (<) */
313 RE9_RUNE_ERROR
= 0xFFFD, /* decoding error in UTF */
314 RE9_RUNE_MAX
= 0x10FFFF, /* maximum rune value */
316 RE9_RUNE_SPEC_EOL
= RE9_RUNE_MAX
+1,
317 RE9_RUNE_SPEC_NOTHING
= RE9_RUNE_MAX
+2,
318 RE9_RUNE_SPEC_WBOUND
= RE9_RUNE_MAX
+3,
319 RE9_RUNE_SPEC_WNBOUND
= RE9_RUNE_MAX
+4,
320 RE9_RUNE_SPEC_BOL
= RE9_RUNE_MAX
+5,
321 RE9_RUNE_SPEC_INVALID
= RE9_RUNE_MAX
+6
325 static const uint8_t utf8_clen
[128] = {
326 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x00-0x0f */
327 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x10-0x1f */
328 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x20-0x2f */
329 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x30-0x3f */
330 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x40-0x4f */
331 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x50-0x5f */
332 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x60-0x6f */
333 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x70-0x7f */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8f */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9f */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xa0-0xaf */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xb0-0xbf */
338 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xc0-0xcf c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127 */
339 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xd0-0xdf */
340 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33, /* 0xe0-0xef */
341 0x44,0x44,0x44,0x44,0x00,0x00,0x00,0x00, /* 0xf0-0xff */
346 static inline int re9_char2rune (re9_rune
*rune
, const char *str
, const char *eol
) {
347 if ((intptr_t)(eol
-str
) > 0) {
349 uint8_t c
= (unsigned char)(*str
++), len
= (utf8_clen
[c
>>1]>>((c
&0x01)*4))&0x0f, res
= len
, oc
= c
;
350 if (len
== 0 || eol
-str
+1 < len
) { *rune
= RE9_RUNE_ERROR
; return 1; }
351 else if (len
== 1) { *rune
= c
; return 1; }
354 if (((c
= (unsigned char)(*str
++))&0xc0) != 0x80) { *rune
= RE9_RUNE_ERROR
; return 1; }
357 if ((oc
== 0xc0 || oc
== 0xc1) && r
!= 0) r
= RE9_RUNE_ERROR
; /* overlong encoding: only 'modified utf-8' zero rune allowed here */
358 if ((r
>= 0xd800 && r
<= 0xdfff) || // utf16/utf32 surrogates
359 (r
>= 0xfdd0 && r
<= 0xfdef) || // just for fun
360 (r
>= 0xfffe && r
<= 0xffff) || // fuck BOMs
361 r
> 0x10ffff) { r
= RE9_RUNE_ERROR
; /*res = 1;*/ } // bad unicode
365 *rune
= RE9_RUNE_SPEC_EOL
;
371 static inline const char *re8_rune_strchr (const char *s
, re9_rune c
, const char *eol
) {
372 if (s
>= eol
) return NULL
;
373 /* not part of utf sequence? */
374 if (c
< RE9_RUNE_SYNC
) return memchr(s
, c
, eol
-s
);
377 int len
= re9_char2rune(&r
, s
, eol
);
378 if (r
== c
) return s
;
385 /******************************************************************************/
394 /* parser information */
396 /*re9_inst_t*/uint32_t first
;
397 /*re9_inst_t*/uint32_t last
;
407 re9_node_t andstack
[NSTACK
];
410 int atorstack
[NSTACK
];
413 int cursubid
; /* id of current subexpression */
414 int subidstack
[NSTACK
]; /* parallel to atorstack */
417 int last_was_operand
; /* last token was operand */
418 int nbra
; /* number of opened parens */
419 const char *expr
; /* pointer to next character in source expression */
420 const char *expr_eol
;
421 int flags
; /* parser flags */
423 int comment_level
; /* >0: in comment; counts parens */
424 int return_rune_nothing
; /* return 'nothing' rune from next lex()? will be reset by lex() */
425 re9_rune yyrune
; /* last lex'd rune */
426 /* last lex'd class */
429 uint32_t yyc_alloted
;
432 const char *errormsg
;
436 /******************************************************************************/
437 static __attribute__((noreturn
)) void rcerror (re9_compiler_t
*ci
, const char *s
) {
439 longjmp(ci
->regkaboom
, 1);
443 /******************************************************************************/
444 static void add_to_class (re9_compiler_t
*ci
, re9_rune r
) {
445 if (ci
->comment_level
== 0) {
446 if (ci
->yyc_used
+1 > ci
->yyc_alloted
) {
447 uint32_t newsz
= ((ci
->yyc_used
+1)|0x1f)+1;
448 re9_rune
*newr
= realloc(ci
->yyclass
, newsz
*sizeof(ci
->yyclass
[0]));
449 if (newr
== NULL
) rcerror(ci
, "out of memory");
451 ci
->yyc_alloted
= newsz
;
453 ci
->yyclass
[ci
->yyc_used
++] = r
;
458 static void add_to_class_two (re9_compiler_t
*ci
, re9_rune r0
, re9_rune r1
) {
459 if (ci
->comment_level
== 0) {
460 if (ci
->yyc_used
+2 > ci
->yyc_alloted
) {
461 uint32_t newsz
= ((ci
->yyc_used
+2)|0x1f)+1;
462 re9_rune
*newr
= realloc(ci
->yyclass
, newsz
*sizeof(ci
->yyclass
[0]));
463 if (newr
== NULL
) rcerror(ci
, "out of memory");
465 ci
->yyc_alloted
= newsz
;
467 ci
->yyclass
[ci
->yyc_used
++] = r0
;
468 ci
->yyclass
[ci
->yyc_used
++] = r1
;
473 /******************************************************************************/
474 /* return 1 if char is quoted */
475 static int nextc (re9_compiler_t
*ci
, re9_rune
*rp
) {
476 if (ci
->expr
>= ci
->expr_eol
) { *rp
= RE9_RUNE_SPEC_EOL
; return 1; }
477 if ((ci
->flags
&RE9_FLAG_NONUTF8
) == 0) {
478 if (ci
->expr
[0] == '\\' && ci
->expr
+1 < ci
->expr_eol
) {
479 ci
->expr
+= re9_char2rune(rp
, ci
->expr
+1, ci
->expr_eol
)+1;
482 ci
->expr
+= re9_char2rune(rp
, ci
->expr
, ci
->expr_eol
);
485 if (ci
->expr
[0] == '\\' && ci
->expr
+1 < ci
->expr_eol
) {
486 *rp
= (unsigned char)(ci
->expr
[1]);
490 *rp
= (uint8_t)(*ci
->expr
++);
491 if (ci
->expr
> ci
->expr_eol
) ci
->expr
= ci
->expr_eol
;
494 if (ci
->flags
&RE9_FLAG_CASEINSENS
) *rp
= UPPER(*rp
);
499 static void addmeta_space (re9_compiler_t
*ci
, int neg
) {
501 add_to_class_two(ci
, '\x9', '\xd');
502 add_to_class_two(ci
, ' ', ' ');
504 add_to_class_two(ci
, '\1', '\x9'-1);
505 add_to_class_two(ci
, '\xd'+1, ' '-1);
506 add_to_class_two(ci
, ' '+1, RE9_RUNE_MAX
);
511 static void addmeta_digit (re9_compiler_t
*ci
, int neg
) {
513 add_to_class_two(ci
, '0', '9');
515 add_to_class_two(ci
, '\1', '0'-1);
516 add_to_class_two(ci
, '9'+1, RE9_RUNE_MAX
);
521 static void addmeta_word (re9_compiler_t
*ci
, int neg
) {
523 add_to_class_two(ci
, '0', '9');
524 add_to_class_two(ci
, 'A', 'Z');
525 add_to_class_two(ci
, '_', '_');
526 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) add_to_class_two(ci
, 'a', 'z');
528 add_to_class_two(ci
, '\0', '0'-1);
529 add_to_class_two(ci
, '9'+1, 'A'-1);
530 add_to_class_two(ci
, 'Z'+1, '_'-1);
531 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) {
532 add_to_class_two(ci
, '_'+1, 'a'-1);
533 add_to_class_two(ci
, 'z'+1, RE9_RUNE_MAX
);
535 add_to_class_two(ci
, '_'+1, RE9_RUNE_MAX
);
541 #ifndef RE9_DISABLE_POSIX_CLASSES
542 static void addmeta_upper (re9_compiler_t
*ci
) {
543 add_to_class_two(ci
, 'A', 'Z');
547 static void addmeta_lower (re9_compiler_t
*ci
) {
548 if (ci
->flags
&RE9_FLAG_CASEINSENS
) {
549 add_to_class_two(ci
, 'A', 'Z');
551 add_to_class_two(ci
, 'a', 'z');
556 static void addmeta_alpha (re9_compiler_t
*ci
) {
557 add_to_class_two(ci
, 'A', 'Z');
558 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) add_to_class_two(ci
, 'a', 'z');
562 static void addmeta_xdigit (re9_compiler_t
*ci
) {
563 add_to_class_two(ci
, '0', '9');
564 add_to_class_two(ci
, 'A', 'F');
565 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) add_to_class_two(ci
, 'a', 'f');
569 static void addmeta_punct (re9_compiler_t
*ci
) {
570 add_to_class_two(ci
, 33, 47); // !"#$%&'()*+,-./
571 add_to_class_two(ci
, 58, 64); // :;<=>?@
572 add_to_class_two(ci
, 91, 96); // [\]^_`
573 add_to_class_two(ci
, 123, 126); // {|}~
577 /*FIXME:??? this should include '\z'! */
578 static void addmeta_ascii (re9_compiler_t
*ci
) {
579 add_to_class_two(ci
, '\1', '\x7f');
583 static void addmeta_blank (re9_compiler_t
*ci
) {
584 add_to_class_two(ci
, '\t', '\t');
585 add_to_class_two(ci
, ' ', ' ');
589 /*FIXME:??? this should include '\z'! */
590 static void addmeta_ctrl (re9_compiler_t
*ci
) {
591 add_to_class_two(ci
, '\1', '\x1f');
592 add_to_class_two(ci
, '\x7f', '\x7f');
596 static void addmeta_graph (re9_compiler_t
*ci
) {
597 add_to_class_two(ci
, '!', '\x7e');
601 static void addmeta_print (re9_compiler_t
*ci
) {
602 add_to_class_two(ci
, ' ', '\x7e');
607 static void normalize_spans (re9_compiler_t
*ci
) {
608 if (ci
->yyc_used
> 2) {
609 /* we have at least two spans */
611 re9_rune
*ep
= ci
->yyclass
+ci
->yyc_used
;
613 /* sort on span start */
614 /* yeah, old good bubble sorting */
615 for (p
= ci
->yyclass
; p
< ep
; p
+= 2) {
616 for (np
= p
; np
< ep
; np
+= 2) {
629 for (cp
= 2; cp
< ci
->yyc_used
; ) {
631 /* if next span start is inside previous span, merge both spans */
632 //fprintf(stderr, "prev:(%u,%u); curr:(%u,%u)\n", ci->yyclass[cp-2], ci->yyclass[cp-1], ci->yyclass[cp], ci->yyclass[cp+1]);
633 if (ci
->yyclass
[cp
] <= ci
->yyclass
[cp
-1]) {
635 if (ci
->yyclass
[cp
+1] > ci
->yyclass
[cp
-1]) ci
->yyclass
[cp
-1] = ci
->yyclass
[cp
+1];
636 /* remove this span */
637 for (f
= cp
+2; f
< ci
->yyc_used
; ++f
) ci
->yyclass
[f
-2] = ci
->yyclass
[f
];
639 } else if (ci
->yyclass
[cp
-1]+1 == ci
->yyclass
[cp
]) {
640 /* we can join this spans */
641 ci
->yyclass
[cp
-1] = ci
->yyclass
[cp
+1];
642 /* remove this span */
643 for (f
= cp
+2; f
< ci
->yyc_used
; ++f
) ci
->yyclass
[f
-2] = ci
->yyclass
[f
];
654 /* parse character range */
655 static int bldcclass (re9_compiler_t
*ci
) {
659 /* we have already seen the '[' */
662 /* look ahead for negation */
663 /* SPECIAL CASE!!! negated classes don't match \n */
664 quoted
= nextc(ci
, &rune
);
665 if (!quoted
&& rune
== '^') {
667 quoted
= nextc(ci
, &rune
);
668 add_to_class_two(ci
, '\n', '\n');
670 /* parse class into a set of spans */
671 for (;; quoted
= nextc(ci
, &rune
)) {
672 if (rune
== RE9_RUNE_SPEC_EOL
) rcerror(ci
, "malformed '[]'");
673 if (!quoted
&& rune
== ']' && (ci
->yyc_used
&0x01) == 0) break;
674 if (quoted
&& ((rune
>= '0' && rune
<= '9') || (rune
>= 'a' && rune
<= 'z') || (rune
>= 'A' && rune
<= 'Z'))) {
676 if ((ci
->yyc_used
&0x01) != 0 || (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '-')) rcerror(ci
, "malformed '[]'"); /* metacharacter can't be used in rangedef */
678 case 'd': case 'D': addmeta_digit(ci
, (rune
<= 'Z')); break;
679 case 's': case 'S': addmeta_space(ci
, (rune
<= 'Z')); break;
680 case 'w': case 'W': addmeta_word(ci
, (rune
<= 'Z')); break;
681 case 'z': add_to_class_two(ci
, '\0', '\0'); break;
682 case 'Z': add_to_class_two(ci
, '\1', RE9_RUNE_MAX
); break;
684 rcerror(ci
, "invalid metacharacter in '[]'");
687 #ifndef RE9_DISABLE_POSIX_CLASSES
688 } else if ((ci
->yyc_used
&0x01) == 0 && !quoted
&& rune
== '[' && ci
->expr
+6 < ci
->expr_eol
&& ci
->expr
[0] == ':') {
689 if (ci
->expr
+7 < ci
->expr_eol
) {
690 if (strncmp(ci
->expr
, ":alnum:]", 8) == 0) { ci
->expr
+= 8; addmeta_alpha(ci
); addmeta_digit(ci
, 0); }
691 else if (strncmp(ci
->expr
, ":alpha:]", 8) == 0) { ci
->expr
+= 8; addmeta_alpha(ci
); }
692 else if (strncmp(ci
->expr
, ":ascii:]", 8) == 0) { ci
->expr
+= 8; addmeta_ascii(ci
); }
693 else if (strncmp(ci
->expr
, ":blank:]", 8) == 0) { ci
->expr
+= 8; addmeta_blank(ci
); }
694 else if (strncmp(ci
->expr
, ":cntrl:]", 8) == 0) { ci
->expr
+= 8; addmeta_ctrl(ci
); }
695 else if (strncmp(ci
->expr
, ":digit:]", 8) == 0) { ci
->expr
+= 8; addmeta_digit(ci
, 0); }
696 else if (strncmp(ci
->expr
, ":graph:]", 8) == 0) { ci
->expr
+= 8; addmeta_graph(ci
); }
697 else if (strncmp(ci
->expr
, ":lower:]", 8) == 0) { ci
->expr
+= 8; addmeta_lower(ci
); }
698 else if (strncmp(ci
->expr
, ":print:]", 8) == 0) { ci
->expr
+= 8; addmeta_print(ci
); }
699 else if (strncmp(ci
->expr
, ":punct:]", 8) == 0) { ci
->expr
+= 8; addmeta_punct(ci
); }
700 else if (strncmp(ci
->expr
, ":space:]", 8) == 0) { ci
->expr
+= 8; addmeta_space(ci
, 0); }
701 else if (strncmp(ci
->expr
, ":upper:]", 8) == 0) { ci
->expr
+= 8; addmeta_upper(ci
); }
702 } else if (strncmp(ci
->expr
, ":word:]", 7) == 0) { ci
->expr
+= 7; addmeta_word(ci
, 0); }
703 else if (ci
->expr
+8 < ci
->expr_eol
&& strncmp(ci
->expr
, ":xdigit:]", 9) == 0) { ci
->expr
+= 9; addmeta_xdigit(ci
); }
706 if (ci
->yyc_used
&0x01 && rune
< ci
->yyclass
[ci
->yyc_used
-1]) rcerror(ci
, "invalid range in '[]'");
707 add_to_class(ci
, rune
);
708 if (ci
->yyc_used
&0x01) {
709 /* this was first char of possible range */
710 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '-') {
711 /* rangedef; skip '-' */
714 /* one char: convert it to range */
715 add_to_class(ci
, rune
);
720 /* sort and store only if we are not in comment */
721 if (ci
->comment_level
== 0) {
722 if (ci
->yyc_used
&0x01) rcerror(ci
, "THE THING THAT SHOULD NOT BE: character class parsing error");
723 if (ci
->yyc_used
== 0) rcerror(ci
, "empty '[]'");
725 /* if we have only one char in range, we can use RUNE instead */
726 if (type
== CCLASS
&& ci
->yyc_used
== 2 && ci
->yyclass
[0] == ci
->yyclass
[1]) {
727 /* yeah, one char! */
728 ci
->yyrune
= ci
->yyclass
[0];
736 /* parse metacharacter */
737 static int bldmeta (re9_compiler_t
*ci
, re9_rune rune
) {
738 int rangeop
= (rune
>= 'a' && rune
<= 'z' ? CCLASS
: NCCLASS
);
741 case 'd': case 'D': addmeta_digit(ci
, 0); break;
742 case 's': case 'S': addmeta_space(ci
, 0); break;
743 case 'w': case 'W': addmeta_word(ci
, 0); break;
744 case 'Z': add_to_class_two(ci
, '\0', '\0'); break;
745 case 'z': ci
->yyrune
= 0; return RUNE
;
746 case 'b': ci
->yyrune
= RE9_RUNE_SPEC_WBOUND
; return RUNE
;
747 case 'B': ci
->yyrune
= RE9_RUNE_SPEC_WNBOUND
; return RUNE
;
748 default: rcerror(ci
, "invalid metacharacter");
755 static int lex (re9_compiler_t
*ci
) {
756 if (ci
->return_rune_nothing
) {
757 ci
->return_rune_nothing
= 0;
758 ci
->yyrune
= RE9_RUNE_SPEC_NOTHING
;
759 } else if (ci
->expr
>= ci
->expr_eol
) {
760 ci
->yyrune
= RE9_RUNE_SPEC_EOL
;
761 } else if (ci
->flags
&RE9_FLAG_LITERAL
) {
762 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '\\') {
766 /* can't be quoted */
767 nextc(ci
, &ci
->yyrune
);
770 int quoted
= nextc(ci
, &ci
->yyrune
);
772 /* check for metacharacter */
773 if ((ci
->yyrune
>= '0' && ci
->yyrune
<= '9') || (ci
->yyrune
>= 'a' && ci
->yyrune
<= 'z') || (ci
->yyrune
>= 'A' && ci
->yyrune
<= 'Z')) return bldmeta(ci
, ci
->yyrune
);
775 switch (ci
->yyrune
) {
776 case '*': quoted
= STAR
; goto closure
;
777 case '?': quoted
= QUEST
; goto closure
;
778 case '+': quoted
= PLUS
; /* fallthru */
779 closure
: if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '?') {
780 #ifndef RE9_DISABLE_NONGREEDY
784 if ((ci
->flags
&RE9_FLAG_NONGREEDY
) == 0) rcerror(ci
, "non-greedy closure support was disabled at compile time");
788 case '|': return ALT
;
789 case '.': return (ci
->flags
&RE9_FLAG_ANYDOT
? ANYNL
: ANY
);
790 case '(': return LBRA
;
791 case ')': return RBRA
;
792 case '^': return BOL
;
793 case '$': return EOL
;
794 case '[': return bldcclass(ci
);
798 return (ci
->yyrune
!= RE9_RUNE_SPEC_EOL
? RUNE
: END
);
802 /******************************************************************************/
803 static inline uint32_t instofs (const re9_compiler_t
*ci
, const vmopcode_t
*op
) {
804 return (int32_t)(((const uint8_t *)op
)-ci
->pp
->code
);
808 static inline vmopcode_t
*instptr (re9_compiler_t
*ci
, uint32_t ofs
) {
809 return (vmopcode_t
*)(ci
->pp
->code
+ofs
);
813 /* push instructions to 'operand stack' */
814 static void push_operand (re9_compiler_t
*ci
, vmopcode_t
*f
, vmopcode_t
*l
) {
815 if (ci
->andp
>= &ci
->andstack
[NSTACK
]) rcerror(ci
, "THE THING THAT SHOULD NOT BE: operand stack overflow");
816 ci
->andp
->first
= instofs(ci
, f
);
817 ci
->andp
->last
= instofs(ci
, l
);
822 /* push token and group id to 'operator stack' */
823 static void push_operator (re9_compiler_t
*ci
, int t
, int csi
) {
824 if (ci
->atorp
>= &ci
->atorstack
[NSTACK
]) rcerror(ci
, "THE THING THAT SHOULD NOT BE: operator stack overflow");
830 /* pop instriction from 'operand stack' */
831 static re9_node_t
*pop_operand (re9_compiler_t
*ci
) {
832 if (ci
->andp
<= &ci
->andstack
[0]) rcerror(ci
, "missing operand");
837 /* pop token from 'operator stack' */
838 static int pop_operator (re9_compiler_t
*ci
) {
839 if (ci
->atorp
<= &ci
->atorstack
[0]) rcerror(ci
, "THE THING THAT SHOULD NOT BE: operator stack underflow");
845 /******************************************************************************/
846 static void *allot (re9_compiler_t
*ci
, size_t size
) {
848 if (ci
->pp
->code_used
+size
> ci
->pp
->code_alloted
) {
849 uint32_t newsz
= ((ci
->pp
->code_used
+size
)|0xfff)+1;
850 uint8_t *newc
= realloc(ci
->pp
->code
, newsz
);
851 if (newc
== NULL
) rcerror(ci
, "out of memory");
852 memset(newc
+ci
->pp
->code_used
, 0, newsz
-ci
->pp
->code_used
);
854 ci
->pp
->code_alloted
= newsz
;
856 res
= ci
->pp
->code
+ci
->pp
->code_used
;
857 ci
->pp
->code_used
+= size
;
862 /* allocate new instruction */
863 static inline vmopcode_t
*newinst (re9_compiler_t
*ci
, int t
) {
864 vmopcode_t
*op
= allot(ci
, sizeof(*op
));
870 static vmopcode_t
*newinst_class (re9_compiler_t
*ci
, int t
) {
871 vmop_class_t
*op
= allot(ci
, sizeof(*op
)+ci
->yyc_used
*sizeof(op
->spi
[0]));
873 if (ci
->yyc_used
> 0) memcpy(op
->spi
, ci
->yyclass
, ci
->yyc_used
*sizeof(op
->spi
[0]));
874 op
->spi_count
= ci
->yyc_used
;
875 if (ci
->yyc_used
&0x01) rcerror(ci
, "THE THING THAT SHOULD NOT BE: newinst_class() fucked!");
876 return (vmopcode_t
*)op
;
880 static vmopcode_t
*newinst_rune (re9_compiler_t
*ci
, re9_rune r
) {
881 if (r
< RUNE
|| r
> RE9_RUNE_MAX
) {
882 vmopcode_t
*op
= allot(ci
, sizeof(*op
));
884 case RE9_RUNE_SPEC_NOTHING
: op
->opcode
= NOP
; break;
885 case RE9_RUNE_SPEC_WBOUND
: op
->opcode
= WBOUND
; break;
886 case RE9_RUNE_SPEC_WNBOUND
: op
->opcode
= WNBOUND
; break;
892 rcerror(ci
, "THE THING THAT SHOULD NOT BE: newinst_rune() fucked!");
896 vmop_rune_t
*op
= allot(ci
, sizeof(*op
));
897 op
->opc
.opcode
= RUNE
;
899 return (vmopcode_t
*)op
;
904 static vmopcode_t
*newinst_mark (re9_compiler_t
*ci
, int t
, int id
) {
906 vmop_mark_t
*op
= allot(ci
, sizeof(*op
));
907 if (id
> 127) rcerror(ci
, "too many captures");
910 return (vmopcode_t
*)op
;
912 vmopcode_t
*op
= allot(ci
, sizeof(*op
));
919 static inline vmop_split_t
*newinst_split (re9_compiler_t
*ci
, int t
) {
920 vmop_split_t
*op
= allot(ci
, sizeof(*op
));
921 #ifndef RE9_DISABLE_NONGREEDY
922 op
->opc
.opcode
= ((ci
->flags
&RE9_FLAG_NONGREEDY
) == 0 ? (t
&0x100 ? SPLIT_NG
: SPLIT_G
) : (t
&0x100 ? SPLIT_G
: SPLIT_NG
));
923 if ((t
&0xff) == STAR
) op
->opc
.opcode
= (op
->opc
.opcode
== SPLIT_G
? STAR_G
: STAR_NG
);
925 op
->opc
.opcode
= ((t
&0xff) == STAR
? STAR_G
: SPLIT_G
);
931 static inline vmop_split_t
*newinst_split1 (re9_compiler_t
*ci
, int t
) {
932 vmop_split_t
*op
= allot(ci
, sizeof(*op
));
938 /* process instructions with higher current priority levels */
939 static void evaluntil (re9_compiler_t
*ci
, int pri
) {
940 while (pri
== RBRA
|| (ci
->atorp
[-1]&0xff) >= (pri
&0xff)) {
941 re9_node_t
*op1
, *op2
;
942 vmopcode_t
*inst1
, *inst2
;
944 int op
= pop_operator(ci
);
946 case LBRA
: /* must have been RBRA */
947 op1
= pop_operand(ci
);
948 inst2
= newinst_mark(ci
, EMARK
, *ci
->subidp
);
949 instptr(ci
, op1
->last
)->next
= instofs(ci
, inst2
);
950 inst1
= newinst_mark(ci
, SMARK
, *ci
->subidp
);
951 inst1
->next
= op1
->first
;
952 push_operand(ci
, inst1
, inst2
);
955 op2
= pop_operand(ci
);
956 op1
= pop_operand(ci
);
957 inst2
= newinst(ci
, NOP
);
958 instptr(ci
, op2
->last
)->next
= instptr(ci
, op1
->last
)->next
= instofs(ci
, inst2
);
959 si1
= newinst_split1(ci
, SPLIT_G
);
960 si1
->alt
= op1
->first
;
961 si1
->opc
.next
= op2
->first
;
962 push_operand(ci
, (void *)si1
, inst2
);
965 op2
= pop_operand(ci
);
966 op1
= pop_operand(ci
);
967 instptr(ci
, op1
->last
)->next
= op2
->first
;
968 push_operand(ci
, instptr(ci
, op1
->first
), instptr(ci
, op2
->last
));
970 case STAR
: case STAR
|0x100: /* greedy and non-greedy */
971 op2
= pop_operand(ci
);
972 si1
= newinst_split(ci
, op
);
973 instptr(ci
, op2
->last
)->next
= instofs(ci
, (void *)si1
);
974 si1
->alt
= op2
->first
;
975 push_operand(ci
, (void *)si1
, (void *)si1
);
977 case PLUS
: case PLUS
|0x100: /* greedy and non-greedy */
978 op2
= pop_operand(ci
);
979 si1
= newinst_split(ci
, op
);
980 instptr(ci
, op2
->last
)->next
= instofs(ci
, (void *)si1
);
981 si1
->alt
= op2
->first
;
982 push_operand(ci
, instptr(ci
, op2
->first
), (void *)si1
);
984 case QUEST
: case QUEST
|0x100: /* greedy and non-greedy */
985 op2
= pop_operand(ci
);
986 si1
= newinst_split(ci
, op
);
987 inst2
= newinst(ci
, NOP
);
988 si1
->opc
.next
= instofs(ci
, inst2
);
989 si1
->alt
= op2
->first
;
990 instptr(ci
, op2
->last
)->next
= instofs(ci
, inst2
);
991 push_operand(ci
, (void *)si1
, inst2
);
994 rcerror(ci
, "unknown operator in evaluntil");
1001 /******************************************************************************/
1002 /* process operator */
1003 static void operator (re9_compiler_t
*ci
, int t
) {
1004 int csi
= ci
->cursubid
;
1005 if (ci
->comment_level
) {
1007 if (t
== RBRA
|| t
== LBRA
) {
1008 ci
->nbra
+= (t
== LBRA
? 1 : -1);
1009 ci
->comment_level
+= (t
== LBRA
? 1 : -1);
1012 /* not in comment */
1013 if (t
== RBRA
&& --ci
->nbra
< 0) rcerror(ci
, "unmatched right paren");
1017 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '?') {
1018 if (ci
->expr
+1 < ci
->expr_eol
&& ci
->expr
[1] == ':') {
1020 } else if (ci
->expr
+1 < ci
->expr_eol
&& ci
->expr
[1] == '#') {
1022 ci
->comment_level
= 1;
1023 ci
->last_was_operand
= RE9_TRUE
;
1026 rcerror(ci
, "invalid paren flags");
1029 if ((csi
= ++ci
->cursubid
) >= RE9_SUBEXP_MAX
) rcerror(ci
, "too many subexpressions");
1032 if (ci
->last_was_operand
) operator(ci
, CAT
);
1033 /* process empty parens */
1034 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == ')') ci
->return_rune_nothing
= 1; /* hack: next lex() will return 'nothing' rune */
1038 if (t
!= RBRA
) push_operator(ci
, t
, csi
);
1041 case STAR
: case QUEST
: case PLUS
: case RBRA
:
1042 ci
->last_was_operand
= RE9_TRUE
; /* these look like operands */
1045 ci
->last_was_operand
= RE9_FALSE
;
1051 /* process operand: rune or character class */
1052 static void operand (re9_compiler_t
*ci
, int t
) {
1053 if (ci
->comment_level
== 0) {
1055 if (ci
->last_was_operand
) operator(ci
, CAT
); /* concatenate is implicit */
1057 case CCLASS
: case NCCLASS
:
1058 i
= newinst_class(ci
, t
);
1061 i
= newinst_rune(ci
, ci
->yyrune
);
1067 push_operand(ci
, i
, i
);
1069 ci
->last_was_operand
= RE9_TRUE
;
1073 #ifndef RE9_DISABLE_LEARNING
1074 /* trace VM code and build 'starting range' */
1075 /* return !0 if we hit 'ANY*' */
1076 static int learn_start_tracer (re9_compiler_t
*ci
, uint32_t pc
) {
1078 uint8_t opcode
= ci
->pp
->code
[pc
];
1079 const vmop_class_t
*opc
;
1082 if (opcode
< RUNE
) {
1083 /* wow, a solid hit */
1084 add_to_class_two(ci
, opcode
, opcode
);
1088 case RUNE
: /* another solid hit */
1089 add_to_class_two(ci
, ((const vmop_rune_t
*)(ci
->pp
->code
+pc
))->r
, ((const vmop_rune_t
*)(ci
->pp
->code
+pc
))->r
);
1091 case ANY
: case ANYNL
: /* shit... */
1093 add_to_class_two(ci
, 0, RE9_RUNE_MAX
);
1095 case SPLIT_G
: case SPLIT_NG
: /* choose your way to die */
1096 case STAR_G
: case STAR_NG
: /* i can do better analyzis here, but i don't care for now */
1097 if (learn_start_tracer(ci
, ((const vmop_split_t
*)(ci
->pp
->code
+pc
))->alt
)) return 1; /* no way to go */
1100 ci
->pp
->startflags
|= PRG_STARTS_WITH_BOL
;
1103 ci
->pp
->startflags
|= PRG_STARTS_WITH_EOL
;
1106 opc
= ((const vmop_class_t
*)(ci
->pp
->code
+pc
));
1107 for (f
= 0; f
< opc
->spi_count
; ++f
) add_to_class(ci
, opc
->spi
[f
]);
1110 /* oh, this is harder than previous one */
1111 /* we realy on the fact that range was sorted and there are gaps between ranges */
1112 opc
= ((const vmop_class_t
*)(ci
->pp
->code
+pc
));
1113 if (opc
->spi
[0] > 0) add_to_class_two(ci
, 0, opc
->spi
[0]-1);
1115 for (f
= 2; f
< opc
->spi_count
&& pr
<= RE9_RUNE_MAX
; f
+= 2) {
1116 add_to_class_two(ci
, pr
, opc
->spi
[f
]-1);
1117 pr
= opc
->spi
[f
+1]+1;
1119 if (pr
<= RE9_RUNE_MAX
) add_to_class_two(ci
, pr
, RE9_RUNE_MAX
);
1124 add_to_class_two(ci
, 0, RE9_RUNE_MAX
);
1126 /* ignore all other shit */
1128 pc
= instptr(ci
, pc
)->next
;
1133 static void learn_start (re9_compiler_t
*ci
) {
1136 ci
->pp
->startflags
= 0;
1137 ci
->pp
->startrange
= 0;
1138 ci
->pp
->startrune
= 0;
1139 any
= learn_start_tracer(ci
, ci
->pp
->startinst
);
1140 if (any
|| ci
->yyc_used
== 0) {
1141 ci
->pp
->startflags
= PRG_STARTS_WITH_ANY
;
1142 //fprintf(stderr, "PRG_STARTS_WITH_ANY\n");
1144 /* generate CCLASS */
1145 normalize_spans(ci
);
1146 if (ci
->yyc_used
== 2 && ci
->yyclass
[0] == ci
->yyclass
[1]) {
1147 /* simple RUNE will be enough */
1148 ci
->pp
->startflags
|= PRG_STARTS_WITH_RUNE
;
1149 ci
->pp
->startrune
= ci
->yyclass
[0];
1150 //fprintf(stderr, "PRG_STARTS_WITH_RUNE: %u\n", ci->pp->startrune);
1152 ci
->pp
->startflags
|= PRG_STARTS_WITH_CLASS
;
1153 ci
->pp
->startrange
= instofs(ci
, newinst_class(ci
, CCLASS
));
1154 //fprintf(stderr, "PRG_STARTS_WITH_CLASS\n ");
1155 #ifdef RE9_DEBUG_PRG
1156 //dump_opcode(stderr, ci->pp, ci->pp->startrange);
1164 /*TODO:??? collapse some things, get rid of nops? */
1165 static void optimize (re9_compiler_t
*ci
) {
1166 re9_prog_t
*pp
= ci
->pp
;
1169 #ifndef RE9_DISABLE_LEARNING
1172 /* get rid of NOOP chains */
1173 for (pc
= 0; pc
< pp
->code_used
; pc
+= vmop_size(pp
->code
+pc
)) {
1174 vmopcode_t
*inst
= instptr(ci
, pc
), *target
;
1175 target
= instptr(ci
, inst
->next
);
1176 while (target
->opcode
== NOP
) target
= instptr(ci
, target
->next
);
1177 if (inst
->opcode
== STAR_G
) inst
->opcode
= SPLIT_G
;
1178 else if (inst
->opcode
== STAR_NG
) inst
->opcode
= SPLIT_NG
;
1179 inst
->next
= instofs(ci
, target
);
1180 if (inst
->opcode
== SPLIT_NG
|| inst
->opcode
== SPLIT_G
) {
1181 vmop_split_t
*op
= (vmop_split_t
*)inst
;
1182 target
= instptr(ci
, op
->alt
);
1183 while (target
->opcode
== NOP
) target
= instptr(ci
, target
->next
);
1184 op
->alt
= instofs(ci
, target
);
1185 if (inst
->opcode
== SPLIT_NG
) ci
->pp
->flags
|= RE9_FLAG_NONGREEDY
;
1188 /* move startinst on non-NOOP */
1189 while (instptr(ci
, pp
->startinst
)->opcode
== NOP
) pp
->startinst
= instptr(ci
, pp
->startinst
)->next
;
1190 /* shrink code area */
1191 if (pc
> pp
->code_alloted
) {
1193 pp
->code_used
= pp
->code_alloted
= pc
;
1194 newc
= realloc(pp
->code
, pc
);
1195 if (newc
!= NULL
) pp
->code
= newc
;
1200 re9_prog_t
*re9_compile_ex (const char *s
, const char *eol
, int flags
, const char **errmsg
) {
1202 re9_compiler_t
* volatile ci
;
1205 if (errmsg
!= NULL
) *errmsg
= "can't parse NULL regexp";
1208 if (eol
== NULL
) eol
= s
+strlen(s
); else if (eol
< s
) eol
= s
;
1209 if ((ci
= malloc(sizeof(*ci
))) == NULL
) {
1210 if (errmsg
!= NULL
) *errmsg
= "out of memory";
1213 memset(ci
, 0, sizeof(*ci
));
1214 /* get memory for the program */
1215 ci
->pp
= malloc(sizeof(*ci
->pp
));
1216 if (ci
->pp
== NULL
) {
1218 if (errmsg
!= NULL
) *errmsg
= "out of memory";
1221 memset(ci
->pp
, 0, sizeof(*ci
->pp
));
1223 ci
->pp
->flags
= (flags
&RE9_FLAG_CASEINSENS
);
1226 if (errmsg
!= NULL
) *errmsg
= NULL
;
1227 if (setjmp(ci
->regkaboom
)) {
1228 if (errmsg
!= NULL
) *errmsg
= ci
->errormsg
;
1229 if (ci
->yyclass
!= NULL
) free(ci
->yyclass
);
1230 if (ci
->pp
->code
!= NULL
) free(ci
->pp
->code
);
1235 /* go compile the sucker */
1239 ci
->atorp
= ci
->atorstack
;
1240 ci
->andp
= ci
->andstack
;
1241 ci
->subidp
= ci
->subidstack
;
1242 ci
->last_was_operand
= RE9_FALSE
;
1244 /* start with a low priority operator to prime parser */
1245 push_operator(ci
, START
-1, ci
->cursubid
);
1246 /* offset 0 should be NOOP */
1247 newinst(ci
, NOP
)->next
= ci
->pp
->code_used
;
1248 while ((token
= lex(ci
)) != END
) {
1249 if ((token
&0300) == OPERATOR
) operator(ci
, token
); else operand(ci
, token
);
1251 if (ci
->comment_level
== 0) {
1252 /* close with a low priority operator */
1253 evaluntil(ci
, START
);
1256 evaluntil(ci
, START
);
1258 #ifdef RE9_DEBUG_PRG
1259 //if (ci->flags&RE9_FLAG_DUMPPRG) dumpstack(ci);
1261 if (ci
->nbra
) rcerror(ci
, "unmatched left paren");
1262 --ci
->andp
; /* points to first and only operand */
1263 ci
->pp
->startinst
= ci
->andp
->first
;
1264 #ifdef RE9_DEBUG_PRG
1265 if (ci
->flags
&RE9_FLAG_DUMPPRG
) {
1266 fprintf(stderr
, "=== unoptimized ===\n");
1267 fprintf(stderr
, "start: %u\n", ci
->pp
->startinst
);
1272 #ifdef RE9_DEBUG_PRG
1273 if (ci
->flags
&RE9_FLAG_DUMPPRG
) {
1274 fprintf(stderr
, "=== optimized ===\n");
1275 fprintf(stderr
, "start: %u\n", ci
->pp
->startinst
);
1277 fprintf(stderr
, "------\n");
1281 if (ci
->yyclass
!= NULL
) free(ci
->yyclass
);
1288 re9_prog_t
*re9_compile (const char *s
, int flags
, const char **errmsg
) {
1289 return re9_compile_ex(s
, NULL
, flags
, errmsg
);
1293 int re9_nsub (const re9_prog_t
*p
) {
1294 return (p
!= NULL
? p
->nsub
: 0);
1298 void re9_free (re9_prog_t
*p
) {
1300 if (p
->code
!= NULL
) free(p
->code
);
1306 /******************************************************************************/
1308 #define LASTINST (0)
1311 /* substitution list */
1313 re9_sub_t m
[RE9_SUBEXP_MAX
];
1318 * regexec execution lists
1320 #define LISTSIZE (16)
1321 #define BIGLISTSIZE (64*LISTSIZE)
1324 uint32_t inst
; /* instruction of the thread */
1325 re9_sublist_t se
; /* matched subexpressions in this thread */
1330 re9_listitem_t
*start
;
1331 re9_listitem_t
*free
;
1332 re9_listitem_t
*end
;
1337 re9_listitem_t
*relist
[2], *relistf
[2];
1345 /* save a new match in mp */
1346 static inline void re9l_newmatch (re9_sub_t
*mp
, int ms
, re9_sublist_t
*sp
) {
1347 if (mp
== NULL
|| ms
<= 0) return;
1348 if (mp
[0].sp
== NULL
|| sp
->m
[0].sp
< mp
[0].sp
|| (sp
->m
[0].sp
== mp
[0].sp
&& sp
->m
[0].ep
> mp
[0].ep
)) {
1350 for (i
= 0; i
< ms
&& i
< RE9_SUBEXP_MAX
; ++i
) mp
[i
] = sp
->m
[i
];
1351 for (; i
< ms
; ++i
) mp
[i
].sp
= mp
[i
].ep
= 0;
1357 * Note optimization in re9l_newthread:
1358 * *lp must be pending when re9l_newthread called; if *lp has been looked at already, the optimization is a bug
1361 * lp: list to add to
1362 * ip: instruction to add
1363 * ms: number of elements at mp
1364 * sep: pointers to subexpressions
1366 static inline re9_listitem_t
*re9l_newthread (re9_list_t
*lp
, uint32_t ip
, int ms
, re9_sublist_t
*sep
, int nocheck
) {
1369 for (p
= lp
->start
; p
->inst
!= LASTINST
; ++p
) {
1370 if (p
->inst
== ip
) {
1371 dlogf("newthread:%p; found at %u; sep->m[0].sp=%p; p->se.m[0].sp=%p\n", lp
, p
-lp
->start
, sep
->m
[0].sp
, p
->se
.m
[0].sp
);
1372 if (sep
->m
[0].sp
< p
->se
.m
[0].sp
) {
1373 if (ms
> 1) p
->se
= *sep
; else p
->se
.m
[0] = sep
->m
[0];
1380 if (p
>= lp
->end
-1) return NULL
;
1381 dlogf("newthread:%p; new at %u(%u)\n", lp
->start
, p
-lp
->start
, ip
);
1383 if (ms
> 1) p
->se
= *sep
; else p
->se
.m
[0] = sep
->m
[0];
1384 p
[1].inst
= LASTINST
;
1390 * same as renewthread, but called with initial empty start pointer
1393 * lp: list to add to
1394 * ip: instruction to add
1395 * ms: number of elements at mp
1396 * sp: pointers to subexpressions
1398 static inline re9_listitem_t
*re9l_newemptythread (re9_list_t
*lp
, uint32_t ip
, int ms
, const char *sp
, int nocheck
) {
1401 for (p
= lp
->start
; p
->inst
!= LASTINST
; ++p
) {
1402 if (p
->inst
== ip
) {
1403 dlogf("newemptythread:%p; found at %u; sp=%p; p->se.m[0].sp=%p\n", lp
, p
-lp
->start
, sp
, p
->se
.m
[0].sp
);
1404 if (sp
< p
->se
.m
[0].sp
) {
1405 if (ms
> 1) memset(&p
->se
, 0, sizeof(p
->se
));
1413 if (p
>= lp
->end
-1) return NULL
;
1414 dlogf("newemptythread:%p; new at %u(%u)\n", lp
->start
, p
-lp
->start
, ip
);
1416 if (ms
> 1) memset(&p
->se
, 0, sizeof(p
->se
));
1418 p
[1].inst
= LASTINST
;
1423 static inline int is_word_rune (re9_rune r
) {
1425 (r
>= '0' && r
<= '9') ||
1426 (r
>= 'A' && r
<= 'Z') ||
1427 (r
>= 'a' && r
<= 'z') ||
1428 (r
>= 128 && r
<= RE9_RUNE_MAX
) ||
1434 * return 0 if no match
1436 * <0 if we ran out of list space
1439 * progp: program to run
1440 * bol: string to run machine on
1441 * mp: subexpression elements
1442 * ms: number of elements at mp
1445 static int regexec1 (const re9_prog_t
*progp
, re9_sub_t
*mp
, int ms
, re9_ljunk_t
*j
) {
1447 #ifdef RE9_CALC_MAXT
1451 #ifndef RE9_DISABLE_LEARNING
1457 re9_list_t tl
, nl
; /* this list, next list */
1458 re9_listitem_t
*tlp
, *tlxn
;
1459 int match
, i
, flags
;
1461 const vmop_class_t
*opc
;
1462 int dont_collapse
= 0;
1464 flags
= (progp
->flags
&(RE9_FLAG_CASEINSENS
|RE9_FLAG_NONGREEDY
))|(j
->flags
&(RE9_FLAG_NONUTF8
|RE9_FLAG_ANYDOT
));
1465 //dont_collapse = (flags&RE9_FLAG_NONGREEDY);
1467 #ifndef RE9_DISABLE_LEARNING
1468 checkstart
= (progp
->startflags
!= PRG_STARTS_WITH_ANY
);
1471 for (i
= 0; i
< ms
; ++i
) mp
[i
].sp
= mp
[i
].ep
= NULL
;
1475 if (ms
< 0) ms
= 0; else if (ms
> RE9_SUBEXP_MAX
) ms
= RE9_SUBEXP_MAX
;
1476 j
->relist
[0][0].inst
= j
->relist
[1][0].inst
= LASTINST
;
1477 j
->relistf
[0] = j
->relist
[0];
1478 j
->relistf
[1] = j
->relist
[1];
1479 /* execute machine once for each character, including terminal NUL */
1481 rprev
= RE9_RUNE_SPEC_BOL
;
1483 /* fast check for first char */
1484 #ifndef RE9_DISABLE_LEARNING
1486 dlogf("checkstart! 0x%02x\n", progp
->startflags
);
1487 switch (progp
->startflags
) {
1489 case PRG_STARTS_WITH_BOL
:
1490 if (s
== j
->bol
) break;
1491 if (s
== j
->eol
) goto mt_quit
;
1492 p
= memchr(s
, '\n', j
->eol
-s
);
1493 if (p
== NULL
|| p
+1 > j
->eol
) goto mt_quit
;
1495 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1497 case PRG_STARTS_WITH_EOL
:
1498 if (s
== j
->eol
) goto mt_quit
;
1499 p
= memchr(s
, '\n', j
->eol
-s
);
1500 if (p
== NULL
) p
= j
->eol
;
1502 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1504 case PRG_STARTS_WITH_RUNE
:
1505 if ((flags
&RE9_FLAG_CASEINSENS
) == 0) {
1506 /* case-sensitive */
1507 if (progp
->startrune
< 128 || (flags
&RE9_FLAG_NONUTF8
)) {
1508 if (progp
->startrune
> 255) goto mt_quit
; /* can't be found */
1509 p
= memchr(s
, progp
->startrune
, j
->eol
-s
);
1511 p
= re8_rune_strchr(s
, progp
->startrune
, j
->eol
);
1514 /* case-insensitive */
1515 if (progp
->startrune
< 128 || (flags
&RE9_FLAG_NONUTF8
)) {
1516 if (progp
->startrune
> 255) goto mt_quit
; /* can't be found */
1517 for (p
= s
; p
< j
->eol
; ++p
) if (UPPER(((unsigned char)*p
)) == progp
->startrune
) break;
1520 while (p
< j
->eol
) {
1521 rune_size
= re9_char2rune(&r
, p
, j
->eol
);
1522 if (UPPER(r
) == progp
->startrune
) break;
1527 if (p
== NULL
|| p
> j
->eol
) goto mt_quit
;
1529 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1531 case PRG_STARTS_WITH_CLASS
:
1532 opc
= (const vmop_class_t
*)(progp
->code
+progp
->startrange
);
1533 while (s
< j
->eol
) {
1534 r
= *(const uint8_t *)s
;
1535 rune_size
= (r
< RE9_RUNE_SELF
|| (flags
&RE9_FLAG_NONUTF8
) ? 1 : re9_char2rune(&r
, s
, j
->eol
));
1536 if (flags
&RE9_FLAG_CASEINSENS
) r
= UPPER(r
);
1537 for (f
= 0; f
< opc
->spi_count
; f
+= 2) if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) break;
1538 if (f
< opc
->spi_count
) break;
1541 if (s
>= j
->eol
) goto mt_quit
;
1542 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1545 /* difficult cases */
1546 if ((progp
->startflags
&PRG_STARTS_WITH_BOL
) && s
== j
->bol
) goto difficult_found
;
1547 if ((progp
->startflags
&PRG_STARTS_WITH_EOL
) && s
== j
->eol
) goto difficult_found
;
1548 opc
= (const vmop_class_t
*)(progp
->code
+progp
->startrange
);
1549 while (s
< j
->eol
) {
1551 if (progp
->startflags
&PRG_STARTS_WITH_BOL
) { ++s
; break; }
1552 if (progp
->startflags
&PRG_STARTS_WITH_EOL
) break;
1554 if (progp
->startflags
&(PRG_STARTS_WITH_RUNE
|PRG_STARTS_WITH_CLASS
)) {
1555 r
= *(const uint8_t *)s
;
1556 rune_size
= (r
< RE9_RUNE_SELF
|| (flags
&RE9_FLAG_NONUTF8
) ? 1 : re9_char2rune(&r
, s
, j
->eol
));
1557 if (flags
&RE9_FLAG_CASEINSENS
) r
= UPPER(r
);
1558 if (progp
->startflags
&PRG_STARTS_WITH_RUNE
) {
1559 if (r
== progp
->startrune
) break;
1561 for (f
= 0; f
< opc
->spi_count
; f
+= 2) if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) break;
1562 if (f
< opc
->spi_count
) break;
1569 if (s
> j
->eol
) goto mt_quit
;
1571 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1577 r
= *(const uint8_t *)s
;
1578 rune_size
= (r
< RE9_RUNE_SELF
|| (flags
&RE9_FLAG_NONUTF8
) ? 1 : re9_char2rune(&r
, s
, j
->eol
));
1579 if (flags
&RE9_FLAG_CASEINSENS
) r
= UPPER(r
);
1581 r
= RE9_RUNE_SPEC_EOL
;
1584 /* switch run lists */
1585 tl
.start
= j
->relist
[flag
];
1586 tl
.end
= j
->relist
[flag
]+j
->listsize
;
1587 tl
.free
= j
->relistf
[flag
];
1588 nl
.start
= j
->relist
[flag
^=1];
1589 nl
.end
= j
->relist
[flag
]+j
->listsize
;
1591 nl
.start
->inst
= LASTINST
;
1592 dlogf("*** new loop (flag=%d; mt=%d) %p\n", flag
, match
, tl
.start
);
1593 /* add first instruction to current list */
1595 if ((tlxn
= re9l_newemptythread(&tl
, progp
->startinst
, ms
, s
, dont_collapse
)) == NULL
) return -1;
1599 dlogf(" tlxn=%u\n", tlxn
-tl
.start
);
1600 for (tlp
= tl
.start
; tlp
< tl
.free
; ++tlp
) dlogf("%d: %u\n", tlp
-tl
.start
, tlp
->inst
);
1602 /* execute machine until current list is empty */
1603 if (tl
.start
->inst
== LASTINST
) break; /* nothing to do */
1604 #ifdef RE9_CALC_MAXT
1605 if (tl
.free
-tl
.start
> maxt
) maxt
= tl
.free
-tl
.start
;
1607 for (tlp
= tl
.start
; tlp
->inst
!= LASTINST
; ++tlp
) {
1609 dlogf("* new thread (%d) at %u\n", flag
, tlp
-tl
.start
);
1610 #ifdef RE9_CALC_MAXT
1611 if (tlp
-tl
.start
> maxt
) maxt
= tlp
-tl
.start
;
1613 for (ip
= tlp
->inst
; ip
!= LASTINST
; ip
= ((const vmopcode_t
*)(progp
->code
+ip
))->next
) {
1614 const vmopcode_t
*op
= (const vmopcode_t
*)(progp
->code
+ip
);
1616 dump_opcode(stderr
, progp
, ip
);
1618 if (op
->opcode
< RUNE
) {
1619 /* ascii character */
1620 if (op
->opcode
== r
) {
1621 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1624 switch (op
->opcode
) {
1625 case RUNE
: /* regular character */
1626 if (((const vmop_rune_t
*)op
)->r
== r
) {
1627 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1631 tlp
->se
.m
[((const vmop_mark_t
*)op
)->subid
].sp
= s
;
1634 tlp
->se
.m
[((const vmop_mark_t
*)op
)->subid
].ep
= s
;
1637 if ((flags
&RE9_FLAG_ANYDOT
) == 0 && r
== '\n') break;
1640 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1643 if (s
== j
->bol
|| rprev
== RE9_RUNE_SPEC_BOL
|| rprev
== '\n') continue;
1646 if (s
== j
->eol
|| r
== '\n') continue;
1649 if (is_word_rune(rprev
) != is_word_rune(r
)) continue;
1652 if (is_word_rune(rprev
) == is_word_rune(r
)) continue;
1655 opc
= (const vmop_class_t
*)op
;
1656 for (f
= 0; f
< opc
->spi_count
; f
+= 2) {
1657 if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) {
1658 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1664 opc
= (const vmop_class_t
*)op
;
1665 for (f
= 0; f
< opc
->spi_count
; f
+= 2) if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) break;
1666 if (f
>= opc
->spi_count
) {
1667 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1670 #ifndef RE9_DISABLE_NONGREEDY
1672 if (re9l_newthread(&tl
, op
->next
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1673 if (re9l_newthread(&tl
, ((const vmop_split_t
*)op
)->alt
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1676 if (re9l_newthread(&tl
, ((const vmop_split_t
*)op
)->alt
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1677 if (flags
&RE9_FLAG_NONGREEDY
) {
1678 if (re9l_newthread(&tl
, op
->next
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1685 if (re9l_newthread(&tl
, ((const vmop_split_t
*)op
)->alt
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1689 case END
: /* match! */
1691 tlp
->se
.m
[0].ep
= s
;
1692 if (mp
!= 0) re9l_newmatch(mp
, ms
, &tlp
->se
);
1693 dlogf("HIT at %u! (%d,%d) (%s)\n", tlp
-tl
.start
, tlp
->se
.m
[0].sp
-j
->bol
, tlp
->se
.m
[0].ep
-j
->bol
, (tlp
< tlxn
? "old" : "new"));
1694 #ifndef RE9_DISABLE_NONGREEDY
1695 if (flags
&RE9_FLAG_NONGREEDY
) {
1696 /* fuck off all 'old' instructions */
1698 /* we are in 'old' area, skip it all */
1699 tlp
= tlxn
-1; /* next one will be 'new' */
1701 /* we are in 'new' area, stop right here */
1702 tlp
[1].inst
= LASTINST
;
1709 fprintf(stderr
, "lebre9: INTRNAL ERROR: invalid command: 0%03o\n", op
->opcode
);
1716 j
->relistf
[flag
] = nl
.free
; /* update 'free' pointer */
1717 dlogf("+ free=%u; sz=%d\n", nl
.free
-nl
.start
, j
->listsize
);
1718 if (s
== j
->eol
) break;
1719 #ifndef RE9_DISABLE_LEARNING
1720 checkstart
= (progp
->startflags
!= PRG_STARTS_WITH_ANY
&& nl
.start
->inst
== LASTINST
&& !match
);
1724 dlogf("+++ (%d) eol=%d\n", r
, s
== j
->eol
);
1726 #ifndef RE9_DISABLE_LEARNING
1729 #ifdef RE9_CALC_MAXT
1730 fprintf(stderr
, "MAXT=%d\n", maxt
);
1736 static int re9_exec_internal (const re9_prog_t
*progp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
, re9_listitem_t
*rl0
, re9_listitem_t
*rl1
, size_t listsz
) {
1738 /* use user-specified starting/ending location if specified */
1739 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) {
1740 j
.bol
= (mp
->sp
!= NULL
? mp
->sp
: bol
);
1741 j
.eol
= (mp
->ep
!= NULL
? mp
->ep
: j
.bol
+strlen(j
.bol
));
1743 j
.eol
= (j
.bol
= bol
)+strlen(bol
);
1746 j
.listsize
= listsz
;
1749 return regexec1(progp
, mp
, ms
, &j
);
1754 * progp: program to run
1755 * bol: string to run machine on
1756 * mp: subexpression elements
1757 * ms: number of elements at mp
1759 int re9_execute_ex (const re9_prog_t
*progp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
, size_t maxmem
) {
1760 re9_listitem_t relist
[2][LISTSIZE
];
1762 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { b
= mp
[0].sp
; e
= mp
[0].ep
; } else { b
= e
= NULL
; }
1763 int rv
= re9_exec_internal(progp
, flags
, bol
, mp
, ms
, relist
[0], relist
[1], LISTSIZE
);
1765 re9_listitem_t
*r
[2];
1766 int listsize
= BIGLISTSIZE
, rv
;
1767 if (maxmem
== 0) maxmem
= 1024*1024*32; /* up to 32MB */
1770 for (f
= 0; f
< 2; ++f
) {
1771 if ((r
[f
] = malloc(listsize
*sizeof(re9_listitem_t
))) == NULL
) {
1772 while (--f
>= 0) free(r
[f
]);
1776 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { mp
[0].sp
= b
; mp
[0].ep
= e
; }
1777 rv
= re9_exec_internal(progp
, flags
, bol
, mp
, ms
, r
[0], r
[1], listsize
);
1778 while (--f
>= 0) free(r
[f
]);
1780 if (listsize
*2*sizeof(re9_listitem_t
)*2 > maxmem
) {
1781 size_t nsz
= maxmem
/(sizeof(re9_listitem_t
)*2);
1782 if (nsz
<= listsize
) break;
1788 #ifdef RE9_CALC_MAXT
1789 if (rv
< 0) fprintf(stderr
, "OUT OF MEMORY! listsize=%d\n", listsize
/2);
1796 int re9_execute (const re9_prog_t
*progp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
) {
1797 return re9_execute_ex(progp
, flags
, bol
, mp
, ms
, 0);
1801 /******************************************************************************/
1802 struct re9_prog_prepared_s
{
1803 const re9_prog_t
*progp
;
1804 re9_listitem_t
*relist
[2];
1810 re9_prog_prepared_t
*re9_prepare_ex (const re9_prog_t
*progp
, size_t maxmem
) {
1811 re9_prog_prepared_t
*res
;
1813 if (progp
== NULL
) return NULL
;
1814 if ((res
= malloc(sizeof(*res
))) == NULL
) return NULL
;
1815 if (maxmem
== 0) maxmem
= 1024*1024*32; /* up to 32MB */
1817 res
->listsize
= BIGLISTSIZE
;
1818 res
->maxmem
= maxmem
;
1819 for (f
= 0; f
< 2; ++f
) {
1820 if ((res
->relist
[f
] = malloc(res
->listsize
*sizeof(re9_listitem_t
))) == NULL
) {
1821 while (--f
>= 0) free(res
->relist
[f
]);
1830 re9_prog_prepared_t
*re9_prepare (const re9_prog_t
*progp
) {
1831 return re9_prepare_ex(progp
, 0);
1835 int re9_prepared_execute (re9_prog_prepared_t
*pp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
) {
1839 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { b
= mp
[0].sp
; e
= mp
[0].ep
; } else { b
= e
= NULL
; }
1841 rv
= re9_exec_internal(pp
->progp
, flags
, bol
, mp
, ms
, pp
->relist
[0], pp
->relist
[1], pp
->listsize
);
1842 if (rv
>= 0 || pp
->listsize
*sizeof(re9_listitem_t
)*2 >= pp
->maxmem
) break;
1843 if (pp
->listsize
*2*sizeof(re9_listitem_t
)*2 > pp
->maxmem
) {
1844 size_t nsz
= pp
->maxmem
/(sizeof(re9_listitem_t
)*2);
1845 if (nsz
<= pp
->listsize
) break;
1850 for (f
= 0; f
< 2; ++f
) {
1851 re9_listitem_t
*r
= realloc(pp
->relist
[f
], pp
->listsize
*sizeof(re9_listitem_t
));
1852 if (r
== NULL
) break;
1855 if (f
< 2) { pp
->listsize
/= 2; rv
= -1; break; }
1856 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { mp
[0].sp
= b
; mp
[0].ep
= e
; }
1858 #ifdef RE9_CALC_MAXT
1859 if (rv
< 0) fprintf(stderr
, "OUT OF MEMORY! listsize=%u\n", pp
->listsize
);
1861 if (pp
->listsize
> 32767) {
1863 #ifdef RE9_CALC_MAXT
1864 fprintf(stderr
, "*SHRINK: %u --> 32768\n", pp
->listsize
);
1866 pp
->listsize
= 32768;
1867 for (f
= 0; f
< 2; ++f
) {
1868 re9_listitem_t
*r
= realloc(pp
->relist
[f
], pp
->listsize
*sizeof(re9_listitem_t
));
1869 if (r
== NULL
) break;
1879 void re9_prepared_free (re9_prog_prepared_t
*pp
) {
1881 if (pp
->relist
[1] != NULL
) free(pp
->relist
[1]);
1882 if (pp
->relist
[0] != NULL
) free(pp
->relist
[0]);
1888 /******************************************************************************/
1889 /* substitute into one string using the matches from the last re9_execute() */
1890 int re9_sub (char *dp
, size_t dlen
, const char *sp
, const re9_sub_t
*mp
, int ms
) {
1893 if (mp
== NULL
|| ms
<= 0) ms
= 0;
1894 if (dp
== NULL
) dlen
= 0;
1895 dlast
= (dlen
> 0 ? dp
+dlen
-1 : NULL
);
1897 if (*sp
== '\\' && (sp
[1] == '{' || (sp
[1] >= '0' && sp
[1] <= '9'))) {
1903 while (*sp
&& *sp
!= '}') {
1904 if (*sp
< '0' || *sp
> '9') break;
1909 while (*sp
&& *sp
!= '}') ++sp
;
1910 if (*sp
== '}') ++sp
;
1914 if (i
< ms
&& mp
[i
].sp
!= NULL
) {
1915 const char *ssp
= mp
[i
].sp
;
1916 int len
= (int)(mp
[i
].ep
-ssp
); /*FIXME: 2GB strings?*/
1919 if (len
> dlen
) len
= dlen
;
1920 if (len
> 0) memcpy(dp
, ssp
, len
);
1928 *dp
++ = (sp
[1] == 'z' ? 0 : sp
[1]);
1933 if (dlen
> 0) { *dp
++ = *sp
++; --dlen
; } else ++sp
;
1938 if (dlen
> 0) *dp
= '\0'; else if (dlast
!= NULL
) *dlast
= '\0';