2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
15 * heavily modified by Ketmar // Vampire Avalon
29 /******************************************************************************/
30 #if defined(__GNUC__) && __GNUC__ >= 3
31 # define PACKED __attribute__((packed))
32 # define NORETURN __attribute__((noreturn))
38 #define ARRAYLEN(_a) ((sizeof((_a)))/sizeof((_a)[0]))
41 # define dlogf(...) fprintf(stderr, __VA_ARGS__)
46 //#define RE9_DEBUG_PRG
47 //#define RE9_CALC_MAXT
50 /******************************************************************************/
51 typedef uint32_t re9_rune
;
55 PRG_STARTS_WITH_ANY
= 0x00,
56 PRG_STARTS_WITH_BOL
= 0x01,
57 PRG_STARTS_WITH_EOL
= 0x02,
58 PRG_STARTS_WITH_RUNE
= 0x04,
59 PRG_STARTS_WITH_CLASS
= 0x08
64 uint8_t *code
; /* vm code */
66 uint32_t code_alloted
;
67 uint32_t startinst
; /* start pc */
68 #ifndef RE9_DISABLE_LEARNING
69 uint8_t startflags
; /* PRG_START_WITH_xxx */
71 uint32_t startrange
; /* offset of (vmop_class_t *) with starting range if PRG_STARTS_WITH_CLASS set or re9_rune if PRG_STARTS_WITH_RUNE set */
75 int flags
; /* RE9_FLAG_CASEINSENS, RE9_FLAG_NONGREEDY will be set if there were some non-greedy splits */
80 /* note that CLASS opcode is variable-length */
81 typedef struct PACKED
{
82 uint32_t opcode
:8; /* 0..126: literal char; 127: RUNE; 0xc0..0xff (0300..0377): opcodes */
83 uint32_t next
:24; /* offset of next opcode */
88 typedef struct PACKED
{
95 typedef struct PACKED
{
97 uint32_t alt
; /* offset of 'alternate' opcode */
102 typedef struct PACKED
{
104 uint8_t subid
; /* bit 7 is RESERVED! 127 subgroups should be enough for everyone */
109 typedef struct PACKED
{
111 uint16_t spi_count
; /* number of items in spi */
112 re9_rune spi
[]; /* span data: spi_count*sizeof(spi[0]); sorted in ascending order */
116 /******************************************************************************/
118 * Actions and Tokens (re9_inst_t types)
120 * 02xx are operators, value == precedence
121 * 03xx are tokens, i.e. operands for operators
123 #define RUNE 0177 /* literal rune */
125 #define OPERATOR 0200 /* bitmask of all operators */
126 #define START 0200 /* start, used for marker on stack */
127 #define RBRA 0201 /* right bracket */
128 #define LBRA 0202 /* left bracket */
129 #define ALT 0203 /* alternation */
130 #define CAT 0204 /* concatentation, implicit operator */
131 #define STAR 0205 /* closure, *; bit 8 set: non-greedy */
132 #define PLUS 0206 /* a+ == aa*; bit 8 set: non-greedy */
133 #define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's; bit 8 set: non-greedy */
135 #define NOP 0300 /* no operation, internal use only (will be skipped by optimizer) */
136 #define ANY 0301 /* (and token); any character except newline */
137 #define ANYNL 0302 /* (and token); any character including newline */
138 #define BOL 0303 /* (and token); beginning of line */
139 #define EOL 0304 /* (and token); end of line */
140 #define CCLASS 0305 /* (and token); character class */
141 #define NCCLASS 0306 /* (and token); negated character class */
142 #define WBOUND 0307 /* word boundary assertion (generated by optimizer from rune) */
143 #define WNBOUND 0310 /* word non-boundary assertion (generated by optimizer from rune) */
144 #define SPLIT_G 0311 /* greedy split */
145 #define SPLIT_NG 0312 /* vmop; non-greedy split */
146 #define SMARK 0313 /* vmop; group start mark */
147 #define EMARK 0314 /* vmop; group end mark */
148 /* special splits for '*' -- should be ignored by learn_start() and replaced by optimize() */
152 #define END 0377 /* vmop; terminate: match found */
155 /******************************************************************************/
156 static inline uint32_t vmop_size (const void *op
) {
157 switch (((const vmopcode_t
*)op
)->opcode
) {
159 return sizeof(vmop_rune_t
);
160 case SPLIT_G
: case SPLIT_NG
:
161 return sizeof(vmop_split_t
);
162 case SMARK
: case EMARK
:
163 return sizeof(vmop_mark_t
);
164 case CCLASS
: case NCCLASS
:
165 return sizeof(vmop_class_t
)+((const vmop_class_t
*)op
)->spi_count
*sizeof(((const vmop_class_t
*)op
)->spi
[0]);
168 return sizeof(vmopcode_t
);
172 /******************************************************************************/
173 #if defined(RE9_DEBUG_PRG) || defined(RE9_DEBUG)
174 /* return 0 if pc is out of range or next opcode pc (NOT next pointer from vmopcode_t!) */
175 static void dump_char (FILE *fo
, re9_rune r
) {
176 if (r
>= 32 && r
< 127 && r
!= '\'' && r
!= '\\') {
177 fprintf(fo
, "%c", r
);
179 fprintf(fo
, "\\u%04x", r
);
183 static uint32_t dump_opcode (FILE *fo
, const re9_prog_t
*pp
, uint32_t pc
) {
184 const vmopcode_t
*op
;
185 const vmop_class_t
*opc
;
187 if (pc
>= pp
->code_used
) return 0;
188 fprintf(fo
, "%05u: 0%03o ", pc
, pp
->code
[pc
]);
190 if (pp
->code
[pc
] == END
) {
191 fprintf(fo
, " END\n");
192 return pc
+vmop_size((void *)(pp
->code
+pc
));
194 if (pc
+sizeof(vmopcode_t
) > pp
->code_used
) {
195 fprintf(fo
, "???: INVALID CODE!\n");
198 op
= (const vmopcode_t
*)(pp
->code
+pc
);
199 if (pc
+vmop_size(op
) > pp
->code_used
) {
200 fprintf(fo
, "???: INVALID CODE!\n");
203 fprintf(fo
, "%05u ", op
->next
);
205 switch (op
->opcode
) {
206 case NOP
: fprintf(fo
, "NOP\n"); break;
207 case ANY
: fprintf(fo
, "ANY\n"); break;
208 case ANYNL
: fprintf(fo
, "ANYNL\n"); break;
209 case BOL
: fprintf(fo
, "BOL\n"); break;
210 case EOL
: fprintf(fo
, "EOL\n"); break;
211 case WBOUND
: fprintf(fo
, "WBOUND\n"); break;
212 case WNBOUND
: fprintf(fo
, "WNBOUND\n"); break;
214 fprintf(fo
, "RUNE '");
215 dump_char(fo
, ((const vmop_rune_t
*)op
)->r
);
218 case CCLASS
: case NCCLASS
:
219 opc
= (const vmop_class_t
*)op
;
220 fprintf(fo
, "%-8s [", (op
->opcode
== CCLASS
? "CCLASS" : "NCCLASS"));
221 for (f
= 0; f
< opc
->spi_count
; f
+= 2) {
222 dump_char(fo
, opc
->spi
[f
]);
223 if (opc
->spi
[f
] != opc
->spi
[f
+1]) {
225 dump_char(fo
, opc
->spi
[f
+1]);
230 case SPLIT_G
: case SPLIT_NG
:
231 fprintf(fo
, "%-8s %05u\n", (op
->opcode
== SPLIT_G
? "SPLIT_G" : "SPLIT_NG"), ((const vmop_split_t
*)op
)->alt
);
233 case STAR_G
: case STAR_NG
:
234 fprintf(fo
, "%-8s %05u\n", (op
->opcode
== SPLIT_G
? "STAR_G" : "STAR_NG"), ((const vmop_split_t
*)op
)->alt
);
236 case SMARK
: case EMARK
:
237 fprintf(fo
, "%-8s %u\n", (op
->opcode
== SMARK
? "SMARK" : "EMARK"), ((const vmop_mark_t
*)op
)->subid
);
240 if (op
->opcode
< RUNE
) {
241 fprintf(fo
, "RUNE '");
242 dump_char(fo
, op
->opcode
);
246 fprintf(fo
, "INVALID OPCODE!\n");
249 return pc
+vmop_size(op
);
253 static void dump (const re9_prog_t
*pp
) {
255 fprintf(stderr
, "code size: %u\n", pp
->code_used
);
256 while (pc
< pp
->code_used
) if ((pc
= dump_opcode(stderr
, pp
, pc
)) == 0) break;
261 /******************************************************************************/
264 #ifdef RE9_UNICODE_CASE
267 unsigned short code
; /* code point */
268 unsigned short lower
; /* code */
269 unsigned short upper
; /* code */
272 #include "re9_unicode_mapping.c"
275 #define NUMCASEMAP (sizeof(unicode_case_mapping)/sizeof(*unicode_case_mapping))
277 static int cmp_casemap (const void *key
, const void *cm
) {
278 return *(const int *)key
-(int)((const struct casemap
*)cm
)->code
;
282 static inline int utf8_map_case (re9_rune uc
, int upper
) {
283 const struct casemap
*cm
= bsearch(&uc
, unicode_case_mapping
, NUMCASEMAP
, sizeof(*unicode_case_mapping
), cmp_casemap
);
284 return (cm
!= NULL
? (upper
? cm
->upper
: cm
->lower
) : uc
);
289 * returns the upper-case variant of the given unicode codepoint
290 * does not support unicode code points > \uffff
292 static inline re9_rune
utf8_upper (re9_rune uc
) {
293 return (uc
< 256 ? (uc
< 128 ? toupper(uc
) : uc
) : utf8_map_case(uc
, 1));
298 * returns the lower-case variant of the given unicode codepoint.
299 * does not support unicode code points > \uffff
302 static inline int utf8_lower (int uc) {
303 return (uc < 256 tolower(uc) : utf8_map_case(uc, 0));
306 #define UPPER(_r) utf8_upper(_r)
310 #define UPPER(_r) ((_r) < 128 ? toupper(_r) : (_r))
316 RE9_UTF_MAX_BYTES
= 4, /* maximum bytes per rune */
317 RE9_RUNE_SYNC
= 0x80, /* cannot represent part of a UTF sequence (<) */
318 RE9_RUNE_SELF
= 0x80, /* rune and UTF sequences are the same (<) */
319 RE9_RUNE_ERROR
= 0xFFFD, /* decoding error in UTF */
320 RE9_RUNE_MAX
= 0x10FFFF, /* maximum rune value */
322 RE9_RUNE_SPEC_EOL
= RE9_RUNE_MAX
+1,
323 RE9_RUNE_SPEC_NOTHING
= RE9_RUNE_MAX
+2,
324 RE9_RUNE_SPEC_WBOUND
= RE9_RUNE_MAX
+3,
325 RE9_RUNE_SPEC_WNBOUND
= RE9_RUNE_MAX
+4,
326 RE9_RUNE_SPEC_BOL
= RE9_RUNE_MAX
+5,
327 RE9_RUNE_SPEC_INVALID
= RE9_RUNE_MAX
+6
331 static const uint8_t utf8_clen
[128] = {
332 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x00-0x0f */
333 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x10-0x1f */
334 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x20-0x2f */
335 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x30-0x3f */
336 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x40-0x4f */
337 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x50-0x5f */
338 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x60-0x6f */
339 0x11,0x11,0x11,0x11,0x11,0x11,0x11,0x11, /* 0x70-0x7f */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x80-0x8f */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0x90-0x9f */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xa0-0xaf */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0xb0-0xbf */
344 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xc0-0xcf c0-c1: overlong encoding: start of a 2-byte sequence, but code point <= 127 */
345 0x22,0x22,0x22,0x22,0x22,0x22,0x22,0x22, /* 0xd0-0xdf */
346 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33, /* 0xe0-0xef */
347 0x44,0x44,0x44,0x44,0x00,0x00,0x00,0x00, /* 0xf0-0xff */
352 static inline int re9_char2rune (re9_rune
*rune
, const char *str
, const char *eol
) {
353 if ((intptr_t)(eol
-str
) > 0) {
355 uint8_t c
= (unsigned char)(*str
++), len
= (utf8_clen
[c
>>1]>>((c
&0x01)*4))&0x0f, res
= len
, oc
= c
;
356 if (len
== 0 || eol
-str
+1 < len
) { *rune
= RE9_RUNE_ERROR
; return 1; }
357 else if (len
== 1) { *rune
= c
; return 1; }
360 if (((c
= (unsigned char)(*str
++))&0xc0) != 0x80) { *rune
= RE9_RUNE_ERROR
; return 1; }
363 if ((oc
== 0xc0 || oc
== 0xc1) && r
!= 0) r
= RE9_RUNE_ERROR
; /* overlong encoding: only 'modified utf-8' zero rune allowed here */
364 if ((r
>= 0xd800 && r
<= 0xdfff) || // utf16/utf32 surrogates
365 (r
>= 0xfdd0 && r
<= 0xfdef) || // just for fun
366 (r
>= 0xfffe && r
<= 0xffff) || // fuck BOMs
367 r
> 0x10ffff) { r
= RE9_RUNE_ERROR
; /*res = 1;*/ } // bad unicode
371 *rune
= RE9_RUNE_SPEC_EOL
;
377 static inline const char *re8_rune_strchr (const char *s
, re9_rune c
, const char *eol
) {
378 if (s
>= eol
) return NULL
;
379 /* not part of utf sequence? */
380 if (c
< RE9_RUNE_SYNC
) return memchr(s
, c
, eol
-s
);
383 int len
= re9_char2rune(&r
, s
, eol
);
384 if (r
== c
) return s
;
391 /******************************************************************************/
400 /* parser information */
402 /*re9_inst_t*/uint32_t first
;
403 /*re9_inst_t*/uint32_t last
;
413 re9_node_t andstack
[NSTACK
];
416 int atorstack
[NSTACK
];
419 int cursubid
; /* id of current subexpression */
420 int subidstack
[NSTACK
]; /* parallel to atorstack */
423 int last_was_operand
; /* last token was operand */
424 int nbra
; /* number of opened parens */
425 const char *expr
; /* pointer to next character in source expression */
426 const char *expr_eol
;
427 int flags
; /* parser flags */
429 int comment_level
; /* >0: in comment; counts parens */
430 int return_rune_nothing
; /* return 'nothing' rune from next lex()? will be reset by lex() */
431 re9_rune yyrune
; /* last lex'd rune */
432 /* last lex'd class */
435 uint32_t yyc_alloted
;
438 const char *errormsg
;
442 /******************************************************************************/
443 static NORETURN
void rcerror (re9_compiler_t
*ci
, const char *s
) {
445 longjmp(ci
->regkaboom
, 1);
449 /******************************************************************************/
450 static void add_to_class (re9_compiler_t
*ci
, re9_rune r
) {
451 if (ci
->comment_level
== 0) {
452 if (ci
->yyc_used
+1 > ci
->yyc_alloted
) {
453 uint32_t newsz
= ((ci
->yyc_used
+1)|0x1f)+1;
454 re9_rune
*newr
= realloc(ci
->yyclass
, newsz
*sizeof(ci
->yyclass
[0]));
455 if (newr
== NULL
) rcerror(ci
, "out of memory");
457 ci
->yyc_alloted
= newsz
;
459 ci
->yyclass
[ci
->yyc_used
++] = r
;
464 static void add_to_class_two (re9_compiler_t
*ci
, re9_rune r0
, re9_rune r1
) {
465 if (ci
->comment_level
== 0) {
466 if (ci
->yyc_used
+2 > ci
->yyc_alloted
) {
467 uint32_t newsz
= ((ci
->yyc_used
+2)|0x1f)+1;
468 re9_rune
*newr
= realloc(ci
->yyclass
, newsz
*sizeof(ci
->yyclass
[0]));
469 if (newr
== NULL
) rcerror(ci
, "out of memory");
471 ci
->yyc_alloted
= newsz
;
473 ci
->yyclass
[ci
->yyc_used
++] = r0
;
474 ci
->yyclass
[ci
->yyc_used
++] = r1
;
479 /******************************************************************************/
480 /* return 1 if char is quoted */
481 static int nextc (re9_compiler_t
*ci
, re9_rune
*rp
) {
482 if (ci
->expr
>= ci
->expr_eol
) { *rp
= RE9_RUNE_SPEC_EOL
; return 1; }
483 if ((ci
->flags
&RE9_FLAG_NONUTF8
) == 0) {
484 if (ci
->expr
[0] == '\\') {
485 if (ci
->expr
+1 < ci
->expr_eol
) {
486 ci
->expr
+= re9_char2rune(rp
, ci
->expr
+1, ci
->expr_eol
)+1;
493 ci
->expr
+= re9_char2rune(rp
, ci
->expr
, ci
->expr_eol
);
496 if (ci
->expr
[0] == '\\') {
497 if (ci
->expr
+1 < ci
->expr_eol
) {
498 *rp
= (unsigned char)(ci
->expr
[1]);
506 *rp
= (unsigned char)(*ci
->expr
++);
507 if (ci
->expr
> ci
->expr_eol
) ci
->expr
= ci
->expr_eol
;
510 if (ci
->flags
&RE9_FLAG_CASEINSENS
) *rp
= UPPER(*rp
);
515 static void addmeta_space (re9_compiler_t
*ci
, int neg
) {
517 add_to_class_two(ci
, '\x9', '\xd');
518 add_to_class_two(ci
, ' ', ' ');
520 add_to_class_two(ci
, '\1', '\x9'-1);
521 add_to_class_two(ci
, '\xd'+1, ' '-1);
522 add_to_class_two(ci
, ' '+1, RE9_RUNE_MAX
);
527 static void addmeta_digit (re9_compiler_t
*ci
, int neg
) {
529 add_to_class_two(ci
, '0', '9');
531 add_to_class_two(ci
, '\1', '0'-1);
532 add_to_class_two(ci
, '9'+1, RE9_RUNE_MAX
);
537 static void addmeta_word (re9_compiler_t
*ci
, int neg
) {
539 add_to_class_two(ci
, '0', '9');
540 add_to_class_two(ci
, 'A', 'Z');
541 add_to_class_two(ci
, '_', '_');
542 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) add_to_class_two(ci
, 'a', 'z');
544 add_to_class_two(ci
, '\0', '0'-1);
545 add_to_class_two(ci
, '9'+1, 'A'-1);
546 add_to_class_two(ci
, 'Z'+1, '_'-1);
547 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) {
548 add_to_class_two(ci
, '_'+1, 'a'-1);
549 add_to_class_two(ci
, 'z'+1, RE9_RUNE_MAX
);
551 add_to_class_two(ci
, '_'+1, RE9_RUNE_MAX
);
557 #ifndef RE9_DISABLE_POSIX_CLASSES
558 static void addmeta_upper (re9_compiler_t
*ci
) {
559 add_to_class_two(ci
, 'A', 'Z');
563 static void addmeta_lower (re9_compiler_t
*ci
) {
564 if (ci
->flags
&RE9_FLAG_CASEINSENS
) {
565 add_to_class_two(ci
, 'A', 'Z');
567 add_to_class_two(ci
, 'a', 'z');
572 static void addmeta_alpha (re9_compiler_t
*ci
) {
573 add_to_class_two(ci
, 'A', 'Z');
574 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) add_to_class_two(ci
, 'a', 'z');
578 static void addmeta_xdigit (re9_compiler_t
*ci
) {
579 add_to_class_two(ci
, '0', '9');
580 add_to_class_two(ci
, 'A', 'F');
581 if ((ci
->flags
&RE9_FLAG_CASEINSENS
) == 0) add_to_class_two(ci
, 'a', 'f');
585 static void addmeta_punct (re9_compiler_t
*ci
) {
586 add_to_class_two(ci
, 33, 47); // !"#$%&'()*+,-./
587 add_to_class_two(ci
, 58, 64); // :;<=>?@
588 add_to_class_two(ci
, 91, 96); // [\]^_`
589 add_to_class_two(ci
, 123, 126); // {|}~
593 /*FIXME:??? this should include '\z'! */
594 static void addmeta_ascii (re9_compiler_t
*ci
) {
595 add_to_class_two(ci
, '\1', '\x7f');
599 static void addmeta_blank (re9_compiler_t
*ci
) {
600 add_to_class_two(ci
, '\t', '\t');
601 add_to_class_two(ci
, ' ', ' ');
605 /*FIXME:??? this should include '\z'! */
606 static void addmeta_ctrl (re9_compiler_t
*ci
) {
607 add_to_class_two(ci
, '\1', '\x1f');
608 add_to_class_two(ci
, '\x7f', '\x7f');
612 static void addmeta_graph (re9_compiler_t
*ci
) {
613 add_to_class_two(ci
, '!', '\x7e');
617 static void addmeta_print (re9_compiler_t
*ci
) {
618 add_to_class_two(ci
, ' ', '\x7e');
623 static void normalize_spans (re9_compiler_t
*ci
) {
624 if (ci
->yyc_used
> 2) {
625 /* we have at least two spans */
627 re9_rune
*ep
= ci
->yyclass
+ci
->yyc_used
;
629 /* sort on span start */
630 /* yeah, old good bubble sorting */
631 for (p
= ci
->yyclass
; p
< ep
; p
+= 2) {
632 for (np
= p
; np
< ep
; np
+= 2) {
645 for (cp
= 2; cp
< ci
->yyc_used
; ) {
647 /* if next span start is inside previous span, merge both spans */
648 //fprintf(stderr, "prev:(%u,%u); curr:(%u,%u)\n", ci->yyclass[cp-2], ci->yyclass[cp-1], ci->yyclass[cp], ci->yyclass[cp+1]);
649 if (ci
->yyclass
[cp
] <= ci
->yyclass
[cp
-1]) {
651 if (ci
->yyclass
[cp
+1] > ci
->yyclass
[cp
-1]) ci
->yyclass
[cp
-1] = ci
->yyclass
[cp
+1];
652 /* remove this span */
653 for (f
= cp
+2; f
< ci
->yyc_used
; ++f
) ci
->yyclass
[f
-2] = ci
->yyclass
[f
];
655 } else if (ci
->yyclass
[cp
-1]+1 == ci
->yyclass
[cp
]) {
656 /* we can join this spans */
657 ci
->yyclass
[cp
-1] = ci
->yyclass
[cp
+1];
658 /* remove this span */
659 for (f
= cp
+2; f
< ci
->yyc_used
; ++f
) ci
->yyclass
[f
-2] = ci
->yyclass
[f
];
670 /* parse character range */
671 static int bldcclass (re9_compiler_t
*ci
) {
675 /* we have already seen the '[' */
678 /* look ahead for negation */
679 /* SPECIAL CASE!!! negated classes don't match \n */
680 quoted
= nextc(ci
, &rune
);
681 if (!quoted
&& rune
== '^') {
683 quoted
= nextc(ci
, &rune
);
684 add_to_class_two(ci
, '\n', '\n');
686 /* parse class into a set of spans */
687 for (;; quoted
= nextc(ci
, &rune
)) {
688 if (rune
== RE9_RUNE_SPEC_EOL
) rcerror(ci
, "malformed '[]'");
689 if (!quoted
&& rune
== ']' && (ci
->yyc_used
&0x01) == 0) break;
690 if (quoted
&& ((rune
>= '0' && rune
<= '9') || (rune
>= 'a' && rune
<= 'z') || (rune
>= 'A' && rune
<= 'Z'))) {
692 if ((ci
->yyc_used
&0x01) != 0 || (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '-')) rcerror(ci
, "malformed '[]'"); /* metacharacter can't be used in rangedef */
694 case 'd': case 'D': addmeta_digit(ci
, (rune
<= 'Z')); break;
695 case 's': case 'S': addmeta_space(ci
, (rune
<= 'Z')); break;
696 case 'w': case 'W': addmeta_word(ci
, (rune
<= 'Z')); break;
697 case 'z': add_to_class_two(ci
, '\0', '\0'); break;
698 case 'Z': add_to_class_two(ci
, '\1', RE9_RUNE_MAX
); break;
700 rcerror(ci
, "invalid metacharacter in '[]'");
703 #ifndef RE9_DISABLE_POSIX_CLASSES
704 } else if ((ci
->yyc_used
&0x01) == 0 && !quoted
&& rune
== '[' && ci
->expr
+7 < ci
->expr_eol
&& ci
->expr
[0] == ':') {
705 if (strncmp(ci
->expr
, ":alnum:]", 8) == 0) { ci
->expr
+= 8; addmeta_alpha(ci
); addmeta_digit(ci
, 0); }
706 else if (strncmp(ci
->expr
, ":alpha:]", 8) == 0) { ci
->expr
+= 8; addmeta_alpha(ci
); }
707 else if (strncmp(ci
->expr
, ":ascii:]", 8) == 0) { ci
->expr
+= 8; addmeta_ascii(ci
); }
708 else if (strncmp(ci
->expr
, ":blank:]", 8) == 0) { ci
->expr
+= 8; addmeta_blank(ci
); }
709 else if (strncmp(ci
->expr
, ":cntrl:]", 8) == 0) { ci
->expr
+= 8; addmeta_ctrl(ci
); }
710 else if (strncmp(ci
->expr
, ":digit:]", 8) == 0) { ci
->expr
+= 8; addmeta_digit(ci
, 0); }
711 else if (strncmp(ci
->expr
, ":graph:]", 8) == 0) { ci
->expr
+= 8; addmeta_graph(ci
); }
712 else if (strncmp(ci
->expr
, ":lower:]", 8) == 0) { ci
->expr
+= 8; addmeta_lower(ci
); }
713 else if (strncmp(ci
->expr
, ":print:]", 8) == 0) { ci
->expr
+= 8; addmeta_print(ci
); }
714 else if (strncmp(ci
->expr
, ":punct:]", 8) == 0) { ci
->expr
+= 8; addmeta_punct(ci
); }
715 else if (strncmp(ci
->expr
, ":space:]", 8) == 0) { ci
->expr
+= 8; addmeta_space(ci
, 0); }
716 else if (strncmp(ci
->expr
, ":upper:]", 8) == 0) { ci
->expr
+= 8; addmeta_upper(ci
); }
717 else if (strncmp(ci
->expr
, ":wordc:]", 8) == 0) { ci
->expr
+= 7; addmeta_word(ci
, 0); } /*non-standard!*/
718 else if (ci
->expr
+8 < ci
->expr_eol
&& strncmp(ci
->expr
, ":xdigit:]", 9) == 0) { ci
->expr
+= 9; addmeta_xdigit(ci
); }
719 else rcerror(ci
, "invalid POSIX range");
722 if (ci
->yyc_used
&0x01 && rune
< ci
->yyclass
[ci
->yyc_used
-1]) rcerror(ci
, "invalid range in '[]'");
723 add_to_class(ci
, rune
);
724 if (ci
->yyc_used
&0x01) {
725 /* this was first char of possible range */
726 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '-') {
727 /* rangedef; skip '-' */
730 /* one char: convert it to range */
731 add_to_class(ci
, rune
);
736 /* sort and store only if we are not in comment */
737 if (ci
->comment_level
== 0) {
738 if (ci
->yyc_used
&0x01) rcerror(ci
, "THE THING THAT SHOULD NOT BE: character class parsing error");
739 if (ci
->yyc_used
== 0) rcerror(ci
, "empty '[]'");
741 /* if we have only one char in range, we can use RUNE instead */
742 if (type
== CCLASS
&& ci
->yyc_used
== 2 && ci
->yyclass
[0] == ci
->yyclass
[1]) {
743 /* yeah, one char! */
744 ci
->yyrune
= ci
->yyclass
[0];
752 /* parse metacharacter */
753 static int bldmeta (re9_compiler_t
*ci
, re9_rune rune
) {
754 int rangeop
= (rune
>= 'a' && rune
<= 'z' ? CCLASS
: NCCLASS
);
757 case 'd': case 'D': addmeta_digit(ci
, 0); break;
758 case 's': case 'S': addmeta_space(ci
, 0); break;
759 case 'w': case 'W': addmeta_word(ci
, 0); break;
760 case 'Z': add_to_class_two(ci
, '\0', '\0'); break;
761 case 'z': ci
->yyrune
= 0; return RUNE
;
762 case 'b': ci
->yyrune
= RE9_RUNE_SPEC_WBOUND
; return RUNE
;
763 case 'B': ci
->yyrune
= RE9_RUNE_SPEC_WNBOUND
; return RUNE
;
764 default: rcerror(ci
, "invalid metacharacter");
771 static int lex (re9_compiler_t
*ci
) {
772 if (ci
->return_rune_nothing
) {
773 ci
->return_rune_nothing
= 0;
774 ci
->yyrune
= RE9_RUNE_SPEC_NOTHING
;
775 } else if (ci
->expr
>= ci
->expr_eol
) {
776 ci
->yyrune
= RE9_RUNE_SPEC_EOL
;
777 } else if (ci
->flags
&RE9_FLAG_LITERAL
) {
778 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '\\') {
782 /* can't be quoted */
783 nextc(ci
, &ci
->yyrune
);
786 int quoted
= nextc(ci
, &ci
->yyrune
);
788 /* check for metacharacter */
789 if ((ci
->yyrune
>= '0' && ci
->yyrune
<= '9') || (ci
->yyrune
>= 'a' && ci
->yyrune
<= 'z') || (ci
->yyrune
>= 'A' && ci
->yyrune
<= 'Z')) return bldmeta(ci
, ci
->yyrune
);
791 switch (ci
->yyrune
) {
792 case '*': quoted
= STAR
; goto closure
;
793 case '?': quoted
= QUEST
; goto closure
;
794 case '+': quoted
= PLUS
; /* fallthru */
795 closure
: if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '?') {
796 #ifndef RE9_DISABLE_NONGREEDY
800 if ((ci
->flags
&RE9_FLAG_NONGREEDY
) == 0) rcerror(ci
, "non-greedy closure support was disabled at compile time");
804 case '|': return ALT
;
805 case '.': return (ci
->flags
&RE9_FLAG_ANYDOT
? ANYNL
: ANY
);
806 case '(': return LBRA
;
807 case ')': return RBRA
;
808 case '^': return BOL
;
809 case '$': return EOL
;
810 case '[': return bldcclass(ci
);
814 return (ci
->yyrune
!= RE9_RUNE_SPEC_EOL
? RUNE
: END
);
818 /******************************************************************************/
819 static inline uint32_t instofs (const re9_compiler_t
*ci
, const vmopcode_t
*op
) {
820 return (int32_t)(((const uint8_t *)op
)-ci
->pp
->code
);
824 static inline vmopcode_t
*instptr (re9_compiler_t
*ci
, uint32_t ofs
) {
825 return (vmopcode_t
*)(ci
->pp
->code
+ofs
);
829 /* push instructions to 'operand stack' */
830 static void push_operand (re9_compiler_t
*ci
, vmopcode_t
*f
, vmopcode_t
*l
) {
831 if (ci
->andp
>= &ci
->andstack
[NSTACK
]) rcerror(ci
, "THE THING THAT SHOULD NOT BE: operand stack overflow");
832 ci
->andp
->first
= instofs(ci
, f
);
833 ci
->andp
->last
= instofs(ci
, l
);
838 /* push token and group id to 'operator stack' */
839 static void push_operator (re9_compiler_t
*ci
, int t
, int csi
) {
840 if (ci
->atorp
>= &ci
->atorstack
[NSTACK
]) rcerror(ci
, "THE THING THAT SHOULD NOT BE: operator stack overflow");
846 /* pop instriction from 'operand stack' */
847 static re9_node_t
*pop_operand (re9_compiler_t
*ci
) {
848 if (ci
->andp
<= &ci
->andstack
[0]) rcerror(ci
, "missing operand");
853 /* pop token from 'operator stack' */
854 static int pop_operator (re9_compiler_t
*ci
) {
855 if (ci
->atorp
<= &ci
->atorstack
[0]) rcerror(ci
, "THE THING THAT SHOULD NOT BE: operator stack underflow");
861 /******************************************************************************/
862 static void *allot (re9_compiler_t
*ci
, size_t size
) {
864 if (ci
->pp
->code_used
+size
> ci
->pp
->code_alloted
) {
865 uint32_t newsz
= ((ci
->pp
->code_used
+size
)|0xfff)+1;
866 uint8_t *newc
= realloc(ci
->pp
->code
, newsz
);
867 if (newc
== NULL
) rcerror(ci
, "out of memory");
868 memset(newc
+ci
->pp
->code_used
, 0, newsz
-ci
->pp
->code_used
);
870 ci
->pp
->code_alloted
= newsz
;
872 res
= ci
->pp
->code
+ci
->pp
->code_used
;
873 ci
->pp
->code_used
+= size
;
878 /* allocate new instruction */
879 static inline vmopcode_t
*newinst (re9_compiler_t
*ci
, int t
) {
880 vmopcode_t
*op
= allot(ci
, sizeof(*op
));
886 static vmopcode_t
*newinst_class (re9_compiler_t
*ci
, int t
) {
887 vmop_class_t
*op
= allot(ci
, sizeof(*op
)+ci
->yyc_used
*sizeof(op
->spi
[0]));
889 if (ci
->yyc_used
> 0) memcpy(op
->spi
, ci
->yyclass
, ci
->yyc_used
*sizeof(op
->spi
[0]));
890 op
->spi_count
= ci
->yyc_used
;
891 if (ci
->yyc_used
&0x01) rcerror(ci
, "THE THING THAT SHOULD NOT BE: newinst_class() fucked!");
892 return (vmopcode_t
*)op
;
896 static vmopcode_t
*newinst_rune (re9_compiler_t
*ci
, re9_rune r
) {
897 if (r
< RUNE
|| r
> RE9_RUNE_MAX
) {
898 vmopcode_t
*op
= allot(ci
, sizeof(*op
));
900 case RE9_RUNE_SPEC_NOTHING
: op
->opcode
= NOP
; break;
901 case RE9_RUNE_SPEC_WBOUND
: op
->opcode
= WBOUND
; break;
902 case RE9_RUNE_SPEC_WNBOUND
: op
->opcode
= WNBOUND
; break;
908 rcerror(ci
, "THE THING THAT SHOULD NOT BE: newinst_rune() fucked!");
912 vmop_rune_t
*op
= allot(ci
, sizeof(*op
));
913 op
->opc
.opcode
= RUNE
;
915 return (vmopcode_t
*)op
;
920 static vmopcode_t
*newinst_mark (re9_compiler_t
*ci
, int t
, int id
) {
922 vmop_mark_t
*op
= allot(ci
, sizeof(*op
));
923 if (id
> 127) rcerror(ci
, "too many captures");
926 return (vmopcode_t
*)op
;
928 vmopcode_t
*op
= allot(ci
, sizeof(*op
));
935 static inline vmop_split_t
*newinst_split (re9_compiler_t
*ci
, int t
) {
936 vmop_split_t
*op
= allot(ci
, sizeof(*op
));
937 #ifndef RE9_DISABLE_NONGREEDY
938 op
->opc
.opcode
= ((ci
->flags
&RE9_FLAG_NONGREEDY
) == 0 ? (t
&0x100 ? SPLIT_NG
: SPLIT_G
) : (t
&0x100 ? SPLIT_G
: SPLIT_NG
));
939 if ((t
&0xff) == STAR
) op
->opc
.opcode
= (op
->opc
.opcode
== SPLIT_G
? STAR_G
: STAR_NG
);
941 op
->opc
.opcode
= ((t
&0xff) == STAR
? STAR_G
: SPLIT_G
);
947 static inline vmop_split_t
*newinst_split1 (re9_compiler_t
*ci
, int t
) {
948 vmop_split_t
*op
= allot(ci
, sizeof(*op
));
954 /* process instructions with higher current priority levels */
955 static void evaluntil (re9_compiler_t
*ci
, int pri
) {
956 while (pri
== RBRA
|| (ci
->atorp
[-1]&0xff) >= (pri
&0xff)) {
957 re9_node_t
*op1
, *op2
;
958 vmopcode_t
*inst1
, *inst2
;
960 int op
= pop_operator(ci
);
962 case LBRA
: /* must have been RBRA */
963 op1
= pop_operand(ci
);
964 inst2
= newinst_mark(ci
, EMARK
, *ci
->subidp
);
965 instptr(ci
, op1
->last
)->next
= instofs(ci
, inst2
);
966 inst1
= newinst_mark(ci
, SMARK
, *ci
->subidp
);
967 inst1
->next
= op1
->first
;
968 push_operand(ci
, inst1
, inst2
);
971 op2
= pop_operand(ci
);
972 op1
= pop_operand(ci
);
973 inst2
= newinst(ci
, NOP
);
974 instptr(ci
, op2
->last
)->next
= instptr(ci
, op1
->last
)->next
= instofs(ci
, inst2
);
975 si1
= newinst_split1(ci
, SPLIT_G
);
976 si1
->alt
= op1
->first
;
977 si1
->opc
.next
= op2
->first
;
978 push_operand(ci
, (void *)si1
, inst2
);
981 op2
= pop_operand(ci
);
982 op1
= pop_operand(ci
);
983 instptr(ci
, op1
->last
)->next
= op2
->first
;
984 push_operand(ci
, instptr(ci
, op1
->first
), instptr(ci
, op2
->last
));
986 case STAR
: case STAR
|0x100: /* greedy and non-greedy */
987 op2
= pop_operand(ci
);
988 si1
= newinst_split(ci
, op
);
989 instptr(ci
, op2
->last
)->next
= instofs(ci
, (void *)si1
);
990 si1
->alt
= op2
->first
;
991 push_operand(ci
, (void *)si1
, (void *)si1
);
993 case PLUS
: case PLUS
|0x100: /* greedy and non-greedy */
994 op2
= pop_operand(ci
);
995 si1
= newinst_split(ci
, op
);
996 instptr(ci
, op2
->last
)->next
= instofs(ci
, (void *)si1
);
997 si1
->alt
= op2
->first
;
998 push_operand(ci
, instptr(ci
, op2
->first
), (void *)si1
);
1000 case QUEST
: case QUEST
|0x100: /* greedy and non-greedy */
1001 op2
= pop_operand(ci
);
1002 si1
= newinst_split(ci
, op
);
1003 inst2
= newinst(ci
, NOP
);
1004 si1
->opc
.next
= instofs(ci
, inst2
);
1005 si1
->alt
= op2
->first
;
1006 instptr(ci
, op2
->last
)->next
= instofs(ci
, inst2
);
1007 push_operand(ci
, (void *)si1
, inst2
);
1010 rcerror(ci
, "unknown operator in evaluntil");
1017 /******************************************************************************/
1018 /* process operator */
1019 static void operator (re9_compiler_t
*ci
, int t
) {
1020 int csi
= ci
->cursubid
;
1021 if (ci
->comment_level
) {
1023 if (t
== RBRA
|| t
== LBRA
) {
1024 ci
->nbra
+= (t
== LBRA
? 1 : -1);
1025 ci
->comment_level
+= (t
== LBRA
? 1 : -1);
1028 /* not in comment */
1029 if (t
== RBRA
&& --ci
->nbra
< 0) rcerror(ci
, "unmatched right paren");
1033 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == '?') {
1034 if (ci
->expr
+1 < ci
->expr_eol
&& ci
->expr
[1] == ':') {
1036 } else if (ci
->expr
+1 < ci
->expr_eol
&& ci
->expr
[1] == '#') {
1038 ci
->comment_level
= 1;
1039 ci
->last_was_operand
= RE9_TRUE
;
1042 rcerror(ci
, "invalid paren flags");
1045 if ((csi
= ++ci
->cursubid
) >= RE9_SUBEXP_MAX
) rcerror(ci
, "too many subexpressions");
1048 if (ci
->last_was_operand
) operator(ci
, CAT
);
1049 /* process empty parens */
1050 if (ci
->expr
< ci
->expr_eol
&& ci
->expr
[0] == ')') ci
->return_rune_nothing
= 1; /* hack: next lex() will return 'nothing' rune */
1054 if (t
!= RBRA
) push_operator(ci
, t
, csi
);
1057 case STAR
: case QUEST
: case PLUS
: case RBRA
:
1058 ci
->last_was_operand
= RE9_TRUE
; /* these look like operands */
1061 ci
->last_was_operand
= RE9_FALSE
;
1067 /* process operand: rune or character class */
1068 static void operand (re9_compiler_t
*ci
, int t
) {
1069 if (ci
->comment_level
== 0) {
1071 if (ci
->last_was_operand
) operator(ci
, CAT
); /* concatenate is implicit */
1073 case CCLASS
: case NCCLASS
:
1074 i
= newinst_class(ci
, t
);
1077 i
= newinst_rune(ci
, ci
->yyrune
);
1083 push_operand(ci
, i
, i
);
1085 ci
->last_was_operand
= RE9_TRUE
;
1089 #ifndef RE9_DISABLE_LEARNING
1090 /* trace VM code and build 'starting range' */
1091 /* return !0 if we hit 'ANY*' */
1092 static int learn_start_tracer (re9_compiler_t
*ci
, uint32_t pc
) {
1094 uint8_t opcode
= ci
->pp
->code
[pc
];
1095 const vmop_class_t
*opc
;
1098 if (opcode
< RUNE
) {
1099 /* wow, a solid hit */
1100 add_to_class_two(ci
, opcode
, opcode
);
1104 case RUNE
: /* another solid hit */
1105 add_to_class_two(ci
, ((const vmop_rune_t
*)(ci
->pp
->code
+pc
))->r
, ((const vmop_rune_t
*)(ci
->pp
->code
+pc
))->r
);
1107 case ANY
: case ANYNL
: /* shit... */
1109 add_to_class_two(ci
, 0, RE9_RUNE_MAX
);
1111 case SPLIT_G
: case SPLIT_NG
: /* choose your way to die */
1112 case STAR_G
: case STAR_NG
: /* i can do better analyzis here, but i don't care for now */
1113 if (learn_start_tracer(ci
, ((const vmop_split_t
*)(ci
->pp
->code
+pc
))->alt
)) return 1; /* no way to go */
1116 ci
->pp
->startflags
|= PRG_STARTS_WITH_BOL
;
1119 ci
->pp
->startflags
|= PRG_STARTS_WITH_EOL
;
1122 opc
= ((const vmop_class_t
*)(ci
->pp
->code
+pc
));
1123 for (f
= 0; f
< opc
->spi_count
; ++f
) add_to_class(ci
, opc
->spi
[f
]);
1126 /* oh, this is harder than previous one */
1127 /* we realy on the fact that range was sorted and there are gaps between ranges */
1128 opc
= ((const vmop_class_t
*)(ci
->pp
->code
+pc
));
1129 if (opc
->spi
[0] > 0) add_to_class_two(ci
, 0, opc
->spi
[0]-1);
1131 for (f
= 2; f
< opc
->spi_count
&& pr
<= RE9_RUNE_MAX
; f
+= 2) {
1132 add_to_class_two(ci
, pr
, opc
->spi
[f
]-1);
1133 pr
= opc
->spi
[f
+1]+1;
1135 if (pr
<= RE9_RUNE_MAX
) add_to_class_two(ci
, pr
, RE9_RUNE_MAX
);
1140 add_to_class_two(ci
, 0, RE9_RUNE_MAX
);
1142 /* ignore all other shit */
1144 pc
= instptr(ci
, pc
)->next
;
1149 static void learn_start (re9_compiler_t
*ci
) {
1152 ci
->pp
->startflags
= 0;
1153 ci
->pp
->startrange
= 0;
1154 ci
->pp
->startrune
= 0;
1155 any
= learn_start_tracer(ci
, ci
->pp
->startinst
);
1156 if (any
|| ci
->yyc_used
== 0) {
1157 ci
->pp
->startflags
= PRG_STARTS_WITH_ANY
;
1158 //fprintf(stderr, "PRG_STARTS_WITH_ANY\n");
1160 /* generate CCLASS */
1161 normalize_spans(ci
);
1162 if (ci
->yyc_used
== 2 && ci
->yyclass
[0] == ci
->yyclass
[1]) {
1163 /* simple RUNE will be enough */
1164 ci
->pp
->startflags
|= PRG_STARTS_WITH_RUNE
;
1165 ci
->pp
->startrune
= ci
->yyclass
[0];
1166 //fprintf(stderr, "PRG_STARTS_WITH_RUNE: %u\n", ci->pp->startrune);
1168 ci
->pp
->startflags
|= PRG_STARTS_WITH_CLASS
;
1169 ci
->pp
->startrange
= instofs(ci
, newinst_class(ci
, CCLASS
));
1170 //fprintf(stderr, "PRG_STARTS_WITH_CLASS\n ");
1171 #ifdef RE9_DEBUG_PRG
1172 //dump_opcode(stderr, ci->pp, ci->pp->startrange);
1180 /*TODO:??? collapse some things, get rid of nops? */
1181 static void optimize (re9_compiler_t
*ci
) {
1182 re9_prog_t
*pp
= ci
->pp
;
1185 #ifndef RE9_DISABLE_LEARNING
1188 /* get rid of NOOP chains */
1189 for (pc
= 0; pc
< pp
->code_used
; pc
+= vmop_size(pp
->code
+pc
)) {
1190 vmopcode_t
*inst
= instptr(ci
, pc
), *target
;
1191 target
= instptr(ci
, inst
->next
);
1192 while (target
->opcode
== NOP
) target
= instptr(ci
, target
->next
);
1193 if (inst
->opcode
== STAR_G
) inst
->opcode
= SPLIT_G
;
1194 else if (inst
->opcode
== STAR_NG
) inst
->opcode
= SPLIT_NG
;
1195 inst
->next
= instofs(ci
, target
);
1196 if (inst
->opcode
== SPLIT_NG
|| inst
->opcode
== SPLIT_G
) {
1197 vmop_split_t
*op
= (vmop_split_t
*)inst
;
1198 target
= instptr(ci
, op
->alt
);
1199 while (target
->opcode
== NOP
) target
= instptr(ci
, target
->next
);
1200 op
->alt
= instofs(ci
, target
);
1201 if (inst
->opcode
== SPLIT_NG
) ci
->pp
->flags
|= RE9_FLAG_NONGREEDY
;
1204 /* move startinst on non-NOOP */
1205 while (instptr(ci
, pp
->startinst
)->opcode
== NOP
) pp
->startinst
= instptr(ci
, pp
->startinst
)->next
;
1206 /* shrink code area */
1207 if (pc
> pp
->code_alloted
) {
1209 pp
->code_used
= pp
->code_alloted
= pc
;
1210 newc
= realloc(pp
->code
, pc
);
1211 if (newc
!= NULL
) pp
->code
= newc
;
1216 re9_prog_t
*re9_compile_ex (const char *s
, const char *eol
, int flags
, const char **errmsg
) {
1218 re9_compiler_t
* volatile ci
;
1221 if (errmsg
!= NULL
) *errmsg
= "can't parse NULL regexp";
1224 if (eol
== NULL
) eol
= s
+strlen(s
); else if (eol
< s
) eol
= s
;
1225 if ((ci
= malloc(sizeof(*ci
))) == NULL
) {
1226 if (errmsg
!= NULL
) *errmsg
= "out of memory";
1229 memset(ci
, 0, sizeof(*ci
));
1230 /* get memory for the program */
1231 ci
->pp
= malloc(sizeof(*ci
->pp
));
1232 if (ci
->pp
== NULL
) {
1234 if (errmsg
!= NULL
) *errmsg
= "out of memory";
1237 memset(ci
->pp
, 0, sizeof(*ci
->pp
));
1239 ci
->pp
->flags
= (flags
&RE9_FLAG_CASEINSENS
);
1242 if (errmsg
!= NULL
) *errmsg
= NULL
;
1243 if (setjmp(ci
->regkaboom
)) {
1244 if (errmsg
!= NULL
) *errmsg
= ci
->errormsg
;
1245 if (ci
->yyclass
!= NULL
) free(ci
->yyclass
);
1246 if (ci
->pp
->code
!= NULL
) free(ci
->pp
->code
);
1251 /* go compile the sucker */
1255 ci
->atorp
= ci
->atorstack
;
1256 ci
->andp
= ci
->andstack
;
1257 ci
->subidp
= ci
->subidstack
;
1258 ci
->last_was_operand
= RE9_FALSE
;
1260 /* start with a low priority operator to prime parser */
1261 push_operator(ci
, START
-1, ci
->cursubid
);
1262 /* offset 0 should be NOOP */
1264 vmopcode_t
*op
= newinst(ci
, NOP
);
1265 op
->next
= ci
->pp
->code_used
;
1267 while ((token
= lex(ci
)) != END
) {
1268 if ((token
&0300) == OPERATOR
) operator(ci
, token
); else operand(ci
, token
);
1270 if (ci
->comment_level
== 0) {
1271 /* close with a low priority operator */
1272 evaluntil(ci
, START
);
1275 evaluntil(ci
, START
);
1277 #ifdef RE9_DEBUG_PRG
1278 //if (ci->flags&RE9_FLAG_DUMPPRG) dumpstack(ci);
1280 if (ci
->nbra
) rcerror(ci
, "unmatched left paren");
1281 --ci
->andp
; /* points to first and only operand */
1282 ci
->pp
->startinst
= ci
->andp
->first
;
1283 #ifdef RE9_DEBUG_PRG
1284 if (ci
->flags
&RE9_FLAG_DUMPPRG
) {
1285 fprintf(stderr
, "=== unoptimized ===\n");
1286 fprintf(stderr
, "start: %u\n", ci
->pp
->startinst
);
1291 #ifdef RE9_DEBUG_PRG
1292 if (ci
->flags
&RE9_FLAG_DUMPPRG
) {
1293 fprintf(stderr
, "=== optimized ===\n");
1294 fprintf(stderr
, "start: %u\n", ci
->pp
->startinst
);
1296 fprintf(stderr
, "------\n");
1300 if (ci
->yyclass
!= NULL
) free(ci
->yyclass
);
1307 re9_prog_t
*re9_compile (const char *s
, int flags
, const char **errmsg
) {
1308 return re9_compile_ex(s
, NULL
, flags
, errmsg
);
1312 int re9_nsub (const re9_prog_t
*p
) {
1313 return (p
!= NULL
? p
->nsub
: 0);
1317 void re9_free (re9_prog_t
*p
) {
1319 if (p
->code
!= NULL
) free(p
->code
);
1325 /******************************************************************************/
1327 #define LASTINST (0)
1330 /* substitution list */
1332 re9_sub_t m
[RE9_SUBEXP_MAX
];
1337 * regexec execution lists
1339 #define LISTSIZE (16)
1340 #define BIGLISTSIZE (64*LISTSIZE)
1343 uint32_t inst
; /* instruction of the thread */
1344 re9_sublist_t se
; /* matched subexpressions in this thread */
1349 re9_listitem_t
*start
;
1350 re9_listitem_t
*free
;
1351 re9_listitem_t
*end
;
1356 re9_listitem_t
*relist
[2], *relistf
[2];
1364 /* save a new match in mp */
1365 static inline void re9l_newmatch (re9_sub_t
*mp
, int ms
, re9_sublist_t
*sp
) {
1366 if (mp
== NULL
|| ms
<= 0) return;
1367 if (mp
[0].sp
== NULL
|| sp
->m
[0].sp
< mp
[0].sp
|| (sp
->m
[0].sp
== mp
[0].sp
&& sp
->m
[0].ep
> mp
[0].ep
)) {
1369 for (i
= 0; i
< ms
&& i
< RE9_SUBEXP_MAX
; ++i
) mp
[i
] = sp
->m
[i
];
1370 for (; i
< ms
; ++i
) mp
[i
].sp
= mp
[i
].ep
= 0;
1376 * Note optimization in re9l_newthread:
1377 * *lp must be pending when re9l_newthread called; if *lp has been looked at already, the optimization is a bug
1380 * lp: list to add to
1381 * ip: instruction to add
1382 * ms: number of elements at mp
1383 * sep: pointers to subexpressions
1385 static inline re9_listitem_t
*re9l_newthread (re9_list_t
*lp
, uint32_t ip
, int ms
, re9_sublist_t
*sep
, int nocheck
) {
1388 for (p
= lp
->start
; p
->inst
!= LASTINST
; ++p
) {
1389 if (p
->inst
== ip
) {
1390 dlogf("newthread:%p; found at %u; sep->m[0].sp=%p; p->se.m[0].sp=%p\n", lp
, p
-lp
->start
, sep
->m
[0].sp
, p
->se
.m
[0].sp
);
1391 if (sep
->m
[0].sp
< p
->se
.m
[0].sp
) {
1392 if (ms
> 1) p
->se
= *sep
; else p
->se
.m
[0] = sep
->m
[0];
1399 if (p
>= lp
->end
-1) return NULL
;
1400 dlogf("newthread:%p; new at %u(%u)\n", lp
->start
, p
-lp
->start
, ip
);
1402 if (ms
> 1) p
->se
= *sep
; else p
->se
.m
[0] = sep
->m
[0];
1403 p
[1].inst
= LASTINST
;
1409 * same as renewthread, but called with initial empty start pointer
1412 * lp: list to add to
1413 * ip: instruction to add
1414 * ms: number of elements at mp
1415 * sp: pointers to subexpressions
1417 static inline re9_listitem_t
*re9l_newemptythread (re9_list_t
*lp
, uint32_t ip
, int ms
, const char *sp
, int nocheck
) {
1420 for (p
= lp
->start
; p
->inst
!= LASTINST
; ++p
) {
1421 if (p
->inst
== ip
) {
1422 dlogf("newemptythread:%p; found at %u; sp=%p; p->se.m[0].sp=%p\n", lp
, p
-lp
->start
, sp
, p
->se
.m
[0].sp
);
1423 if (sp
< p
->se
.m
[0].sp
) {
1424 if (ms
> 1) memset(&p
->se
, 0, sizeof(p
->se
));
1432 if (p
>= lp
->end
-1) return NULL
;
1433 dlogf("newemptythread:%p; new at %u(%u)\n", lp
->start
, p
-lp
->start
, ip
);
1435 if (ms
> 1) memset(&p
->se
, 0, sizeof(p
->se
));
1437 p
[1].inst
= LASTINST
;
1442 static inline int is_word_rune (re9_rune r
) {
1444 (r
>= '0' && r
<= '9') ||
1445 (r
>= 'A' && r
<= 'Z') ||
1446 (r
>= 'a' && r
<= 'z') ||
1447 (r
>= 128 && r
<= RE9_RUNE_MAX
) ||
1453 * return 0 if no match
1455 * <0 if we ran out of list space
1458 * progp: program to run
1459 * bol: string to run machine on
1460 * mp: subexpression elements
1461 * ms: number of elements at mp
1464 static int regexec1 (const re9_prog_t
*progp
, re9_sub_t
*mp
, int ms
, re9_ljunk_t
*j
) {
1466 #ifdef RE9_CALC_MAXT
1470 #ifndef RE9_DISABLE_LEARNING
1476 re9_list_t tl
, nl
; /* this list, next list */
1477 re9_listitem_t
*tlp
, *tlxn
;
1478 int match
, i
, flags
;
1480 const vmop_class_t
*opc
;
1481 int dont_collapse
= 0;
1483 flags
= (progp
->flags
&(RE9_FLAG_CASEINSENS
|RE9_FLAG_NONGREEDY
))|(j
->flags
&(RE9_FLAG_NONUTF8
|RE9_FLAG_ANYDOT
));
1484 //dont_collapse = (flags&RE9_FLAG_NONGREEDY);
1486 #ifndef RE9_DISABLE_LEARNING
1487 checkstart
= (progp
->startflags
!= PRG_STARTS_WITH_ANY
);
1490 for (i
= 0; i
< ms
; ++i
) mp
[i
].sp
= mp
[i
].ep
= NULL
;
1494 if (ms
< 0) ms
= 0; else if (ms
> RE9_SUBEXP_MAX
) ms
= RE9_SUBEXP_MAX
;
1495 j
->relist
[0][0].inst
= j
->relist
[1][0].inst
= LASTINST
;
1496 j
->relistf
[0] = j
->relist
[0];
1497 j
->relistf
[1] = j
->relist
[1];
1498 /* execute machine once for each character, including terminal NUL */
1500 rprev
= RE9_RUNE_SPEC_BOL
;
1502 /* fast check for first char */
1503 #ifndef RE9_DISABLE_LEARNING
1505 dlogf("checkstart! 0x%02x\n", progp
->startflags
);
1506 switch (progp
->startflags
) {
1508 case PRG_STARTS_WITH_BOL
:
1509 if (s
== j
->bol
) break;
1510 if (s
== j
->eol
) goto mt_quit
;
1511 p
= memchr(s
, '\n', j
->eol
-s
);
1512 if (p
== NULL
|| p
+1 > j
->eol
) goto mt_quit
;
1514 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1516 case PRG_STARTS_WITH_EOL
:
1517 if (s
== j
->eol
) goto mt_quit
;
1518 p
= memchr(s
, '\n', j
->eol
-s
);
1519 if (p
== NULL
) p
= j
->eol
;
1521 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1523 case PRG_STARTS_WITH_RUNE
:
1524 if ((flags
&RE9_FLAG_CASEINSENS
) == 0) {
1525 /* case-sensitive */
1526 if (progp
->startrune
< 128 || (flags
&RE9_FLAG_NONUTF8
)) {
1527 if (progp
->startrune
> 255) goto mt_quit
; /* can't be found */
1528 p
= memchr(s
, progp
->startrune
, j
->eol
-s
);
1530 p
= re8_rune_strchr(s
, progp
->startrune
, j
->eol
);
1533 /* case-insensitive */
1534 if (progp
->startrune
< 128 || (flags
&RE9_FLAG_NONUTF8
)) {
1535 if (progp
->startrune
> 255) goto mt_quit
; /* can't be found */
1536 for (p
= s
; p
< j
->eol
; ++p
) if (UPPER(((unsigned char)*p
)) == progp
->startrune
) break;
1539 while (p
< j
->eol
) {
1540 rune_size
= re9_char2rune(&r
, p
, j
->eol
);
1541 if (UPPER(r
) == progp
->startrune
) break;
1546 if (p
== NULL
|| p
> j
->eol
) goto mt_quit
;
1548 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1550 case PRG_STARTS_WITH_CLASS
:
1551 opc
= (const vmop_class_t
*)(progp
->code
+progp
->startrange
);
1552 while (s
< j
->eol
) {
1553 r
= *(const unsigned char *)s
;
1554 rune_size
= (r
< RE9_RUNE_SELF
|| (flags
&RE9_FLAG_NONUTF8
) ? 1 : re9_char2rune(&r
, s
, j
->eol
));
1555 if (flags
&RE9_FLAG_CASEINSENS
) r
= UPPER(r
);
1556 for (f
= 0; f
< opc
->spi_count
; f
+= 2) if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) break;
1557 if (f
< opc
->spi_count
) break;
1560 if (s
>= j
->eol
) goto mt_quit
;
1561 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1564 /* difficult cases */
1565 if ((progp
->startflags
&PRG_STARTS_WITH_BOL
) && s
== j
->bol
) goto difficult_found
;
1566 if ((progp
->startflags
&PRG_STARTS_WITH_EOL
) && s
== j
->eol
) goto difficult_found
;
1567 opc
= (const vmop_class_t
*)(progp
->code
+progp
->startrange
);
1568 while (s
< j
->eol
) {
1570 if (progp
->startflags
&PRG_STARTS_WITH_BOL
) { ++s
; break; }
1571 if (progp
->startflags
&PRG_STARTS_WITH_EOL
) break;
1573 if (progp
->startflags
&(PRG_STARTS_WITH_RUNE
|PRG_STARTS_WITH_CLASS
)) {
1574 r
= *(const unsigned char *)s
;
1575 rune_size
= (r
< RE9_RUNE_SELF
|| (flags
&RE9_FLAG_NONUTF8
) ? 1 : re9_char2rune(&r
, s
, j
->eol
));
1576 if (flags
&RE9_FLAG_CASEINSENS
) r
= UPPER(r
);
1577 if (progp
->startflags
&PRG_STARTS_WITH_RUNE
) {
1578 if (r
== progp
->startrune
) break;
1580 for (f
= 0; f
< opc
->spi_count
; f
+= 2) if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) break;
1581 if (f
< opc
->spi_count
) break;
1588 if (s
> j
->eol
) goto mt_quit
;
1590 rprev
= (s
== j
->bol
? RE9_RUNE_SPEC_BOL
: s
[-1]); /*FIXME: previous rune can be UTF-8 one*/
1596 r
= *(const unsigned char *)s
;
1597 rune_size
= (r
< RE9_RUNE_SELF
|| (flags
&RE9_FLAG_NONUTF8
) ? 1 : re9_char2rune(&r
, s
, j
->eol
));
1598 if (flags
&RE9_FLAG_CASEINSENS
) r
= UPPER(r
);
1600 r
= RE9_RUNE_SPEC_EOL
;
1603 /* switch run lists */
1604 tl
.start
= j
->relist
[flag
];
1605 tl
.end
= j
->relist
[flag
]+j
->listsize
;
1606 tl
.free
= j
->relistf
[flag
];
1607 nl
.start
= j
->relist
[flag
^=1];
1608 nl
.end
= j
->relist
[flag
]+j
->listsize
;
1610 nl
.start
->inst
= LASTINST
;
1611 dlogf("*** new loop (flag=%d; mt=%d) %p\n", flag
, match
, tl
.start
);
1612 /* add first instruction to current list */
1614 if ((tlxn
= re9l_newemptythread(&tl
, progp
->startinst
, ms
, s
, dont_collapse
)) == NULL
) return -1;
1618 dlogf(" tlxn=%u\n", tlxn
-tl
.start
);
1619 for (tlp
= tl
.start
; tlp
< tl
.free
; ++tlp
) dlogf("%d: %u\n", tlp
-tl
.start
, tlp
->inst
);
1621 /* execute machine until current list is empty */
1622 if (tl
.start
->inst
== LASTINST
) break; /* nothing to do */
1623 #ifdef RE9_CALC_MAXT
1624 if (tl
.free
-tl
.start
> maxt
) maxt
= tl
.free
-tl
.start
;
1626 for (tlp
= tl
.start
; tlp
->inst
!= LASTINST
; ++tlp
) {
1628 dlogf("* new thread (%d) at %u\n", flag
, tlp
-tl
.start
);
1629 #ifdef RE9_CALC_MAXT
1630 if (tlp
-tl
.start
> maxt
) maxt
= tlp
-tl
.start
;
1632 for (ip
= tlp
->inst
; ip
!= LASTINST
; ip
= ((const vmopcode_t
*)(progp
->code
+ip
))->next
) {
1633 const vmopcode_t
*op
= (const vmopcode_t
*)(progp
->code
+ip
);
1635 dump_opcode(stderr
, progp
, ip
);
1637 if (op
->opcode
< RUNE
) {
1638 /* ascii character */
1639 if (op
->opcode
== r
) {
1640 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1643 switch (op
->opcode
) {
1644 case RUNE
: /* regular character */
1645 if (((const vmop_rune_t
*)op
)->r
== r
) {
1646 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1650 tlp
->se
.m
[((const vmop_mark_t
*)op
)->subid
].sp
= s
;
1653 tlp
->se
.m
[((const vmop_mark_t
*)op
)->subid
].ep
= s
;
1656 if ((flags
&RE9_FLAG_ANYDOT
) == 0 && r
== '\n') break;
1659 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1662 if (s
== j
->bol
|| rprev
== RE9_RUNE_SPEC_BOL
|| rprev
== '\n') continue;
1665 if (s
== j
->eol
|| r
== '\n') continue;
1668 if (is_word_rune(rprev
) != is_word_rune(r
)) continue;
1671 if (is_word_rune(rprev
) == is_word_rune(r
)) continue;
1674 opc
= (const vmop_class_t
*)op
;
1675 for (f
= 0; f
< opc
->spi_count
; f
+= 2) {
1676 if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) {
1677 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1683 opc
= (const vmop_class_t
*)op
;
1684 for (f
= 0; f
< opc
->spi_count
; f
+= 2) if (r
>= opc
->spi
[f
] && r
<= opc
->spi
[f
+1]) break;
1685 if (f
>= opc
->spi_count
) {
1686 if (re9l_newthread(&nl
, op
->next
, ms
, &tlp
->se
, dont_collapse
) == NULL
) return -1;
1689 #ifndef RE9_DISABLE_NONGREEDY
1691 if (re9l_newthread(&tl
, op
->next
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1692 if (re9l_newthread(&tl
, ((const vmop_split_t
*)op
)->alt
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1695 if (re9l_newthread(&tl
, ((const vmop_split_t
*)op
)->alt
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1696 if (flags
&RE9_FLAG_NONGREEDY
) {
1697 if (re9l_newthread(&tl
, op
->next
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1704 if (re9l_newthread(&tl
, ((const vmop_split_t
*)op
)->alt
, ms
, &tlp
->se
, 1) == NULL
) return -1;
1708 case END
: /* match! */
1710 tlp
->se
.m
[0].ep
= s
;
1711 if (mp
!= 0) re9l_newmatch(mp
, ms
, &tlp
->se
);
1712 dlogf("HIT at %u! (%d,%d) (%s)\n", tlp
-tl
.start
, tlp
->se
.m
[0].sp
-j
->bol
, tlp
->se
.m
[0].ep
-j
->bol
, (tlp
< tlxn
? "old" : "new"));
1713 #ifndef RE9_DISABLE_NONGREEDY
1714 if (flags
&RE9_FLAG_NONGREEDY
) {
1715 /* fuck off all 'old' instructions */
1717 /* we are in 'old' area, skip it all */
1718 tlp
= tlxn
-1; /* next one will be 'new' */
1720 /* we are in 'new' area, stop right here */
1721 tlp
[1].inst
= LASTINST
;
1728 fprintf(stderr
, "lebre9: INTRNAL ERROR: invalid command: 0%03o\n", op
->opcode
);
1735 j
->relistf
[flag
] = nl
.free
; /* update 'free' pointer */
1736 dlogf("+ free=%u; sz=%d\n", nl
.free
-nl
.start
, j
->listsize
);
1737 if (s
== j
->eol
) break;
1738 #ifndef RE9_DISABLE_LEARNING
1739 checkstart
= (progp
->startflags
!= PRG_STARTS_WITH_ANY
&& nl
.start
->inst
== LASTINST
&& !match
);
1743 dlogf("+++ (%d) eol=%d\n", r
, s
== j
->eol
);
1745 #ifndef RE9_DISABLE_LEARNING
1748 #ifdef RE9_CALC_MAXT
1749 fprintf(stderr
, "MAXT=%d\n", maxt
);
1755 static int re9_exec_internal (const re9_prog_t
*progp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
, re9_listitem_t
*rl0
, re9_listitem_t
*rl1
, size_t listsz
) {
1757 /* use user-specified starting/ending location if specified */
1758 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) {
1759 j
.bol
= (mp
->sp
!= NULL
? mp
->sp
: bol
);
1760 j
.eol
= (mp
->ep
!= NULL
? mp
->ep
: j
.bol
+strlen(j
.bol
));
1762 j
.eol
= (j
.bol
= bol
)+strlen(bol
);
1765 j
.listsize
= listsz
;
1768 return regexec1(progp
, mp
, ms
, &j
);
1772 #ifdef REGEXP9_DEBUG_MEMSIZE
1778 * progp: program to run
1779 * bol: string to run machine on
1780 * mp: subexpression elements
1781 * ms: number of elements at mp
1783 int re9_execute_ex (const re9_prog_t
*progp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
, size_t maxmem
) {
1784 re9_listitem_t relist
[2][LISTSIZE
];
1786 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { b
= mp
[0].sp
; e
= mp
[0].ep
; } else { b
= e
= NULL
; }
1787 #ifdef REGEXP9_DEBUG_MEMSIZE
1788 re9_memused
= LISTSIZE
;
1790 int rv
= re9_exec_internal(progp
, flags
, bol
, mp
, ms
, relist
[0], relist
[1], LISTSIZE
);
1792 re9_listitem_t
*r
[2];
1793 int listsize
= BIGLISTSIZE
, rv
;
1794 if (maxmem
== 0) maxmem
= 1024*1024*32; /* up to 32MB */
1797 for (f
= 0; f
< 2; ++f
) {
1798 if ((r
[f
] = malloc(listsize
*sizeof(re9_listitem_t
))) == NULL
) {
1799 while (--f
>= 0) free(r
[f
]);
1803 #ifdef REGEXP9_DEBUG_MEMSIZE
1804 re9_memused
= listsize
;
1806 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { mp
[0].sp
= b
; mp
[0].ep
= e
; }
1807 rv
= re9_exec_internal(progp
, flags
, bol
, mp
, ms
, r
[0], r
[1], listsize
);
1808 while (--f
>= 0) free(r
[f
]);
1810 if (listsize
*2*sizeof(re9_listitem_t
)*2 > maxmem
) {
1811 size_t nsz
= maxmem
/(sizeof(re9_listitem_t
)*2);
1812 if (nsz
<= listsize
) break;
1818 #ifdef RE9_CALC_MAXT
1819 if (rv
< 0) fprintf(stderr
, "OUT OF MEMORY! listsize=%d\n", listsize
/2);
1826 int re9_execute (const re9_prog_t
*progp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
) {
1827 return re9_execute_ex(progp
, flags
, bol
, mp
, ms
, 0);
1831 /******************************************************************************/
1832 struct re9_prog_prepared_s
{
1833 const re9_prog_t
*progp
;
1834 re9_listitem_t
*relist
[2];
1840 re9_prog_prepared_t
*re9_prepare_ex (const re9_prog_t
*progp
, size_t maxmem
) {
1841 re9_prog_prepared_t
*res
;
1843 if (progp
== NULL
) return NULL
;
1844 if ((res
= malloc(sizeof(*res
))) == NULL
) return NULL
;
1845 if (maxmem
== 0) maxmem
= 1024*1024*32; /* up to 32MB */
1847 res
->listsize
= BIGLISTSIZE
;
1848 res
->maxmem
= maxmem
;
1849 for (f
= 0; f
< 2; ++f
) {
1850 if ((res
->relist
[f
] = malloc(res
->listsize
*sizeof(re9_listitem_t
))) == NULL
) {
1851 while (--f
>= 0) free(res
->relist
[f
]);
1860 re9_prog_prepared_t
*re9_prepare (const re9_prog_t
*progp
) {
1861 return re9_prepare_ex(progp
, 0);
1865 int re9_prepared_execute (re9_prog_prepared_t
*pp
, int flags
, const char *bol
, re9_sub_t
*mp
, int ms
) {
1869 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { b
= mp
[0].sp
; e
= mp
[0].ep
; } else { b
= e
= NULL
; }
1871 rv
= re9_exec_internal(pp
->progp
, flags
, bol
, mp
, ms
, pp
->relist
[0], pp
->relist
[1], pp
->listsize
);
1872 if (rv
>= 0 || pp
->listsize
*sizeof(re9_listitem_t
)*2 >= pp
->maxmem
) break;
1873 if (pp
->listsize
*2*sizeof(re9_listitem_t
)*2 > pp
->maxmem
) {
1874 size_t nsz
= pp
->maxmem
/(sizeof(re9_listitem_t
)*2);
1875 if (nsz
<= pp
->listsize
) break;
1880 for (f
= 0; f
< 2; ++f
) {
1881 re9_listitem_t
*r
= realloc(pp
->relist
[f
], pp
->listsize
*sizeof(re9_listitem_t
));
1882 if (r
== NULL
) break;
1885 if (f
< 2) { pp
->listsize
/= 2; rv
= -1; break; }
1886 if ((flags
&RE9_FLAG_MT0_RANGE
) && mp
!= NULL
) { mp
[0].sp
= b
; mp
[0].ep
= e
; }
1888 #ifdef RE9_CALC_MAXT
1889 if (rv
< 0) fprintf(stderr
, "OUT OF MEMORY! listsize=%u\n", pp
->listsize
);
1891 if (pp
->listsize
> 32767) {
1893 #ifdef RE9_CALC_MAXT
1894 fprintf(stderr
, "*SHRINK: %u --> 32768\n", pp
->listsize
);
1896 pp
->listsize
= 32768;
1897 for (f
= 0; f
< 2; ++f
) {
1898 re9_listitem_t
*r
= realloc(pp
->relist
[f
], pp
->listsize
*sizeof(re9_listitem_t
));
1899 if (r
== NULL
) break;
1909 void re9_prepared_free (re9_prog_prepared_t
*pp
) {
1911 if (pp
->relist
[1] != NULL
) free(pp
->relist
[1]);
1912 if (pp
->relist
[0] != NULL
) free(pp
->relist
[0]);
1918 /******************************************************************************/
1919 /* substitute into one string using the matches from the last re9_execute() */
1920 int re9_sub (char *dp
, size_t dlen
, const char *sp
, const re9_sub_t
*mp
, int ms
) {
1923 if (mp
== NULL
|| ms
<= 0) ms
= 0;
1924 if (dp
== NULL
) dlen
= 0;
1925 dlast
= (dlen
> 0 ? dp
+dlen
-1 : NULL
);
1927 if (*sp
== '\\' && (sp
[1] == '{' || (sp
[1] >= '0' && sp
[1] <= '9'))) {
1933 while (*sp
&& *sp
!= '}') {
1934 if (*sp
< '0' || *sp
> '9') break;
1939 while (*sp
&& *sp
!= '}') ++sp
;
1940 if (*sp
== '}') ++sp
;
1944 if (i
< ms
&& mp
[i
].sp
!= NULL
) {
1945 const char *ssp
= mp
[i
].sp
;
1946 int len
= (long)(mp
[i
].ep
-ssp
); /*FIXME: 2GB strings?*/
1949 if (len
> dlen
) len
= dlen
;
1950 if (len
> 0) memcpy(dp
, ssp
, len
);
1958 *dp
++ = (sp
[1] == 'z' ? 0 : sp
[1]);
1963 if (dlen
> 0) { *dp
++ = *sp
++; --dlen
; } else ++sp
;
1968 if (dlen
> 0) *dp
= '\0'; else if (dlast
!= NULL
) *dlast
= '\0';
1973 /******************************************************************************/
1974 static inline int fromhex (char ch
) {
1976 (ch
>= '0' && ch
<= '9' ? ch
-'0' :
1977 ch
>= 'A' && ch
<= 'F' ? ch
-'A'+10 :
1978 ch
>= 'a' && ch
<= 'f' ? ch
-'a'+10 :
1983 /* substitute into one string using the matches from the last re9_execute() */
1984 int re9_subst (char *dp
, size_t dlen
, const char *sp
, const re9_sub_t
*mp
, int ms
) {
1987 if (mp
== NULL
|| ms
<= 0) ms
= 0;
1988 if (dp
== NULL
) dlen
= 0;
1989 dlast
= (dlen
> 0 ? dp
+dlen
-1 : NULL
);
1991 if (*sp
== '$' && (sp
[1] == '{' || (sp
[1] >= '0' && sp
[1] <= '9'))) {
1997 while (*sp
&& *sp
!= '}') {
1998 if (*sp
< '0' || *sp
> '9') break;
2003 while (*sp
&& *sp
!= '}') ++sp
;
2004 if (*sp
== '}') ++sp
;
2008 if (i
< ms
&& mp
[i
].sp
!= NULL
) {
2009 const char *ssp
= mp
[i
].sp
;
2010 int len
= (long)(mp
[i
].ep
-ssp
); /*FIXME: 2GB strings?*/
2013 if (len
> dlen
) len
= dlen
;
2014 if (len
> 0) memcpy(dp
, ssp
, len
);
2023 case 0: --sp
; ch
= '\\'; break;
2024 case 'e': ch
= 27; break;
2025 case 'n': ch
= '\n'; break;
2026 case 'r': ch
= '\r'; break;
2027 case 't': ch
= '\t'; break;
2028 case 'z': ch
= 0; break;
2029 case 'x': /* hex code */
2035 if (n
>= 0) { ch
= ch
*16+n
; ++sp
; }
2040 default: ch
= sp
[-1]; break;
2043 if (dlen
> 0) { *dp
++ = ch
; --dlen
; }
2047 if (dlen
> 0) *dp
= '\0'; else if (dlast
!= NULL
) *dlast
= '\0';