From 0f6a097036cc6d5757e779063ad326a0cb462447 Mon Sep 17 00:00:00 2001 From: ketmar Date: Sun, 25 Aug 2013 10:11:33 +0300 Subject: [PATCH] parser now can parse strings with '\0' (but there is no API for that) --- src/libre9/re9.c | 63 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/src/libre9/re9.c b/src/libre9/re9.c index 3c2b24f..f633b7d 100644 --- a/src/libre9/re9.c +++ b/src/libre9/re9.c @@ -418,7 +418,6 @@ typedef struct { int nbra; /* number of opened parens */ const char *expr; /* pointer to next character in source expression */ const char *expr_eol; - int lexdone; /* eof reached? */ int flags; /* parser flags */ /* */ int comment_level; /* >0: in comment; counts parens */ @@ -474,22 +473,22 @@ static void add_to_class_two (re9_compiler_t *ci, re9_rune r0, re9_rune r1) { /******************************************************************************/ /* return 1 if char is quoted */ static int nextc (re9_compiler_t *ci, re9_rune *rp) { - if (ci->lexdone) { *rp = RE9_RUNE_SPEC_EOL; return 1; } - if (ci->expr[0] == 0) { ci->lexdone = 1; *rp = RE9_RUNE_SPEC_EOL; return 0; } + if (ci->expr >= ci->expr_eol) { *rp = RE9_RUNE_SPEC_EOL; return 1; } if ((ci->flags&RE9_FLAG_NONUTF8) == 0) { - if (ci->expr[0] == '\\' && ci->expr[1]) { + if (ci->expr[0] == '\\' && ci->expr+1 < ci->expr_eol) { ci->expr += re9_char2rune(rp, ci->expr+1, ci->expr_eol)+1; return 1; } else { ci->expr += re9_char2rune(rp, ci->expr, ci->expr_eol); } } else { - if (ci->expr[0] == '\\' && ci->expr[1]) { - *rp = (uint8_t)(ci->expr[1]); + if (ci->expr[0] == '\\' && ci->expr+1 < ci->expr_eol) { + *rp = (unsigned char)(ci->expr[1]); ci->expr += 2; return 1; } else { - if ((*rp = (uint8_t)(*ci->expr++)) == 0) --ci->expr; + *rp = (uint8_t)(*ci->expr++); + if (ci->expr > ci->expr_eol) ci->expr = ci->expr_eol; } } if (ci->flags&RE9_FLAG_CASEINSENS) *rp = UPPER(*rp); @@ -674,7 +673,7 @@ static int bldcclass (re9_compiler_t *ci) { if 
(!quoted && rune == ']' && (ci->yyc_used&0x01) == 0) break; if (quoted && ((rune >= '0' && rune <= '9') || (rune >= 'a' && rune <= 'z') || (rune >= 'A' && rune <= 'Z'))) { /* metacharacters */ - if ((ci->yyc_used&0x01) != 0 || (!ci->lexdone && ci->expr[0] == '-')) rcerror(ci, "malformed '[]'"); /* metacharacter can't be used in rangedef */ + if ((ci->yyc_used&0x01) != 0 || (ci->expr < ci->expr_eol && ci->expr[0] == '-')) rcerror(ci, "malformed '[]'"); /* metacharacter can't be used in rangedef */ switch (rune) { case 'd': case 'D': addmeta_digit(ci, (rune <= 'Z')); break; case 's': case 'S': addmeta_space(ci, (rune <= 'Z')); break; @@ -686,28 +685,29 @@ static int bldcclass (re9_compiler_t *ci) { break; } #ifndef RE9_DISABLE_POSIX_CLASSES - } else if ((ci->yyc_used&0x01) == 0 && !quoted && rune == '[' && ci->expr[0] == ':') { - if (strncmp(ci->expr, ":alnum:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); addmeta_digit(ci, 0); } - else if (strncmp(ci->expr, ":alpha:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); } - else if (strncmp(ci->expr, ":ascii:]", 8) == 0) { ci->expr += 8; addmeta_ascii(ci); } - else if (strncmp(ci->expr, ":blank:]", 8) == 0) { ci->expr += 8; addmeta_blank(ci); } - else if (strncmp(ci->expr, ":cntrl:]", 8) == 0) { ci->expr += 8; addmeta_ctrl(ci); } - else if (strncmp(ci->expr, ":digit:]", 8) == 0) { ci->expr += 8; addmeta_digit(ci, 0); } - else if (strncmp(ci->expr, ":graph:]", 8) == 0) { ci->expr += 8; addmeta_graph(ci); } - else if (strncmp(ci->expr, ":lower:]", 8) == 0) { ci->expr += 8; addmeta_lower(ci); } - else if (strncmp(ci->expr, ":print:]", 8) == 0) { ci->expr += 8; addmeta_print(ci); } - else if (strncmp(ci->expr, ":punct:]", 8) == 0) { ci->expr += 8; addmeta_punct(ci); } - else if (strncmp(ci->expr, ":space:]", 8) == 0) { ci->expr += 8; addmeta_space(ci, 0); } - else if (strncmp(ci->expr, ":upper:]", 8) == 0) { ci->expr += 8; addmeta_upper(ci); } - else if (strncmp(ci->expr, ":word:]", 7) == 0) { ci->expr += 7; addmeta_word(ci, 
0); } - else if (strncmp(ci->expr, ":xdigit:]", 9) == 0) { ci->expr += 9; addmeta_xdigit(ci); } + } else if ((ci->yyc_used&0x01) == 0 && !quoted && rune == '[' && ci->expr+6 < ci->expr_eol && ci->expr[0] == ':') { + /* every name carries its own length guard: the test above only guarantees 7 bytes, and a shared 8-byte guard would hide ":word:]" and ":xdigit:]" */ + if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":alnum:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); addmeta_digit(ci, 0); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":alpha:]", 8) == 0) { ci->expr += 8; addmeta_alpha(ci); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":ascii:]", 8) == 0) { ci->expr += 8; addmeta_ascii(ci); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":blank:]", 8) == 0) { ci->expr += 8; addmeta_blank(ci); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":cntrl:]", 8) == 0) { ci->expr += 8; addmeta_ctrl(ci); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":digit:]", 8) == 0) { ci->expr += 8; addmeta_digit(ci, 0); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":graph:]", 8) == 0) { ci->expr += 8; addmeta_graph(ci); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":lower:]", 8) == 0) { ci->expr += 8; addmeta_lower(ci); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":print:]", 8) == 0) { ci->expr += 8; addmeta_print(ci); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":punct:]", 8) == 0) { ci->expr += 8; addmeta_punct(ci); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":space:]", 8) == 0) { ci->expr += 8; addmeta_space(ci, 0); } + else if (ci->expr+7 < ci->expr_eol && strncmp(ci->expr, ":upper:]", 8) == 0) { ci->expr += 8; addmeta_upper(ci); } + else if (strncmp(ci->expr, ":word:]", 7) == 0) { ci->expr += 7; addmeta_word(ci, 0); } + else if (ci->expr+8 < ci->expr_eol && strncmp(ci->expr, ":xdigit:]", 9) == 0) { ci->expr += 9; addmeta_xdigit(ci); } #endif } else { if (ci->yyc_used&0x01 && rune < ci->yyclass[ci->yyc_used-1]) rcerror(ci, "invalid range in '[]'"); add_to_class(ci, rune); if (ci->yyc_used&0x01) { /* this was first char of possible range */ - if (!ci->lexdone && ci->expr[0] == '-') { + if (ci->expr < ci->expr_eol && ci->expr[0] == '-') { /* rangedef; skip '-' */ ++ci->expr; } else { @@ -756,10 +756,10 @@ static int lex (re9_compiler_t *ci) { if (ci->return_rune_nothing) {
ci->return_rune_nothing = 0; ci->yyrune = RE9_RUNE_SPEC_NOTHING; - } else if (ci->lexdone) { + } else if (ci->expr >= ci->expr_eol) { ci->yyrune = RE9_RUNE_SPEC_EOL; } else if (ci->flags&RE9_FLAG_LITERAL) { - if (ci->expr[0] == '\\') { + if (ci->expr < ci->expr_eol && ci->expr[0] == '\\') { ++ci->expr; ci->yyrune = '\\'; } else { @@ -776,7 +776,7 @@ static int lex (re9_compiler_t *ci) { case '*': quoted = STAR; goto closure; case '?': quoted = QUEST; goto closure; case '+': quoted = PLUS; /* fallthru */ -closure: if (ci->expr[0] == '?') { +closure: if (ci->expr < ci->expr_eol && ci->expr[0] == '?') { #ifndef RE9_DISABLE_NONGREEDY ++ci->expr; quoted |= 0x100; @@ -1014,10 +1014,10 @@ static void operator (re9_compiler_t *ci, int t) { if (t == LBRA) { ++ci->nbra; csi = -1; - if (!ci->lexdone && ci->expr[0] == '?') { - if (ci->expr[1] == ':') { + if (ci->expr < ci->expr_eol && ci->expr[0] == '?') { + if (ci->expr+1 < ci->expr_eol && ci->expr[1] == ':') { ci->expr += 2; - } else if (ci->expr[1] == '#') { + } else if (ci->expr+1 < ci->expr_eol && ci->expr[1] == '#') { ci->expr += 2; ci->comment_level = 1; ci->last_was_operand = RE9_TRUE; @@ -1031,7 +1031,7 @@ static void operator (re9_compiler_t *ci, int t) { } if (ci->last_was_operand) operator(ci, CAT); /* process empty parens */ - if (!ci->lexdone && ci->expr[0] == ')') ci->return_rune_nothing = 1; /* hack: next lex() will return 'nothing' rune */ + if (ci->expr < ci->expr_eol && ci->expr[0] == ')') ci->return_rune_nothing = 1; /* hack: next lex() will return 'nothing' rune */ } else { evaluntil(ci, t); } @@ -1228,7 +1228,6 @@ re9_prog_t *re9_compile (const char *s, int flags, const char **errmsg) { return NULL; } /* go compile the sucker */ - ci->lexdone = 0; ci->expr = s; ci->expr_eol = ci->expr+strlen(ci->expr); ci->nbra = 0; -- 2.11.4.GIT