1 /* see copyright notice in squirrel.h */
4 #include "sqstdstring.h"
7 #define scisprint iswprint
9 #define scisprint isprint
14 static const SQChar
*g_nnames
[] =
16 "NONE","OP_GREEDY", "OP_OR",
17 "OP_EXPR","OP_NOCAPEXPR","OP_DOT", "OP_CLASS",
18 "OP_CCLASS","OP_NCLASS","OP_RANGE","OP_CHAR",
19 "OP_EOL","OP_BOL","OP_WB"
24 #define OP_GREEDY (MAX_CHAR+1) // * + ? {n}
25 #define OP_OR (MAX_CHAR+2)
26 #define OP_EXPR (MAX_CHAR+3) //parentesis ()
27 #define OP_NOCAPEXPR (MAX_CHAR+4) //parentesis (?:)
28 #define OP_DOT (MAX_CHAR+5)
29 #define OP_CLASS (MAX_CHAR+6)
30 #define OP_CCLASS (MAX_CHAR+7)
31 #define OP_NCLASS (MAX_CHAR+8) //negates class the [^
32 #define OP_RANGE (MAX_CHAR+9)
33 #define OP_CHAR (MAX_CHAR+10)
34 #define OP_EOL (MAX_CHAR+11)
35 #define OP_BOL (MAX_CHAR+12)
36 #define OP_WB (MAX_CHAR+13)
38 #define SQREX_SYMBOL_ANY_CHAR ('.')
39 #define SQREX_SYMBOL_GREEDY_ONE_OR_MORE ('+')
40 #define SQREX_SYMBOL_GREEDY_ZERO_OR_MORE ('*')
41 #define SQREX_SYMBOL_GREEDY_ZERO_OR_ONE ('?')
42 #define SQREX_SYMBOL_BRANCH ('|')
43 #define SQREX_SYMBOL_END_OF_STRING ('$')
44 #define SQREX_SYMBOL_BEGINNING_OF_STRING ('^')
45 #define SQREX_SYMBOL_ESCAPE_CHAR ('\\')
48 typedef int SQRexNodeType
;
50 typedef struct tagSQRexNode
{
64 SQInteger _nallocated
;
68 SQInteger _currsubexp
;
69 const SQChar
**_error
;
72 static SQInteger
sqstd_rex_list(SQRex
*exp
);
74 static SQInteger
sqstd_rex_newnode(SQRex
*exp
, SQRexNodeType type
)
78 n
.next
= n
.right
= n
.left
= -1;
80 n
.right
= exp
->_nsubexpr
++;
81 if(exp
->_nallocated
< (exp
->_nsize
+ 1)) {
82 SQInteger oldsize
= exp
->_nallocated
;
83 exp
->_nallocated
*= 2;
84 exp
->_nodes
= (SQRexNode
*)sq_realloc(exp
->_nodes
, oldsize
* sizeof(SQRexNode
) ,exp
->_nallocated
* sizeof(SQRexNode
));
86 exp
->_nodes
[exp
->_nsize
++] = n
;
87 SQInteger newid
= exp
->_nsize
- 1;
88 return (SQInteger
)newid
;
91 static void sqstd_rex_error(SQRex
*exp
,const SQChar
*error
)
93 if(exp
->_error
) *exp
->_error
= error
;
94 throw std::exception();
97 static void sqstd_rex_expect(SQRex
*exp
, SQChar n
){
99 sqstd_rex_error(exp
, "expected paren");
103 static SQChar
sqstd_rex_escapechar(SQRex
*exp
)
105 if(*exp
->_p
== SQREX_SYMBOL_ESCAPE_CHAR
){
108 case 'v': exp
->_p
++; return '\v';
109 case 'n': exp
->_p
++; return '\n';
110 case 't': exp
->_p
++; return '\t';
111 case 'r': exp
->_p
++; return '\r';
112 case 'f': exp
->_p
++; return '\f';
113 default: return (*exp
->_p
++);
115 } else if(!scisprint(*exp
->_p
)) sqstd_rex_error(exp
,"letter expected");
119 static SQInteger
sqstd_rex_charclass(SQRex
*exp
,SQInteger classid
)
121 SQInteger n
= sqstd_rex_newnode(exp
,OP_CCLASS
);
122 exp
->_nodes
[n
].left
= classid
;
126 static SQInteger
sqstd_rex_charnode(SQRex
*exp
,SQBool isclass
)
129 if(*exp
->_p
== SQREX_SYMBOL_ESCAPE_CHAR
) {
132 case 'n': exp
->_p
++; return sqstd_rex_newnode(exp
,'\n');
133 case 't': exp
->_p
++; return sqstd_rex_newnode(exp
,'\t');
134 case 'r': exp
->_p
++; return sqstd_rex_newnode(exp
,'\r');
135 case 'f': exp
->_p
++; return sqstd_rex_newnode(exp
,'\f');
136 case 'v': exp
->_p
++; return sqstd_rex_newnode(exp
,'\v');
137 case 'a': case 'A': case 'w': case 'W': case 's': case 'S':
138 case 'd': case 'D': case 'x': case 'X': case 'c': case 'C':
139 case 'p': case 'P': case 'l': case 'u':
141 t
= *exp
->_p
; exp
->_p
++;
142 return sqstd_rex_charclass(exp
,t
);
147 SQInteger node
= sqstd_rex_newnode(exp
,OP_WB
);
148 exp
->_nodes
[node
].left
= *exp
->_p
;
153 t
= *exp
->_p
; exp
->_p
++;
154 return sqstd_rex_newnode(exp
,t
);
157 else if(!scisprint(*exp
->_p
)) {
159 sqstd_rex_error(exp
,"letter expected");
161 t
= *exp
->_p
; exp
->_p
++;
162 return sqstd_rex_newnode(exp
,t
);
164 static SQInteger
sqstd_rex_class(SQRex
*exp
)
167 SQInteger first
= -1,chain
;
168 if(*exp
->_p
== SQREX_SYMBOL_BEGINNING_OF_STRING
){
169 ret
= sqstd_rex_newnode(exp
,OP_NCLASS
);
171 }else ret
= sqstd_rex_newnode(exp
,OP_CLASS
);
173 if(*exp
->_p
== ']') sqstd_rex_error(exp
,"empty class");
175 while(*exp
->_p
!= ']' && exp
->_p
!= exp
->_eol
) {
176 if(*exp
->_p
== '-' && first
!= -1){
178 if(*exp
->_p
++ == ']') sqstd_rex_error(exp
,"unfinished range");
179 r
= sqstd_rex_newnode(exp
,OP_RANGE
);
180 if(exp
->_nodes
[first
].type
>*exp
->_p
) sqstd_rex_error(exp
,"invalid range");
181 if(exp
->_nodes
[first
].type
== OP_CCLASS
) sqstd_rex_error(exp
,"cannot use character classes in ranges");
182 exp
->_nodes
[r
].left
= exp
->_nodes
[first
].type
;
183 SQInteger t
= sqstd_rex_escapechar(exp
);
184 exp
->_nodes
[r
].right
= t
;
185 exp
->_nodes
[chain
].next
= r
;
192 exp
->_nodes
[chain
].next
= c
;
194 first
= sqstd_rex_charnode(exp
,SQTrue
);
197 first
= sqstd_rex_charnode(exp
,SQTrue
);
203 exp
->_nodes
[chain
].next
= c
;
208 exp
->_nodes
[ret
].left
= exp
->_nodes
[ret
].next
;
209 exp
->_nodes
[ret
].next
= -1;
213 static SQInteger
sqstd_rex_parsenumber(SQRex
*exp
)
215 SQInteger ret
= *exp
->_p
-'0';
216 SQInteger positions
= 10;
218 while(isdigit(*exp
->_p
)) {
219 ret
= ret
*10+(*exp
->_p
++-'0');
220 if(positions
==1000000000) sqstd_rex_error(exp
,"overflow in numeric constant");
226 static SQInteger
sqstd_rex_element(SQRex
*exp
)
238 sqstd_rex_expect(exp
,':');
239 expr
= sqstd_rex_newnode(exp
,OP_NOCAPEXPR
);
242 expr
= sqstd_rex_newnode(exp
,OP_EXPR
);
243 SQInteger newn
= sqstd_rex_list(exp
);
244 exp
->_nodes
[expr
].left
= newn
;
246 sqstd_rex_expect(exp
,')');
251 ret
= sqstd_rex_class(exp
);
252 sqstd_rex_expect(exp
,']');
254 case SQREX_SYMBOL_END_OF_STRING
: exp
->_p
++; ret
= sqstd_rex_newnode(exp
,OP_EOL
);break;
255 case SQREX_SYMBOL_ANY_CHAR
: exp
->_p
++; ret
= sqstd_rex_newnode(exp
,OP_DOT
);break;
257 ret
= sqstd_rex_charnode(exp
,SQFalse
);
263 SQBool isgreedy
= SQFalse
;
264 unsigned short p0
= 0, p1
= 0;
266 case SQREX_SYMBOL_GREEDY_ZERO_OR_MORE
: p0
= 0; p1
= 0xFFFF; exp
->_p
++; isgreedy
= SQTrue
; break;
267 case SQREX_SYMBOL_GREEDY_ONE_OR_MORE
: p0
= 1; p1
= 0xFFFF; exp
->_p
++; isgreedy
= SQTrue
; break;
268 case SQREX_SYMBOL_GREEDY_ZERO_OR_ONE
: p0
= 0; p1
= 1; exp
->_p
++; isgreedy
= SQTrue
; break;
271 if(!isdigit(*exp
->_p
)) sqstd_rex_error(exp
,"number expected");
272 p0
= (unsigned short)sqstd_rex_parsenumber(exp
);
273 /*******************************/
281 if(isdigit(*exp
->_p
)){
282 p1
= (unsigned short)sqstd_rex_parsenumber(exp
);
284 sqstd_rex_expect(exp
,'}');
287 sqstd_rex_error(exp
,", or } expected");
289 /*******************************/
295 SQInteger nnode
= sqstd_rex_newnode(exp
,OP_GREEDY
);
297 exp
->_nodes
[nnode
].left
= ret
;
298 exp
->_nodes
[nnode
].right
= ((p0
)<<16)|p1
;
302 if((*exp
->_p
!= SQREX_SYMBOL_BRANCH
) && (*exp
->_p
!= ')') && (*exp
->_p
!= SQREX_SYMBOL_GREEDY_ZERO_OR_MORE
) && (*exp
->_p
!= SQREX_SYMBOL_GREEDY_ONE_OR_MORE
) && (*exp
->_p
!= '\0')) {
303 SQInteger nnode
= sqstd_rex_element(exp
);
304 exp
->_nodes
[ret
].next
= nnode
;
310 static SQInteger
sqstd_rex_list(SQRex
*exp
)
313 if(*exp
->_p
== SQREX_SYMBOL_BEGINNING_OF_STRING
) {
315 ret
= sqstd_rex_newnode(exp
,OP_BOL
);
317 e
= sqstd_rex_element(exp
);
319 exp
->_nodes
[ret
].next
= e
;
323 if(*exp
->_p
== SQREX_SYMBOL_BRANCH
) {
324 SQInteger temp
,tright
;
326 temp
= sqstd_rex_newnode(exp
,OP_OR
);
327 exp
->_nodes
[temp
].left
= ret
;
328 tright
= sqstd_rex_list(exp
);
329 exp
->_nodes
[temp
].right
= tright
;
335 static SQBool
sqstd_rex_matchcclass(SQInteger cclass
,SQChar c
)
338 case 'a': return isalpha(c
)?SQTrue
:SQFalse
;
339 case 'A': return !isalpha(c
)?SQTrue
:SQFalse
;
340 case 'w': return (isalnum(c
) || c
== '_')?SQTrue
:SQFalse
;
341 case 'W': return (!isalnum(c
) && c
!= '_')?SQTrue
:SQFalse
;
342 case 's': return isspace(c
)?SQTrue
:SQFalse
;
343 case 'S': return !isspace(c
)?SQTrue
:SQFalse
;
344 case 'd': return isdigit(c
)?SQTrue
:SQFalse
;
345 case 'D': return !isdigit(c
)?SQTrue
:SQFalse
;
346 case 'x': return isxdigit(c
)?SQTrue
:SQFalse
;
347 case 'X': return !isxdigit(c
)?SQTrue
:SQFalse
;
348 case 'c': return iscntrl(c
)?SQTrue
:SQFalse
;
349 case 'C': return !iscntrl(c
)?SQTrue
:SQFalse
;
350 case 'p': return ispunct(c
)?SQTrue
:SQFalse
;
351 case 'P': return !ispunct(c
)?SQTrue
:SQFalse
;
352 case 'l': return islower(c
)?SQTrue
:SQFalse
;
353 case 'u': return isupper(c
)?SQTrue
:SQFalse
;
355 return SQFalse
; /*cannot happen*/
358 static SQBool
sqstd_rex_matchclass(SQRex
* exp
,SQRexNode
*node
,SQInteger c
)
363 if(c
>= node
->left
&& c
<= node
->right
) return SQTrue
;
366 if(sqstd_rex_matchcclass(node
->left
,c
)) return SQTrue
;
369 if(c
== node
->type
)return SQTrue
;
371 } while((node
->next
!= -1) && (node
= &exp
->_nodes
[node
->next
]));
375 static const SQChar
*sqstd_rex_matchnode(SQRex
* exp
,SQRexNode
*node
,const SQChar
*str
,SQRexNode
*next
)
378 SQRexNodeType type
= node
->type
;
381 //SQRexNode *greedystop = (node->next != -1) ? &exp->_nodes[node->next] : NULL;
382 SQRexNode
*greedystop
= NULL
;
383 SQInteger p0
= (node
->right
>> 16)&0x0000FFFF, p1
= node
->right
&0x0000FFFF, nmaches
= 0;
384 const SQChar
*s
=str
, *good
= str
;
386 if(node
->next
!= -1) {
387 greedystop
= &exp
->_nodes
[node
->next
];
393 while((nmaches
== 0xFFFF || nmaches
< p1
)) {
396 if(!(s
= sqstd_rex_matchnode(exp
,&exp
->_nodes
[node
->left
],s
,greedystop
)))
401 //checks that 0 matches satisfy the expression(if so skips)
402 //if not would always stop(for instance if is a '?')
403 if(greedystop
->type
!= OP_GREEDY
||
404 (greedystop
->type
== OP_GREEDY
&& ((greedystop
->right
>> 16)&0x0000FFFF) != 0))
406 SQRexNode
*gnext
= NULL
;
407 if(greedystop
->next
!= -1) {
408 gnext
= &exp
->_nodes
[greedystop
->next
];
409 }else if(next
&& next
->next
!= -1){
410 gnext
= &exp
->_nodes
[next
->next
];
412 stop
= sqstd_rex_matchnode(exp
,greedystop
,s
,gnext
);
414 //if satisfied stop it
415 if(p0
== p1
&& p0
== nmaches
) break;
416 else if(nmaches
>= p0
&& p1
== 0xFFFF) break;
417 else if(nmaches
>= p0
&& nmaches
<= p1
) break;
425 if(p0
== p1
&& p0
== nmaches
) return good
;
426 else if(nmaches
>= p0
&& p1
== 0xFFFF) return good
;
427 else if(nmaches
>= p0
&& nmaches
<= p1
) return good
;
431 const SQChar
*asd
= str
;
432 SQRexNode
*temp
=&exp
->_nodes
[node
->left
];
433 while( (asd
= sqstd_rex_matchnode(exp
,temp
,asd
,NULL
)) ) {
435 temp
= &exp
->_nodes
[temp
->next
];
440 temp
= &exp
->_nodes
[node
->right
];
441 while( (asd
= sqstd_rex_matchnode(exp
,temp
,asd
,NULL
)) ) {
443 temp
= &exp
->_nodes
[temp
->next
];
452 SQRexNode
*n
= &exp
->_nodes
[node
->left
];
453 const SQChar
*cur
= str
;
454 SQInteger capture
= -1;
455 if(node
->type
!= OP_NOCAPEXPR
&& node
->right
== exp
->_currsubexp
) {
456 capture
= exp
->_currsubexp
;
457 exp
->_matches
[capture
].begin
= cur
;
462 SQRexNode
*subnext
= NULL
;
464 subnext
= &exp
->_nodes
[n
->next
];
468 if(!(cur
= sqstd_rex_matchnode(exp
,n
,cur
,subnext
))) {
470 exp
->_matches
[capture
].begin
= 0;
471 exp
->_matches
[capture
].len
= 0;
475 } while((n
->next
!= -1) && (n
= &exp
->_nodes
[n
->next
]));
478 exp
->_matches
[capture
].len
= cur
- exp
->_matches
[capture
].begin
;
482 if((str
== exp
->_bol
&& !isspace(*str
))
483 || (str
== exp
->_eol
&& !isspace(*(str
-1)))
484 || (!isspace(*str
) && isspace(*(str
+1)))
485 || (isspace(*str
) && !isspace(*(str
+1))) ) {
486 return (node
->left
== 'b')?str
:NULL
;
488 return (node
->left
== 'b')?NULL
:str
;
490 if(str
== exp
->_bol
) return str
;
493 if(str
== exp
->_eol
) return str
;
501 if(sqstd_rex_matchclass(exp
,&exp
->_nodes
[node
->left
],*str
)?(type
== OP_CLASS
?SQTrue
:SQFalse
):(type
== OP_NCLASS
?SQTrue
:SQFalse
)) {
507 if(sqstd_rex_matchcclass(node
->left
,*str
)) {
513 if(*str
!= (SQChar
)node
->type
) return NULL
;
521 SQRex
*sqstd_rex_compile(const SQChar
*pattern
,const SQChar
**error
)
523 SQRex
*exp
= (SQRex
*)sq_malloc(sizeof(SQRex
));
524 exp
->_eol
= exp
->_bol
= NULL
;
526 exp
->_nallocated
= (SQInteger
)strlen(pattern
) * sizeof(SQChar
);
527 exp
->_nodes
= (SQRexNode
*)sq_malloc(exp
->_nallocated
* sizeof(SQRexNode
));
531 exp
->_first
= sqstd_rex_newnode(exp
,OP_EXPR
);
534 SQInteger res
= sqstd_rex_list(exp
);
535 exp
->_nodes
[exp
->_first
].left
= res
;
537 sqstd_rex_error(exp
,"unexpected character");
545 /* XXX -- The (int) casts are needed to silent warnings on 64bit systems (SQInteger is 64bit, %d assumes 32bit, (int) is 32bit) */
546 for(i
= 0;i
< nsize
; i
++) {
547 if(exp
->_nodes
[i
].type
>MAX_CHAR
)
548 printf("[%02d] %10s ",(int)i
,g_nnames
[exp
->_nodes
[i
].type
-MAX_CHAR
]);
550 printf("[%02d] %10c ",(int)i
,exp
->_nodes
[i
].type
);
551 printf("left %02d right %02d next %02d\n",(int)exp
->_nodes
[i
].left
,(int)exp
->_nodes
[i
].right
,(int)exp
->_nodes
[i
].next
);
556 exp
->_matches
= (SQRexMatch
*) sq_malloc(exp
->_nsubexpr
* sizeof(SQRexMatch
));
557 memset(exp
->_matches
,0,exp
->_nsubexpr
* sizeof(SQRexMatch
));
566 void sqstd_rex_free(SQRex
*exp
)
569 if(exp
->_nodes
) sq_free(exp
->_nodes
,exp
->_nallocated
* sizeof(SQRexNode
));
570 if(exp
->_matches
) sq_free(exp
->_matches
,exp
->_nsubexpr
* sizeof(SQRexMatch
));
571 sq_free(exp
,sizeof(SQRex
));
575 SQBool
sqstd_rex_match(SQRex
* exp
,const SQChar
* text
)
577 const SQChar
* res
= NULL
;
579 exp
->_eol
= text
+ strlen(text
);
580 exp
->_currsubexp
= 0;
581 res
= sqstd_rex_matchnode(exp
,exp
->_nodes
,text
,NULL
);
582 if(res
== NULL
|| res
!= exp
->_eol
)
587 SQBool
sqstd_rex_searchrange(SQRex
* exp
,const SQChar
* text_begin
,const SQChar
* text_end
,const SQChar
** out_begin
, const SQChar
** out_end
)
589 const SQChar
*cur
= NULL
;
590 SQInteger node
= exp
->_first
;
591 if(text_begin
>= text_end
) return SQFalse
;
592 exp
->_bol
= text_begin
;
593 exp
->_eol
= text_end
;
597 exp
->_currsubexp
= 0;
598 cur
= sqstd_rex_matchnode(exp
,&exp
->_nodes
[node
],cur
,NULL
);
601 node
= exp
->_nodes
[node
].next
;
604 } while(cur
== NULL
&& text_begin
!= text_end
);
611 if(out_begin
) *out_begin
= text_begin
;
612 if(out_end
) *out_end
= cur
;
616 SQBool
sqstd_rex_search(SQRex
* exp
,const SQChar
* text
, const SQChar
** out_begin
, const SQChar
** out_end
)
618 return sqstd_rex_searchrange(exp
,text
,text
+ strlen(text
),out_begin
,out_end
);
621 SQInteger
sqstd_rex_getsubexpcount(SQRex
* exp
)
623 return exp
->_nsubexpr
;
626 SQBool
sqstd_rex_getsubexp(SQRex
* exp
, SQInteger n
, SQRexMatch
*subexp
)
628 if( n
<0 || n
>= exp
->_nsubexpr
) return SQFalse
;
629 *subexp
= exp
->_matches
[n
];