1 /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.17.2.3 2008/02/11 23:26:42 kurt Exp $ */
2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4 * Copyright 1998-2008 The OpenLDAP Foundation.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
11 * A copy of this license is available in file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
15 /* Copyright 1997, 1998, 1999 Computing Research Labs,
16 * New Mexico State University
18 * Permission is hereby granted, free of charge, to any person obtaining a
19 * copy of this software and associated documentation files (the "Software"),
20 * to deal in the Software without restriction, including without limitation
21 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
22 * and/or sell copies of the Software, and to permit persons to whom the
23 * Software is furnished to do so, subject to the following conditions:
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
31 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
32 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
33 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
34 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36 /* $Id: ure.c,v 1.1.1.1 2008/02/11 23:26:42 lukem Exp $" */
40 #include <ac/stdlib.h>
41 #include <ac/string.h>
42 #include <ac/unistd.h>
47 * Flags used internally in the DFA.
49 #define _URE_DFA_CASEFOLD 0x01
50 #define _URE_DFA_BLANKLINE 0x02
52 static unsigned long cclass_flags
[] = {
89 * Symbol types for the DFA.
91 #define _URE_ANY_CHAR 1
94 #define _URE_NCCLASS 4
95 #define _URE_BOL_ANCHOR 5
96 #define _URE_EOL_ANCHOR 6
99 * Op codes for converting the NFA to a DFA.
101 #define _URE_SYMBOL 10
102 #define _URE_PAREN 11
103 #define _URE_QUEST 12
110 #define _URE_NOOP 0xffff
112 #define _URE_REGSTART 0x8000
113 #define _URE_REGEND 0x4000
116 * Structure used to handle a compacted range of characters.
124 _ure_range_t
*ranges
;
135 * This is a general element structure used for expressions and stack
147 * This is a structure used to track a list or a stack of states.
156 * Structure to track the list of unique states for a symbol
165 _ure_stlist_t states
;
169 * Structure to hold a single state.
182 * Structure used for keeping lists of states.
185 _ure_state_t
*states
;
191 * Structure to track pairs of DFA states when equivalent states are
200 * Structure used for constructing the NFA and reducing to a minimal DFA.
202 typedef struct _ure_buffer_t
{
210 * Table of unique symbols encountered.
212 _ure_symtab_t
*symtab
;
217 * Tracks the unique expressions generated for the NFA and when the NFA is
225 * The reduced table of unique groups of NFA states.
227 _ure_statetable_t states
;
230 * Tracks states when equivalent states are merged.
248 typedef struct _ure_dfa_t
{
254 _ure_dstate_t
*states
;
261 /*************************************************************************
265 *************************************************************************/
268 _ure_memmove(char *dest
, char *src
, unsigned long bytes
)
277 * Do a memmove using Ye Olde Duff's Device for efficiency.
286 case 7: *--dest
= *--src
;
287 case 6: *--dest
= *--src
;
288 case 5: *--dest
= *--src
;
289 case 4: *--dest
= *--src
;
290 case 3: *--dest
= *--src
;
291 case 2: *--dest
= *--src
;
292 case 1: *--dest
= *--src
;
295 } else if (src
> dest
) {
299 case 7: *dest
++ = *src
++;
300 case 6: *dest
++ = *src
++;
301 case 5: *dest
++ = *src
++;
302 case 4: *dest
++ = *src
++;
303 case 3: *dest
++ = *src
++;
304 case 2: *dest
++ = *src
++;
305 case 1: *dest
++ = *src
++;
312 _ure_push(ucs2_t v
, _ure_buffer_t
*b
)
320 * If the `reducing' parameter is non-zero, check to see if the value
321 * passed is already on the stack.
323 if (b
->reducing
!= 0 && b
->expr
[v
].onstack
!= 0)
327 if (s
->slist_used
== s
->slist_size
) {
328 if (s
->slist_size
== 0)
329 s
->slist
= (ucs2_t
*) malloc(sizeof(ucs2_t
) << 3);
331 s
->slist
= (ucs2_t
*) realloc((char *) s
->slist
,
332 sizeof(ucs2_t
) * (s
->slist_size
+ 8));
335 s
->slist
[s
->slist_used
++] = v
;
338 * If the `reducing' parameter is non-zero, flag the element as being on
341 if (b
->reducing
!= 0)
342 b
->expr
[v
].onstack
= 1;
346 _ure_peek(_ure_buffer_t
*b
)
348 if (b
== 0 || b
->stack
.slist_used
== 0)
351 return b
->stack
.slist
[b
->stack
.slist_used
- 1];
355 _ure_pop(_ure_buffer_t
*b
)
359 if (b
== 0 || b
->stack
.slist_used
== 0)
362 v
= b
->stack
.slist
[--b
->stack
.slist_used
];
364 b
->expr
[v
].onstack
= 0;
369 /*************************************************************************
371 * Start symbol parse functions.
373 *************************************************************************/
376 * Parse a comma-separated list of integers that represent character
377 * properties. Combine them into a mask that is returned in the `mask'
378 * variable, and return the number of characters consumed.
381 _ure_prop_list(ucs2_t
*pp
, unsigned long limit
, unsigned long *mask
,
390 for (m
= n
= 0; b
->error
== _URE_OK
&& sp
< ep
; sp
++) {
393 * Encountered a comma, so select the next character property flag
394 * and reset the number.
396 m
|= cclass_flags
[n
];
398 } else if (*sp
>= '0' && *sp
<= '9')
400 * Encountered a digit, so start or continue building the cardinal
401 * that represents the character property flag.
403 n
= (n
* 10) + (*sp
- '0');
406 * Encountered something that is not part of the property list.
407 * Indicate that we are done.
412 * If a property number greater than 32 occurs, then there is a
413 * problem. Most likely a missing comma separator.
416 b
->error
= _URE_INVALID_PROPERTY
;
420 m
|= cclass_flags
[n
];
423 * Set the mask that represents the group of character properties.
428 * Return the number of characters consumed.
434 * Collect a hex number with 1 to 4 digits and return the number
435 * of characters used.
438 _ure_hex(ucs2_t
*np
, unsigned long limit
, ucs4_t
*n
)
447 for (nn
= 0, i
= 0; i
< 4 && sp
< ep
; i
++, sp
++) {
448 if (*sp
>= '0' && *sp
<= '9')
449 nn
= (nn
<< 4) + (*sp
- '0');
450 else if (*sp
>= 'A' && *sp
<= 'F')
451 nn
= (nn
<< 4) + ((*sp
- 'A') + 10);
452 else if (*sp
>= 'a' && *sp
<= 'f')
453 nn
= (nn
<< 4) + ((*sp
- 'a') + 10);
456 * Encountered something that is not a hex digit.
462 * Assign the character code collected and return the number of
471 * Insert a range into a character class, removing duplicates and ordering
472 * them in increasing range-start order.
475 _ure_add_range(_ure_ccl_t
*ccl
, _ure_range_t
*r
, _ure_buffer_t
*b
)
482 * If the `casefold' flag is set, then make sure both endpoints of the
483 * range are converted to lower case.
485 if (b
->flags
& _URE_DFA_CASEFOLD
) {
486 r
->min_code
= _ure_tolower(r
->min_code
);
487 r
->max_code
= _ure_tolower(r
->max_code
);
491 * Swap the range endpoints if they are not in increasing order.
493 if (r
->min_code
> r
->max_code
) {
495 r
->min_code
= r
->max_code
;
499 for (i
= 0, rp
= ccl
->ranges
;
500 i
< ccl
->ranges_used
&& r
->min_code
< rp
->min_code
; i
++, rp
++) ;
503 * Check for a duplicate.
505 if (i
< ccl
->ranges_used
&&
506 r
->min_code
== rp
->min_code
&& r
->max_code
== rp
->max_code
)
509 if (ccl
->ranges_used
== ccl
->ranges_size
) {
510 if (ccl
->ranges_size
== 0)
511 ccl
->ranges
= (_ure_range_t
*) malloc(sizeof(_ure_range_t
) << 3);
513 ccl
->ranges
= (_ure_range_t
*)
514 realloc((char *) ccl
->ranges
,
515 sizeof(_ure_range_t
) * (ccl
->ranges_size
+ 8));
516 ccl
->ranges_size
+= 8;
519 rp
= ccl
->ranges
+ ccl
->ranges_used
;
521 if (i
< ccl
->ranges_used
)
522 _ure_memmove((char *) (rp
+ 1), (char *) rp
,
523 sizeof(_ure_range_t
) * (ccl
->ranges_used
- i
));
526 rp
->min_code
= r
->min_code
;
527 rp
->max_code
= r
->max_code
;
530 #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
531 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
532 #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
533 #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
535 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
536 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
537 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
538 #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
540 typedef void (*_ure_cclsetup_t
)(
550 _ure_cclsetup_t func
;
555 _ure_ccl_setup(_ure_symtab_t
*sym
, unsigned long mask
, _ure_buffer_t
*b
)
561 _ure_space_setup(_ure_symtab_t
*sym
, unsigned long mask
, _ure_buffer_t
*b
)
568 * Add the additional characters needed for handling isspace().
570 range
.min_code
= range
.max_code
= '\t';
571 _ure_add_range(&sym
->sym
.ccl
, &range
, b
);
572 range
.min_code
= range
.max_code
= '\r';
573 _ure_add_range(&sym
->sym
.ccl
, &range
, b
);
574 range
.min_code
= range
.max_code
= '\n';
575 _ure_add_range(&sym
->sym
.ccl
, &range
, b
);
576 range
.min_code
= range
.max_code
= '\f';
577 _ure_add_range(&sym
->sym
.ccl
, &range
, b
);
578 range
.min_code
= range
.max_code
= 0xfeff;
579 _ure_add_range(&sym
->sym
.ccl
, &range
, b
);
583 _ure_xdigit_setup(_ure_symtab_t
*sym
, unsigned long mask
, _ure_buffer_t
*b
)
588 * Add the additional characters needed for handling isxdigit().
590 range
.min_code
= '0';
591 range
.max_code
= '9';
592 _ure_add_range(&sym
->sym
.ccl
, &range
, b
);
593 range
.min_code
= 'A';
594 range
.max_code
= 'F';
595 _ure_add_range(&sym
->sym
.ccl
, &range
, b
);
596 range
.min_code
= 'a';
597 range
.max_code
= 'f';
598 _ure_add_range(&sym
->sym
.ccl
, &range
, b
);
601 static _ure_trie_t cclass_trie
[] = {
602 {0x003a, 1, 1, 0, 0},
603 {0x0061, 9, 10, 0, 0},
604 {0x0063, 8, 19, 0, 0},
605 {0x0064, 7, 24, 0, 0},
606 {0x0067, 6, 29, 0, 0},
607 {0x006c, 5, 34, 0, 0},
608 {0x0070, 4, 39, 0, 0},
609 {0x0073, 3, 49, 0, 0},
610 {0x0075, 2, 54, 0, 0},
611 {0x0078, 1, 59, 0, 0},
612 {0x006c, 1, 11, 0, 0},
613 {0x006e, 2, 13, 0, 0},
614 {0x0070, 1, 16, 0, 0},
615 {0x0075, 1, 14, 0, 0},
616 {0x006d, 1, 15, 0, 0},
617 {0x003a, 1, 16, _ure_ccl_setup
, _URE_ALNUM_MASK
},
618 {0x0068, 1, 17, 0, 0},
619 {0x0061, 1, 18, 0, 0},
620 {0x003a, 1, 19, _ure_ccl_setup
, _URE_ALPHA_MASK
},
621 {0x006e, 1, 20, 0, 0},
622 {0x0074, 1, 21, 0, 0},
623 {0x0072, 1, 22, 0, 0},
624 {0x006c, 1, 23, 0, 0},
625 {0x003a, 1, 24, _ure_ccl_setup
, _URE_CNTRL
},
626 {0x0069, 1, 25, 0, 0},
627 {0x0067, 1, 26, 0, 0},
628 {0x0069, 1, 27, 0, 0},
629 {0x0074, 1, 28, 0, 0},
630 {0x003a, 1, 29, _ure_ccl_setup
, _URE_NUMDIGIT
},
631 {0x0072, 1, 30, 0, 0},
632 {0x0061, 1, 31, 0, 0},
633 {0x0070, 1, 32, 0, 0},
634 {0x0068, 1, 33, 0, 0},
635 {0x003a, 1, 34, _ure_ccl_setup
, _URE_GRAPH_MASK
},
636 {0x006f, 1, 35, 0, 0},
637 {0x0077, 1, 36, 0, 0},
638 {0x0065, 1, 37, 0, 0},
639 {0x0072, 1, 38, 0, 0},
640 {0x003a, 1, 39, _ure_ccl_setup
, _URE_LOWER
},
641 {0x0072, 2, 41, 0, 0},
642 {0x0075, 1, 45, 0, 0},
643 {0x0069, 1, 42, 0, 0},
644 {0x006e, 1, 43, 0, 0},
645 {0x0074, 1, 44, 0, 0},
646 {0x003a, 1, 45, _ure_ccl_setup
, _URE_PRINT_MASK
},
647 {0x006e, 1, 46, 0, 0},
648 {0x0063, 1, 47, 0, 0},
649 {0x0074, 1, 48, 0, 0},
650 {0x003a, 1, 49, _ure_ccl_setup
, _URE_PUNCT_MASK
},
651 {0x0070, 1, 50, 0, 0},
652 {0x0061, 1, 51, 0, 0},
653 {0x0063, 1, 52, 0, 0},
654 {0x0065, 1, 53, 0, 0},
655 {0x003a, 1, 54, _ure_space_setup
, _URE_SPACE_MASK
},
656 {0x0070, 1, 55, 0, 0},
657 {0x0070, 1, 56, 0, 0},
658 {0x0065, 1, 57, 0, 0},
659 {0x0072, 1, 58, 0, 0},
660 {0x003a, 1, 59, _ure_ccl_setup
, _URE_UPPER
},
661 {0x0064, 1, 60, 0, 0},
662 {0x0069, 1, 61, 0, 0},
663 {0x0067, 1, 62, 0, 0},
664 {0x0069, 1, 63, 0, 0},
665 {0x0074, 1, 64, 0, 0},
666 {0x003a, 1, 65, _ure_xdigit_setup
, 0},
670 * Probe for one of the POSIX colon delimited character classes in the static
674 _ure_posix_ccl(ucs2_t
*cp
, unsigned long limit
, _ure_symtab_t
*sym
,
683 * If the number of characters left is less than 7, then this cannot be
684 * interpreted as one of the colon delimited classes.
692 for (i
= 0; sp
< ep
&& i
< 8; i
++, sp
++) {
695 for (; n
> 0 && tp
->key
!= *sp
; tp
++, n
--) ;
700 if (*sp
== ':' && (i
== 6 || i
== 7)) {
705 tp
= cclass_trie
+ tp
->next
;
710 (*tp
->func
)(sym
, tp
->mask
, b
);
716 * Construct a list of ranges and return the number of characters consumed.
719 _ure_cclass(ucs2_t
*cp
, unsigned long limit
, _ure_symtab_t
*symp
,
733 symp
->type
= _URE_NCCLASS
;
736 symp
->type
= _URE_CCLASS
;
738 for (last
= 0, range_end
= 0;
739 b
->error
== _URE_OK
&& sp
< ep
&& *sp
!= ']'; ) {
744 * The EOS was encountered when expecting the reverse solidus
745 * to be followed by the character it is escaping. Set an
746 * error code and return the number of characters consumed up
749 b
->error
= _URE_UNEXPECTED_EOS
;
778 sp
+= _ure_prop_list(sp
, ep
- sp
, &symp
->props
, b
);
780 * Invert the bit mask of the properties if this is a negated
781 * character class or if 'P' is used to specify a list of
782 * character properties that should *not* match in a
786 symp
->props
= ~symp
->props
;
794 ((*sp
>= '0' && *sp
<= '9') ||
795 (*sp
>= 'A' && *sp
<= 'F') ||
796 (*sp
>= 'a' && *sp
<= 'f')))
797 sp
+= _ure_hex(sp
, ep
- sp
, &c
);
799 } else if (c
== ':') {
801 * Probe for a POSIX colon delimited character class.
804 if ((n
= _ure_posix_ccl(sp
, ep
- sp
, symp
, b
)) == 0)
812 cclp
= &symp
->sym
.ccl
;
815 * Check to see if the current character is a low surrogate that needs
816 * to be combined with a preceding high surrogate.
819 if (c
>= 0xdc00 && c
<= 0xdfff)
821 * Construct the UTF16 character code.
823 c
= 0x10000 + (((last
& 0x03ff) << 10) | (c
& 0x03ff));
826 * Add the isolated high surrogate to the range.
829 range
.max_code
= last
& 0xffff;
831 range
.min_code
= range
.max_code
= last
& 0xffff;
833 _ure_add_range(cclp
, &range
, b
);
839 * Clear the last character code.
844 * This slightly awkward code handles the different cases needed to
847 if (c
>= 0xd800 && c
<= 0xdbff) {
849 * If the high surrogate is followed by a range indicator, simply
850 * add it as the range start. Otherwise, save it in case the next
851 * character is a low surrogate.
859 } else if (range_end
== 1) {
861 _ure_add_range(cclp
, &range
, b
);
864 range
.min_code
= range
.max_code
= c
;
869 _ure_add_range(cclp
, &range
, b
);
873 if (sp
< ep
&& *sp
== ']')
877 * The parse was not terminated by the character class close symbol
878 * (']'), so set an error code.
880 b
->error
= _URE_CCLASS_OPEN
;
886 * Probe for a low surrogate hex code.
889 _ure_probe_ls(ucs2_t
*ls
, unsigned long limit
, ucs4_t
*c
)
894 for (i
= code
= 0, sp
= ls
, ep
= sp
+ limit
; i
< 4 && sp
< ep
; sp
++) {
895 if (*sp
>= '0' && *sp
<= '9')
896 code
= (code
<< 4) + (*sp
- '0');
897 else if (*sp
>= 'A' && *sp
<= 'F')
898 code
= (code
<< 4) + ((*sp
- 'A') + 10);
899 else if (*sp
>= 'a' && *sp
<= 'f')
900 code
= (code
<< 4) + ((*sp
- 'a') + 10);
906 return (0xdc00 <= code
&& code
<= 0xdfff) ? sp
- ls
: 0;
910 _ure_compile_symbol(ucs2_t
*sym
, unsigned long limit
, _ure_symtab_t
*symp
,
919 if ((c
= *sp
++) == '\\') {
923 * The EOS was encountered when expecting the reverse solidus to
924 * be followed by the character it is escaping. Set an error code
925 * and return the number of characters consumed up to this point.
927 b
->error
= _URE_UNEXPECTED_EOS
;
935 symp
->type
= (c
== 'p') ? _URE_CCLASS
: _URE_NCCLASS
;
936 sp
+= _ure_prop_list(sp
, ep
- sp
, &symp
->props
, b
);
939 symp
->type
= _URE_CHAR
;
940 symp
->sym
.chr
= 0x07;
943 symp
->type
= _URE_CHAR
;
944 symp
->sym
.chr
= 0x08;
947 symp
->type
= _URE_CHAR
;
948 symp
->sym
.chr
= 0x0c;
951 symp
->type
= _URE_CHAR
;
952 symp
->sym
.chr
= 0x0a;
955 symp
->type
= _URE_CHAR
;
956 symp
->sym
.chr
= 0x0d;
959 symp
->type
= _URE_CHAR
;
960 symp
->sym
.chr
= 0x09;
963 symp
->type
= _URE_CHAR
;
964 symp
->sym
.chr
= 0x0b;
971 * Collect between 1 and 4 digits representing a UCS2 code. Fall
972 * through to the next case.
975 ((*sp
>= '0' && *sp
<= '9') ||
976 (*sp
>= 'A' && *sp
<= 'F') ||
977 (*sp
>= 'a' && *sp
<= 'f')))
978 sp
+= _ure_hex(sp
, ep
- sp
, &c
);
982 * Simply add an escaped character here.
984 symp
->type
= _URE_CHAR
;
987 } else if (c
== '^' || c
== '$')
989 * Handle the BOL and EOL anchors. This actually consists simply of
990 * setting a flag that indicates that the user supplied anchor match
991 * function should be called. This needs to be done instead of simply
992 * matching line/paragraph separators because beginning-of-text and
993 * end-of-text tests are needed as well.
995 symp
->type
= (c
== '^') ? _URE_BOL_ANCHOR
: _URE_EOL_ANCHOR
;
998 * Construct a character class.
1000 sp
+= _ure_cclass(sp
, ep
- sp
, symp
, b
);
1002 symp
->type
= _URE_ANY_CHAR
;
1004 symp
->type
= _URE_CHAR
;
1009 * If the symbol type happens to be a character and is a high surrogate,
1010 * then probe forward to see if it is followed by a low surrogate that
1011 * needs to be added.
1013 if (sp
< ep
&& symp
->type
== _URE_CHAR
&&
1014 0xd800 <= symp
->sym
.chr
&& symp
->sym
.chr
<= 0xdbff) {
1016 if (0xdc00 <= *sp
&& *sp
<= 0xdfff) {
1017 symp
->sym
.chr
= 0x10000 + (((symp
->sym
.chr
& 0x03ff) << 10) |
1020 } else if (*sp
== '\\' && (*(sp
+ 1) == 'x' || *(sp
+ 1) == 'X' ||
1021 *(sp
+ 1) == 'u' || *(sp
+ 1) == 'U')) {
1022 sp
+= _ure_probe_ls(sp
+ 2, ep
- (sp
+ 2), &c
);
1023 if (0xdc00 <= c
&& c
<= 0xdfff) {
1025 * Take into account the \[xu] in front of the hex code.
1028 symp
->sym
.chr
= 0x10000 + (((symp
->sym
.chr
& 0x03ff) << 10) |
1035 * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1036 * the `casefold' flag is set.
1038 if ((b
->flags
& _URE_DFA_CASEFOLD
) && symp
->type
== _URE_CHAR
)
1039 symp
->sym
.chr
= _ure_tolower(symp
->sym
.chr
);
1042 * If the symbol constructed is anything other than one of the anchors,
1043 * make sure the _URE_DFA_BLANKLINE flag is removed.
1045 if (symp
->type
!= _URE_BOL_ANCHOR
&& symp
->type
!= _URE_EOL_ANCHOR
)
1046 b
->flags
&= ~_URE_DFA_BLANKLINE
;
1049 * Return the number of characters consumed.
1055 _ure_sym_neq(_ure_symtab_t
*a
, _ure_symtab_t
*b
)
1057 if (a
->type
!= b
->type
|| a
->mods
!= b
->mods
|| a
->props
!= b
->props
)
1060 if (a
->type
== _URE_CCLASS
|| a
->type
== _URE_NCCLASS
) {
1061 if (a
->sym
.ccl
.ranges_used
!= b
->sym
.ccl
.ranges_used
)
1063 if (a
->sym
.ccl
.ranges_used
> 0 &&
1064 memcmp((char *) a
->sym
.ccl
.ranges
, (char *) b
->sym
.ccl
.ranges
,
1065 sizeof(_ure_range_t
) * a
->sym
.ccl
.ranges_used
) != 0)
1067 } else if (a
->type
== _URE_CHAR
&& a
->sym
.chr
!= b
->sym
.chr
)
1073 * Construct a symbol, but only keep unique symbols.
1076 _ure_make_symbol(ucs2_t
*sym
, unsigned long limit
, unsigned long *consumed
,
1080 _ure_symtab_t
*sp
, symbol
;
1083 * Build the next symbol so we can test to see if it is already in the
1086 (void) memset((char *) &symbol
, '\0', sizeof(_ure_symtab_t
));
1087 *consumed
= _ure_compile_symbol(sym
, limit
, &symbol
, b
);
1090 * Check to see if the symbol exists.
1092 for (i
= 0, sp
= b
->symtab
;
1093 i
< b
->symtab_used
&& _ure_sym_neq(&symbol
, sp
); i
++, sp
++) ;
1095 if (i
< b
->symtab_used
) {
1097 * Free up any ranges used for the symbol.
1099 if ((symbol
.type
== _URE_CCLASS
|| symbol
.type
== _URE_NCCLASS
) &&
1100 symbol
.sym
.ccl
.ranges_size
> 0)
1101 free((char *) symbol
.sym
.ccl
.ranges
);
1103 return b
->symtab
[i
].id
;
1107 * Need to add the new symbol.
1109 if (b
->symtab_used
== b
->symtab_size
) {
1110 if (b
->symtab_size
== 0)
1111 b
->symtab
= (_ure_symtab_t
*) malloc(sizeof(_ure_symtab_t
) << 3);
1113 b
->symtab
= (_ure_symtab_t
*)
1114 realloc((char *) b
->symtab
,
1115 sizeof(_ure_symtab_t
) * (b
->symtab_size
+ 8));
1116 sp
= b
->symtab
+ b
->symtab_size
;
1117 (void) memset((char *) sp
, '\0', sizeof(_ure_symtab_t
) << 3);
1118 b
->symtab_size
+= 8;
1121 symbol
.id
= b
->symtab_used
++;
1122 (void) AC_MEMCPY((char *) &b
->symtab
[symbol
.id
], (char *) &symbol
,
1123 sizeof(_ure_symtab_t
));
1128 /*************************************************************************
1130 * End symbol parse functions.
1132 *************************************************************************/
1135 _ure_make_expr(ucs2_t type
, ucs2_t lhs
, ucs2_t rhs
, _ure_buffer_t
*b
)
1143 * Determine if the expression already exists or not.
1145 for (i
= 0; i
< b
->expr_used
; i
++) {
1146 if (b
->expr
[i
].type
== type
&& b
->expr
[i
].lhs
== lhs
&&
1147 b
->expr
[i
].rhs
== rhs
)
1150 if (i
< b
->expr_used
)
1154 * Need to add a new expression.
1156 if (b
->expr_used
== b
->expr_size
) {
1157 if (b
->expr_size
== 0)
1158 b
->expr
= (_ure_elt_t
*) malloc(sizeof(_ure_elt_t
) << 3);
1160 b
->expr
= (_ure_elt_t
*)
1161 realloc((char *) b
->expr
,
1162 sizeof(_ure_elt_t
) * (b
->expr_size
+ 8));
1166 b
->expr
[b
->expr_used
].onstack
= 0;
1167 b
->expr
[b
->expr_used
].type
= type
;
1168 b
->expr
[b
->expr_used
].lhs
= lhs
;
1169 b
->expr
[b
->expr_used
].rhs
= rhs
;
1171 return b
->expr_used
++;
1174 static unsigned char spmap
[] = {
1175 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1176 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1177 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1180 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1181 (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1184 * Convert the regular expression into an NFA in a form that will be easy to
1185 * reduce to a DFA. The starting state for the reduction will be returned.
1188 _ure_re2nfa(ucs2_t
*re
, unsigned long relen
, _ure_buffer_t
*b
)
1190 ucs2_t c
, state
, top
, sym
, *sp
, *ep
;
1197 while (b
->error
== _URE_OK
&& sp
< ep
) {
1201 _ure_push(_URE_PAREN
, b
);
1205 * Check for the case of too many close parentheses.
1207 if (_ure_peek(b
) == _URE_NOOP
) {
1208 b
->error
= _URE_UNBALANCED_GROUP
;
1212 while ((top
= _ure_peek(b
)) == _URE_AND
|| top
== _URE_OR
)
1214 * Make an expression with the AND or OR operator and its right
1217 state
= _ure_make_expr(_ure_pop(b
), _ure_pop(b
), state
, b
);
1220 * Remove the _URE_PAREN off the stack.
1225 state
= _ure_make_expr(_URE_STAR
, state
, _URE_NOOP
, b
);
1228 state
= _ure_make_expr(_URE_PLUS
, state
, _URE_NOOP
, b
);
1231 state
= _ure_make_expr(_URE_QUEST
, state
, _URE_NOOP
, b
);
1234 while ((top
= _ure_peek(b
)) == _URE_AND
|| top
== _URE_OR
)
1236 * Make an expression with the AND or OR operator and its right
1239 state
= _ure_make_expr(_ure_pop(b
), _ure_pop(b
), state
, b
);
1241 _ure_push(state
, b
);
1242 _ure_push(_URE_OR
, b
);
1246 sym
= _ure_make_symbol(sp
, ep
- sp
, &used
, b
);
1248 state
= _ure_make_expr(_URE_SYMBOL
, sym
, _URE_NOOP
, b
);
1252 if (c
!= '(' && c
!= '|' && sp
< ep
&&
1253 (!_ure_isspecial(*sp
) || *sp
== '(')) {
1254 _ure_push(state
, b
);
1255 _ure_push(_URE_AND
, b
);
1258 while ((top
= _ure_peek(b
)) == _URE_AND
|| top
== _URE_OR
)
1260 * Make an expression with the AND or OR operator and its right
1263 state
= _ure_make_expr(_ure_pop(b
), _ure_pop(b
), state
, b
);
1265 if (b
->stack
.slist_used
> 0)
1266 b
->error
= _URE_UNBALANCED_GROUP
;
1268 return (b
->error
== _URE_OK
) ? state
: _URE_NOOP
;
1272 _ure_add_symstate(ucs2_t sym
, ucs2_t state
, _ure_buffer_t
*b
)
1278 * Locate the symbol in the symbol table so the state can be added.
1279 * If the symbol doesn't exist, then a real problem exists.
1281 for (i
= 0, sp
= b
->symtab
; i
< b
->symtab_used
&& sym
!= sp
->id
;
1285 * Now find out if the state exists in the symbol's state list.
1287 for (i
= 0, stp
= sp
->states
.slist
;
1288 i
< sp
->states
.slist_used
&& state
> *stp
; i
++, stp
++) ;
1290 if (i
== sp
->states
.slist_used
|| state
< *stp
) {
1292 * Need to add the state in order.
1294 if (sp
->states
.slist_used
== sp
->states
.slist_size
) {
1295 if (sp
->states
.slist_size
== 0)
1296 sp
->states
.slist
= (ucs2_t
*) malloc(sizeof(ucs2_t
) << 3);
1298 sp
->states
.slist
= (ucs2_t
*)
1299 realloc((char *) sp
->states
.slist
,
1300 sizeof(ucs2_t
) * (sp
->states
.slist_size
+ 8));
1301 sp
->states
.slist_size
+= 8;
1303 if (i
< sp
->states
.slist_used
)
1304 (void) _ure_memmove((char *) (sp
->states
.slist
+ i
+ 1),
1305 (char *) (sp
->states
.slist
+ i
),
1306 sizeof(ucs2_t
) * (sp
->states
.slist_used
- i
));
1307 sp
->states
.slist
[i
] = state
;
1308 sp
->states
.slist_used
++;
1313 _ure_add_state(ucs2_t nstates
, ucs2_t
*states
, _ure_buffer_t
*b
)
1318 for (i
= 0, sp
= b
->states
.states
; i
< b
->states
.states_used
; i
++, sp
++) {
1319 if (sp
->st
.slist_used
== nstates
&&
1320 memcmp((char *) states
, (char *) sp
->st
.slist
,
1321 sizeof(ucs2_t
) * nstates
) == 0)
1325 if (i
== b
->states
.states_used
) {
1327 * Need to add a new DFA state (set of NFA states).
1329 if (b
->states
.states_used
== b
->states
.states_size
) {
1330 if (b
->states
.states_size
== 0)
1331 b
->states
.states
= (_ure_state_t
*)
1332 malloc(sizeof(_ure_state_t
) << 3);
1334 b
->states
.states
= (_ure_state_t
*)
1335 realloc((char *) b
->states
.states
,
1336 sizeof(_ure_state_t
) * (b
->states
.states_size
+ 8));
1337 sp
= b
->states
.states
+ b
->states
.states_size
;
1338 (void) memset((char *) sp
, '\0', sizeof(_ure_state_t
) << 3);
1339 b
->states
.states_size
+= 8;
1342 sp
= b
->states
.states
+ b
->states
.states_used
++;
1345 if (sp
->st
.slist_used
+ nstates
> sp
->st
.slist_size
) {
1346 if (sp
->st
.slist_size
== 0)
1347 sp
->st
.slist
= (ucs2_t
*)
1348 malloc(sizeof(ucs2_t
) * (sp
->st
.slist_used
+ nstates
));
1350 sp
->st
.slist
= (ucs2_t
*)
1351 realloc((char *) sp
->st
.slist
,
1352 sizeof(ucs2_t
) * (sp
->st
.slist_used
+ nstates
));
1353 sp
->st
.slist_size
= sp
->st
.slist_used
+ nstates
;
1355 sp
->st
.slist_used
= nstates
;
1356 (void) AC_MEMCPY((char *) sp
->st
.slist
, (char *) states
,
1357 sizeof(ucs2_t
) * nstates
);
1361 * Return the ID of the DFA state representing a group of NFA states.
1367 _ure_reduce(ucs2_t start
, _ure_buffer_t
*b
)
1369 ucs2_t i
, j
, state
, eval
, syms
, rhs
;
1370 ucs2_t s1
, s2
, ns1
, ns2
;
1377 * Add the starting state for the reduction.
1379 _ure_add_state(1, &start
, b
);
1382 * Process each set of NFA states that get created.
1384 for (i
= 0; i
< b
->states
.states_used
; i
++) {
1385 sp
= b
->states
.states
+ i
;
1388 * Push the current states on the stack.
1390 for (j
= 0; j
< sp
->st
.slist_used
; j
++)
1391 _ure_push(sp
->st
.slist
[j
], b
);
1394 * Reduce the NFA states.
1396 for (j
= sp
->accepting
= syms
= 0; j
< b
->stack
.slist_used
; j
++) {
1397 state
= b
->stack
.slist
[j
];
1401 * This inner loop is the iterative equivalent of recursively
1402 * reducing subexpressions generated as a result of a reduction.
1405 switch (b
->expr
[state
].type
) {
1407 ns1
= _ure_make_expr(_URE_ONE
, _URE_NOOP
, _URE_NOOP
, b
);
1408 _ure_add_symstate(b
->expr
[state
].lhs
, ns1
, b
);
1417 s1
= b
->expr
[state
].lhs
;
1418 ns1
= _ure_make_expr(_URE_ONE
, _URE_NOOP
, _URE_NOOP
, b
);
1419 state
= _ure_make_expr(_URE_OR
, ns1
, s1
, b
);
1422 s1
= b
->expr
[state
].lhs
;
1423 ns1
= _ure_make_expr(_URE_STAR
, s1
, _URE_NOOP
, b
);
1424 state
= _ure_make_expr(_URE_AND
, s1
, ns1
, b
);
1427 s1
= b
->expr
[state
].lhs
;
1428 ns1
= _ure_make_expr(_URE_ONE
, _URE_NOOP
, _URE_NOOP
, b
);
1429 ns2
= _ure_make_expr(_URE_PLUS
, s1
, _URE_NOOP
, b
);
1430 state
= _ure_make_expr(_URE_OR
, ns1
, ns2
, b
);
1433 s1
= b
->expr
[state
].lhs
;
1434 s2
= b
->expr
[state
].rhs
;
1440 s1
= b
->expr
[state
].lhs
;
1441 s2
= b
->expr
[state
].rhs
;
1442 switch (b
->expr
[s1
].type
) {
1444 _ure_add_symstate(b
->expr
[s1
].lhs
, s2
, b
);
1452 ns1
= b
->expr
[s1
].lhs
;
1453 ns2
= _ure_make_expr(_URE_AND
, ns1
, s2
, b
);
1454 state
= _ure_make_expr(_URE_OR
, s2
, ns2
, b
);
1457 ns1
= b
->expr
[s1
].lhs
;
1458 ns2
= _ure_make_expr(_URE_OR
, s2
, state
, b
);
1459 state
= _ure_make_expr(_URE_AND
, ns1
, ns2
, b
);
1462 ns1
= b
->expr
[s1
].lhs
;
1463 ns2
= _ure_make_expr(_URE_AND
, ns1
, state
, b
);
1464 state
= _ure_make_expr(_URE_OR
, s2
, ns2
, b
);
1467 ns1
= b
->expr
[s1
].lhs
;
1468 ns2
= b
->expr
[s1
].rhs
;
1469 ns1
= _ure_make_expr(_URE_AND
, ns1
, s2
, b
);
1470 ns2
= _ure_make_expr(_URE_AND
, ns2
, s2
, b
);
1471 state
= _ure_make_expr(_URE_OR
, ns1
, ns2
, b
);
1474 ns1
= b
->expr
[s1
].lhs
;
1475 ns2
= b
->expr
[s1
].rhs
;
1476 ns2
= _ure_make_expr(_URE_AND
, ns2
, s2
, b
);
1477 state
= _ure_make_expr(_URE_AND
, ns1
, ns2
, b
);
1485 * Clear the state stack.
1487 while (_ure_pop(b
) != _URE_NOOP
) ;
1490 * Reset the state pointer because the reduction may have moved it
1491 * during a reallocation.
1493 sp
= b
->states
.states
+ i
;
1496 * Generate the DFA states for the symbols collected during the
1497 * current reduction.
1499 if (sp
->trans_used
+ syms
> sp
->trans_size
) {
1500 if (sp
->trans_size
== 0)
1501 sp
->trans
= (_ure_elt_t
*)
1502 malloc(sizeof(_ure_elt_t
) * (sp
->trans_used
+ syms
));
1504 sp
->trans
= (_ure_elt_t
*)
1505 realloc((char *) sp
->trans
,
1506 sizeof(_ure_elt_t
) * (sp
->trans_used
+ syms
));
1507 sp
->trans_size
= sp
->trans_used
+ syms
;
1511 * Go through the symbol table and generate the DFA state transitions
1512 * for each symbol that has collected NFA states.
1514 for (j
= syms
= 0, smp
= b
->symtab
; j
< b
->symtab_used
; j
++, smp
++) {
1515 sp
= b
->states
.states
+ i
;
1517 if (smp
->states
.slist_used
> 0) {
1518 sp
->trans
[syms
].lhs
= smp
->id
;
1519 rhs
= _ure_add_state(smp
->states
.slist_used
,
1520 smp
->states
.slist
, b
);
1522 * Reset the state pointer in case the reallocation moves it
1525 sp
= b
->states
.states
+ i
;
1526 sp
->trans
[syms
].rhs
= rhs
;
1528 smp
->states
.slist_used
= 0;
1534 * Set the number of transitions actually used.
1536 sp
->trans_used
= syms
;
1542 _ure_add_equiv(ucs2_t l
, ucs2_t r
, _ure_buffer_t
*b
)
1546 l
= b
->states
.states
[l
].id
;
1547 r
= b
->states
.states
[r
].id
;
1559 * Check to see if the equivalence pair already exists.
1561 for (tmp
= 0; tmp
< b
->equiv_used
&&
1562 (b
->equiv
[tmp
].l
!= l
|| b
->equiv
[tmp
].r
!= r
);
1565 if (tmp
< b
->equiv_used
)
1568 if (b
->equiv_used
== b
->equiv_size
) {
1569 if (b
->equiv_size
== 0)
1570 b
->equiv
= (_ure_equiv_t
*) malloc(sizeof(_ure_equiv_t
) << 3);
1572 b
->equiv
= (_ure_equiv_t
*) realloc((char *) b
->equiv
,
1573 sizeof(_ure_equiv_t
) *
1574 (b
->equiv_size
+ 8));
1577 b
->equiv
[b
->equiv_used
].l
= l
;
1578 b
->equiv
[b
->equiv_used
].r
= r
;
1583 * Merge the DFA states that are equivalent.
1586 _ure_merge_equiv(_ure_buffer_t
*b
)
1588 ucs2_t i
, j
, k
, eq
, done
;
1589 _ure_state_t
*sp1
, *sp2
, *ls
, *rs
;
1591 for (i
= 0; i
< b
->states
.states_used
; i
++) {
1592 sp1
= b
->states
.states
+ i
;
1595 for (j
= 0; j
< i
; j
++) {
1596 sp2
= b
->states
.states
+ j
;
1600 _ure_add_equiv(i
, j
, b
);
1601 for (eq
= 0, done
= 0; eq
< b
->equiv_used
; eq
++) {
1602 ls
= b
->states
.states
+ b
->equiv
[eq
].l
;
1603 rs
= b
->states
.states
+ b
->equiv
[eq
].r
;
1604 if (ls
->accepting
!= rs
->accepting
||
1605 ls
->trans_used
!= rs
->trans_used
) {
1609 for (k
= 0; k
< ls
->trans_used
&&
1610 ls
->trans
[k
].lhs
== rs
->trans
[k
].lhs
; k
++) ;
1611 if (k
< ls
->trans_used
) {
1616 for (k
= 0; k
< ls
->trans_used
; k
++)
1617 _ure_add_equiv(ls
->trans
[k
].rhs
, rs
->trans
[k
].rhs
, b
);
1622 for (eq
= 0; j
< i
&& eq
< b
->equiv_used
; eq
++)
1623 b
->states
.states
[b
->equiv
[eq
].r
].id
=
1624 b
->states
.states
[b
->equiv
[eq
].l
].id
;
1628 * Renumber the states appropriately.
1630 for (i
= eq
= 0, sp1
= b
->states
.states
; i
< b
->states
.states_used
;
1632 sp1
->id
= (sp1
->id
== i
) ? eq
++ : b
->states
.states
[sp1
->id
].id
;
1635 /*************************************************************************
1639 *************************************************************************/
1642 ure_buffer_create(void)
1646 b
= (ure_buffer_t
) calloc(1, sizeof(_ure_buffer_t
));
1652 ure_buffer_free(ure_buffer_t buf
)
1659 if (buf
->stack
.slist_size
> 0)
1660 free((char *) buf
->stack
.slist
);
1662 if (buf
->expr_size
> 0)
1663 free((char *) buf
->expr
);
1665 for (i
= 0; i
< buf
->symtab_size
; i
++) {
1666 if (buf
->symtab
[i
].states
.slist_size
> 0)
1667 free((char *) buf
->symtab
[i
].states
.slist
);
1670 if (buf
->symtab_size
> 0)
1671 free((char *) buf
->symtab
);
1673 for (i
= 0; i
< buf
->states
.states_size
; i
++) {
1674 if (buf
->states
.states
[i
].trans_size
> 0)
1675 free((char *) buf
->states
.states
[i
].trans
);
1676 if (buf
->states
.states
[i
].st
.slist_size
> 0)
1677 free((char *) buf
->states
.states
[i
].st
.slist
);
1680 if (buf
->states
.states_size
> 0)
1681 free((char *) buf
->states
.states
);
1683 if (buf
->equiv_size
> 0)
1684 free((char *) buf
->equiv
);
1690 ure_compile(ucs2_t
*re
, unsigned long relen
, int casefold
, ure_buffer_t buf
)
1698 if (re
== 0 || *re
== 0 || relen
== 0 || buf
== 0)
1702 * Reset the various fields of the compilation buffer. Default the flags
1703 * to indicate the presense of the "^$" pattern. If any other pattern
1704 * occurs, then this flag will be removed. This is done to catch this
1705 * special pattern and handle it specially when matching.
1707 buf
->flags
= _URE_DFA_BLANKLINE
| ((casefold
) ? _URE_DFA_CASEFOLD
: 0);
1709 buf
->stack
.slist_used
= 0;
1712 for (i
= 0; i
< buf
->symtab_used
; i
++)
1713 buf
->symtab
[i
].states
.slist_used
= 0;
1714 buf
->symtab_used
= 0;
1716 for (i
= 0; i
< buf
->states
.states_used
; i
++) {
1717 buf
->states
.states
[i
].st
.slist_used
= 0;
1718 buf
->states
.states
[i
].trans_used
= 0;
1720 buf
->states
.states_used
= 0;
1723 * Construct the NFA. If this stage returns a 0, then an error occured or
1724 * an empty expression was passed.
1726 if ((state
= _ure_re2nfa(re
, relen
, buf
)) == _URE_NOOP
)
1730 * Do the expression reduction to get the initial DFA.
1732 _ure_reduce(state
, buf
);
1735 * Merge all the equivalent DFA states.
1737 _ure_merge_equiv(buf
);
1740 * Construct the minimal DFA.
1742 dfa
= (ure_dfa_t
) malloc(sizeof(_ure_dfa_t
));
1743 (void) memset((char *) dfa
, '\0', sizeof(_ure_dfa_t
));
1745 dfa
->flags
= buf
->flags
& (_URE_DFA_CASEFOLD
|_URE_DFA_BLANKLINE
);
1748 * Free up the NFA state groups and transfer the symbols from the buffer
1751 for (i
= 0; i
< buf
->symtab_size
; i
++) {
1752 if (buf
->symtab
[i
].states
.slist_size
> 0)
1753 free((char *) buf
->symtab
[i
].states
.slist
);
1755 dfa
->syms
= buf
->symtab
;
1756 dfa
->nsyms
= buf
->symtab_used
;
1758 buf
->symtab_used
= buf
->symtab_size
= 0;
1761 * Collect the total number of states and transitions needed for the DFA.
1763 for (i
= state
= 0, sp
= buf
->states
.states
; i
< buf
->states
.states_used
;
1765 if (sp
->id
== state
) {
1767 dfa
->ntrans
+= sp
->trans_used
;
1773 * Allocate enough space for the states and transitions.
1775 dfa
->states
= (_ure_dstate_t
*) malloc(sizeof(_ure_dstate_t
) *
1777 dfa
->trans
= (_ure_trans_t
*) malloc(sizeof(_ure_trans_t
) * dfa
->ntrans
);
1780 * Actually transfer the DFA states from the buffer.
1784 for (i
= state
= 0, sp
= buf
->states
.states
; i
< buf
->states
.states_used
;
1786 if (sp
->id
== state
) {
1788 dsp
->ntrans
= sp
->trans_used
;
1789 dsp
->accepting
= sp
->accepting
;
1792 * Add the transitions for the state.
1794 for (j
= 0; j
< dsp
->ntrans
; j
++, tp
++) {
1795 tp
->symbol
= sp
->trans
[j
].lhs
;
1796 tp
->next_state
= buf
->states
.states
[sp
->trans
[j
].rhs
].id
;
1808 ure_dfa_free(ure_dfa_t dfa
)
1815 for (i
= 0; i
< dfa
->nsyms
; i
++) {
1816 if ((dfa
->syms
[i
].type
== _URE_CCLASS
||
1817 dfa
->syms
[i
].type
== _URE_NCCLASS
) &&
1818 dfa
->syms
[i
].sym
.ccl
.ranges_size
> 0)
1819 free((char *) dfa
->syms
[i
].sym
.ccl
.ranges
);
1822 free((char *) dfa
->syms
);
1824 if (dfa
->nstates
> 0)
1825 free((char *) dfa
->states
);
1826 if (dfa
->ntrans
> 0)
1827 free((char *) dfa
->trans
);
1832 ure_write_dfa(ure_dfa_t dfa
, FILE *out
)
1834 ucs2_t i
, j
, k
, h
, l
;
1839 if (dfa
== 0 || out
== 0)
1843 * Write all the different character classes.
1845 for (i
= 0, sym
= dfa
->syms
; i
< dfa
->nsyms
; i
++, sym
++) {
1846 if (sym
->type
== _URE_CCLASS
|| sym
->type
== _URE_NCCLASS
) {
1847 fprintf(out
, "C%hd = ", sym
->id
);
1848 if (sym
->sym
.ccl
.ranges_used
> 0) {
1850 if (sym
->type
== _URE_NCCLASS
)
1853 if (sym
->props
!= 0) {
1854 if (sym
->type
== _URE_NCCLASS
)
1855 fprintf(out
, "\\P");
1857 fprintf(out
, "\\p");
1858 for (k
= h
= 0; k
< 32; k
++) {
1859 if (sym
->props
& (1 << k
)) {
1862 fprintf(out
, "%hd", k
+ 1);
1870 for (k
= 0, rp
= sym
->sym
.ccl
.ranges
;
1871 k
< sym
->sym
.ccl
.ranges_used
; k
++, rp
++) {
1873 * Check for UTF16 characters.
1875 if (0x10000 <= rp
->min_code
&&
1876 rp
->min_code
<= 0x10ffff) {
1877 h
= (ucs2_t
) (((rp
->min_code
- 0x10000) >> 10) + 0xd800);
1878 l
= (ucs2_t
) (((rp
->min_code
- 0x10000) & 1023) + 0xdc00);
1879 fprintf(out
, "\\x%04hX\\x%04hX", h
, l
);
1881 fprintf(out
, "\\x%04lX", rp
->min_code
& 0xffff);
1882 if (rp
->max_code
!= rp
->min_code
) {
1884 if (rp
->max_code
>= 0x10000 &&
1885 rp
->max_code
<= 0x10ffff) {
1886 h
= (ucs2_t
) (((rp
->max_code
- 0x10000) >> 10) + 0xd800);
1887 l
= (ucs2_t
) (((rp
->max_code
- 0x10000) & 1023) + 0xdc00);
1888 fprintf(out
, "\\x%04hX\\x%04hX", h
, l
);
1890 fprintf(out
, "\\x%04lX", rp
->max_code
& 0xffff);
1893 if (sym
->sym
.ccl
.ranges_used
> 0)
1899 for (i
= 0, sp
= dfa
->states
; i
< dfa
->nstates
; i
++, sp
++) {
1900 fprintf(out
, "S%hd = ", i
);
1901 if (sp
->accepting
) {
1906 for (j
= 0; j
< sp
->ntrans
; j
++) {
1910 sym
= dfa
->syms
+ sp
->trans
[j
].symbol
;
1911 switch (sym
->type
) {
1913 if (0x10000 <= sym
->sym
.chr
&& sym
->sym
.chr
<= 0x10ffff) {
1915 * Take care of UTF16 characters.
1917 h
= (ucs2_t
) (((sym
->sym
.chr
- 0x10000) >> 10) + 0xd800);
1918 l
= (ucs2_t
) (((sym
->sym
.chr
- 0x10000) & 1023) + 0xdc00);
1919 fprintf(out
, "\\x%04hX\\x%04hX ", h
, l
);
1921 fprintf(out
, "\\x%04lX ", sym
->sym
.chr
& 0xffff);
1924 fprintf(out
, "<any> ");
1926 case _URE_BOL_ANCHOR
:
1927 fprintf(out
, "<bol-anchor> ");
1929 case _URE_EOL_ANCHOR
:
1930 fprintf(out
, "<eol-anchor> ");
1934 fprintf(out
, "[C%hd] ", sym
->id
);
1937 fprintf(out
, "S%hd", sp
->trans
[j
].next_state
);
1938 if (j
+ 1 < sp
->ntrans
)
1945 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1949 ure_exec(ure_dfa_t dfa
, int flags
, ucs2_t
*text
, unsigned long textlen
,
1950 unsigned long *match_start
, unsigned long *match_end
)
1952 int i
, j
, matched
, found
, skip
;
1953 unsigned long ms
, me
;
1955 ucs2_t
*sp
, *ep
, *lp
;
1960 if (dfa
== 0 || text
== 0)
1964 * Handle the special case of an empty string matching the "^$" pattern.
1966 if (textlen
== 0 && (dfa
->flags
& _URE_DFA_BLANKLINE
)) {
1967 *match_start
= *match_end
= 0;
1978 for (found
= skip
= 0; found
== 0 && sp
< ep
; ) {
1983 * Check to see if this is a high surrogate that should be
1984 * combined with a following low surrogate.
1986 if (sp
< ep
&& 0xd800 <= c
&& c
<= 0xdbff &&
1987 0xdc00 <= *sp
&& *sp
<= 0xdfff)
1988 c
= 0x10000 + (((c
& 0x03ff) << 10) | (*sp
++ & 0x03ff));
1991 * Determine if the character is non-spacing and should be skipped.
1993 if (_ure_matches_properties(_URE_NONSPACING
, c
) &&
1994 (flags
& URE_IGNORE_NONSPACING
)) {
1999 if (dfa
->flags
& _URE_DFA_CASEFOLD
)
2000 c
= _ure_tolower(c
);
2003 * See if one of the transitions matches.
2005 for (i
= 0, matched
= 0; matched
== 0 && i
< stp
->ntrans
; i
++) {
2006 sym
= dfa
->syms
+ stp
->trans
[i
].symbol
;
2007 switch (sym
->type
) {
2009 if ((flags
& URE_DOT_MATCHES_SEPARATORS
) ||
2014 if (c
== sym
->sym
.chr
)
2017 case _URE_BOL_ANCHOR
:
2021 } else if (_ure_issep(c
)) {
2022 if (c
== '\r' && sp
< ep
&& *sp
== '\n')
2028 case _URE_EOL_ANCHOR
:
2029 if (_ure_issep(c
)) {
2031 * Put the pointer back before the separator so the match
2032 * end position will be correct. This case will also
2033 * cause the `sp' pointer to be advanced over the current
2034 * separator once the match end point has been recorded.
2042 if (sym
->props
!= 0)
2043 matched
= _ure_matches_properties(sym
->props
, c
);
2044 for (j
= 0, rp
= sym
->sym
.ccl
.ranges
;
2045 j
< sym
->sym
.ccl
.ranges_used
; j
++, rp
++) {
2046 if (rp
->min_code
<= c
&& c
<= rp
->max_code
)
2049 if (sym
->type
== _URE_NCCLASS
)
2059 stp
= dfa
->states
+ stp
->trans
[i
].next_state
;
2062 * If the match was an EOL anchor, adjust the pointer past the
2063 * separator that caused the match. The correct match
2064 * position has been recorded already.
2066 if (sym
->type
== _URE_EOL_ANCHOR
) {
2068 * Skip the character that caused the match.
2073 * Handle the infamous CRLF situation.
2075 if (sp
< ep
&& c
== '\r' && *sp
== '\n')
2082 if (stp
->accepting
== 0) {
2084 * If the last state was not accepting, then reset
2091 * The last state was accepting, so terminate the matching
2092 * loop to avoid more work.
2095 } else if (sp
== ep
) {
2096 if (!stp
->accepting
) {
2098 * This ugly hack is to make sure the end-of-line anchors
2099 * match when the source text hits the end. This is only done
2100 * if the last subexpression matches.
2102 for (i
= 0; found
== 0 && i
< stp
->ntrans
; i
++) {
2103 sym
= dfa
->syms
+ stp
->trans
[i
].symbol
;
2104 if (sym
->type
==_URE_EOL_ANCHOR
) {
2105 stp
= dfa
->states
+ stp
->trans
[i
].next_state
;
2106 if (stp
->accepting
) {
2115 * Make sure any conditions that match all the way to the end
2116 * of the string match.
2130 return (ms
!= ~0UL) ? 1 : 0;