Sync usage with man page.
[netbsd-mini2440.git] / external / bsd / openldap / dist / libraries / liblunicode / ure / ure.c
blobe54ae54ebc8a328586ab0a0a142bc5eb61c9b8ea
1 /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.17.2.3 2008/02/11 23:26:42 kurt Exp $ */
2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4 * Copyright 1998-2008 The OpenLDAP Foundation.
5 * All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
9 * Public License.
11 * A copy of this license is available in file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
15 /* Copyright 1997, 1998, 1999 Computing Research Labs,
16 * New Mexico State University
18 * Permission is hereby granted, free of charge, to any person obtaining a
19 * copy of this software and associated documentation files (the "Software"),
20 * to deal in the Software without restriction, including without limitation
21 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
22 * and/or sell copies of the Software, and to permit persons to whom the
23 * Software is furnished to do so, subject to the following conditions:
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
31 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
32 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
33 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
34 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36 /* $Id: ure.c,v 1.1.1.1 2008/02/11 23:26:42 lukem Exp $" */
38 #include "portable.h"
40 #include <ac/stdlib.h>
41 #include <ac/string.h>
42 #include <ac/unistd.h>
44 #include "ure.h"
47 * Flags used internally in the DFA.
49 #define _URE_DFA_CASEFOLD 0x01
50 #define _URE_DFA_BLANKLINE 0x02
52 static unsigned long cclass_flags[] = {
54 _URE_NONSPACING,
55 _URE_COMBINING,
56 _URE_NUMDIGIT,
57 _URE_NUMOTHER,
58 _URE_SPACESEP,
59 _URE_LINESEP,
60 _URE_PARASEP,
61 _URE_CNTRL,
62 _URE_PUA,
63 _URE_UPPER,
64 _URE_LOWER,
65 _URE_TITLE,
66 _URE_MODIFIER,
67 _URE_OTHERLETTER,
68 _URE_DASHPUNCT,
69 _URE_OPENPUNCT,
70 _URE_CLOSEPUNCT,
71 _URE_OTHERPUNCT,
72 _URE_MATHSYM,
73 _URE_CURRENCYSYM,
74 _URE_OTHERSYM,
75 _URE_LTR,
76 _URE_RTL,
77 _URE_EURONUM,
78 _URE_EURONUMSEP,
79 _URE_EURONUMTERM,
80 _URE_ARABNUM,
81 _URE_COMMONSEP,
82 _URE_BLOCKSEP,
83 _URE_SEGMENTSEP,
84 _URE_WHITESPACE,
85 _URE_OTHERNEUT,
89 * Symbol types for the DFA.
91 #define _URE_ANY_CHAR 1
92 #define _URE_CHAR 2
93 #define _URE_CCLASS 3
94 #define _URE_NCCLASS 4
95 #define _URE_BOL_ANCHOR 5
96 #define _URE_EOL_ANCHOR 6
99 * Op codes for converting the NFA to a DFA.
101 #define _URE_SYMBOL 10
102 #define _URE_PAREN 11
103 #define _URE_QUEST 12
104 #define _URE_STAR 13
105 #define _URE_PLUS 14
106 #define _URE_ONE 15
107 #define _URE_AND 16
108 #define _URE_OR 17
110 #define _URE_NOOP 0xffff
112 #define _URE_REGSTART 0x8000
113 #define _URE_REGEND 0x4000
116 * Structure used to handle a compacted range of characters.
118 typedef struct {
119 ucs4_t min_code;
120 ucs4_t max_code;
121 } _ure_range_t;
123 typedef struct {
124 _ure_range_t *ranges;
125 ucs2_t ranges_used;
126 ucs2_t ranges_size;
127 } _ure_ccl_t;
129 typedef union {
130 ucs4_t chr;
131 _ure_ccl_t ccl;
132 } _ure_sym_t;
135 * This is a general element structure used for expressions and stack
136 * elements.
138 typedef struct {
139 ucs2_t reg;
140 ucs2_t onstack;
141 ucs2_t type;
142 ucs2_t lhs;
143 ucs2_t rhs;
144 } _ure_elt_t;
147 * This is a structure used to track a list or a stack of states.
149 typedef struct {
150 ucs2_t *slist;
151 ucs2_t slist_size;
152 ucs2_t slist_used;
153 } _ure_stlist_t;
156 * Structure to track the list of unique states for a symbol
157 * during reduction.
159 typedef struct {
160 ucs2_t id;
161 ucs2_t type;
162 unsigned long mods;
163 unsigned long props;
164 _ure_sym_t sym;
165 _ure_stlist_t states;
166 } _ure_symtab_t;
169 * Structure to hold a single state.
171 typedef struct {
172 ucs2_t id;
173 ucs2_t accepting;
174 ucs2_t pad;
175 _ure_stlist_t st;
176 _ure_elt_t *trans;
177 ucs2_t trans_size;
178 ucs2_t trans_used;
179 } _ure_state_t;
182 * Structure used for keeping lists of states.
184 typedef struct {
185 _ure_state_t *states;
186 ucs2_t states_size;
187 ucs2_t states_used;
188 } _ure_statetable_t;
191 * Structure to track pairs of DFA states when equivalent states are
192 * merged.
194 typedef struct {
195 ucs2_t l;
196 ucs2_t r;
197 } _ure_equiv_t;
200 * Structure used for constructing the NFA and reducing to a minimal DFA.
202 typedef struct _ure_buffer_t {
203 int reducing;
204 int error;
205 unsigned long flags;
207 _ure_stlist_t stack;
210 * Table of unique symbols encountered.
212 _ure_symtab_t *symtab;
213 ucs2_t symtab_size;
214 ucs2_t symtab_used;
217 * Tracks the unique expressions generated for the NFA and when the NFA is
218 * reduced.
220 _ure_elt_t *expr;
221 ucs2_t expr_used;
222 ucs2_t expr_size;
225 * The reduced table of unique groups of NFA states.
227 _ure_statetable_t states;
230 * Tracks states when equivalent states are merged.
232 _ure_equiv_t *equiv;
233 ucs2_t equiv_used;
234 ucs2_t equiv_size;
235 } _ure_buffer_t;
237 typedef struct {
238 ucs2_t symbol;
239 ucs2_t next_state;
240 } _ure_trans_t;
242 typedef struct {
243 ucs2_t accepting;
244 ucs2_t ntrans;
245 _ure_trans_t *trans;
246 } _ure_dstate_t;
248 typedef struct _ure_dfa_t {
249 unsigned long flags;
251 _ure_symtab_t *syms;
252 ucs2_t nsyms;
254 _ure_dstate_t *states;
255 ucs2_t nstates;
257 _ure_trans_t *trans;
258 ucs2_t ntrans;
259 } _ure_dfa_t;
261 /*************************************************************************
263 * Functions.
265 *************************************************************************/
267 static void
268 _ure_memmove(char *dest, char *src, unsigned long bytes)
270 long i, j;
272 i = (long) bytes;
273 j = i & 7;
274 i = (i + 7) >> 3;
277 * Do a memmove using Ye Olde Duff's Device for efficiency.
279 if (src < dest) {
280 src += bytes;
281 dest += bytes;
283 switch (j) {
284 case 0: do {
285 *--dest = *--src;
286 case 7: *--dest = *--src;
287 case 6: *--dest = *--src;
288 case 5: *--dest = *--src;
289 case 4: *--dest = *--src;
290 case 3: *--dest = *--src;
291 case 2: *--dest = *--src;
292 case 1: *--dest = *--src;
293 } while (--i > 0);
295 } else if (src > dest) {
296 switch (j) {
297 case 0: do {
298 *dest++ = *src++;
299 case 7: *dest++ = *src++;
300 case 6: *dest++ = *src++;
301 case 5: *dest++ = *src++;
302 case 4: *dest++ = *src++;
303 case 3: *dest++ = *src++;
304 case 2: *dest++ = *src++;
305 case 1: *dest++ = *src++;
306 } while (--i > 0);
311 static void
312 _ure_push(ucs2_t v, _ure_buffer_t *b)
314 _ure_stlist_t *s;
316 if (b == 0)
317 return;
320 * If the `reducing' parameter is non-zero, check to see if the value
321 * passed is already on the stack.
323 if (b->reducing != 0 && b->expr[v].onstack != 0)
324 return;
326 s = &b->stack;
327 if (s->slist_used == s->slist_size) {
328 if (s->slist_size == 0)
329 s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
330 else
331 s->slist = (ucs2_t *) realloc((char *) s->slist,
332 sizeof(ucs2_t) * (s->slist_size + 8));
333 s->slist_size += 8;
335 s->slist[s->slist_used++] = v;
338 * If the `reducing' parameter is non-zero, flag the element as being on
339 * the stack.
341 if (b->reducing != 0)
342 b->expr[v].onstack = 1;
345 static ucs2_t
346 _ure_peek(_ure_buffer_t *b)
348 if (b == 0 || b->stack.slist_used == 0)
349 return _URE_NOOP;
351 return b->stack.slist[b->stack.slist_used - 1];
354 static ucs2_t
355 _ure_pop(_ure_buffer_t *b)
357 ucs2_t v;
359 if (b == 0 || b->stack.slist_used == 0)
360 return _URE_NOOP;
362 v = b->stack.slist[--b->stack.slist_used];
363 if (b->reducing)
364 b->expr[v].onstack = 0;
366 return v;
369 /*************************************************************************
371 * Start symbol parse functions.
373 *************************************************************************/
376 * Parse a comma-separated list of integers that represent character
377 * properties. Combine them into a mask that is returned in the `mask'
378 * variable, and return the number of characters consumed.
380 static unsigned long
381 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
382 _ure_buffer_t *b)
384 unsigned long n, m;
385 ucs2_t *sp, *ep;
387 sp = pp;
388 ep = sp + limit;
390 for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
391 if (*sp == ',') {
393 * Encountered a comma, so select the next character property flag
394 * and reset the number.
396 m |= cclass_flags[n];
397 n = 0;
398 } else if (*sp >= '0' && *sp <= '9')
400 * Encountered a digit, so start or continue building the cardinal
401 * that represents the character property flag.
403 n = (n * 10) + (*sp - '0');
404 else
406 * Encountered something that is not part of the property list.
407 * Indicate that we are done.
409 break;
412 * If a property number greater than 32 occurs, then there is a
413 * problem. Most likely a missing comma separator.
415 if (n > 32)
416 b->error = _URE_INVALID_PROPERTY;
419 if (n != 0)
420 m |= cclass_flags[n];
423 * Set the mask that represents the group of character properties.
425 *mask = m;
428 * Return the number of characters consumed.
430 return sp - pp;
434 * Collect a hex number with 1 to 4 digits and return the number
435 * of characters used.
437 static unsigned long
438 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
440 ucs2_t i;
441 ucs2_t *sp, *ep;
442 ucs4_t nn;
444 sp = np;
445 ep = sp + limit;
447 for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
448 if (*sp >= '0' && *sp <= '9')
449 nn = (nn << 4) + (*sp - '0');
450 else if (*sp >= 'A' && *sp <= 'F')
451 nn = (nn << 4) + ((*sp - 'A') + 10);
452 else if (*sp >= 'a' && *sp <= 'f')
453 nn = (nn << 4) + ((*sp - 'a') + 10);
454 else
456 * Encountered something that is not a hex digit.
458 break;
462 * Assign the character code collected and return the number of
463 * characters used.
465 *n = nn;
467 return sp - np;
471 * Insert a range into a character class, removing duplicates and ordering
472 * them in increasing range-start order.
474 static void
475 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
477 ucs2_t i;
478 ucs4_t tmp;
479 _ure_range_t *rp;
482 * If the `casefold' flag is set, then make sure both endpoints of the
483 * range are converted to lower case.
485 if (b->flags & _URE_DFA_CASEFOLD) {
486 r->min_code = _ure_tolower(r->min_code);
487 r->max_code = _ure_tolower(r->max_code);
491 * Swap the range endpoints if they are not in increasing order.
493 if (r->min_code > r->max_code) {
494 tmp = r->min_code;
495 r->min_code = r->max_code;
496 r->max_code = tmp;
499 for (i = 0, rp = ccl->ranges;
500 i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
503 * Check for a duplicate.
505 if (i < ccl->ranges_used &&
506 r->min_code == rp->min_code && r->max_code == rp->max_code)
507 return;
509 if (ccl->ranges_used == ccl->ranges_size) {
510 if (ccl->ranges_size == 0)
511 ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
512 else
513 ccl->ranges = (_ure_range_t *)
514 realloc((char *) ccl->ranges,
515 sizeof(_ure_range_t) * (ccl->ranges_size + 8));
516 ccl->ranges_size += 8;
519 rp = ccl->ranges + ccl->ranges_used;
521 if (i < ccl->ranges_used)
522 _ure_memmove((char *) (rp + 1), (char *) rp,
523 sizeof(_ure_range_t) * (ccl->ranges_used - i));
525 ccl->ranges_used++;
526 rp->min_code = r->min_code;
527 rp->max_code = r->max_code;
530 #define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
531 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
532 #define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)
533 #define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
534 _URE_OTHERPUNCT)
535 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
536 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
537 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
538 #define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
540 typedef void (*_ure_cclsetup_t)(
541 _ure_symtab_t *sym,
542 unsigned long mask,
543 _ure_buffer_t *b
546 typedef struct {
547 ucs2_t key;
548 unsigned long len;
549 unsigned long next;
550 _ure_cclsetup_t func;
551 unsigned long mask;
552 } _ure_trie_t;
554 static void
555 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
557 sym->props |= mask;
560 static void
561 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
563 _ure_range_t range;
565 sym->props |= mask;
568 * Add the additional characters needed for handling isspace().
570 range.min_code = range.max_code = '\t';
571 _ure_add_range(&sym->sym.ccl, &range, b);
572 range.min_code = range.max_code = '\r';
573 _ure_add_range(&sym->sym.ccl, &range, b);
574 range.min_code = range.max_code = '\n';
575 _ure_add_range(&sym->sym.ccl, &range, b);
576 range.min_code = range.max_code = '\f';
577 _ure_add_range(&sym->sym.ccl, &range, b);
578 range.min_code = range.max_code = 0xfeff;
579 _ure_add_range(&sym->sym.ccl, &range, b);
582 static void
583 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
585 _ure_range_t range;
588 * Add the additional characters needed for handling isxdigit().
590 range.min_code = '0';
591 range.max_code = '9';
592 _ure_add_range(&sym->sym.ccl, &range, b);
593 range.min_code = 'A';
594 range.max_code = 'F';
595 _ure_add_range(&sym->sym.ccl, &range, b);
596 range.min_code = 'a';
597 range.max_code = 'f';
598 _ure_add_range(&sym->sym.ccl, &range, b);
601 static _ure_trie_t cclass_trie[] = {
602 {0x003a, 1, 1, 0, 0},
603 {0x0061, 9, 10, 0, 0},
604 {0x0063, 8, 19, 0, 0},
605 {0x0064, 7, 24, 0, 0},
606 {0x0067, 6, 29, 0, 0},
607 {0x006c, 5, 34, 0, 0},
608 {0x0070, 4, 39, 0, 0},
609 {0x0073, 3, 49, 0, 0},
610 {0x0075, 2, 54, 0, 0},
611 {0x0078, 1, 59, 0, 0},
612 {0x006c, 1, 11, 0, 0},
613 {0x006e, 2, 13, 0, 0},
614 {0x0070, 1, 16, 0, 0},
615 {0x0075, 1, 14, 0, 0},
616 {0x006d, 1, 15, 0, 0},
617 {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
618 {0x0068, 1, 17, 0, 0},
619 {0x0061, 1, 18, 0, 0},
620 {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
621 {0x006e, 1, 20, 0, 0},
622 {0x0074, 1, 21, 0, 0},
623 {0x0072, 1, 22, 0, 0},
624 {0x006c, 1, 23, 0, 0},
625 {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
626 {0x0069, 1, 25, 0, 0},
627 {0x0067, 1, 26, 0, 0},
628 {0x0069, 1, 27, 0, 0},
629 {0x0074, 1, 28, 0, 0},
630 {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
631 {0x0072, 1, 30, 0, 0},
632 {0x0061, 1, 31, 0, 0},
633 {0x0070, 1, 32, 0, 0},
634 {0x0068, 1, 33, 0, 0},
635 {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
636 {0x006f, 1, 35, 0, 0},
637 {0x0077, 1, 36, 0, 0},
638 {0x0065, 1, 37, 0, 0},
639 {0x0072, 1, 38, 0, 0},
640 {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
641 {0x0072, 2, 41, 0, 0},
642 {0x0075, 1, 45, 0, 0},
643 {0x0069, 1, 42, 0, 0},
644 {0x006e, 1, 43, 0, 0},
645 {0x0074, 1, 44, 0, 0},
646 {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
647 {0x006e, 1, 46, 0, 0},
648 {0x0063, 1, 47, 0, 0},
649 {0x0074, 1, 48, 0, 0},
650 {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
651 {0x0070, 1, 50, 0, 0},
652 {0x0061, 1, 51, 0, 0},
653 {0x0063, 1, 52, 0, 0},
654 {0x0065, 1, 53, 0, 0},
655 {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
656 {0x0070, 1, 55, 0, 0},
657 {0x0070, 1, 56, 0, 0},
658 {0x0065, 1, 57, 0, 0},
659 {0x0072, 1, 58, 0, 0},
660 {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
661 {0x0064, 1, 60, 0, 0},
662 {0x0069, 1, 61, 0, 0},
663 {0x0067, 1, 62, 0, 0},
664 {0x0069, 1, 63, 0, 0},
665 {0x0074, 1, 64, 0, 0},
666 {0x003a, 1, 65, _ure_xdigit_setup, 0},
670 * Probe for one of the POSIX colon delimited character classes in the static
671 * trie.
673 static unsigned long
674 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
675 _ure_buffer_t *b)
677 int i;
678 unsigned long n;
679 _ure_trie_t *tp;
680 ucs2_t *sp, *ep;
683 * If the number of characters left is less than 7, then this cannot be
684 * interpreted as one of the colon delimited classes.
686 if (limit < 7)
687 return 0;
689 sp = cp;
690 ep = sp + limit;
691 tp = cclass_trie;
692 for (i = 0; sp < ep && i < 8; i++, sp++) {
693 n = tp->len;
695 for (; n > 0 && tp->key != *sp; tp++, n--) ;
697 if (n == 0)
698 return 0;
700 if (*sp == ':' && (i == 6 || i == 7)) {
701 sp++;
702 break;
704 if (sp + 1 < ep)
705 tp = cclass_trie + tp->next;
707 if (tp->func == 0)
708 return 0;
710 (*tp->func)(sym, tp->mask, b);
712 return sp - cp;
716 * Construct a list of ranges and return the number of characters consumed.
718 static unsigned long
719 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
720 _ure_buffer_t *b)
722 int range_end;
723 unsigned long n;
724 ucs2_t *sp, *ep;
725 ucs4_t c, last;
726 _ure_ccl_t *cclp;
727 _ure_range_t range;
729 sp = cp;
730 ep = sp + limit;
732 if (*sp == '^') {
733 symp->type = _URE_NCCLASS;
734 sp++;
735 } else
736 symp->type = _URE_CCLASS;
738 for (last = 0, range_end = 0;
739 b->error == _URE_OK && sp < ep && *sp != ']'; ) {
740 c = *sp++;
741 if (c == '\\') {
742 if (sp == ep) {
744 * The EOS was encountered when expecting the reverse solidus
745 * to be followed by the character it is escaping. Set an
746 * error code and return the number of characters consumed up
747 * to this point.
749 b->error = _URE_UNEXPECTED_EOS;
750 return sp - cp;
753 c = *sp++;
754 switch (c) {
755 case 'a':
756 c = 0x07;
757 break;
758 case 'b':
759 c = 0x08;
760 break;
761 case 'f':
762 c = 0x0c;
763 break;
764 case 'n':
765 c = 0x0a;
766 break;
767 case 'r':
768 c = 0x0d;
769 break;
770 case 't':
771 c = 0x09;
772 break;
773 case 'v':
774 c = 0x0b;
775 break;
776 case 'p':
777 case 'P':
778 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
780 * Invert the bit mask of the properties if this is a negated
781 * character class or if 'P' is used to specify a list of
782 * character properties that should *not* match in a
783 * character class.
785 if (c == 'P')
786 symp->props = ~symp->props;
787 continue;
788 break;
789 case 'x':
790 case 'X':
791 case 'u':
792 case 'U':
793 if (sp < ep &&
794 ((*sp >= '0' && *sp <= '9') ||
795 (*sp >= 'A' && *sp <= 'F') ||
796 (*sp >= 'a' && *sp <= 'f')))
797 sp += _ure_hex(sp, ep - sp, &c);
799 } else if (c == ':') {
801 * Probe for a POSIX colon delimited character class.
803 sp--;
804 if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
805 sp++;
806 else {
807 sp += n;
808 continue;
812 cclp = &symp->sym.ccl;
815 * Check to see if the current character is a low surrogate that needs
816 * to be combined with a preceding high surrogate.
818 if (last != 0) {
819 if (c >= 0xdc00 && c <= 0xdfff)
821 * Construct the UTF16 character code.
823 c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
824 else {
826 * Add the isolated high surrogate to the range.
828 if (range_end == 1)
829 range.max_code = last & 0xffff;
830 else
831 range.min_code = range.max_code = last & 0xffff;
833 _ure_add_range(cclp, &range, b);
834 range_end = 0;
839 * Clear the last character code.
841 last = 0;
844 * This slightly awkward code handles the different cases needed to
845 * construct a range.
847 if (c >= 0xd800 && c <= 0xdbff) {
849 * If the high surrogate is followed by a range indicator, simply
850 * add it as the range start. Otherwise, save it in case the next
851 * character is a low surrogate.
853 if (*sp == '-') {
854 sp++;
855 range.min_code = c;
856 range_end = 1;
857 } else
858 last = c;
859 } else if (range_end == 1) {
860 range.max_code = c;
861 _ure_add_range(cclp, &range, b);
862 range_end = 0;
863 } else {
864 range.min_code = range.max_code = c;
865 if (*sp == '-') {
866 sp++;
867 range_end = 1;
868 } else
869 _ure_add_range(cclp, &range, b);
873 if (sp < ep && *sp == ']')
874 sp++;
875 else
877 * The parse was not terminated by the character class close symbol
878 * (']'), so set an error code.
880 b->error = _URE_CCLASS_OPEN;
882 return sp - cp;
886 * Probe for a low surrogate hex code.
888 static unsigned long
889 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
891 ucs4_t i, code;
892 ucs2_t *sp, *ep;
894 for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
895 if (*sp >= '0' && *sp <= '9')
896 code = (code << 4) + (*sp - '0');
897 else if (*sp >= 'A' && *sp <= 'F')
898 code = (code << 4) + ((*sp - 'A') + 10);
899 else if (*sp >= 'a' && *sp <= 'f')
900 code = (code << 4) + ((*sp - 'a') + 10);
901 else
902 break;
905 *c = code;
906 return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
909 static unsigned long
910 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
911 _ure_buffer_t *b)
913 ucs4_t c;
914 ucs2_t *sp, *ep;
916 sp = sym;
917 ep = sym + limit;
919 if ((c = *sp++) == '\\') {
921 if (sp == ep) {
923 * The EOS was encountered when expecting the reverse solidus to
924 * be followed by the character it is escaping. Set an error code
925 * and return the number of characters consumed up to this point.
927 b->error = _URE_UNEXPECTED_EOS;
928 return sp - sym;
931 c = *sp++;
932 switch (c) {
933 case 'p':
934 case 'P':
935 symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
936 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
937 break;
938 case 'a':
939 symp->type = _URE_CHAR;
940 symp->sym.chr = 0x07;
941 break;
942 case 'b':
943 symp->type = _URE_CHAR;
944 symp->sym.chr = 0x08;
945 break;
946 case 'f':
947 symp->type = _URE_CHAR;
948 symp->sym.chr = 0x0c;
949 break;
950 case 'n':
951 symp->type = _URE_CHAR;
952 symp->sym.chr = 0x0a;
953 break;
954 case 'r':
955 symp->type = _URE_CHAR;
956 symp->sym.chr = 0x0d;
957 break;
958 case 't':
959 symp->type = _URE_CHAR;
960 symp->sym.chr = 0x09;
961 break;
962 case 'v':
963 symp->type = _URE_CHAR;
964 symp->sym.chr = 0x0b;
965 break;
966 case 'x':
967 case 'X':
968 case 'u':
969 case 'U':
971 * Collect between 1 and 4 digits representing a UCS2 code. Fall
972 * through to the next case.
974 if (sp < ep &&
975 ((*sp >= '0' && *sp <= '9') ||
976 (*sp >= 'A' && *sp <= 'F') ||
977 (*sp >= 'a' && *sp <= 'f')))
978 sp += _ure_hex(sp, ep - sp, &c);
979 /* FALLTHROUGH */
980 default:
982 * Simply add an escaped character here.
984 symp->type = _URE_CHAR;
985 symp->sym.chr = c;
987 } else if (c == '^' || c == '$')
989 * Handle the BOL and EOL anchors. This actually consists simply of
990 * setting a flag that indicates that the user supplied anchor match
991 * function should be called. This needs to be done instead of simply
992 * matching line/paragraph separators because beginning-of-text and
993 * end-of-text tests are needed as well.
995 symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
996 else if (c == '[')
998 * Construct a character class.
1000 sp += _ure_cclass(sp, ep - sp, symp, b);
1001 else if (c == '.')
1002 symp->type = _URE_ANY_CHAR;
1003 else {
1004 symp->type = _URE_CHAR;
1005 symp->sym.chr = c;
1009 * If the symbol type happens to be a character and is a high surrogate,
1010 * then probe forward to see if it is followed by a low surrogate that
1011 * needs to be added.
1013 if (sp < ep && symp->type == _URE_CHAR &&
1014 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1016 if (0xdc00 <= *sp && *sp <= 0xdfff) {
1017 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1018 (*sp & 0x03ff));
1019 sp++;
1020 } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1021 *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1022 sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1023 if (0xdc00 <= c && c <= 0xdfff) {
1025 * Take into account the \[xu] in front of the hex code.
1027 sp += 2;
1028 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1029 (c & 0x03ff));
1035 * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1036 * the `casefold' flag is set.
1038 if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1039 symp->sym.chr = _ure_tolower(symp->sym.chr);
1042 * If the symbol constructed is anything other than one of the anchors,
1043 * make sure the _URE_DFA_BLANKLINE flag is removed.
1045 if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1046 b->flags &= ~_URE_DFA_BLANKLINE;
1049 * Return the number of characters consumed.
1051 return sp - sym;
1054 static int
1055 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1057 if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1058 return 1;
1060 if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1061 if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1062 return 1;
1063 if (a->sym.ccl.ranges_used > 0 &&
1064 memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1065 sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1066 return 1;
1067 } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1068 return 1;
1069 return 0;
1073 * Construct a symbol, but only keep unique symbols.
1075 static ucs2_t
1076 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1077 _ure_buffer_t *b)
1079 ucs2_t i;
1080 _ure_symtab_t *sp, symbol;
1083 * Build the next symbol so we can test to see if it is already in the
1084 * symbol table.
1086 (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1087 *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1090 * Check to see if the symbol exists.
1092 for (i = 0, sp = b->symtab;
1093 i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1095 if (i < b->symtab_used) {
1097 * Free up any ranges used for the symbol.
1099 if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1100 symbol.sym.ccl.ranges_size > 0)
1101 free((char *) symbol.sym.ccl.ranges);
1103 return b->symtab[i].id;
1107 * Need to add the new symbol.
1109 if (b->symtab_used == b->symtab_size) {
1110 if (b->symtab_size == 0)
1111 b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1112 else
1113 b->symtab = (_ure_symtab_t *)
1114 realloc((char *) b->symtab,
1115 sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1116 sp = b->symtab + b->symtab_size;
1117 (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1118 b->symtab_size += 8;
1121 symbol.id = b->symtab_used++;
1122 (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1123 sizeof(_ure_symtab_t));
1125 return symbol.id;
1128 /*************************************************************************
1130 * End symbol parse functions.
1132 *************************************************************************/
1134 static ucs2_t
1135 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1137 ucs2_t i;
1139 if (b == 0)
1140 return _URE_NOOP;
1143 * Determine if the expression already exists or not.
1145 for (i = 0; i < b->expr_used; i++) {
1146 if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1147 b->expr[i].rhs == rhs)
1148 break;
1150 if (i < b->expr_used)
1151 return i;
1154 * Need to add a new expression.
1156 if (b->expr_used == b->expr_size) {
1157 if (b->expr_size == 0)
1158 b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1159 else
1160 b->expr = (_ure_elt_t *)
1161 realloc((char *) b->expr,
1162 sizeof(_ure_elt_t) * (b->expr_size + 8));
1163 b->expr_size += 8;
1166 b->expr[b->expr_used].onstack = 0;
1167 b->expr[b->expr_used].type = type;
1168 b->expr[b->expr_used].lhs = lhs;
1169 b->expr[b->expr_used].rhs = rhs;
1171 return b->expr_used++;
1174 static unsigned char spmap[] = {
1175 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1176 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1177 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1180 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1181 (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1184 * Convert the regular expression into an NFA in a form that will be easy to
1185 * reduce to a DFA. The starting state for the reduction will be returned.
1187 static ucs2_t
1188 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1190 ucs2_t c, state, top, sym, *sp, *ep;
1191 unsigned long used;
1193 state = _URE_NOOP;
1195 sp = re;
1196 ep = sp + relen;
1197 while (b->error == _URE_OK && sp < ep) {
1198 c = *sp++;
1199 switch (c) {
1200 case '(':
1201 _ure_push(_URE_PAREN, b);
1202 break;
1203 case ')':
1205 * Check for the case of too many close parentheses.
1207 if (_ure_peek(b) == _URE_NOOP) {
1208 b->error = _URE_UNBALANCED_GROUP;
1209 break;
1212 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1214 * Make an expression with the AND or OR operator and its right
1215 * hand side.
1217 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1220 * Remove the _URE_PAREN off the stack.
1222 (void) _ure_pop(b);
1223 break;
1224 case '*':
1225 state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1226 break;
1227 case '+':
1228 state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1229 break;
1230 case '?':
1231 state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1232 break;
1233 case '|':
1234 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1236 * Make an expression with the AND or OR operator and its right
1237 * hand side.
1239 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1241 _ure_push(state, b);
1242 _ure_push(_URE_OR, b);
1243 break;
1244 default:
1245 sp--;
1246 sym = _ure_make_symbol(sp, ep - sp, &used, b);
1247 sp += used;
1248 state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1249 break;
1252 if (c != '(' && c != '|' && sp < ep &&
1253 (!_ure_isspecial(*sp) || *sp == '(')) {
1254 _ure_push(state, b);
1255 _ure_push(_URE_AND, b);
1258 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1260 * Make an expression with the AND or OR operator and its right
1261 * hand side.
1263 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1265 if (b->stack.slist_used > 0)
1266 b->error = _URE_UNBALANCED_GROUP;
1268 return (b->error == _URE_OK) ? state : _URE_NOOP;
1271 static void
1272 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1274 ucs2_t i, *stp;
1275 _ure_symtab_t *sp;
1278 * Locate the symbol in the symbol table so the state can be added.
1279 * If the symbol doesn't exist, then a real problem exists.
1281 for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1282 i++, sp++) ;
1285 * Now find out if the state exists in the symbol's state list.
1287 for (i = 0, stp = sp->states.slist;
1288 i < sp->states.slist_used && state > *stp; i++, stp++) ;
1290 if (i == sp->states.slist_used || state < *stp) {
1292 * Need to add the state in order.
1294 if (sp->states.slist_used == sp->states.slist_size) {
1295 if (sp->states.slist_size == 0)
1296 sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1297 else
1298 sp->states.slist = (ucs2_t *)
1299 realloc((char *) sp->states.slist,
1300 sizeof(ucs2_t) * (sp->states.slist_size + 8));
1301 sp->states.slist_size += 8;
1303 if (i < sp->states.slist_used)
1304 (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1305 (char *) (sp->states.slist + i),
1306 sizeof(ucs2_t) * (sp->states.slist_used - i));
1307 sp->states.slist[i] = state;
1308 sp->states.slist_used++;
1312 static ucs2_t
1313 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1315 ucs2_t i;
1316 _ure_state_t *sp;
1318 for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1319 if (sp->st.slist_used == nstates &&
1320 memcmp((char *) states, (char *) sp->st.slist,
1321 sizeof(ucs2_t) * nstates) == 0)
1322 break;
1325 if (i == b->states.states_used) {
1327 * Need to add a new DFA state (set of NFA states).
1329 if (b->states.states_used == b->states.states_size) {
1330 if (b->states.states_size == 0)
1331 b->states.states = (_ure_state_t *)
1332 malloc(sizeof(_ure_state_t) << 3);
1333 else
1334 b->states.states = (_ure_state_t *)
1335 realloc((char *) b->states.states,
1336 sizeof(_ure_state_t) * (b->states.states_size + 8));
1337 sp = b->states.states + b->states.states_size;
1338 (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1339 b->states.states_size += 8;
1342 sp = b->states.states + b->states.states_used++;
1343 sp->id = i;
1345 if (sp->st.slist_used + nstates > sp->st.slist_size) {
1346 if (sp->st.slist_size == 0)
1347 sp->st.slist = (ucs2_t *)
1348 malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1349 else
1350 sp->st.slist = (ucs2_t *)
1351 realloc((char *) sp->st.slist,
1352 sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1353 sp->st.slist_size = sp->st.slist_used + nstates;
1355 sp->st.slist_used = nstates;
1356 (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1357 sizeof(ucs2_t) * nstates);
1361 * Return the ID of the DFA state representing a group of NFA states.
1363 return i;
1366 static void
1367 _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1369 ucs2_t i, j, state, eval, syms, rhs;
1370 ucs2_t s1, s2, ns1, ns2;
1371 _ure_state_t *sp;
1372 _ure_symtab_t *smp;
1374 b->reducing = 1;
1377 * Add the starting state for the reduction.
1379 _ure_add_state(1, &start, b);
1382 * Process each set of NFA states that get created.
1384 for (i = 0; i < b->states.states_used; i++) {
1385 sp = b->states.states + i;
1388 * Push the current states on the stack.
1390 for (j = 0; j < sp->st.slist_used; j++)
1391 _ure_push(sp->st.slist[j], b);
1394 * Reduce the NFA states.
1396 for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1397 state = b->stack.slist[j];
1398 eval = 1;
1401 * This inner loop is the iterative equivalent of recursively
1402 * reducing subexpressions generated as a result of a reduction.
1404 while (eval) {
1405 switch (b->expr[state].type) {
1406 case _URE_SYMBOL:
1407 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1408 _ure_add_symstate(b->expr[state].lhs, ns1, b);
1409 syms++;
1410 eval = 0;
1411 break;
1412 case _URE_ONE:
1413 sp->accepting = 1;
1414 eval = 0;
1415 break;
1416 case _URE_QUEST:
1417 s1 = b->expr[state].lhs;
1418 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1419 state = _ure_make_expr(_URE_OR, ns1, s1, b);
1420 break;
1421 case _URE_PLUS:
1422 s1 = b->expr[state].lhs;
1423 ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1424 state = _ure_make_expr(_URE_AND, s1, ns1, b);
1425 break;
1426 case _URE_STAR:
1427 s1 = b->expr[state].lhs;
1428 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1429 ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1430 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1431 break;
1432 case _URE_OR:
1433 s1 = b->expr[state].lhs;
1434 s2 = b->expr[state].rhs;
1435 _ure_push(s1, b);
1436 _ure_push(s2, b);
1437 eval = 0;
1438 break;
1439 case _URE_AND:
1440 s1 = b->expr[state].lhs;
1441 s2 = b->expr[state].rhs;
1442 switch (b->expr[s1].type) {
1443 case _URE_SYMBOL:
1444 _ure_add_symstate(b->expr[s1].lhs, s2, b);
1445 syms++;
1446 eval = 0;
1447 break;
1448 case _URE_ONE:
1449 state = s2;
1450 break;
1451 case _URE_QUEST:
1452 ns1 = b->expr[s1].lhs;
1453 ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1454 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1455 break;
1456 case _URE_PLUS:
1457 ns1 = b->expr[s1].lhs;
1458 ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1459 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1460 break;
1461 case _URE_STAR:
1462 ns1 = b->expr[s1].lhs;
1463 ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1464 state = _ure_make_expr(_URE_OR, s2, ns2, b);
1465 break;
1466 case _URE_OR:
1467 ns1 = b->expr[s1].lhs;
1468 ns2 = b->expr[s1].rhs;
1469 ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1470 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1471 state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1472 break;
1473 case _URE_AND:
1474 ns1 = b->expr[s1].lhs;
1475 ns2 = b->expr[s1].rhs;
1476 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1477 state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1478 break;
1485 * Clear the state stack.
1487 while (_ure_pop(b) != _URE_NOOP) ;
1490 * Reset the state pointer because the reduction may have moved it
1491 * during a reallocation.
1493 sp = b->states.states + i;
1496 * Generate the DFA states for the symbols collected during the
1497 * current reduction.
1499 if (sp->trans_used + syms > sp->trans_size) {
1500 if (sp->trans_size == 0)
1501 sp->trans = (_ure_elt_t *)
1502 malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1503 else
1504 sp->trans = (_ure_elt_t *)
1505 realloc((char *) sp->trans,
1506 sizeof(_ure_elt_t) * (sp->trans_used + syms));
1507 sp->trans_size = sp->trans_used + syms;
1511 * Go through the symbol table and generate the DFA state transitions
1512 * for each symbol that has collected NFA states.
1514 for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1515 sp = b->states.states + i;
1517 if (smp->states.slist_used > 0) {
1518 sp->trans[syms].lhs = smp->id;
1519 rhs = _ure_add_state(smp->states.slist_used,
1520 smp->states.slist, b);
1522 * Reset the state pointer in case the reallocation moves it
1523 * in memory.
1525 sp = b->states.states + i;
1526 sp->trans[syms].rhs = rhs;
1528 smp->states.slist_used = 0;
1529 syms++;
1534 * Set the number of transitions actually used.
1536 sp->trans_used = syms;
1538 b->reducing = 0;
1541 static void
1542 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1544 ucs2_t tmp;
1546 l = b->states.states[l].id;
1547 r = b->states.states[r].id;
1549 if (l == r)
1550 return;
1552 if (l > r) {
1553 tmp = l;
1554 l = r;
1555 r = tmp;
1559 * Check to see if the equivalence pair already exists.
1561 for (tmp = 0; tmp < b->equiv_used &&
1562 (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1563 tmp++) ;
1565 if (tmp < b->equiv_used)
1566 return;
1568 if (b->equiv_used == b->equiv_size) {
1569 if (b->equiv_size == 0)
1570 b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1571 else
1572 b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1573 sizeof(_ure_equiv_t) *
1574 (b->equiv_size + 8));
1575 b->equiv_size += 8;
1577 b->equiv[b->equiv_used].l = l;
1578 b->equiv[b->equiv_used].r = r;
1579 b->equiv_used++;
1583 * Merge the DFA states that are equivalent.
1585 static void
1586 _ure_merge_equiv(_ure_buffer_t *b)
1588 ucs2_t i, j, k, eq, done;
1589 _ure_state_t *sp1, *sp2, *ls, *rs;
1591 for (i = 0; i < b->states.states_used; i++) {
1592 sp1 = b->states.states + i;
1593 if (sp1->id != i)
1594 continue;
1595 for (j = 0; j < i; j++) {
1596 sp2 = b->states.states + j;
1597 if (sp2->id != j)
1598 continue;
1599 b->equiv_used = 0;
1600 _ure_add_equiv(i, j, b);
1601 for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1602 ls = b->states.states + b->equiv[eq].l;
1603 rs = b->states.states + b->equiv[eq].r;
1604 if (ls->accepting != rs->accepting ||
1605 ls->trans_used != rs->trans_used) {
1606 done = 1;
1607 break;
1609 for (k = 0; k < ls->trans_used &&
1610 ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1611 if (k < ls->trans_used) {
1612 done = 1;
1613 break;
1616 for (k = 0; k < ls->trans_used; k++)
1617 _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1619 if (done == 0)
1620 break;
1622 for (eq = 0; j < i && eq < b->equiv_used; eq++)
1623 b->states.states[b->equiv[eq].r].id =
1624 b->states.states[b->equiv[eq].l].id;
1628 * Renumber the states appropriately.
1630 for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1631 sp1++, i++)
1632 sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1635 /*************************************************************************
1637 * API.
1639 *************************************************************************/
1641 ure_buffer_t
1642 ure_buffer_create(void)
1644 ure_buffer_t b;
1646 b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1648 return b;
1651 void
1652 ure_buffer_free(ure_buffer_t buf)
1654 unsigned long i;
1656 if (buf == 0)
1657 return;
1659 if (buf->stack.slist_size > 0)
1660 free((char *) buf->stack.slist);
1662 if (buf->expr_size > 0)
1663 free((char *) buf->expr);
1665 for (i = 0; i < buf->symtab_size; i++) {
1666 if (buf->symtab[i].states.slist_size > 0)
1667 free((char *) buf->symtab[i].states.slist);
1670 if (buf->symtab_size > 0)
1671 free((char *) buf->symtab);
1673 for (i = 0; i < buf->states.states_size; i++) {
1674 if (buf->states.states[i].trans_size > 0)
1675 free((char *) buf->states.states[i].trans);
1676 if (buf->states.states[i].st.slist_size > 0)
1677 free((char *) buf->states.states[i].st.slist);
1680 if (buf->states.states_size > 0)
1681 free((char *) buf->states.states);
1683 if (buf->equiv_size > 0)
1684 free((char *) buf->equiv);
1686 free((char *) buf);
1689 ure_dfa_t
1690 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1692 ucs2_t i, j, state;
1693 _ure_state_t *sp;
1694 _ure_dstate_t *dsp;
1695 _ure_trans_t *tp;
1696 ure_dfa_t dfa;
1698 if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1699 return 0;
1702 * Reset the various fields of the compilation buffer. Default the flags
1703 * to indicate the presense of the "^$" pattern. If any other pattern
1704 * occurs, then this flag will be removed. This is done to catch this
1705 * special pattern and handle it specially when matching.
1707 buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1708 buf->reducing = 0;
1709 buf->stack.slist_used = 0;
1710 buf->expr_used = 0;
1712 for (i = 0; i < buf->symtab_used; i++)
1713 buf->symtab[i].states.slist_used = 0;
1714 buf->symtab_used = 0;
1716 for (i = 0; i < buf->states.states_used; i++) {
1717 buf->states.states[i].st.slist_used = 0;
1718 buf->states.states[i].trans_used = 0;
1720 buf->states.states_used = 0;
1723 * Construct the NFA. If this stage returns a 0, then an error occured or
1724 * an empty expression was passed.
1726 if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1727 return 0;
1730 * Do the expression reduction to get the initial DFA.
1732 _ure_reduce(state, buf);
1735 * Merge all the equivalent DFA states.
1737 _ure_merge_equiv(buf);
1740 * Construct the minimal DFA.
1742 dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1743 (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1745 dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1748 * Free up the NFA state groups and transfer the symbols from the buffer
1749 * to the DFA.
1751 for (i = 0; i < buf->symtab_size; i++) {
1752 if (buf->symtab[i].states.slist_size > 0)
1753 free((char *) buf->symtab[i].states.slist);
1755 dfa->syms = buf->symtab;
1756 dfa->nsyms = buf->symtab_used;
1758 buf->symtab_used = buf->symtab_size = 0;
1761 * Collect the total number of states and transitions needed for the DFA.
1763 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1764 i++, sp++) {
1765 if (sp->id == state) {
1766 dfa->nstates++;
1767 dfa->ntrans += sp->trans_used;
1768 state++;
1773 * Allocate enough space for the states and transitions.
1775 dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1776 dfa->nstates);
1777 dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1780 * Actually transfer the DFA states from the buffer.
1782 dsp = dfa->states;
1783 tp = dfa->trans;
1784 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1785 i++, sp++) {
1786 if (sp->id == state) {
1787 dsp->trans = tp;
1788 dsp->ntrans = sp->trans_used;
1789 dsp->accepting = sp->accepting;
1792 * Add the transitions for the state.
1794 for (j = 0; j < dsp->ntrans; j++, tp++) {
1795 tp->symbol = sp->trans[j].lhs;
1796 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1799 dsp++;
1800 state++;
1804 return dfa;
1807 void
1808 ure_dfa_free(ure_dfa_t dfa)
1810 ucs2_t i;
1812 if (dfa == 0)
1813 return;
1815 for (i = 0; i < dfa->nsyms; i++) {
1816 if ((dfa->syms[i].type == _URE_CCLASS ||
1817 dfa->syms[i].type == _URE_NCCLASS) &&
1818 dfa->syms[i].sym.ccl.ranges_size > 0)
1819 free((char *) dfa->syms[i].sym.ccl.ranges);
1821 if (dfa->nsyms > 0)
1822 free((char *) dfa->syms);
1824 if (dfa->nstates > 0)
1825 free((char *) dfa->states);
1826 if (dfa->ntrans > 0)
1827 free((char *) dfa->trans);
1828 free((char *) dfa);
1831 void
1832 ure_write_dfa(ure_dfa_t dfa, FILE *out)
1834 ucs2_t i, j, k, h, l;
1835 _ure_dstate_t *sp;
1836 _ure_symtab_t *sym;
1837 _ure_range_t *rp;
1839 if (dfa == 0 || out == 0)
1840 return;
1843 * Write all the different character classes.
1845 for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1846 if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1847 fprintf(out, "C%hd = ", sym->id);
1848 if (sym->sym.ccl.ranges_used > 0) {
1849 putc('[', out);
1850 if (sym->type == _URE_NCCLASS)
1851 putc('^', out);
1853 if (sym->props != 0) {
1854 if (sym->type == _URE_NCCLASS)
1855 fprintf(out, "\\P");
1856 else
1857 fprintf(out, "\\p");
1858 for (k = h = 0; k < 32; k++) {
1859 if (sym->props & (1 << k)) {
1860 if (h != 0)
1861 putc(',', out);
1862 fprintf(out, "%hd", k + 1);
1863 h = 1;
1868 * Dump the ranges.
1870 for (k = 0, rp = sym->sym.ccl.ranges;
1871 k < sym->sym.ccl.ranges_used; k++, rp++) {
1873 * Check for UTF16 characters.
1875 if (0x10000 <= rp->min_code &&
1876 rp->min_code <= 0x10ffff) {
1877 h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1878 l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1879 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1880 } else
1881 fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1882 if (rp->max_code != rp->min_code) {
1883 putc('-', out);
1884 if (rp->max_code >= 0x10000 &&
1885 rp->max_code <= 0x10ffff) {
1886 h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1887 l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1888 fprintf(out, "\\x%04hX\\x%04hX", h, l);
1889 } else
1890 fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1893 if (sym->sym.ccl.ranges_used > 0)
1894 putc(']', out);
1895 putc('\n', out);
1899 for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1900 fprintf(out, "S%hd = ", i);
1901 if (sp->accepting) {
1902 fprintf(out, "1 ");
1903 if (sp->ntrans)
1904 fprintf(out, "| ");
1906 for (j = 0; j < sp->ntrans; j++) {
1907 if (j > 0)
1908 fprintf(out, "| ");
1910 sym = dfa->syms + sp->trans[j].symbol;
1911 switch (sym->type) {
1912 case _URE_CHAR:
1913 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1915 * Take care of UTF16 characters.
1917 h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1918 l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1919 fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1920 } else
1921 fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1922 break;
1923 case _URE_ANY_CHAR:
1924 fprintf(out, "<any> ");
1925 break;
1926 case _URE_BOL_ANCHOR:
1927 fprintf(out, "<bol-anchor> ");
1928 break;
1929 case _URE_EOL_ANCHOR:
1930 fprintf(out, "<eol-anchor> ");
1931 break;
1932 case _URE_CCLASS:
1933 case _URE_NCCLASS:
1934 fprintf(out, "[C%hd] ", sym->id);
1935 break;
1937 fprintf(out, "S%hd", sp->trans[j].next_state);
1938 if (j + 1 < sp->ntrans)
1939 putc(' ', out);
1941 putc('\n', out);
1945 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1946 (cc) == 0x2029)
1949 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1950 unsigned long *match_start, unsigned long *match_end)
1952 int i, j, matched, found, skip;
1953 unsigned long ms, me;
1954 ucs4_t c;
1955 ucs2_t *sp, *ep, *lp;
1956 _ure_dstate_t *stp;
1957 _ure_symtab_t *sym;
1958 _ure_range_t *rp;
1960 if (dfa == 0 || text == 0)
1961 return 0;
1964 * Handle the special case of an empty string matching the "^$" pattern.
1966 if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1967 *match_start = *match_end = 0;
1968 return 1;
1971 sp = text;
1972 ep = sp + textlen;
1974 ms = me = ~0;
1976 stp = dfa->states;
1978 for (found = skip = 0; found == 0 && sp < ep; ) {
1979 lp = sp;
1980 c = *sp++;
1983 * Check to see if this is a high surrogate that should be
1984 * combined with a following low surrogate.
1986 if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1987 0xdc00 <= *sp && *sp <= 0xdfff)
1988 c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1991 * Determine if the character is non-spacing and should be skipped.
1993 if (_ure_matches_properties(_URE_NONSPACING, c) &&
1994 (flags & URE_IGNORE_NONSPACING)) {
1995 sp++;
1996 continue;
1999 if (dfa->flags & _URE_DFA_CASEFOLD)
2000 c = _ure_tolower(c);
2003 * See if one of the transitions matches.
2005 for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2006 sym = dfa->syms + stp->trans[i].symbol;
2007 switch (sym->type) {
2008 case _URE_ANY_CHAR:
2009 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2010 !_ure_issep(c))
2011 matched = 1;
2012 break;
2013 case _URE_CHAR:
2014 if (c == sym->sym.chr)
2015 matched = 1;
2016 break;
2017 case _URE_BOL_ANCHOR:
2018 if (lp == text) {
2019 sp = lp;
2020 matched = 1;
2021 } else if (_ure_issep(c)) {
2022 if (c == '\r' && sp < ep && *sp == '\n')
2023 sp++;
2024 lp = sp;
2025 matched = 1;
2027 break;
2028 case _URE_EOL_ANCHOR:
2029 if (_ure_issep(c)) {
2031 * Put the pointer back before the separator so the match
2032 * end position will be correct. This case will also
2033 * cause the `sp' pointer to be advanced over the current
2034 * separator once the match end point has been recorded.
2036 sp = lp;
2037 matched = 1;
2039 break;
2040 case _URE_CCLASS:
2041 case _URE_NCCLASS:
2042 if (sym->props != 0)
2043 matched = _ure_matches_properties(sym->props, c);
2044 for (j = 0, rp = sym->sym.ccl.ranges;
2045 j < sym->sym.ccl.ranges_used; j++, rp++) {
2046 if (rp->min_code <= c && c <= rp->max_code)
2047 matched = 1;
2049 if (sym->type == _URE_NCCLASS)
2050 matched = !matched;
2051 break;
2054 if (matched) {
2055 if (ms == ~0UL)
2056 ms = lp - text;
2057 else
2058 me = sp - text;
2059 stp = dfa->states + stp->trans[i].next_state;
2062 * If the match was an EOL anchor, adjust the pointer past the
2063 * separator that caused the match. The correct match
2064 * position has been recorded already.
2066 if (sym->type == _URE_EOL_ANCHOR) {
2068 * Skip the character that caused the match.
2070 sp++;
2073 * Handle the infamous CRLF situation.
2075 if (sp < ep && c == '\r' && *sp == '\n')
2076 sp++;
2081 if (matched == 0) {
2082 if (stp->accepting == 0) {
2084 * If the last state was not accepting, then reset
2085 * and start over.
2087 stp = dfa->states;
2088 ms = me = ~0;
2089 } else
2091 * The last state was accepting, so terminate the matching
2092 * loop to avoid more work.
2094 found = 1;
2095 } else if (sp == ep) {
2096 if (!stp->accepting) {
2098 * This ugly hack is to make sure the end-of-line anchors
2099 * match when the source text hits the end. This is only done
2100 * if the last subexpression matches.
2102 for (i = 0; found == 0 && i < stp->ntrans; i++) {
2103 sym = dfa->syms + stp->trans[i].symbol;
2104 if (sym->type ==_URE_EOL_ANCHOR) {
2105 stp = dfa->states + stp->trans[i].next_state;
2106 if (stp->accepting) {
2107 me = sp - text;
2108 found = 1;
2109 } else
2110 break;
2113 } else {
2115 * Make sure any conditions that match all the way to the end
2116 * of the string match.
2118 found = 1;
2119 me = sp - text;
2124 if (found == 0)
2125 ms = me = ~0;
2127 *match_start = ms;
2128 *match_end = me;
2130 return (ms != ~0UL) ? 1 : 0;