add copyright/no-warranty comment
[coreutils.git] / lib / rx.c
blobd7ad522d58323983576e1b055f89260479f6c38c
1 /* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
3 This file is part of the librx library.
5 Librx is free software; you can redistribute it and/or modify it under
6 the terms of the GNU Library General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 Librx is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with this software; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 /* NOTE!!! AIX is so losing it requires this to be the first thing in the
21 * file.
22 * Do not put ANYTHING before it!
24 #if !defined (__GNUC__) && defined (_AIX)
25 #pragma alloca
26 #endif
28 /* To make linux happy? */
29 #ifndef _GNU_SOURCE
30 #define _GNU_SOURCE
31 #endif
33 #if HAVE_CONFIG_H
34 #include <config.h>
35 #endif
37 const char *rx_version_string = "GNU Rx version 0.07.2";
39 /* ``Too hard!''
40 * -- anon.
44 #include <stdio.h>
45 #include <ctype.h>
46 #ifndef isgraph
47 #define isgraph(c) (isprint (c) && !isspace (c))
48 #endif
49 #ifndef isblank
50 #define isblank(c) ((c) == ' ' || (c) == '\t')
51 #endif
53 #include <sys/types.h>
55 #undef MAX
56 #undef MIN
57 #define MAX(a, b) ((a) > (b) ? (a) : (b))
58 #define MIN(a, b) ((a) < (b) ? (a) : (b))
60 typedef char boolean;
61 #define false 0
62 #define true 1
64 #ifndef __GCC__
65 #undef __inline__
66 #define __inline__
67 #endif
69 /* Emacs already defines alloca, sometimes. */
70 #ifndef alloca
72 /* Make alloca work the best possible way. */
73 #ifdef __GNUC__
74 #define alloca __builtin_alloca
75 #else /* not __GNUC__ */
76 #if HAVE_ALLOCA_H
77 #include <alloca.h>
78 #else /* not __GNUC__ or HAVE_ALLOCA_H */
79 #ifndef _AIX /* Already did AIX, up at the top. */
80 char *alloca ();
81 #endif /* not _AIX */
82 #endif /* not HAVE_ALLOCA_H */
83 #endif /* not __GNUC__ */
85 #endif /* not alloca */
87 /* Memory management and stuff for emacs. */
89 #define CHARBITS 8
90 #define remalloc(M, S) (M ? realloc (M, S) : malloc (S))
93 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
94 * use `alloca' instead of `malloc' for the backtracking stack.
96 * Emacs will die miserably if we don't do this.
99 #ifdef REGEX_MALLOC
100 #define REGEX_ALLOCATE malloc
101 #else /* not REGEX_MALLOC */
102 #define REGEX_ALLOCATE alloca
103 #endif /* not REGEX_MALLOC */
106 #ifdef RX_WANT_RX_DEFS
107 #define RX_DECL extern
108 #define RX_DEF_QUAL
109 #else
110 #define RX_WANT_RX_DEFS
111 #define RX_DECL static
112 #define RX_DEF_QUAL static
113 #endif
114 #include "rx.h"
115 #undef RX_DECL
116 #define RX_DECL RX_DEF_QUAL
119 #ifndef emacs
121 #ifndef SYNTAX
123 RX_DECL char re_syntax_table[CHAR_SET_SIZE];
125 #ifdef __STDC__
126 static void
127 init_syntax_once (void)
128 #else
129 static void
130 init_syntax_once ()
131 #endif
133 register int c;
134 static int done = 0;
136 if (done)
137 return;
139 bzero (re_syntax_table, sizeof re_syntax_table);
141 for (c = 'a'; c <= 'z'; c++)
142 re_syntax_table[c] = Sword;
144 for (c = 'A'; c <= 'Z'; c++)
145 re_syntax_table[c] = Sword;
147 for (c = '0'; c <= '9'; c++)
148 re_syntax_table[c] = Sword;
150 re_syntax_table['_'] = Sword;
152 done = 1;
154 #endif /* not SYNTAX */
155 #endif /* not emacs */
157 /* Compile with `-DRX_DEBUG' and use the following flags.
159 * Debugging flags:
160 * rx_debug - print information as a regexp is compiled
161 * rx_debug_trace - print information as a regexp is executed
164 #ifdef RX_DEBUG
166 int rx_debug_compile = 0;
167 int rx_debug_trace = 0;
168 static struct re_pattern_buffer * dbug_rxb = 0;
170 #ifdef __STDC__
171 typedef void (*side_effect_printer) (struct rx *, void *, FILE *);
172 #else
173 typedef void (*side_effect_printer) ();
174 #endif
176 #ifdef __STDC__
177 static void print_cset (struct rx *rx, rx_Bitset cset, FILE * fp);
178 #else
179 static void print_cset ();
180 #endif
182 #ifdef __STDC__
183 static void
184 print_rexp (struct rx *rx,
185 struct rexp_node *node, int depth,
186 side_effect_printer seprint, FILE * fp)
187 #else
188 static void
189 print_rexp (rx, node, depth, seprint, fp)
190 struct rx *rx;
191 struct rexp_node *node;
192 int depth;
193 side_effect_printer seprint;
194 FILE * fp;
195 #endif
197 if (!node)
198 return;
199 else
201 switch (node->type)
203 case r_cset:
205 fprintf (fp, "%*s", depth, "");
206 print_cset (rx, node->params.cset, fp);
207 fputc ('\n', fp);
208 break;
211 case r_opt:
212 case r_star:
213 fprintf (fp, "%*s%s\n", depth, "",
214 node->type == r_opt ? "opt" : "star");
215 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
216 break;
218 case r_2phase_star:
219 fprintf (fp, "%*s2phase star\n", depth, "");
220 print_rexp (rx, node->params.pair.right, depth + 3, seprint, fp);
221 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
222 break;
225 case r_alternate:
226 case r_concat:
227 fprintf (fp, "%*s%s\n", depth, "",
228 node->type == r_alternate ? "alt" : "concat");
229 print_rexp (rx, node->params.pair.left, depth + 3, seprint, fp);
230 print_rexp (rx, node->params.pair.right, depth + 3, seprint, fp);
231 break;
232 case r_side_effect:
233 fprintf (fp, "%*sSide effect: ", depth, "");
234 seprint (rx, node->params.side_effect, fp);
235 fputc ('\n', fp);
240 #ifdef __STDC__
241 static void
242 print_nfa (struct rx * rx,
243 struct rx_nfa_state * n,
244 side_effect_printer seprint, FILE * fp)
245 #else
246 static void
247 print_nfa (rx, n, seprint, fp)
248 struct rx * rx;
249 struct rx_nfa_state * n;
250 side_effect_printer seprint;
251 FILE * fp;
252 #endif
254 while (n)
256 struct rx_nfa_edge *e = n->edges;
257 struct rx_possible_future *ec = n->futures;
258 fprintf (fp, "node %d %s\n", n->id,
259 n->is_final ? "final" : (n->is_start ? "start" : ""));
260 while (e)
262 fprintf (fp, " edge to %d, ", e->dest->id);
263 switch (e->type)
265 case ne_epsilon:
266 fprintf (fp, "epsilon\n");
267 break;
268 case ne_side_effect:
269 fprintf (fp, "side effect ");
270 seprint (rx, e->params.side_effect, fp);
271 fputc ('\n', fp);
272 break;
273 case ne_cset:
274 fprintf (fp, "cset ");
275 print_cset (rx, e->params.cset, fp);
276 fputc ('\n', fp);
277 break;
279 e = e->next;
282 while (ec)
284 int x;
285 struct rx_nfa_state_set * s;
286 struct rx_se_list * l;
287 fprintf (fp, " eclosure to {");
288 for (s = ec->destset; s; s = s->cdr)
289 fprintf (fp, "%d ", s->car->id);
290 fprintf (fp, "} (");
291 for (l = ec->effects; l; l = l->cdr)
293 seprint (rx, l->car, fp);
294 fputc (' ', fp);
296 fprintf (fp, ")\n");
297 ec = ec->next;
299 n = n->next;
303 static char * efnames [] =
305 "bogon",
306 "re_se_try",
307 "re_se_pushback",
308 "re_se_push0",
309 "re_se_pushpos",
310 "re_se_chkpos",
311 "re_se_poppos",
312 "re_se_at_dot",
313 "re_se_syntax",
314 "re_se_not_syntax",
315 "re_se_begbuf",
316 "re_se_hat",
317 "re_se_wordbeg",
318 "re_se_wordbound",
319 "re_se_notwordbound",
320 "re_se_wordend",
321 "re_se_endbuf",
322 "re_se_dollar",
323 "re_se_fail",
326 static char * efnames2[] =
328 "re_se_win",
329 "re_se_lparen",
330 "re_se_rparen",
331 "re_se_backref",
332 "re_se_iter",
333 "re_se_end_iter",
334 "re_se_tv"
337 static char * inx_names[] =
339 "rx_backtrack_point",
340 "rx_do_side_effects",
341 "rx_cache_miss",
342 "rx_next_char",
343 "rx_backtrack",
344 "rx_error_inx",
345 "rx_num_instructions"
349 #ifdef __STDC__
350 static void
351 re_seprint (struct rx * rx, void * effect, FILE * fp)
352 #else
353 static void
354 re_seprint (rx, effect, fp)
355 struct rx * rx;
356 void * effect;
357 FILE * fp;
358 #endif
360 if ((int)effect < 0)
361 fputs (efnames[-(int)effect], fp);
362 else if (dbug_rxb)
364 struct re_se_params * p = &dbug_rxb->se_params[(int)effect];
365 fprintf (fp, "%s(%d,%d)", efnames2[p->se], p->op1, p->op2);
367 else
368 fprintf (fp, "[complex op # %d]", (int)effect);
372 /* These are so the regex.c regression tests will compile. */
373 void
374 print_compiled_pattern (rxb)
375 struct re_pattern_buffer * rxb;
379 void
380 print_fastmap (fm)
381 char * fm;
385 #endif /* RX_DEBUG */
389 /* This page: Bitsets. Completely unintersting. */
391 #ifdef __STDC__
392 RX_DECL int
393 rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b)
394 #else
395 RX_DECL int
396 rx_bitset_is_equal (size, a, b)
397 int size;
398 rx_Bitset a;
399 rx_Bitset b;
400 #endif
402 int x;
403 RX_subset s = b[0];
404 b[0] = ~a[0];
406 for (x = rx_bitset_numb_subsets(size) - 1; a[x] == b[x]; --x)
409 b[0] = s;
410 return !x && s == a[0];
413 #ifdef __STDC__
414 RX_DECL int
415 rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b)
416 #else
417 RX_DECL int
418 rx_bitset_is_subset (size, a, b)
419 int size;
420 rx_Bitset a;
421 rx_Bitset b;
422 #endif
424 int x = rx_bitset_numb_subsets(size) - 1;
425 while (x-- && (a[x] & b[x]) == a[x]);
426 return x == -1;
430 #ifdef __STDC__
431 RX_DECL int
432 rx_bitset_empty (int size, rx_Bitset set)
433 #else
434 RX_DECL int
435 rx_bitset_empty (size, set)
436 int size;
437 rx_Bitset set;
438 #endif
440 int x;
441 RX_subset s = set[0];
442 set[0] = 1;
443 for (x = rx_bitset_numb_subsets(size) - 1; !set[x]; --x)
445 set[0] = s;
446 return !s;
449 #ifdef __STDC__
450 RX_DECL void
451 rx_bitset_null (int size, rx_Bitset b)
452 #else
453 RX_DECL void
454 rx_bitset_null (size, b)
455 int size;
456 rx_Bitset b;
457 #endif
459 bzero (b, rx_sizeof_bitset(size));
463 #ifdef __STDC__
464 RX_DECL void
465 rx_bitset_universe (int size, rx_Bitset b)
466 #else
467 RX_DECL void
468 rx_bitset_universe (size, b)
469 int size;
470 rx_Bitset b;
471 #endif
473 int x = rx_bitset_numb_subsets (size);
474 while (x--)
475 *b++ = ~(RX_subset)0;
479 #ifdef __STDC__
480 RX_DECL void
481 rx_bitset_complement (int size, rx_Bitset b)
482 #else
483 RX_DECL void
484 rx_bitset_complement (size, b)
485 int size;
486 rx_Bitset b;
487 #endif
489 int x = rx_bitset_numb_subsets (size);
490 while (x--)
492 *b = ~*b;
493 ++b;
498 #ifdef __STDC__
499 RX_DECL void
500 rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b)
501 #else
502 RX_DECL void
503 rx_bitset_assign (size, a, b)
504 int size;
505 rx_Bitset a;
506 rx_Bitset b;
507 #endif
509 int x;
510 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
511 a[x] = b[x];
515 #ifdef __STDC__
516 RX_DECL void
517 rx_bitset_union (int size, rx_Bitset a, rx_Bitset b)
518 #else
519 RX_DECL void
520 rx_bitset_union (size, a, b)
521 int size;
522 rx_Bitset a;
523 rx_Bitset b;
524 #endif
526 int x;
527 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
528 a[x] |= b[x];
532 #ifdef __STDC__
533 RX_DECL void
534 rx_bitset_intersection (int size,
535 rx_Bitset a, rx_Bitset b)
536 #else
537 RX_DECL void
538 rx_bitset_intersection (size, a, b)
539 int size;
540 rx_Bitset a;
541 rx_Bitset b;
542 #endif
544 int x;
545 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
546 a[x] &= b[x];
550 #ifdef __STDC__
551 RX_DECL void
552 rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b)
553 #else
554 RX_DECL void
555 rx_bitset_difference (size, a, b)
556 int size;
557 rx_Bitset a;
558 rx_Bitset b;
559 #endif
561 int x;
562 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
563 a[x] &= ~ b[x];
567 #ifdef __STDC__
568 RX_DECL void
569 rx_bitset_revdifference (int size,
570 rx_Bitset a, rx_Bitset b)
571 #else
572 RX_DECL void
573 rx_bitset_revdifference (size, a, b)
574 int size;
575 rx_Bitset a;
576 rx_Bitset b;
577 #endif
579 int x;
580 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
581 a[x] = ~a[x] & b[x];
584 #ifdef __STDC__
585 RX_DECL void
586 rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b)
587 #else
588 RX_DECL void
589 rx_bitset_xor (size, a, b)
590 int size;
591 rx_Bitset a;
592 rx_Bitset b;
593 #endif
595 int x;
596 for (x = rx_bitset_numb_subsets(size) - 1; x >=0; --x)
597 a[x] ^= b[x];
601 #ifdef __STDC__
602 RX_DECL unsigned long
603 rx_bitset_hash (int size, rx_Bitset b)
604 #else
605 RX_DECL unsigned long
606 rx_bitset_hash (size, b)
607 int size;
608 rx_Bitset b;
609 #endif
611 int x;
612 unsigned long hash = (unsigned long)rx_bitset_hash;
614 for (x = rx_bitset_numb_subsets(size) - 1; x >= 0; --x)
615 hash ^= rx_bitset_subset_val(b, x);
617 return hash;
621 RX_DECL RX_subset rx_subset_singletons [RX_subset_bits] =
623 0x1,
624 0x2,
625 0x4,
626 0x8,
627 0x10,
628 0x20,
629 0x40,
630 0x80,
631 0x100,
632 0x200,
633 0x400,
634 0x800,
635 0x1000,
636 0x2000,
637 0x4000,
638 0x8000,
639 0x10000,
640 0x20000,
641 0x40000,
642 0x80000,
643 0x100000,
644 0x200000,
645 0x400000,
646 0x800000,
647 0x1000000,
648 0x2000000,
649 0x4000000,
650 0x8000000,
651 0x10000000,
652 0x20000000,
653 0x40000000,
654 0x80000000
657 #ifdef RX_DEBUG
659 #ifdef __STDC__
660 static void
661 print_cset (struct rx *rx, rx_Bitset cset, FILE * fp)
662 #else
663 static void
664 print_cset (rx, cset, fp)
665 struct rx *rx;
666 rx_Bitset cset;
667 FILE * fp;
668 #endif
670 int x;
671 fputc ('[', fp);
672 for (x = 0; x < rx->local_cset_size; ++x)
673 if (RX_bitset_member (cset, x))
675 if (isprint(x))
676 fputc (x, fp);
677 else
678 fprintf (fp, "\\0%o ", x);
680 fputc (']', fp);
683 #endif /* RX_DEBUG */
687 static unsigned long rx_hash_masks[4] =
689 0x12488421,
690 0x96699669,
691 0xbe7dd7eb,
692 0xffffffff
696 /* Hash tables */
697 #ifdef __STDC__
698 RX_DECL struct rx_hash_item *
699 rx_hash_find (struct rx_hash * table,
700 unsigned long hash,
701 void * value,
702 struct rx_hash_rules * rules)
703 #else
704 RX_DECL struct rx_hash_item *
705 rx_hash_find (table, hash, value, rules)
706 struct rx_hash * table;
707 unsigned long hash;
708 void * value;
709 struct rx_hash_rules * rules;
710 #endif
712 rx_hash_eq eq = rules->eq;
713 int maskc = 0;
714 long mask = rx_hash_masks [0];
715 int bucket = (hash & mask) % 13;
717 while (table->children [bucket])
719 table = table->children [bucket];
720 ++maskc;
721 mask = rx_hash_masks[maskc];
722 bucket = (hash & mask) % 13;
726 struct rx_hash_item * it = table->buckets[bucket];
727 while (it)
728 if (eq (it->data, value))
729 return it;
730 else
731 it = it->next_same_hash;
734 return 0;
738 #ifdef __STDC__
739 RX_DECL struct rx_hash_item *
740 rx_hash_store (struct rx_hash * table,
741 unsigned long hash,
742 void * value,
743 struct rx_hash_rules * rules)
744 #else
745 RX_DECL struct rx_hash_item *
746 rx_hash_store (table, hash, value, rules)
747 struct rx_hash * table;
748 unsigned long hash;
749 void * value;
750 struct rx_hash_rules * rules;
751 #endif
753 rx_hash_eq eq = rules->eq;
754 int maskc = 0;
755 long mask = rx_hash_masks[0];
756 int bucket = (hash & mask) % 13;
757 int depth = 0;
759 while (table->children [bucket])
761 table = table->children [bucket];
762 ++maskc;
763 mask = rx_hash_masks[maskc];
764 bucket = (hash & mask) % 13;
765 ++depth;
769 struct rx_hash_item * it = table->buckets[bucket];
770 while (it)
771 if (eq (it->data, value))
772 return it;
773 else
774 it = it->next_same_hash;
778 if ( (depth < 3)
779 && (table->bucket_size [bucket] >= 4))
781 struct rx_hash * newtab = ((struct rx_hash *)
782 rules->hash_alloc (rules));
783 if (!newtab)
784 goto add_to_bucket;
785 bzero (newtab, sizeof (*newtab));
786 newtab->parent = table;
788 struct rx_hash_item * them = table->buckets[bucket];
789 unsigned long newmask = rx_hash_masks[maskc + 1];
790 while (them)
792 struct rx_hash_item * save = them->next_same_hash;
793 int new_buck = (them->hash & newmask) % 13;
794 them->next_same_hash = newtab->buckets[new_buck];
795 newtab->buckets[new_buck] = them;
796 them->table = newtab;
797 them = save;
798 ++newtab->bucket_size[new_buck];
799 ++newtab->refs;
801 table->refs = (table->refs - table->bucket_size[bucket] + 1);
802 table->bucket_size[bucket] = 0;
803 table->buckets[bucket] = 0;
804 table->children[bucket] = newtab;
805 table = newtab;
806 bucket = (hash & newmask) % 13;
810 add_to_bucket:
812 struct rx_hash_item * it = ((struct rx_hash_item *)
813 rules->hash_item_alloc (rules, value));
814 if (!it)
815 return 0;
816 it->hash = hash;
817 it->table = table;
818 /* DATA and BINDING are to be set in hash_item_alloc */
819 it->next_same_hash = table->buckets [bucket];
820 table->buckets[bucket] = it;
821 ++table->bucket_size [bucket];
822 ++table->refs;
823 return it;
828 #ifdef __STDC__
829 RX_DECL void
830 rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules)
831 #else
832 RX_DECL void
833 rx_hash_free (it, rules)
834 struct rx_hash_item * it;
835 struct rx_hash_rules * rules;
836 #endif
838 if (it)
840 struct rx_hash * table = it->table;
841 unsigned long hash = it->hash;
842 int depth = (table->parent
843 ? (table->parent->parent
844 ? (table->parent->parent->parent
846 : 2)
847 : 1)
848 : 0);
849 int bucket = (hash & rx_hash_masks [depth]) % 13;
850 struct rx_hash_item ** pos = &table->buckets [bucket];
852 while (*pos != it)
853 pos = &(*pos)->next_same_hash;
854 *pos = it->next_same_hash;
855 rules->free_hash_item (it, rules);
856 --table->bucket_size[bucket];
857 --table->refs;
858 while (!table->refs && depth)
860 struct rx_hash * save = table;
861 table = table->parent;
862 --depth;
863 bucket = (hash & rx_hash_masks [depth]) % 13;
864 --table->refs;
865 table->children[bucket] = 0;
866 rules->free_hash (save, rules);
871 #ifdef __STDC__
872 RX_DECL void
873 rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn,
874 struct rx_hash_rules * rules)
875 #else
876 RX_DECL void
877 rx_free_hash_table (tab, freefn, rules)
878 struct rx_hash * tab;
879 rx_hash_freefn freefn;
880 struct rx_hash_rules * rules;
881 #endif
883 int x;
885 for (x = 0; x < 13; ++x)
886 if (tab->children[x])
888 rx_free_hash_table (tab->children[x], freefn, rules);
889 rules->free_hash (tab->children[x], rules);
891 else
893 struct rx_hash_item * them = tab->buckets[x];
894 while (them)
896 struct rx_hash_item * that = them;
897 them = that->next_same_hash;
898 freefn (that);
899 rules->free_hash_item (that, rules);
906 /* Utilities for manipulating bitset represntations of characters sets. */
908 #ifdef __STDC__
909 RX_DECL rx_Bitset
910 rx_cset (struct rx *rx)
911 #else
912 RX_DECL rx_Bitset
913 rx_cset (rx)
914 struct rx *rx;
915 #endif
917 rx_Bitset b = (rx_Bitset) malloc (rx_sizeof_bitset (rx->local_cset_size));
918 if (b)
919 rx_bitset_null (rx->local_cset_size, b);
920 return b;
924 #ifdef __STDC__
925 RX_DECL rx_Bitset
926 rx_copy_cset (struct rx *rx, rx_Bitset a)
927 #else
928 RX_DECL rx_Bitset
929 rx_copy_cset (rx, a)
930 struct rx *rx;
931 rx_Bitset a;
932 #endif
934 rx_Bitset cs = rx_cset (rx);
936 if (cs)
937 rx_bitset_union (rx->local_cset_size, cs, a);
939 return cs;
943 #ifdef __STDC__
944 RX_DECL void
945 rx_free_cset (struct rx * rx, rx_Bitset c)
946 #else
947 RX_DECL void
948 rx_free_cset (rx, c)
949 struct rx * rx;
950 rx_Bitset c;
951 #endif
953 if (c)
954 free ((char *)c);
958 /* Hash table memory allocation policy for the regexp compiler */
960 #ifdef __STDC__
961 static struct rx_hash *
962 compiler_hash_alloc (struct rx_hash_rules * rules)
963 #else
964 static struct rx_hash *
965 compiler_hash_alloc (rules)
966 struct rx_hash_rules * rules;
967 #endif
969 return (struct rx_hash *)malloc (sizeof (struct rx_hash));
973 #ifdef __STDC__
974 static struct rx_hash_item *
975 compiler_hash_item_alloc (struct rx_hash_rules * rules, void * value)
976 #else
977 static struct rx_hash_item *
978 compiler_hash_item_alloc (rules, value)
979 struct rx_hash_rules * rules;
980 void * value;
981 #endif
983 struct rx_hash_item * it;
984 it = (struct rx_hash_item *)malloc (sizeof (*it));
985 if (it)
987 it->data = value;
988 it->binding = 0;
990 return it;
994 #ifdef __STDC__
995 static void
996 compiler_free_hash (struct rx_hash * tab,
997 struct rx_hash_rules * rules)
998 #else
999 static void
1000 compiler_free_hash (tab, rules)
1001 struct rx_hash * tab;
1002 struct rx_hash_rules * rules;
1003 #endif
1005 free ((char *)tab);
1009 #ifdef __STDC__
1010 static void
1011 compiler_free_hash_item (struct rx_hash_item * item,
1012 struct rx_hash_rules * rules)
1013 #else
1014 static void
1015 compiler_free_hash_item (item, rules)
1016 struct rx_hash_item * item;
1017 struct rx_hash_rules * rules;
1018 #endif
1020 free ((char *)item);
1024 /* This page: REXP_NODE (expression tree) structures. */
1026 #ifdef __STDC__
1027 RX_DECL struct rexp_node *
1028 rexp_node (struct rx *rx,
1029 enum rexp_node_type type)
1030 #else
1031 RX_DECL struct rexp_node *
1032 rexp_node (rx, type)
1033 struct rx *rx;
1034 enum rexp_node_type type;
1035 #endif
1037 struct rexp_node *n;
1039 n = (struct rexp_node *)malloc (sizeof (*n));
1040 bzero (n, sizeof (*n));
1041 if (n)
1042 n->type = type;
1043 return n;
1047 /* free_rexp_node assumes that the bitset passed to rx_mk_r_cset
1048 * can be freed using rx_free_cset.
1050 #ifdef __STDC__
1051 RX_DECL struct rexp_node *
1052 rx_mk_r_cset (struct rx * rx,
1053 rx_Bitset b)
1054 #else
1055 RX_DECL struct rexp_node *
1056 rx_mk_r_cset (rx, b)
1057 struct rx * rx;
1058 rx_Bitset b;
1059 #endif
1061 struct rexp_node * n = rexp_node (rx, r_cset);
1062 if (n)
1063 n->params.cset = b;
1064 return n;
1068 #ifdef __STDC__
1069 RX_DECL struct rexp_node *
1070 rx_mk_r_concat (struct rx * rx,
1071 struct rexp_node * a,
1072 struct rexp_node * b)
1073 #else
1074 RX_DECL struct rexp_node *
1075 rx_mk_r_concat (rx, a, b)
1076 struct rx * rx;
1077 struct rexp_node * a;
1078 struct rexp_node * b;
1079 #endif
1081 struct rexp_node * n = rexp_node (rx, r_concat);
1082 if (n)
1084 n->params.pair.left = a;
1085 n->params.pair.right = b;
1087 return n;
1091 #ifdef __STDC__
1092 RX_DECL struct rexp_node *
1093 rx_mk_r_alternate (struct rx * rx,
1094 struct rexp_node * a,
1095 struct rexp_node * b)
1096 #else
1097 RX_DECL struct rexp_node *
1098 rx_mk_r_alternate (rx, a, b)
1099 struct rx * rx;
1100 struct rexp_node * a;
1101 struct rexp_node * b;
1102 #endif
1104 struct rexp_node * n = rexp_node (rx, r_alternate);
1105 if (n)
1107 n->params.pair.left = a;
1108 n->params.pair.right = b;
1110 return n;
1114 #ifdef __STDC__
1115 RX_DECL struct rexp_node *
1116 rx_mk_r_opt (struct rx * rx,
1117 struct rexp_node * a)
1118 #else
1119 RX_DECL struct rexp_node *
1120 rx_mk_r_opt (rx, a)
1121 struct rx * rx;
1122 struct rexp_node * a;
1123 #endif
1125 struct rexp_node * n = rexp_node (rx, r_opt);
1126 if (n)
1128 n->params.pair.left = a;
1129 n->params.pair.right = 0;
1131 return n;
1135 #ifdef __STDC__
1136 RX_DECL struct rexp_node *
1137 rx_mk_r_star (struct rx * rx,
1138 struct rexp_node * a)
1139 #else
1140 RX_DECL struct rexp_node *
1141 rx_mk_r_star (rx, a)
1142 struct rx * rx;
1143 struct rexp_node * a;
1144 #endif
1146 struct rexp_node * n = rexp_node (rx, r_star);
1147 if (n)
1149 n->params.pair.left = a;
1150 n->params.pair.right = 0;
1152 return n;
1156 #ifdef __STDC__
1157 RX_DECL struct rexp_node *
1158 rx_mk_r_2phase_star (struct rx * rx,
1159 struct rexp_node * a,
1160 struct rexp_node * b)
1161 #else
1162 RX_DECL struct rexp_node *
1163 rx_mk_r_2phase_star (rx, a, b)
1164 struct rx * rx;
1165 struct rexp_node * a;
1166 struct rexp_node * b;
1167 #endif
1169 struct rexp_node * n = rexp_node (rx, r_2phase_star);
1170 if (n)
1172 n->params.pair.left = a;
1173 n->params.pair.right = b;
1175 return n;
1179 #ifdef __STDC__
1180 RX_DECL struct rexp_node *
1181 rx_mk_r_side_effect (struct rx * rx,
1182 rx_side_effect a)
1183 #else
1184 RX_DECL struct rexp_node *
1185 rx_mk_r_side_effect (rx, a)
1186 struct rx * rx;
1187 rx_side_effect a;
1188 #endif
1190 struct rexp_node * n = rexp_node (rx, r_side_effect);
1191 if (n)
1193 n->params.side_effect = a;
1194 n->params.pair.right = 0;
1196 return n;
1200 #ifdef __STDC__
1201 RX_DECL struct rexp_node *
1202 rx_mk_r_data (struct rx * rx,
1203 void * a)
1204 #else
1205 RX_DECL struct rexp_node *
1206 rx_mk_r_data (rx, a)
1207 struct rx * rx;
1208 void * a;
1209 #endif
1211 struct rexp_node * n = rexp_node (rx, r_data);
1212 if (n)
1214 n->params.pair.left = a;
1215 n->params.pair.right = 0;
1217 return n;
1221 #ifdef __STDC__
1222 RX_DECL void
1223 rx_free_rexp (struct rx * rx, struct rexp_node * node)
1224 #else
1225 RX_DECL void
1226 rx_free_rexp (rx, node)
1227 struct rx * rx;
1228 struct rexp_node * node;
1229 #endif
1231 if (node)
1233 switch (node->type)
1235 case r_cset:
1236 if (node->params.cset)
1237 rx_free_cset (rx, node->params.cset);
1239 case r_side_effect:
1240 break;
1242 case r_concat:
1243 case r_alternate:
1244 case r_2phase_star:
1245 case r_opt:
1246 case r_star:
1247 rx_free_rexp (rx, node->params.pair.left);
1248 rx_free_rexp (rx, node->params.pair.right);
1249 break;
1251 case r_data:
1252 /* This shouldn't occur. */
1253 break;
1255 free ((char *)node);
1260 #ifdef __STDC__
1261 RX_DECL struct rexp_node *
1262 rx_copy_rexp (struct rx *rx,
1263 struct rexp_node *node)
1264 #else
1265 RX_DECL struct rexp_node *
1266 rx_copy_rexp (rx, node)
1267 struct rx *rx;
1268 struct rexp_node *node;
1269 #endif
1271 if (!node)
1272 return 0;
1273 else
1275 struct rexp_node *n = rexp_node (rx, node->type);
1276 if (!n)
1277 return 0;
1278 switch (node->type)
1280 case r_cset:
1281 n->params.cset = rx_copy_cset (rx, node->params.cset);
1282 if (!n->params.cset)
1284 rx_free_rexp (rx, n);
1285 return 0;
1287 break;
1289 case r_side_effect:
1290 n->params.side_effect = node->params.side_effect;
1291 break;
1293 case r_concat:
1294 case r_alternate:
1295 case r_opt:
1296 case r_2phase_star:
1297 case r_star:
1298 n->params.pair.left =
1299 rx_copy_rexp (rx, node->params.pair.left);
1300 n->params.pair.right =
1301 rx_copy_rexp (rx, node->params.pair.right);
1302 if ( (node->params.pair.left && !n->params.pair.left)
1303 || (node->params.pair.right && !n->params.pair.right))
1305 rx_free_rexp (rx, n);
1306 return 0;
1308 break;
1309 case r_data:
1310 /* shouldn't happen */
1311 break;
1313 return n;
1319 /* This page: functions to build and destroy graphs that describe nfa's */
1321 /* Constructs a new nfa node. */
1322 #ifdef __STDC__
1323 RX_DECL struct rx_nfa_state *
1324 rx_nfa_state (struct rx *rx)
1325 #else
1326 RX_DECL struct rx_nfa_state *
1327 rx_nfa_state (rx)
1328 struct rx *rx;
1329 #endif
1331 struct rx_nfa_state * n = (struct rx_nfa_state *)malloc (sizeof (*n));
1332 if (!n)
1333 return 0;
1334 bzero (n, sizeof (*n));
1335 n->next = rx->nfa_states;
1336 rx->nfa_states = n;
1337 return n;
1341 #ifdef __STDC__
1342 RX_DECL void
1343 rx_free_nfa_state (struct rx_nfa_state * n)
1344 #else
1345 RX_DECL void
1346 rx_free_nfa_state (n)
1347 struct rx_nfa_state * n;
1348 #endif
1350 free ((char *)n);
1354 /* This looks up an nfa node, given a numeric id. Numeric id's are
1355 * assigned after the nfa has been built.
1357 #ifdef __STDC__
1358 RX_DECL struct rx_nfa_state *
1359 rx_id_to_nfa_state (struct rx * rx,
1360 int id)
1361 #else
1362 RX_DECL struct rx_nfa_state *
1363 rx_id_to_nfa_state (rx, id)
1364 struct rx * rx;
1365 int id;
1366 #endif
1368 struct rx_nfa_state * n;
1369 for (n = rx->nfa_states; n; n = n->next)
1370 if (n->id == id)
1371 return n;
1372 return 0;
1376 /* This adds an edge between two nodes, but doesn't initialize the
1377 * edge label.
1380 #ifdef __STDC__
1381 RX_DECL struct rx_nfa_edge *
1382 rx_nfa_edge (struct rx *rx,
1383 enum rx_nfa_etype type,
1384 struct rx_nfa_state *start,
1385 struct rx_nfa_state *dest)
1386 #else
1387 RX_DECL struct rx_nfa_edge *
1388 rx_nfa_edge (rx, type, start, dest)
1389 struct rx *rx;
1390 enum rx_nfa_etype type;
1391 struct rx_nfa_state *start;
1392 struct rx_nfa_state *dest;
1393 #endif
1395 struct rx_nfa_edge *e;
1396 e = (struct rx_nfa_edge *)malloc (sizeof (*e));
1397 if (!e)
1398 return 0;
1399 e->next = start->edges;
1400 start->edges = e;
1401 e->type = type;
1402 e->dest = dest;
1403 return e;
1407 #ifdef __STDC__
1408 RX_DECL void
1409 rx_free_nfa_edge (struct rx_nfa_edge * e)
1410 #else
1411 RX_DECL void
1412 rx_free_nfa_edge (e)
1413 struct rx_nfa_edge * e;
1414 #endif
1416 free ((char *)e);
1420 /* This constructs a POSSIBLE_FUTURE, which is a kind epsilon-closure
1421 * of an NFA. These are added to an nfa automaticly by eclose_nfa.
1424 #ifdef __STDC__
1425 static struct rx_possible_future *
1426 rx_possible_future (struct rx * rx,
1427 struct rx_se_list * effects)
1428 #else
1429 static struct rx_possible_future *
1430 rx_possible_future (rx, effects)
1431 struct rx * rx;
1432 struct rx_se_list * effects;
1433 #endif
1435 struct rx_possible_future *ec;
1436 ec = (struct rx_possible_future *) malloc (sizeof (*ec));
1437 if (!ec)
1438 return 0;
1439 ec->destset = 0;
1440 ec->next = 0;
1441 ec->effects = effects;
1442 return ec;
1446 #ifdef __STDC__
1447 static void
1448 rx_free_possible_future (struct rx_possible_future * pf)
1449 #else
1450 static void
1451 rx_free_possible_future (pf)
1452 struct rx_possible_future * pf;
1453 #endif
1455 free ((char *)pf);
1459 #ifdef __STDC__
1460 RX_DECL void
1461 rx_free_nfa (struct rx *rx)
1462 #else
1463 RX_DECL void
1464 rx_free_nfa (rx)
1465 struct rx *rx;
1466 #endif
1468 while (rx->nfa_states)
1470 while (rx->nfa_states->edges)
1472 switch (rx->nfa_states->edges->type)
1474 case ne_cset:
1475 rx_free_cset (rx, rx->nfa_states->edges->params.cset);
1476 break;
1477 default:
1478 break;
1481 struct rx_nfa_edge * e;
1482 e = rx->nfa_states->edges;
1483 rx->nfa_states->edges = rx->nfa_states->edges->next;
1484 rx_free_nfa_edge (e);
1486 } /* while (rx->nfa_states->edges) */
1488 /* Iterate over the partial epsilon closures of rx->nfa_states */
1489 struct rx_possible_future * pf = rx->nfa_states->futures;
1490 while (pf)
1492 struct rx_possible_future * pft = pf;
1493 pf = pf->next;
1494 rx_free_possible_future (pft);
1498 struct rx_nfa_state *n;
1499 n = rx->nfa_states;
1500 rx->nfa_states = rx->nfa_states->next;
1501 rx_free_nfa_state (n);
1508 /* This page: translating a pattern expression into an nfa and doing the
1509 * static part of the nfa->super-nfa translation.
1512 /* This is the thompson regexp->nfa algorithm.
1513 * It is modified to allow for `side-effect epsilons.' Those are
1514 * edges that are taken whenever a similar epsilon edge would be,
1515 * but which imply that some side effect occurs when the edge
1516 * is taken.
1518 * Side effects are used to model parts of the pattern langauge
1519 * that are not regular (in the formal sense).
1522 #ifdef __STDC__
1523 RX_DECL int
1524 rx_build_nfa (struct rx *rx,
1525 struct rexp_node *rexp,
1526 struct rx_nfa_state **start,
1527 struct rx_nfa_state **end)
1528 #else
1529 RX_DECL int
1530 rx_build_nfa (rx, rexp, start, end)
1531 struct rx *rx;
1532 struct rexp_node *rexp;
1533 struct rx_nfa_state **start;
1534 struct rx_nfa_state **end;
1535 #endif
1537 struct rx_nfa_edge *edge;
1539 /* Start & end nodes may have been allocated by the caller. */
1540 *start = *start ? *start : rx_nfa_state (rx);
1542 if (!*start)
1543 return 0;
1545 if (!rexp)
1547 *end = *start;
1548 return 1;
1551 *end = *end ? *end : rx_nfa_state (rx);
1553 if (!*end)
1555 rx_free_nfa_state (*start);
1556 return 0;
1559 switch (rexp->type)
1561 case r_data:
1562 return 0;
1564 case r_cset:
1565 edge = rx_nfa_edge (rx, ne_cset, *start, *end);
1566 if (!edge)
1567 return 0;
1568 edge->params.cset = rx_copy_cset (rx, rexp->params.cset);
1569 if (!edge->params.cset)
1571 rx_free_nfa_edge (edge);
1572 return 0;
1574 return 1;
1576 case r_opt:
1577 return (rx_build_nfa (rx, rexp->params.pair.left, start, end)
1578 && rx_nfa_edge (rx, ne_epsilon, *start, *end));
1580 case r_star:
1582 struct rx_nfa_state * star_start = 0;
1583 struct rx_nfa_state * star_end = 0;
1584 return (rx_build_nfa (rx, rexp->params.pair.left,
1585 &star_start, &star_end)
1586 && star_start
1587 && star_end
1588 && rx_nfa_edge (rx, ne_epsilon, star_start, star_end)
1589 && rx_nfa_edge (rx, ne_epsilon, *start, star_start)
1590 && rx_nfa_edge (rx, ne_epsilon, star_end, *end)
1592 && rx_nfa_edge (rx, ne_epsilon, star_end, star_start));
1595 case r_2phase_star:
1597 struct rx_nfa_state * star_start = 0;
1598 struct rx_nfa_state * star_end = 0;
1599 struct rx_nfa_state * loop_exp_start = 0;
1600 struct rx_nfa_state * loop_exp_end = 0;
1602 return (rx_build_nfa (rx, rexp->params.pair.left,
1603 &star_start, &star_end)
1604 && rx_build_nfa (rx, rexp->params.pair.right,
1605 &loop_exp_start, &loop_exp_end)
1606 && star_start
1607 && star_end
1608 && loop_exp_end
1609 && loop_exp_start
1610 && rx_nfa_edge (rx, ne_epsilon, star_start, *end)
1611 && rx_nfa_edge (rx, ne_epsilon, *start, star_start)
1612 && rx_nfa_edge (rx, ne_epsilon, star_end, *end)
1614 && rx_nfa_edge (rx, ne_epsilon, star_end, loop_exp_start)
1615 && rx_nfa_edge (rx, ne_epsilon, loop_exp_end, star_start));
1619 case r_concat:
1621 struct rx_nfa_state *shared = 0;
1622 return
1623 (rx_build_nfa (rx, rexp->params.pair.left, start, &shared)
1624 && rx_build_nfa (rx, rexp->params.pair.right, &shared, end));
1627 case r_alternate:
1629 struct rx_nfa_state *ls = 0;
1630 struct rx_nfa_state *le = 0;
1631 struct rx_nfa_state *rs = 0;
1632 struct rx_nfa_state *re = 0;
1633 return (rx_build_nfa (rx, rexp->params.pair.left, &ls, &le)
1634 && rx_build_nfa (rx, rexp->params.pair.right, &rs, &re)
1635 && rx_nfa_edge (rx, ne_epsilon, *start, ls)
1636 && rx_nfa_edge (rx, ne_epsilon, *start, rs)
1637 && rx_nfa_edge (rx, ne_epsilon, le, *end)
1638 && rx_nfa_edge (rx, ne_epsilon, re, *end));
1641 case r_side_effect:
1642 edge = rx_nfa_edge (rx, ne_side_effect, *start, *end);
1643 if (!edge)
1644 return 0;
1645 edge->params.side_effect = rexp->params.side_effect;
1646 return 1;
1649 /* this should never happen */
1650 return 0;
1654 /* RX_NAME_NFA_STATES identifies all nodes with outgoing non-epsilon
1655 * transitions. Only these nodes can occur in super-states.
1656 * All nodes are given an integer id.
1657 * The id is non-negative if the node has non-epsilon out-transitions, negative
1658 * otherwise (this is because we want the non-negative ids to be used as
1659 * array indexes in a few places).
1662 #ifdef __STDC__
1663 RX_DECL void
1664 rx_name_nfa_states (struct rx *rx)
1665 #else
1666 RX_DECL void
1667 rx_name_nfa_states (rx)
1668 struct rx *rx;
1669 #endif
1671 struct rx_nfa_state *n = rx->nfa_states;
1673 rx->nodec = 0;
1674 rx->epsnodec = -1;
1676 while (n)
1678 struct rx_nfa_edge *e = n->edges;
1680 if (n->is_start)
1681 n->eclosure_needed = 1;
1683 while (e)
1685 switch (e->type)
1687 case ne_epsilon:
1688 case ne_side_effect:
1689 break;
1691 case ne_cset:
1692 n->id = rx->nodec++;
1694 struct rx_nfa_edge *from_n = n->edges;
1695 while (from_n)
1697 from_n->dest->eclosure_needed = 1;
1698 from_n = from_n->next;
1701 goto cont;
1703 e = e->next;
1705 n->id = rx->epsnodec--;
1706 cont:
1707 n = n->next;
1709 rx->epsnodec = -rx->epsnodec;
1713 /* This page: data structures for the static part of the nfa->supernfa
1714 * translation.
1716 * There are side effect lists -- lists of side effects occuring
1717 * along an uninterrupted, acyclic path of side-effect epsilon edges.
1718 * Such paths are collapsed to single edges in the course of computing
1719 * epsilon closures. Such single edges are labled with a list of all
1720 * the side effects entailed in crossing them. Like lists of side
1721 * effects are made == by the constructors below.
1723 * There are also nfa state sets. These are used to hold a list of all
1724 * states reachable from a starting state for a given type of transition
1725 * and side effect list. These are also hash-consed.
1728 /* The next several functions compare, construct, etc. lists of side
1729 * effects. See ECLOSE_NFA (below) for details.
1732 /* Ordering of rx_se_list
1733 * (-1, 0, 1 return value convention).
1736 #ifdef __STDC__
1737 static int
1738 se_list_cmp (void * va, void * vb)
1739 #else
1740 static int
1741 se_list_cmp (va, vb)
1742 void * va;
1743 void * vb;
1744 #endif
1746 struct rx_se_list * a = (struct rx_se_list *)va;
1747 struct rx_se_list * b = (struct rx_se_list *)vb;
1749 return ((va == vb)
1751 : (!va
1752 ? -1
1753 : (!vb
1755 : ((long)a->car < (long)b->car
1757 : ((long)a->car > (long)b->car
1758 ? -1
1759 : se_list_cmp ((void *)a->cdr, (void *)b->cdr))))));
1763 #ifdef __STDC__
1764 static int
1765 se_list_equal (void * va, void * vb)
1766 #else
1767 static int
1768 se_list_equal (va, vb)
1769 void * va;
1770 void * vb;
1771 #endif
1773 return !(se_list_cmp (va, vb));
1776 static struct rx_hash_rules se_list_hash_rules =
1778 se_list_equal,
1779 compiler_hash_alloc,
1780 compiler_free_hash,
1781 compiler_hash_item_alloc,
1782 compiler_free_hash_item
1786 #ifdef __STDC__
1787 static struct rx_se_list *
1788 side_effect_cons (struct rx * rx,
1789 void * se, struct rx_se_list * list)
1790 #else
1791 static struct rx_se_list *
1792 side_effect_cons (rx, se, list)
1793 struct rx * rx;
1794 void * se;
1795 struct rx_se_list * list;
1796 #endif
1798 struct rx_se_list * l;
1799 l = ((struct rx_se_list *) malloc (sizeof (*l)));
1800 if (!l)
1801 return 0;
1802 l->car = se;
1803 l->cdr = list;
1804 return l;
1808 #ifdef __STDC__
1809 static struct rx_se_list *
1810 hash_cons_se_prog (struct rx * rx,
1811 struct rx_hash * memo,
1812 void * car, struct rx_se_list * cdr)
1813 #else
1814 static struct rx_se_list *
1815 hash_cons_se_prog (rx, memo, car, cdr)
1816 struct rx * rx;
1817 struct rx_hash * memo;
1818 void * car;
1819 struct rx_se_list * cdr;
1820 #endif
1822 long hash = (long)car ^ (long)cdr;
1823 struct rx_se_list template;
1825 template.car = car;
1826 template.cdr = cdr;
1828 struct rx_hash_item * it = rx_hash_store (memo, hash,
1829 (void *)&template,
1830 &se_list_hash_rules);
1831 if (!it)
1832 return 0;
1833 if (it->data == (void *)&template)
1835 struct rx_se_list * consed;
1836 consed = (struct rx_se_list *) malloc (sizeof (*consed));
1837 *consed = template;
1838 it->data = (void *)consed;
1840 return (struct rx_se_list *)it->data;
1845 #ifdef __STDC__
1846 static struct rx_se_list *
1847 hash_se_prog (struct rx * rx, struct rx_hash * memo, struct rx_se_list * prog)
1848 #else
1849 static struct rx_se_list *
1850 hash_se_prog (rx, memo, prog)
1851 struct rx * rx;
1852 struct rx_hash * memo;
1853 struct rx_se_list * prog;
1854 #endif
1856 struct rx_se_list * answer = 0;
1857 while (prog)
1859 answer = hash_cons_se_prog (rx, memo, prog->car, answer);
1860 if (!answer)
1861 return 0;
1862 prog = prog->cdr;
1864 return answer;
1867 #ifdef __STDC__
1868 static int
1869 nfa_set_cmp (void * va, void * vb)
1870 #else
1871 static int
1872 nfa_set_cmp (va, vb)
1873 void * va;
1874 void * vb;
1875 #endif
1877 struct rx_nfa_state_set * a = (struct rx_nfa_state_set *)va;
1878 struct rx_nfa_state_set * b = (struct rx_nfa_state_set *)vb;
1880 return ((va == vb)
1882 : (!va
1883 ? -1
1884 : (!vb
1886 : (a->car->id < b->car->id
1888 : (a->car->id > b->car->id
1889 ? -1
1890 : nfa_set_cmp ((void *)a->cdr, (void *)b->cdr))))));
1893 #ifdef __STDC__
1894 static int
1895 nfa_set_equal (void * va, void * vb)
1896 #else
1897 static int
1898 nfa_set_equal (va, vb)
1899 void * va;
1900 void * vb;
1901 #endif
1903 return !nfa_set_cmp (va, vb);
1906 static struct rx_hash_rules nfa_set_hash_rules =
1908 nfa_set_equal,
1909 compiler_hash_alloc,
1910 compiler_free_hash,
1911 compiler_hash_item_alloc,
1912 compiler_free_hash_item
1916 #ifdef __STDC__
1917 static struct rx_nfa_state_set *
1918 nfa_set_cons (struct rx * rx,
1919 struct rx_hash * memo, struct rx_nfa_state * state,
1920 struct rx_nfa_state_set * set)
1921 #else
1922 static struct rx_nfa_state_set *
1923 nfa_set_cons (rx, memo, state, set)
1924 struct rx * rx;
1925 struct rx_hash * memo;
1926 struct rx_nfa_state * state;
1927 struct rx_nfa_state_set * set;
1928 #endif
1930 struct rx_nfa_state_set template;
1931 struct rx_hash_item * node;
1932 template.car = state;
1933 template.cdr = set;
1934 node = rx_hash_store (memo,
1935 (((long)state) >> 8) ^ (long)set,
1936 &template, &nfa_set_hash_rules);
1937 if (!node)
1938 return 0;
1939 if (node->data == &template)
1941 struct rx_nfa_state_set * l;
1942 l = (struct rx_nfa_state_set *) malloc (sizeof (*l));
1943 node->data = (void *) l;
1944 if (!l)
1945 return 0;
1946 *l = template;
1948 return (struct rx_nfa_state_set *)node->data;
1952 #ifdef __STDC__
1953 static struct rx_nfa_state_set *
1954 nfa_set_enjoin (struct rx * rx,
1955 struct rx_hash * memo, struct rx_nfa_state * state,
1956 struct rx_nfa_state_set * set)
1957 #else
1958 static struct rx_nfa_state_set *
1959 nfa_set_enjoin (rx, memo, state, set)
1960 struct rx * rx;
1961 struct rx_hash * memo;
1962 struct rx_nfa_state * state;
1963 struct rx_nfa_state_set * set;
1964 #endif
1966 if (!set || state->id < set->car->id)
1967 return nfa_set_cons (rx, memo, state, set);
1968 if (state->id == set->car->id)
1969 return set;
1970 else
1972 struct rx_nfa_state_set * newcdr
1973 = nfa_set_enjoin (rx, memo, state, set->cdr);
1974 if (newcdr != set->cdr)
1975 set = nfa_set_cons (rx, memo, set->car, newcdr);
1976 return set;
1982 /* This page: computing epsilon closures. The closures aren't total.
1983 * Each node's closures are partitioned according to the side effects entailed
1984 * along the epsilon edges. Return true on success.
1987 struct eclose_frame
1989 struct rx_se_list *prog_backwards;
1993 #ifdef __STDC__
1994 static int
1995 eclose_node (struct rx *rx, struct rx_nfa_state *outnode,
1996 struct rx_nfa_state *node, struct eclose_frame *frame)
1997 #else
1998 static int
1999 eclose_node (rx, outnode, node, frame)
2000 struct rx *rx;
2001 struct rx_nfa_state *outnode;
2002 struct rx_nfa_state *node;
2003 struct eclose_frame *frame;
2004 #endif
2006 struct rx_nfa_edge *e = node->edges;
2008 /* For each node, we follow all epsilon paths to build the closure.
2009 * The closure omits nodes that have only epsilon edges.
2010 * The closure is split into partial closures -- all the states in
2011 * a partial closure are reached by crossing the same list of
2012 * of side effects (though not necessarily the same path).
2014 if (node->mark)
2015 return 1;
2016 node->mark = 1;
2018 if (node->id >= 0 || node->is_final)
2020 struct rx_possible_future **ec;
2021 struct rx_se_list * prog_in_order
2022 = ((struct rx_se_list *)hash_se_prog (rx,
2023 &rx->se_list_memo,
2024 frame->prog_backwards));
2025 int cmp;
2027 ec = &outnode->futures;
2029 while (*ec)
2031 cmp = se_list_cmp ((void *)(*ec)->effects, (void *)prog_in_order);
2032 if (cmp <= 0)
2033 break;
2034 ec = &(*ec)->next;
2036 if (!*ec || (cmp < 0))
2038 struct rx_possible_future * saved = *ec;
2039 *ec = rx_possible_future (rx, prog_in_order);
2040 (*ec)->next = saved;
2041 if (!*ec)
2042 return 0;
2044 if (node->id >= 0)
2046 (*ec)->destset = nfa_set_enjoin (rx, &rx->set_list_memo,
2047 node, (*ec)->destset);
2048 if (!(*ec)->destset)
2049 return 0;
2053 while (e)
2055 switch (e->type)
2057 case ne_epsilon:
2058 if (!eclose_node (rx, outnode, e->dest, frame))
2059 return 0;
2060 break;
2061 case ne_side_effect:
2063 frame->prog_backwards = side_effect_cons (rx,
2064 e->params.side_effect,
2065 frame->prog_backwards);
2066 if (!frame->prog_backwards)
2067 return 0;
2068 if (!eclose_node (rx, outnode, e->dest, frame))
2069 return 0;
2071 struct rx_se_list * dying = frame->prog_backwards;
2072 frame->prog_backwards = frame->prog_backwards->cdr;
2073 free ((char *)dying);
2075 break;
2077 default:
2078 break;
2080 e = e->next;
2082 node->mark = 0;
2083 return 1;
2087 #ifdef __STDC__
2088 RX_DECL int
2089 rx_eclose_nfa (struct rx *rx)
2090 #else
2091 RX_DECL int
2092 rx_eclose_nfa (rx)
2093 struct rx *rx;
2094 #endif
2096 struct rx_nfa_state *n = rx->nfa_states;
2097 struct eclose_frame frame;
2098 static int rx_id = 0;
2100 frame.prog_backwards = 0;
2101 rx->rx_id = rx_id++;
2102 bzero (&rx->se_list_memo, sizeof (rx->se_list_memo));
2103 bzero (&rx->set_list_memo, sizeof (rx->set_list_memo));
2104 while (n)
2106 n->futures = 0;
2107 if (n->eclosure_needed && !eclose_node (rx, n, n, &frame))
2108 return 0;
2109 /* clear_marks (rx); */
2110 n = n->next;
2112 return 1;
2116 /* This deletes epsilon edges from an NFA. After running eclose_node,
2117 * we have no more need for these edges. They are removed to simplify
2118 * further operations on the NFA.
2121 #ifdef __STDC__
2122 RX_DECL void
2123 rx_delete_epsilon_transitions (struct rx *rx)
2124 #else
2125 RX_DECL void
2126 rx_delete_epsilon_transitions (rx)
2127 struct rx *rx;
2128 #endif
2130 struct rx_nfa_state *n = rx->nfa_states;
2131 struct rx_nfa_edge **e;
2133 while (n)
2135 e = &n->edges;
2136 while (*e)
2138 struct rx_nfa_edge *t;
2139 switch ((*e)->type)
2141 case ne_epsilon:
2142 case ne_side_effect:
2143 t = *e;
2144 *e = t->next;
2145 rx_free_nfa_edge (t);
2146 break;
2148 default:
2149 e = &(*e)->next;
2150 break;
2153 n = n->next;
2158 /* This page: storing the nfa in a contiguous region of memory for
2159 * subsequent conversion to a super-nfa.
2162 /* This is for qsort on an array of nfa_states. The order
2163 * is based on state ids and goes
2164 * [0...MAX][MIN..-1] where (MAX>=0) and (MIN<0)
2165 * This way, positive ids double as array indices.
2168 #ifdef __STDC__
2169 static int
2170 nfacmp (void * va, void * vb)
2171 #else
2172 static int
2173 nfacmp (va, vb)
2174 void * va;
2175 void * vb;
2176 #endif
2178 struct rx_nfa_state **a = (struct rx_nfa_state **)va;
2179 struct rx_nfa_state **b = (struct rx_nfa_state **)vb;
2180 return (*a == *b /* &&&& 3.18 */
2182 : (((*a)->id < 0) == ((*b)->id < 0)
2183 ? (((*a)->id < (*b)->id) ? -1 : 1)
2184 : (((*a)->id < 0)
2185 ? 1 : -1)));
2188 #ifdef __STDC__
2189 static int
2190 count_hash_nodes (struct rx_hash * st)
2191 #else
2192 static int
2193 count_hash_nodes (st)
2194 struct rx_hash * st;
2195 #endif
2197 int x;
2198 int count = 0;
2199 for (x = 0; x < 13; ++x)
2200 count += ((st->children[x])
2201 ? count_hash_nodes (st->children[x])
2202 : st->bucket_size[x]);
2204 return count;
2208 #ifdef __STDC__
2209 static void
2210 se_memo_freer (struct rx_hash_item * node)
2211 #else
2212 static void
2213 se_memo_freer (node)
2214 struct rx_hash_item * node;
2215 #endif
2217 free ((char *)node->data);
2221 #ifdef __STDC__
2222 static void
2223 nfa_set_freer (struct rx_hash_item * node)
2224 #else
2225 static void
2226 nfa_set_freer (node)
2227 struct rx_hash_item * node;
2228 #endif
2230 free ((char *)node->data);
2234 /* This copies an entire NFA into a single malloced block of memory.
2235 * Mostly this is for compatability with regex.c, though it is convenient
2236 * to have the nfa nodes in an array.
2239 #ifdef __STDC__
2240 RX_DECL int
2241 rx_compactify_nfa (struct rx *rx,
2242 void **mem, unsigned long *size)
2243 #else
2244 RX_DECL int
2245 rx_compactify_nfa (rx, mem, size)
2246 struct rx *rx;
2247 void **mem;
2248 unsigned long *size;
2249 #endif
2251 int total_nodec;
2252 struct rx_nfa_state *n;
2253 int edgec = 0;
2254 int eclosec = 0;
2255 int se_list_consc = count_hash_nodes (&rx->se_list_memo);
2256 int nfa_setc = count_hash_nodes (&rx->set_list_memo);
2257 unsigned long total_size;
2259 /* This takes place in two stages. First, the total size of the
2260 * nfa is computed, then structures are copied.
2262 n = rx->nfa_states;
2263 total_nodec = 0;
2264 while (n)
2266 struct rx_nfa_edge *e = n->edges;
2267 struct rx_possible_future *ec = n->futures;
2268 ++total_nodec;
2269 while (e)
2271 ++edgec;
2272 e = e->next;
2274 while (ec)
2276 ++eclosec;
2277 ec = ec->next;
2279 n = n->next;
2282 total_size = (total_nodec * sizeof (struct rx_nfa_state)
2283 + edgec * rx_sizeof_bitset (rx->local_cset_size)
2284 + edgec * sizeof (struct rx_nfa_edge)
2285 + nfa_setc * sizeof (struct rx_nfa_state_set)
2286 + eclosec * sizeof (struct rx_possible_future)
2287 + se_list_consc * sizeof (struct rx_se_list)
2288 + rx->reserved);
2290 if (total_size > *size)
2292 *mem = remalloc (*mem, total_size);
2293 if (*mem)
2294 *size = total_size;
2295 else
2296 return 0;
2298 /* Now we've allocated the memory; this copies the NFA. */
2300 static struct rx_nfa_state **scratch = 0;
2301 static int scratch_alloc = 0;
2302 struct rx_nfa_state *state_base = (struct rx_nfa_state *) * mem;
2303 struct rx_nfa_state *new_state = state_base;
2304 struct rx_nfa_edge *new_edge =
2305 (struct rx_nfa_edge *)
2306 ((char *) state_base + total_nodec * sizeof (struct rx_nfa_state));
2307 struct rx_se_list * new_se_list =
2308 (struct rx_se_list *)
2309 ((char *)new_edge + edgec * sizeof (struct rx_nfa_edge));
2310 struct rx_possible_future *new_close =
2311 ((struct rx_possible_future *)
2312 ((char *) new_se_list
2313 + se_list_consc * sizeof (struct rx_se_list)));
2314 struct rx_nfa_state_set * new_nfa_set =
2315 ((struct rx_nfa_state_set *)
2316 ((char *)new_close + eclosec * sizeof (struct rx_possible_future)));
2317 char *new_bitset =
2318 ((char *) new_nfa_set + nfa_setc * sizeof (struct rx_nfa_state_set));
2319 int x;
2320 struct rx_nfa_state *n;
2322 if (scratch_alloc < total_nodec)
2324 scratch = ((struct rx_nfa_state **)
2325 remalloc (scratch, total_nodec * sizeof (*scratch)));
2326 if (scratch)
2327 scratch_alloc = total_nodec;
2328 else
2330 scratch_alloc = 0;
2331 return 0;
2335 for (x = 0, n = rx->nfa_states; n; n = n->next)
2336 scratch[x++] = n;
2338 qsort (scratch, total_nodec,
2339 sizeof (struct rx_nfa_state *), (int (*)())nfacmp);
2341 for (x = 0; x < total_nodec; ++x)
2343 struct rx_possible_future *eclose = scratch[x]->futures;
2344 struct rx_nfa_edge *edge = scratch[x]->edges;
2345 struct rx_nfa_state *cn = new_state++;
2346 cn->futures = 0;
2347 cn->edges = 0;
2348 cn->next = (x == total_nodec - 1) ? 0 : (cn + 1);
2349 cn->id = scratch[x]->id;
2350 cn->is_final = scratch[x]->is_final;
2351 cn->is_start = scratch[x]->is_start;
2352 cn->mark = 0;
2353 while (edge)
2355 int indx = (edge->dest->id < 0
2356 ? (total_nodec + edge->dest->id)
2357 : edge->dest->id);
2358 struct rx_nfa_edge *e = new_edge++;
2359 rx_Bitset cset = (rx_Bitset) new_bitset;
2360 new_bitset += rx_sizeof_bitset (rx->local_cset_size);
2361 rx_bitset_null (rx->local_cset_size, cset);
2362 rx_bitset_union (rx->local_cset_size, cset, edge->params.cset);
2363 e->next = cn->edges;
2364 cn->edges = e;
2365 e->type = edge->type;
2366 e->dest = state_base + indx;
2367 e->params.cset = cset;
2368 edge = edge->next;
2370 while (eclose)
2372 struct rx_possible_future *ec = new_close++;
2373 struct rx_hash_item * sp;
2374 struct rx_se_list ** sepos;
2375 struct rx_se_list * sesrc;
2376 struct rx_nfa_state_set * destlst;
2377 struct rx_nfa_state_set ** destpos;
2378 ec->next = cn->futures;
2379 cn->futures = ec;
2380 for (sepos = &ec->effects, sesrc = eclose->effects;
2381 sesrc;
2382 sesrc = sesrc->cdr, sepos = &(*sepos)->cdr)
2384 sp = rx_hash_find (&rx->se_list_memo,
2385 (long)sesrc->car ^ (long)sesrc->cdr,
2386 sesrc, &se_list_hash_rules);
2387 if (sp->binding)
2389 sesrc = (struct rx_se_list *)sp->binding;
2390 break;
2392 *new_se_list = *sesrc;
2393 sp->binding = (void *)new_se_list;
2394 *sepos = new_se_list;
2395 ++new_se_list;
2397 *sepos = sesrc;
2398 for (destpos = &ec->destset, destlst = eclose->destset;
2399 destlst;
2400 destpos = &(*destpos)->cdr, destlst = destlst->cdr)
2402 sp = rx_hash_find (&rx->set_list_memo,
2403 ((((long)destlst->car) >> 8)
2404 ^ (long)destlst->cdr),
2405 destlst, &nfa_set_hash_rules);
2406 if (sp->binding)
2408 destlst = (struct rx_nfa_state_set *)sp->binding;
2409 break;
2411 *new_nfa_set = *destlst;
2412 new_nfa_set->car = state_base + destlst->car->id;
2413 sp->binding = (void *)new_nfa_set;
2414 *destpos = new_nfa_set;
2415 ++new_nfa_set;
2417 *destpos = destlst;
2418 eclose = eclose->next;
2422 rx_free_hash_table (&rx->se_list_memo, se_memo_freer, &se_list_hash_rules);
2423 bzero (&rx->se_list_memo, sizeof (rx->se_list_memo));
2424 rx_free_hash_table (&rx->set_list_memo, nfa_set_freer, &nfa_set_hash_rules);
2425 bzero (&rx->set_list_memo, sizeof (rx->set_list_memo));
2427 rx_free_nfa (rx);
2428 rx->nfa_states = (struct rx_nfa_state *)*mem;
2429 return 1;
2433 /* The functions in the next several pages define the lazy-NFA-conversion used
2434 * by matchers. The input to this construction is an NFA such as
2435 * is built by compactify_nfa (rx.c). The output is the superNFA.
2438 /* Match engines can use arbitrary values for opcodes. So, the parse tree
2439 * is built using instructions names (enum rx_opcode), but the superstate
2440 * nfa is populated with mystery opcodes (void *).
2442 * For convenience, here is an id table. The opcodes are == to their inxs
2444 * The lables in re_search_2 would make good values for instructions.
2447 void * rx_id_instruction_table[rx_num_instructions] =
2449 (void *) rx_backtrack_point,
2450 (void *) rx_do_side_effects,
2451 (void *) rx_cache_miss,
2452 (void *) rx_next_char,
2453 (void *) rx_backtrack,
2454 (void *) rx_error_inx
2459 /* Memory mgt. for superstate graphs. */
2461 #ifdef __STDC__
2462 static char *
2463 rx_cache_malloc (struct rx_cache * cache, int bytes)
2464 #else
2465 static char *
2466 rx_cache_malloc (cache, bytes)
2467 struct rx_cache * cache;
2468 int bytes;
2469 #endif
2471 while (cache->bytes_left < bytes)
2473 if (cache->memory_pos)
2474 cache->memory_pos = cache->memory_pos->next;
2475 if (!cache->memory_pos)
2477 cache->morecore (cache);
2478 if (!cache->memory_pos)
2479 return 0;
2481 cache->bytes_left = cache->memory_pos->bytes;
2482 cache->memory_addr = ((char *)cache->memory_pos
2483 + sizeof (struct rx_blocklist));
2485 cache->bytes_left -= bytes;
2487 char * addr = cache->memory_addr;
2488 cache->memory_addr += bytes;
2489 return addr;
2493 #ifdef __STDC__
2494 static void
2495 rx_cache_free (struct rx_cache * cache,
2496 struct rx_freelist ** freelist, char * mem)
2497 #else
2498 static void
2499 rx_cache_free (cache, freelist, mem)
2500 struct rx_cache * cache;
2501 struct rx_freelist ** freelist;
2502 char * mem;
2503 #endif
2505 struct rx_freelist * it = (struct rx_freelist *)mem;
2506 it->next = *freelist;
2507 *freelist = it;
2511 /* The partially instantiated superstate graph has a transition
2512 * table at every node. There is one entry for every character.
2513 * This fills in the transition for a set.
2515 #ifdef __STDC__
2516 static void
2517 install_transition (struct rx_superstate *super,
2518 struct rx_inx *answer, rx_Bitset trcset)
2519 #else
2520 static void
2521 install_transition (super, answer, trcset)
2522 struct rx_superstate *super;
2523 struct rx_inx *answer;
2524 rx_Bitset trcset;
2525 #endif
2527 struct rx_inx * transitions = super->transitions;
2528 int chr;
2529 for (chr = 0; chr < 256; )
2530 if (!*trcset)
2532 ++trcset;
2533 chr += 32;
2535 else
2537 RX_subset sub = *trcset;
2538 RX_subset mask = 1;
2539 int bound = chr + 32;
2540 while (chr < bound)
2542 if (sub & mask)
2543 transitions [chr] = *answer;
2544 ++chr;
2545 mask <<= 1;
2547 ++trcset;
2552 #ifdef __STDC__
2553 static int
2554 qlen (struct rx_superstate * q)
2555 #else
2556 static int
2557 qlen (q)
2558 struct rx_superstate * q;
2559 #endif
2561 int count = 1;
2562 struct rx_superstate * it;
2563 if (!q)
2564 return 0;
2565 for (it = q->next_recyclable; it != q; it = it->next_recyclable)
2566 ++count;
2567 return count;
2570 #ifdef __STDC__
2571 static void
2572 check_cache (struct rx_cache * cache)
2573 #else
2574 static void
2575 check_cache (cache)
2576 struct rx_cache * cache;
2577 #endif
2579 struct rx_cache * you_fucked_up = 0;
2580 int total = cache->superstates;
2581 int semi = cache->semifree_superstates;
2582 if (semi != qlen (cache->semifree_superstate))
2583 check_cache (you_fucked_up);
2584 if ((total - semi) != qlen (cache->lru_superstate))
2585 check_cache (you_fucked_up);
2588 /* When a superstate is old and neglected, it can enter a
2589 * semi-free state. A semi-free state is slated to die.
2590 * Incoming transitions to a semi-free state are re-written
2591 * to cause an (interpreted) fault when they are taken.
2592 * The fault handler revives the semi-free state, patches
2593 * incoming transitions back to normal, and continues.
2595 * The idea is basicly to free in two stages, aborting
2596 * between the two if the state turns out to be useful again.
2597 * When a free is aborted, the rescued superstate is placed
2598 * in the most-favored slot to maximize the time until it
2599 * is next semi-freed.
2602 #ifdef __STDC__
2603 static void
2604 semifree_superstate (struct rx_cache * cache)
2605 #else
2606 static void
2607 semifree_superstate (cache)
2608 struct rx_cache * cache;
2609 #endif
2611 int disqualified = cache->semifree_superstates;
2612 if (disqualified == cache->superstates)
2613 return;
2614 while (cache->lru_superstate->locks)
2616 cache->lru_superstate = cache->lru_superstate->next_recyclable;
2617 ++disqualified;
2618 if (disqualified == cache->superstates)
2619 return;
2622 struct rx_superstate * it = cache->lru_superstate;
2623 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2624 it->prev_recyclable->next_recyclable = it->next_recyclable;
2625 cache->lru_superstate = (it == it->next_recyclable
2627 : it->next_recyclable);
2628 if (!cache->semifree_superstate)
2630 cache->semifree_superstate = it;
2631 it->next_recyclable = it;
2632 it->prev_recyclable = it;
2634 else
2636 it->prev_recyclable = cache->semifree_superstate->prev_recyclable;
2637 it->next_recyclable = cache->semifree_superstate;
2638 it->prev_recyclable->next_recyclable = it;
2639 it->next_recyclable->prev_recyclable = it;
2642 struct rx_distinct_future *df;
2643 it->is_semifree = 1;
2644 ++cache->semifree_superstates;
2645 df = it->transition_refs;
2646 if (df)
2648 df->prev_same_dest->next_same_dest = 0;
2649 for (df = it->transition_refs; df; df = df->next_same_dest)
2651 df->future_frame.inx = cache->instruction_table[rx_cache_miss];
2652 df->future_frame.data = 0;
2653 df->future_frame.data_2 = (void *) df;
2654 /* If there are any NEXT-CHAR instruction frames that
2655 * refer to this state, we convert them to CACHE-MISS frames.
2657 if (!df->effects
2658 && (df->edge->options->next_same_super_edge[0]
2659 == df->edge->options))
2660 install_transition (df->present, &df->future_frame,
2661 df->edge->cset);
2663 df = it->transition_refs;
2664 df->prev_same_dest->next_same_dest = df;
2671 #ifdef __STDC__
2672 static void
2673 refresh_semifree_superstate (struct rx_cache * cache,
2674 struct rx_superstate * super)
2675 #else
2676 static void
2677 refresh_semifree_superstate (cache, super)
2678 struct rx_cache * cache;
2679 struct rx_superstate * super;
2680 #endif
2682 struct rx_distinct_future *df;
2684 if (super->transition_refs)
2686 super->transition_refs->prev_same_dest->next_same_dest = 0;
2687 for (df = super->transition_refs; df; df = df->next_same_dest)
2689 df->future_frame.inx = cache->instruction_table[rx_next_char];
2690 df->future_frame.data = (void *) super->transitions;
2691 /* CACHE-MISS instruction frames that refer to this state,
2692 * must be converted to NEXT-CHAR frames.
2694 if (!df->effects
2695 && (df->edge->options->next_same_super_edge[0]
2696 == df->edge->options))
2697 install_transition (df->present, &df->future_frame,
2698 df->edge->cset);
2700 super->transition_refs->prev_same_dest->next_same_dest
2701 = super->transition_refs;
2703 if (cache->semifree_superstate == super)
2704 cache->semifree_superstate = (super->prev_recyclable == super
2706 : super->prev_recyclable);
2707 super->next_recyclable->prev_recyclable = super->prev_recyclable;
2708 super->prev_recyclable->next_recyclable = super->next_recyclable;
2710 if (!cache->lru_superstate)
2711 (cache->lru_superstate
2712 = super->next_recyclable
2713 = super->prev_recyclable
2714 = super);
2715 else
2717 super->next_recyclable = cache->lru_superstate;
2718 super->prev_recyclable = cache->lru_superstate->prev_recyclable;
2719 super->next_recyclable->prev_recyclable = super;
2720 super->prev_recyclable->next_recyclable = super;
2722 super->is_semifree = 0;
2723 --cache->semifree_superstates;
2726 #ifdef __STDC__
2727 static void
2728 rx_refresh_this_superstate (struct rx_cache * cache, struct rx_superstate * superstate)
2729 #else
2730 static void
2731 rx_refresh_this_superstate (cache, superstate)
2732 struct rx_cache * cache;
2733 struct rx_superstate * superstate;
2734 #endif
2736 if (superstate->is_semifree)
2737 refresh_semifree_superstate (cache, superstate);
2738 else if (cache->lru_superstate == superstate)
2739 cache->lru_superstate = superstate->next_recyclable;
2740 else if (superstate != cache->lru_superstate->prev_recyclable)
2742 superstate->next_recyclable->prev_recyclable
2743 = superstate->prev_recyclable;
2744 superstate->prev_recyclable->next_recyclable
2745 = superstate->next_recyclable;
2746 superstate->next_recyclable = cache->lru_superstate;
2747 superstate->prev_recyclable = cache->lru_superstate->prev_recyclable;
2748 superstate->next_recyclable->prev_recyclable = superstate;
2749 superstate->prev_recyclable->next_recyclable = superstate;
2753 #ifdef __STDC__
2754 static void
2755 release_superset_low (struct rx_cache * cache,
2756 struct rx_superset *set)
2757 #else
2758 static void
2759 release_superset_low (cache, set)
2760 struct rx_cache * cache;
2761 struct rx_superset *set;
2762 #endif
2764 if (!--set->refs)
2766 if (set->cdr)
2767 release_superset_low (cache, set->cdr);
2769 set->starts_for = 0;
2771 rx_hash_free
2772 (rx_hash_find
2773 (&cache->superset_table,
2774 (unsigned long)set->car ^ set->id ^ (unsigned long)set->cdr,
2775 (void *)set,
2776 &cache->superset_hash_rules),
2777 &cache->superset_hash_rules);
2778 rx_cache_free (cache, &cache->free_supersets, (char *)set);
2782 #ifdef __STDC__
2783 RX_DECL void
2784 rx_release_superset (struct rx *rx,
2785 struct rx_superset *set)
2786 #else
2787 RX_DECL void
2788 rx_release_superset (rx, set)
2789 struct rx *rx;
2790 struct rx_superset *set;
2791 #endif
2793 release_superset_low (rx->cache, set);
2796 /* This tries to add a new superstate to the superstate freelist.
2797 * It might, as a result, free some edge pieces or hash tables.
2798 * If nothing can be freed because too many locks are being held, fail.
2801 #ifdef __STDC__
2802 static int
2803 rx_really_free_superstate (struct rx_cache * cache)
2804 #else
2805 static int
2806 rx_really_free_superstate (cache)
2807 struct rx_cache * cache;
2808 #endif
2810 int locked_superstates = 0;
2811 struct rx_superstate * it;
2813 if (!cache->superstates)
2814 return 0;
2817 /* This is a total guess. The idea is that we should expect as
2818 * many misses as we've recently experienced. I.e., cache->misses
2819 * should be the same as cache->semifree_superstates.
2821 while ((cache->hits + cache->misses) > cache->superstates_allowed)
2823 cache->hits >>= 1;
2824 cache->misses >>= 1;
2826 if ( ((cache->hits + cache->misses) * cache->semifree_superstates)
2827 < (cache->superstates * cache->misses))
2829 semifree_superstate (cache);
2830 semifree_superstate (cache);
2834 while (cache->semifree_superstate && cache->semifree_superstate->locks)
2836 refresh_semifree_superstate (cache, cache->semifree_superstate);
2837 ++locked_superstates;
2838 if (locked_superstates == cache->superstates)
2839 return 0;
2842 if (cache->semifree_superstate)
2844 it = cache->semifree_superstate;
2845 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2846 it->prev_recyclable->next_recyclable = it->next_recyclable;
2847 cache->semifree_superstate = ((it == it->next_recyclable)
2849 : it->next_recyclable);
2850 --cache->semifree_superstates;
2852 else
2854 while (cache->lru_superstate->locks)
2856 cache->lru_superstate = cache->lru_superstate->next_recyclable;
2857 ++locked_superstates;
2858 if (locked_superstates == cache->superstates)
2859 return 0;
2861 it = cache->lru_superstate;
2862 it->next_recyclable->prev_recyclable = it->prev_recyclable;
2863 it->prev_recyclable->next_recyclable = it->next_recyclable;
2864 cache->lru_superstate = ((it == it->next_recyclable)
2866 : it->next_recyclable);
2869 if (it->transition_refs)
2871 struct rx_distinct_future *df;
2872 for (df = it->transition_refs,
2873 df->prev_same_dest->next_same_dest = 0;
2875 df = df->next_same_dest)
2877 df->future_frame.inx = cache->instruction_table[rx_cache_miss];
2878 df->future_frame.data = 0;
2879 df->future_frame.data_2 = (void *) df;
2880 df->future = 0;
2882 it->transition_refs->prev_same_dest->next_same_dest =
2883 it->transition_refs;
2886 struct rx_super_edge *tc = it->edges;
2887 while (tc)
2889 struct rx_distinct_future * df;
2890 struct rx_super_edge *tct = tc->next;
2891 df = tc->options;
2892 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
2893 while (df)
2895 struct rx_distinct_future *dft = df;
2896 df = df->next_same_super_edge[0];
2899 if (dft->future && dft->future->transition_refs == dft)
2901 dft->future->transition_refs = dft->next_same_dest;
2902 if (dft->future->transition_refs == dft)
2903 dft->future->transition_refs = 0;
2905 dft->next_same_dest->prev_same_dest = dft->prev_same_dest;
2906 dft->prev_same_dest->next_same_dest = dft->next_same_dest;
2907 rx_cache_free (cache, &cache->free_discernable_futures,
2908 (char *)dft);
2910 rx_cache_free (cache, &cache->free_transition_classes, (char *)tc);
2911 tc = tct;
2915 if (it->contents->superstate == it)
2916 it->contents->superstate = 0;
2917 release_superset_low (cache, it->contents);
2918 rx_cache_free (cache, &cache->free_superstates, (char *)it);
2919 --cache->superstates;
2920 return 1;
2923 #ifdef __STDC__
2924 static char *
2925 rx_cache_get (struct rx_cache * cache,
2926 struct rx_freelist ** freelist)
2927 #else
2928 static char *
2929 rx_cache_get (cache, freelist)
2930 struct rx_cache * cache;
2931 struct rx_freelist ** freelist;
2932 #endif
2934 while (!*freelist && rx_really_free_superstate (cache))
2936 if (!*freelist)
2937 return 0;
2939 struct rx_freelist * it = *freelist;
2940 *freelist = it->next;
2941 return (char *)it;
2945 #ifdef __STDC__
2946 static char *
2947 rx_cache_malloc_or_get (struct rx_cache * cache,
2948 struct rx_freelist ** freelist, int bytes)
2949 #else
2950 static char *
2951 rx_cache_malloc_or_get (cache, freelist, bytes)
2952 struct rx_cache * cache;
2953 struct rx_freelist ** freelist;
2954 int bytes;
2955 #endif
2957 if (!*freelist)
2959 char * answer = rx_cache_malloc (cache, bytes);
2960 if (answer)
2961 return answer;
2964 return rx_cache_get (cache, freelist);
2967 #ifdef __STDC__
2968 static char *
2969 rx_cache_get_superstate (struct rx_cache * cache)
2970 #else
2971 static char *
2972 rx_cache_get_superstate (cache)
2973 struct rx_cache * cache;
2974 #endif
2976 char * answer;
2977 int bytes = ( sizeof (struct rx_superstate)
2978 + cache->local_cset_size * sizeof (struct rx_inx));
2979 if (!cache->free_superstates
2980 && (cache->superstates < cache->superstates_allowed))
2982 answer = rx_cache_malloc (cache, bytes);
2983 if (answer)
2985 ++cache->superstates;
2986 return answer;
2989 answer = rx_cache_get (cache, &cache->free_superstates);
2990 if (!answer)
2992 answer = rx_cache_malloc (cache, bytes);
2993 if (answer)
2994 ++cache->superstates_allowed;
2996 ++cache->superstates;
2997 return answer;
3002 #ifdef __STDC__
3003 static int
3004 supersetcmp (void * va, void * vb)
3005 #else
3006 static int
3007 supersetcmp (va, vb)
3008 void * va;
3009 void * vb;
3010 #endif
3012 struct rx_superset * a = (struct rx_superset *)va;
3013 struct rx_superset * b = (struct rx_superset *)vb;
3014 return ( (a == b)
3015 || (a && b && (a->car == b->car) && (a->cdr == b->cdr)));
3018 #ifdef __STDC__
3019 static struct rx_hash_item *
3020 superset_allocator (struct rx_hash_rules * rules, void * val)
3021 #else
3022 static struct rx_hash_item *
3023 superset_allocator (rules, val)
3024 struct rx_hash_rules * rules;
3025 void * val;
3026 #endif
3028 struct rx_cache * cache
3029 = ((struct rx_cache *)
3030 ((char *)rules
3031 - (unsigned long)(&((struct rx_cache *)0)->superset_hash_rules)));
3032 struct rx_superset * template = (struct rx_superset *)val;
3033 struct rx_superset * newset
3034 = ((struct rx_superset *)
3035 rx_cache_malloc_or_get (cache,
3036 &cache->free_supersets,
3037 sizeof (*template)));
3038 if (!newset)
3039 return 0;
3040 newset->refs = 0;
3041 newset->car = template->car;
3042 newset->id = template->car->id;
3043 newset->cdr = template->cdr;
3044 newset->superstate = 0;
3045 rx_protect_superset (rx, template->cdr);
3046 newset->hash_item.data = (void *)newset;
3047 newset->hash_item.binding = 0;
3048 return &newset->hash_item;
3051 #ifdef __STDC__
3052 static struct rx_hash *
3053 super_hash_allocator (struct rx_hash_rules * rules)
3054 #else
3055 static struct rx_hash *
3056 super_hash_allocator (rules)
3057 struct rx_hash_rules * rules;
3058 #endif
3060 struct rx_cache * cache
3061 = ((struct rx_cache *)
3062 ((char *)rules
3063 - (unsigned long)(&((struct rx_cache *)0)->superset_hash_rules)));
3064 return ((struct rx_hash *)
3065 rx_cache_malloc_or_get (cache,
3066 &cache->free_hash, sizeof (struct rx_hash)));
3070 #ifdef __STDC__
3071 static void
3072 super_hash_liberator (struct rx_hash * hash, struct rx_hash_rules * rules)
3073 #else
3074 static void
3075 super_hash_liberator (hash, rules)
3076 struct rx_hash * hash;
3077 struct rx_hash_rules * rules;
3078 #endif
3080 struct rx_cache * cache
3081 = ((struct rx_cache *)
3082 (char *)rules - (long)(&((struct rx_cache *)0)->superset_hash_rules));
3083 rx_cache_free (cache, &cache->free_hash, (char *)hash);
3086 #ifdef __STDC__
3087 static void
3088 superset_hash_item_liberator (struct rx_hash_item * it,
3089 struct rx_hash_rules * rules)
3090 #else
3091 static void
3092 superset_hash_item_liberator (it, rules) /* Well, it does ya know. */
3093 struct rx_hash_item * it;
3094 struct rx_hash_rules * rules;
3095 #endif
3099 int rx_cache_bound = 128;
3100 static int rx_default_cache_got = 0;
3102 #ifdef __STDC__
3103 static int
3104 bytes_for_cache_size (int supers, int cset_size)
3105 #else
3106 static int
3107 bytes_for_cache_size (supers, cset_size)
3108 int supers;
3109 int cset_size;
3110 #endif
3112 /* What the hell is this? !!!*/
3113 return (int)
3114 ((float)supers *
3115 ( (1.03 * (float) ( rx_sizeof_bitset (cset_size)
3116 + sizeof (struct rx_super_edge)))
3117 + (1.80 * (float) sizeof (struct rx_possible_future))
3118 + (float) ( sizeof (struct rx_superstate)
3119 + cset_size * sizeof (struct rx_inx))));
3122 #ifdef __STDC__
3123 static void
3124 rx_morecore (struct rx_cache * cache)
3125 #else
3126 static void
3127 rx_morecore (cache)
3128 struct rx_cache * cache;
3129 #endif
3131 if (rx_default_cache_got >= rx_cache_bound)
3132 return;
3134 rx_default_cache_got += 16;
3135 cache->superstates_allowed = rx_cache_bound;
3137 struct rx_blocklist ** pos = &cache->memory;
3138 int size = bytes_for_cache_size (16, cache->local_cset_size);
3139 while (*pos)
3140 pos = &(*pos)->next;
3141 *pos = ((struct rx_blocklist *)
3142 malloc (size + sizeof (struct rx_blocklist)));
3143 if (!*pos)
3144 return;
3146 (*pos)->next = 0;
3147 (*pos)->bytes = size;
3148 cache->memory_pos = *pos;
3149 cache->memory_addr = (char *)*pos + sizeof (**pos);
3150 cache->bytes_left = size;
3154 static struct rx_cache default_cache =
3157 supersetcmp,
3158 super_hash_allocator,
3159 super_hash_liberator,
3160 superset_allocator,
3161 superset_hash_item_liberator,
3167 rx_morecore,
3184 128,
3186 256,
3187 rx_id_instruction_table,
3192 {0},
3193 {0},
3198 /* This adds an element to a superstate set. These sets are lists, such
3199 * that lists with == elements are ==. The empty set is returned by
3200 * superset_cons (rx, 0, 0) and is NOT equivelent to
3201 * (struct rx_superset)0.
3204 #ifdef __STDC__
3205 RX_DECL struct rx_superset *
3206 rx_superset_cons (struct rx * rx,
3207 struct rx_nfa_state *car, struct rx_superset *cdr)
3208 #else
3209 RX_DECL struct rx_superset *
3210 rx_superset_cons (rx, car, cdr)
3211 struct rx * rx;
3212 struct rx_nfa_state *car;
3213 struct rx_superset *cdr;
3214 #endif
3216 struct rx_cache * cache = rx->cache;
3217 if (!car && !cdr)
3219 if (!cache->empty_superset)
3221 cache->empty_superset
3222 = ((struct rx_superset *)
3223 rx_cache_malloc_or_get (cache, &cache->free_supersets,
3224 sizeof (struct rx_superset)));
3225 if (!cache->empty_superset)
3226 return 0;
3227 bzero (cache->empty_superset, sizeof (struct rx_superset));
3228 cache->empty_superset->refs = 1000;
3230 return cache->empty_superset;
3233 struct rx_superset template;
3234 struct rx_hash_item * hit;
3235 template.car = car;
3236 template.cdr = cdr;
3237 template.id = car->id;
3238 hit = rx_hash_store (&cache->superset_table,
3239 (unsigned long)car ^ car->id ^ (unsigned long)cdr,
3240 (void *)&template,
3241 &cache->superset_hash_rules);
3242 return (hit
3243 ? (struct rx_superset *)hit->data
3244 : 0);
3248 /* This computes a union of two NFA state sets. The sets do not have the
3249 * same representation though. One is a RX_SUPERSET structure (part
3250 * of the superstate NFA) and the other is an NFA_STATE_SET (part of the NFA).
3253 #ifdef __STDC__
3254 RX_DECL struct rx_superset *
3255 rx_superstate_eclosure_union
3256 (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl)
3257 #else
3258 RX_DECL struct rx_superset *
3259 rx_superstate_eclosure_union (rx, set, ecl)
3260 struct rx * rx;
3261 struct rx_superset *set;
3262 struct rx_nfa_state_set *ecl;
3263 #endif
3265 if (!ecl)
3266 return set;
3268 if (!set->car)
3269 return rx_superset_cons (rx, ecl->car,
3270 rx_superstate_eclosure_union (rx, set, ecl->cdr));
3271 if (set->car == ecl->car)
3272 return rx_superstate_eclosure_union (rx, set, ecl->cdr);
3275 struct rx_superset * tail;
3276 struct rx_nfa_state * first;
3278 if (set->car > ecl->car)
3280 tail = rx_superstate_eclosure_union (rx, set->cdr, ecl);
3281 first = set->car;
3283 else
3285 tail = rx_superstate_eclosure_union (rx, set, ecl->cdr);
3286 first = ecl->car;
3288 if (!tail)
3289 return 0;
3290 else
3292 struct rx_superset * answer;
3293 answer = rx_superset_cons (rx, first, tail);
3294 if (!answer)
3296 rx_protect_superset (rx, tail);
3297 rx_release_superset (rx, tail);
3298 return 0;
3300 else
3301 return answer;
3310 * This makes sure that a list of rx_distinct_futures contains
3311 * a future for each possible set of side effects in the eclosure
3312 * of a given state. This is some of the work of filling in a
3313 * superstate transition.
3316 #ifdef __STDC__
3317 static struct rx_distinct_future *
3318 include_futures (struct rx *rx,
3319 struct rx_distinct_future *df, struct rx_nfa_state
3320 *state, struct rx_superstate *superstate)
3321 #else
3322 static struct rx_distinct_future *
3323 include_futures (rx, df, state, superstate)
3324 struct rx *rx;
3325 struct rx_distinct_future *df;
3326 struct rx_nfa_state *state;
3327 struct rx_superstate *superstate;
3328 #endif
3330 struct rx_possible_future *future;
3331 struct rx_cache * cache = rx->cache;
3332 for (future = state->futures; future; future = future->next)
3334 struct rx_distinct_future *dfp;
3335 struct rx_distinct_future *insert_before = 0;
3336 if (df)
3337 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3338 for (dfp = df; dfp; dfp = dfp->next_same_super_edge[0])
3339 if (dfp->effects == future->effects)
3340 break;
3341 else
3343 int order = rx->se_list_cmp (rx, dfp->effects, future->effects);
3344 if (order > 0)
3346 insert_before = dfp;
3347 dfp = 0;
3348 break;
3351 if (df)
3352 df->next_same_super_edge[1]->next_same_super_edge[0] = df;
3353 if (!dfp)
3356 = ((struct rx_distinct_future *)
3357 rx_cache_malloc_or_get (cache, &cache->free_discernable_futures,
3358 sizeof (struct rx_distinct_future)));
3359 if (!dfp)
3360 return 0;
3361 if (!df)
3363 df = insert_before = dfp;
3364 df->next_same_super_edge[0] = df->next_same_super_edge[1] = df;
3366 else if (!insert_before)
3367 insert_before = df;
3368 else if (insert_before == df)
3369 df = dfp;
3371 dfp->next_same_super_edge[0] = insert_before;
3372 dfp->next_same_super_edge[1]
3373 = insert_before->next_same_super_edge[1];
3374 dfp->next_same_super_edge[1]->next_same_super_edge[0] = dfp;
3375 dfp->next_same_super_edge[0]->next_same_super_edge[1] = dfp;
3376 dfp->next_same_dest = dfp->prev_same_dest = dfp;
3377 dfp->future = 0;
3378 dfp->present = superstate;
3379 dfp->future_frame.inx = rx->instruction_table[rx_cache_miss];
3380 dfp->future_frame.data = 0;
3381 dfp->future_frame.data_2 = (void *) dfp;
3382 dfp->side_effects_frame.inx
3383 = rx->instruction_table[rx_do_side_effects];
3384 dfp->side_effects_frame.data = 0;
3385 dfp->side_effects_frame.data_2 = (void *) dfp;
3386 dfp->effects = future->effects;
3389 return df;
3394 /* This constructs a new superstate from its state set. The only
3395 * complexity here is memory management.
3397 #ifdef __STDC__
3398 RX_DECL struct rx_superstate *
3399 rx_superstate (struct rx *rx,
3400 struct rx_superset *set)
3401 #else
3402 RX_DECL struct rx_superstate *
3403 rx_superstate (rx, set)
3404 struct rx *rx;
3405 struct rx_superset *set;
3406 #endif
3408 struct rx_cache * cache = rx->cache;
3409 struct rx_superstate * superstate = 0;
3411 /* Does the superstate already exist in the cache? */
3412 if (set->superstate)
3414 if (set->superstate->rx_id != rx->rx_id)
3416 /* Aha. It is in the cache, but belongs to a superstate
3417 * that refers to an NFA that no longer exists.
3418 * (We know it no longer exists because it was evidently
3419 * stored in the same region of memory as the current nfa
3420 * yet it has a different id.)
3422 superstate = set->superstate;
3423 if (!superstate->is_semifree)
3425 if (cache->lru_superstate == superstate)
3427 cache->lru_superstate = superstate->next_recyclable;
3428 if (cache->lru_superstate == superstate)
3429 cache->lru_superstate = 0;
3432 superstate->next_recyclable->prev_recyclable
3433 = superstate->prev_recyclable;
3434 superstate->prev_recyclable->next_recyclable
3435 = superstate->next_recyclable;
3436 if (!cache->semifree_superstate)
3438 (cache->semifree_superstate
3439 = superstate->next_recyclable
3440 = superstate->prev_recyclable
3441 = superstate);
3443 else
3445 superstate->next_recyclable = cache->semifree_superstate;
3446 superstate->prev_recyclable
3447 = cache->semifree_superstate->prev_recyclable;
3448 superstate->next_recyclable->prev_recyclable
3449 = superstate;
3450 superstate->prev_recyclable->next_recyclable
3451 = superstate;
3452 cache->semifree_superstate = superstate;
3454 ++cache->semifree_superstates;
3457 set->superstate = 0;
3458 goto handle_cache_miss;
3460 ++cache->hits;
3461 superstate = set->superstate;
3463 rx_refresh_this_superstate (cache, superstate);
3464 return superstate;
3467 handle_cache_miss:
3469 /* This point reached only for cache misses. */
3470 ++cache->misses;
3471 #if RX_DEBUG
3472 if (rx_debug_trace > 1)
3474 struct rx_superset * setp = set;
3475 fprintf (stderr, "Building a superstet %d(%d): ", rx->rx_id, set);
3476 while (setp)
3478 fprintf (stderr, "%d ", setp->id);
3479 setp = setp->cdr;
3481 fprintf (stderr, "(%d)\n", set);
3483 #endif
3484 superstate = (struct rx_superstate *)rx_cache_get_superstate (cache);
3485 if (!superstate)
3486 return 0;
3488 if (!cache->lru_superstate)
3489 (cache->lru_superstate
3490 = superstate->next_recyclable
3491 = superstate->prev_recyclable
3492 = superstate);
3493 else
3495 superstate->next_recyclable = cache->lru_superstate;
3496 superstate->prev_recyclable = cache->lru_superstate->prev_recyclable;
3497 ( superstate->prev_recyclable->next_recyclable
3498 = superstate->next_recyclable->prev_recyclable
3499 = superstate);
3501 superstate->rx_id = rx->rx_id;
3502 superstate->transition_refs = 0;
3503 superstate->locks = 0;
3504 superstate->is_semifree = 0;
3505 set->superstate = superstate;
3506 superstate->contents = set;
3507 rx_protect_superset (rx, set);
3508 superstate->edges = 0;
3510 int x;
3511 /* None of the transitions from this superstate are known yet. */
3512 for (x = 0; x < rx->local_cset_size; ++x) /* &&&&& 3.8 % */
3514 struct rx_inx * ifr = &superstate->transitions[x];
3515 ifr->inx = rx->instruction_table [rx_cache_miss];
3516 ifr->data = ifr->data_2 = 0;
3519 return superstate;
3523 /* This computes the destination set of one edge of the superstate NFA.
3524 * Note that a RX_DISTINCT_FUTURE is a superstate edge.
3525 * Returns 0 on an allocation failure.
3528 #ifdef __STDC__
3529 static int
3530 solve_destination (struct rx *rx, struct rx_distinct_future *df)
3531 #else
3532 static int
3533 solve_destination (rx, df)
3534 struct rx *rx;
3535 struct rx_distinct_future *df;
3536 #endif
3538 struct rx_super_edge *tc = df->edge;
3539 struct rx_superset *nfa_state;
3540 struct rx_superset *nil_set = rx_superset_cons (rx, 0, 0);
3541 struct rx_superset *solution = nil_set;
3542 struct rx_superstate *dest;
3544 rx_protect_superset (rx, solution);
3545 /* Iterate over all NFA states in the state set of this superstate. */
3546 for (nfa_state = df->present->contents;
3547 nfa_state->car;
3548 nfa_state = nfa_state->cdr)
3550 struct rx_nfa_edge *e;
3551 /* Iterate over all edges of each NFA state. */
3552 for (e = nfa_state->car->edges; e; e = e->next)
3553 /* If we find an edge that is labeled with
3554 * the characters we are solving for.....
3556 if (rx_bitset_is_subset (rx->local_cset_size,
3557 tc->cset, e->params.cset))
3559 struct rx_nfa_state *n = e->dest;
3560 struct rx_possible_future *pf;
3561 /* ....search the partial epsilon closures of the destination
3562 * of that edge for a path that involves the same set of
3563 * side effects we are solving for.
3564 * If we find such a RX_POSSIBLE_FUTURE, we add members to the
3565 * stateset we are computing.
3567 for (pf = n->futures; pf; pf = pf->next)
3568 if (pf->effects == df->effects)
3570 struct rx_superset * old_sol;
3571 old_sol = solution;
3572 solution = rx_superstate_eclosure_union (rx, solution,
3573 pf->destset);
3574 if (!solution)
3575 return 0;
3576 rx_protect_superset (rx, solution);
3577 rx_release_superset (rx, old_sol);
3581 /* It is possible that the RX_DISTINCT_FUTURE we are working on has
3582 * the empty set of NFA states as its definition. In that case, this
3583 * is a failure point.
3585 if (solution == nil_set)
3587 df->future_frame.inx = (void *) rx_backtrack;
3588 df->future_frame.data = 0;
3589 df->future_frame.data_2 = 0;
3590 return 1;
3592 dest = rx_superstate (rx, solution);
3593 rx_release_superset (rx, solution);
3594 if (!dest)
3595 return 0;
3598 struct rx_distinct_future *dft;
3599 dft = df;
3600 df->prev_same_dest->next_same_dest = 0;
3601 while (dft)
3603 dft->future = dest;
3604 dft->future_frame.inx = rx->instruction_table[rx_next_char];
3605 dft->future_frame.data = (void *) dest->transitions;
3606 dft = dft->next_same_dest;
3608 df->prev_same_dest->next_same_dest = df;
3610 if (!dest->transition_refs)
3611 dest->transition_refs = df;
3612 else
3614 struct rx_distinct_future *dft = dest->transition_refs->next_same_dest;
3615 dest->transition_refs->next_same_dest = df->next_same_dest;
3616 df->next_same_dest->prev_same_dest = dest->transition_refs;
3617 df->next_same_dest = dft;
3618 dft->prev_same_dest = df;
3620 return 1;
3624 /* This takes a superstate and a character, and computes some edges
3625 * from the superstate NFA. In particular, this computes all edges
3626 * that lead from SUPERSTATE given CHR. This function also
3627 * computes the set of characters that share this edge set.
3628 * This returns 0 on allocation error.
3629 * The character set and list of edges are returned through
3630 * the paramters CSETOUT and DFOUT.
3631 } */
3633 #ifdef __STDC__
3634 static int
3635 compute_super_edge (struct rx *rx, struct rx_distinct_future **dfout,
3636 rx_Bitset csetout, struct rx_superstate *superstate,
3637 unsigned char chr)
3638 #else
3639 static int
3640 compute_super_edge (rx, dfout, csetout, superstate, chr)
3641 struct rx *rx;
3642 struct rx_distinct_future **dfout;
3643 rx_Bitset csetout;
3644 struct rx_superstate *superstate;
3645 unsigned char chr;
3646 #endif
3648 struct rx_superset *stateset = superstate->contents;
3650 /* To compute the set of characters that share edges with CHR,
3651 * we start with the full character set, and subtract.
3653 rx_bitset_universe (rx->local_cset_size, csetout);
3654 *dfout = 0;
3656 /* Iterate over the NFA states in the superstate state-set. */
3657 while (stateset->car)
3659 struct rx_nfa_edge *e;
3660 for (e = stateset->car->edges; e; e = e->next)
3661 if (RX_bitset_member (e->params.cset, chr))
3663 /* If we find an NFA edge that applies, we make sure there
3664 * are corresponding edges in the superstate NFA.
3667 struct rx_distinct_future * saved;
3668 saved = *dfout;
3669 *dfout = include_futures (rx, *dfout, e->dest, superstate);
3670 if (!*dfout)
3672 struct rx_distinct_future * df;
3673 df = saved;
3674 if (df)
3675 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3676 while (df)
3678 struct rx_distinct_future *dft;
3679 dft = df;
3680 df = df->next_same_super_edge[0];
3682 if (dft->future && dft->future->transition_refs == dft)
3684 dft->future->transition_refs = dft->next_same_dest;
3685 if (dft->future->transition_refs == dft)
3686 dft->future->transition_refs = 0;
3688 dft->next_same_dest->prev_same_dest = dft->prev_same_dest;
3689 dft->prev_same_dest->next_same_dest = dft->next_same_dest;
3690 rx_cache_free (rx->cache,
3691 &rx->cache->free_discernable_futures,
3692 (char *)dft);
3694 return 0;
3697 /* We also trim the character set a bit. */
3698 rx_bitset_intersection (rx->local_cset_size,
3699 csetout, e->params.cset);
3701 else
3702 /* An edge that doesn't apply at least tells us some characters
3703 * that don't share the same edge set as CHR.
3705 rx_bitset_difference (rx->local_cset_size, csetout, e->params.cset);
3706 stateset = stateset->cdr;
3708 return 1;
3712 /* This is a constructor for RX_SUPER_EDGE structures. These are
3713 * wrappers for lists of superstate NFA edges that share character sets labels.
3714 * If a transition class contains more than one rx_distinct_future (superstate
3715 * edge), then it represents a non-determinism in the superstate NFA.
3718 #ifdef __STDC__
3719 static struct rx_super_edge *
3720 rx_super_edge (struct rx *rx,
3721 struct rx_superstate *super, rx_Bitset cset,
3722 struct rx_distinct_future *df)
3723 #else
3724 static struct rx_super_edge *
3725 rx_super_edge (rx, super, cset, df)
3726 struct rx *rx;
3727 struct rx_superstate *super;
3728 rx_Bitset cset;
3729 struct rx_distinct_future *df;
3730 #endif
3732 struct rx_super_edge *tc =
3733 (struct rx_super_edge *)rx_cache_malloc_or_get
3734 (rx->cache, &rx->cache->free_transition_classes,
3735 sizeof (struct rx_super_edge) + rx_sizeof_bitset (rx->local_cset_size));
3737 if (!tc)
3738 return 0;
3739 tc->next = super->edges;
3740 super->edges = tc;
3741 tc->rx_backtrack_frame.inx = rx->instruction_table[rx_backtrack_point];
3742 tc->rx_backtrack_frame.data = 0;
3743 tc->rx_backtrack_frame.data_2 = (void *) tc;
3744 tc->options = df;
3745 tc->cset = (rx_Bitset) ((char *) tc + sizeof (*tc));
3746 rx_bitset_assign (rx->local_cset_size, tc->cset, cset);
3747 if (df)
3749 struct rx_distinct_future * dfp = df;
3750 df->next_same_super_edge[1]->next_same_super_edge[0] = 0;
3751 while (dfp)
3753 dfp->edge = tc;
3754 dfp = dfp->next_same_super_edge[0];
3756 df->next_same_super_edge[1]->next_same_super_edge[0] = df;
3758 return tc;
3762 /* There are three kinds of cache miss. The first occurs when a
3763 * transition is taken that has never been computed during the
3764 * lifetime of the source superstate. That cache miss is handled by
3765 * calling COMPUTE_SUPER_EDGE. The second kind of cache miss
3766 * occurs when the destination superstate of a transition doesn't
3767 * exist. SOLVE_DESTINATION is used to construct the destination superstate.
3768 * Finally, the third kind of cache miss occurs when the destination
3769 * superstate of a transition is in a `semi-free state'. That case is
3770 * handled by UNFREE_SUPERSTATE.
3772 * The function of HANDLE_CACHE_MISS is to figure out which of these
3773 * cases applies.
3776 #ifdef __STDC__
3777 static void
3778 install_partial_transition (struct rx_superstate *super,
3779 struct rx_inx *answer,
3780 RX_subset set, int offset)
3781 #else
3782 static void
3783 install_partial_transition (super, answer, set, offset)
3784 struct rx_superstate *super;
3785 struct rx_inx *answer;
3786 RX_subset set;
3787 int offset;
3788 #endif
3790 int start = offset;
3791 int end = start + 32;
3792 RX_subset pos = 1;
3793 struct rx_inx * transitions = super->transitions;
3795 while (start < end)
3797 if (set & pos)
3798 transitions[start] = *answer;
3799 pos <<= 1;
3800 ++start;
3805 #ifdef __STDC__
3806 RX_DECL struct rx_inx *
3807 rx_handle_cache_miss
3808 (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data)
3809 #else
3810 RX_DECL struct rx_inx *
3811 rx_handle_cache_miss (rx, super, chr, data)
3812 struct rx *rx;
3813 struct rx_superstate *super;
3814 unsigned char chr;
3815 void *data;
3816 #endif
3818 int offset = chr / RX_subset_bits;
3819 struct rx_distinct_future *df = data;
3821 if (!df) /* must be the shared_cache_miss_frame */
3823 /* Perhaps this is just a transition waiting to be filled. */
3824 struct rx_super_edge *tc;
3825 RX_subset mask = rx_subset_singletons [chr % RX_subset_bits];
3827 for (tc = super->edges; tc; tc = tc->next)
3828 if (tc->cset[offset] & mask)
3830 struct rx_inx * answer;
3831 df = tc->options;
3832 answer = ((tc->options->next_same_super_edge[0] != tc->options)
3833 ? &tc->rx_backtrack_frame
3834 : (df->effects
3835 ? &df->side_effects_frame
3836 : &df->future_frame));
3837 install_partial_transition (super, answer,
3838 tc->cset [offset], offset * 32);
3839 return answer;
3841 /* Otherwise, it's a flushed or newly encountered edge. */
3843 char cset_space[1024]; /* this limit is far from unreasonable */
3844 rx_Bitset trcset;
3845 struct rx_inx *answer;
3847 if (rx_sizeof_bitset (rx->local_cset_size) > sizeof (cset_space))
3848 return 0; /* If the arbitrary limit is hit, always fail */
3849 /* cleanly. */
3850 trcset = (rx_Bitset)cset_space;
3851 rx_lock_superstate (rx, super);
3852 if (!compute_super_edge (rx, &df, trcset, super, chr))
3854 rx_unlock_superstate (rx, super);
3855 return 0;
3857 if (!df) /* We just computed the fail transition. */
3859 static struct rx_inx
3860 shared_fail_frame = { 0, 0, (void *)rx_backtrack, 0 };
3861 answer = &shared_fail_frame;
3863 else
3865 tc = rx_super_edge (rx, super, trcset, df);
3866 if (!tc)
3868 rx_unlock_superstate (rx, super);
3869 return 0;
3871 answer = ((tc->options->next_same_super_edge[0] != tc->options)
3872 ? &tc->rx_backtrack_frame
3873 : (df->effects
3874 ? &df->side_effects_frame
3875 : &df->future_frame));
3877 install_partial_transition (super, answer,
3878 trcset[offset], offset * 32);
3879 rx_unlock_superstate (rx, super);
3880 return answer;
3883 else if (df->future) /* A cache miss on an edge with a future? Must be
3884 * a semi-free destination. */
3886 if (df->future->is_semifree)
3887 refresh_semifree_superstate (rx->cache, df->future);
3888 return &df->future_frame;
3890 else
3891 /* no future superstate on an existing edge */
3893 rx_lock_superstate (rx, super);
3894 if (!solve_destination (rx, df))
3896 rx_unlock_superstate (rx, super);
3897 return 0;
3899 if (!df->effects
3900 && (df->edge->options->next_same_super_edge[0] == df->edge->options))
3901 install_partial_transition (super, &df->future_frame,
3902 df->edge->cset[offset], offset * 32);
3903 rx_unlock_superstate (rx, super);
3904 return &df->future_frame;
3911 /* The rest of the code provides a regex.c compatable interface. */
3914 __const__ char *re_error_msg[] =
3916 0, /* REG_NOUT */
3917 "No match", /* REG_NOMATCH */
3918 "Invalid regular expression", /* REG_BADPAT */
3919 "Invalid collation character", /* REG_ECOLLATE */
3920 "Invalid character class name", /* REG_ECTYPE */
3921 "Trailing backslash", /* REG_EESCAPE */
3922 "Invalid back reference", /* REG_ESUBREG */
3923 "Unmatched [ or [^", /* REG_EBRACK */
3924 "Unmatched ( or \\(", /* REG_EPAREN */
3925 "Unmatched \\{", /* REG_EBRACE */
3926 "Invalid content of \\{\\}", /* REG_BADBR */
3927 "Invalid range end", /* REG_ERANGE */
3928 "Memory exhausted", /* REG_ESPACE */
3929 "Invalid preceding regular expression", /* REG_BADRPT */
3930 "Premature end of regular expression", /* REG_EEND */
3931 "Regular expression too big", /* REG_ESIZE */
3932 "Unmatched ) or \\)", /* REG_ERPAREN */
3938 * Macros used while compiling patterns.
3940 * By convention, PEND points just past the end of the uncompiled pattern,
3941 * P points to the read position in the pattern. `translate' is the name
3942 * of the translation table (`TRANSLATE' is the name of a macro that looks
3943 * things up in `translate').
3948 * Fetch the next character in the uncompiled pattern---translating it
3949 * if necessary. *Also cast from a signed character in the constant
3950 * string passed to us by the user to an unsigned char that we can use
3951 * as an array index (in, e.g., `translate').
3953 #define PATFETCH(c) \
3954 do {if (p == pend) return REG_EEND; \
3955 c = (unsigned char) *p++; \
3956 c = translate[c]; \
3957 } while (0)
3960 * Fetch the next character in the uncompiled pattern, with no
3961 * translation.
3963 #define PATFETCH_RAW(c) \
3964 do {if (p == pend) return REG_EEND; \
3965 c = (unsigned char) *p++; \
3966 } while (0)
3968 /* Go backwards one character in the pattern. */
3969 #define PATUNFETCH p--
3972 #define TRANSLATE(d) translate[(unsigned char) (d)]
3974 typedef unsigned regnum_t;
3976 /* Since offsets can go either forwards or backwards, this type needs to
3977 * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
3979 typedef int pattern_offset_t;
3981 typedef struct
3983 struct rexp_node ** top_expression; /* was begalt */
3984 struct rexp_node ** last_expression; /* was laststart */
3985 pattern_offset_t inner_group_offset;
3986 regnum_t regnum;
3987 } compile_stack_elt_t;
3989 typedef struct
3991 compile_stack_elt_t *stack;
3992 unsigned size;
3993 unsigned avail; /* Offset of next open position. */
3994 } compile_stack_type;
3997 #define INIT_COMPILE_STACK_SIZE 32
3999 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
4000 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
4002 /* The next available element. */
4003 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
4006 /* Set the bit for character C in a list. */
4007 #define SET_LIST_BIT(c) \
4008 (b[((unsigned char) (c)) / CHARBITS] \
4009 |= 1 << (((unsigned char) c) % CHARBITS))
4011 /* Get the next unsigned number in the uncompiled pattern. */
4012 #define GET_UNSIGNED_NUMBER(num) \
4013 { if (p != pend) \
4015 PATFETCH (c); \
4016 while (isdigit (c)) \
4018 if (num < 0) \
4019 num = 0; \
4020 num = num * 10 + c - '0'; \
4021 if (p == pend) \
4022 break; \
4023 PATFETCH (c); \
4028 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
4030 #define IS_CHAR_CLASS(string) \
4031 (!strcmp (string, "alpha") || !strcmp (string, "upper") \
4032 || !strcmp (string, "lower") || !strcmp (string, "digit") \
4033 || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \
4034 || !strcmp (string, "space") || !strcmp (string, "print") \
4035 || !strcmp (string, "punct") || !strcmp (string, "graph") \
4036 || !strcmp (string, "cntrl") || !strcmp (string, "blank"))
4039 /* These predicates are used in regex_compile. */
4041 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4042 * after an alternative or a begin-subexpression. We assume there is at
4043 * least one character before the ^.
4046 #ifdef __STDC__
4047 static boolean
4048 at_begline_loc_p (__const__ char *pattern, __const__ char * p, reg_syntax_t syntax)
4049 #else
4050 static boolean
4051 at_begline_loc_p (pattern, p, syntax)
4052 __const__ char *pattern;
4053 __const__ char * p;
4054 reg_syntax_t syntax;
4055 #endif
4057 __const__ char *prev = p - 2;
4058 boolean prev_prev_backslash = ((prev > pattern) && (prev[-1] == '\\'));
4060 return
4062 (/* After a subexpression? */
4063 ((*prev == '(') && ((syntax & RE_NO_BK_PARENS) || prev_prev_backslash))
4065 /* After an alternative? */
4066 ((*prev == '|') && ((syntax & RE_NO_BK_VBAR) || prev_prev_backslash))
4070 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4071 * at least one character after the $, i.e., `P < PEND'.
4074 #ifdef __STDC__
4075 static boolean
4076 at_endline_loc_p (__const__ char *p, __const__ char *pend, int syntax)
4077 #else
4078 static boolean
4079 at_endline_loc_p (p, pend, syntax)
4080 __const__ char *p;
4081 __const__ char *pend;
4082 int syntax;
4083 #endif
4085 __const__ char *next = p;
4086 boolean next_backslash = (*next == '\\');
4087 __const__ char *next_next = (p + 1 < pend) ? (p + 1) : 0;
4089 return
4091 /* Before a subexpression? */
4092 ((syntax & RE_NO_BK_PARENS)
4093 ? (*next == ')')
4094 : (next_backslash && next_next && (*next_next == ')')))
4096 /* Before an alternative? */
4097 ((syntax & RE_NO_BK_VBAR)
4098 ? (*next == '|')
4099 : (next_backslash && next_next && (*next_next == '|')))
4104 unsigned char rx_id_translation[256] =
4106 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
4107 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
4108 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
4109 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
4110 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
4111 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
4112 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
4113 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
4114 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
4115 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
4117 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
4118 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
4119 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
4120 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
4121 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
4122 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
4123 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
4124 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
4125 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
4126 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
4128 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
4129 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
4130 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
4131 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
4132 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
4133 250, 251, 252, 253, 254, 255
4136 /* The compiler keeps an inverted translation table.
4137 * This looks up/inititalize elements.
4138 * VALID is an array of booleans that validate CACHE.
4141 #ifdef __STDC__
4142 static rx_Bitset
4143 inverse_translation (struct re_pattern_buffer * rxb,
4144 char * valid, rx_Bitset cache,
4145 unsigned char * translate, int c)
4146 #else
4147 static rx_Bitset
4148 inverse_translation (rxb, valid, cache, translate, c)
4149 struct re_pattern_buffer * rxb;
4150 char * valid;
4151 rx_Bitset cache;
4152 unsigned char * translate;
4153 int c;
4154 #endif
4156 rx_Bitset cs
4157 = cache + c * rx_bitset_numb_subsets (rxb->rx.local_cset_size);
4159 if (!valid[c])
4161 int x;
4162 int c_tr = TRANSLATE(c);
4163 rx_bitset_null (rxb->rx.local_cset_size, cs);
4164 for (x = 0; x < 256; ++x) /* &&&& 13.37 */
4165 if (TRANSLATE(x) == c_tr)
4166 RX_bitset_enjoin (cs, x);
4167 valid[c] = 1;
4169 return cs;
4175 /* More subroutine declarations and macros for regex_compile. */
4177 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4178 false if it's not. */
4180 #ifdef __STDC__
4181 static boolean
4182 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
4183 #else
4184 static boolean
4185 group_in_compile_stack (compile_stack, regnum)
4186 compile_stack_type compile_stack;
4187 regnum_t regnum;
4188 #endif
4190 int this_element;
4192 for (this_element = compile_stack.avail - 1;
4193 this_element >= 0;
4194 this_element--)
4195 if (compile_stack.stack[this_element].regnum == regnum)
4196 return true;
4198 return false;
4203 * Read the ending character of a range (in a bracket expression) from the
4204 * uncompiled pattern *P_PTR (which ends at PEND). We assume the
4205 * starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4206 * Then we set the translation of all bits between the starting and
4207 * ending characters (inclusive) in the compiled pattern B.
4209 * Return an error code.
4211 * We use these short variable names so we can use the same macros as
4212 * `regex_compile' itself.
4215 #ifdef __STDC__
4216 static reg_errcode_t
4217 compile_range (struct re_pattern_buffer * rxb, rx_Bitset cs,
4218 __const__ char ** p_ptr, __const__ char * pend,
4219 unsigned char * translate, reg_syntax_t syntax,
4220 rx_Bitset inv_tr, char * valid_inv_tr)
4221 #else
4222 static reg_errcode_t
4223 compile_range (rxb, cs, p_ptr, pend, translate, syntax, inv_tr, valid_inv_tr)
4224 struct re_pattern_buffer * rxb;
4225 rx_Bitset cs;
4226 __const__ char ** p_ptr;
4227 __const__ char * pend;
4228 unsigned char * translate;
4229 reg_syntax_t syntax;
4230 rx_Bitset inv_tr;
4231 char * valid_inv_tr;
4232 #endif
4234 unsigned this_char;
4236 __const__ char *p = *p_ptr;
4238 unsigned char range_end;
4239 unsigned char range_start = TRANSLATE(p[-2]);
4241 if (p == pend)
4242 return REG_ERANGE;
4244 PATFETCH (range_end);
4246 (*p_ptr)++;
4248 if (range_start > range_end)
4249 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4251 for (this_char = range_start; this_char <= range_end; this_char++)
4253 rx_Bitset it =
4254 inverse_translation (rxb, valid_inv_tr, inv_tr, translate, this_char);
4255 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
4258 return REG_NOERROR;
4262 /* This searches a regexp for backreference side effects.
4263 * It fills in the array OUT with 1 at the index of every register pair
4264 * referenced by a backreference.
4266 * This is used to help optimize patterns for searching. The information is
4267 * useful because, if the caller doesn't want register values, backreferenced
4268 * registers are the only registers for which we need rx_backtrack.
4271 #ifdef __STDC__
4272 static void
4273 find_backrefs (char * out, struct rexp_node * rexp,
4274 struct re_se_params * params)
4275 #else
4276 static void
4277 find_backrefs (out, rexp, params)
4278 char * out;
4279 struct rexp_node * rexp;
4280 struct re_se_params * params;
4281 #endif
4283 if (rexp)
4284 switch (rexp->type)
4286 case r_cset:
4287 case r_data:
4288 return;
4289 case r_alternate:
4290 case r_concat:
4291 case r_opt:
4292 case r_star:
4293 case r_2phase_star:
4294 find_backrefs (out, rexp->params.pair.left, params);
4295 find_backrefs (out, rexp->params.pair.right, params);
4296 return;
4297 case r_side_effect:
4298 if ( ((long)rexp->params.side_effect >= 0)
4299 && (params [(long)rexp->params.side_effect].se == re_se_backref))
4300 out[ params [(long)rexp->params.side_effect].op1] = 1;
4301 return;
4307 /* Returns 0 unless the pattern can match the empty string. */
4309 #ifdef __STDC__
4310 static int
4311 compute_fastset (struct re_pattern_buffer * rxb, struct rexp_node * rexp)
4312 #else
4313 static int
4314 compute_fastset (rxb, rexp)
4315 struct re_pattern_buffer * rxb;
4316 struct rexp_node * rexp;
4317 #endif
4319 if (!rexp)
4320 return 1;
4321 switch (rexp->type)
4323 case r_data:
4324 return 1;
4325 case r_cset:
4327 rx_bitset_union (rxb->rx.local_cset_size,
4328 rxb->fastset, rexp->params.cset);
4330 return 0;
4331 case r_concat:
4332 return (compute_fastset (rxb, rexp->params.pair.left)
4333 && compute_fastset (rxb, rexp->params.pair.right));
4334 case r_2phase_star:
4335 compute_fastset (rxb, rexp->params.pair.left);
4336 /* compute_fastset (rxb, rexp->params.pair.right); nope... */
4337 return 1;
4338 case r_alternate:
4339 return !!(compute_fastset (rxb, rexp->params.pair.left)
4340 + compute_fastset (rxb, rexp->params.pair.right));
4341 case r_opt:
4342 case r_star:
4343 compute_fastset (rxb, rexp->params.pair.left);
4344 return 1;
4345 case r_side_effect:
4346 return 1;
4349 /* this should never happen */
4350 return 0;
4354 /* returns
4355 * 1 -- yes, definately anchored by the given side effect.
4356 * 2 -- maybe anchored, maybe the empty string.
4357 * 0 -- definately not anchored
4358 * There is simply no other possibility.
4361 #ifdef __STDC__
4362 static int
4363 is_anchored (struct rexp_node * rexp, rx_side_effect se)
4364 #else
4365 static int
4366 is_anchored (rexp, se)
4367 struct rexp_node * rexp;
4368 rx_side_effect se;
4369 #endif
4371 if (!rexp)
4372 return 2;
4373 switch (rexp->type)
4375 case r_cset:
4376 case r_data:
4377 return 0;
4378 case r_concat:
4379 case r_2phase_star:
4381 int l = is_anchored (rexp->params.pair.left, se);
4382 return (l == 2 ? is_anchored (rexp->params.pair.right, se) : l);
4384 case r_alternate:
4386 int l = is_anchored (rexp->params.pair.left, se);
4387 int r = l ? is_anchored (rexp->params.pair.right, se) : 0;
4389 if (l == r)
4390 return l;
4391 else if ((l == 0) || (r == 0))
4392 return 0;
4393 else
4394 return 2;
4396 case r_opt:
4397 case r_star:
4398 return is_anchored (rexp->params.pair.left, se) ? 2 : 0;
4400 case r_side_effect:
4401 return ((rexp->params.side_effect == se)
4402 ? 1 : 2);
4405 /* this should never happen */
4406 return 0;
4410 /* This removes register assignments that aren't required by backreferencing.
4411 * This can speed up explore_future, especially if it eliminates
4412 * non-determinism in the superstate NFA.
4414 * NEEDED is an array of characters, presumably filled in by FIND_BACKREFS.
4415 * The non-zero elements of the array indicate which register assignments
4416 * can NOT be removed from the expression.
4419 #ifdef __STDC__
4420 static struct rexp_node *
4421 remove_unecessary_side_effects (struct rx * rx, char * needed,
4422 struct rexp_node * rexp,
4423 struct re_se_params * params)
4424 #else
4425 static struct rexp_node *
4426 remove_unecessary_side_effects (rx, needed, rexp, params)
4427 struct rx * rx;
4428 char * needed;
4429 struct rexp_node * rexp;
4430 struct re_se_params * params;
4431 #endif
4433 struct rexp_node * l;
4434 struct rexp_node * r;
4435 if (!rexp)
4436 return 0;
4437 else
4438 switch (rexp->type)
4440 case r_cset:
4441 case r_data:
4442 return rexp;
4443 case r_alternate:
4444 case r_concat:
4445 case r_2phase_star:
4446 l = remove_unecessary_side_effects (rx, needed,
4447 rexp->params.pair.left, params);
4448 r = remove_unecessary_side_effects (rx, needed,
4449 rexp->params.pair.right, params);
4450 if ((l && r) || (rexp->type != r_concat))
4452 rexp->params.pair.left = l;
4453 rexp->params.pair.right = r;
4454 return rexp;
4456 else
4458 rexp->params.pair.left = rexp->params.pair.right = 0;
4459 rx_free_rexp (rx, rexp);
4460 return l ? l : r;
4462 case r_opt:
4463 case r_star:
4464 l = remove_unecessary_side_effects (rx, needed,
4465 rexp->params.pair.left, params);
4466 if (l)
4468 rexp->params.pair.left = l;
4469 return rexp;
4471 else
4473 rexp->params.pair.left = 0;
4474 rx_free_rexp (rx, rexp);
4475 return 0;
4477 case r_side_effect:
4479 int se = (long)rexp->params.side_effect;
4480 if ( (se >= 0)
4481 && ( ((enum re_side_effects)params[se].se == re_se_lparen)
4482 || ((enum re_side_effects)params[se].se == re_se_rparen))
4483 && (params [se].op1 > 0)
4484 && (!needed [params [se].op1]))
4486 rx_free_rexp (rx, rexp);
4487 return 0;
4489 else
4490 return rexp;
4494 /* this should never happen */
4495 return 0;
4500 #ifdef __STDC__
4501 static int
4502 pointless_if_repeated (struct rexp_node * node, struct re_se_params * params)
4503 #else
4504 static int
4505 pointless_if_repeated (node, params)
4506 struct rexp_node * node;
4507 struct re_se_params * params;
4508 #endif
4510 if (!node)
4511 return 1;
4512 switch (node->type)
4514 case r_cset:
4515 return 0;
4516 case r_alternate:
4517 case r_concat:
4518 case r_2phase_star:
4519 return (pointless_if_repeated (node->params.pair.left, params)
4520 && pointless_if_repeated (node->params.pair.right, params));
4521 case r_opt:
4522 case r_star:
4523 return pointless_if_repeated (node->params.pair.left, params);
4524 case r_side_effect:
4525 switch (((long)node->params.side_effect < 0)
4526 ? (enum re_side_effects)node->params.side_effect
4527 : (enum re_side_effects)params[(long)node->params.side_effect].se)
4529 case re_se_try:
4530 case re_se_at_dot:
4531 case re_se_begbuf:
4532 case re_se_hat:
4533 case re_se_wordbeg:
4534 case re_se_wordbound:
4535 case re_se_notwordbound:
4536 case re_se_wordend:
4537 case re_se_endbuf:
4538 case re_se_dollar:
4539 case re_se_fail:
4540 case re_se_win:
4541 return 1;
4542 case re_se_lparen:
4543 case re_se_rparen:
4544 case re_se_iter:
4545 case re_se_end_iter:
4546 case re_se_syntax:
4547 case re_se_not_syntax:
4548 case re_se_backref:
4549 return 0;
4551 case r_data:
4552 default:
4553 return 0;
4559 #ifdef __STDC__
4560 static int
4561 registers_on_stack (struct re_pattern_buffer * rxb,
4562 struct rexp_node * rexp, int in_danger,
4563 struct re_se_params * params)
4564 #else
4565 static int
4566 registers_on_stack (rxb, rexp, in_danger, params)
4567 struct re_pattern_buffer * rxb;
4568 struct rexp_node * rexp;
4569 int in_danger;
4570 struct re_se_params * params;
4571 #endif
4573 if (!rexp)
4574 return 0;
4575 else
4576 switch (rexp->type)
4578 case r_cset:
4579 case r_data:
4580 return 0;
4581 case r_alternate:
4582 case r_concat:
4583 return ( registers_on_stack (rxb, rexp->params.pair.left,
4584 in_danger, params)
4585 || (registers_on_stack
4586 (rxb, rexp->params.pair.right,
4587 in_danger, params)));
4588 case r_opt:
4589 return registers_on_stack (rxb, rexp->params.pair.left, 0, params);
4590 case r_star:
4591 return registers_on_stack (rxb, rexp->params.pair.left, 1, params);
4592 case r_2phase_star:
4593 return
4594 ( registers_on_stack (rxb, rexp->params.pair.left, 1, params)
4595 || registers_on_stack (rxb, rexp->params.pair.right, 1, params));
4596 case r_side_effect:
4598 int se = (long)rexp->params.side_effect;
4599 if ( in_danger
4600 && (se >= 0)
4601 && (params [se].op1 > 0)
4602 && ( ((enum re_side_effects)params[se].se == re_se_lparen)
4603 || ((enum re_side_effects)params[se].se == re_se_rparen)))
4604 return 1;
4605 else
4606 return 0;
4610 /* this should never happen */
4611 return 0;
4616 static char idempotent_complex_se[] =
4618 #define RX_WANT_SE_DEFS 1
4619 #undef RX_DEF_SE
4620 #undef RX_DEF_CPLX_SE
4621 #define RX_DEF_SE(IDEM, NAME, VALUE)
4622 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) IDEM,
4623 #include "rx.h"
4624 #undef RX_DEF_SE
4625 #undef RX_DEF_CPLX_SE
4626 #undef RX_WANT_SE_DEFS
4630 static char idempotent_se[] =
4633 #define RX_WANT_SE_DEFS 1
4634 #undef RX_DEF_SE
4635 #undef RX_DEF_CPLX_SE
4636 #define RX_DEF_SE(IDEM, NAME, VALUE) IDEM,
4637 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE)
4638 #include "rx.h"
4639 #undef RX_DEF_SE
4640 #undef RX_DEF_CPLX_SE
4641 #undef RX_WANT_SE_DEFS
4648 #ifdef __STDC__
4649 static int
4650 has_any_se (struct rx * rx,
4651 struct rexp_node * rexp)
4652 #else
4653 static int
4654 has_any_se (rx, rexp)
4655 struct rx * rx;
4656 struct rexp_node * rexp;
4657 #endif
4659 if (!rexp)
4660 return 0;
4662 switch (rexp->type)
4664 case r_cset:
4665 case r_data:
4666 return 0;
4668 case r_side_effect:
4669 return 1;
4671 case r_2phase_star:
4672 case r_concat:
4673 case r_alternate:
4674 return
4675 ( has_any_se (rx, rexp->params.pair.left)
4676 || has_any_se (rx, rexp->params.pair.right));
4678 case r_opt:
4679 case r_star:
4680 return has_any_se (rx, rexp->params.pair.left);
4683 /* this should never happen */
4684 return 0;
4689 /* This must be called AFTER `convert_hard_loops' for a given REXP. */
4690 #ifdef __STDC__
4691 static int
4692 has_non_idempotent_epsilon_path (struct rx * rx,
4693 struct rexp_node * rexp,
4694 struct re_se_params * params)
4695 #else
4696 static int
4697 has_non_idempotent_epsilon_path (rx, rexp, params)
4698 struct rx * rx;
4699 struct rexp_node * rexp;
4700 struct re_se_params * params;
4701 #endif
4703 if (!rexp)
4704 return 0;
4706 switch (rexp->type)
4708 case r_cset:
4709 case r_data:
4710 case r_star:
4711 return 0;
4713 case r_side_effect:
4714 return
4715 !((long)rexp->params.side_effect > 0
4716 ? idempotent_complex_se [ params [(long)rexp->params.side_effect].se ]
4717 : idempotent_se [-(long)rexp->params.side_effect]);
4719 case r_alternate:
4720 return
4721 ( has_non_idempotent_epsilon_path (rx,
4722 rexp->params.pair.left, params)
4723 || has_non_idempotent_epsilon_path (rx,
4724 rexp->params.pair.right, params));
4726 case r_2phase_star:
4727 case r_concat:
4728 return
4729 ( has_non_idempotent_epsilon_path (rx,
4730 rexp->params.pair.left, params)
4731 && has_non_idempotent_epsilon_path (rx,
4732 rexp->params.pair.right, params));
4734 case r_opt:
4735 return has_non_idempotent_epsilon_path (rx,
4736 rexp->params.pair.left, params);
4739 /* this should never happen */
4740 return 0;
4745 /* This computes rougly what it's name suggests. It can (and does) go wrong
4746 * in the direction of returning spurious 0 without causing disasters.
4748 #ifdef __STDC__
4749 static int
4750 begins_with_complex_se (struct rx * rx, struct rexp_node * rexp)
4751 #else
4752 static int
4753 begins_with_complex_se (rx, rexp)
4754 struct rx * rx;
4755 struct rexp_node * rexp;
4756 #endif
4758 if (!rexp)
4759 return 0;
4761 switch (rexp->type)
4763 case r_cset:
4764 case r_data:
4765 return 0;
4767 case r_side_effect:
4768 return ((long)rexp->params.side_effect >= 0);
4770 case r_alternate:
4771 return
4772 ( begins_with_complex_se (rx, rexp->params.pair.left)
4773 && begins_with_complex_se (rx, rexp->params.pair.right));
4776 case r_concat:
4777 return has_any_se (rx, rexp->params.pair.left);
4778 case r_opt:
4779 case r_star:
4780 case r_2phase_star:
4781 return 0;
4784 /* this should never happen */
4785 return 0;
4789 /* This destructively removes some of the re_se_tv side effects from
4790 * a rexp tree. In particular, during parsing re_se_tv was inserted on the
4791 * right half of every | to guarantee that posix path preference could be
4792 * honored. This function removes some which it can be determined aren't
4793 * needed.
4796 #ifdef __STDC__
4797 static void
4798 speed_up_alt (struct rx * rx,
4799 struct rexp_node * rexp,
4800 int unposix)
4801 #else
4802 static void
4803 speed_up_alt (rx, rexp, unposix)
4804 struct rx * rx;
4805 struct rexp_node * rexp;
4806 int unposix;
4807 #endif
4809 if (!rexp)
4810 return;
4812 switch (rexp->type)
4814 case r_cset:
4815 case r_data:
4816 case r_side_effect:
4817 return;
4819 case r_opt:
4820 case r_star:
4821 speed_up_alt (rx, rexp->params.pair.left, unposix);
4822 return;
4824 case r_2phase_star:
4825 case r_concat:
4826 speed_up_alt (rx, rexp->params.pair.left, unposix);
4827 speed_up_alt (rx, rexp->params.pair.right, unposix);
4828 return;
4830 case r_alternate:
4831 /* the right child is guaranteed to be (concat re_se_tv <subexp>) */
4833 speed_up_alt (rx, rexp->params.pair.left, unposix);
4834 speed_up_alt (rx, rexp->params.pair.right->params.pair.right, unposix);
4836 if ( unposix
4837 || (begins_with_complex_se
4838 (rx, rexp->params.pair.right->params.pair.right))
4839 || !( has_any_se (rx, rexp->params.pair.right->params.pair.right)
4840 || has_any_se (rx, rexp->params.pair.left)))
4842 struct rexp_node * conc = rexp->params.pair.right;
4843 rexp->params.pair.right = conc->params.pair.right;
4844 conc->params.pair.right = 0;
4845 rx_free_rexp (rx, conc);
4854 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
4855 Returns one of error codes defined in `regex.h', or zero for success.
4857 Assumes the `allocated' (and perhaps `buffer') and `translate'
4858 fields are set in BUFP on entry.
4860 If it succeeds, results are put in BUFP (if it returns an error, the
4861 contents of BUFP are undefined):
4862 `buffer' is the compiled pattern;
4863 `syntax' is set to SYNTAX;
4864 `used' is set to the length of the compiled pattern;
4865 `fastmap_accurate' is set to zero;
4866 `re_nsub' is set to the number of groups in PATTERN;
4867 `not_bol' and `not_eol' are set to zero.
4869 The `fastmap' and `newline_anchor' fields are neither
4870 examined nor set. */
4874 #ifdef __STDC__
4875 RX_DECL reg_errcode_t
4876 rx_compile (__const__ char *pattern, int size,
4877 reg_syntax_t syntax,
4878 struct re_pattern_buffer * rxb)
4879 #else
4880 RX_DECL reg_errcode_t
4881 rx_compile (pattern, size, syntax, rxb)
4882 __const__ char *pattern;
4883 int size;
4884 reg_syntax_t syntax;
4885 struct re_pattern_buffer * rxb;
4886 #endif
4888 RX_subset
4889 inverse_translate [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
4890 char
4891 validate_inv_tr [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
4893 /* We fetch characters from PATTERN here. Even though PATTERN is
4894 `char *' (i.e., signed), we declare these variables as unsigned, so
4895 they can be reliably used as array indices. */
4896 register unsigned char c, c1;
4898 /* A random tempory spot in PATTERN. */
4899 __const__ char *p1;
4901 /* Keeps track of unclosed groups. */
4902 compile_stack_type compile_stack;
4904 /* Points to the current (ending) position in the pattern. */
4905 __const__ char *p = pattern;
4906 __const__ char *pend = pattern + size;
4908 /* How to translate the characters in the pattern. */
4909 unsigned char *translate = (rxb->translate
4910 ? rxb->translate
4911 : rx_id_translation);
4913 /* When parsing is done, this will hold the expression tree. */
4914 struct rexp_node * rexp = 0;
4916 /* In the midst of compilation, this holds onto the regexp
4917 * first parst while rexp goes on to aquire additional constructs.
4919 struct rexp_node * orig_rexp = 0;
4920 struct rexp_node * fewer_side_effects = 0;
4922 /* This and top_expression are saved on the compile stack. */
4923 struct rexp_node ** top_expression = &rexp;
4924 struct rexp_node ** last_expression = top_expression;
4926 /* Parameter to `goto append_node' */
4927 struct rexp_node * append;
4929 /* Counts open-groups as they are encountered. This is the index of the
4930 * innermost group being compiled.
4932 regnum_t regnum = 0;
4934 /* Place in the uncompiled pattern (i.e., the {) to
4935 * which to go back if the interval is invalid.
4937 __const__ char *beg_interval;
4939 struct re_se_params * params = 0;
4940 int paramc = 0; /* How many complex side effects so far? */
4942 rx_side_effect side; /* param to `goto add_side_effect' */
4944 bzero (validate_inv_tr, sizeof (validate_inv_tr));
4946 rxb->rx.instruction_table = rx_id_instruction_table;
4949 /* Initialize the compile stack. */
4950 compile_stack.stack = (( compile_stack_elt_t *) malloc ((INIT_COMPILE_STACK_SIZE) * sizeof ( compile_stack_elt_t)));
4951 if (compile_stack.stack == 0)
4952 return REG_ESPACE;
4954 compile_stack.size = INIT_COMPILE_STACK_SIZE;
4955 compile_stack.avail = 0;
4957 /* Initialize the pattern buffer. */
4958 rxb->rx.cache = &default_cache;
4959 rxb->syntax = syntax;
4960 rxb->fastmap_accurate = 0;
4961 rxb->not_bol = rxb->not_eol = 0;
4962 rxb->least_subs = 0;
4964 /* Always count groups, whether or not rxb->no_sub is set.
4965 * The whole pattern is implicitly group 0, so counting begins
4966 * with 1.
4968 rxb->re_nsub = 0;
4970 #if !defined (emacs) && !defined (SYNTAX)
4971 /* Initialize the syntax table. */
4972 init_syntax_once ();
4973 #endif
4975 /* Loop through the uncompiled pattern until we're at the end. */
4976 while (p != pend)
4978 PATFETCH (c);
4980 switch (c)
4982 case '^':
4984 if ( /* If at start of pattern, it's an operator. */
4985 p == pattern + 1
4986 /* If context independent, it's an operator. */
4987 || syntax & RE_CONTEXT_INDEP_ANCHORS
4988 /* Otherwise, depends on what's come before. */
4989 || at_begline_loc_p (pattern, p, syntax))
4991 struct rexp_node * n
4992 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)re_se_hat);
4993 if (!n)
4994 return REG_ESPACE;
4995 append = n;
4996 goto append_node;
4998 else
4999 goto normal_char;
5001 break;
5004 case '$':
5006 if ( /* If at end of pattern, it's an operator. */
5007 p == pend
5008 /* If context independent, it's an operator. */
5009 || syntax & RE_CONTEXT_INDEP_ANCHORS
5010 /* Otherwise, depends on what's next. */
5011 || at_endline_loc_p (p, pend, syntax))
5013 struct rexp_node * n
5014 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)re_se_dollar);
5015 if (!n)
5016 return REG_ESPACE;
5017 append = n;
5018 goto append_node;
5020 else
5021 goto normal_char;
5023 break;
5026 case '+':
5027 case '?':
5028 if ((syntax & RE_BK_PLUS_QM)
5029 || (syntax & RE_LIMITED_OPS))
5030 goto normal_char;
5032 handle_plus:
5033 case '*':
5034 /* If there is no previous pattern... */
5035 if (pointless_if_repeated (*last_expression, params))
5037 if (syntax & RE_CONTEXT_INVALID_OPS)
5038 return REG_BADRPT;
5039 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
5040 goto normal_char;
5044 /* 1 means zero (many) matches is allowed. */
5045 char zero_times_ok = 0, many_times_ok = 0;
5047 /* If there is a sequence of repetition chars, collapse it
5048 down to just one (the right one). We can't combine
5049 interval operators with these because of, e.g., `a{2}*',
5050 which should only match an even number of `a's. */
5052 for (;;)
5054 zero_times_ok |= c != '+';
5055 many_times_ok |= c != '?';
5057 if (p == pend)
5058 break;
5060 PATFETCH (c);
5062 if (c == '*'
5063 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
5066 else if (syntax & RE_BK_PLUS_QM && c == '\\')
5068 if (p == pend) return REG_EESCAPE;
5070 PATFETCH (c1);
5071 if (!(c1 == '+' || c1 == '?'))
5073 PATUNFETCH;
5074 PATUNFETCH;
5075 break;
5078 c = c1;
5080 else
5082 PATUNFETCH;
5083 break;
5086 /* If we get here, we found another repeat character. */
5089 /* Star, etc. applied to an empty pattern is equivalent
5090 to an empty pattern. */
5091 if (!last_expression)
5092 break;
5094 /* Now we know whether or not zero matches is allowed
5095 * and also whether or not two or more matches is allowed.
5099 struct rexp_node * inner_exp = *last_expression;
5100 int need_sync = 0;
5102 if (many_times_ok
5103 && has_non_idempotent_epsilon_path (&rxb->rx,
5104 inner_exp, params))
5106 struct rexp_node * pusher
5107 = rx_mk_r_side_effect (&rxb->rx,
5108 (rx_side_effect)re_se_pushpos);
5109 struct rexp_node * checker
5110 = rx_mk_r_side_effect (&rxb->rx,
5111 (rx_side_effect)re_se_chkpos);
5112 struct rexp_node * pushback
5113 = rx_mk_r_side_effect (&rxb->rx,
5114 (rx_side_effect)re_se_pushback);
5115 rx_Bitset cs = rx_cset (&rxb->rx);
5116 struct rexp_node * lit_t = rx_mk_r_cset (&rxb->rx, cs);
5117 struct rexp_node * fake_state
5118 = rx_mk_r_concat (&rxb->rx, pushback, lit_t);
5119 struct rexp_node * phase2
5120 = rx_mk_r_concat (&rxb->rx, checker, fake_state);
5121 struct rexp_node * popper
5122 = rx_mk_r_side_effect (&rxb->rx,
5123 (rx_side_effect)re_se_poppos);
5124 struct rexp_node * star
5125 = rx_mk_r_2phase_star (&rxb->rx, inner_exp, phase2);
5126 struct rexp_node * a
5127 = rx_mk_r_concat (&rxb->rx, pusher, star);
5128 struct rexp_node * whole_thing
5129 = rx_mk_r_concat (&rxb->rx, a, popper);
5130 if (!(pusher && star && pushback && lit_t && fake_state
5131 && lit_t && phase2 && checker && popper
5132 && a && whole_thing))
5133 return REG_ESPACE;
5134 RX_bitset_enjoin (cs, 't');
5135 *last_expression = whole_thing;
5137 else
5139 struct rexp_node * star =
5140 (many_times_ok ? rx_mk_r_star : rx_mk_r_opt)
5141 (&rxb->rx, *last_expression);
5142 if (!star)
5143 return REG_ESPACE;
5144 *last_expression = star;
5145 need_sync = has_any_se (&rxb->rx, *last_expression);
5147 if (!zero_times_ok)
5149 struct rexp_node * concat
5150 = rx_mk_r_concat (&rxb->rx, inner_exp,
5151 rx_copy_rexp (&rxb->rx,
5152 *last_expression));
5153 if (!concat)
5154 return REG_ESPACE;
5155 *last_expression = concat;
5157 if (need_sync)
5159 int sync_se = paramc;
5160 params = (params
5161 ? ((struct re_se_params *)
5162 realloc (params,
5163 sizeof (*params) * (1 + paramc)))
5164 : ((struct re_se_params *)
5165 malloc (sizeof (*params))));
5166 if (!params)
5167 return REG_ESPACE;
5168 ++paramc;
5169 params [sync_se].se = re_se_tv;
5170 side = (rx_side_effect)sync_se;
5171 goto add_side_effect;
5174 /* The old regex.c used to optimize `.*\n'.
5175 * Maybe rx should too?
5178 break;
5181 case '.':
5183 rx_Bitset cs = rx_cset (&rxb->rx);
5184 struct rexp_node * n = rx_mk_r_cset (&rxb->rx, cs);
5185 if (!(cs && n))
5186 return REG_ESPACE;
5188 rx_bitset_universe (rxb->rx.local_cset_size, cs);
5189 if (!(rxb->syntax & RE_DOT_NEWLINE))
5190 RX_bitset_remove (cs, '\n');
5191 if (!(rxb->syntax & RE_DOT_NOT_NULL))
5192 RX_bitset_remove (cs, 0);
5194 append = n;
5195 goto append_node;
5196 break;
5200 case '[':
5201 if (p == pend) return REG_EBRACK;
5203 boolean had_char_class = false;
5204 rx_Bitset cs = rx_cset (&rxb->rx);
5205 struct rexp_node * node = rx_mk_r_cset (&rxb->rx, cs);
5206 int is_inverted = *p == '^';
5208 if (!(node && cs))
5209 return REG_ESPACE;
5211 /* This branch of the switch is normally exited with
5212 *`goto append_node'
5214 append = node;
5216 if (is_inverted)
5217 p++;
5219 /* Remember the first position in the bracket expression. */
5220 p1 = p;
5222 /* Read in characters and ranges, setting map bits. */
5223 for (;;)
5225 if (p == pend) return REG_EBRACK;
5227 PATFETCH (c);
5229 /* \ might escape characters inside [...] and [^...]. */
5230 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
5232 if (p == pend) return REG_EESCAPE;
5234 PATFETCH (c1);
5236 rx_Bitset it = inverse_translation (rxb,
5237 validate_inv_tr,
5238 inverse_translate,
5239 translate,
5240 c1);
5241 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
5243 continue;
5246 /* Could be the end of the bracket expression. If it's
5247 not (i.e., when the bracket expression is `[]' so
5248 far), the ']' character bit gets set way below. */
5249 if (c == ']' && p != p1 + 1)
5250 goto finalize_class_and_append;
5252 /* Look ahead to see if it's a range when the last thing
5253 was a character class. */
5254 if (had_char_class && c == '-' && *p != ']')
5255 return REG_ERANGE;
5257 /* Look ahead to see if it's a range when the last thing
5258 was a character: if this is a hyphen not at the
5259 beginning or the end of a list, then it's the range
5260 operator. */
5261 if (c == '-'
5262 && !(p - 2 >= pattern && p[-2] == '[')
5263 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
5264 && *p != ']')
5266 reg_errcode_t ret
5267 = compile_range (rxb, cs, &p, pend, translate, syntax,
5268 inverse_translate, validate_inv_tr);
5269 if (ret != REG_NOERROR) return ret;
5272 else if (p[0] == '-' && p[1] != ']')
5273 { /* This handles ranges made up of characters only. */
5274 reg_errcode_t ret;
5276 /* Move past the `-'. */
5277 PATFETCH (c1);
5279 ret = compile_range (rxb, cs, &p, pend, translate, syntax,
5280 inverse_translate, validate_inv_tr);
5281 if (ret != REG_NOERROR) return ret;
5284 /* See if we're at the beginning of a possible character
5285 class. */
5287 else if ((syntax & RE_CHAR_CLASSES)
5288 && (c == '[') && (*p == ':'))
5290 char str[CHAR_CLASS_MAX_LENGTH + 1];
5292 PATFETCH (c);
5293 c1 = 0;
5295 /* If pattern is `[[:'. */
5296 if (p == pend) return REG_EBRACK;
5298 for (;;)
5300 PATFETCH (c);
5301 if (c == ':' || c == ']' || p == pend
5302 || c1 == CHAR_CLASS_MAX_LENGTH)
5303 break;
5304 str[c1++] = c;
5306 str[c1] = '\0';
5308 /* If isn't a word bracketed by `[:' and:`]':
5309 undo the ending character, the letters, and leave
5310 the leading `:' and `[' (but set bits for them). */
5311 if (c == ':' && *p == ']')
5313 int ch;
5314 boolean is_alnum = !strcmp (str, "alnum");
5315 boolean is_alpha = !strcmp (str, "alpha");
5316 boolean is_blank = !strcmp (str, "blank");
5317 boolean is_cntrl = !strcmp (str, "cntrl");
5318 boolean is_digit = !strcmp (str, "digit");
5319 boolean is_graph = !strcmp (str, "graph");
5320 boolean is_lower = !strcmp (str, "lower");
5321 boolean is_print = !strcmp (str, "print");
5322 boolean is_punct = !strcmp (str, "punct");
5323 boolean is_space = !strcmp (str, "space");
5324 boolean is_upper = !strcmp (str, "upper");
5325 boolean is_xdigit = !strcmp (str, "xdigit");
5327 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
5329 /* Throw away the ] at the end of the character
5330 class. */
5331 PATFETCH (c);
5333 if (p == pend) return REG_EBRACK;
5335 for (ch = 0; ch < 1 << CHARBITS; ch++)
5337 if ( (is_alnum && isalnum (ch))
5338 || (is_alpha && isalpha (ch))
5339 || (is_blank && isblank (ch))
5340 || (is_cntrl && iscntrl (ch))
5341 || (is_digit && isdigit (ch))
5342 || (is_graph && isgraph (ch))
5343 || (is_lower && islower (ch))
5344 || (is_print && isprint (ch))
5345 || (is_punct && ispunct (ch))
5346 || (is_space && isspace (ch))
5347 || (is_upper && isupper (ch))
5348 || (is_xdigit && isxdigit (ch)))
5350 rx_Bitset it =
5351 inverse_translation (rxb,
5352 validate_inv_tr,
5353 inverse_translate,
5354 translate,
5355 ch);
5356 rx_bitset_union (rxb->rx.local_cset_size,
5357 cs, it);
5360 had_char_class = true;
5362 else
5364 c1++;
5365 while (c1--)
5366 PATUNFETCH;
5368 rx_Bitset it =
5369 inverse_translation (rxb,
5370 validate_inv_tr,
5371 inverse_translate,
5372 translate,
5373 '[');
5374 rx_bitset_union (rxb->rx.local_cset_size,
5375 cs, it);
5378 rx_Bitset it =
5379 inverse_translation (rxb,
5380 validate_inv_tr,
5381 inverse_translate,
5382 translate,
5383 ':');
5384 rx_bitset_union (rxb->rx.local_cset_size,
5385 cs, it);
5387 had_char_class = false;
5390 else
5392 had_char_class = false;
5394 rx_Bitset it = inverse_translation (rxb,
5395 validate_inv_tr,
5396 inverse_translate,
5397 translate,
5399 rx_bitset_union (rxb->rx.local_cset_size, cs, it);
5404 finalize_class_and_append:
5405 if (is_inverted)
5407 rx_bitset_complement (rxb->rx.local_cset_size, cs);
5408 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
5409 RX_bitset_remove (cs, '\n');
5411 goto append_node;
5413 break;
5416 case '(':
5417 if (syntax & RE_NO_BK_PARENS)
5418 goto handle_open;
5419 else
5420 goto normal_char;
5423 case ')':
5424 if (syntax & RE_NO_BK_PARENS)
5425 goto handle_close;
5426 else
5427 goto normal_char;
5430 case '\n':
5431 if (syntax & RE_NEWLINE_ALT)
5432 goto handle_alt;
5433 else
5434 goto normal_char;
5437 case '|':
5438 if (syntax & RE_NO_BK_VBAR)
5439 goto handle_alt;
5440 else
5441 goto normal_char;
5444 case '{':
5445 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
5446 goto handle_interval;
5447 else
5448 goto normal_char;
5451 case '\\':
5452 if (p == pend) return REG_EESCAPE;
5454 /* Do not translate the character after the \, so that we can
5455 distinguish, e.g., \B from \b, even if we normally would
5456 translate, e.g., B to b. */
5457 PATFETCH_RAW (c);
5459 switch (c)
5461 case '(':
5462 if (syntax & RE_NO_BK_PARENS)
5463 goto normal_backslash;
5465 handle_open:
5466 rxb->re_nsub++;
5467 regnum++;
5468 if (COMPILE_STACK_FULL)
5470 ((compile_stack.stack) =
5471 (compile_stack_elt_t *) realloc (compile_stack.stack, ( compile_stack.size << 1) * sizeof (
5472 compile_stack_elt_t)));
5473 if (compile_stack.stack == 0) return REG_ESPACE;
5475 compile_stack.size <<= 1;
5478 if (*last_expression)
5480 struct rexp_node * concat
5481 = rx_mk_r_concat (&rxb->rx, *last_expression, 0);
5482 if (!concat)
5483 return REG_ESPACE;
5484 *last_expression = concat;
5485 last_expression = &concat->params.pair.right;
5489 * These are the values to restore when we hit end of this
5490 * group.
5492 COMPILE_STACK_TOP.top_expression = top_expression;
5493 COMPILE_STACK_TOP.last_expression = last_expression;
5494 COMPILE_STACK_TOP.regnum = regnum;
5496 compile_stack.avail++;
5498 top_expression = last_expression;
5499 break;
5502 case ')':
5503 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
5505 handle_close:
5506 /* See similar code for backslashed left paren above. */
5507 if (COMPILE_STACK_EMPTY)
5508 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
5509 goto normal_char;
5510 else
5511 return REG_ERPAREN;
5513 /* Since we just checked for an empty stack above, this
5514 ``can't happen''. */
5517 /* We don't just want to restore into `regnum', because
5518 later groups should continue to be numbered higher,
5519 as in `(ab)c(de)' -- the second group is #2. */
5520 regnum_t this_group_regnum;
5521 struct rexp_node ** inner = top_expression;
5523 compile_stack.avail--;
5524 top_expression = COMPILE_STACK_TOP.top_expression;
5525 last_expression = COMPILE_STACK_TOP.last_expression;
5526 this_group_regnum = COMPILE_STACK_TOP.regnum;
5528 int left_se = paramc;
5529 int right_se = paramc + 1;
5531 params = (params
5532 ? ((struct re_se_params *)
5533 realloc (params,
5534 (paramc + 2) * sizeof (params[0])))
5535 : ((struct re_se_params *)
5536 malloc (2 * sizeof (params[0]))));
5537 if (!params)
5538 return REG_ESPACE;
5539 paramc += 2;
5541 params[left_se].se = re_se_lparen;
5542 params[left_se].op1 = this_group_regnum;
5543 params[right_se].se = re_se_rparen;
5544 params[right_se].op1 = this_group_regnum;
5546 struct rexp_node * left
5547 = rx_mk_r_side_effect (&rxb->rx,
5548 (rx_side_effect)left_se);
5549 struct rexp_node * right
5550 = rx_mk_r_side_effect (&rxb->rx,
5551 (rx_side_effect)right_se);
5552 struct rexp_node * c1
5553 = (*inner
5554 ? rx_mk_r_concat (&rxb->rx, left, *inner) : left);
5555 struct rexp_node * c2
5556 = rx_mk_r_concat (&rxb->rx, c1, right);
5557 if (!(left && right && c1 && c2))
5558 return REG_ESPACE;
5559 *inner = c2;
5562 break;
5565 case '|': /* `\|'. */
5566 if ((syntax & RE_LIMITED_OPS) || (syntax & RE_NO_BK_VBAR))
5567 goto normal_backslash;
5568 handle_alt:
5569 if (syntax & RE_LIMITED_OPS)
5570 goto normal_char;
5573 struct rexp_node * alt
5574 = rx_mk_r_alternate (&rxb->rx, *top_expression, 0);
5575 if (!alt)
5576 return REG_ESPACE;
5577 *top_expression = alt;
5578 last_expression = &alt->params.pair.right;
5580 int sync_se = paramc;
5582 params = (params
5583 ? ((struct re_se_params *)
5584 realloc (params,
5585 (paramc + 1) * sizeof (params[0])))
5586 : ((struct re_se_params *)
5587 malloc (sizeof (params[0]))));
5588 if (!params)
5589 return REG_ESPACE;
5590 ++paramc;
5592 params[sync_se].se = re_se_tv;
5594 struct rexp_node * sync
5595 = rx_mk_r_side_effect (&rxb->rx,
5596 (rx_side_effect)sync_se);
5597 struct rexp_node * conc
5598 = rx_mk_r_concat (&rxb->rx, sync, 0);
5600 if (!sync || !conc)
5601 return REG_ESPACE;
5603 *last_expression = conc;
5604 last_expression = &conc->params.pair.right;
5608 break;
5611 case '{':
5612 /* If \{ is a literal. */
5613 if (!(syntax & RE_INTERVALS)
5614 /* If we're at `\{' and it's not the open-interval
5615 operator. */
5616 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
5617 || (p - 2 == pattern && p == pend))
5618 goto normal_backslash;
5620 handle_interval:
5622 /* If got here, then the syntax allows intervals. */
5624 /* At least (most) this many matches must be made. */
5625 int lower_bound = -1, upper_bound = -1;
5627 beg_interval = p - 1;
5629 if (p == pend)
5631 if (syntax & RE_NO_BK_BRACES)
5632 goto unfetch_interval;
5633 else
5634 return REG_EBRACE;
5637 GET_UNSIGNED_NUMBER (lower_bound);
5639 if (c == ',')
5641 GET_UNSIGNED_NUMBER (upper_bound);
5642 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
5644 else
5645 /* Interval such as `{1}' => match exactly once. */
5646 upper_bound = lower_bound;
5648 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
5649 || lower_bound > upper_bound)
5651 if (syntax & RE_NO_BK_BRACES)
5652 goto unfetch_interval;
5653 else
5654 return REG_BADBR;
5657 if (!(syntax & RE_NO_BK_BRACES))
5659 if (c != '\\') return REG_EBRACE;
5660 PATFETCH (c);
5663 if (c != '}')
5665 if (syntax & RE_NO_BK_BRACES)
5666 goto unfetch_interval;
5667 else
5668 return REG_BADBR;
5671 /* We just parsed a valid interval. */
5673 /* If it's invalid to have no preceding re. */
5674 if (pointless_if_repeated (*last_expression, params))
5676 if (syntax & RE_CONTEXT_INVALID_OPS)
5677 return REG_BADRPT;
5678 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
5679 goto unfetch_interval;
5680 /* was: else laststart = b; */
5683 /* If the upper bound is zero, don't want to iterate
5684 * at all.
5686 if (upper_bound == 0)
5688 if (*last_expression)
5690 rx_free_rexp (&rxb->rx, *last_expression);
5691 *last_expression = 0;
5694 else
5695 /* Otherwise, we have a nontrivial interval. */
5697 int iter_se = paramc;
5698 int end_se = paramc + 1;
5699 params = (params
5700 ? ((struct re_se_params *)
5701 realloc (params,
5702 sizeof (*params) * (2 + paramc)))
5703 : ((struct re_se_params *)
5704 malloc (2 * sizeof (*params))));
5705 if (!params)
5706 return REG_ESPACE;
5707 paramc += 2;
5708 params [iter_se].se = re_se_iter;
5709 params [iter_se].op1 = lower_bound;
5710 params[iter_se].op2 = upper_bound;
5712 params[end_se].se = re_se_end_iter;
5713 params[end_se].op1 = lower_bound;
5714 params[end_se].op2 = upper_bound;
5716 struct rexp_node * push0
5717 = rx_mk_r_side_effect (&rxb->rx,
5718 (rx_side_effect)re_se_push0);
5719 struct rexp_node * start_one_iter
5720 = rx_mk_r_side_effect (&rxb->rx,
5721 (rx_side_effect)iter_se);
5722 struct rexp_node * phase1
5723 = rx_mk_r_concat (&rxb->rx, start_one_iter,
5724 *last_expression);
5725 struct rexp_node * pushback
5726 = rx_mk_r_side_effect (&rxb->rx,
5727 (rx_side_effect)re_se_pushback);
5728 rx_Bitset cs = rx_cset (&rxb->rx);
5729 struct rexp_node * lit_t
5730 = rx_mk_r_cset (&rxb->rx, cs);
5731 struct rexp_node * phase2
5732 = rx_mk_r_concat (&rxb->rx, pushback, lit_t);
5733 struct rexp_node * loop
5734 = rx_mk_r_2phase_star (&rxb->rx, phase1, phase2);
5735 struct rexp_node * push_n_loop
5736 = rx_mk_r_concat (&rxb->rx, push0, loop);
5737 struct rexp_node * final_test
5738 = rx_mk_r_side_effect (&rxb->rx,
5739 (rx_side_effect)end_se);
5740 struct rexp_node * full_exp
5741 = rx_mk_r_concat (&rxb->rx, push_n_loop, final_test);
5743 if (!(push0 && start_one_iter && phase1
5744 && pushback && lit_t && phase2
5745 && loop && push_n_loop && final_test && full_exp))
5746 return REG_ESPACE;
5748 RX_bitset_enjoin(cs, 't');
5750 *last_expression = full_exp;
5753 beg_interval = 0;
5755 break;
5757 unfetch_interval:
5758 /* If an invalid interval, match the characters as literals. */
5759 p = beg_interval;
5760 beg_interval = 0;
5762 /* normal_char and normal_backslash need `c'. */
5763 PATFETCH (c);
5765 if (!(syntax & RE_NO_BK_BRACES))
5767 if (p > pattern && p[-1] == '\\')
5768 goto normal_backslash;
5770 goto normal_char;
5772 #ifdef emacs
5773 /* There is no way to specify the before_dot and after_dot
5774 operators. rms says this is ok. --karl */
5775 case '=':
5776 side = (rx_side_effect)rx_se_at_dot;
5777 goto add_side_effect;
5778 break;
5780 case 's':
5781 case 'S':
5783 rx_Bitset cs = rx_cset (&rxb->rx);
5784 struct rexp_node * set = rx_mk_r_cset (&rxb->rx, cs);
5785 if (!(cs && set))
5786 return REG_ESPACE;
5787 if (c == 'S')
5788 rx_bitset_universe (rxb->rx.local_cset_size, cs);
5790 PATFETCH (c);
5792 int x;
5793 enum syntaxcode code = syntax_spec_code [c];
5794 for (x = 0; x < 256; ++x)
5797 if (SYNTAX (x) == code)
5799 rx_Bitset it =
5800 inverse_translation (rxb, validate_inv_tr,
5801 inverse_translate,
5802 translate, x);
5803 rx_bitset_xor (rxb->rx.local_cset_size, cs, it);
5807 append = set;
5808 goto append_node;
5810 break;
5811 #endif /* emacs */
5814 case 'w':
5815 case 'W':
5817 rx_Bitset cs = rx_cset (&rxb->rx);
5818 struct rexp_node * n = (cs ? rx_mk_r_cset (&rxb->rx, cs) : 0);
5819 if (!(cs && n))
5820 return REG_ESPACE;
5821 if (c == 'W')
5822 rx_bitset_universe (rxb->rx.local_cset_size ,cs);
5824 int x;
5825 for (x = rxb->rx.local_cset_size - 1; x > 0; --x)
5826 if (SYNTAX(x) & Sword)
5827 RX_bitset_toggle (cs, x);
5829 append = n;
5830 goto append_node;
5832 break;
5834 /* With a little extra work, some of these side effects could be optimized
5835 * away (basicly by looking at what we already know about the surrounding
5836 * chars).
5838 case '<':
5839 side = (rx_side_effect)re_se_wordbeg;
5840 goto add_side_effect;
5841 break;
5843 case '>':
5844 side = (rx_side_effect)re_se_wordend;
5845 goto add_side_effect;
5846 break;
5848 case 'b':
5849 side = (rx_side_effect)re_se_wordbound;
5850 goto add_side_effect;
5851 break;
5853 case 'B':
5854 side = (rx_side_effect)re_se_notwordbound;
5855 goto add_side_effect;
5856 break;
5858 case '`':
5859 side = (rx_side_effect)re_se_begbuf;
5860 goto add_side_effect;
5861 break;
5863 case '\'':
5864 side = (rx_side_effect)re_se_endbuf;
5865 goto add_side_effect;
5866 break;
5868 add_side_effect:
5870 struct rexp_node * se
5871 = rx_mk_r_side_effect (&rxb->rx, side);
5872 if (!se)
5873 return REG_ESPACE;
5874 append = se;
5875 goto append_node;
5877 break;
5879 case '1': case '2': case '3': case '4': case '5':
5880 case '6': case '7': case '8': case '9':
5881 if (syntax & RE_NO_BK_REFS)
5882 goto normal_char;
5884 c1 = c - '0';
5886 if (c1 > regnum)
5887 return REG_ESUBREG;
5889 /* Can't back reference to a subexpression if inside of it. */
5890 if (group_in_compile_stack (compile_stack, c1))
5891 return REG_ESUBREG;
5894 int backref_se = paramc;
5895 params = (params
5896 ? ((struct re_se_params *)
5897 realloc (params,
5898 sizeof (*params) * (1 + paramc)))
5899 : ((struct re_se_params *)
5900 malloc (sizeof (*params))));
5901 if (!params)
5902 return REG_ESPACE;
5903 ++paramc;
5904 params[backref_se].se = re_se_backref;
5905 params[backref_se].op1 = c1;
5906 side = (rx_side_effect)backref_se;
5907 goto add_side_effect;
5909 break;
5911 case '+':
5912 case '?':
5913 if (syntax & RE_BK_PLUS_QM)
5914 goto handle_plus;
5915 else
5916 goto normal_backslash;
5918 default:
5919 normal_backslash:
5920 /* You might think it would be useful for \ to mean
5921 not to translate; but if we don't translate it
5922 it will never match anything. */
5923 c = TRANSLATE (c);
5924 goto normal_char;
5926 break;
5929 default:
5930 /* Expects the character in `c'. */
5931 normal_char:
5933 rx_Bitset cs = rx_cset(&rxb->rx);
5934 struct rexp_node * match = rx_mk_r_cset (&rxb->rx, cs);
5935 rx_Bitset it;
5936 if (!(cs && match))
5937 return REG_ESPACE;
5938 it = inverse_translation (rxb, validate_inv_tr,
5939 inverse_translate, translate, c);
5940 rx_bitset_union (CHAR_SET_SIZE, cs, it);
5941 append = match;
5943 append_node:
5944 /* This genericly appends the rexp APPEND to *LAST_EXPRESSION
5945 * and then parses the next character normally.
5947 if (*last_expression)
5949 struct rexp_node * concat
5950 = rx_mk_r_concat (&rxb->rx, *last_expression, append);
5951 if (!concat)
5952 return REG_ESPACE;
5953 *last_expression = concat;
5954 last_expression = &concat->params.pair.right;
5956 else
5957 *last_expression = append;
5959 } /* switch (c) */
5960 } /* while p != pend */
5964 int win_se = paramc;
5965 params = (params
5966 ? ((struct re_se_params *)
5967 realloc (params,
5968 sizeof (*params) * (1 + paramc)))
5969 : ((struct re_se_params *)
5970 malloc (sizeof (*params))));
5971 if (!params)
5972 return REG_ESPACE;
5973 ++paramc;
5974 params[win_se].se = re_se_win;
5976 struct rexp_node * se
5977 = rx_mk_r_side_effect (&rxb->rx, (rx_side_effect)win_se);
5978 struct rexp_node * concat
5979 = rx_mk_r_concat (&rxb->rx, rexp, se);
5980 if (!(se && concat))
5981 return REG_ESPACE;
5982 rexp = concat;
5987 /* Through the pattern now. */
5989 if (!COMPILE_STACK_EMPTY)
5990 return REG_EPAREN;
5992 free (compile_stack.stack);
5994 orig_rexp = rexp;
5995 #ifdef RX_DEBUG
5996 if (rx_debug_compile)
5998 dbug_rxb = rxb;
5999 fputs ("\n\nCompiling ", stdout);
6000 fwrite (pattern, 1, size, stdout);
6001 fputs (":\n", stdout);
6002 rxb->se_params = params;
6003 print_rexp (&rxb->rx, orig_rexp, 2, re_seprint, stdout);
6005 #endif
6007 rx_Bitset cs = rx_cset(&rxb->rx);
6008 rx_Bitset cs2 = rx_cset(&rxb->rx);
6009 char * se_map = (char *) alloca (paramc);
6010 struct rexp_node * new_rexp = 0;
6013 bzero (se_map, paramc);
6014 find_backrefs (se_map, rexp, params);
6015 fewer_side_effects =
6016 remove_unecessary_side_effects (&rxb->rx, se_map,
6017 rx_copy_rexp (&rxb->rx, rexp), params);
6019 speed_up_alt (&rxb->rx, rexp, 0);
6020 speed_up_alt (&rxb->rx, fewer_side_effects, 1);
6023 char * syntax_parens = rxb->syntax_parens;
6024 if (syntax_parens == (char *)0x1)
6025 rexp = remove_unecessary_side_effects
6026 (&rxb->rx, se_map, rexp, params);
6027 else if (syntax_parens)
6029 int x;
6030 for (x = 0; x < paramc; ++x)
6031 if (( (params[x].se == re_se_lparen)
6032 || (params[x].se == re_se_rparen))
6033 && (!syntax_parens [params[x].op1]))
6034 se_map [x] = 1;
6035 rexp = remove_unecessary_side_effects
6036 (&rxb->rx, se_map, rexp, params);
6040 /* At least one more optimization would be nice to have here but i ran out
6041 * of time. The idea would be to delay side effects.
6042 * For examle, `(abc)' is the same thing as `abc()' except that the
6043 * left paren is offset by 3 (which we know at compile time).
6044 * (In this comment, write that second pattern `abc(:3:)'
6045 * where `(:3:' is a syntactic unit.)
6047 * Trickier: `(abc|defg)' is the same as `(abc(:3:|defg(:4:))'
6048 * (The paren nesting may be hard to follow -- that's an alternation
6049 * of `abc(:3:' and `defg(:4:' inside (purely syntactic) parens
6050 * followed by the closing paren from the original expression.)
6052 * Neither the expression tree representation nor the the nfa make
6053 * this very easy to write. :(
6056 /* What we compile is different than what the parser returns.
6057 * Suppose the parser returns expression R.
6058 * Let R' be R with unnecessary register assignments removed
6059 * (see REMOVE_UNECESSARY_SIDE_EFFECTS, above).
6061 * What we will compile is the expression:
6063 * m{try}R{win}\|s{try}R'{win}
6065 * {try} and {win} denote side effect epsilons (see EXPLORE_FUTURE).
6067 * When trying a match, we insert an `m' at the beginning of the
6068 * string if the user wants registers to be filled, `s' if not.
6070 new_rexp =
6071 rx_mk_r_alternate
6072 (&rxb->rx,
6073 rx_mk_r_concat (&rxb->rx, rx_mk_r_cset (&rxb->rx, cs2), rexp),
6074 rx_mk_r_concat (&rxb->rx,
6075 rx_mk_r_cset (&rxb->rx, cs), fewer_side_effects));
6077 if (!(new_rexp && cs && cs2))
6078 return REG_ESPACE;
6079 RX_bitset_enjoin (cs2, '\0'); /* prefixed to the rexp used for matching. */
6080 RX_bitset_enjoin (cs, '\1'); /* prefixed to the rexp used for searching. */
6081 rexp = new_rexp;
6084 #ifdef RX_DEBUG
6085 if (rx_debug_compile)
6087 fputs ("\n...which is compiled as:\n", stdout);
6088 print_rexp (&rxb->rx, rexp, 2, re_seprint, stdout);
6090 #endif
6092 struct rx_nfa_state *start = 0;
6093 struct rx_nfa_state *end = 0;
6095 if (!rx_build_nfa (&rxb->rx, rexp, &start, &end))
6096 return REG_ESPACE; /* */
6097 else
6099 void * mem = (void *)rxb->buffer;
6100 unsigned long size = rxb->allocated;
6101 int start_id;
6102 char * perm_mem;
6103 int iterator_size = paramc * sizeof (params[0]);
6105 end->is_final = 1;
6106 start->is_start = 1;
6107 rx_name_nfa_states (&rxb->rx);
6108 start_id = start->id;
6109 #ifdef RX_DEBUG
6110 if (rx_debug_compile)
6112 fputs ("...giving the NFA: \n", stdout);
6113 dbug_rxb = rxb;
6114 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6116 #endif
6117 if (!rx_eclose_nfa (&rxb->rx))
6118 return REG_ESPACE;
6119 else
6121 rx_delete_epsilon_transitions (&rxb->rx);
6123 /* For compatability reasons, we need to shove the
6124 * compiled nfa into one chunk of malloced memory.
6126 rxb->rx.reserved = ( sizeof (params[0]) * paramc
6127 + rx_sizeof_bitset (rxb->rx.local_cset_size));
6128 #ifdef RX_DEBUG
6129 if (rx_debug_compile)
6131 dbug_rxb = rxb;
6132 fputs ("...which cooks down (uncompactified) to: \n", stdout);
6133 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6135 #endif
6136 if (!rx_compactify_nfa (&rxb->rx, &mem, &size))
6137 return REG_ESPACE;
6138 rxb->buffer = mem;
6139 rxb->allocated = size;
6140 rxb->rx.buffer = mem;
6141 rxb->rx.allocated = size;
6142 perm_mem = ((char *)rxb->rx.buffer
6143 + rxb->rx.allocated - rxb->rx.reserved);
6144 rxb->se_params = ((struct re_se_params *)perm_mem);
6145 bcopy (params, rxb->se_params, iterator_size);
6146 perm_mem += iterator_size;
6147 rxb->fastset = (rx_Bitset) perm_mem;
6148 rxb->start = rx_id_to_nfa_state (&rxb->rx, start_id);
6150 rx_bitset_null (rxb->rx.local_cset_size, rxb->fastset);
6151 rxb->can_match_empty = compute_fastset (rxb, orig_rexp);
6152 rxb->match_regs_on_stack =
6153 registers_on_stack (rxb, orig_rexp, 0, params);
6154 rxb->search_regs_on_stack =
6155 registers_on_stack (rxb, fewer_side_effects, 0, params);
6156 if (rxb->can_match_empty)
6157 rx_bitset_universe (rxb->rx.local_cset_size, rxb->fastset);
6158 rxb->is_anchored = is_anchored (orig_rexp, (rx_side_effect) re_se_hat);
6159 rxb->begbuf_only = is_anchored (orig_rexp,
6160 (rx_side_effect) re_se_begbuf);
6162 rx_free_rexp (&rxb->rx, rexp);
6163 if (params)
6164 free (params);
6165 #ifdef RX_DEBUG
6166 if (rx_debug_compile)
6168 dbug_rxb = rxb;
6169 fputs ("...which cooks down to: \n", stdout);
6170 print_nfa (&rxb->rx, rxb->rx.nfa_states, re_seprint, stdout);
6172 #endif
6174 return REG_NOERROR;
6179 /* This table gives an error message for each of the error codes listed
6180 in regex.h. Obviously the order here has to be same as there. */
6182 __const__ char * rx_error_msg[] =
6183 { 0, /* REG_NOERROR */
6184 "No match", /* REG_NOMATCH */
6185 "Invalid regular expression", /* REG_BADPAT */
6186 "Invalid collation character", /* REG_ECOLLATE */
6187 "Invalid character class name", /* REG_ECTYPE */
6188 "Trailing backslash", /* REG_EESCAPE */
6189 "Invalid back reference", /* REG_ESUBREG */
6190 "Unmatched [ or [^", /* REG_EBRACK */
6191 "Unmatched ( or \\(", /* REG_EPAREN */
6192 "Unmatched \\{", /* REG_EBRACE */
6193 "Invalid content of \\{\\}", /* REG_BADBR */
6194 "Invalid range end", /* REG_ERANGE */
6195 "Memory exhausted", /* REG_ESPACE */
6196 "Invalid preceding regular expression", /* REG_BADRPT */
6197 "Premature end of regular expression", /* REG_EEND */
6198 "Regular expression too big", /* REG_ESIZE */
6199 "Unmatched ) or \\)", /* REG_ERPAREN */
6205 char rx_slowmap [256] =
6207 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6209 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6211 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6212 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6213 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6214 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6215 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6216 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6217 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6218 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6219 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6220 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6221 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6222 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6225 #ifdef __STDC__
6226 RX_DECL void
6227 rx_blow_up_fastmap (struct re_pattern_buffer * rxb)
6228 #else
6229 RX_DECL void
6230 rx_blow_up_fastmap (rxb)
6231 struct re_pattern_buffer * rxb;
6232 #endif
6234 int x;
6235 for (x = 0; x < 256; ++x) /* &&&& 3.6 % */
6236 rxb->fastmap [x] = !!RX_bitset_member (rxb->fastset, x);
6237 rxb->fastmap_accurate = 1;
6243 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6244 #define RE_SEARCH_2_FN inner_re_search_2
6245 #define RE_S2_QUAL static
6246 #else
6247 #define RE_SEARCH_2_FN re_search_2
6248 #define RE_S2_QUAL
6249 #endif
6251 struct re_search_2_closure
6253 __const__ unsigned char * string1;
6254 int size1;
6255 __const__ unsigned char * string2;
6256 int size2;
6260 static __inline__ enum rx_get_burst_return
6261 re_search_2_get_burst (pos, vclosure, stop)
6262 struct rx_string_position * pos;
6263 void * vclosure;
6264 int stop;
6266 struct re_search_2_closure * closure;
6267 closure = (struct re_search_2_closure *)vclosure;
6268 if (!closure->string2)
6270 int inset;
6272 inset = pos->pos - pos->string;
6273 if ((inset < -1) || (inset > closure->size1))
6274 return rx_get_burst_no_more;
6275 else
6277 pos->pos = (__const__ unsigned char *) closure->string1 + inset;
6278 pos->string = (__const__ unsigned char *) closure->string1;
6279 pos->size = closure->size1;
6280 pos->end = ((__const__ unsigned char *)
6281 MIN(closure->string1 + closure->size1,
6282 closure->string1 + stop));
6283 pos->offset = 0;
6284 return ((pos->pos < pos->end)
6285 ? rx_get_burst_ok
6286 : rx_get_burst_no_more);
6289 else if (!closure->string1)
6291 int inset;
6293 inset = pos->pos - pos->string;
6294 pos->pos = (__const__ unsigned char *) closure->string2 + inset;
6295 pos->string = (__const__ unsigned char *) closure->string2;
6296 pos->size = closure->size2;
6297 pos->end = ((__const__ unsigned char *)
6298 MIN(closure->string2 + closure->size2,
6299 closure->string2 + stop));
6300 pos->offset = 0;
6301 return ((pos->pos < pos->end)
6302 ? rx_get_burst_ok
6303 : rx_get_burst_no_more);
6305 else
6307 int inset;
6309 inset = pos->pos - pos->string + pos->offset;
6310 if (inset < closure->size1)
6312 pos->pos = (__const__ unsigned char *) closure->string1 + inset;
6313 pos->string = (__const__ unsigned char *) closure->string1;
6314 pos->size = closure->size1;
6315 pos->end = ((__const__ unsigned char *)
6316 MIN(closure->string1 + closure->size1,
6317 closure->string1 + stop));
6318 pos->offset = 0;
6319 return rx_get_burst_ok;
6321 else
6323 pos->pos = ((__const__ unsigned char *)
6324 closure->string2 + inset - closure->size1);
6325 pos->string = (__const__ unsigned char *) closure->string2;
6326 pos->size = closure->size2;
6327 pos->end = ((__const__ unsigned char *)
6328 MIN(closure->string2 + closure->size2,
6329 closure->string2 + stop - closure->size1));
6330 pos->offset = closure->size1;
6331 return ((pos->pos < pos->end)
6332 ? rx_get_burst_ok
6333 : rx_get_burst_no_more);
6339 static __inline__ enum rx_back_check_return
6340 re_search_2_back_check (pos, lparen, rparen, translate, vclosure, stop)
6341 struct rx_string_position * pos;
6342 int lparen;
6343 int rparen;
6344 unsigned char * translate;
6345 void * vclosure;
6346 int stop;
6348 struct rx_string_position there;
6349 struct rx_string_position past;
6351 there = *pos;
6352 there.pos = there.string + lparen - there.offset;
6353 re_search_2_get_burst (&there, vclosure, stop);
6355 past = *pos;
6356 past.pos = past.string + rparen - there.offset;
6357 re_search_2_get_burst (&past, vclosure, stop);
6359 ++pos->pos;
6360 re_search_2_get_burst (pos, vclosure, stop);
6362 while ( (there.pos != past.pos)
6363 && (pos->pos != pos->end))
6364 if (TRANSLATE(*there.pos) != TRANSLATE(*pos->pos))
6365 return rx_back_check_fail;
6366 else
6368 ++there.pos;
6369 ++pos->pos;
6370 if (there.pos == there.end)
6371 re_search_2_get_burst (&there, vclosure, stop);
6372 if (pos->pos == pos->end)
6373 re_search_2_get_burst (pos, vclosure, stop);
6376 if (there.pos != past.pos)
6377 return rx_back_check_fail;
6378 --pos->pos;
6379 re_search_2_get_burst (pos, vclosure, stop);
6380 return rx_back_check_pass;
6383 static __inline__ int
6384 re_search_2_fetch_char (pos, offset, app_closure, stop)
6385 struct rx_string_position * pos;
6386 int offset;
6387 void * app_closure;
6388 int stop;
6390 struct re_search_2_closure * closure;
6391 closure = (struct re_search_2_closure *)app_closure;
6392 if (offset == 0)
6394 if (pos->pos >= pos->string)
6395 return *pos->pos;
6396 else
6398 if ( (pos->string == closure->string2)
6399 && (closure->string1)
6400 && (closure->size1))
6401 return closure->string1[closure->size1 - 1];
6402 else
6403 return 0; /* sure, why not. */
6406 if (pos->pos == pos->end)
6407 return *closure->string2;
6408 else
6409 return pos->pos[1];
6413 #ifdef __STDC__
6414 RE_S2_QUAL int
6415 RE_SEARCH_2_FN (struct re_pattern_buffer *rxb,
6416 __const__ char * string1, int size1,
6417 __const__ char * string2, int size2,
6418 int startpos, int range,
6419 struct re_registers *regs,
6420 int stop)
6421 #else
6422 RE_S2_QUAL int
6423 RE_SEARCH_2_FN (rxb,
6424 string1, size1, string2, size2, startpos, range, regs, stop)
6425 struct re_pattern_buffer *rxb;
6426 __const__ char * string1;
6427 int size1;
6428 __const__ char * string2;
6429 int size2;
6430 int startpos;
6431 int range;
6432 struct re_registers *regs;
6433 int stop;
6434 #endif
6436 int answer;
6437 struct re_search_2_closure closure;
6438 closure.string1 = (__const__ unsigned char *) string1;
6439 closure.size1 = size1;
6440 closure.string2 = (__const__ unsigned char *) string2;
6441 closure.size2 = size2;
6442 answer = rx_search (rxb, startpos, range, stop, size1 + size2,
6443 re_search_2_get_burst,
6444 re_search_2_back_check,
6445 re_search_2_fetch_char,
6446 (void *)&closure,
6447 regs,
6450 switch (answer)
6452 case rx_search_continuation:
6453 abort ();
6454 case rx_search_error:
6455 return -2;
6456 case rx_search_soft_fail:
6457 case rx_search_fail:
6458 return -1;
6459 default:
6460 return answer;
6464 /* Export rx_search to callers outside this file. */
6467 re_rx_search (rxb, startpos, range, stop, total_size,
6468 get_burst, back_check, fetch_char,
6469 app_closure, regs, resume_state, save_state)
6470 struct re_pattern_buffer * rxb;
6471 int startpos;
6472 int range;
6473 int stop;
6474 int total_size;
6475 rx_get_burst_fn get_burst;
6476 rx_back_check_fn back_check;
6477 rx_fetch_char_fn fetch_char;
6478 void * app_closure;
6479 struct re_registers * regs;
6480 struct rx_search_state * resume_state;
6481 struct rx_search_state * save_state;
6483 return rx_search (rxb, startpos, range, stop, total_size,
6484 get_burst, back_check, fetch_char, app_closure,
6485 regs, resume_state, save_state);
6488 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6489 #ifdef __STDC__
6491 re_search_2 (struct re_pattern_buffer *rxb,
6492 __const__ char * string1, int size1,
6493 __const__ char * string2, int size2,
6494 int startpos, int range,
6495 struct re_registers *regs,
6496 int stop)
6497 #else
6499 re_search_2 (rxb, string1, size1, string2, size2, startpos, range, regs, stop)
6500 struct re_pattern_buffer *rxb;
6501 __const__ char * string1;
6502 int size1;
6503 __const__ char * string2;
6504 int size2;
6505 int startpos;
6506 int range;
6507 struct re_registers *regs;
6508 int stop;
6509 #endif
6511 int ret;
6512 ret = inner_re_search_2 (rxb, string1, size1, string2, size2, startpos,
6513 range, regs, stop);
6514 alloca (0);
6515 return ret;
6517 #endif
6520 /* Like re_search_2, above, but only one string is specified, and
6521 * doesn't let you say where to stop matching.
6524 #ifdef __STDC__
6526 re_search (struct re_pattern_buffer * rxb, __const__ char *string,
6527 int size, int startpos, int range,
6528 struct re_registers *regs)
6529 #else
6531 re_search (rxb, string, size, startpos, range, regs)
6532 struct re_pattern_buffer * rxb;
6533 __const__ char * string;
6534 int size;
6535 int startpos;
6536 int range;
6537 struct re_registers *regs;
6538 #endif
6540 return re_search_2 (rxb, 0, 0, string, size, startpos, range, regs, size);
6543 #ifdef __STDC__
6545 re_match_2 (struct re_pattern_buffer * rxb,
6546 __const__ char * string1, int size1,
6547 __const__ char * string2, int size2,
6548 int pos, struct re_registers *regs, int stop)
6549 #else
6551 re_match_2 (rxb, string1, size1, string2, size2, pos, regs, stop)
6552 struct re_pattern_buffer * rxb;
6553 __const__ char * string1;
6554 int size1;
6555 __const__ char * string2;
6556 int size2;
6557 int pos;
6558 struct re_registers *regs;
6559 int stop;
6560 #endif
6562 struct re_registers some_regs;
6563 regoff_t start;
6564 regoff_t end;
6565 int srch;
6566 int save = rxb->regs_allocated;
6567 struct re_registers * regs_to_pass = regs;
6569 if (!regs)
6571 some_regs.start = &start;
6572 some_regs.end = &end;
6573 some_regs.num_regs = 1;
6574 regs_to_pass = &some_regs;
6575 rxb->regs_allocated = REGS_FIXED;
6578 srch = re_search_2 (rxb, string1, size1, string2, size2,
6579 pos, 1, regs_to_pass, stop);
6580 if (regs_to_pass != regs)
6581 rxb->regs_allocated = save;
6582 if (srch < 0)
6583 return srch;
6584 return regs_to_pass->end[0] - regs_to_pass->start[0];
6587 /* re_match is like re_match_2 except it takes only a single string. */
6589 #ifdef __STDC__
6591 re_match (struct re_pattern_buffer * rxb,
6592 __const__ char * string,
6593 int size, int pos,
6594 struct re_registers *regs)
6595 #else
6597 re_match (rxb, string, size, pos, regs)
6598 struct re_pattern_buffer * rxb;
6599 __const__ char *string;
6600 int size;
6601 int pos;
6602 struct re_registers *regs;
6603 #endif
6605 return re_match_2 (rxb, string, size, 0, 0, pos, regs, size);
6610 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
6611 also be assigned to arbitrarily: each pattern buffer stores its own
6612 syntax, so it can be changed between regex compilations. */
6613 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
6616 /* Specify the precise syntax of regexps for compilation. This provides
6617 for compatibility for various utilities which historically have
6618 different, incompatible syntaxes.
6620 The argument SYNTAX is a bit mask comprised of the various bits
6621 defined in regex.h. We return the old syntax. */
6623 #ifdef __STDC__
6624 reg_syntax_t
6625 re_set_syntax (reg_syntax_t syntax)
6626 #else
6627 reg_syntax_t
6628 re_set_syntax (syntax)
6629 reg_syntax_t syntax;
6630 #endif
6632 reg_syntax_t ret = re_syntax_options;
6634 re_syntax_options = syntax;
6635 return ret;
6639 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6640 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
6641 this memory for recording register information. STARTS and ENDS
6642 must be allocated using the malloc library routine, and must each
6643 be at least NUM_REGS * sizeof (regoff_t) bytes long.
6645 If NUM_REGS == 0, then subsequent matches should allocate their own
6646 register data.
6648 Unless this function is called, the first search or match using
6649 PATTERN_BUFFER will allocate its own register data, without
6650 freeing the old data. */
6652 #ifdef __STDC__
6653 void
6654 re_set_registers (struct re_pattern_buffer *bufp,
6655 struct re_registers *regs,
6656 unsigned num_regs,
6657 regoff_t * starts, regoff_t * ends)
6658 #else
6659 void
6660 re_set_registers (bufp, regs, num_regs, starts, ends)
6661 struct re_pattern_buffer *bufp;
6662 struct re_registers *regs;
6663 unsigned num_regs;
6664 regoff_t * starts;
6665 regoff_t * ends;
6666 #endif
6668 if (num_regs)
6670 bufp->regs_allocated = REGS_REALLOCATE;
6671 regs->num_regs = num_regs;
6672 regs->start = starts;
6673 regs->end = ends;
6675 else
6677 bufp->regs_allocated = REGS_UNALLOCATED;
6678 regs->num_regs = 0;
6679 regs->start = regs->end = (regoff_t) 0;
6686 #ifdef __STDC__
6687 static int
6688 cplx_se_sublist_len (struct rx_se_list * list)
6689 #else
6690 static int
6691 cplx_se_sublist_len (list)
6692 struct rx_se_list * list;
6693 #endif
6695 int x = 0;
6696 while (list)
6698 if ((long)list->car >= 0)
6699 ++x;
6700 list = list->cdr;
6702 return x;
6706 /* For rx->se_list_cmp */
6708 #ifdef __STDC__
6709 static int
6710 posix_se_list_order (struct rx * rx,
6711 struct rx_se_list * a, struct rx_se_list * b)
6712 #else
6713 static int
6714 posix_se_list_order (rx, a, b)
6715 struct rx * rx;
6716 struct rx_se_list * a;
6717 struct rx_se_list * b;
6718 #endif
6720 int al = cplx_se_sublist_len (a);
6721 int bl = cplx_se_sublist_len (b);
6723 if (!al && !bl)
6724 return ((a == b)
6726 : ((a < b) ? -1 : 1));
6728 else if (!al)
6729 return -1;
6731 else if (!bl)
6732 return 1;
6734 else
6736 rx_side_effect * av = ((rx_side_effect *)
6737 alloca (sizeof (rx_side_effect) * (al + 1)));
6738 rx_side_effect * bv = ((rx_side_effect *)
6739 alloca (sizeof (rx_side_effect) * (bl + 1)));
6740 struct rx_se_list * ap = a;
6741 struct rx_se_list * bp = b;
6742 int ai, bi;
6744 for (ai = al - 1; ai >= 0; --ai)
6746 while ((long)ap->car < 0)
6747 ap = ap->cdr;
6748 av[ai] = ap->car;
6749 ap = ap->cdr;
6751 av[al] = (rx_side_effect)-2;
6752 for (bi = bl - 1; bi >= 0; --bi)
6754 while ((long)bp->car < 0)
6755 bp = bp->cdr;
6756 bv[bi] = bp->car;
6757 bp = bp->cdr;
6759 bv[bl] = (rx_side_effect)-1;
6762 int ret;
6763 int x = 0;
6764 while (av[x] == bv[x])
6765 ++x;
6766 ret = (((unsigned *)(av[x]) < (unsigned *)(bv[x])) ? -1 : 1);
6767 return ret;
6775 /* re_compile_pattern is the GNU regular expression compiler: it
6776 compiles PATTERN (of length SIZE) and puts the result in RXB.
6777 Returns 0 if the pattern was valid, otherwise an error string.
6779 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6780 are set in RXB on entry.
6782 We call rx_compile to do the actual compilation. */
6784 #ifdef __STDC__
6785 __const__ char *
6786 re_compile_pattern (__const__ char *pattern,
6787 int length,
6788 struct re_pattern_buffer * rxb)
6789 #else
6790 __const__ char *
6791 re_compile_pattern (pattern, length, rxb)
6792 __const__ char *pattern;
6793 int length;
6794 struct re_pattern_buffer * rxb;
6795 #endif
6797 reg_errcode_t ret;
6799 /* GNU code is written to assume at least RE_NREGS registers will be set
6800 (and at least one extra will be -1). */
6801 rxb->regs_allocated = REGS_UNALLOCATED;
6803 /* And GNU code determines whether or not to get register information
6804 by passing null for the REGS argument to re_match, etc., not by
6805 setting no_sub. */
6806 rxb->no_sub = 0;
6808 rxb->rx.local_cset_size = 256;
6810 /* Match anchors at newline. */
6811 rxb->newline_anchor = 1;
6813 rxb->re_nsub = 0;
6814 rxb->start = 0;
6815 rxb->se_params = 0;
6816 rxb->rx.nodec = 0;
6817 rxb->rx.epsnodec = 0;
6818 rxb->rx.instruction_table = 0;
6819 rxb->rx.nfa_states = 0;
6820 rxb->rx.se_list_cmp = posix_se_list_order;
6821 rxb->rx.start_set = 0;
6823 ret = rx_compile (pattern, length, re_syntax_options, rxb);
6824 alloca (0);
6825 return rx_error_msg[(int) ret];
6830 #ifdef __STDC__
6832 re_compile_fastmap (struct re_pattern_buffer * rxb)
6833 #else
6835 re_compile_fastmap (rxb)
6836 struct re_pattern_buffer * rxb;
6837 #endif
6839 rx_blow_up_fastmap (rxb);
6840 return 0;
6846 /* Entry points compatible with 4.2 BSD regex library. We don't define
6847 them if this is an Emacs or POSIX compilation. */
6849 #if (!defined (emacs) && !defined (_POSIX_SOURCE)) || defined(USE_BSD_REGEX)
6851 /* BSD has one and only one pattern buffer. */
6852 static struct re_pattern_buffer rx_comp_buf;
6854 #ifdef __STDC__
6855 char *
6856 re_comp (__const__ char *s)
6857 #else
6858 char *
6859 re_comp (s)
6860 __const__ char *s;
6861 #endif
6863 reg_errcode_t ret;
6865 if (!s || (*s == '\0'))
6867 if (!rx_comp_buf.buffer)
6868 return "No previous regular expression";
6869 return 0;
6872 if (!rx_comp_buf.fastmap)
6874 rx_comp_buf.fastmap = (char *) malloc (1 << CHARBITS);
6875 if (!rx_comp_buf.fastmap)
6876 return "Memory exhausted";
6879 /* Since `rx_exec' always passes NULL for the `regs' argument, we
6880 don't need to initialize the pattern buffer fields which affect it. */
6882 /* Match anchors at newlines. */
6883 rx_comp_buf.newline_anchor = 1;
6885 rx_comp_buf.fastmap_accurate = 0;
6886 rx_comp_buf.re_nsub = 0;
6887 rx_comp_buf.start = 0;
6888 rx_comp_buf.se_params = 0;
6889 rx_comp_buf.rx.nodec = 0;
6890 rx_comp_buf.rx.epsnodec = 0;
6891 rx_comp_buf.rx.instruction_table = 0;
6892 rx_comp_buf.rx.nfa_states = 0;
6893 rx_comp_buf.rx.start = 0;
6894 rx_comp_buf.rx.se_list_cmp = posix_se_list_order;
6895 rx_comp_buf.rx.start_set = 0;
6896 rx_comp_buf.rx.local_cset_size = 256;
6898 ret = rx_compile (s, strlen (s), re_syntax_options, &rx_comp_buf);
6899 alloca (0);
6901 /* Yes, we're discarding `__const__' here. */
6902 return (char *) rx_error_msg[(int) ret];
6906 #ifdef __STDC__
6908 re_exec (__const__ char *s)
6909 #else
6911 re_exec (s)
6912 __const__ char *s;
6913 #endif
6915 __const__ int len = strlen (s);
6916 return
6917 0 <= re_search (&rx_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6919 #endif /* not emacs and not _POSIX_SOURCE */
6923 /* POSIX.2 functions. Don't define these for Emacs. */
6925 #if !defined(emacs)
6927 /* regcomp takes a regular expression as a string and compiles it.
6929 PREG is a regex_t *. We do not expect any fields to be initialized,
6930 since POSIX says we shouldn't. Thus, we set
6932 `buffer' to the compiled pattern;
6933 `used' to the length of the compiled pattern;
6934 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6935 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6936 RE_SYNTAX_POSIX_BASIC;
6937 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6938 `fastmap' and `fastmap_accurate' to zero;
6939 `re_nsub' to the number of subexpressions in PATTERN.
6941 PATTERN is the address of the pattern string.
6943 CFLAGS is a series of bits which affect compilation.
6945 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6946 use POSIX basic syntax.
6948 If REG_NEWLINE is set, then . and [^...] don't match newline.
6949 Also, regexec will try a match beginning after every newline.
6951 If REG_ICASE is set, then we considers upper- and lowercase
6952 versions of letters to be equivalent when matching.
6954 If REG_NOSUB is set, then when PREG is passed to regexec, that
6955 routine will report only success or failure, and nothing about the
6956 registers.
6958 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6959 the return codes and their meanings.) */
6962 #ifdef __STDC__
6964 regcomp (regex_t * preg, __const__ char * pattern, int cflags)
6965 #else
6967 regcomp (preg, pattern, cflags)
6968 regex_t * preg;
6969 __const__ char * pattern;
6970 int cflags;
6971 #endif
6973 reg_errcode_t ret;
6974 unsigned syntax
6975 = cflags & REG_EXTENDED ? RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6977 /* regex_compile will allocate the space for the compiled pattern. */
6978 preg->buffer = 0;
6979 preg->allocated = 0;
6980 preg->fastmap = malloc (256);
6981 if (!preg->fastmap)
6982 return REG_ESPACE;
6983 preg->fastmap_accurate = 0;
6985 if (cflags & REG_ICASE)
6987 unsigned i;
6989 preg->translate = (unsigned char *) malloc (256);
6990 if (!preg->translate)
6991 return (int) REG_ESPACE;
6993 /* Map uppercase characters to corresponding lowercase ones. */
6994 for (i = 0; i < CHAR_SET_SIZE; i++)
6995 preg->translate[i] = isupper (i) ? tolower (i) : i;
6997 else
6998 preg->translate = 0;
7000 /* If REG_NEWLINE is set, newlines are treated differently. */
7001 if (cflags & REG_NEWLINE)
7002 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7003 syntax &= ~RE_DOT_NEWLINE;
7004 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7005 /* It also changes the matching behavior. */
7006 preg->newline_anchor = 1;
7008 else
7009 preg->newline_anchor = 0;
7011 preg->no_sub = !!(cflags & REG_NOSUB);
7013 /* POSIX says a null character in the pattern terminates it, so we
7014 can use strlen here in compiling the pattern. */
7015 preg->re_nsub = 0;
7016 preg->start = 0;
7017 preg->se_params = 0;
7018 preg->syntax_parens = 0;
7019 preg->rx.nodec = 0;
7020 preg->rx.epsnodec = 0;
7021 preg->rx.instruction_table = 0;
7022 preg->rx.nfa_states = 0;
7023 preg->rx.local_cset_size = 256;
7024 preg->rx.start = 0;
7025 preg->rx.se_list_cmp = posix_se_list_order;
7026 preg->rx.start_set = 0;
7027 ret = rx_compile (pattern, strlen (pattern), syntax, preg);
7028 alloca (0);
7030 /* POSIX doesn't distinguish between an unmatched open-group and an
7031 unmatched close-group: both are REG_EPAREN. */
7032 if (ret == REG_ERPAREN) ret = REG_EPAREN;
7034 return (int) ret;
7038 /* regexec searches for a given pattern, specified by PREG, in the
7039 string STRING.
7041 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
7042 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
7043 least NMATCH elements, and we set them to the offsets of the
7044 corresponding matched substrings.
7046 EFLAGS specifies `execution flags' which affect matching: if
7047 REG_NOTBOL is set, then ^ does not match at the beginning of the
7048 string; if REG_NOTEOL is set, then $ does not match at the end.
7050 We return 0 if we find a match and REG_NOMATCH if not. */
7052 #ifdef __STDC__
7054 regexec (__const__ regex_t *preg, __const__ char *string,
7055 size_t nmatch, regmatch_t pmatch[],
7056 int eflags)
7057 #else
7059 regexec (preg, string, nmatch, pmatch, eflags)
7060 __const__ regex_t *preg;
7061 __const__ char *string;
7062 size_t nmatch;
7063 regmatch_t pmatch[];
7064 int eflags;
7065 #endif
7067 int ret;
7068 struct re_registers regs;
7069 regex_t private_preg;
7070 int len = strlen (string);
7071 boolean want_reg_info = !preg->no_sub && nmatch > 0;
7073 private_preg = *preg;
7075 private_preg.not_bol = !!(eflags & REG_NOTBOL);
7076 private_preg.not_eol = !!(eflags & REG_NOTEOL);
7078 /* The user has told us exactly how many registers to return
7079 * information about, via `nmatch'. We have to pass that on to the
7080 * matching routines.
7082 private_preg.regs_allocated = REGS_FIXED;
7084 if (want_reg_info)
7086 regs.num_regs = nmatch;
7087 regs.start = (( regoff_t *) malloc ((nmatch) * sizeof ( regoff_t)));
7088 regs.end = (( regoff_t *) malloc ((nmatch) * sizeof ( regoff_t)));
7089 if (regs.start == 0 || regs.end == 0)
7090 return (int) REG_NOMATCH;
7093 /* Perform the searching operation. */
7094 ret = re_search (&private_preg,
7095 string, len,
7096 /* start: */ 0,
7097 /* range: */ len,
7098 want_reg_info ? &regs : (struct re_registers *) 0);
7100 /* Copy the register information to the POSIX structure. */
7101 if (want_reg_info)
7103 if (ret >= 0)
7105 unsigned r;
7107 for (r = 0; r < nmatch; r++)
7109 pmatch[r].rm_so = regs.start[r];
7110 pmatch[r].rm_eo = regs.end[r];
7114 /* If we needed the temporary register info, free the space now. */
7115 free (regs.start);
7116 free (regs.end);
7119 /* We want zero return to mean success, unlike `re_search'. */
7120 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
7124 /* Returns a message corresponding to an error code, ERRCODE, returned
7125 from either regcomp or regexec. */
7127 #ifdef __STDC__
7128 size_t
7129 regerror (int errcode, __const__ regex_t *preg,
7130 char *errbuf, size_t errbuf_size)
7131 #else
7132 size_t
7133 regerror (errcode, preg, errbuf, errbuf_size)
7134 int errcode;
7135 __const__ regex_t *preg;
7136 char *errbuf;
7137 size_t errbuf_size;
7138 #endif
7140 __const__ char *msg
7141 = rx_error_msg[errcode] == 0 ? "Success" : rx_error_msg[errcode];
7142 size_t msg_size = strlen (msg) + 1; /* Includes the 0. */
7144 if (errbuf_size != 0)
7146 if (msg_size > errbuf_size)
7148 strncpy (errbuf, msg, errbuf_size - 1);
7149 errbuf[errbuf_size - 1] = 0;
7151 else
7152 strcpy (errbuf, msg);
7155 return msg_size;
7159 /* Free dynamically allocated space used by PREG. */
7161 #ifdef __STDC__
7162 void
7163 regfree (regex_t *preg)
7164 #else
7165 void
7166 regfree (preg)
7167 regex_t *preg;
7168 #endif
7170 if (preg->buffer != 0)
7171 free (preg->buffer);
7172 preg->buffer = 0;
7173 preg->allocated = 0;
7175 if (preg->fastmap != 0)
7176 free (preg->fastmap);
7177 preg->fastmap = 0;
7178 preg->fastmap_accurate = 0;
7180 if (preg->translate != 0)
7181 free (preg->translate);
7182 preg->translate = 0;
7185 #endif /* not emacs */