1 /* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
3 This file is part of the librx library.
5 Librx is free software; you can redistribute it and/or modify it under
6 the terms of the GNU Library General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 Librx is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 You should have received a copy of the GNU Library General Public
16 License along with this software; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
20 /* NOTE!!! AIX is so losing it requires this to be the first thing in the
22 * Do not put ANYTHING before it!
24 #if !defined (__GNUC__) && defined (_AIX)
28 /* To make linux happy? */
37 const char *rx_version_string
= "GNU Rx version 0.07.2";
47 #define isgraph(c) (isprint (c) && !isspace (c))
50 #define isblank(c) ((c) == ' ' || (c) == '\t')
53 #include <sys/types.h>
57 #define MAX(a, b) ((a) > (b) ? (a) : (b))
58 #define MIN(a, b) ((a) < (b) ? (a) : (b))
69 /* Emacs already defines alloca, sometimes. */
72 /* Make alloca work the best possible way. */
74 #define alloca __builtin_alloca
75 #else /* not __GNUC__ */
78 #else /* not __GNUC__ or HAVE_ALLOCA_H */
79 #ifndef _AIX /* Already did AIX, up at the top. */
82 #endif /* not HAVE_ALLOCA_H */
83 #endif /* not __GNUC__ */
85 #endif /* not alloca */
87 /* Memory management and stuff for emacs. */
90 #define remalloc(M, S) (M ? realloc (M, S) : malloc (S))
93 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
94 * use `alloca' instead of `malloc' for the backtracking stack.
96 * Emacs will die miserably if we don't do this.
100 #define REGEX_ALLOCATE malloc
101 #else /* not REGEX_MALLOC */
102 #define REGEX_ALLOCATE alloca
103 #endif /* not REGEX_MALLOC */
106 #ifdef RX_WANT_RX_DEFS
107 #define RX_DECL extern
110 #define RX_WANT_RX_DEFS
111 #define RX_DECL static
112 #define RX_DEF_QUAL static
116 #define RX_DECL RX_DEF_QUAL
123 RX_DECL
char re_syntax_table
[CHAR_SET_SIZE
];
127 init_syntax_once (void)
139 bzero (re_syntax_table
, sizeof re_syntax_table
);
141 for (c
= 'a'; c
<= 'z'; c
++)
142 re_syntax_table
[c
] = Sword
;
144 for (c
= 'A'; c
<= 'Z'; c
++)
145 re_syntax_table
[c
] = Sword
;
147 for (c
= '0'; c
<= '9'; c
++)
148 re_syntax_table
[c
] = Sword
;
150 re_syntax_table
['_'] = Sword
;
154 #endif /* not SYNTAX */
155 #endif /* not emacs */
157 /* Compile with `-DRX_DEBUG' and use the following flags.
160 * rx_debug - print information as a regexp is compiled
161 * rx_debug_trace - print information as a regexp is executed
166 int rx_debug_compile
= 0;
167 int rx_debug_trace
= 0;
168 static struct re_pattern_buffer
* dbug_rxb
= 0;
171 typedef void (*side_effect_printer
) (struct rx
*, void *, FILE *);
173 typedef void (*side_effect_printer
) ();
177 static void print_cset (struct rx
*rx
, rx_Bitset cset
, FILE * fp
);
179 static void print_cset ();
184 print_rexp (struct rx
*rx
,
185 struct rexp_node
*node
, int depth
,
186 side_effect_printer seprint
, FILE * fp
)
189 print_rexp (rx
, node
, depth
, seprint
, fp
)
191 struct rexp_node
*node
;
193 side_effect_printer seprint
;
205 fprintf (fp
, "%*s", depth
, "");
206 print_cset (rx
, node
->params
.cset
, fp
);
213 fprintf (fp
, "%*s%s\n", depth
, "",
214 node
->type
== r_opt
? "opt" : "star");
215 print_rexp (rx
, node
->params
.pair
.left
, depth
+ 3, seprint
, fp
);
219 fprintf (fp
, "%*s2phase star\n", depth
, "");
220 print_rexp (rx
, node
->params
.pair
.right
, depth
+ 3, seprint
, fp
);
221 print_rexp (rx
, node
->params
.pair
.left
, depth
+ 3, seprint
, fp
);
227 fprintf (fp
, "%*s%s\n", depth
, "",
228 node
->type
== r_alternate
? "alt" : "concat");
229 print_rexp (rx
, node
->params
.pair
.left
, depth
+ 3, seprint
, fp
);
230 print_rexp (rx
, node
->params
.pair
.right
, depth
+ 3, seprint
, fp
);
233 fprintf (fp
, "%*sSide effect: ", depth
, "");
234 seprint (rx
, node
->params
.side_effect
, fp
);
242 print_nfa (struct rx
* rx
,
243 struct rx_nfa_state
* n
,
244 side_effect_printer seprint
, FILE * fp
)
247 print_nfa (rx
, n
, seprint
, fp
)
249 struct rx_nfa_state
* n
;
250 side_effect_printer seprint
;
256 struct rx_nfa_edge
*e
= n
->edges
;
257 struct rx_possible_future
*ec
= n
->futures
;
258 fprintf (fp
, "node %d %s\n", n
->id
,
259 n
->is_final
? "final" : (n
->is_start
? "start" : ""));
262 fprintf (fp
, " edge to %d, ", e
->dest
->id
);
266 fprintf (fp
, "epsilon\n");
269 fprintf (fp
, "side effect ");
270 seprint (rx
, e
->params
.side_effect
, fp
);
274 fprintf (fp
, "cset ");
275 print_cset (rx
, e
->params
.cset
, fp
);
285 struct rx_nfa_state_set
* s
;
286 struct rx_se_list
* l
;
287 fprintf (fp
, " eclosure to {");
288 for (s
= ec
->destset
; s
; s
= s
->cdr
)
289 fprintf (fp
, "%d ", s
->car
->id
);
291 for (l
= ec
->effects
; l
; l
= l
->cdr
)
293 seprint (rx
, l
->car
, fp
);
303 static char * efnames
[] =
319 "re_se_notwordbound",
326 static char * efnames2
[] =
337 static char * inx_names
[] =
339 "rx_backtrack_point",
340 "rx_do_side_effects",
345 "rx_num_instructions"
351 re_seprint (struct rx
* rx
, void * effect
, FILE * fp
)
354 re_seprint (rx
, effect
, fp
)
361 fputs (efnames
[-(int)effect
], fp
);
364 struct re_se_params
* p
= &dbug_rxb
->se_params
[(int)effect
];
365 fprintf (fp
, "%s(%d,%d)", efnames2
[p
->se
], p
->op1
, p
->op2
);
368 fprintf (fp
, "[complex op # %d]", (int)effect
);
372 /* These are so the regex.c regression tests will compile. */
374 print_compiled_pattern (rxb
)
375 struct re_pattern_buffer
* rxb
;
385 #endif /* RX_DEBUG */
389 /* This page: Bitsets. Completely unintersting. */
393 rx_bitset_is_equal (int size
, rx_Bitset a
, rx_Bitset b
)
396 rx_bitset_is_equal (size
, a
, b
)
406 for (x
= rx_bitset_numb_subsets(size
) - 1; a
[x
] == b
[x
]; --x
)
410 return !x
&& s
== a
[0];
415 rx_bitset_is_subset (int size
, rx_Bitset a
, rx_Bitset b
)
418 rx_bitset_is_subset (size
, a
, b
)
424 int x
= rx_bitset_numb_subsets(size
) - 1;
425 while (x
-- && (a
[x
] & b
[x
]) == a
[x
]);
432 rx_bitset_empty (int size
, rx_Bitset set
)
435 rx_bitset_empty (size
, set
)
441 RX_subset s
= set
[0];
443 for (x
= rx_bitset_numb_subsets(size
) - 1; !set
[x
]; --x
)
451 rx_bitset_null (int size
, rx_Bitset b
)
454 rx_bitset_null (size
, b
)
459 bzero (b
, rx_sizeof_bitset(size
));
465 rx_bitset_universe (int size
, rx_Bitset b
)
468 rx_bitset_universe (size
, b
)
473 int x
= rx_bitset_numb_subsets (size
);
475 *b
++ = ~(RX_subset
)0;
481 rx_bitset_complement (int size
, rx_Bitset b
)
484 rx_bitset_complement (size
, b
)
489 int x
= rx_bitset_numb_subsets (size
);
500 rx_bitset_assign (int size
, rx_Bitset a
, rx_Bitset b
)
503 rx_bitset_assign (size
, a
, b
)
510 for (x
= rx_bitset_numb_subsets(size
) - 1; x
>=0; --x
)
517 rx_bitset_union (int size
, rx_Bitset a
, rx_Bitset b
)
520 rx_bitset_union (size
, a
, b
)
527 for (x
= rx_bitset_numb_subsets(size
) - 1; x
>=0; --x
)
534 rx_bitset_intersection (int size
,
535 rx_Bitset a
, rx_Bitset b
)
538 rx_bitset_intersection (size
, a
, b
)
545 for (x
= rx_bitset_numb_subsets(size
) - 1; x
>=0; --x
)
552 rx_bitset_difference (int size
, rx_Bitset a
, rx_Bitset b
)
555 rx_bitset_difference (size
, a
, b
)
562 for (x
= rx_bitset_numb_subsets(size
) - 1; x
>=0; --x
)
569 rx_bitset_revdifference (int size
,
570 rx_Bitset a
, rx_Bitset b
)
573 rx_bitset_revdifference (size
, a
, b
)
580 for (x
= rx_bitset_numb_subsets(size
) - 1; x
>=0; --x
)
586 rx_bitset_xor (int size
, rx_Bitset a
, rx_Bitset b
)
589 rx_bitset_xor (size
, a
, b
)
596 for (x
= rx_bitset_numb_subsets(size
) - 1; x
>=0; --x
)
602 RX_DECL
unsigned long
603 rx_bitset_hash (int size
, rx_Bitset b
)
605 RX_DECL
unsigned long
606 rx_bitset_hash (size
, b
)
612 unsigned long hash
= (unsigned long)rx_bitset_hash
;
614 for (x
= rx_bitset_numb_subsets(size
) - 1; x
>= 0; --x
)
615 hash
^= rx_bitset_subset_val(b
, x
);
621 RX_DECL RX_subset rx_subset_singletons
[RX_subset_bits
] =
661 print_cset (struct rx
*rx
, rx_Bitset cset
, FILE * fp
)
664 print_cset (rx
, cset
, fp
)
672 for (x
= 0; x
< rx
->local_cset_size
; ++x
)
673 if (RX_bitset_member (cset
, x
))
678 fprintf (fp
, "\\0%o ", x
);
683 #endif /* RX_DEBUG */
687 static unsigned long rx_hash_masks
[4] =
698 RX_DECL
struct rx_hash_item
*
699 rx_hash_find (struct rx_hash
* table
,
702 struct rx_hash_rules
* rules
)
704 RX_DECL
struct rx_hash_item
*
705 rx_hash_find (table
, hash
, value
, rules
)
706 struct rx_hash
* table
;
709 struct rx_hash_rules
* rules
;
712 rx_hash_eq eq
= rules
->eq
;
714 long mask
= rx_hash_masks
[0];
715 int bucket
= (hash
& mask
) % 13;
717 while (table
->children
[bucket
])
719 table
= table
->children
[bucket
];
721 mask
= rx_hash_masks
[maskc
];
722 bucket
= (hash
& mask
) % 13;
726 struct rx_hash_item
* it
= table
->buckets
[bucket
];
728 if (eq (it
->data
, value
))
731 it
= it
->next_same_hash
;
739 RX_DECL
struct rx_hash_item
*
740 rx_hash_store (struct rx_hash
* table
,
743 struct rx_hash_rules
* rules
)
745 RX_DECL
struct rx_hash_item
*
746 rx_hash_store (table
, hash
, value
, rules
)
747 struct rx_hash
* table
;
750 struct rx_hash_rules
* rules
;
753 rx_hash_eq eq
= rules
->eq
;
755 long mask
= rx_hash_masks
[0];
756 int bucket
= (hash
& mask
) % 13;
759 while (table
->children
[bucket
])
761 table
= table
->children
[bucket
];
763 mask
= rx_hash_masks
[maskc
];
764 bucket
= (hash
& mask
) % 13;
769 struct rx_hash_item
* it
= table
->buckets
[bucket
];
771 if (eq (it
->data
, value
))
774 it
= it
->next_same_hash
;
779 && (table
->bucket_size
[bucket
] >= 4))
781 struct rx_hash
* newtab
= ((struct rx_hash
*)
782 rules
->hash_alloc (rules
));
785 bzero (newtab
, sizeof (*newtab
));
786 newtab
->parent
= table
;
788 struct rx_hash_item
* them
= table
->buckets
[bucket
];
789 unsigned long newmask
= rx_hash_masks
[maskc
+ 1];
792 struct rx_hash_item
* save
= them
->next_same_hash
;
793 int new_buck
= (them
->hash
& newmask
) % 13;
794 them
->next_same_hash
= newtab
->buckets
[new_buck
];
795 newtab
->buckets
[new_buck
] = them
;
796 them
->table
= newtab
;
798 ++newtab
->bucket_size
[new_buck
];
801 table
->refs
= (table
->refs
- table
->bucket_size
[bucket
] + 1);
802 table
->bucket_size
[bucket
] = 0;
803 table
->buckets
[bucket
] = 0;
804 table
->children
[bucket
] = newtab
;
806 bucket
= (hash
& newmask
) % 13;
812 struct rx_hash_item
* it
= ((struct rx_hash_item
*)
813 rules
->hash_item_alloc (rules
, value
));
818 /* DATA and BINDING are to be set in hash_item_alloc */
819 it
->next_same_hash
= table
->buckets
[bucket
];
820 table
->buckets
[bucket
] = it
;
821 ++table
->bucket_size
[bucket
];
830 rx_hash_free (struct rx_hash_item
* it
, struct rx_hash_rules
* rules
)
833 rx_hash_free (it
, rules
)
834 struct rx_hash_item
* it
;
835 struct rx_hash_rules
* rules
;
840 struct rx_hash
* table
= it
->table
;
841 unsigned long hash
= it
->hash
;
842 int depth
= (table
->parent
843 ? (table
->parent
->parent
844 ? (table
->parent
->parent
->parent
849 int bucket
= (hash
& rx_hash_masks
[depth
]) % 13;
850 struct rx_hash_item
** pos
= &table
->buckets
[bucket
];
853 pos
= &(*pos
)->next_same_hash
;
854 *pos
= it
->next_same_hash
;
855 rules
->free_hash_item (it
, rules
);
856 --table
->bucket_size
[bucket
];
858 while (!table
->refs
&& depth
)
860 struct rx_hash
* save
= table
;
861 table
= table
->parent
;
863 bucket
= (hash
& rx_hash_masks
[depth
]) % 13;
865 table
->children
[bucket
] = 0;
866 rules
->free_hash (save
, rules
);
873 rx_free_hash_table (struct rx_hash
* tab
, rx_hash_freefn freefn
,
874 struct rx_hash_rules
* rules
)
877 rx_free_hash_table (tab
, freefn
, rules
)
878 struct rx_hash
* tab
;
879 rx_hash_freefn freefn
;
880 struct rx_hash_rules
* rules
;
885 for (x
= 0; x
< 13; ++x
)
886 if (tab
->children
[x
])
888 rx_free_hash_table (tab
->children
[x
], freefn
, rules
);
889 rules
->free_hash (tab
->children
[x
], rules
);
893 struct rx_hash_item
* them
= tab
->buckets
[x
];
896 struct rx_hash_item
* that
= them
;
897 them
= that
->next_same_hash
;
899 rules
->free_hash_item (that
, rules
);
906 /* Utilities for manipulating bitset represntations of characters sets. */
910 rx_cset (struct rx
*rx
)
917 rx_Bitset b
= (rx_Bitset
) malloc (rx_sizeof_bitset (rx
->local_cset_size
));
919 rx_bitset_null (rx
->local_cset_size
, b
);
926 rx_copy_cset (struct rx
*rx
, rx_Bitset a
)
934 rx_Bitset cs
= rx_cset (rx
);
937 rx_bitset_union (rx
->local_cset_size
, cs
, a
);
945 rx_free_cset (struct rx
* rx
, rx_Bitset c
)
958 /* Hash table memory allocation policy for the regexp compiler */
961 static struct rx_hash
*
962 compiler_hash_alloc (struct rx_hash_rules
* rules
)
964 static struct rx_hash
*
965 compiler_hash_alloc (rules
)
966 struct rx_hash_rules
* rules
;
969 return (struct rx_hash
*)malloc (sizeof (struct rx_hash
));
974 static struct rx_hash_item
*
975 compiler_hash_item_alloc (struct rx_hash_rules
* rules
, void * value
)
977 static struct rx_hash_item
*
978 compiler_hash_item_alloc (rules
, value
)
979 struct rx_hash_rules
* rules
;
983 struct rx_hash_item
* it
;
984 it
= (struct rx_hash_item
*)malloc (sizeof (*it
));
996 compiler_free_hash (struct rx_hash
* tab
,
997 struct rx_hash_rules
* rules
)
1000 compiler_free_hash (tab
, rules
)
1001 struct rx_hash
* tab
;
1002 struct rx_hash_rules
* rules
;
1011 compiler_free_hash_item (struct rx_hash_item
* item
,
1012 struct rx_hash_rules
* rules
)
1015 compiler_free_hash_item (item
, rules
)
1016 struct rx_hash_item
* item
;
1017 struct rx_hash_rules
* rules
;
1020 free ((char *)item
);
1024 /* This page: REXP_NODE (expression tree) structures. */
1027 RX_DECL
struct rexp_node
*
1028 rexp_node (struct rx
*rx
,
1029 enum rexp_node_type type
)
1031 RX_DECL
struct rexp_node
*
1032 rexp_node (rx
, type
)
1034 enum rexp_node_type type
;
1037 struct rexp_node
*n
;
1039 n
= (struct rexp_node
*)malloc (sizeof (*n
));
1040 bzero (n
, sizeof (*n
));
1047 /* free_rexp_node assumes that the bitset passed to rx_mk_r_cset
1048 * can be freed using rx_free_cset.
1051 RX_DECL
struct rexp_node
*
1052 rx_mk_r_cset (struct rx
* rx
,
1055 RX_DECL
struct rexp_node
*
1056 rx_mk_r_cset (rx
, b
)
1061 struct rexp_node
* n
= rexp_node (rx
, r_cset
);
1069 RX_DECL
struct rexp_node
*
1070 rx_mk_r_concat (struct rx
* rx
,
1071 struct rexp_node
* a
,
1072 struct rexp_node
* b
)
1074 RX_DECL
struct rexp_node
*
1075 rx_mk_r_concat (rx
, a
, b
)
1077 struct rexp_node
* a
;
1078 struct rexp_node
* b
;
1081 struct rexp_node
* n
= rexp_node (rx
, r_concat
);
1084 n
->params
.pair
.left
= a
;
1085 n
->params
.pair
.right
= b
;
1092 RX_DECL
struct rexp_node
*
1093 rx_mk_r_alternate (struct rx
* rx
,
1094 struct rexp_node
* a
,
1095 struct rexp_node
* b
)
1097 RX_DECL
struct rexp_node
*
1098 rx_mk_r_alternate (rx
, a
, b
)
1100 struct rexp_node
* a
;
1101 struct rexp_node
* b
;
1104 struct rexp_node
* n
= rexp_node (rx
, r_alternate
);
1107 n
->params
.pair
.left
= a
;
1108 n
->params
.pair
.right
= b
;
1115 RX_DECL
struct rexp_node
*
1116 rx_mk_r_opt (struct rx
* rx
,
1117 struct rexp_node
* a
)
1119 RX_DECL
struct rexp_node
*
1122 struct rexp_node
* a
;
1125 struct rexp_node
* n
= rexp_node (rx
, r_opt
);
1128 n
->params
.pair
.left
= a
;
1129 n
->params
.pair
.right
= 0;
1136 RX_DECL
struct rexp_node
*
1137 rx_mk_r_star (struct rx
* rx
,
1138 struct rexp_node
* a
)
1140 RX_DECL
struct rexp_node
*
1141 rx_mk_r_star (rx
, a
)
1143 struct rexp_node
* a
;
1146 struct rexp_node
* n
= rexp_node (rx
, r_star
);
1149 n
->params
.pair
.left
= a
;
1150 n
->params
.pair
.right
= 0;
1157 RX_DECL
struct rexp_node
*
1158 rx_mk_r_2phase_star (struct rx
* rx
,
1159 struct rexp_node
* a
,
1160 struct rexp_node
* b
)
1162 RX_DECL
struct rexp_node
*
1163 rx_mk_r_2phase_star (rx
, a
, b
)
1165 struct rexp_node
* a
;
1166 struct rexp_node
* b
;
1169 struct rexp_node
* n
= rexp_node (rx
, r_2phase_star
);
1172 n
->params
.pair
.left
= a
;
1173 n
->params
.pair
.right
= b
;
1180 RX_DECL
struct rexp_node
*
1181 rx_mk_r_side_effect (struct rx
* rx
,
1184 RX_DECL
struct rexp_node
*
1185 rx_mk_r_side_effect (rx
, a
)
1190 struct rexp_node
* n
= rexp_node (rx
, r_side_effect
);
1193 n
->params
.side_effect
= a
;
1194 n
->params
.pair
.right
= 0;
1201 RX_DECL
struct rexp_node
*
1202 rx_mk_r_data (struct rx
* rx
,
1205 RX_DECL
struct rexp_node
*
1206 rx_mk_r_data (rx
, a
)
1211 struct rexp_node
* n
= rexp_node (rx
, r_data
);
1214 n
->params
.pair
.left
= a
;
1215 n
->params
.pair
.right
= 0;
1223 rx_free_rexp (struct rx
* rx
, struct rexp_node
* node
)
1226 rx_free_rexp (rx
, node
)
1228 struct rexp_node
* node
;
1236 if (node
->params
.cset
)
1237 rx_free_cset (rx
, node
->params
.cset
);
1247 rx_free_rexp (rx
, node
->params
.pair
.left
);
1248 rx_free_rexp (rx
, node
->params
.pair
.right
);
1252 /* This shouldn't occur. */
1255 free ((char *)node
);
1261 RX_DECL
struct rexp_node
*
1262 rx_copy_rexp (struct rx
*rx
,
1263 struct rexp_node
*node
)
1265 RX_DECL
struct rexp_node
*
1266 rx_copy_rexp (rx
, node
)
1268 struct rexp_node
*node
;
1275 struct rexp_node
*n
= rexp_node (rx
, node
->type
);
1281 n
->params
.cset
= rx_copy_cset (rx
, node
->params
.cset
);
1282 if (!n
->params
.cset
)
1284 rx_free_rexp (rx
, n
);
1290 n
->params
.side_effect
= node
->params
.side_effect
;
1298 n
->params
.pair
.left
=
1299 rx_copy_rexp (rx
, node
->params
.pair
.left
);
1300 n
->params
.pair
.right
=
1301 rx_copy_rexp (rx
, node
->params
.pair
.right
);
1302 if ( (node
->params
.pair
.left
&& !n
->params
.pair
.left
)
1303 || (node
->params
.pair
.right
&& !n
->params
.pair
.right
))
1305 rx_free_rexp (rx
, n
);
1310 /* shouldn't happen */
1319 /* This page: functions to build and destroy graphs that describe nfa's */
1321 /* Constructs a new nfa node. */
1323 RX_DECL
struct rx_nfa_state
*
1324 rx_nfa_state (struct rx
*rx
)
1326 RX_DECL
struct rx_nfa_state
*
1331 struct rx_nfa_state
* n
= (struct rx_nfa_state
*)malloc (sizeof (*n
));
1334 bzero (n
, sizeof (*n
));
1335 n
->next
= rx
->nfa_states
;
1343 rx_free_nfa_state (struct rx_nfa_state
* n
)
1346 rx_free_nfa_state (n
)
1347 struct rx_nfa_state
* n
;
1354 /* This looks up an nfa node, given a numeric id. Numeric id's are
1355 * assigned after the nfa has been built.
1358 RX_DECL
struct rx_nfa_state
*
1359 rx_id_to_nfa_state (struct rx
* rx
,
1362 RX_DECL
struct rx_nfa_state
*
1363 rx_id_to_nfa_state (rx
, id
)
1368 struct rx_nfa_state
* n
;
1369 for (n
= rx
->nfa_states
; n
; n
= n
->next
)
1376 /* This adds an edge between two nodes, but doesn't initialize the
1381 RX_DECL
struct rx_nfa_edge
*
1382 rx_nfa_edge (struct rx
*rx
,
1383 enum rx_nfa_etype type
,
1384 struct rx_nfa_state
*start
,
1385 struct rx_nfa_state
*dest
)
1387 RX_DECL
struct rx_nfa_edge
*
1388 rx_nfa_edge (rx
, type
, start
, dest
)
1390 enum rx_nfa_etype type
;
1391 struct rx_nfa_state
*start
;
1392 struct rx_nfa_state
*dest
;
1395 struct rx_nfa_edge
*e
;
1396 e
= (struct rx_nfa_edge
*)malloc (sizeof (*e
));
1399 e
->next
= start
->edges
;
1409 rx_free_nfa_edge (struct rx_nfa_edge
* e
)
1412 rx_free_nfa_edge (e
)
1413 struct rx_nfa_edge
* e
;
1420 /* This constructs a POSSIBLE_FUTURE, which is a kind epsilon-closure
1421 * of an NFA. These are added to an nfa automaticly by eclose_nfa.
1425 static struct rx_possible_future
*
1426 rx_possible_future (struct rx
* rx
,
1427 struct rx_se_list
* effects
)
1429 static struct rx_possible_future
*
1430 rx_possible_future (rx
, effects
)
1432 struct rx_se_list
* effects
;
1435 struct rx_possible_future
*ec
;
1436 ec
= (struct rx_possible_future
*) malloc (sizeof (*ec
));
1441 ec
->effects
= effects
;
1448 rx_free_possible_future (struct rx_possible_future
* pf
)
1451 rx_free_possible_future (pf
)
1452 struct rx_possible_future
* pf
;
1461 rx_free_nfa (struct rx
*rx
)
1468 while (rx
->nfa_states
)
1470 while (rx
->nfa_states
->edges
)
1472 switch (rx
->nfa_states
->edges
->type
)
1475 rx_free_cset (rx
, rx
->nfa_states
->edges
->params
.cset
);
1481 struct rx_nfa_edge
* e
;
1482 e
= rx
->nfa_states
->edges
;
1483 rx
->nfa_states
->edges
= rx
->nfa_states
->edges
->next
;
1484 rx_free_nfa_edge (e
);
1486 } /* while (rx->nfa_states->edges) */
1488 /* Iterate over the partial epsilon closures of rx->nfa_states */
1489 struct rx_possible_future
* pf
= rx
->nfa_states
->futures
;
1492 struct rx_possible_future
* pft
= pf
;
1494 rx_free_possible_future (pft
);
1498 struct rx_nfa_state
*n
;
1500 rx
->nfa_states
= rx
->nfa_states
->next
;
1501 rx_free_nfa_state (n
);
1508 /* This page: translating a pattern expression into an nfa and doing the
1509 * static part of the nfa->super-nfa translation.
1512 /* This is the thompson regexp->nfa algorithm.
1513 * It is modified to allow for `side-effect epsilons.' Those are
1514 * edges that are taken whenever a similar epsilon edge would be,
1515 * but which imply that some side effect occurs when the edge
1518 * Side effects are used to model parts of the pattern langauge
1519 * that are not regular (in the formal sense).
1524 rx_build_nfa (struct rx
*rx
,
1525 struct rexp_node
*rexp
,
1526 struct rx_nfa_state
**start
,
1527 struct rx_nfa_state
**end
)
1530 rx_build_nfa (rx
, rexp
, start
, end
)
1532 struct rexp_node
*rexp
;
1533 struct rx_nfa_state
**start
;
1534 struct rx_nfa_state
**end
;
1537 struct rx_nfa_edge
*edge
;
1539 /* Start & end nodes may have been allocated by the caller. */
1540 *start
= *start
? *start
: rx_nfa_state (rx
);
1551 *end
= *end
? *end
: rx_nfa_state (rx
);
1555 rx_free_nfa_state (*start
);
1565 edge
= rx_nfa_edge (rx
, ne_cset
, *start
, *end
);
1568 edge
->params
.cset
= rx_copy_cset (rx
, rexp
->params
.cset
);
1569 if (!edge
->params
.cset
)
1571 rx_free_nfa_edge (edge
);
1577 return (rx_build_nfa (rx
, rexp
->params
.pair
.left
, start
, end
)
1578 && rx_nfa_edge (rx
, ne_epsilon
, *start
, *end
));
1582 struct rx_nfa_state
* star_start
= 0;
1583 struct rx_nfa_state
* star_end
= 0;
1584 return (rx_build_nfa (rx
, rexp
->params
.pair
.left
,
1585 &star_start
, &star_end
)
1588 && rx_nfa_edge (rx
, ne_epsilon
, star_start
, star_end
)
1589 && rx_nfa_edge (rx
, ne_epsilon
, *start
, star_start
)
1590 && rx_nfa_edge (rx
, ne_epsilon
, star_end
, *end
)
1592 && rx_nfa_edge (rx
, ne_epsilon
, star_end
, star_start
));
1597 struct rx_nfa_state
* star_start
= 0;
1598 struct rx_nfa_state
* star_end
= 0;
1599 struct rx_nfa_state
* loop_exp_start
= 0;
1600 struct rx_nfa_state
* loop_exp_end
= 0;
1602 return (rx_build_nfa (rx
, rexp
->params
.pair
.left
,
1603 &star_start
, &star_end
)
1604 && rx_build_nfa (rx
, rexp
->params
.pair
.right
,
1605 &loop_exp_start
, &loop_exp_end
)
1610 && rx_nfa_edge (rx
, ne_epsilon
, star_start
, *end
)
1611 && rx_nfa_edge (rx
, ne_epsilon
, *start
, star_start
)
1612 && rx_nfa_edge (rx
, ne_epsilon
, star_end
, *end
)
1614 && rx_nfa_edge (rx
, ne_epsilon
, star_end
, loop_exp_start
)
1615 && rx_nfa_edge (rx
, ne_epsilon
, loop_exp_end
, star_start
));
1621 struct rx_nfa_state
*shared
= 0;
1623 (rx_build_nfa (rx
, rexp
->params
.pair
.left
, start
, &shared
)
1624 && rx_build_nfa (rx
, rexp
->params
.pair
.right
, &shared
, end
));
1629 struct rx_nfa_state
*ls
= 0;
1630 struct rx_nfa_state
*le
= 0;
1631 struct rx_nfa_state
*rs
= 0;
1632 struct rx_nfa_state
*re
= 0;
1633 return (rx_build_nfa (rx
, rexp
->params
.pair
.left
, &ls
, &le
)
1634 && rx_build_nfa (rx
, rexp
->params
.pair
.right
, &rs
, &re
)
1635 && rx_nfa_edge (rx
, ne_epsilon
, *start
, ls
)
1636 && rx_nfa_edge (rx
, ne_epsilon
, *start
, rs
)
1637 && rx_nfa_edge (rx
, ne_epsilon
, le
, *end
)
1638 && rx_nfa_edge (rx
, ne_epsilon
, re
, *end
));
1642 edge
= rx_nfa_edge (rx
, ne_side_effect
, *start
, *end
);
1645 edge
->params
.side_effect
= rexp
->params
.side_effect
;
1649 /* this should never happen */
1654 /* RX_NAME_NFA_STATES identifies all nodes with outgoing non-epsilon
1655 * transitions. Only these nodes can occur in super-states.
1656 * All nodes are given an integer id.
1657 * The id is non-negative if the node has non-epsilon out-transitions, negative
1658 * otherwise (this is because we want the non-negative ids to be used as
1659 * array indexes in a few places).
1664 rx_name_nfa_states (struct rx
*rx
)
1667 rx_name_nfa_states (rx
)
1671 struct rx_nfa_state
*n
= rx
->nfa_states
;
1678 struct rx_nfa_edge
*e
= n
->edges
;
1681 n
->eclosure_needed
= 1;
1688 case ne_side_effect
:
1692 n
->id
= rx
->nodec
++;
1694 struct rx_nfa_edge
*from_n
= n
->edges
;
1697 from_n
->dest
->eclosure_needed
= 1;
1698 from_n
= from_n
->next
;
1705 n
->id
= rx
->epsnodec
--;
1709 rx
->epsnodec
= -rx
->epsnodec
;
1713 /* This page: data structures for the static part of the nfa->supernfa
1716 * There are side effect lists -- lists of side effects occuring
1717 * along an uninterrupted, acyclic path of side-effect epsilon edges.
1718 * Such paths are collapsed to single edges in the course of computing
1719 * epsilon closures. Such single edges are labled with a list of all
1720 * the side effects entailed in crossing them. Like lists of side
1721 * effects are made == by the constructors below.
1723 * There are also nfa state sets. These are used to hold a list of all
1724 * states reachable from a starting state for a given type of transition
1725 * and side effect list. These are also hash-consed.
1728 /* The next several functions compare, construct, etc. lists of side
1729 * effects. See ECLOSE_NFA (below) for details.
1732 /* Ordering of rx_se_list
1733 * (-1, 0, 1 return value convention).
1738 se_list_cmp (void * va
, void * vb
)
1741 se_list_cmp (va
, vb
)
1746 struct rx_se_list
* a
= (struct rx_se_list
*)va
;
1747 struct rx_se_list
* b
= (struct rx_se_list
*)vb
;
1755 : ((long)a
->car
< (long)b
->car
1757 : ((long)a
->car
> (long)b
->car
1759 : se_list_cmp ((void *)a
->cdr
, (void *)b
->cdr
))))));
1765 se_list_equal (void * va
, void * vb
)
1768 se_list_equal (va
, vb
)
1773 return !(se_list_cmp (va
, vb
));
1776 static struct rx_hash_rules se_list_hash_rules
=
1779 compiler_hash_alloc
,
1781 compiler_hash_item_alloc
,
1782 compiler_free_hash_item
1787 static struct rx_se_list
*
1788 side_effect_cons (struct rx
* rx
,
1789 void * se
, struct rx_se_list
* list
)
1791 static struct rx_se_list
*
1792 side_effect_cons (rx
, se
, list
)
1795 struct rx_se_list
* list
;
1798 struct rx_se_list
* l
;
1799 l
= ((struct rx_se_list
*) malloc (sizeof (*l
)));
1809 static struct rx_se_list
*
1810 hash_cons_se_prog (struct rx
* rx
,
1811 struct rx_hash
* memo
,
1812 void * car
, struct rx_se_list
* cdr
)
1814 static struct rx_se_list
*
1815 hash_cons_se_prog (rx
, memo
, car
, cdr
)
1817 struct rx_hash
* memo
;
1819 struct rx_se_list
* cdr
;
1822 long hash
= (long)car
^ (long)cdr
;
1823 struct rx_se_list
template;
1828 struct rx_hash_item
* it
= rx_hash_store (memo
, hash
,
1830 &se_list_hash_rules
);
1833 if (it
->data
== (void *)&template)
1835 struct rx_se_list
* consed
;
1836 consed
= (struct rx_se_list
*) malloc (sizeof (*consed
));
1838 it
->data
= (void *)consed
;
1840 return (struct rx_se_list
*)it
->data
;
1846 static struct rx_se_list
*
1847 hash_se_prog (struct rx
* rx
, struct rx_hash
* memo
, struct rx_se_list
* prog
)
1849 static struct rx_se_list
*
1850 hash_se_prog (rx
, memo
, prog
)
1852 struct rx_hash
* memo
;
1853 struct rx_se_list
* prog
;
1856 struct rx_se_list
* answer
= 0;
1859 answer
= hash_cons_se_prog (rx
, memo
, prog
->car
, answer
);
1869 nfa_set_cmp (void * va
, void * vb
)
1872 nfa_set_cmp (va
, vb
)
1877 struct rx_nfa_state_set
* a
= (struct rx_nfa_state_set
*)va
;
1878 struct rx_nfa_state_set
* b
= (struct rx_nfa_state_set
*)vb
;
1886 : (a
->car
->id
< b
->car
->id
1888 : (a
->car
->id
> b
->car
->id
1890 : nfa_set_cmp ((void *)a
->cdr
, (void *)b
->cdr
))))));
1895 nfa_set_equal (void * va
, void * vb
)
1898 nfa_set_equal (va
, vb
)
1903 return !nfa_set_cmp (va
, vb
);
1906 static struct rx_hash_rules nfa_set_hash_rules
=
1909 compiler_hash_alloc
,
1911 compiler_hash_item_alloc
,
1912 compiler_free_hash_item
1917 static struct rx_nfa_state_set
*
1918 nfa_set_cons (struct rx
* rx
,
1919 struct rx_hash
* memo
, struct rx_nfa_state
* state
,
1920 struct rx_nfa_state_set
* set
)
1922 static struct rx_nfa_state_set
*
1923 nfa_set_cons (rx
, memo
, state
, set
)
1925 struct rx_hash
* memo
;
1926 struct rx_nfa_state
* state
;
1927 struct rx_nfa_state_set
* set
;
1930 struct rx_nfa_state_set
template;
1931 struct rx_hash_item
* node
;
1932 template.car
= state
;
1934 node
= rx_hash_store (memo
,
1935 (((long)state
) >> 8) ^ (long)set
,
1936 &template, &nfa_set_hash_rules
);
1939 if (node
->data
== &template)
1941 struct rx_nfa_state_set
* l
;
1942 l
= (struct rx_nfa_state_set
*) malloc (sizeof (*l
));
1943 node
->data
= (void *) l
;
1948 return (struct rx_nfa_state_set
*)node
->data
;
1953 static struct rx_nfa_state_set
*
1954 nfa_set_enjoin (struct rx
* rx
,
1955 struct rx_hash
* memo
, struct rx_nfa_state
* state
,
1956 struct rx_nfa_state_set
* set
)
1958 static struct rx_nfa_state_set
*
1959 nfa_set_enjoin (rx
, memo
, state
, set
)
1961 struct rx_hash
* memo
;
1962 struct rx_nfa_state
* state
;
1963 struct rx_nfa_state_set
* set
;
1966 if (!set
|| state
->id
< set
->car
->id
)
1967 return nfa_set_cons (rx
, memo
, state
, set
);
1968 if (state
->id
== set
->car
->id
)
1972 struct rx_nfa_state_set
* newcdr
1973 = nfa_set_enjoin (rx
, memo
, state
, set
->cdr
);
1974 if (newcdr
!= set
->cdr
)
1975 set
= nfa_set_cons (rx
, memo
, set
->car
, newcdr
);
1982 /* This page: computing epsilon closures. The closures aren't total.
1983 * Each node's closures are partitioned according to the side effects entailed
1984 * along the epsilon edges. Return true on success.
1989 struct rx_se_list
*prog_backwards
;
1995 eclose_node (struct rx
*rx
, struct rx_nfa_state
*outnode
,
1996 struct rx_nfa_state
*node
, struct eclose_frame
*frame
)
1999 eclose_node (rx
, outnode
, node
, frame
)
2001 struct rx_nfa_state
*outnode
;
2002 struct rx_nfa_state
*node
;
2003 struct eclose_frame
*frame
;
2006 struct rx_nfa_edge
*e
= node
->edges
;
2008 /* For each node, we follow all epsilon paths to build the closure.
2009 * The closure omits nodes that have only epsilon edges.
2010 * The closure is split into partial closures -- all the states in
2011 * a partial closure are reached by crossing the same list of
2012 * of side effects (though not necessarily the same path).
2018 if (node
->id
>= 0 || node
->is_final
)
2020 struct rx_possible_future
**ec
;
2021 struct rx_se_list
* prog_in_order
2022 = ((struct rx_se_list
*)hash_se_prog (rx
,
2024 frame
->prog_backwards
));
2027 ec
= &outnode
->futures
;
2031 cmp
= se_list_cmp ((void *)(*ec
)->effects
, (void *)prog_in_order
);
2036 if (!*ec
|| (cmp
< 0))
2038 struct rx_possible_future
* saved
= *ec
;
2039 *ec
= rx_possible_future (rx
, prog_in_order
);
2040 (*ec
)->next
= saved
;
2046 (*ec
)->destset
= nfa_set_enjoin (rx
, &rx
->set_list_memo
,
2047 node
, (*ec
)->destset
);
2048 if (!(*ec
)->destset
)
2058 if (!eclose_node (rx
, outnode
, e
->dest
, frame
))
2061 case ne_side_effect
:
2063 frame
->prog_backwards
= side_effect_cons (rx
,
2064 e
->params
.side_effect
,
2065 frame
->prog_backwards
);
2066 if (!frame
->prog_backwards
)
2068 if (!eclose_node (rx
, outnode
, e
->dest
, frame
))
2071 struct rx_se_list
* dying
= frame
->prog_backwards
;
2072 frame
->prog_backwards
= frame
->prog_backwards
->cdr
;
2073 free ((char *)dying
);
2089 rx_eclose_nfa (struct rx
*rx
)
2096 struct rx_nfa_state
*n
= rx
->nfa_states
;
2097 struct eclose_frame frame
;
2098 static int rx_id
= 0;
2100 frame
.prog_backwards
= 0;
2101 rx
->rx_id
= rx_id
++;
2102 bzero (&rx
->se_list_memo
, sizeof (rx
->se_list_memo
));
2103 bzero (&rx
->set_list_memo
, sizeof (rx
->set_list_memo
));
2107 if (n
->eclosure_needed
&& !eclose_node (rx
, n
, n
, &frame
))
2109 /* clear_marks (rx); */
2116 /* This deletes epsilon edges from an NFA. After running eclose_node,
2117 * we have no more need for these edges. They are removed to simplify
2118 * further operations on the NFA.
2123 rx_delete_epsilon_transitions (struct rx
*rx
)
2126 rx_delete_epsilon_transitions (rx
)
2130 struct rx_nfa_state
*n
= rx
->nfa_states
;
2131 struct rx_nfa_edge
**e
;
2138 struct rx_nfa_edge
*t
;
2142 case ne_side_effect
:
2145 rx_free_nfa_edge (t
);
2158 /* This page: storing the nfa in a contiguous region of memory for
2159 * subsequent conversion to a super-nfa.
2162 /* This is for qsort on an array of nfa_states. The order
2163 * is based on state ids and goes
2164 * [0...MAX][MIN..-1] where (MAX>=0) and (MIN<0)
2165 * This way, positive ids double as array indices.
2170 nfacmp (void * va
, void * vb
)
2178 struct rx_nfa_state
**a
= (struct rx_nfa_state
**)va
;
2179 struct rx_nfa_state
**b
= (struct rx_nfa_state
**)vb
;
2180 return (*a
== *b
/* &&&& 3.18 */
2182 : (((*a
)->id
< 0) == ((*b
)->id
< 0)
2183 ? (((*a
)->id
< (*b
)->id
) ? -1 : 1)
2190 count_hash_nodes (struct rx_hash
* st
)
2193 count_hash_nodes (st
)
2194 struct rx_hash
* st
;
2199 for (x
= 0; x
< 13; ++x
)
2200 count
+= ((st
->children
[x
])
2201 ? count_hash_nodes (st
->children
[x
])
2202 : st
->bucket_size
[x
]);
2210 se_memo_freer (struct rx_hash_item
* node
)
2213 se_memo_freer (node
)
2214 struct rx_hash_item
* node
;
2217 free ((char *)node
->data
);
2223 nfa_set_freer (struct rx_hash_item
* node
)
2226 nfa_set_freer (node
)
2227 struct rx_hash_item
* node
;
2230 free ((char *)node
->data
);
2234 /* This copies an entire NFA into a single malloced block of memory.
2235 * Mostly this is for compatability with regex.c, though it is convenient
2236 * to have the nfa nodes in an array.
2241 rx_compactify_nfa (struct rx
*rx
,
2242 void **mem
, unsigned long *size
)
2245 rx_compactify_nfa (rx
, mem
, size
)
2248 unsigned long *size
;
2252 struct rx_nfa_state
*n
;
2255 int se_list_consc
= count_hash_nodes (&rx
->se_list_memo
);
2256 int nfa_setc
= count_hash_nodes (&rx
->set_list_memo
);
2257 unsigned long total_size
;
2259 /* This takes place in two stages. First, the total size of the
2260 * nfa is computed, then structures are copied.
2266 struct rx_nfa_edge
*e
= n
->edges
;
2267 struct rx_possible_future
*ec
= n
->futures
;
2282 total_size
= (total_nodec
* sizeof (struct rx_nfa_state
)
2283 + edgec
* rx_sizeof_bitset (rx
->local_cset_size
)
2284 + edgec
* sizeof (struct rx_nfa_edge
)
2285 + nfa_setc
* sizeof (struct rx_nfa_state_set
)
2286 + eclosec
* sizeof (struct rx_possible_future
)
2287 + se_list_consc
* sizeof (struct rx_se_list
)
2290 if (total_size
> *size
)
2292 *mem
= remalloc (*mem
, total_size
);
2298 /* Now we've allocated the memory; this copies the NFA. */
2300 static struct rx_nfa_state
**scratch
= 0;
2301 static int scratch_alloc
= 0;
2302 struct rx_nfa_state
*state_base
= (struct rx_nfa_state
*) * mem
;
2303 struct rx_nfa_state
*new_state
= state_base
;
2304 struct rx_nfa_edge
*new_edge
=
2305 (struct rx_nfa_edge
*)
2306 ((char *) state_base
+ total_nodec
* sizeof (struct rx_nfa_state
));
2307 struct rx_se_list
* new_se_list
=
2308 (struct rx_se_list
*)
2309 ((char *)new_edge
+ edgec
* sizeof (struct rx_nfa_edge
));
2310 struct rx_possible_future
*new_close
=
2311 ((struct rx_possible_future
*)
2312 ((char *) new_se_list
2313 + se_list_consc
* sizeof (struct rx_se_list
)));
2314 struct rx_nfa_state_set
* new_nfa_set
=
2315 ((struct rx_nfa_state_set
*)
2316 ((char *)new_close
+ eclosec
* sizeof (struct rx_possible_future
)));
2318 ((char *) new_nfa_set
+ nfa_setc
* sizeof (struct rx_nfa_state_set
));
2320 struct rx_nfa_state
*n
;
2322 if (scratch_alloc
< total_nodec
)
2324 scratch
= ((struct rx_nfa_state
**)
2325 remalloc (scratch
, total_nodec
* sizeof (*scratch
)));
2327 scratch_alloc
= total_nodec
;
2335 for (x
= 0, n
= rx
->nfa_states
; n
; n
= n
->next
)
2338 qsort (scratch
, total_nodec
,
2339 sizeof (struct rx_nfa_state
*), (int (*)())nfacmp
);
2341 for (x
= 0; x
< total_nodec
; ++x
)
2343 struct rx_possible_future
*eclose
= scratch
[x
]->futures
;
2344 struct rx_nfa_edge
*edge
= scratch
[x
]->edges
;
2345 struct rx_nfa_state
*cn
= new_state
++;
2348 cn
->next
= (x
== total_nodec
- 1) ? 0 : (cn
+ 1);
2349 cn
->id
= scratch
[x
]->id
;
2350 cn
->is_final
= scratch
[x
]->is_final
;
2351 cn
->is_start
= scratch
[x
]->is_start
;
2355 int indx
= (edge
->dest
->id
< 0
2356 ? (total_nodec
+ edge
->dest
->id
)
2358 struct rx_nfa_edge
*e
= new_edge
++;
2359 rx_Bitset cset
= (rx_Bitset
) new_bitset
;
2360 new_bitset
+= rx_sizeof_bitset (rx
->local_cset_size
);
2361 rx_bitset_null (rx
->local_cset_size
, cset
);
2362 rx_bitset_union (rx
->local_cset_size
, cset
, edge
->params
.cset
);
2363 e
->next
= cn
->edges
;
2365 e
->type
= edge
->type
;
2366 e
->dest
= state_base
+ indx
;
2367 e
->params
.cset
= cset
;
2372 struct rx_possible_future
*ec
= new_close
++;
2373 struct rx_hash_item
* sp
;
2374 struct rx_se_list
** sepos
;
2375 struct rx_se_list
* sesrc
;
2376 struct rx_nfa_state_set
* destlst
;
2377 struct rx_nfa_state_set
** destpos
;
2378 ec
->next
= cn
->futures
;
2380 for (sepos
= &ec
->effects
, sesrc
= eclose
->effects
;
2382 sesrc
= sesrc
->cdr
, sepos
= &(*sepos
)->cdr
)
2384 sp
= rx_hash_find (&rx
->se_list_memo
,
2385 (long)sesrc
->car
^ (long)sesrc
->cdr
,
2386 sesrc
, &se_list_hash_rules
);
2389 sesrc
= (struct rx_se_list
*)sp
->binding
;
2392 *new_se_list
= *sesrc
;
2393 sp
->binding
= (void *)new_se_list
;
2394 *sepos
= new_se_list
;
2398 for (destpos
= &ec
->destset
, destlst
= eclose
->destset
;
2400 destpos
= &(*destpos
)->cdr
, destlst
= destlst
->cdr
)
2402 sp
= rx_hash_find (&rx
->set_list_memo
,
2403 ((((long)destlst
->car
) >> 8)
2404 ^ (long)destlst
->cdr
),
2405 destlst
, &nfa_set_hash_rules
);
2408 destlst
= (struct rx_nfa_state_set
*)sp
->binding
;
2411 *new_nfa_set
= *destlst
;
2412 new_nfa_set
->car
= state_base
+ destlst
->car
->id
;
2413 sp
->binding
= (void *)new_nfa_set
;
2414 *destpos
= new_nfa_set
;
2418 eclose
= eclose
->next
;
2422 rx_free_hash_table (&rx
->se_list_memo
, se_memo_freer
, &se_list_hash_rules
);
2423 bzero (&rx
->se_list_memo
, sizeof (rx
->se_list_memo
));
2424 rx_free_hash_table (&rx
->set_list_memo
, nfa_set_freer
, &nfa_set_hash_rules
);
2425 bzero (&rx
->set_list_memo
, sizeof (rx
->set_list_memo
));
2428 rx
->nfa_states
= (struct rx_nfa_state
*)*mem
;
2433 /* The functions in the next several pages define the lazy-NFA-conversion used
2434 * by matchers. The input to this construction is an NFA such as
2435 * is built by compactify_nfa (rx.c). The output is the superNFA.
2438 /* Match engines can use arbitrary values for opcodes. So, the parse tree
2439 * is built using instructions names (enum rx_opcode), but the superstate
2440 * nfa is populated with mystery opcodes (void *).
2442 * For convenience, here is an id table. The opcodes are == to their inxs
2444 * The lables in re_search_2 would make good values for instructions.
2447 void * rx_id_instruction_table
[rx_num_instructions
] =
2449 (void *) rx_backtrack_point
,
2450 (void *) rx_do_side_effects
,
2451 (void *) rx_cache_miss
,
2452 (void *) rx_next_char
,
2453 (void *) rx_backtrack
,
2454 (void *) rx_error_inx
2459 /* Memory mgt. for superstate graphs. */
2463 rx_cache_malloc (struct rx_cache
* cache
, int bytes
)
2466 rx_cache_malloc (cache
, bytes
)
2467 struct rx_cache
* cache
;
2471 while (cache
->bytes_left
< bytes
)
2473 if (cache
->memory_pos
)
2474 cache
->memory_pos
= cache
->memory_pos
->next
;
2475 if (!cache
->memory_pos
)
2477 cache
->morecore (cache
);
2478 if (!cache
->memory_pos
)
2481 cache
->bytes_left
= cache
->memory_pos
->bytes
;
2482 cache
->memory_addr
= ((char *)cache
->memory_pos
2483 + sizeof (struct rx_blocklist
));
2485 cache
->bytes_left
-= bytes
;
2487 char * addr
= cache
->memory_addr
;
2488 cache
->memory_addr
+= bytes
;
2495 rx_cache_free (struct rx_cache
* cache
,
2496 struct rx_freelist
** freelist
, char * mem
)
2499 rx_cache_free (cache
, freelist
, mem
)
2500 struct rx_cache
* cache
;
2501 struct rx_freelist
** freelist
;
2505 struct rx_freelist
* it
= (struct rx_freelist
*)mem
;
2506 it
->next
= *freelist
;
2511 /* The partially instantiated superstate graph has a transition
2512 * table at every node. There is one entry for every character.
2513 * This fills in the transition for a set.
2517 install_transition (struct rx_superstate
*super
,
2518 struct rx_inx
*answer
, rx_Bitset trcset
)
2521 install_transition (super
, answer
, trcset
)
2522 struct rx_superstate
*super
;
2523 struct rx_inx
*answer
;
2527 struct rx_inx
* transitions
= super
->transitions
;
2529 for (chr
= 0; chr
< 256; )
2537 RX_subset sub
= *trcset
;
2539 int bound
= chr
+ 32;
2543 transitions
[chr
] = *answer
;
2554 qlen (struct rx_superstate
* q
)
2558 struct rx_superstate
* q
;
2562 struct rx_superstate
* it
;
2565 for (it
= q
->next_recyclable
; it
!= q
; it
= it
->next_recyclable
)
2572 check_cache (struct rx_cache
* cache
)
2576 struct rx_cache
* cache
;
2579 struct rx_cache
* you_fucked_up
= 0;
2580 int total
= cache
->superstates
;
2581 int semi
= cache
->semifree_superstates
;
2582 if (semi
!= qlen (cache
->semifree_superstate
))
2583 check_cache (you_fucked_up
);
2584 if ((total
- semi
) != qlen (cache
->lru_superstate
))
2585 check_cache (you_fucked_up
);
2588 /* When a superstate is old and neglected, it can enter a
2589 * semi-free state. A semi-free state is slated to die.
2590 * Incoming transitions to a semi-free state are re-written
2591 * to cause an (interpreted) fault when they are taken.
2592 * The fault handler revives the semi-free state, patches
2593 * incoming transitions back to normal, and continues.
2595 * The idea is basicly to free in two stages, aborting
2596 * between the two if the state turns out to be useful again.
2597 * When a free is aborted, the rescued superstate is placed
2598 * in the most-favored slot to maximize the time until it
2599 * is next semi-freed.
2604 semifree_superstate (struct rx_cache
* cache
)
2607 semifree_superstate (cache
)
2608 struct rx_cache
* cache
;
2611 int disqualified
= cache
->semifree_superstates
;
2612 if (disqualified
== cache
->superstates
)
2614 while (cache
->lru_superstate
->locks
)
2616 cache
->lru_superstate
= cache
->lru_superstate
->next_recyclable
;
2618 if (disqualified
== cache
->superstates
)
2622 struct rx_superstate
* it
= cache
->lru_superstate
;
2623 it
->next_recyclable
->prev_recyclable
= it
->prev_recyclable
;
2624 it
->prev_recyclable
->next_recyclable
= it
->next_recyclable
;
2625 cache
->lru_superstate
= (it
== it
->next_recyclable
2627 : it
->next_recyclable
);
2628 if (!cache
->semifree_superstate
)
2630 cache
->semifree_superstate
= it
;
2631 it
->next_recyclable
= it
;
2632 it
->prev_recyclable
= it
;
2636 it
->prev_recyclable
= cache
->semifree_superstate
->prev_recyclable
;
2637 it
->next_recyclable
= cache
->semifree_superstate
;
2638 it
->prev_recyclable
->next_recyclable
= it
;
2639 it
->next_recyclable
->prev_recyclable
= it
;
2642 struct rx_distinct_future
*df
;
2643 it
->is_semifree
= 1;
2644 ++cache
->semifree_superstates
;
2645 df
= it
->transition_refs
;
2648 df
->prev_same_dest
->next_same_dest
= 0;
2649 for (df
= it
->transition_refs
; df
; df
= df
->next_same_dest
)
2651 df
->future_frame
.inx
= cache
->instruction_table
[rx_cache_miss
];
2652 df
->future_frame
.data
= 0;
2653 df
->future_frame
.data_2
= (void *) df
;
2654 /* If there are any NEXT-CHAR instruction frames that
2655 * refer to this state, we convert them to CACHE-MISS frames.
2658 && (df
->edge
->options
->next_same_super_edge
[0]
2659 == df
->edge
->options
))
2660 install_transition (df
->present
, &df
->future_frame
,
2663 df
= it
->transition_refs
;
2664 df
->prev_same_dest
->next_same_dest
= df
;
2673 refresh_semifree_superstate (struct rx_cache
* cache
,
2674 struct rx_superstate
* super
)
2677 refresh_semifree_superstate (cache
, super
)
2678 struct rx_cache
* cache
;
2679 struct rx_superstate
* super
;
2682 struct rx_distinct_future
*df
;
2684 if (super
->transition_refs
)
2686 super
->transition_refs
->prev_same_dest
->next_same_dest
= 0;
2687 for (df
= super
->transition_refs
; df
; df
= df
->next_same_dest
)
2689 df
->future_frame
.inx
= cache
->instruction_table
[rx_next_char
];
2690 df
->future_frame
.data
= (void *) super
->transitions
;
2691 /* CACHE-MISS instruction frames that refer to this state,
2692 * must be converted to NEXT-CHAR frames.
2695 && (df
->edge
->options
->next_same_super_edge
[0]
2696 == df
->edge
->options
))
2697 install_transition (df
->present
, &df
->future_frame
,
2700 super
->transition_refs
->prev_same_dest
->next_same_dest
2701 = super
->transition_refs
;
2703 if (cache
->semifree_superstate
== super
)
2704 cache
->semifree_superstate
= (super
->prev_recyclable
== super
2706 : super
->prev_recyclable
);
2707 super
->next_recyclable
->prev_recyclable
= super
->prev_recyclable
;
2708 super
->prev_recyclable
->next_recyclable
= super
->next_recyclable
;
2710 if (!cache
->lru_superstate
)
2711 (cache
->lru_superstate
2712 = super
->next_recyclable
2713 = super
->prev_recyclable
2717 super
->next_recyclable
= cache
->lru_superstate
;
2718 super
->prev_recyclable
= cache
->lru_superstate
->prev_recyclable
;
2719 super
->next_recyclable
->prev_recyclable
= super
;
2720 super
->prev_recyclable
->next_recyclable
= super
;
2722 super
->is_semifree
= 0;
2723 --cache
->semifree_superstates
;
2728 rx_refresh_this_superstate (struct rx_cache
* cache
, struct rx_superstate
* superstate
)
2731 rx_refresh_this_superstate (cache
, superstate
)
2732 struct rx_cache
* cache
;
2733 struct rx_superstate
* superstate
;
2736 if (superstate
->is_semifree
)
2737 refresh_semifree_superstate (cache
, superstate
);
2738 else if (cache
->lru_superstate
== superstate
)
2739 cache
->lru_superstate
= superstate
->next_recyclable
;
2740 else if (superstate
!= cache
->lru_superstate
->prev_recyclable
)
2742 superstate
->next_recyclable
->prev_recyclable
2743 = superstate
->prev_recyclable
;
2744 superstate
->prev_recyclable
->next_recyclable
2745 = superstate
->next_recyclable
;
2746 superstate
->next_recyclable
= cache
->lru_superstate
;
2747 superstate
->prev_recyclable
= cache
->lru_superstate
->prev_recyclable
;
2748 superstate
->next_recyclable
->prev_recyclable
= superstate
;
2749 superstate
->prev_recyclable
->next_recyclable
= superstate
;
2755 release_superset_low (struct rx_cache
* cache
,
2756 struct rx_superset
*set
)
2759 release_superset_low (cache
, set
)
2760 struct rx_cache
* cache
;
2761 struct rx_superset
*set
;
2767 release_superset_low (cache
, set
->cdr
);
2769 set
->starts_for
= 0;
2773 (&cache
->superset_table
,
2774 (unsigned long)set
->car
^ set
->id
^ (unsigned long)set
->cdr
,
2776 &cache
->superset_hash_rules
),
2777 &cache
->superset_hash_rules
);
2778 rx_cache_free (cache
, &cache
->free_supersets
, (char *)set
);
2784 rx_release_superset (struct rx
*rx
,
2785 struct rx_superset
*set
)
2788 rx_release_superset (rx
, set
)
2790 struct rx_superset
*set
;
2793 release_superset_low (rx
->cache
, set
);
2796 /* This tries to add a new superstate to the superstate freelist.
2797 * It might, as a result, free some edge pieces or hash tables.
2798 * If nothing can be freed because too many locks are being held, fail.
2803 rx_really_free_superstate (struct rx_cache
* cache
)
2806 rx_really_free_superstate (cache
)
2807 struct rx_cache
* cache
;
2810 int locked_superstates
= 0;
2811 struct rx_superstate
* it
;
2813 if (!cache
->superstates
)
2817 /* This is a total guess. The idea is that we should expect as
2818 * many misses as we've recently experienced. I.e., cache->misses
2819 * should be the same as cache->semifree_superstates.
2821 while ((cache
->hits
+ cache
->misses
) > cache
->superstates_allowed
)
2824 cache
->misses
>>= 1;
2826 if ( ((cache
->hits
+ cache
->misses
) * cache
->semifree_superstates
)
2827 < (cache
->superstates
* cache
->misses
))
2829 semifree_superstate (cache
);
2830 semifree_superstate (cache
);
2834 while (cache
->semifree_superstate
&& cache
->semifree_superstate
->locks
)
2836 refresh_semifree_superstate (cache
, cache
->semifree_superstate
);
2837 ++locked_superstates
;
2838 if (locked_superstates
== cache
->superstates
)
2842 if (cache
->semifree_superstate
)
2844 it
= cache
->semifree_superstate
;
2845 it
->next_recyclable
->prev_recyclable
= it
->prev_recyclable
;
2846 it
->prev_recyclable
->next_recyclable
= it
->next_recyclable
;
2847 cache
->semifree_superstate
= ((it
== it
->next_recyclable
)
2849 : it
->next_recyclable
);
2850 --cache
->semifree_superstates
;
2854 while (cache
->lru_superstate
->locks
)
2856 cache
->lru_superstate
= cache
->lru_superstate
->next_recyclable
;
2857 ++locked_superstates
;
2858 if (locked_superstates
== cache
->superstates
)
2861 it
= cache
->lru_superstate
;
2862 it
->next_recyclable
->prev_recyclable
= it
->prev_recyclable
;
2863 it
->prev_recyclable
->next_recyclable
= it
->next_recyclable
;
2864 cache
->lru_superstate
= ((it
== it
->next_recyclable
)
2866 : it
->next_recyclable
);
2869 if (it
->transition_refs
)
2871 struct rx_distinct_future
*df
;
2872 for (df
= it
->transition_refs
,
2873 df
->prev_same_dest
->next_same_dest
= 0;
2875 df
= df
->next_same_dest
)
2877 df
->future_frame
.inx
= cache
->instruction_table
[rx_cache_miss
];
2878 df
->future_frame
.data
= 0;
2879 df
->future_frame
.data_2
= (void *) df
;
2882 it
->transition_refs
->prev_same_dest
->next_same_dest
=
2883 it
->transition_refs
;
2886 struct rx_super_edge
*tc
= it
->edges
;
2889 struct rx_distinct_future
* df
;
2890 struct rx_super_edge
*tct
= tc
->next
;
2892 df
->next_same_super_edge
[1]->next_same_super_edge
[0] = 0;
2895 struct rx_distinct_future
*dft
= df
;
2896 df
= df
->next_same_super_edge
[0];
2899 if (dft
->future
&& dft
->future
->transition_refs
== dft
)
2901 dft
->future
->transition_refs
= dft
->next_same_dest
;
2902 if (dft
->future
->transition_refs
== dft
)
2903 dft
->future
->transition_refs
= 0;
2905 dft
->next_same_dest
->prev_same_dest
= dft
->prev_same_dest
;
2906 dft
->prev_same_dest
->next_same_dest
= dft
->next_same_dest
;
2907 rx_cache_free (cache
, &cache
->free_discernable_futures
,
2910 rx_cache_free (cache
, &cache
->free_transition_classes
, (char *)tc
);
2915 if (it
->contents
->superstate
== it
)
2916 it
->contents
->superstate
= 0;
2917 release_superset_low (cache
, it
->contents
);
2918 rx_cache_free (cache
, &cache
->free_superstates
, (char *)it
);
2919 --cache
->superstates
;
2925 rx_cache_get (struct rx_cache
* cache
,
2926 struct rx_freelist
** freelist
)
2929 rx_cache_get (cache
, freelist
)
2930 struct rx_cache
* cache
;
2931 struct rx_freelist
** freelist
;
2934 while (!*freelist
&& rx_really_free_superstate (cache
))
2939 struct rx_freelist
* it
= *freelist
;
2940 *freelist
= it
->next
;
2947 rx_cache_malloc_or_get (struct rx_cache
* cache
,
2948 struct rx_freelist
** freelist
, int bytes
)
2951 rx_cache_malloc_or_get (cache
, freelist
, bytes
)
2952 struct rx_cache
* cache
;
2953 struct rx_freelist
** freelist
;
2959 char * answer
= rx_cache_malloc (cache
, bytes
);
2964 return rx_cache_get (cache
, freelist
);
2969 rx_cache_get_superstate (struct rx_cache
* cache
)
2972 rx_cache_get_superstate (cache
)
2973 struct rx_cache
* cache
;
2977 int bytes
= ( sizeof (struct rx_superstate
)
2978 + cache
->local_cset_size
* sizeof (struct rx_inx
));
2979 if (!cache
->free_superstates
2980 && (cache
->superstates
< cache
->superstates_allowed
))
2982 answer
= rx_cache_malloc (cache
, bytes
);
2985 ++cache
->superstates
;
2989 answer
= rx_cache_get (cache
, &cache
->free_superstates
);
2992 answer
= rx_cache_malloc (cache
, bytes
);
2994 ++cache
->superstates_allowed
;
2996 ++cache
->superstates
;
3004 supersetcmp (void * va
, void * vb
)
3007 supersetcmp (va
, vb
)
3012 struct rx_superset
* a
= (struct rx_superset
*)va
;
3013 struct rx_superset
* b
= (struct rx_superset
*)vb
;
3015 || (a
&& b
&& (a
->car
== b
->car
) && (a
->cdr
== b
->cdr
)));
3019 static struct rx_hash_item
*
3020 superset_allocator (struct rx_hash_rules
* rules
, void * val
)
3022 static struct rx_hash_item
*
3023 superset_allocator (rules
, val
)
3024 struct rx_hash_rules
* rules
;
3028 struct rx_cache
* cache
3029 = ((struct rx_cache
*)
3031 - (unsigned long)(&((struct rx_cache
*)0)->superset_hash_rules
)));
3032 struct rx_superset
* template = (struct rx_superset
*)val
;
3033 struct rx_superset
* newset
3034 = ((struct rx_superset
*)
3035 rx_cache_malloc_or_get (cache
,
3036 &cache
->free_supersets
,
3037 sizeof (*template)));
3041 newset
->car
= template->car
;
3042 newset
->id
= template->car
->id
;
3043 newset
->cdr
= template->cdr
;
3044 newset
->superstate
= 0;
3045 rx_protect_superset (rx
, template->cdr
);
3046 newset
->hash_item
.data
= (void *)newset
;
3047 newset
->hash_item
.binding
= 0;
3048 return &newset
->hash_item
;
3052 static struct rx_hash
*
3053 super_hash_allocator (struct rx_hash_rules
* rules
)
3055 static struct rx_hash
*
3056 super_hash_allocator (rules
)
3057 struct rx_hash_rules
* rules
;
3060 struct rx_cache
* cache
3061 = ((struct rx_cache
*)
3063 - (unsigned long)(&((struct rx_cache
*)0)->superset_hash_rules
)));
3064 return ((struct rx_hash
*)
3065 rx_cache_malloc_or_get (cache
,
3066 &cache
->free_hash
, sizeof (struct rx_hash
)));
3072 super_hash_liberator (struct rx_hash
* hash
, struct rx_hash_rules
* rules
)
3075 super_hash_liberator (hash
, rules
)
3076 struct rx_hash
* hash
;
3077 struct rx_hash_rules
* rules
;
3080 struct rx_cache
* cache
3081 = ((struct rx_cache
*)
3082 (char *)rules
- (long)(&((struct rx_cache
*)0)->superset_hash_rules
));
3083 rx_cache_free (cache
, &cache
->free_hash
, (char *)hash
);
3088 superset_hash_item_liberator (struct rx_hash_item
* it
,
3089 struct rx_hash_rules
* rules
)
3092 superset_hash_item_liberator (it
, rules
) /* Well, it does ya know. */
3093 struct rx_hash_item
* it
;
3094 struct rx_hash_rules
* rules
;
3099 int rx_cache_bound
= 128;
3100 static int rx_default_cache_got
= 0;
3104 bytes_for_cache_size (int supers
, int cset_size
)
3107 bytes_for_cache_size (supers
, cset_size
)
3112 /* What the hell is this? !!!*/
3115 ( (1.03 * (float) ( rx_sizeof_bitset (cset_size
)
3116 + sizeof (struct rx_super_edge
)))
3117 + (1.80 * (float) sizeof (struct rx_possible_future
))
3118 + (float) ( sizeof (struct rx_superstate
)
3119 + cset_size
* sizeof (struct rx_inx
))));
3124 rx_morecore (struct rx_cache
* cache
)
3128 struct rx_cache
* cache
;
3131 if (rx_default_cache_got
>= rx_cache_bound
)
3134 rx_default_cache_got
+= 16;
3135 cache
->superstates_allowed
= rx_cache_bound
;
3137 struct rx_blocklist
** pos
= &cache
->memory
;
3138 int size
= bytes_for_cache_size (16, cache
->local_cset_size
);
3140 pos
= &(*pos
)->next
;
3141 *pos
= ((struct rx_blocklist
*)
3142 malloc (size
+ sizeof (struct rx_blocklist
)));
3147 (*pos
)->bytes
= size
;
3148 cache
->memory_pos
= *pos
;
3149 cache
->memory_addr
= (char *)*pos
+ sizeof (**pos
);
3150 cache
->bytes_left
= size
;
3154 static struct rx_cache default_cache
=
3158 super_hash_allocator
,
3159 super_hash_liberator
,
3161 superset_hash_item_liberator
,
3187 rx_id_instruction_table
,
3198 /* This adds an element to a superstate set. These sets are lists, such
3199 * that lists with == elements are ==. The empty set is returned by
3200 * superset_cons (rx, 0, 0) and is NOT equivelent to
3201 * (struct rx_superset)0.
3205 RX_DECL
struct rx_superset
*
3206 rx_superset_cons (struct rx
* rx
,
3207 struct rx_nfa_state
*car
, struct rx_superset
*cdr
)
3209 RX_DECL
struct rx_superset
*
3210 rx_superset_cons (rx
, car
, cdr
)
3212 struct rx_nfa_state
*car
;
3213 struct rx_superset
*cdr
;
3216 struct rx_cache
* cache
= rx
->cache
;
3219 if (!cache
->empty_superset
)
3221 cache
->empty_superset
3222 = ((struct rx_superset
*)
3223 rx_cache_malloc_or_get (cache
, &cache
->free_supersets
,
3224 sizeof (struct rx_superset
)));
3225 if (!cache
->empty_superset
)
3227 bzero (cache
->empty_superset
, sizeof (struct rx_superset
));
3228 cache
->empty_superset
->refs
= 1000;
3230 return cache
->empty_superset
;
3233 struct rx_superset
template;
3234 struct rx_hash_item
* hit
;
3237 template.id
= car
->id
;
3238 hit
= rx_hash_store (&cache
->superset_table
,
3239 (unsigned long)car
^ car
->id
^ (unsigned long)cdr
,
3241 &cache
->superset_hash_rules
);
3243 ? (struct rx_superset
*)hit
->data
3248 /* This computes a union of two NFA state sets. The sets do not have the
3249 * same representation though. One is a RX_SUPERSET structure (part
3250 * of the superstate NFA) and the other is an NFA_STATE_SET (part of the NFA).
3254 RX_DECL
struct rx_superset
*
3255 rx_superstate_eclosure_union
3256 (struct rx
* rx
, struct rx_superset
*set
, struct rx_nfa_state_set
*ecl
)
3258 RX_DECL
struct rx_superset
*
3259 rx_superstate_eclosure_union (rx
, set
, ecl
)
3261 struct rx_superset
*set
;
3262 struct rx_nfa_state_set
*ecl
;
3269 return rx_superset_cons (rx
, ecl
->car
,
3270 rx_superstate_eclosure_union (rx
, set
, ecl
->cdr
));
3271 if (set
->car
== ecl
->car
)
3272 return rx_superstate_eclosure_union (rx
, set
, ecl
->cdr
);
3275 struct rx_superset
* tail
;
3276 struct rx_nfa_state
* first
;
3278 if (set
->car
> ecl
->car
)
3280 tail
= rx_superstate_eclosure_union (rx
, set
->cdr
, ecl
);
3285 tail
= rx_superstate_eclosure_union (rx
, set
, ecl
->cdr
);
3292 struct rx_superset
* answer
;
3293 answer
= rx_superset_cons (rx
, first
, tail
);
3296 rx_protect_superset (rx
, tail
);
3297 rx_release_superset (rx
, tail
);
3310 * This makes sure that a list of rx_distinct_futures contains
3311 * a future for each possible set of side effects in the eclosure
3312 * of a given state. This is some of the work of filling in a
3313 * superstate transition.
3317 static struct rx_distinct_future
*
3318 include_futures (struct rx
*rx
,
3319 struct rx_distinct_future
*df
, struct rx_nfa_state
3320 *state
, struct rx_superstate
*superstate
)
3322 static struct rx_distinct_future
*
3323 include_futures (rx
, df
, state
, superstate
)
3325 struct rx_distinct_future
*df
;
3326 struct rx_nfa_state
*state
;
3327 struct rx_superstate
*superstate
;
3330 struct rx_possible_future
*future
;
3331 struct rx_cache
* cache
= rx
->cache
;
3332 for (future
= state
->futures
; future
; future
= future
->next
)
3334 struct rx_distinct_future
*dfp
;
3335 struct rx_distinct_future
*insert_before
= 0;
3337 df
->next_same_super_edge
[1]->next_same_super_edge
[0] = 0;
3338 for (dfp
= df
; dfp
; dfp
= dfp
->next_same_super_edge
[0])
3339 if (dfp
->effects
== future
->effects
)
3343 int order
= rx
->se_list_cmp (rx
, dfp
->effects
, future
->effects
);
3346 insert_before
= dfp
;
3352 df
->next_same_super_edge
[1]->next_same_super_edge
[0] = df
;
3356 = ((struct rx_distinct_future
*)
3357 rx_cache_malloc_or_get (cache
, &cache
->free_discernable_futures
,
3358 sizeof (struct rx_distinct_future
)));
3363 df
= insert_before
= dfp
;
3364 df
->next_same_super_edge
[0] = df
->next_same_super_edge
[1] = df
;
3366 else if (!insert_before
)
3368 else if (insert_before
== df
)
3371 dfp
->next_same_super_edge
[0] = insert_before
;
3372 dfp
->next_same_super_edge
[1]
3373 = insert_before
->next_same_super_edge
[1];
3374 dfp
->next_same_super_edge
[1]->next_same_super_edge
[0] = dfp
;
3375 dfp
->next_same_super_edge
[0]->next_same_super_edge
[1] = dfp
;
3376 dfp
->next_same_dest
= dfp
->prev_same_dest
= dfp
;
3378 dfp
->present
= superstate
;
3379 dfp
->future_frame
.inx
= rx
->instruction_table
[rx_cache_miss
];
3380 dfp
->future_frame
.data
= 0;
3381 dfp
->future_frame
.data_2
= (void *) dfp
;
3382 dfp
->side_effects_frame
.inx
3383 = rx
->instruction_table
[rx_do_side_effects
];
3384 dfp
->side_effects_frame
.data
= 0;
3385 dfp
->side_effects_frame
.data_2
= (void *) dfp
;
3386 dfp
->effects
= future
->effects
;
3394 /* This constructs a new superstate from its state set. The only
3395 * complexity here is memory management.
3398 RX_DECL
struct rx_superstate
*
3399 rx_superstate (struct rx
*rx
,
3400 struct rx_superset
*set
)
3402 RX_DECL
struct rx_superstate
*
3403 rx_superstate (rx
, set
)
3405 struct rx_superset
*set
;
3408 struct rx_cache
* cache
= rx
->cache
;
3409 struct rx_superstate
* superstate
= 0;
3411 /* Does the superstate already exist in the cache? */
3412 if (set
->superstate
)
3414 if (set
->superstate
->rx_id
!= rx
->rx_id
)
3416 /* Aha. It is in the cache, but belongs to a superstate
3417 * that refers to an NFA that no longer exists.
3418 * (We know it no longer exists because it was evidently
3419 * stored in the same region of memory as the current nfa
3420 * yet it has a different id.)
3422 superstate
= set
->superstate
;
3423 if (!superstate
->is_semifree
)
3425 if (cache
->lru_superstate
== superstate
)
3427 cache
->lru_superstate
= superstate
->next_recyclable
;
3428 if (cache
->lru_superstate
== superstate
)
3429 cache
->lru_superstate
= 0;
3432 superstate
->next_recyclable
->prev_recyclable
3433 = superstate
->prev_recyclable
;
3434 superstate
->prev_recyclable
->next_recyclable
3435 = superstate
->next_recyclable
;
3436 if (!cache
->semifree_superstate
)
3438 (cache
->semifree_superstate
3439 = superstate
->next_recyclable
3440 = superstate
->prev_recyclable
3445 superstate
->next_recyclable
= cache
->semifree_superstate
;
3446 superstate
->prev_recyclable
3447 = cache
->semifree_superstate
->prev_recyclable
;
3448 superstate
->next_recyclable
->prev_recyclable
3450 superstate
->prev_recyclable
->next_recyclable
3452 cache
->semifree_superstate
= superstate
;
3454 ++cache
->semifree_superstates
;
3457 set
->superstate
= 0;
3458 goto handle_cache_miss
;
3461 superstate
= set
->superstate
;
3463 rx_refresh_this_superstate (cache
, superstate
);
3469 /* This point reached only for cache misses. */
3472 if (rx_debug_trace
> 1)
3474 struct rx_superset
* setp
= set
;
3475 fprintf (stderr
, "Building a superstet %d(%d): ", rx
->rx_id
, set
);
3478 fprintf (stderr
, "%d ", setp
->id
);
3481 fprintf (stderr
, "(%d)\n", set
);
3484 superstate
= (struct rx_superstate
*)rx_cache_get_superstate (cache
);
3488 if (!cache
->lru_superstate
)
3489 (cache
->lru_superstate
3490 = superstate
->next_recyclable
3491 = superstate
->prev_recyclable
3495 superstate
->next_recyclable
= cache
->lru_superstate
;
3496 superstate
->prev_recyclable
= cache
->lru_superstate
->prev_recyclable
;
3497 ( superstate
->prev_recyclable
->next_recyclable
3498 = superstate
->next_recyclable
->prev_recyclable
3501 superstate
->rx_id
= rx
->rx_id
;
3502 superstate
->transition_refs
= 0;
3503 superstate
->locks
= 0;
3504 superstate
->is_semifree
= 0;
3505 set
->superstate
= superstate
;
3506 superstate
->contents
= set
;
3507 rx_protect_superset (rx
, set
);
3508 superstate
->edges
= 0;
3511 /* None of the transitions from this superstate are known yet. */
3512 for (x
= 0; x
< rx
->local_cset_size
; ++x
) /* &&&&& 3.8 % */
3514 struct rx_inx
* ifr
= &superstate
->transitions
[x
];
3515 ifr
->inx
= rx
->instruction_table
[rx_cache_miss
];
3516 ifr
->data
= ifr
->data_2
= 0;
3523 /* This computes the destination set of one edge of the superstate NFA.
3524 * Note that a RX_DISTINCT_FUTURE is a superstate edge.
3525 * Returns 0 on an allocation failure.
3530 solve_destination (struct rx
*rx
, struct rx_distinct_future
*df
)
3533 solve_destination (rx
, df
)
3535 struct rx_distinct_future
*df
;
3538 struct rx_super_edge
*tc
= df
->edge
;
3539 struct rx_superset
*nfa_state
;
3540 struct rx_superset
*nil_set
= rx_superset_cons (rx
, 0, 0);
3541 struct rx_superset
*solution
= nil_set
;
3542 struct rx_superstate
*dest
;
3544 rx_protect_superset (rx
, solution
);
3545 /* Iterate over all NFA states in the state set of this superstate. */
3546 for (nfa_state
= df
->present
->contents
;
3548 nfa_state
= nfa_state
->cdr
)
3550 struct rx_nfa_edge
*e
;
3551 /* Iterate over all edges of each NFA state. */
3552 for (e
= nfa_state
->car
->edges
; e
; e
= e
->next
)
3553 /* If we find an edge that is labeled with
3554 * the characters we are solving for.....
3556 if (rx_bitset_is_subset (rx
->local_cset_size
,
3557 tc
->cset
, e
->params
.cset
))
3559 struct rx_nfa_state
*n
= e
->dest
;
3560 struct rx_possible_future
*pf
;
3561 /* ....search the partial epsilon closures of the destination
3562 * of that edge for a path that involves the same set of
3563 * side effects we are solving for.
3564 * If we find such a RX_POSSIBLE_FUTURE, we add members to the
3565 * stateset we are computing.
3567 for (pf
= n
->futures
; pf
; pf
= pf
->next
)
3568 if (pf
->effects
== df
->effects
)
3570 struct rx_superset
* old_sol
;
3572 solution
= rx_superstate_eclosure_union (rx
, solution
,
3576 rx_protect_superset (rx
, solution
);
3577 rx_release_superset (rx
, old_sol
);
3581 /* It is possible that the RX_DISTINCT_FUTURE we are working on has
3582 * the empty set of NFA states as its definition. In that case, this
3583 * is a failure point.
3585 if (solution
== nil_set
)
3587 df
->future_frame
.inx
= (void *) rx_backtrack
;
3588 df
->future_frame
.data
= 0;
3589 df
->future_frame
.data_2
= 0;
3592 dest
= rx_superstate (rx
, solution
);
3593 rx_release_superset (rx
, solution
);
3598 struct rx_distinct_future
*dft
;
3600 df
->prev_same_dest
->next_same_dest
= 0;
3604 dft
->future_frame
.inx
= rx
->instruction_table
[rx_next_char
];
3605 dft
->future_frame
.data
= (void *) dest
->transitions
;
3606 dft
= dft
->next_same_dest
;
3608 df
->prev_same_dest
->next_same_dest
= df
;
3610 if (!dest
->transition_refs
)
3611 dest
->transition_refs
= df
;
3614 struct rx_distinct_future
*dft
= dest
->transition_refs
->next_same_dest
;
3615 dest
->transition_refs
->next_same_dest
= df
->next_same_dest
;
3616 df
->next_same_dest
->prev_same_dest
= dest
->transition_refs
;
3617 df
->next_same_dest
= dft
;
3618 dft
->prev_same_dest
= df
;
3624 /* This takes a superstate and a character, and computes some edges
3625 * from the superstate NFA. In particular, this computes all edges
3626 * that lead from SUPERSTATE given CHR. This function also
3627 * computes the set of characters that share this edge set.
3628 * This returns 0 on allocation error.
3629 * The character set and list of edges are returned through
3630 * the paramters CSETOUT and DFOUT.
3635 compute_super_edge (struct rx
*rx
, struct rx_distinct_future
**dfout
,
3636 rx_Bitset csetout
, struct rx_superstate
*superstate
,
3640 compute_super_edge (rx
, dfout
, csetout
, superstate
, chr
)
3642 struct rx_distinct_future
**dfout
;
3644 struct rx_superstate
*superstate
;
3648 struct rx_superset
*stateset
= superstate
->contents
;
3650 /* To compute the set of characters that share edges with CHR,
3651 * we start with the full character set, and subtract.
3653 rx_bitset_universe (rx
->local_cset_size
, csetout
);
3656 /* Iterate over the NFA states in the superstate state-set. */
3657 while (stateset
->car
)
3659 struct rx_nfa_edge
*e
;
3660 for (e
= stateset
->car
->edges
; e
; e
= e
->next
)
3661 if (RX_bitset_member (e
->params
.cset
, chr
))
3663 /* If we find an NFA edge that applies, we make sure there
3664 * are corresponding edges in the superstate NFA.
3667 struct rx_distinct_future
* saved
;
3669 *dfout
= include_futures (rx
, *dfout
, e
->dest
, superstate
);
3672 struct rx_distinct_future
* df
;
3675 df
->next_same_super_edge
[1]->next_same_super_edge
[0] = 0;
3678 struct rx_distinct_future
*dft
;
3680 df
= df
->next_same_super_edge
[0];
3682 if (dft
->future
&& dft
->future
->transition_refs
== dft
)
3684 dft
->future
->transition_refs
= dft
->next_same_dest
;
3685 if (dft
->future
->transition_refs
== dft
)
3686 dft
->future
->transition_refs
= 0;
3688 dft
->next_same_dest
->prev_same_dest
= dft
->prev_same_dest
;
3689 dft
->prev_same_dest
->next_same_dest
= dft
->next_same_dest
;
3690 rx_cache_free (rx
->cache
,
3691 &rx
->cache
->free_discernable_futures
,
3697 /* We also trim the character set a bit. */
3698 rx_bitset_intersection (rx
->local_cset_size
,
3699 csetout
, e
->params
.cset
);
3702 /* An edge that doesn't apply at least tells us some characters
3703 * that don't share the same edge set as CHR.
3705 rx_bitset_difference (rx
->local_cset_size
, csetout
, e
->params
.cset
);
3706 stateset
= stateset
->cdr
;
3712 /* This is a constructor for RX_SUPER_EDGE structures. These are
3713 * wrappers for lists of superstate NFA edges that share character sets labels.
3714 * If a transition class contains more than one rx_distinct_future (superstate
3715 * edge), then it represents a non-determinism in the superstate NFA.
3719 static struct rx_super_edge
*
3720 rx_super_edge (struct rx
*rx
,
3721 struct rx_superstate
*super
, rx_Bitset cset
,
3722 struct rx_distinct_future
*df
)
3724 static struct rx_super_edge
*
3725 rx_super_edge (rx
, super
, cset
, df
)
3727 struct rx_superstate
*super
;
3729 struct rx_distinct_future
*df
;
3732 struct rx_super_edge
*tc
=
3733 (struct rx_super_edge
*)rx_cache_malloc_or_get
3734 (rx
->cache
, &rx
->cache
->free_transition_classes
,
3735 sizeof (struct rx_super_edge
) + rx_sizeof_bitset (rx
->local_cset_size
));
3739 tc
->next
= super
->edges
;
3741 tc
->rx_backtrack_frame
.inx
= rx
->instruction_table
[rx_backtrack_point
];
3742 tc
->rx_backtrack_frame
.data
= 0;
3743 tc
->rx_backtrack_frame
.data_2
= (void *) tc
;
3745 tc
->cset
= (rx_Bitset
) ((char *) tc
+ sizeof (*tc
));
3746 rx_bitset_assign (rx
->local_cset_size
, tc
->cset
, cset
);
3749 struct rx_distinct_future
* dfp
= df
;
3750 df
->next_same_super_edge
[1]->next_same_super_edge
[0] = 0;
3754 dfp
= dfp
->next_same_super_edge
[0];
3756 df
->next_same_super_edge
[1]->next_same_super_edge
[0] = df
;
3762 /* There are three kinds of cache miss. The first occurs when a
3763 * transition is taken that has never been computed during the
3764 * lifetime of the source superstate. That cache miss is handled by
3765 * calling COMPUTE_SUPER_EDGE. The second kind of cache miss
3766 * occurs when the destination superstate of a transition doesn't
3767 * exist. SOLVE_DESTINATION is used to construct the destination superstate.
3768 * Finally, the third kind of cache miss occurs when the destination
3769 * superstate of a transition is in a `semi-free state'. That case is
3770 * handled by UNFREE_SUPERSTATE.
3772 * The function of HANDLE_CACHE_MISS is to figure out which of these
3778 install_partial_transition (struct rx_superstate
*super
,
3779 struct rx_inx
*answer
,
3780 RX_subset set
, int offset
)
3783 install_partial_transition (super
, answer
, set
, offset
)
3784 struct rx_superstate
*super
;
3785 struct rx_inx
*answer
;
3791 int end
= start
+ 32;
3793 struct rx_inx
* transitions
= super
->transitions
;
3798 transitions
[start
] = *answer
;
3806 RX_DECL
struct rx_inx
*
3807 rx_handle_cache_miss
3808 (struct rx
*rx
, struct rx_superstate
*super
, unsigned char chr
, void *data
)
3810 RX_DECL
struct rx_inx
*
3811 rx_handle_cache_miss (rx
, super
, chr
, data
)
3813 struct rx_superstate
*super
;
3818 int offset
= chr
/ RX_subset_bits
;
3819 struct rx_distinct_future
*df
= data
;
3821 if (!df
) /* must be the shared_cache_miss_frame */
3823 /* Perhaps this is just a transition waiting to be filled. */
3824 struct rx_super_edge
*tc
;
3825 RX_subset mask
= rx_subset_singletons
[chr
% RX_subset_bits
];
3827 for (tc
= super
->edges
; tc
; tc
= tc
->next
)
3828 if (tc
->cset
[offset
] & mask
)
3830 struct rx_inx
* answer
;
3832 answer
= ((tc
->options
->next_same_super_edge
[0] != tc
->options
)
3833 ? &tc
->rx_backtrack_frame
3835 ? &df
->side_effects_frame
3836 : &df
->future_frame
));
3837 install_partial_transition (super
, answer
,
3838 tc
->cset
[offset
], offset
* 32);
3841 /* Otherwise, it's a flushed or newly encountered edge. */
3843 char cset_space
[1024]; /* this limit is far from unreasonable */
3845 struct rx_inx
*answer
;
3847 if (rx_sizeof_bitset (rx
->local_cset_size
) > sizeof (cset_space
))
3848 return 0; /* If the arbitrary limit is hit, always fail */
3850 trcset
= (rx_Bitset
)cset_space
;
3851 rx_lock_superstate (rx
, super
);
3852 if (!compute_super_edge (rx
, &df
, trcset
, super
, chr
))
3854 rx_unlock_superstate (rx
, super
);
3857 if (!df
) /* We just computed the fail transition. */
3859 static struct rx_inx
3860 shared_fail_frame
= { 0, 0, (void *)rx_backtrack
, 0 };
3861 answer
= &shared_fail_frame
;
3865 tc
= rx_super_edge (rx
, super
, trcset
, df
);
3868 rx_unlock_superstate (rx
, super
);
3871 answer
= ((tc
->options
->next_same_super_edge
[0] != tc
->options
)
3872 ? &tc
->rx_backtrack_frame
3874 ? &df
->side_effects_frame
3875 : &df
->future_frame
));
3877 install_partial_transition (super
, answer
,
3878 trcset
[offset
], offset
* 32);
3879 rx_unlock_superstate (rx
, super
);
3883 else if (df
->future
) /* A cache miss on an edge with a future? Must be
3884 * a semi-free destination. */
3886 if (df
->future
->is_semifree
)
3887 refresh_semifree_superstate (rx
->cache
, df
->future
);
3888 return &df
->future_frame
;
3891 /* no future superstate on an existing edge */
3893 rx_lock_superstate (rx
, super
);
3894 if (!solve_destination (rx
, df
))
3896 rx_unlock_superstate (rx
, super
);
3900 && (df
->edge
->options
->next_same_super_edge
[0] == df
->edge
->options
))
3901 install_partial_transition (super
, &df
->future_frame
,
3902 df
->edge
->cset
[offset
], offset
* 32);
3903 rx_unlock_superstate (rx
, super
);
3904 return &df
->future_frame
;
3911 /* The rest of the code provides a regex.c compatable interface. */
3914 __const__
char *re_error_msg
[] =
3917 "No match", /* REG_NOMATCH */
3918 "Invalid regular expression", /* REG_BADPAT */
3919 "Invalid collation character", /* REG_ECOLLATE */
3920 "Invalid character class name", /* REG_ECTYPE */
3921 "Trailing backslash", /* REG_EESCAPE */
3922 "Invalid back reference", /* REG_ESUBREG */
3923 "Unmatched [ or [^", /* REG_EBRACK */
3924 "Unmatched ( or \\(", /* REG_EPAREN */
3925 "Unmatched \\{", /* REG_EBRACE */
3926 "Invalid content of \\{\\}", /* REG_BADBR */
3927 "Invalid range end", /* REG_ERANGE */
3928 "Memory exhausted", /* REG_ESPACE */
3929 "Invalid preceding regular expression", /* REG_BADRPT */
3930 "Premature end of regular expression", /* REG_EEND */
3931 "Regular expression too big", /* REG_ESIZE */
3932 "Unmatched ) or \\)", /* REG_ERPAREN */
3938 * Macros used while compiling patterns.
3940 * By convention, PEND points just past the end of the uncompiled pattern,
3941 * P points to the read position in the pattern. `translate' is the name
3942 * of the translation table (`TRANSLATE' is the name of a macro that looks
3943 * things up in `translate').
3948 * Fetch the next character in the uncompiled pattern---translating it
3949 * if necessary. *Also cast from a signed character in the constant
3950 * string passed to us by the user to an unsigned char that we can use
3951 * as an array index (in, e.g., `translate').
3953 #define PATFETCH(c) \
3954 do {if (p == pend) return REG_EEND; \
3955 c = (unsigned char) *p++; \
3960 * Fetch the next character in the uncompiled pattern, with no
3963 #define PATFETCH_RAW(c) \
3964 do {if (p == pend) return REG_EEND; \
3965 c = (unsigned char) *p++; \
3968 /* Go backwards one character in the pattern. */
3969 #define PATUNFETCH p--
3972 #define TRANSLATE(d) translate[(unsigned char) (d)]
3974 typedef unsigned regnum_t
;
3976 /* Since offsets can go either forwards or backwards, this type needs to
3977 * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
3979 typedef int pattern_offset_t
;
3983 struct rexp_node
** top_expression
; /* was begalt */
3984 struct rexp_node
** last_expression
; /* was laststart */
3985 pattern_offset_t inner_group_offset
;
3987 } compile_stack_elt_t
;
3991 compile_stack_elt_t
*stack
;
3993 unsigned avail
; /* Offset of next open position. */
3994 } compile_stack_type
;
3997 #define INIT_COMPILE_STACK_SIZE 32
3999 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
4000 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
4002 /* The next available element. */
4003 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
4006 /* Set the bit for character C in a list. */
4007 #define SET_LIST_BIT(c) \
4008 (b[((unsigned char) (c)) / CHARBITS] \
4009 |= 1 << (((unsigned char) c) % CHARBITS))
4011 /* Get the next unsigned number in the uncompiled pattern. */
4012 #define GET_UNSIGNED_NUMBER(num) \
4016 while (isdigit (c)) \
4020 num = num * 10 + c - '0'; \
4028 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
4030 #define IS_CHAR_CLASS(string) \
4031 (!strcmp (string, "alpha") || !strcmp (string, "upper") \
4032 || !strcmp (string, "lower") || !strcmp (string, "digit") \
4033 || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \
4034 || !strcmp (string, "space") || !strcmp (string, "print") \
4035 || !strcmp (string, "punct") || !strcmp (string, "graph") \
4036 || !strcmp (string, "cntrl") || !strcmp (string, "blank"))
4039 /* These predicates are used in regex_compile. */
4041 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4042 * after an alternative or a begin-subexpression. We assume there is at
4043 * least one character before the ^.
4048 at_begline_loc_p (__const__
char *pattern
, __const__
char * p
, reg_syntax_t syntax
)
4051 at_begline_loc_p (pattern
, p
, syntax
)
4052 __const__
char *pattern
;
4054 reg_syntax_t syntax
;
4057 __const__
char *prev
= p
- 2;
4058 boolean prev_prev_backslash
= ((prev
> pattern
) && (prev
[-1] == '\\'));
4062 (/* After a subexpression? */
4063 ((*prev
== '(') && ((syntax
& RE_NO_BK_PARENS
) || prev_prev_backslash
))
4065 /* After an alternative? */
4066 ((*prev
== '|') && ((syntax
& RE_NO_BK_VBAR
) || prev_prev_backslash
))
4070 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4071 * at least one character after the $, i.e., `P < PEND'.
4076 at_endline_loc_p (__const__
char *p
, __const__
char *pend
, int syntax
)
4079 at_endline_loc_p (p
, pend
, syntax
)
4081 __const__
char *pend
;
4085 __const__
char *next
= p
;
4086 boolean next_backslash
= (*next
== '\\');
4087 __const__
char *next_next
= (p
+ 1 < pend
) ? (p
+ 1) : 0;
4091 /* Before a subexpression? */
4092 ((syntax
& RE_NO_BK_PARENS
)
4094 : (next_backslash
&& next_next
&& (*next_next
== ')')))
4096 /* Before an alternative? */
4097 ((syntax
& RE_NO_BK_VBAR
)
4099 : (next_backslash
&& next_next
&& (*next_next
== '|')))
4104 unsigned char rx_id_translation
[256] =
4106 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
4107 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
4108 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
4109 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
4110 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
4111 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
4112 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
4113 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
4114 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
4115 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
4117 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
4118 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
4119 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
4120 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
4121 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
4122 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
4123 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
4124 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
4125 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
4126 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
4128 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
4129 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
4130 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
4131 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
4132 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
4133 250, 251, 252, 253, 254, 255
4136 /* The compiler keeps an inverted translation table.
4137 * This looks up/inititalize elements.
4138 * VALID is an array of booleans that validate CACHE.
4143 inverse_translation (struct re_pattern_buffer
* rxb
,
4144 char * valid
, rx_Bitset cache
,
4145 unsigned char * translate
, int c
)
4148 inverse_translation (rxb
, valid
, cache
, translate
, c
)
4149 struct re_pattern_buffer
* rxb
;
4152 unsigned char * translate
;
4157 = cache
+ c
* rx_bitset_numb_subsets (rxb
->rx
.local_cset_size
);
4162 int c_tr
= TRANSLATE(c
);
4163 rx_bitset_null (rxb
->rx
.local_cset_size
, cs
);
4164 for (x
= 0; x
< 256; ++x
) /* &&&& 13.37 */
4165 if (TRANSLATE(x
) == c_tr
)
4166 RX_bitset_enjoin (cs
, x
);
4175 /* More subroutine declarations and macros for regex_compile. */
4177 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4178 false if it's not. */
4182 group_in_compile_stack (compile_stack_type compile_stack
, regnum_t regnum
)
4185 group_in_compile_stack (compile_stack
, regnum
)
4186 compile_stack_type compile_stack
;
4192 for (this_element
= compile_stack
.avail
- 1;
4195 if (compile_stack
.stack
[this_element
].regnum
== regnum
)
4203 * Read the ending character of a range (in a bracket expression) from the
4204 * uncompiled pattern *P_PTR (which ends at PEND). We assume the
4205 * starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4206 * Then we set the translation of all bits between the starting and
4207 * ending characters (inclusive) in the compiled pattern B.
4209 * Return an error code.
4211 * We use these short variable names so we can use the same macros as
4212 * `regex_compile' itself.
4216 static reg_errcode_t
4217 compile_range (struct re_pattern_buffer
* rxb
, rx_Bitset cs
,
4218 __const__
char ** p_ptr
, __const__
char * pend
,
4219 unsigned char * translate
, reg_syntax_t syntax
,
4220 rx_Bitset inv_tr
, char * valid_inv_tr
)
4222 static reg_errcode_t
4223 compile_range (rxb
, cs
, p_ptr
, pend
, translate
, syntax
, inv_tr
, valid_inv_tr
)
4224 struct re_pattern_buffer
* rxb
;
4226 __const__
char ** p_ptr
;
4227 __const__
char * pend
;
4228 unsigned char * translate
;
4229 reg_syntax_t syntax
;
4231 char * valid_inv_tr
;
4236 __const__
char *p
= *p_ptr
;
4238 unsigned char range_end
;
4239 unsigned char range_start
= TRANSLATE(p
[-2]);
4244 PATFETCH (range_end
);
4248 if (range_start
> range_end
)
4249 return syntax
& RE_NO_EMPTY_RANGES
? REG_ERANGE
: REG_NOERROR
;
4251 for (this_char
= range_start
; this_char
<= range_end
; this_char
++)
4254 inverse_translation (rxb
, valid_inv_tr
, inv_tr
, translate
, this_char
);
4255 rx_bitset_union (rxb
->rx
.local_cset_size
, cs
, it
);
4262 /* This searches a regexp for backreference side effects.
4263 * It fills in the array OUT with 1 at the index of every register pair
4264 * referenced by a backreference.
4266 * This is used to help optimize patterns for searching. The information is
4267 * useful because, if the caller doesn't want register values, backreferenced
4268 * registers are the only registers for which we need rx_backtrack.
4273 find_backrefs (char * out
, struct rexp_node
* rexp
,
4274 struct re_se_params
* params
)
4277 find_backrefs (out
, rexp
, params
)
4279 struct rexp_node
* rexp
;
4280 struct re_se_params
* params
;
4294 find_backrefs (out
, rexp
->params
.pair
.left
, params
);
4295 find_backrefs (out
, rexp
->params
.pair
.right
, params
);
4298 if ( ((long)rexp
->params
.side_effect
>= 0)
4299 && (params
[(long)rexp
->params
.side_effect
].se
== re_se_backref
))
4300 out
[ params
[(long)rexp
->params
.side_effect
].op1
] = 1;
4307 /* Returns 0 unless the pattern can match the empty string. */
4311 compute_fastset (struct re_pattern_buffer
* rxb
, struct rexp_node
* rexp
)
4314 compute_fastset (rxb
, rexp
)
4315 struct re_pattern_buffer
* rxb
;
4316 struct rexp_node
* rexp
;
4327 rx_bitset_union (rxb
->rx
.local_cset_size
,
4328 rxb
->fastset
, rexp
->params
.cset
);
4332 return (compute_fastset (rxb
, rexp
->params
.pair
.left
)
4333 && compute_fastset (rxb
, rexp
->params
.pair
.right
));
4335 compute_fastset (rxb
, rexp
->params
.pair
.left
);
4336 /* compute_fastset (rxb, rexp->params.pair.right); nope... */
4339 return !!(compute_fastset (rxb
, rexp
->params
.pair
.left
)
4340 + compute_fastset (rxb
, rexp
->params
.pair
.right
));
4343 compute_fastset (rxb
, rexp
->params
.pair
.left
);
4349 /* this should never happen */
4355 * 1 -- yes, definately anchored by the given side effect.
4356 * 2 -- maybe anchored, maybe the empty string.
4357 * 0 -- definately not anchored
4358 * There is simply no other possibility.
4363 is_anchored (struct rexp_node
* rexp
, rx_side_effect se
)
4366 is_anchored (rexp
, se
)
4367 struct rexp_node
* rexp
;
4381 int l
= is_anchored (rexp
->params
.pair
.left
, se
);
4382 return (l
== 2 ? is_anchored (rexp
->params
.pair
.right
, se
) : l
);
4386 int l
= is_anchored (rexp
->params
.pair
.left
, se
);
4387 int r
= l
? is_anchored (rexp
->params
.pair
.right
, se
) : 0;
4391 else if ((l
== 0) || (r
== 0))
4398 return is_anchored (rexp
->params
.pair
.left
, se
) ? 2 : 0;
4401 return ((rexp
->params
.side_effect
== se
)
4405 /* this should never happen */
4410 /* This removes register assignments that aren't required by backreferencing.
4411 * This can speed up explore_future, especially if it eliminates
4412 * non-determinism in the superstate NFA.
4414 * NEEDED is an array of characters, presumably filled in by FIND_BACKREFS.
4415 * The non-zero elements of the array indicate which register assignments
4416 * can NOT be removed from the expression.
4420 static struct rexp_node
*
4421 remove_unecessary_side_effects (struct rx
* rx
, char * needed
,
4422 struct rexp_node
* rexp
,
4423 struct re_se_params
* params
)
4425 static struct rexp_node
*
4426 remove_unecessary_side_effects (rx
, needed
, rexp
, params
)
4429 struct rexp_node
* rexp
;
4430 struct re_se_params
* params
;
4433 struct rexp_node
* l
;
4434 struct rexp_node
* r
;
4446 l
= remove_unecessary_side_effects (rx
, needed
,
4447 rexp
->params
.pair
.left
, params
);
4448 r
= remove_unecessary_side_effects (rx
, needed
,
4449 rexp
->params
.pair
.right
, params
);
4450 if ((l
&& r
) || (rexp
->type
!= r_concat
))
4452 rexp
->params
.pair
.left
= l
;
4453 rexp
->params
.pair
.right
= r
;
4458 rexp
->params
.pair
.left
= rexp
->params
.pair
.right
= 0;
4459 rx_free_rexp (rx
, rexp
);
4464 l
= remove_unecessary_side_effects (rx
, needed
,
4465 rexp
->params
.pair
.left
, params
);
4468 rexp
->params
.pair
.left
= l
;
4473 rexp
->params
.pair
.left
= 0;
4474 rx_free_rexp (rx
, rexp
);
4479 int se
= (long)rexp
->params
.side_effect
;
4481 && ( ((enum re_side_effects
)params
[se
].se
== re_se_lparen
)
4482 || ((enum re_side_effects
)params
[se
].se
== re_se_rparen
))
4483 && (params
[se
].op1
> 0)
4484 && (!needed
[params
[se
].op1
]))
4486 rx_free_rexp (rx
, rexp
);
4494 /* this should never happen */
4502 pointless_if_repeated (struct rexp_node
* node
, struct re_se_params
* params
)
4505 pointless_if_repeated (node
, params
)
4506 struct rexp_node
* node
;
4507 struct re_se_params
* params
;
4519 return (pointless_if_repeated (node
->params
.pair
.left
, params
)
4520 && pointless_if_repeated (node
->params
.pair
.right
, params
));
4523 return pointless_if_repeated (node
->params
.pair
.left
, params
);
4525 switch (((long)node
->params
.side_effect
< 0)
4526 ? (enum re_side_effects
)node
->params
.side_effect
4527 : (enum re_side_effects
)params
[(long)node
->params
.side_effect
].se
)
4534 case re_se_wordbound
:
4535 case re_se_notwordbound
:
4545 case re_se_end_iter
:
4547 case re_se_not_syntax
:
4561 registers_on_stack (struct re_pattern_buffer
* rxb
,
4562 struct rexp_node
* rexp
, int in_danger
,
4563 struct re_se_params
* params
)
4566 registers_on_stack (rxb
, rexp
, in_danger
, params
)
4567 struct re_pattern_buffer
* rxb
;
4568 struct rexp_node
* rexp
;
4570 struct re_se_params
* params
;
4583 return ( registers_on_stack (rxb
, rexp
->params
.pair
.left
,
4585 || (registers_on_stack
4586 (rxb
, rexp
->params
.pair
.right
,
4587 in_danger
, params
)));
4589 return registers_on_stack (rxb
, rexp
->params
.pair
.left
, 0, params
);
4591 return registers_on_stack (rxb
, rexp
->params
.pair
.left
, 1, params
);
4594 ( registers_on_stack (rxb
, rexp
->params
.pair
.left
, 1, params
)
4595 || registers_on_stack (rxb
, rexp
->params
.pair
.right
, 1, params
));
4598 int se
= (long)rexp
->params
.side_effect
;
4601 && (params
[se
].op1
> 0)
4602 && ( ((enum re_side_effects
)params
[se
].se
== re_se_lparen
)
4603 || ((enum re_side_effects
)params
[se
].se
== re_se_rparen
)))
4610 /* this should never happen */
4616 static char idempotent_complex_se
[] =
4618 #define RX_WANT_SE_DEFS 1
4620 #undef RX_DEF_CPLX_SE
4621 #define RX_DEF_SE(IDEM, NAME, VALUE)
4622 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) IDEM,
4625 #undef RX_DEF_CPLX_SE
4626 #undef RX_WANT_SE_DEFS
4630 static char idempotent_se
[] =
4633 #define RX_WANT_SE_DEFS 1
4635 #undef RX_DEF_CPLX_SE
4636 #define RX_DEF_SE(IDEM, NAME, VALUE) IDEM,
4637 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE)
4640 #undef RX_DEF_CPLX_SE
4641 #undef RX_WANT_SE_DEFS
4650 has_any_se (struct rx
* rx
,
4651 struct rexp_node
* rexp
)
4654 has_any_se (rx
, rexp
)
4656 struct rexp_node
* rexp
;
4675 ( has_any_se (rx
, rexp
->params
.pair
.left
)
4676 || has_any_se (rx
, rexp
->params
.pair
.right
));
4680 return has_any_se (rx
, rexp
->params
.pair
.left
);
4683 /* this should never happen */
4689 /* This must be called AFTER `convert_hard_loops' for a given REXP. */
4692 has_non_idempotent_epsilon_path (struct rx
* rx
,
4693 struct rexp_node
* rexp
,
4694 struct re_se_params
* params
)
4697 has_non_idempotent_epsilon_path (rx
, rexp
, params
)
4699 struct rexp_node
* rexp
;
4700 struct re_se_params
* params
;
4715 !((long)rexp
->params
.side_effect
> 0
4716 ? idempotent_complex_se
[ params
[(long)rexp
->params
.side_effect
].se
]
4717 : idempotent_se
[-(long)rexp
->params
.side_effect
]);
4721 ( has_non_idempotent_epsilon_path (rx
,
4722 rexp
->params
.pair
.left
, params
)
4723 || has_non_idempotent_epsilon_path (rx
,
4724 rexp
->params
.pair
.right
, params
));
4729 ( has_non_idempotent_epsilon_path (rx
,
4730 rexp
->params
.pair
.left
, params
)
4731 && has_non_idempotent_epsilon_path (rx
,
4732 rexp
->params
.pair
.right
, params
));
4735 return has_non_idempotent_epsilon_path (rx
,
4736 rexp
->params
.pair
.left
, params
);
4739 /* this should never happen */
4745 /* This computes rougly what it's name suggests. It can (and does) go wrong
4746 * in the direction of returning spurious 0 without causing disasters.
4750 begins_with_complex_se (struct rx
* rx
, struct rexp_node
* rexp
)
4753 begins_with_complex_se (rx
, rexp
)
4755 struct rexp_node
* rexp
;
4768 return ((long)rexp
->params
.side_effect
>= 0);
4772 ( begins_with_complex_se (rx
, rexp
->params
.pair
.left
)
4773 && begins_with_complex_se (rx
, rexp
->params
.pair
.right
));
4777 return has_any_se (rx
, rexp
->params
.pair
.left
);
4784 /* this should never happen */
4789 /* This destructively removes some of the re_se_tv side effects from
4790 * a rexp tree. In particular, during parsing re_se_tv was inserted on the
4791 * right half of every | to guarantee that posix path preference could be
4792 * honored. This function removes some which it can be determined aren't
4798 speed_up_alt (struct rx
* rx
,
4799 struct rexp_node
* rexp
,
4803 speed_up_alt (rx
, rexp
, unposix
)
4805 struct rexp_node
* rexp
;
4821 speed_up_alt (rx
, rexp
->params
.pair
.left
, unposix
);
4826 speed_up_alt (rx
, rexp
->params
.pair
.left
, unposix
);
4827 speed_up_alt (rx
, rexp
->params
.pair
.right
, unposix
);
4831 /* the right child is guaranteed to be (concat re_se_tv <subexp>) */
4833 speed_up_alt (rx
, rexp
->params
.pair
.left
, unposix
);
4834 speed_up_alt (rx
, rexp
->params
.pair
.right
->params
.pair
.right
, unposix
);
4837 || (begins_with_complex_se
4838 (rx
, rexp
->params
.pair
.right
->params
.pair
.right
))
4839 || !( has_any_se (rx
, rexp
->params
.pair
.right
->params
.pair
.right
)
4840 || has_any_se (rx
, rexp
->params
.pair
.left
)))
4842 struct rexp_node
* conc
= rexp
->params
.pair
.right
;
4843 rexp
->params
.pair
.right
= conc
->params
.pair
.right
;
4844 conc
->params
.pair
.right
= 0;
4845 rx_free_rexp (rx
, conc
);
4854 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
4855 Returns one of error codes defined in `regex.h', or zero for success.
4857 Assumes the `allocated' (and perhaps `buffer') and `translate'
4858 fields are set in BUFP on entry.
4860 If it succeeds, results are put in BUFP (if it returns an error, the
4861 contents of BUFP are undefined):
4862 `buffer' is the compiled pattern;
4863 `syntax' is set to SYNTAX;
4864 `used' is set to the length of the compiled pattern;
4865 `fastmap_accurate' is set to zero;
4866 `re_nsub' is set to the number of groups in PATTERN;
4867 `not_bol' and `not_eol' are set to zero.
4869 The `fastmap' and `newline_anchor' fields are neither
4870 examined nor set. */
4875 RX_DECL reg_errcode_t
4876 rx_compile (__const__
char *pattern
, int size
,
4877 reg_syntax_t syntax
,
4878 struct re_pattern_buffer
* rxb
)
4880 RX_DECL reg_errcode_t
4881 rx_compile (pattern
, size
, syntax
, rxb
)
4882 __const__
char *pattern
;
4884 reg_syntax_t syntax
;
4885 struct re_pattern_buffer
* rxb
;
4889 inverse_translate
[CHAR_SET_SIZE
* rx_bitset_numb_subsets(CHAR_SET_SIZE
)];
4891 validate_inv_tr
[CHAR_SET_SIZE
* rx_bitset_numb_subsets(CHAR_SET_SIZE
)];
4893 /* We fetch characters from PATTERN here. Even though PATTERN is
4894 `char *' (i.e., signed), we declare these variables as unsigned, so
4895 they can be reliably used as array indices. */
4896 register unsigned char c
, c1
;
4898 /* A random tempory spot in PATTERN. */
4901 /* Keeps track of unclosed groups. */
4902 compile_stack_type compile_stack
;
4904 /* Points to the current (ending) position in the pattern. */
4905 __const__
char *p
= pattern
;
4906 __const__
char *pend
= pattern
+ size
;
4908 /* How to translate the characters in the pattern. */
4909 unsigned char *translate
= (rxb
->translate
4911 : rx_id_translation
);
4913 /* When parsing is done, this will hold the expression tree. */
4914 struct rexp_node
* rexp
= 0;
4916 /* In the midst of compilation, this holds onto the regexp
4917 * first parst while rexp goes on to aquire additional constructs.
4919 struct rexp_node
* orig_rexp
= 0;
4920 struct rexp_node
* fewer_side_effects
= 0;
4922 /* This and top_expression are saved on the compile stack. */
4923 struct rexp_node
** top_expression
= &rexp
;
4924 struct rexp_node
** last_expression
= top_expression
;
4926 /* Parameter to `goto append_node' */
4927 struct rexp_node
* append
;
4929 /* Counts open-groups as they are encountered. This is the index of the
4930 * innermost group being compiled.
4932 regnum_t regnum
= 0;
4934 /* Place in the uncompiled pattern (i.e., the {) to
4935 * which to go back if the interval is invalid.
4937 __const__
char *beg_interval
;
4939 struct re_se_params
* params
= 0;
4940 int paramc
= 0; /* How many complex side effects so far? */
4942 rx_side_effect side
; /* param to `goto add_side_effect' */
4944 bzero (validate_inv_tr
, sizeof (validate_inv_tr
));
4946 rxb
->rx
.instruction_table
= rx_id_instruction_table
;
4949 /* Initialize the compile stack. */
4950 compile_stack
.stack
= (( compile_stack_elt_t
*) malloc ((INIT_COMPILE_STACK_SIZE
) * sizeof ( compile_stack_elt_t
)));
4951 if (compile_stack
.stack
== 0)
4954 compile_stack
.size
= INIT_COMPILE_STACK_SIZE
;
4955 compile_stack
.avail
= 0;
4957 /* Initialize the pattern buffer. */
4958 rxb
->rx
.cache
= &default_cache
;
4959 rxb
->syntax
= syntax
;
4960 rxb
->fastmap_accurate
= 0;
4961 rxb
->not_bol
= rxb
->not_eol
= 0;
4962 rxb
->least_subs
= 0;
4964 /* Always count groups, whether or not rxb->no_sub is set.
4965 * The whole pattern is implicitly group 0, so counting begins
4970 #if !defined (emacs) && !defined (SYNTAX)
4971 /* Initialize the syntax table. */
4972 init_syntax_once ();
4975 /* Loop through the uncompiled pattern until we're at the end. */
4984 if ( /* If at start of pattern, it's an operator. */
4986 /* If context independent, it's an operator. */
4987 || syntax
& RE_CONTEXT_INDEP_ANCHORS
4988 /* Otherwise, depends on what's come before. */
4989 || at_begline_loc_p (pattern
, p
, syntax
))
4991 struct rexp_node
* n
4992 = rx_mk_r_side_effect (&rxb
->rx
, (rx_side_effect
)re_se_hat
);
5006 if ( /* If at end of pattern, it's an operator. */
5008 /* If context independent, it's an operator. */
5009 || syntax
& RE_CONTEXT_INDEP_ANCHORS
5010 /* Otherwise, depends on what's next. */
5011 || at_endline_loc_p (p
, pend
, syntax
))
5013 struct rexp_node
* n
5014 = rx_mk_r_side_effect (&rxb
->rx
, (rx_side_effect
)re_se_dollar
);
5028 if ((syntax
& RE_BK_PLUS_QM
)
5029 || (syntax
& RE_LIMITED_OPS
))
5034 /* If there is no previous pattern... */
5035 if (pointless_if_repeated (*last_expression
, params
))
5037 if (syntax
& RE_CONTEXT_INVALID_OPS
)
5039 else if (!(syntax
& RE_CONTEXT_INDEP_OPS
))
5044 /* 1 means zero (many) matches is allowed. */
5045 char zero_times_ok
= 0, many_times_ok
= 0;
5047 /* If there is a sequence of repetition chars, collapse it
5048 down to just one (the right one). We can't combine
5049 interval operators with these because of, e.g., `a{2}*',
5050 which should only match an even number of `a's. */
5054 zero_times_ok
|= c
!= '+';
5055 many_times_ok
|= c
!= '?';
5063 || (!(syntax
& RE_BK_PLUS_QM
) && (c
== '+' || c
== '?')))
5066 else if (syntax
& RE_BK_PLUS_QM
&& c
== '\\')
5068 if (p
== pend
) return REG_EESCAPE
;
5071 if (!(c1
== '+' || c1
== '?'))
5086 /* If we get here, we found another repeat character. */
5089 /* Star, etc. applied to an empty pattern is equivalent
5090 to an empty pattern. */
5091 if (!last_expression
)
5094 /* Now we know whether or not zero matches is allowed
5095 * and also whether or not two or more matches is allowed.
5099 struct rexp_node
* inner_exp
= *last_expression
;
5103 && has_non_idempotent_epsilon_path (&rxb
->rx
,
5106 struct rexp_node
* pusher
5107 = rx_mk_r_side_effect (&rxb
->rx
,
5108 (rx_side_effect
)re_se_pushpos
);
5109 struct rexp_node
* checker
5110 = rx_mk_r_side_effect (&rxb
->rx
,
5111 (rx_side_effect
)re_se_chkpos
);
5112 struct rexp_node
* pushback
5113 = rx_mk_r_side_effect (&rxb
->rx
,
5114 (rx_side_effect
)re_se_pushback
);
5115 rx_Bitset cs
= rx_cset (&rxb
->rx
);
5116 struct rexp_node
* lit_t
= rx_mk_r_cset (&rxb
->rx
, cs
);
5117 struct rexp_node
* fake_state
5118 = rx_mk_r_concat (&rxb
->rx
, pushback
, lit_t
);
5119 struct rexp_node
* phase2
5120 = rx_mk_r_concat (&rxb
->rx
, checker
, fake_state
);
5121 struct rexp_node
* popper
5122 = rx_mk_r_side_effect (&rxb
->rx
,
5123 (rx_side_effect
)re_se_poppos
);
5124 struct rexp_node
* star
5125 = rx_mk_r_2phase_star (&rxb
->rx
, inner_exp
, phase2
);
5126 struct rexp_node
* a
5127 = rx_mk_r_concat (&rxb
->rx
, pusher
, star
);
5128 struct rexp_node
* whole_thing
5129 = rx_mk_r_concat (&rxb
->rx
, a
, popper
);
5130 if (!(pusher
&& star
&& pushback
&& lit_t
&& fake_state
5131 && lit_t
&& phase2
&& checker
&& popper
5132 && a
&& whole_thing
))
5134 RX_bitset_enjoin (cs
, 't');
5135 *last_expression
= whole_thing
;
5139 struct rexp_node
* star
=
5140 (many_times_ok
? rx_mk_r_star
: rx_mk_r_opt
)
5141 (&rxb
->rx
, *last_expression
);
5144 *last_expression
= star
;
5145 need_sync
= has_any_se (&rxb
->rx
, *last_expression
);
5149 struct rexp_node
* concat
5150 = rx_mk_r_concat (&rxb
->rx
, inner_exp
,
5151 rx_copy_rexp (&rxb
->rx
,
5155 *last_expression
= concat
;
5159 int sync_se
= paramc
;
5161 ? ((struct re_se_params
*)
5163 sizeof (*params
) * (1 + paramc
)))
5164 : ((struct re_se_params
*)
5165 malloc (sizeof (*params
))));
5169 params
[sync_se
].se
= re_se_tv
;
5170 side
= (rx_side_effect
)sync_se
;
5171 goto add_side_effect
;
5174 /* The old regex.c used to optimize `.*\n'.
5175 * Maybe rx should too?
5183 rx_Bitset cs
= rx_cset (&rxb
->rx
);
5184 struct rexp_node
* n
= rx_mk_r_cset (&rxb
->rx
, cs
);
5188 rx_bitset_universe (rxb
->rx
.local_cset_size
, cs
);
5189 if (!(rxb
->syntax
& RE_DOT_NEWLINE
))
5190 RX_bitset_remove (cs
, '\n');
5191 if (!(rxb
->syntax
& RE_DOT_NOT_NULL
))
5192 RX_bitset_remove (cs
, 0);
5201 if (p
== pend
) return REG_EBRACK
;
5203 boolean had_char_class
= false;
5204 rx_Bitset cs
= rx_cset (&rxb
->rx
);
5205 struct rexp_node
* node
= rx_mk_r_cset (&rxb
->rx
, cs
);
5206 int is_inverted
= *p
== '^';
5211 /* This branch of the switch is normally exited with
5219 /* Remember the first position in the bracket expression. */
5222 /* Read in characters and ranges, setting map bits. */
5225 if (p
== pend
) return REG_EBRACK
;
5229 /* \ might escape characters inside [...] and [^...]. */
5230 if ((syntax
& RE_BACKSLASH_ESCAPE_IN_LISTS
) && c
== '\\')
5232 if (p
== pend
) return REG_EESCAPE
;
5236 rx_Bitset it
= inverse_translation (rxb
,
5241 rx_bitset_union (rxb
->rx
.local_cset_size
, cs
, it
);
5246 /* Could be the end of the bracket expression. If it's
5247 not (i.e., when the bracket expression is `[]' so
5248 far), the ']' character bit gets set way below. */
5249 if (c
== ']' && p
!= p1
+ 1)
5250 goto finalize_class_and_append
;
5252 /* Look ahead to see if it's a range when the last thing
5253 was a character class. */
5254 if (had_char_class
&& c
== '-' && *p
!= ']')
5257 /* Look ahead to see if it's a range when the last thing
5258 was a character: if this is a hyphen not at the
5259 beginning or the end of a list, then it's the range
5262 && !(p
- 2 >= pattern
&& p
[-2] == '[')
5263 && !(p
- 3 >= pattern
&& p
[-3] == '[' && p
[-2] == '^')
5267 = compile_range (rxb
, cs
, &p
, pend
, translate
, syntax
,
5268 inverse_translate
, validate_inv_tr
);
5269 if (ret
!= REG_NOERROR
) return ret
;
5272 else if (p
[0] == '-' && p
[1] != ']')
5273 { /* This handles ranges made up of characters only. */
5276 /* Move past the `-'. */
5279 ret
= compile_range (rxb
, cs
, &p
, pend
, translate
, syntax
,
5280 inverse_translate
, validate_inv_tr
);
5281 if (ret
!= REG_NOERROR
) return ret
;
5284 /* See if we're at the beginning of a possible character
5287 else if ((syntax
& RE_CHAR_CLASSES
)
5288 && (c
== '[') && (*p
== ':'))
5290 char str
[CHAR_CLASS_MAX_LENGTH
+ 1];
5295 /* If pattern is `[[:'. */
5296 if (p
== pend
) return REG_EBRACK
;
5301 if (c
== ':' || c
== ']' || p
== pend
5302 || c1
== CHAR_CLASS_MAX_LENGTH
)
5308 /* If isn't a word bracketed by `[:' and:`]':
5309 undo the ending character, the letters, and leave
5310 the leading `:' and `[' (but set bits for them). */
5311 if (c
== ':' && *p
== ']')
5314 boolean is_alnum
= !strcmp (str
, "alnum");
5315 boolean is_alpha
= !strcmp (str
, "alpha");
5316 boolean is_blank
= !strcmp (str
, "blank");
5317 boolean is_cntrl
= !strcmp (str
, "cntrl");
5318 boolean is_digit
= !strcmp (str
, "digit");
5319 boolean is_graph
= !strcmp (str
, "graph");
5320 boolean is_lower
= !strcmp (str
, "lower");
5321 boolean is_print
= !strcmp (str
, "print");
5322 boolean is_punct
= !strcmp (str
, "punct");
5323 boolean is_space
= !strcmp (str
, "space");
5324 boolean is_upper
= !strcmp (str
, "upper");
5325 boolean is_xdigit
= !strcmp (str
, "xdigit");
5327 if (!IS_CHAR_CLASS (str
)) return REG_ECTYPE
;
5329 /* Throw away the ] at the end of the character
5333 if (p
== pend
) return REG_EBRACK
;
5335 for (ch
= 0; ch
< 1 << CHARBITS
; ch
++)
5337 if ( (is_alnum
&& isalnum (ch
))
5338 || (is_alpha
&& isalpha (ch
))
5339 || (is_blank
&& isblank (ch
))
5340 || (is_cntrl
&& iscntrl (ch
))
5341 || (is_digit
&& isdigit (ch
))
5342 || (is_graph
&& isgraph (ch
))
5343 || (is_lower
&& islower (ch
))
5344 || (is_print
&& isprint (ch
))
5345 || (is_punct
&& ispunct (ch
))
5346 || (is_space
&& isspace (ch
))
5347 || (is_upper
&& isupper (ch
))
5348 || (is_xdigit
&& isxdigit (ch
)))
5351 inverse_translation (rxb
,
5356 rx_bitset_union (rxb
->rx
.local_cset_size
,
5360 had_char_class
= true;
5369 inverse_translation (rxb
,
5374 rx_bitset_union (rxb
->rx
.local_cset_size
,
5379 inverse_translation (rxb
,
5384 rx_bitset_union (rxb
->rx
.local_cset_size
,
5387 had_char_class
= false;
5392 had_char_class
= false;
5394 rx_Bitset it
= inverse_translation (rxb
,
5399 rx_bitset_union (rxb
->rx
.local_cset_size
, cs
, it
);
5404 finalize_class_and_append
:
5407 rx_bitset_complement (rxb
->rx
.local_cset_size
, cs
);
5408 if (syntax
& RE_HAT_LISTS_NOT_NEWLINE
)
5409 RX_bitset_remove (cs
, '\n');
5417 if (syntax
& RE_NO_BK_PARENS
)
5424 if (syntax
& RE_NO_BK_PARENS
)
5431 if (syntax
& RE_NEWLINE_ALT
)
5438 if (syntax
& RE_NO_BK_VBAR
)
5445 if ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
5446 goto handle_interval
;
5452 if (p
== pend
) return REG_EESCAPE
;
5454 /* Do not translate the character after the \, so that we can
5455 distinguish, e.g., \B from \b, even if we normally would
5456 translate, e.g., B to b. */
5462 if (syntax
& RE_NO_BK_PARENS
)
5463 goto normal_backslash
;
5468 if (COMPILE_STACK_FULL
)
5470 ((compile_stack
.stack
) =
5471 (compile_stack_elt_t
*) realloc (compile_stack
.stack
, ( compile_stack
.size
<< 1) * sizeof (
5472 compile_stack_elt_t
)));
5473 if (compile_stack
.stack
== 0) return REG_ESPACE
;
5475 compile_stack
.size
<<= 1;
5478 if (*last_expression
)
5480 struct rexp_node
* concat
5481 = rx_mk_r_concat (&rxb
->rx
, *last_expression
, 0);
5484 *last_expression
= concat
;
5485 last_expression
= &concat
->params
.pair
.right
;
5489 * These are the values to restore when we hit end of this
5492 COMPILE_STACK_TOP
.top_expression
= top_expression
;
5493 COMPILE_STACK_TOP
.last_expression
= last_expression
;
5494 COMPILE_STACK_TOP
.regnum
= regnum
;
5496 compile_stack
.avail
++;
5498 top_expression
= last_expression
;
5503 if (syntax
& RE_NO_BK_PARENS
) goto normal_backslash
;
5506 /* See similar code for backslashed left paren above. */
5507 if (COMPILE_STACK_EMPTY
)
5508 if (syntax
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
5513 /* Since we just checked for an empty stack above, this
5514 ``can't happen''. */
5517 /* We don't just want to restore into `regnum', because
5518 later groups should continue to be numbered higher,
5519 as in `(ab)c(de)' -- the second group is #2. */
5520 regnum_t this_group_regnum
;
5521 struct rexp_node
** inner
= top_expression
;
5523 compile_stack
.avail
--;
5524 top_expression
= COMPILE_STACK_TOP
.top_expression
;
5525 last_expression
= COMPILE_STACK_TOP
.last_expression
;
5526 this_group_regnum
= COMPILE_STACK_TOP
.regnum
;
5528 int left_se
= paramc
;
5529 int right_se
= paramc
+ 1;
5532 ? ((struct re_se_params
*)
5534 (paramc
+ 2) * sizeof (params
[0])))
5535 : ((struct re_se_params
*)
5536 malloc (2 * sizeof (params
[0]))));
5541 params
[left_se
].se
= re_se_lparen
;
5542 params
[left_se
].op1
= this_group_regnum
;
5543 params
[right_se
].se
= re_se_rparen
;
5544 params
[right_se
].op1
= this_group_regnum
;
5546 struct rexp_node
* left
5547 = rx_mk_r_side_effect (&rxb
->rx
,
5548 (rx_side_effect
)left_se
);
5549 struct rexp_node
* right
5550 = rx_mk_r_side_effect (&rxb
->rx
,
5551 (rx_side_effect
)right_se
);
5552 struct rexp_node
* c1
5554 ? rx_mk_r_concat (&rxb
->rx
, left
, *inner
) : left
);
5555 struct rexp_node
* c2
5556 = rx_mk_r_concat (&rxb
->rx
, c1
, right
);
5557 if (!(left
&& right
&& c1
&& c2
))
5565 case '|': /* `\|'. */
5566 if ((syntax
& RE_LIMITED_OPS
) || (syntax
& RE_NO_BK_VBAR
))
5567 goto normal_backslash
;
5569 if (syntax
& RE_LIMITED_OPS
)
5573 struct rexp_node
* alt
5574 = rx_mk_r_alternate (&rxb
->rx
, *top_expression
, 0);
5577 *top_expression
= alt
;
5578 last_expression
= &alt
->params
.pair
.right
;
5580 int sync_se
= paramc
;
5583 ? ((struct re_se_params
*)
5585 (paramc
+ 1) * sizeof (params
[0])))
5586 : ((struct re_se_params
*)
5587 malloc (sizeof (params
[0]))));
5592 params
[sync_se
].se
= re_se_tv
;
5594 struct rexp_node
* sync
5595 = rx_mk_r_side_effect (&rxb
->rx
,
5596 (rx_side_effect
)sync_se
);
5597 struct rexp_node
* conc
5598 = rx_mk_r_concat (&rxb
->rx
, sync
, 0);
5603 *last_expression
= conc
;
5604 last_expression
= &conc
->params
.pair
.right
;
5612 /* If \{ is a literal. */
5613 if (!(syntax
& RE_INTERVALS
)
5614 /* If we're at `\{' and it's not the open-interval
5616 || ((syntax
& RE_INTERVALS
) && (syntax
& RE_NO_BK_BRACES
))
5617 || (p
- 2 == pattern
&& p
== pend
))
5618 goto normal_backslash
;
5622 /* If got here, then the syntax allows intervals. */
5624 /* At least (most) this many matches must be made. */
5625 int lower_bound
= -1, upper_bound
= -1;
5627 beg_interval
= p
- 1;
5631 if (syntax
& RE_NO_BK_BRACES
)
5632 goto unfetch_interval
;
5637 GET_UNSIGNED_NUMBER (lower_bound
);
5641 GET_UNSIGNED_NUMBER (upper_bound
);
5642 if (upper_bound
< 0) upper_bound
= RE_DUP_MAX
;
5645 /* Interval such as `{1}' => match exactly once. */
5646 upper_bound
= lower_bound
;
5648 if (lower_bound
< 0 || upper_bound
> RE_DUP_MAX
5649 || lower_bound
> upper_bound
)
5651 if (syntax
& RE_NO_BK_BRACES
)
5652 goto unfetch_interval
;
5657 if (!(syntax
& RE_NO_BK_BRACES
))
5659 if (c
!= '\\') return REG_EBRACE
;
5665 if (syntax
& RE_NO_BK_BRACES
)
5666 goto unfetch_interval
;
5671 /* We just parsed a valid interval. */
5673 /* If it's invalid to have no preceding re. */
5674 if (pointless_if_repeated (*last_expression
, params
))
5676 if (syntax
& RE_CONTEXT_INVALID_OPS
)
5678 else if (!(syntax
& RE_CONTEXT_INDEP_OPS
))
5679 goto unfetch_interval
;
5680 /* was: else laststart = b; */
5683 /* If the upper bound is zero, don't want to iterate
5686 if (upper_bound
== 0)
5688 if (*last_expression
)
5690 rx_free_rexp (&rxb
->rx
, *last_expression
);
5691 *last_expression
= 0;
5695 /* Otherwise, we have a nontrivial interval. */
5697 int iter_se
= paramc
;
5698 int end_se
= paramc
+ 1;
5700 ? ((struct re_se_params
*)
5702 sizeof (*params
) * (2 + paramc
)))
5703 : ((struct re_se_params
*)
5704 malloc (2 * sizeof (*params
))));
5708 params
[iter_se
].se
= re_se_iter
;
5709 params
[iter_se
].op1
= lower_bound
;
5710 params
[iter_se
].op2
= upper_bound
;
5712 params
[end_se
].se
= re_se_end_iter
;
5713 params
[end_se
].op1
= lower_bound
;
5714 params
[end_se
].op2
= upper_bound
;
5716 struct rexp_node
* push0
5717 = rx_mk_r_side_effect (&rxb
->rx
,
5718 (rx_side_effect
)re_se_push0
);
5719 struct rexp_node
* start_one_iter
5720 = rx_mk_r_side_effect (&rxb
->rx
,
5721 (rx_side_effect
)iter_se
);
5722 struct rexp_node
* phase1
5723 = rx_mk_r_concat (&rxb
->rx
, start_one_iter
,
5725 struct rexp_node
* pushback
5726 = rx_mk_r_side_effect (&rxb
->rx
,
5727 (rx_side_effect
)re_se_pushback
);
5728 rx_Bitset cs
= rx_cset (&rxb
->rx
);
5729 struct rexp_node
* lit_t
5730 = rx_mk_r_cset (&rxb
->rx
, cs
);
5731 struct rexp_node
* phase2
5732 = rx_mk_r_concat (&rxb
->rx
, pushback
, lit_t
);
5733 struct rexp_node
* loop
5734 = rx_mk_r_2phase_star (&rxb
->rx
, phase1
, phase2
);
5735 struct rexp_node
* push_n_loop
5736 = rx_mk_r_concat (&rxb
->rx
, push0
, loop
);
5737 struct rexp_node
* final_test
5738 = rx_mk_r_side_effect (&rxb
->rx
,
5739 (rx_side_effect
)end_se
);
5740 struct rexp_node
* full_exp
5741 = rx_mk_r_concat (&rxb
->rx
, push_n_loop
, final_test
);
5743 if (!(push0
&& start_one_iter
&& phase1
5744 && pushback
&& lit_t
&& phase2
5745 && loop
&& push_n_loop
&& final_test
&& full_exp
))
5748 RX_bitset_enjoin(cs
, 't');
5750 *last_expression
= full_exp
;
5758 /* If an invalid interval, match the characters as literals. */
5762 /* normal_char and normal_backslash need `c'. */
5765 if (!(syntax
& RE_NO_BK_BRACES
))
5767 if (p
> pattern
&& p
[-1] == '\\')
5768 goto normal_backslash
;
5773 /* There is no way to specify the before_dot and after_dot
5774 operators. rms says this is ok. --karl */
5776 side
= (rx_side_effect
)rx_se_at_dot
;
5777 goto add_side_effect
;
5783 rx_Bitset cs
= rx_cset (&rxb
->rx
);
5784 struct rexp_node
* set
= rx_mk_r_cset (&rxb
->rx
, cs
);
5788 rx_bitset_universe (rxb
->rx
.local_cset_size
, cs
);
5793 enum syntaxcode code
= syntax_spec_code
[c
];
5794 for (x
= 0; x
< 256; ++x
)
5797 if (SYNTAX (x
) == code
)
5800 inverse_translation (rxb
, validate_inv_tr
,
5803 rx_bitset_xor (rxb
->rx
.local_cset_size
, cs
, it
);
5817 rx_Bitset cs
= rx_cset (&rxb
->rx
);
5818 struct rexp_node
* n
= (cs
? rx_mk_r_cset (&rxb
->rx
, cs
) : 0);
5822 rx_bitset_universe (rxb
->rx
.local_cset_size
,cs
);
5825 for (x
= rxb
->rx
.local_cset_size
- 1; x
> 0; --x
)
5826 if (SYNTAX(x
) & Sword
)
5827 RX_bitset_toggle (cs
, x
);
5834 /* With a little extra work, some of these side effects could be optimized
5835 * away (basicly by looking at what we already know about the surrounding
5839 side
= (rx_side_effect
)re_se_wordbeg
;
5840 goto add_side_effect
;
5844 side
= (rx_side_effect
)re_se_wordend
;
5845 goto add_side_effect
;
5849 side
= (rx_side_effect
)re_se_wordbound
;
5850 goto add_side_effect
;
5854 side
= (rx_side_effect
)re_se_notwordbound
;
5855 goto add_side_effect
;
5859 side
= (rx_side_effect
)re_se_begbuf
;
5860 goto add_side_effect
;
5864 side
= (rx_side_effect
)re_se_endbuf
;
5865 goto add_side_effect
;
5870 struct rexp_node
* se
5871 = rx_mk_r_side_effect (&rxb
->rx
, side
);
5879 case '1': case '2': case '3': case '4': case '5':
5880 case '6': case '7': case '8': case '9':
5881 if (syntax
& RE_NO_BK_REFS
)
5889 /* Can't back reference to a subexpression if inside of it. */
5890 if (group_in_compile_stack (compile_stack
, c1
))
5894 int backref_se
= paramc
;
5896 ? ((struct re_se_params
*)
5898 sizeof (*params
) * (1 + paramc
)))
5899 : ((struct re_se_params
*)
5900 malloc (sizeof (*params
))));
5904 params
[backref_se
].se
= re_se_backref
;
5905 params
[backref_se
].op1
= c1
;
5906 side
= (rx_side_effect
)backref_se
;
5907 goto add_side_effect
;
5913 if (syntax
& RE_BK_PLUS_QM
)
5916 goto normal_backslash
;
5920 /* You might think it would be useful for \ to mean
5921 not to translate; but if we don't translate it
5922 it will never match anything. */
5930 /* Expects the character in `c'. */
5933 rx_Bitset cs
= rx_cset(&rxb
->rx
);
5934 struct rexp_node
* match
= rx_mk_r_cset (&rxb
->rx
, cs
);
5938 it
= inverse_translation (rxb
, validate_inv_tr
,
5939 inverse_translate
, translate
, c
);
5940 rx_bitset_union (CHAR_SET_SIZE
, cs
, it
);
5944 /* This genericly appends the rexp APPEND to *LAST_EXPRESSION
5945 * and then parses the next character normally.
5947 if (*last_expression
)
5949 struct rexp_node
* concat
5950 = rx_mk_r_concat (&rxb
->rx
, *last_expression
, append
);
5953 *last_expression
= concat
;
5954 last_expression
= &concat
->params
.pair
.right
;
5957 *last_expression
= append
;
5960 } /* while p != pend */
5964 int win_se
= paramc
;
5966 ? ((struct re_se_params
*)
5968 sizeof (*params
) * (1 + paramc
)))
5969 : ((struct re_se_params
*)
5970 malloc (sizeof (*params
))));
5974 params
[win_se
].se
= re_se_win
;
5976 struct rexp_node
* se
5977 = rx_mk_r_side_effect (&rxb
->rx
, (rx_side_effect
)win_se
);
5978 struct rexp_node
* concat
5979 = rx_mk_r_concat (&rxb
->rx
, rexp
, se
);
5980 if (!(se
&& concat
))
5987 /* Through the pattern now. */
5989 if (!COMPILE_STACK_EMPTY
)
5992 free (compile_stack
.stack
);
5996 if (rx_debug_compile
)
5999 fputs ("\n\nCompiling ", stdout
);
6000 fwrite (pattern
, 1, size
, stdout
);
6001 fputs (":\n", stdout
);
6002 rxb
->se_params
= params
;
6003 print_rexp (&rxb
->rx
, orig_rexp
, 2, re_seprint
, stdout
);
6007 rx_Bitset cs
= rx_cset(&rxb
->rx
);
6008 rx_Bitset cs2
= rx_cset(&rxb
->rx
);
6009 char * se_map
= (char *) alloca (paramc
);
6010 struct rexp_node
* new_rexp
= 0;
6013 bzero (se_map
, paramc
);
6014 find_backrefs (se_map
, rexp
, params
);
6015 fewer_side_effects
=
6016 remove_unecessary_side_effects (&rxb
->rx
, se_map
,
6017 rx_copy_rexp (&rxb
->rx
, rexp
), params
);
6019 speed_up_alt (&rxb
->rx
, rexp
, 0);
6020 speed_up_alt (&rxb
->rx
, fewer_side_effects
, 1);
6023 char * syntax_parens
= rxb
->syntax_parens
;
6024 if (syntax_parens
== (char *)0x1)
6025 rexp
= remove_unecessary_side_effects
6026 (&rxb
->rx
, se_map
, rexp
, params
);
6027 else if (syntax_parens
)
6030 for (x
= 0; x
< paramc
; ++x
)
6031 if (( (params
[x
].se
== re_se_lparen
)
6032 || (params
[x
].se
== re_se_rparen
))
6033 && (!syntax_parens
[params
[x
].op1
]))
6035 rexp
= remove_unecessary_side_effects
6036 (&rxb
->rx
, se_map
, rexp
, params
);
6040 /* At least one more optimization would be nice to have here but i ran out
6041 * of time. The idea would be to delay side effects.
6042 * For examle, `(abc)' is the same thing as `abc()' except that the
6043 * left paren is offset by 3 (which we know at compile time).
6044 * (In this comment, write that second pattern `abc(:3:)'
6045 * where `(:3:' is a syntactic unit.)
6047 * Trickier: `(abc|defg)' is the same as `(abc(:3:|defg(:4:))'
6048 * (The paren nesting may be hard to follow -- that's an alternation
6049 * of `abc(:3:' and `defg(:4:' inside (purely syntactic) parens
6050 * followed by the closing paren from the original expression.)
6052 * Neither the expression tree representation nor the the nfa make
6053 * this very easy to write. :(
6056 /* What we compile is different than what the parser returns.
6057 * Suppose the parser returns expression R.
6058 * Let R' be R with unnecessary register assignments removed
6059 * (see REMOVE_UNECESSARY_SIDE_EFFECTS, above).
6061 * What we will compile is the expression:
6063 * m{try}R{win}\|s{try}R'{win}
6065 * {try} and {win} denote side effect epsilons (see EXPLORE_FUTURE).
6067 * When trying a match, we insert an `m' at the beginning of the
6068 * string if the user wants registers to be filled, `s' if not.
6073 rx_mk_r_concat (&rxb
->rx
, rx_mk_r_cset (&rxb
->rx
, cs2
), rexp
),
6074 rx_mk_r_concat (&rxb
->rx
,
6075 rx_mk_r_cset (&rxb
->rx
, cs
), fewer_side_effects
));
6077 if (!(new_rexp
&& cs
&& cs2
))
6079 RX_bitset_enjoin (cs2
, '\0'); /* prefixed to the rexp used for matching. */
6080 RX_bitset_enjoin (cs
, '\1'); /* prefixed to the rexp used for searching. */
6085 if (rx_debug_compile
)
6087 fputs ("\n...which is compiled as:\n", stdout
);
6088 print_rexp (&rxb
->rx
, rexp
, 2, re_seprint
, stdout
);
6092 struct rx_nfa_state
*start
= 0;
6093 struct rx_nfa_state
*end
= 0;
6095 if (!rx_build_nfa (&rxb
->rx
, rexp
, &start
, &end
))
6096 return REG_ESPACE
; /* */
6099 void * mem
= (void *)rxb
->buffer
;
6100 unsigned long size
= rxb
->allocated
;
6103 int iterator_size
= paramc
* sizeof (params
[0]);
6106 start
->is_start
= 1;
6107 rx_name_nfa_states (&rxb
->rx
);
6108 start_id
= start
->id
;
6110 if (rx_debug_compile
)
6112 fputs ("...giving the NFA: \n", stdout
);
6114 print_nfa (&rxb
->rx
, rxb
->rx
.nfa_states
, re_seprint
, stdout
);
6117 if (!rx_eclose_nfa (&rxb
->rx
))
6121 rx_delete_epsilon_transitions (&rxb
->rx
);
6123 /* For compatability reasons, we need to shove the
6124 * compiled nfa into one chunk of malloced memory.
6126 rxb
->rx
.reserved
= ( sizeof (params
[0]) * paramc
6127 + rx_sizeof_bitset (rxb
->rx
.local_cset_size
));
6129 if (rx_debug_compile
)
6132 fputs ("...which cooks down (uncompactified) to: \n", stdout
);
6133 print_nfa (&rxb
->rx
, rxb
->rx
.nfa_states
, re_seprint
, stdout
);
6136 if (!rx_compactify_nfa (&rxb
->rx
, &mem
, &size
))
6139 rxb
->allocated
= size
;
6140 rxb
->rx
.buffer
= mem
;
6141 rxb
->rx
.allocated
= size
;
6142 perm_mem
= ((char *)rxb
->rx
.buffer
6143 + rxb
->rx
.allocated
- rxb
->rx
.reserved
);
6144 rxb
->se_params
= ((struct re_se_params
*)perm_mem
);
6145 bcopy (params
, rxb
->se_params
, iterator_size
);
6146 perm_mem
+= iterator_size
;
6147 rxb
->fastset
= (rx_Bitset
) perm_mem
;
6148 rxb
->start
= rx_id_to_nfa_state (&rxb
->rx
, start_id
);
6150 rx_bitset_null (rxb
->rx
.local_cset_size
, rxb
->fastset
);
6151 rxb
->can_match_empty
= compute_fastset (rxb
, orig_rexp
);
6152 rxb
->match_regs_on_stack
=
6153 registers_on_stack (rxb
, orig_rexp
, 0, params
);
6154 rxb
->search_regs_on_stack
=
6155 registers_on_stack (rxb
, fewer_side_effects
, 0, params
);
6156 if (rxb
->can_match_empty
)
6157 rx_bitset_universe (rxb
->rx
.local_cset_size
, rxb
->fastset
);
6158 rxb
->is_anchored
= is_anchored (orig_rexp
, (rx_side_effect
) re_se_hat
);
6159 rxb
->begbuf_only
= is_anchored (orig_rexp
,
6160 (rx_side_effect
) re_se_begbuf
);
6162 rx_free_rexp (&rxb
->rx
, rexp
);
6166 if (rx_debug_compile
)
6169 fputs ("...which cooks down to: \n", stdout
);
6170 print_nfa (&rxb
->rx
, rxb
->rx
.nfa_states
, re_seprint
, stdout
);
6179 /* This table gives an error message for each of the error codes listed
6180 in regex.h. Obviously the order here has to be same as there. */
6182 __const__
char * rx_error_msg
[] =
6183 { 0, /* REG_NOERROR */
6184 "No match", /* REG_NOMATCH */
6185 "Invalid regular expression", /* REG_BADPAT */
6186 "Invalid collation character", /* REG_ECOLLATE */
6187 "Invalid character class name", /* REG_ECTYPE */
6188 "Trailing backslash", /* REG_EESCAPE */
6189 "Invalid back reference", /* REG_ESUBREG */
6190 "Unmatched [ or [^", /* REG_EBRACK */
6191 "Unmatched ( or \\(", /* REG_EPAREN */
6192 "Unmatched \\{", /* REG_EBRACE */
6193 "Invalid content of \\{\\}", /* REG_BADBR */
6194 "Invalid range end", /* REG_ERANGE */
6195 "Memory exhausted", /* REG_ESPACE */
6196 "Invalid preceding regular expression", /* REG_BADRPT */
6197 "Premature end of regular expression", /* REG_EEND */
6198 "Regular expression too big", /* REG_ESIZE */
6199 "Unmatched ) or \\)", /* REG_ERPAREN */
6205 char rx_slowmap
[256] =
6207 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6209 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6211 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6212 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6213 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6214 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6215 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6216 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6217 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6218 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6219 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6220 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6221 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6222 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6227 rx_blow_up_fastmap (struct re_pattern_buffer
* rxb
)
6230 rx_blow_up_fastmap (rxb
)
6231 struct re_pattern_buffer
* rxb
;
6235 for (x
= 0; x
< 256; ++x
) /* &&&& 3.6 % */
6236 rxb
->fastmap
[x
] = !!RX_bitset_member (rxb
->fastset
, x
);
6237 rxb
->fastmap_accurate
= 1;
6243 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6244 #define RE_SEARCH_2_FN inner_re_search_2
6245 #define RE_S2_QUAL static
6247 #define RE_SEARCH_2_FN re_search_2
6251 struct re_search_2_closure
6253 __const__
unsigned char * string1
;
6255 __const__
unsigned char * string2
;
6260 static __inline__
enum rx_get_burst_return
6261 re_search_2_get_burst (pos
, vclosure
, stop
)
6262 struct rx_string_position
* pos
;
6266 struct re_search_2_closure
* closure
;
6267 closure
= (struct re_search_2_closure
*)vclosure
;
6268 if (!closure
->string2
)
6272 inset
= pos
->pos
- pos
->string
;
6273 if ((inset
< -1) || (inset
> closure
->size1
))
6274 return rx_get_burst_no_more
;
6277 pos
->pos
= (__const__
unsigned char *) closure
->string1
+ inset
;
6278 pos
->string
= (__const__
unsigned char *) closure
->string1
;
6279 pos
->size
= closure
->size1
;
6280 pos
->end
= ((__const__
unsigned char *)
6281 MIN(closure
->string1
+ closure
->size1
,
6282 closure
->string1
+ stop
));
6284 return ((pos
->pos
< pos
->end
)
6286 : rx_get_burst_no_more
);
6289 else if (!closure
->string1
)
6293 inset
= pos
->pos
- pos
->string
;
6294 pos
->pos
= (__const__
unsigned char *) closure
->string2
+ inset
;
6295 pos
->string
= (__const__
unsigned char *) closure
->string2
;
6296 pos
->size
= closure
->size2
;
6297 pos
->end
= ((__const__
unsigned char *)
6298 MIN(closure
->string2
+ closure
->size2
,
6299 closure
->string2
+ stop
));
6301 return ((pos
->pos
< pos
->end
)
6303 : rx_get_burst_no_more
);
6309 inset
= pos
->pos
- pos
->string
+ pos
->offset
;
6310 if (inset
< closure
->size1
)
6312 pos
->pos
= (__const__
unsigned char *) closure
->string1
+ inset
;
6313 pos
->string
= (__const__
unsigned char *) closure
->string1
;
6314 pos
->size
= closure
->size1
;
6315 pos
->end
= ((__const__
unsigned char *)
6316 MIN(closure
->string1
+ closure
->size1
,
6317 closure
->string1
+ stop
));
6319 return rx_get_burst_ok
;
6323 pos
->pos
= ((__const__
unsigned char *)
6324 closure
->string2
+ inset
- closure
->size1
);
6325 pos
->string
= (__const__
unsigned char *) closure
->string2
;
6326 pos
->size
= closure
->size2
;
6327 pos
->end
= ((__const__
unsigned char *)
6328 MIN(closure
->string2
+ closure
->size2
,
6329 closure
->string2
+ stop
- closure
->size1
));
6330 pos
->offset
= closure
->size1
;
6331 return ((pos
->pos
< pos
->end
)
6333 : rx_get_burst_no_more
);
6339 static __inline__
enum rx_back_check_return
6340 re_search_2_back_check (pos
, lparen
, rparen
, translate
, vclosure
, stop
)
6341 struct rx_string_position
* pos
;
6344 unsigned char * translate
;
6348 struct rx_string_position there
;
6349 struct rx_string_position past
;
6352 there
.pos
= there
.string
+ lparen
- there
.offset
;
6353 re_search_2_get_burst (&there
, vclosure
, stop
);
6356 past
.pos
= past
.string
+ rparen
- there
.offset
;
6357 re_search_2_get_burst (&past
, vclosure
, stop
);
6360 re_search_2_get_burst (pos
, vclosure
, stop
);
6362 while ( (there
.pos
!= past
.pos
)
6363 && (pos
->pos
!= pos
->end
))
6364 if (TRANSLATE(*there
.pos
) != TRANSLATE(*pos
->pos
))
6365 return rx_back_check_fail
;
6370 if (there
.pos
== there
.end
)
6371 re_search_2_get_burst (&there
, vclosure
, stop
);
6372 if (pos
->pos
== pos
->end
)
6373 re_search_2_get_burst (pos
, vclosure
, stop
);
6376 if (there
.pos
!= past
.pos
)
6377 return rx_back_check_fail
;
6379 re_search_2_get_burst (pos
, vclosure
, stop
);
6380 return rx_back_check_pass
;
6383 static __inline__
int
6384 re_search_2_fetch_char (pos
, offset
, app_closure
, stop
)
6385 struct rx_string_position
* pos
;
6390 struct re_search_2_closure
* closure
;
6391 closure
= (struct re_search_2_closure
*)app_closure
;
6394 if (pos
->pos
>= pos
->string
)
6398 if ( (pos
->string
== closure
->string2
)
6399 && (closure
->string1
)
6400 && (closure
->size1
))
6401 return closure
->string1
[closure
->size1
- 1];
6403 return 0; /* sure, why not. */
6406 if (pos
->pos
== pos
->end
)
6407 return *closure
->string2
;
6415 RE_SEARCH_2_FN (struct re_pattern_buffer
*rxb
,
6416 __const__
char * string1
, int size1
,
6417 __const__
char * string2
, int size2
,
6418 int startpos
, int range
,
6419 struct re_registers
*regs
,
6423 RE_SEARCH_2_FN (rxb
,
6424 string1
, size1
, string2
, size2
, startpos
, range
, regs
, stop
)
6425 struct re_pattern_buffer
*rxb
;
6426 __const__
char * string1
;
6428 __const__
char * string2
;
6432 struct re_registers
*regs
;
6437 struct re_search_2_closure closure
;
6438 closure
.string1
= (__const__
unsigned char *) string1
;
6439 closure
.size1
= size1
;
6440 closure
.string2
= (__const__
unsigned char *) string2
;
6441 closure
.size2
= size2
;
6442 answer
= rx_search (rxb
, startpos
, range
, stop
, size1
+ size2
,
6443 re_search_2_get_burst
,
6444 re_search_2_back_check
,
6445 re_search_2_fetch_char
,
6452 case rx_search_continuation
:
6454 case rx_search_error
:
6456 case rx_search_soft_fail
:
6457 case rx_search_fail
:
6464 /* Export rx_search to callers outside this file. */
6467 re_rx_search (rxb
, startpos
, range
, stop
, total_size
,
6468 get_burst
, back_check
, fetch_char
,
6469 app_closure
, regs
, resume_state
, save_state
)
6470 struct re_pattern_buffer
* rxb
;
6475 rx_get_burst_fn get_burst
;
6476 rx_back_check_fn back_check
;
6477 rx_fetch_char_fn fetch_char
;
6479 struct re_registers
* regs
;
6480 struct rx_search_state
* resume_state
;
6481 struct rx_search_state
* save_state
;
6483 return rx_search (rxb
, startpos
, range
, stop
, total_size
,
6484 get_burst
, back_check
, fetch_char
, app_closure
,
6485 regs
, resume_state
, save_state
);
6488 #if !defined(REGEX_MALLOC) && !defined(__GNUC__)
6491 re_search_2 (struct re_pattern_buffer
*rxb
,
6492 __const__
char * string1
, int size1
,
6493 __const__
char * string2
, int size2
,
6494 int startpos
, int range
,
6495 struct re_registers
*regs
,
6499 re_search_2 (rxb
, string1
, size1
, string2
, size2
, startpos
, range
, regs
, stop
)
6500 struct re_pattern_buffer
*rxb
;
6501 __const__
char * string1
;
6503 __const__
char * string2
;
6507 struct re_registers
*regs
;
6512 ret
= inner_re_search_2 (rxb
, string1
, size1
, string2
, size2
, startpos
,
6520 /* Like re_search_2, above, but only one string is specified, and
6521 * doesn't let you say where to stop matching.
6526 re_search (struct re_pattern_buffer
* rxb
, __const__
char *string
,
6527 int size
, int startpos
, int range
,
6528 struct re_registers
*regs
)
6531 re_search (rxb
, string
, size
, startpos
, range
, regs
)
6532 struct re_pattern_buffer
* rxb
;
6533 __const__
char * string
;
6537 struct re_registers
*regs
;
6540 return re_search_2 (rxb
, 0, 0, string
, size
, startpos
, range
, regs
, size
);
6545 re_match_2 (struct re_pattern_buffer
* rxb
,
6546 __const__
char * string1
, int size1
,
6547 __const__
char * string2
, int size2
,
6548 int pos
, struct re_registers
*regs
, int stop
)
6551 re_match_2 (rxb
, string1
, size1
, string2
, size2
, pos
, regs
, stop
)
6552 struct re_pattern_buffer
* rxb
;
6553 __const__
char * string1
;
6555 __const__
char * string2
;
6558 struct re_registers
*regs
;
6562 struct re_registers some_regs
;
6566 int save
= rxb
->regs_allocated
;
6567 struct re_registers
* regs_to_pass
= regs
;
6571 some_regs
.start
= &start
;
6572 some_regs
.end
= &end
;
6573 some_regs
.num_regs
= 1;
6574 regs_to_pass
= &some_regs
;
6575 rxb
->regs_allocated
= REGS_FIXED
;
6578 srch
= re_search_2 (rxb
, string1
, size1
, string2
, size2
,
6579 pos
, 1, regs_to_pass
, stop
);
6580 if (regs_to_pass
!= regs
)
6581 rxb
->regs_allocated
= save
;
6584 return regs_to_pass
->end
[0] - regs_to_pass
->start
[0];
6587 /* re_match is like re_match_2 except it takes only a single string. */
6591 re_match (struct re_pattern_buffer
* rxb
,
6592 __const__
char * string
,
6594 struct re_registers
*regs
)
6597 re_match (rxb
, string
, size
, pos
, regs
)
6598 struct re_pattern_buffer
* rxb
;
6599 __const__
char *string
;
6602 struct re_registers
*regs
;
6605 return re_match_2 (rxb
, string
, size
, 0, 0, pos
, regs
, size
);
6610 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
6611 also be assigned to arbitrarily: each pattern buffer stores its own
6612 syntax, so it can be changed between regex compilations. */
6613 reg_syntax_t re_syntax_options
= RE_SYNTAX_EMACS
;
6616 /* Specify the precise syntax of regexps for compilation. This provides
6617 for compatibility for various utilities which historically have
6618 different, incompatible syntaxes.
6620 The argument SYNTAX is a bit mask comprised of the various bits
6621 defined in regex.h. We return the old syntax. */
6625 re_set_syntax (reg_syntax_t syntax
)
6628 re_set_syntax (syntax
)
6629 reg_syntax_t syntax
;
6632 reg_syntax_t ret
= re_syntax_options
;
6634 re_syntax_options
= syntax
;
6639 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
6640 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
6641 this memory for recording register information. STARTS and ENDS
6642 must be allocated using the malloc library routine, and must each
6643 be at least NUM_REGS * sizeof (regoff_t) bytes long.
6645 If NUM_REGS == 0, then subsequent matches should allocate their own
6648 Unless this function is called, the first search or match using
6649 PATTERN_BUFFER will allocate its own register data, without
6650 freeing the old data. */
6654 re_set_registers (struct re_pattern_buffer
*bufp
,
6655 struct re_registers
*regs
,
6657 regoff_t
* starts
, regoff_t
* ends
)
6660 re_set_registers (bufp
, regs
, num_regs
, starts
, ends
)
6661 struct re_pattern_buffer
*bufp
;
6662 struct re_registers
*regs
;
6670 bufp
->regs_allocated
= REGS_REALLOCATE
;
6671 regs
->num_regs
= num_regs
;
6672 regs
->start
= starts
;
6677 bufp
->regs_allocated
= REGS_UNALLOCATED
;
6679 regs
->start
= regs
->end
= (regoff_t
) 0;
6688 cplx_se_sublist_len (struct rx_se_list
* list
)
6691 cplx_se_sublist_len (list
)
6692 struct rx_se_list
* list
;
6698 if ((long)list
->car
>= 0)
6706 /* For rx->se_list_cmp */
6710 posix_se_list_order (struct rx
* rx
,
6711 struct rx_se_list
* a
, struct rx_se_list
* b
)
6714 posix_se_list_order (rx
, a
, b
)
6716 struct rx_se_list
* a
;
6717 struct rx_se_list
* b
;
6720 int al
= cplx_se_sublist_len (a
);
6721 int bl
= cplx_se_sublist_len (b
);
6726 : ((a
< b
) ? -1 : 1));
6736 rx_side_effect
* av
= ((rx_side_effect
*)
6737 alloca (sizeof (rx_side_effect
) * (al
+ 1)));
6738 rx_side_effect
* bv
= ((rx_side_effect
*)
6739 alloca (sizeof (rx_side_effect
) * (bl
+ 1)));
6740 struct rx_se_list
* ap
= a
;
6741 struct rx_se_list
* bp
= b
;
6744 for (ai
= al
- 1; ai
>= 0; --ai
)
6746 while ((long)ap
->car
< 0)
6751 av
[al
] = (rx_side_effect
)-2;
6752 for (bi
= bl
- 1; bi
>= 0; --bi
)
6754 while ((long)bp
->car
< 0)
6759 bv
[bl
] = (rx_side_effect
)-1;
6764 while (av
[x
] == bv
[x
])
6766 ret
= (((unsigned *)(av
[x
]) < (unsigned *)(bv
[x
])) ? -1 : 1);
6775 /* re_compile_pattern is the GNU regular expression compiler: it
6776 compiles PATTERN (of length SIZE) and puts the result in RXB.
6777 Returns 0 if the pattern was valid, otherwise an error string.
6779 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6780 are set in RXB on entry.
6782 We call rx_compile to do the actual compilation. */
6786 re_compile_pattern (__const__
char *pattern
,
6788 struct re_pattern_buffer
* rxb
)
6791 re_compile_pattern (pattern
, length
, rxb
)
6792 __const__
char *pattern
;
6794 struct re_pattern_buffer
* rxb
;
6799 /* GNU code is written to assume at least RE_NREGS registers will be set
6800 (and at least one extra will be -1). */
6801 rxb
->regs_allocated
= REGS_UNALLOCATED
;
6803 /* And GNU code determines whether or not to get register information
6804 by passing null for the REGS argument to re_match, etc., not by
6808 rxb
->rx
.local_cset_size
= 256;
6810 /* Match anchors at newline. */
6811 rxb
->newline_anchor
= 1;
6817 rxb
->rx
.epsnodec
= 0;
6818 rxb
->rx
.instruction_table
= 0;
6819 rxb
->rx
.nfa_states
= 0;
6820 rxb
->rx
.se_list_cmp
= posix_se_list_order
;
6821 rxb
->rx
.start_set
= 0;
6823 ret
= rx_compile (pattern
, length
, re_syntax_options
, rxb
);
6825 return rx_error_msg
[(int) ret
];
6832 re_compile_fastmap (struct re_pattern_buffer
* rxb
)
6835 re_compile_fastmap (rxb
)
6836 struct re_pattern_buffer
* rxb
;
6839 rx_blow_up_fastmap (rxb
);
6846 /* Entry points compatible with 4.2 BSD regex library. We don't define
6847 them if this is an Emacs or POSIX compilation. */
6849 #if (!defined (emacs) && !defined (_POSIX_SOURCE)) || defined(USE_BSD_REGEX)
6851 /* BSD has one and only one pattern buffer. */
6852 static struct re_pattern_buffer rx_comp_buf
;
6856 re_comp (__const__
char *s
)
6865 if (!s
|| (*s
== '\0'))
6867 if (!rx_comp_buf
.buffer
)
6868 return "No previous regular expression";
6872 if (!rx_comp_buf
.fastmap
)
6874 rx_comp_buf
.fastmap
= (char *) malloc (1 << CHARBITS
);
6875 if (!rx_comp_buf
.fastmap
)
6876 return "Memory exhausted";
6879 /* Since `rx_exec' always passes NULL for the `regs' argument, we
6880 don't need to initialize the pattern buffer fields which affect it. */
6882 /* Match anchors at newlines. */
6883 rx_comp_buf
.newline_anchor
= 1;
6885 rx_comp_buf
.fastmap_accurate
= 0;
6886 rx_comp_buf
.re_nsub
= 0;
6887 rx_comp_buf
.start
= 0;
6888 rx_comp_buf
.se_params
= 0;
6889 rx_comp_buf
.rx
.nodec
= 0;
6890 rx_comp_buf
.rx
.epsnodec
= 0;
6891 rx_comp_buf
.rx
.instruction_table
= 0;
6892 rx_comp_buf
.rx
.nfa_states
= 0;
6893 rx_comp_buf
.rx
.start
= 0;
6894 rx_comp_buf
.rx
.se_list_cmp
= posix_se_list_order
;
6895 rx_comp_buf
.rx
.start_set
= 0;
6896 rx_comp_buf
.rx
.local_cset_size
= 256;
6898 ret
= rx_compile (s
, strlen (s
), re_syntax_options
, &rx_comp_buf
);
6901 /* Yes, we're discarding `__const__' here. */
6902 return (char *) rx_error_msg
[(int) ret
];
6908 re_exec (__const__
char *s
)
6915 __const__
int len
= strlen (s
);
6917 0 <= re_search (&rx_comp_buf
, s
, len
, 0, len
, (struct re_registers
*) 0);
6919 #endif /* not emacs and not _POSIX_SOURCE */
6923 /* POSIX.2 functions. Don't define these for Emacs. */
6927 /* regcomp takes a regular expression as a string and compiles it.
6929 PREG is a regex_t *. We do not expect any fields to be initialized,
6930 since POSIX says we shouldn't. Thus, we set
6932 `buffer' to the compiled pattern;
6933 `used' to the length of the compiled pattern;
6934 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6935 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6936 RE_SYNTAX_POSIX_BASIC;
6937 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6938 `fastmap' and `fastmap_accurate' to zero;
6939 `re_nsub' to the number of subexpressions in PATTERN.
6941 PATTERN is the address of the pattern string.
6943 CFLAGS is a series of bits which affect compilation.
6945 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6946 use POSIX basic syntax.
6948 If REG_NEWLINE is set, then . and [^...] don't match newline.
6949 Also, regexec will try a match beginning after every newline.
6951 If REG_ICASE is set, then we considers upper- and lowercase
6952 versions of letters to be equivalent when matching.
6954 If REG_NOSUB is set, then when PREG is passed to regexec, that
6955 routine will report only success or failure, and nothing about the
6958 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6959 the return codes and their meanings.) */
6964 regcomp (regex_t
* preg
, __const__
char * pattern
, int cflags
)
6967 regcomp (preg
, pattern
, cflags
)
6969 __const__
char * pattern
;
6975 = cflags
& REG_EXTENDED
? RE_SYNTAX_POSIX_EXTENDED
: RE_SYNTAX_POSIX_BASIC
;
6977 /* regex_compile will allocate the space for the compiled pattern. */
6979 preg
->allocated
= 0;
6980 preg
->fastmap
= malloc (256);
6983 preg
->fastmap_accurate
= 0;
6985 if (cflags
& REG_ICASE
)
6989 preg
->translate
= (unsigned char *) malloc (256);
6990 if (!preg
->translate
)
6991 return (int) REG_ESPACE
;
6993 /* Map uppercase characters to corresponding lowercase ones. */
6994 for (i
= 0; i
< CHAR_SET_SIZE
; i
++)
6995 preg
->translate
[i
] = isupper (i
) ? tolower (i
) : i
;
6998 preg
->translate
= 0;
7000 /* If REG_NEWLINE is set, newlines are treated differently. */
7001 if (cflags
& REG_NEWLINE
)
7002 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7003 syntax
&= ~RE_DOT_NEWLINE
;
7004 syntax
|= RE_HAT_LISTS_NOT_NEWLINE
;
7005 /* It also changes the matching behavior. */
7006 preg
->newline_anchor
= 1;
7009 preg
->newline_anchor
= 0;
7011 preg
->no_sub
= !!(cflags
& REG_NOSUB
);
7013 /* POSIX says a null character in the pattern terminates it, so we
7014 can use strlen here in compiling the pattern. */
7017 preg
->se_params
= 0;
7018 preg
->syntax_parens
= 0;
7020 preg
->rx
.epsnodec
= 0;
7021 preg
->rx
.instruction_table
= 0;
7022 preg
->rx
.nfa_states
= 0;
7023 preg
->rx
.local_cset_size
= 256;
7025 preg
->rx
.se_list_cmp
= posix_se_list_order
;
7026 preg
->rx
.start_set
= 0;
7027 ret
= rx_compile (pattern
, strlen (pattern
), syntax
, preg
);
7030 /* POSIX doesn't distinguish between an unmatched open-group and an
7031 unmatched close-group: both are REG_EPAREN. */
7032 if (ret
== REG_ERPAREN
) ret
= REG_EPAREN
;
7038 /* regexec searches for a given pattern, specified by PREG, in the
7041 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
7042 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
7043 least NMATCH elements, and we set them to the offsets of the
7044 corresponding matched substrings.
7046 EFLAGS specifies `execution flags' which affect matching: if
7047 REG_NOTBOL is set, then ^ does not match at the beginning of the
7048 string; if REG_NOTEOL is set, then $ does not match at the end.
7050 We return 0 if we find a match and REG_NOMATCH if not. */
7054 regexec (__const__ regex_t
*preg
, __const__
char *string
,
7055 size_t nmatch
, regmatch_t pmatch
[],
7059 regexec (preg
, string
, nmatch
, pmatch
, eflags
)
7060 __const__ regex_t
*preg
;
7061 __const__
char *string
;
7063 regmatch_t pmatch
[];
7068 struct re_registers regs
;
7069 regex_t private_preg
;
7070 int len
= strlen (string
);
7071 boolean want_reg_info
= !preg
->no_sub
&& nmatch
> 0;
7073 private_preg
= *preg
;
7075 private_preg
.not_bol
= !!(eflags
& REG_NOTBOL
);
7076 private_preg
.not_eol
= !!(eflags
& REG_NOTEOL
);
7078 /* The user has told us exactly how many registers to return
7079 * information about, via `nmatch'. We have to pass that on to the
7080 * matching routines.
7082 private_preg
.regs_allocated
= REGS_FIXED
;
7086 regs
.num_regs
= nmatch
;
7087 regs
.start
= (( regoff_t
*) malloc ((nmatch
) * sizeof ( regoff_t
)));
7088 regs
.end
= (( regoff_t
*) malloc ((nmatch
) * sizeof ( regoff_t
)));
7089 if (regs
.start
== 0 || regs
.end
== 0)
7090 return (int) REG_NOMATCH
;
7093 /* Perform the searching operation. */
7094 ret
= re_search (&private_preg
,
7098 want_reg_info
? ®s
: (struct re_registers
*) 0);
7100 /* Copy the register information to the POSIX structure. */
7107 for (r
= 0; r
< nmatch
; r
++)
7109 pmatch
[r
].rm_so
= regs
.start
[r
];
7110 pmatch
[r
].rm_eo
= regs
.end
[r
];
7114 /* If we needed the temporary register info, free the space now. */
7119 /* We want zero return to mean success, unlike `re_search'. */
7120 return ret
>= 0 ? (int) REG_NOERROR
: (int) REG_NOMATCH
;
7124 /* Returns a message corresponding to an error code, ERRCODE, returned
7125 from either regcomp or regexec. */
7129 regerror (int errcode
, __const__ regex_t
*preg
,
7130 char *errbuf
, size_t errbuf_size
)
7133 regerror (errcode
, preg
, errbuf
, errbuf_size
)
7135 __const__ regex_t
*preg
;
7141 = rx_error_msg
[errcode
] == 0 ? "Success" : rx_error_msg
[errcode
];
7142 size_t msg_size
= strlen (msg
) + 1; /* Includes the 0. */
7144 if (errbuf_size
!= 0)
7146 if (msg_size
> errbuf_size
)
7148 strncpy (errbuf
, msg
, errbuf_size
- 1);
7149 errbuf
[errbuf_size
- 1] = 0;
7152 strcpy (errbuf
, msg
);
7159 /* Free dynamically allocated space used by PREG. */
7163 regfree (regex_t
*preg
)
7170 if (preg
->buffer
!= 0)
7171 free (preg
->buffer
);
7173 preg
->allocated
= 0;
7175 if (preg
->fastmap
!= 0)
7176 free (preg
->fastmap
);
7178 preg
->fastmap_accurate
= 0;
7180 if (preg
->translate
!= 0)
7181 free (preg
->translate
);
7182 preg
->translate
= 0;
7185 #endif /* not emacs */