Use c99types.h from dxcommon.
[liblbx.git] / src / glob.c
blob798ee08ec1e8c2e8f8a1331860bef712d354c7d9
1 /*
2 * Simple shell-like filename glob matcher.
3 * Copyright © 2024 Nick Bowler
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <https://www.gnu.org/licenses/>.
19 #include <config.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <limits.h>
24 #include <stddef.h>
26 #if CHAR_BIT != 8
27 # error this implementation assumes 8-bit bytes
28 #endif
30 #include "tools.h"
32 #define STATE_FLAG_LOOP 1u
35 * Define a nondeterministic transition for a particular state.
37 * The transitions are:
39 * - on a matching input, transition to the next state.
40 * - if the loop flag is set, on any input, stay in the same state.
41 * - on any input, transition to the dead state.
43 struct glob_transition {
44 struct bitmap256 map;
45 unsigned short flags;
49 * Reasonably compact representation of a nondeterministic finite automaton
50 * representing a shell-like pattern matching sequences of 8-bit bytes.
52 * Shell patterns are fairly restrictive. It is sufficient to have a
53 * single accepting state and the transitions above are sufficient for
54 * every possible state.
56 * States are numbered starting from 0 (the initial state), with the
57 * corresponding element of delta describing the transitions. The
58 * accepting state is always the highest numbered state. There is
59 * a dead state (with no outward transitions) which is not encoded.
61 struct glob_nfa {
62 unsigned char accepting;
64 struct glob_transition delta[FLEXIBLE_ARRAY_MEMBER];
67 static void bitmap256_set_bit(struct bitmap256 *map, unsigned char n)
69 map->map[BITMAP_IDX(n)] |= BITMAP_BITMASK(n);
72 static void bitmap256_invert(struct bitmap256 *map)
74 unsigned i;
76 for (i = 0; i < BITMAP256_COUNT; i++) {
77 map->map[i] = ~map->map[i];
82 * Sets the given bit n in the bitmap and clears all lower bits.
84 static void bitmap256_set_minimum(struct bitmap256 *map, unsigned char n)
86 unsigned i, lo_idx = BITMAP_IDX(n);
88 for (i = 0; i < lo_idx; i++) {
89 map->map[i] = 0;
92 map->map[i] |= BITMAP_BITMASK(n);
93 map->map[i] &= ~(BITMAP_BITMASK(n)-1);
97 * Returns a non-zero value if any bit is set in map.
99 static bitmap_slice bitmap256_any(struct bitmap256 *map)
101 bitmap_slice ret = 0;
102 unsigned i;
104 for (i = 0; i < BITMAP256_COUNT; i++) {
105 ret |= map->map[i];
108 return ret;
112 * Allocate glob NFA with the given number of states (minimum 1). The
113 * automaton initially accepts only the empty string.
115 static struct glob_nfa *glob_alloc(unsigned num_states)
117 return calloc(1, offsetof(struct glob_nfa, delta)
118 + num_states * sizeof (struct glob_transition));
122 * Given a glob NFA which decides some set of words L, transform it to one
123 * which decides the new set representing words prefixed by something in L:
125 * L' = { lc | l in L , c in S* }
127 * (where S* is the set of all words).
129 static void glob_append_star(struct glob_nfa *fsm)
131 fsm->delta[fsm->accepting].flags |= STATE_FLAG_LOOP;
135 * Given a glob NFA which decides some set of words L, and a character class C
136 * which describes a subset of the input alphabet, transform the NFA into one
137 * which decides the new set representing some word in L concatenated with
138 * some symbol in C:
140 * L' = { lc | l in L , c in C }
142 static void glob_append_class(struct glob_nfa *fsm, const struct bitmap256 *class)
144 struct glob_transition *t = &fsm->delta[fsm->accepting++];
146 t->map = *class;
150 * Parse the "meat" of a shell character class (that is, everything after
151 * the opening bracket) such as [abc0-9].
153 * The class is converted to a bitmap with the bits corresponding to matching
154 * characters set, and other bits clear. Returns the number of characters
155 * parsed.
157 * Notes on shell syntax:
159 * - If the very first character is a bang ('!'), then the class is inverted.
161 * - If the input begins with a hyphen ('-', possibly preceded by bang) then
162 * this is interpreted as a literal hyphen as opposed to its usual syntactic
163 * meaning as part of a character range.
165 * - If the input begins with a closing bracket (']', possibly preceded by bang
166 * and/or hyphen), then this is interpreted as a literal bracket as opposed
167 * to its usual syntactic meaning as the end of the character class.
169 * - Within a character range, if the final character is a hyphen or a bracket
170 * then this is interpreted literally as part of the character range.
172 static int parse_class(const char *in, struct bitmap256 *out)
174 const char *s = in;
175 unsigned char c;
176 int invert;
178 memset(out, 0, sizeof *out);
180 if ((invert = (*s == '!')))
181 s++;
182 if (*s == '-')
183 bitmap256_set_bit(out, *s++);
184 if (*s == ']')
185 bitmap256_set_bit(out, *s++);
187 for (; (c = *s) && c != ']'; s++) {
188 if (c == '-' && (c = *++s))
189 bitmap256_set_range(out, s[-2], c);
190 else
191 bitmap256_set_bit(out, c);
194 if (invert)
195 bitmap256_invert(out);
197 return s - in + !!c;
201 * Return an estimate of the number of states required to parse s.
203 * This estimate may be more than the true number of states but
204 * generally should not be less (if it is, then the pattern will
205 * fail to compile). The result is clamped to 256 as this is the
206 * maximum number of states supported.
208 static unsigned estimate_states(const char *s)
210 unsigned total = 1;
212 while (*s) {
213 size_t c;
215 if ((c = strcspn(s, "[")) >= 256 - total)
216 return 256;
218 total += c;
219 if (*(s += c)) {
220 if (total++ >= 256)
221 return 256;
223 s++;
224 if (*s == '!')
225 s++;
226 if (*s == '-')
227 s++;
228 if (*s == ']')
229 s++;
230 if (*(s += strcspn(s, "]")))
231 s++;
235 return total;
238 struct glob_nfa *glob_compile(const char *s)
240 unsigned estimate = estimate_states(s);
241 struct glob_nfa *fsm;
242 unsigned char c;
244 if (!(fsm = glob_alloc(estimate)))
245 return NULL;
247 for (; (c = *s); s++) {
248 struct bitmap256 class;
250 switch (c) {
251 case '*':
252 glob_append_star(fsm);
253 continue;
254 case '?':
255 memset(&class, -1, sizeof class);
256 break;
257 case '[':
258 s += parse_class(s+1, &class);
259 break;
260 default:
261 memset(&class, 0, sizeof class);
262 bitmap256_set_bit(&class, c);
265 if (--estimate == 0) {
266 /* too many states */
267 free(fsm);
268 return NULL;
271 glob_append_class(fsm, &class);
274 return fsm;
278 * We can use a perfect hashing technique to compute the base-2 logarithm of
279 * an exact power of two, as described in Leierson et al. "Using de Bruijn
280 * Sequences to Index a 1 in a Computer Word".
282 * The constants are defined such that each BITMAP_SHIFT-bit subsequence occurs
283 * exactly once. Thus, multiplication by an exact power of two shifts one of
284 * these unique subsequences into the upper bit positions (of the lower half
285 * of the product). Then we use that to index a precomputed lookup table.
287 #if BITMAP_SHIFT == 6
288 #define DEBRUIJN_IDX(x) ((((x)*0x218a392cd3d5dbf) & 0xffffffffffffffff) >> 58)
289 #define DEBRUIJN_LUT \
290 0, 1, 2, 7, 3, 13, 8, 19, 4, 25, 14, 28, 9, 34, 20, 40, \
291 5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57, \
292 63, 6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56, \
293 62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58
294 #elif BITMAP_SHIFT == 5
295 #define DEBRUIJN_IDX(x) ((((x)*0x4653adf & 0xffffffff)) >> 27)
296 #define DEBRUIJN_LUT \
297 0, 1, 2, 6, 3, 11, 7, 16, 4, 14, 12, 21, 8, 23, 17, 26, \
298 31, 5, 10, 15, 13, 20, 22, 25, 30, 9, 19, 24, 29, 18, 28, 27
299 #endif
302 * Iterate over each set bit in cur_state, then evaluate all associated
303 * transition functions with the given character c to determine next_state.
305 * The original value of cur_state is clobbered.
307 * It's wild but gcc-12 can actually identify the below as a "count trailing
308 * zeroes" operation and optimize it as such.
310 static int step_nfa(const struct glob_nfa *fsm, struct bitmap256 *cur_state,
311 struct bitmap256 *next_state, unsigned char c)
313 int loop_state = -1;
314 int i, j;
316 for (i = 0;;) {
317 static const unsigned char bitpos[] = { DEBRUIJN_LUT };
318 const struct glob_transition *t;
319 bitmap_slice v;
321 for (; i < BITMAP256_COUNT; i++) {
322 if ((v = cur_state->map[i]))
323 break;
326 if (!v) break;
328 cur_state->map[i] ^= (v &= -v);
329 j = (i << BITMAP_SHIFT) + bitpos[DEBRUIJN_IDX(v)];
330 t = &fsm->delta[j];
332 if (bitmap256_test_bit(&t->map, c)) {
333 bitmap256_set_bit(next_state, ++j);
335 if (j > loop_state && t[1].flags & STATE_FLAG_LOOP)
336 loop_state = j;
340 return loop_state;
343 int glob_match(const struct glob_nfa *fsm, const char *s)
345 int loop_state = -!(fsm->delta->flags & STATE_FLAG_LOOP);
346 struct bitmap256 cur_state = {1};
347 unsigned char c;
349 for (; (c = *s); s++) {
350 struct bitmap256 next_state = {0};
351 int rc;
354 * Short circuit if the loop state is an accepting one, as
355 * all remaining text will match.
357 if (fsm->accepting == loop_state)
358 return 1;
361 * Short circuit if we are in the dead state, as no remaining
362 * text can match.
364 if (!bitmap256_any(&cur_state))
365 return 0;
367 rc = step_nfa(fsm, &cur_state, &next_state, c);
368 if (rc > loop_state)
369 loop_state = rc;
372 * Short circuit by exiting any prior loop states, as they
373 * will not meaningfully affect the result.
375 if (loop_state >= 0) {
376 bitmap256_set_minimum(&next_state, loop_state);
379 cur_state = next_state;
382 return bitmap256_test_bit(&cur_state, fsm->accepting);