2 * Simple shell-like filename glob matcher.
3 * Copyright © 2024 Nick Bowler
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <https://www.gnu.org/licenses/>.
27 # error this implementation assumes 8-bit bytes
32 #define STATE_FLAG_LOOP 1u
35 * Define a nondeterministic transition for a particular state.
37 * The transitions are:
39 * - on a matching input, transition to the next state.
40 * - if the loop flag is set, on any input, stay in the same state.
41 * - on any input, transition to the dead state.
43 struct glob_transition
{
49 * Reasonably compact representation of a nondeterministic finite automaton
50 * representing a shell-like pattern matching sequences of 8-bit bytes.
52 * Shell patterns are fairly restrictive. It is sufficient to have a
53 * single accepting state and the transitions above are sufficient for
54 * every possible state.
56 * States are numbered starting from 0 (the initial state), with the
57 * corresponding element of delta describing the transitions. The
58 * accepting state is always the highest numbered state. There is
59 * a dead state (with no outward transitions) which is not encoded.
62 unsigned char accepting
;
64 struct glob_transition delta
[FLEXIBLE_ARRAY_MEMBER
];
67 static void bitmap256_set_bit(struct bitmap256
*map
, unsigned char n
)
69 map
->map
[BITMAP_IDX(n
)] |= BITMAP_BITMASK(n
);
72 static void bitmap256_invert(struct bitmap256
*map
)
76 for (i
= 0; i
< BITMAP256_COUNT
; i
++) {
77 map
->map
[i
] = ~map
->map
[i
];
82 * Sets the given bit n in the bitmap and clears all lower bits.
84 static void bitmap256_set_minimum(struct bitmap256
*map
, unsigned char n
)
86 unsigned i
, lo_idx
= BITMAP_IDX(n
);
88 for (i
= 0; i
< lo_idx
; i
++) {
92 map
->map
[i
] |= BITMAP_BITMASK(n
);
93 map
->map
[i
] &= ~(BITMAP_BITMASK(n
)-1);
97 * Returns a non-zero value if any bit is set in map.
99 static bitmap_slice
bitmap256_any(struct bitmap256
*map
)
101 bitmap_slice ret
= 0;
104 for (i
= 0; i
< BITMAP256_COUNT
; i
++) {
112 * Allocate glob NFA with the given number of states (minimum 1). The
113 * automaton initially accepts only the empty string.
115 static struct glob_nfa
*glob_alloc(unsigned num_states
)
117 return calloc(1, offsetof(struct glob_nfa
, delta
)
118 + num_states
* sizeof (struct glob_transition
));
122 * Given a glob NFA which decides some set of words L, transform it to one
123 * which decides the new set representing words prefixed by something in L:
125 * L' = { lc | l in L , c in S* }
127 * (where S* is the set of all words).
129 static void glob_append_star(struct glob_nfa
*fsm
)
131 fsm
->delta
[fsm
->accepting
].flags
|= STATE_FLAG_LOOP
;
135 * Given a glob NFA which decides some set of words L, and a character class C
136 * which describes a subset of the input alphabet, transform the NFA into one
137 * which decides the new set representing some word in L concatenated with
140 * L' = { lc | l in L , c in C }
142 static void glob_append_class(struct glob_nfa
*fsm
, const struct bitmap256
*class)
144 struct glob_transition
*t
= &fsm
->delta
[fsm
->accepting
++];
150 * Parse the "meat" of a shell character class (that is, everything after
151 * the opening bracket) such as [abc0-9].
153 * The class is converted to a bitmap with the bits corresponding to matching
154 * characters set, and other bits clear. Returns the number of characters
157 * Notes on shell syntax:
159 * - If the very first character is a bang ('!'), then the class is inverted.
161 * - If the input begins with a hyphen ('-', possibly preceded by bang) then
162 * this is interpreted as a literal hyphen as opposed to its usual syntactic
163 * meaning as part of a character range.
165 * - If the input begins with a closing bracket (']', possibly preceded by bang
166 * and/or hyphen), then this is interpreted as a literal bracket as opposed
167 * to its usual syntactic meaning as the end of the character class.
169 * - Within a character range, if the final character is a hyphen or a bracket
170 * then this is interpreted literally as part of the character range.
172 static int parse_class(const char *in
, struct bitmap256
*out
)
178 memset(out
, 0, sizeof *out
);
180 if ((invert
= (*s
== '!')))
183 bitmap256_set_bit(out
, *s
++);
185 bitmap256_set_bit(out
, *s
++);
187 for (; (c
= *s
) && c
!= ']'; s
++) {
188 if (c
== '-' && (c
= *++s
))
189 bitmap256_set_range(out
, s
[-2], c
);
191 bitmap256_set_bit(out
, c
);
195 bitmap256_invert(out
);
201 * Return an estimate of the number of states required to parse s.
203 * This estimate may be more than the true number of states but
204 * generally should not be less (if it is, then the pattern will
205 * fail to compile). The result is clamped to 256 as this is the
206 * maximum number of states supported.
208 static unsigned estimate_states(const char *s
)
215 if ((c
= strcspn(s
, "[")) >= 256 - total
)
230 if (*(s
+= strcspn(s
, "]")))
238 struct glob_nfa
*glob_compile(const char *s
)
240 unsigned estimate
= estimate_states(s
);
241 struct glob_nfa
*fsm
;
244 if (!(fsm
= glob_alloc(estimate
)))
247 for (; (c
= *s
); s
++) {
248 struct bitmap256
class;
252 glob_append_star(fsm
);
255 memset(&class, -1, sizeof class);
258 s
+= parse_class(s
+1, &class);
261 memset(&class, 0, sizeof class);
262 bitmap256_set_bit(&class, c
);
265 if (--estimate
== 0) {
266 /* too many states */
271 glob_append_class(fsm
, &class);
278 * We can use a perfect hashing technique to compute the base-2 logarithm of
279 * an exact power of two, as described in Leierson et al. "Using de Bruijn
280 * Sequences to Index a 1 in a Computer Word".
282 * The constants are defined such that each BITMAP_SHIFT-bit subsequence occurs
283 * exactly once. Thus, multiplication by an exact power of two shifts one of
284 * these unique subsequences into the upper bit positions (of the lower half
285 * of the product). Then we use that to index a precomputed lookup table.
287 #if BITMAP_SHIFT == 6
288 #define DEBRUIJN_IDX(x) ((((x)*0x218a392cd3d5dbf) & 0xffffffffffffffff) >> 58)
289 #define DEBRUIJN_LUT \
290 0, 1, 2, 7, 3, 13, 8, 19, 4, 25, 14, 28, 9, 34, 20, 40, \
291 5, 17, 26, 38, 15, 46, 29, 48, 10, 31, 35, 54, 21, 50, 41, 57, \
292 63, 6, 12, 18, 24, 27, 33, 39, 16, 37, 45, 47, 30, 53, 49, 56, \
293 62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58
294 #elif BITMAP_SHIFT == 5
295 #define DEBRUIJN_IDX(x) ((((x)*0x4653adf & 0xffffffff)) >> 27)
296 #define DEBRUIJN_LUT \
297 0, 1, 2, 6, 3, 11, 7, 16, 4, 14, 12, 21, 8, 23, 17, 26, \
298 31, 5, 10, 15, 13, 20, 22, 25, 30, 9, 19, 24, 29, 18, 28, 27
302 * Iterate over each set bit in cur_state, then evaluate all associated
303 * transition functions with the given character c to determine next_state.
305 * The original value of cur_state is clobbered.
307 * It's wild but gcc-12 can actually identify the below as a "count trailing
308 * zeroes" operation and optimize it as such.
310 static int step_nfa(const struct glob_nfa
*fsm
, struct bitmap256
*cur_state
,
311 struct bitmap256
*next_state
, unsigned char c
)
317 static const unsigned char bitpos
[] = { DEBRUIJN_LUT
};
318 const struct glob_transition
*t
;
321 for (; i
< BITMAP256_COUNT
; i
++) {
322 if ((v
= cur_state
->map
[i
]))
328 cur_state
->map
[i
] ^= (v
&= -v
);
329 j
= (i
<< BITMAP_SHIFT
) + bitpos
[DEBRUIJN_IDX(v
)];
332 if (bitmap256_test_bit(&t
->map
, c
)) {
333 bitmap256_set_bit(next_state
, ++j
);
335 if (j
> loop_state
&& t
[1].flags
& STATE_FLAG_LOOP
)
343 int glob_match(const struct glob_nfa
*fsm
, const char *s
)
345 int loop_state
= -!(fsm
->delta
->flags
& STATE_FLAG_LOOP
);
346 struct bitmap256 cur_state
= {1};
349 for (; (c
= *s
); s
++) {
350 struct bitmap256 next_state
= {0};
354 * Short circuit if the loop state is an accepting one, as
355 * all remaining text will match.
357 if (fsm
->accepting
== loop_state
)
361 * Short circuit if we are in the dead state, as no remaining
364 if (!bitmap256_any(&cur_state
))
367 rc
= step_nfa(fsm
, &cur_state
, &next_state
, c
);
372 * Short circuit by exiting any prior loop states, as they
373 * will not meaningfully affect the result.
375 if (loop_state
>= 0) {
376 bitmap256_set_minimum(&next_state
, loop_state
);
379 cur_state
= next_state
;
382 return bitmap256_test_bit(&cur_state
, fsm
->accepting
);