API: added some flags to compiler and executor; added non-utf8 mode
[libre9.git] / src / libre9 / re9.h
blob8466ccc79e7ea5a6a10b0507929fea1fd0733e3a
1 /*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose without fee is hereby granted, provided that this entire notice
7 * is included in all copies of any software which is or includes a copy
8 * or modification of this software and in all copies of the supporting
9 * documentation for such software.
10 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
15 #ifndef _REGEXP9_H_
16 #define _REGEXP9_H_ /* include guard; NOTE(review): identifiers starting with '_' + uppercase are reserved for the implementation -- consider a non-reserved name if no other code tests this macro */
18 #ifdef __cplusplus
19 extern "C" {
20 #endif
22 #include <stdint.h>
26 * A regular expression specifies a set of strings of characters. A
27 * member of this set of strings is said to be matched by the regular
28 * expression. In the following specification for regular expressions the
29 * word `character' means any character (rune) but newline.
31 * The syntax for a regular expression e0 is
32 * e3: literal | charclass | '.' | '^' | '$' | '(?:' e0 ')' | '(' e0 ')'
34 * e2: e3
35 * | e2 REP
37 * REP: '*' | '+' | '?'
39 * e1: e2
40 * | e1 e2
42 * e0: e1
43 * | e0 '|' e1
46 * A literal is any non-metacharacter, or a metacharacter (one of .*+?[]()|\^$) preceded by a '\'.
48 * A charclass is a nonempty string s bracketed [s] (or [^s]); it matches any
49 * character in (or not in) s. A negated character class never matches newline.
50 * A substring a-b, with a and b in ascending order, stands for the inclusive
51 * range of characters between a and b. In s, the metacharacters '-', ']', and an
52 * initial '^' must be preceded by a '\'; other metacharacters have no special
53 * meaning and may appear unescaped.
55 * A '.' matches any character (newline included only when RE9_FLAG_ANYDOT is set).
57 * A '^' matches the beginning of a line; '$' matches the end of the line.
59 * The REP operators match zero or more (*), one or more (+), zero or one (?),
60 * instances respectively of the preceding regular expression e2.
62 * An alternative regular expression, e0|e1, matches either a match to e0 or a match to e1.
64 * A match to any part of a regular expression extends as far as possible without preventing
65 * a match to the remainder of the regular expression.
67 * BUG: There is no way to specify or match a NUL character; NULs terminate patterns and strings.
71 typedef uint32_t re9_rune; /* one character (rune), stored as a 32-bit unsigned value */
73 typedef struct re9_sub_s re9_sub_t; /* subexpression match */
74 typedef struct re9_class_s re9_class_t; /* character class */
75 typedef struct re9_inst_s re9_inst_t; /* machine instruction */
76 typedef struct re9_prog_s re9_prog_t; /* compiled program */
79 enum { RE9_SUBEXP_MAX = 16 }; /* NOTE(review): presumably the maximum number of subexpressions; not referenced elsewhere in this header -- confirm */
82 /* subexpression matches: the start (s) and end (e) of one matched substring */
83 struct re9_sub_s {
84 union {
85 const char *sp; /* beginning of the matched substring (char string) */
86 const re9_rune *rsp; /* same position as a rune pointer; NOTE(review): presumably for rune-string input -- confirm */
87 } s;
88 union {
89 const char *ep; /* points just beyond the end of the matched substring (char string) */
90 const re9_rune *rep; /* same position as a rune pointer; NOTE(review): presumably for rune-string input -- confirm */
91 } e;
95 /* character class, each pair of runes defines a range */
96 struct re9_class_s {
97 re9_rune *end; /* NOTE(review): presumably points just past the last used entry of spans -- confirm */
98 re9_rune spans[64]; /* range endpoints stored pairwise, so room for at most 32 ranges */
102 /* machine instructions: one node of the compiled program */
103 struct re9_inst_s {
104 int type; /* instruction kind; the values are not declared in the visible part of this header */
105 union { /* operand; which member applies depends on the instruction kind */
106 re9_class_t *cp; /* class pointer */
107 re9_rune r; /* character */
108 int subid; /* sub-expression id for RBRA and LBRA */
109 re9_inst_t *right; /* right child of OR */
110 } u1;
111 union { /* regexp relies on these two being in the same union */
112 re9_inst_t *left; /* left child of OR */
113 re9_inst_t *next; /* next instruction for CAT & LBRA */
114 } u2;
118 enum {
119 RE9_PROG_CLASS_MAX = 16, /* number of entries in re9_prog_s.class[] */
120 RE9_PROG_FINST_MAX = 5 /* number of entries in re9_prog_s.firstinst[] */
124 /* program definition: what re9_compile() returns and re9_execute() runs */
125 struct re9_prog_s {
126 re9_inst_t *startinst; /* start pc */
127 re9_class_t class[RE9_PROG_CLASS_MAX]; /* .data; NOTE(review): 'class' is a C++ keyword, so a C++ compiler will reject this member despite the extern "C" guard in this header */
128 re9_inst_t firstinst[RE9_PROG_FINST_MAX]; /* .text; NOTE(review): only RE9_PROG_FINST_MAX slots are declared -- presumably the compiler allocates more space past the struct, confirm */
133 * re9_compile() compiles a regular expression and returns a pointer to the generated description.
134 * re9_compile() returns 0 for an illegal expression or other failure.
135 * Compiler is thread-safe.
136 * errmsg can be NULL; if *errmsg is not NULL, it should be free()d.
139 enum {
140 RE9_FLAG_NONE = 0, /* no flags */
141 RE9_FLAG_NONUTF8 = 0x01, /* non-UTF-8 mode (NOTE(review): presumably input is handled as single bytes -- confirm); for both compile and execute */
142 RE9_FLAG_ANYDOT = 0x02, /* '.' matches newline too; for both compile and execute */
143 RE9_FLAG_LITERAL = 0x10 /* only for compile; NOTE(review): presumably the pattern is taken as a literal string -- confirm */
146 extern re9_prog_t *re9_compile (const char *s, int flags, char **errmsg); /* s: regular expression to compile; flags: RE9_FLAG_* values; errmsg: may be NULL; returns 0 on failure */
149 * free compiled regular expression
151 extern void re9_free (re9_prog_t *p); /* p: compiled program, as returned by re9_compile() */
153 /* re9_execute() matches a null-terminated string against the compiled regular expression in progp.
154 * If it matches, re9_execute() returns 1 and fills in the array mp with character pointers to the
155 * substrings of string that correspond to the parenthesized subexpressions of the expression: mp[i].s.sp
156 * points to the beginning and mp[i].e.ep points just beyond the end of the ith substring.
157 * (Subexpression i begins at the ith left parenthesis, counting from 1.) Pointers in mp[0]
158 * pick out the substring that corresponds to the whole regular expression.
159 * If mp[0].s.sp is nonzero on entry, re9_execute() starts matching at that point within string.
160 * If mp[0].e.ep is nonzero on entry, the last character matched is the one preceding that point.
161 * Unused elements of mp are filled with zeros. Matches involving '*', '+' and '?' are extended as far as
162 * possible. The number of array elements in mp is given by ms.
163 * mp can be NULL and ms should be 0 in this case.
164 * re9_execute() returns 0 if string is not matched.
165 * Executor is thread-safe, one program can be used in multiple threads simultaneously.
168 /* progp: program to run
169 * bol: string to run machine on
170 * mp: subexpression elements (can be NULL)
171 * ms: number of elements at mp (should be 0 if mp is NULL)
173 extern int re9_execute (const re9_prog_t *progp, int flags, const char *bol, re9_sub_t *mp, int ms); /* flags: the RE9_FLAG_* values marked "for both compile and execute"; returns 1 on a match, 0 otherwise */
175 /* re9_sub() places in dp a substitution instance of sp in the context of the last re9_execute()
176 * performed using mp. Each instance of '\n', where n is a digit, is replaced by the string
177 * delimited by mp[n].s.sp and mp[n].e.ep. Each instance of '&' is replaced by the string
178 * delimited by mp[0].s.sp and mp[0].e.ep. The substitution will always be null terminated
179 * and trimmed to fit into dlen bytes.
182 /* sp: source string
183 * dp: destination string
184 * dlen: destination string size (FIXME: size_t)
185 * mp: subexpression elements
186 * ms: number of elements at mp
188 extern void re9_sub (const char *sp, char *dp, int dlen, const re9_sub_t *mp, int ms); /* writes the null-terminated result into dp, trimmed to dlen bytes; NOTE(review): dlen is int rather than size_t, and behavior for dlen <= 0 is not stated here */
191 #ifdef __cplusplus
193 #endif
194 #endif