1 /* GRegex -- regular expression API wrapper around PCRE.
3 * Copyright (C) 1999, 2000 Scott Wimer
4 * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
5 * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #if defined(G_DISABLE_SINGLE_INCLUDES) && !defined (__GLIB_H_INSIDE__) && !defined (GLIB_COMPILATION)
23 #error "Only <glib.h> can be included directly."
29 #include <glib/gerror.h>
30 #include <glib/gstring.h>
36 * @G_REGEX_ERROR_COMPILE: Compilation of the regular expression failed.
37 * @G_REGEX_ERROR_OPTIMIZE: Optimization of the regular expression failed.
38 * @G_REGEX_ERROR_REPLACE: Replacement failed due to an ill-formed replacement
40 * @G_REGEX_ERROR_MATCH: The match process failed.
41 * @G_REGEX_ERROR_INTERNAL: Internal error of the regular expression engine.
43 * @G_REGEX_ERROR_STRAY_BACKSLASH: "\\" at end of pattern. Since 2.16
44 * @G_REGEX_ERROR_MISSING_CONTROL_CHAR: "\\c" at end of pattern. Since 2.16
45 * @G_REGEX_ERROR_UNRECOGNIZED_ESCAPE: Unrecognized character follows "\\".
47 * @G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER: Numbers out of order in "{}"
48 * quantifier. Since 2.16
49 * @G_REGEX_ERROR_QUANTIFIER_TOO_BIG: Number too big in "{}" quantifier.
51 * @G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS: Missing terminating "]" for
52 * character class. Since 2.16
53 * @G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS: Invalid escape sequence
54 * in character class. Since 2.16
55 * @G_REGEX_ERROR_RANGE_OUT_OF_ORDER: Range out of order in character class.
57 * @G_REGEX_ERROR_NOTHING_TO_REPEAT: Nothing to repeat. Since 2.16
58 * @G_REGEX_ERROR_UNRECOGNIZED_CHARACTER: Unrecognized character after "(?",
59 * "(?<" or "(?P". Since 2.16
60 * @G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS: POSIX named classes are
61 * supported only within a class. Since 2.16
62 * @G_REGEX_ERROR_UNMATCHED_PARENTHESIS: Missing terminating ")" or ")"
63 * without opening "(". Since 2.16
64 * @G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE: Reference to non-existent
65 * subpattern. Since 2.16
66 * @G_REGEX_ERROR_UNTERMINATED_COMMENT: Missing terminating ")" after comment.
68 * @G_REGEX_ERROR_EXPRESSION_TOO_LARGE: Regular expression too large.
70 * @G_REGEX_ERROR_MEMORY_ERROR: Failed to get memory. Since 2.16
71 * @G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND: Lookbehind assertion is not
72 * fixed length. Since 2.16
73 * @G_REGEX_ERROR_MALFORMED_CONDITION: Malformed number or name after "(?(".
75 * @G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES: Conditional group contains
76 * more than two branches. Since 2.16
77 * @G_REGEX_ERROR_ASSERTION_EXPECTED: Assertion expected after "(?(".
79 * @G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME: Unknown POSIX class name.
81 * @G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED: POSIX collating
82 * elements are not supported. Since 2.16
83 * @G_REGEX_ERROR_HEX_CODE_TOO_LARGE: Character value in "\\x{...}" sequence
84 * is too large. Since 2.16
85 * @G_REGEX_ERROR_INVALID_CONDITION: Invalid condition "(?(0)". Since 2.16
86 * @G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND: \\C not allowed in
87 * lookbehind assertion. Since 2.16
88 * @G_REGEX_ERROR_INFINITE_LOOP: Recursive call could loop indefinitely.
90 * @G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR: Missing terminator
91 * in subpattern name. Since 2.16
92 * @G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME: Two named subpatterns have
93 * the same name. Since 2.16
94 * @G_REGEX_ERROR_MALFORMED_PROPERTY: Malformed "\\P" or "\\p" sequence.
96 * @G_REGEX_ERROR_UNKNOWN_PROPERTY: Unknown property name after "\\P" or
98 * @G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG: Subpattern name is too long
99 * (maximum 32 characters). Since 2.16
100 * @G_REGEX_ERROR_TOO_MANY_SUBPATTERNS: Too many named subpatterns (maximum
101 * 10,000). Since 2.16
102 * @G_REGEX_ERROR_INVALID_OCTAL_VALUE: Octal value is greater than "\\377".
104 * @G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE: "DEFINE" group contains more
105 * than one branch. Since 2.16
106 * @G_REGEX_ERROR_DEFINE_REPETION: Repeating a "DEFINE" group is not allowed.
108 * @G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS: Inconsistent newline options.
110 * @G_REGEX_ERROR_MISSING_BACK_REFERENCE: "\\g" is not followed by a braced
111 * name or an optionally braced non-zero number. Since 2.16
113 * Error codes returned by regular expression functions.
/*
 * GRegexError: error codes reported by the regular expression functions.
 *
 * The first five codes take their implicit sequential values starting at 0,
 * so their order must not change.  Every code from 101 upwards is pinned
 * explicitly to "PCRE error code + 100"; the numbering is not contiguous
 * and the explicit values must be preserved.
 */
typedef enum
{
  G_REGEX_ERROR_COMPILE,
  G_REGEX_ERROR_OPTIMIZE,
  G_REGEX_ERROR_REPLACE,
  G_REGEX_ERROR_MATCH,
  G_REGEX_ERROR_INTERNAL,

  /* These are the error codes from PCRE + 100 */
  G_REGEX_ERROR_STRAY_BACKSLASH = 101,
  G_REGEX_ERROR_MISSING_CONTROL_CHAR = 102,
  G_REGEX_ERROR_UNRECOGNIZED_ESCAPE = 103,
  G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER = 104,
  G_REGEX_ERROR_QUANTIFIER_TOO_BIG = 105,
  G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS = 106,
  G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS = 107,
  G_REGEX_ERROR_RANGE_OUT_OF_ORDER = 108,
  G_REGEX_ERROR_NOTHING_TO_REPEAT = 109,
  G_REGEX_ERROR_UNRECOGNIZED_CHARACTER = 112,
  G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS = 113,
  G_REGEX_ERROR_UNMATCHED_PARENTHESIS = 114,
  G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE = 115,
  G_REGEX_ERROR_UNTERMINATED_COMMENT = 118,
  G_REGEX_ERROR_EXPRESSION_TOO_LARGE = 120,
  G_REGEX_ERROR_MEMORY_ERROR = 121,
  G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND = 125,
  G_REGEX_ERROR_MALFORMED_CONDITION = 126,
  G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES = 127,
  G_REGEX_ERROR_ASSERTION_EXPECTED = 128,
  G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME = 130,
  G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED = 131,
  G_REGEX_ERROR_HEX_CODE_TOO_LARGE = 134,
  G_REGEX_ERROR_INVALID_CONDITION = 135,
  G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND = 136,
  G_REGEX_ERROR_INFINITE_LOOP = 140,
  G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR = 142,
  G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME = 143,
  G_REGEX_ERROR_MALFORMED_PROPERTY = 146,
  G_REGEX_ERROR_UNKNOWN_PROPERTY = 147,
  G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG = 148,
  G_REGEX_ERROR_TOO_MANY_SUBPATTERNS = 149,
  G_REGEX_ERROR_INVALID_OCTAL_VALUE = 151,
  G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE = 154,
  G_REGEX_ERROR_DEFINE_REPETION = 155,
  G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS = 156,
  G_REGEX_ERROR_MISSING_BACK_REFERENCE = 157
} GRegexError;
168 * Error domain for regular expressions. Errors in this domain will be
169 * from the #GRegexError enumeration. See #GError for information on
174 #define G_REGEX_ERROR g_regex_error_quark ()
176 GQuark
g_regex_error_quark (void);
179 * GRegexCompileFlags:
180 * @G_REGEX_CASELESS: Letters in the pattern match both upper- and
181 * lowercase letters. This option can be changed within a pattern
182 * by a "(?i)" option setting.
183 * @G_REGEX_MULTILINE: By default, GRegex treats the strings as consisting
184 * of a single line of characters (even if it actually contains
185 * newlines). The "start of line" metacharacter ("^") matches only
186 * at the start of the string, while the "end of line" metacharacter
187 * ("$") matches only at the end of the string, or before a terminating
188 * newline (unless #G_REGEX_DOLLAR_ENDONLY is set). When
189 * #G_REGEX_MULTILINE is set, the "start of line" and "end of line"
190 * constructs match immediately following or immediately before any
191 * newline in the string, respectively, as well as at the very start
192 * and end. This can be changed within a pattern by a "(?m)" option
194 * @G_REGEX_DOTALL: A dot metacharacter (".") in the pattern matches all
195 * characters, including newlines. Without it, newlines are excluded.
196 * This option can be changed within a pattern by a "(?s)" option setting.
197 * @G_REGEX_EXTENDED: Whitespace data characters in the pattern are
198 * totally ignored except when escaped or inside a character class.
199 * Whitespace does not include the VT character (code 11). In addition,
200 * characters between an unescaped "#" outside a character class and
201 * the next newline character, inclusive, are also ignored. This can
202 * be changed within a pattern by a "(?x)" option setting.
203 * @G_REGEX_ANCHORED: The pattern is forced to be "anchored", that is,
204 * it is constrained to match only at the first matching point in the
205 * string that is being searched. This effect can also be achieved by
206 * appropriate constructs in the pattern itself such as the "^"
208 * @G_REGEX_DOLLAR_ENDONLY: A dollar metacharacter ("$") in the pattern
209 * matches only at the end of the string. Without this option, a
210 * dollar also matches immediately before the final character if
211 * it is a newline (but not before any other newlines). This option
212 * is ignored if #G_REGEX_MULTILINE is set.
213 * @G_REGEX_UNGREEDY: Inverts the "greediness" of the quantifiers so that
214 * they are not greedy by default, but become greedy if followed by "?".
215 * It can also be set by a "(?U)" option setting within the pattern.
216 * @G_REGEX_RAW: Usually strings must be valid UTF-8 strings; with this
217 * flag they are considered as a raw sequence of bytes.
218 * @G_REGEX_NO_AUTO_CAPTURE: Disables the use of numbered capturing
219 * parentheses in the pattern. Any opening parenthesis that is not
220 * followed by "?" behaves as if it were followed by "?:" but named
221 * parentheses can still be used for capturing (and they acquire numbers
223 * @G_REGEX_OPTIMIZE: Optimize the regular expression. If the pattern will
224 * be used many times, then it may be worth the effort to optimize it
225 * to improve the speed of matches.
226 * @G_REGEX_DUPNAMES: Names used to identify capturing subpatterns need not
227 * be unique. This can be helpful for certain types of pattern when it
228 * is known that only one instance of the named subpattern can ever be
230 * @G_REGEX_NEWLINE_CR: Usually any newline character is recognized; if this
231 * option is set, the only recognized newline character is '\r'.
232 * @G_REGEX_NEWLINE_LF: Usually any newline character is recognized; if this
233 * option is set, the only recognized newline character is '\n'.
234 * @G_REGEX_NEWLINE_CRLF: Usually any newline character is recognized; if this
235 * option is set, the only recognized newline character sequence is '\r\n'.
237 * Flags specifying compile-time options.
/* Remember to update G_REGEX_COMPILE_MASK in gregex.c after
 * adding a new flag. */
/*
 * GRegexCompileFlags: bit flags specifying compile-time options.
 *
 * Each flag is pinned to an explicit bit; the bit positions are not
 * contiguous and must be preserved.  G_REGEX_NEWLINE_CRLF is the
 * combination of the CR and LF bits, not a bit of its own.
 */
typedef enum
{
  G_REGEX_CASELESS          = 1 << 0,
  G_REGEX_MULTILINE         = 1 << 1,
  G_REGEX_DOTALL            = 1 << 2,
  G_REGEX_EXTENDED          = 1 << 3,
  G_REGEX_ANCHORED          = 1 << 4,
  G_REGEX_DOLLAR_ENDONLY    = 1 << 5,
  G_REGEX_UNGREEDY          = 1 << 9,
  G_REGEX_RAW               = 1 << 11,
  G_REGEX_NO_AUTO_CAPTURE   = 1 << 12,
  G_REGEX_OPTIMIZE          = 1 << 13,
  G_REGEX_DUPNAMES          = 1 << 19,
  G_REGEX_NEWLINE_CR        = 1 << 20,
  G_REGEX_NEWLINE_LF        = 1 << 21,
  G_REGEX_NEWLINE_CRLF      = G_REGEX_NEWLINE_CR | G_REGEX_NEWLINE_LF
} GRegexCompileFlags;
263 * @G_REGEX_MATCH_ANCHORED: The pattern is forced to be "anchored", that is,
264 * it is constrained to match only at the first matching point in the
265 * string that is being searched. This effect can also be achieved by
266 * appropriate constructs in the pattern itself such as the "^"
268 * @G_REGEX_MATCH_NOTBOL: Specifies that the first character of the string is
269 * not the beginning of a line, so the circumflex metacharacter should
270 * not match before it. Setting this without #G_REGEX_MULTILINE (at
271 * compile time) causes circumflex never to match. This option affects
272 * only the behaviour of the circumflex metacharacter, it does not
274 * @G_REGEX_MATCH_NOTEOL: Specifies that the end of the subject string is
275 * not the end of a line, so the dollar metacharacter should not match
276 * it nor (except in multiline mode) a newline immediately before it.
277 * Setting this without #G_REGEX_MULTILINE (at compile time) causes
278 * dollar never to match. This option affects only the behaviour of
279 * the dollar metacharacter, it does not affect "\Z" or "\z".
280 * @G_REGEX_MATCH_NOTEMPTY: An empty string is not considered to be a valid
281 * match if this option is set. If there are alternatives in the pattern,
282 * they are tried. If all the alternatives match the empty string, the
283 * entire match fails. For example, if the pattern "a?b?" is applied to
284 * a string not beginning with "a" or "b", it matches the empty string
285 * at the start of the string. With this flag set, this match is not
286 * valid, so GRegex searches further into the string for occurrences
288 * @G_REGEX_MATCH_PARTIAL: Turns on the partial matching feature, for more
289 * documentation on partial matching see g_match_info_is_partial_match().
290 * @G_REGEX_MATCH_NEWLINE_CR: Overrides the newline definition set when
291 * creating a new #GRegex, setting the '\r' character as line terminator.
292 * @G_REGEX_MATCH_NEWLINE_LF: Overrides the newline definition set when
293 * creating a new #GRegex, setting the '\n' character as line terminator.
294 * @G_REGEX_MATCH_NEWLINE_CRLF: Overrides the newline definition set when
295 * creating a new #GRegex, setting the '\r\n' characters as line terminator.
296 * @G_REGEX_MATCH_NEWLINE_ANY: Overrides the newline definition set when
297 * creating a new #GRegex, any newline character or character sequence
300 * Flags specifying match-time options.
/* Remember to update G_REGEX_MATCH_MASK in gregex.c after
 * adding a new flag. */
/*
 * GRegexMatchFlags: bit flags specifying match-time options.
 *
 * Each flag is pinned to an explicit bit; the bit positions are not
 * contiguous and must be preserved.  G_REGEX_MATCH_NEWLINE_CRLF is the
 * combination of the CR and LF bits, not a bit of its own.
 */
typedef enum
{
  G_REGEX_MATCH_ANCHORED      = 1 << 4,
  G_REGEX_MATCH_NOTBOL        = 1 << 7,
  G_REGEX_MATCH_NOTEOL        = 1 << 8,
  G_REGEX_MATCH_NOTEMPTY      = 1 << 10,
  G_REGEX_MATCH_PARTIAL       = 1 << 15,
  G_REGEX_MATCH_NEWLINE_CR    = 1 << 20,
  G_REGEX_MATCH_NEWLINE_LF    = 1 << 21,
  G_REGEX_MATCH_NEWLINE_CRLF  = G_REGEX_MATCH_NEWLINE_CR | G_REGEX_MATCH_NEWLINE_LF,
  G_REGEX_MATCH_NEWLINE_ANY   = 1 << 22
} GRegexMatchFlags;
322 * A GRegex is the "compiled" form of a regular expression pattern. This
323 * structure is opaque and its fields cannot be accessed directly.
/* Opaque handle types: only the struct tags are declared here, so the
 * fields cannot be accessed through this header. */
typedef struct _GRegex GRegex;
typedef struct _GMatchInfo GMatchInfo;
333 * GRegexEvalCallback:
334 * @match_info: the #GMatchInfo generated by the match.
335 * Use g_match_info_get_regex() and g_match_info_get_string() if you
336 * need the #GRegex or the matched string.
337 * @result: a #GString containing the new string
338 * @user_data: user data passed to g_regex_replace_eval()
340 * Specifies the type of the function passed to g_regex_replace_eval().
341 * It is called for each occurrence of the pattern in the string passed
342 * to g_regex_replace_eval(), and it should append the replacement to
345 * Returns: %FALSE to continue the replacement process, %TRUE to stop it
349 typedef gboolean (*GRegexEvalCallback
) (const GMatchInfo
*match_info
,
354 GRegex
*g_regex_new (const gchar
*pattern
,
355 GRegexCompileFlags compile_options
,
356 GRegexMatchFlags match_options
,
358 GRegex
*g_regex_ref (GRegex
*regex
);
359 void g_regex_unref (GRegex
*regex
);
360 const gchar
*g_regex_get_pattern (const GRegex
*regex
);
361 gint
g_regex_get_max_backref (const GRegex
*regex
);
362 gint
g_regex_get_capture_count (const GRegex
*regex
);
363 gint
g_regex_get_string_number (const GRegex
*regex
,
365 gchar
*g_regex_escape_string (const gchar
*string
,
368 GRegexCompileFlags
g_regex_get_compile_flags (const GRegex
*regex
);
369 GRegexMatchFlags
g_regex_get_match_flags (const GRegex
*regex
);
372 gboolean
g_regex_match_simple (const gchar
*pattern
,
374 GRegexCompileFlags compile_options
,
375 GRegexMatchFlags match_options
);
376 gboolean
g_regex_match (const GRegex
*regex
,
378 GRegexMatchFlags match_options
,
379 GMatchInfo
**match_info
);
380 gboolean
g_regex_match_full (const GRegex
*regex
,
384 GRegexMatchFlags match_options
,
385 GMatchInfo
**match_info
,
387 gboolean
g_regex_match_all (const GRegex
*regex
,
389 GRegexMatchFlags match_options
,
390 GMatchInfo
**match_info
);
391 gboolean
g_regex_match_all_full (const GRegex
*regex
,
395 GRegexMatchFlags match_options
,
396 GMatchInfo
**match_info
,
399 /* String splitting. */
400 gchar
**g_regex_split_simple (const gchar
*pattern
,
402 GRegexCompileFlags compile_options
,
403 GRegexMatchFlags match_options
);
404 gchar
**g_regex_split (const GRegex
*regex
,
406 GRegexMatchFlags match_options
);
407 gchar
**g_regex_split_full (const GRegex
*regex
,
411 GRegexMatchFlags match_options
,
415 /* String replacement. */
416 gchar
*g_regex_replace (const GRegex
*regex
,
420 const gchar
*replacement
,
421 GRegexMatchFlags match_options
,
423 gchar
*g_regex_replace_literal (const GRegex
*regex
,
427 const gchar
*replacement
,
428 GRegexMatchFlags match_options
,
430 gchar
*g_regex_replace_eval (const GRegex
*regex
,
434 GRegexMatchFlags match_options
,
435 GRegexEvalCallback eval
,
438 gboolean
g_regex_check_replacement (const gchar
*replacement
,
439 gboolean
*has_references
,
443 GRegex
*g_match_info_get_regex (const GMatchInfo
*match_info
);
444 const gchar
*g_match_info_get_string (const GMatchInfo
*match_info
);
446 void g_match_info_free (GMatchInfo
*match_info
);
447 gboolean
g_match_info_next (GMatchInfo
*match_info
,
449 gboolean
g_match_info_matches (const GMatchInfo
*match_info
);
450 gint
g_match_info_get_match_count (const GMatchInfo
*match_info
);
451 gboolean
g_match_info_is_partial_match (const GMatchInfo
*match_info
);
452 gchar
*g_match_info_expand_references(const GMatchInfo
*match_info
,
453 const gchar
*string_to_expand
,
455 gchar
*g_match_info_fetch (const GMatchInfo
*match_info
,
457 gboolean
g_match_info_fetch_pos (const GMatchInfo
*match_info
,
461 gchar
*g_match_info_fetch_named (const GMatchInfo
*match_info
,
463 gboolean
g_match_info_fetch_named_pos (const GMatchInfo
*match_info
,
467 gchar
**g_match_info_fetch_all (const GMatchInfo
*match_info
);
471 #endif /* __G_REGEX_H__ */