2 **********************************************************************
3 * Copyright (C) 2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
10 * created on: 2004mar09
11 * created by: Andy Heninger
13 * ICU Regular Expressions, API for C
18 * \brief C API: Regular Expressions
20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p>
28 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
32 #ifndef U_HIDE_DRAFT_API
/* Forward declaration of the struct tag only; no members are declared at this point. */
34 struct URegularExpression
;
36 * Structure representing a compiled regular expression, plus the results
37 * of a match operation.
/* Lets the type be written as URegularExpression, without the struct keyword. */
40 typedef struct URegularExpression URegularExpression
;
42 #endif /* U_HIDE_DRAFT_API */
46 * Constants for Regular Expression Match Modes.
49 typedef enum URegexpFlag
{
50 /** Forces normalization of pattern and strings. @draft ICU 2.4 */
51 UREGEX_CANON_EQ
= 128,
53 /** Enable case insensitive matching. @stable ICU 2.4 */
54 UREGEX_CASE_INSENSITIVE
= 2,
56 /** Allow white space and comments within patterns @stable ICU 2.4 */
59 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end.
63 /** Control behavior of "$" and "^"
64 * If set, recognize line terminators within string,
65 * otherwise, match only at start and end of input string.
69 /** Unicode word boundaries.
70 * If set, \b uses the Unicode TR 29 definition of word boundaries.
71 * Warning: Unicode word boundaries are quite different from
72 * traditional regular expression word boundaries. See
73 * http://unicode.org/reports/tr29/#Word_Boundaries
80 * Open (compile) an ICU regular expression. Compiles the regular expression in
81 * string form into an internal representation using the specified match mode flags.
82 * The resulting regular expression handle can then be used to perform various
83 * matching operations.
85 * @param pattern The Regular Expression pattern to be compiled.
86 * @param patternLength The length of the pattern, or -1 if the pattern is
88 * @param flags Flags that alter the default matching behavior for
89 * the regular expression, UREGEX_CASE_INSENSITIVE, for
90 * example. For default behavior, set this parameter to zero.
91 * See <code>enum URegexpFlag</code>. All desired flags
92 * are bitwise-ORed together.
93 * @param pe Receives the position (line and column numbers) of any syntax
94 * error within the source regular expression string. If this
95 * information is not wanted, pass NULL for this parameter.
96 * @param status Receives errors detected by this function.
100 U_DRAFT URegularExpression
* U_EXPORT2
101 uregex_open( const UChar
*pattern
,
102 int32_t patternLength
,
108 * Open (compile) an ICU regular expression. The resulting regular expression
109 * handle can then be used to perform various matching operations.
111 * This function is the same as uregex_open, except that the pattern
112 * is supplied as an 8 bit char * string in the default code page.
114 * @param pattern The Regular Expression pattern to be compiled,
116 * @param flags Flags that alter the default matching behavior for
117 * the regular expression, UREGEX_CASE_INSENSITIVE, for
118 * example. For default behavior, set this parameter to zero.
119 * See <code>enum URegexpFlag</code>. All desired flags
120 * are bitwise-ORed together.
121 * @param pe Receives the position (line and column numbers) of any syntax
122 * error within the source regular expression string. If this
123 * information is not wanted, pass NULL for this parameter.
124 * @param status Receives errors detected by this function.
125 * @return The URegularExpression object representing the compiled
130 U_DRAFT URegularExpression
* U_EXPORT2
131 uregex_openC( const char *pattern
,
140 * Close the regular expression, recovering all resources (memory) it
143 * @param regexp The regular expression to be closed.
/* Prototype: closes the regular expression handle passed as regexp; no value is returned. */
146 U_DRAFT
void U_EXPORT2
147 uregex_close(URegularExpression
*regexp
);
150 * Make a copy of a compiled regular expression. Cloning a regular
151 * expression is faster than opening a second instance from the source
152 * form of the expression, and requires less memory.
154 * Note that the current input string and the position of any matched text
155 * within it are not cloned; only the pattern itself and the
156 * match mode flags are copied.
158 * Cloning can be particularly useful to threaded applications that perform
159 * multiple match operations in parallel. Each concurrent RE
160 * operation requires its own instance of a URegularExpression.
162 * @param regexp The compiled regular expression to be cloned.
163 * @param status Receives indication of any errors encountered
164 * @return the cloned copy of the compiled regular expression.
/* Prototype: returns a cloned copy of the compiled expression regexp; errors are reported through status. */
167 U_DRAFT URegularExpression
* U_EXPORT2
168 uregex_clone(const URegularExpression
*regexp
, UErrorCode
*status
);
171 * Return a pointer to the source form of the pattern for this regular expression.
173 * @param regexp The compiled regular expression.
174 * @param patLength This output parameter will be set to the length of the
175 * pattern string. A NULL pointer may be used here if the
176 * pattern length is not needed, as would be the case if
177 * the pattern is known in advance to be a NUL terminated
179 * @param status Receives errors detected by this function.
180 * @return a pointer to the pattern string. The storage for the string is
181 * owned by the regular expression object, and must not be
182 * altered or deleted by the application. The returned string
183 * will remain valid until the regular expression is closed.
186 U_DRAFT
const UChar
* U_EXPORT2
187 uregex_pattern(const URegularExpression
*regexp
,
193 * Get the match mode flags that were specified when compiling this regular expression.
194 * @param status Receives errors detected by this function.
195 * @param regexp The compiled regular expression.
196 * @return The match mode flags
200 U_DRAFT
int32_t U_EXPORT2
201 uregex_flags(const URegularExpression
*regexp
,
206 * Set the subject text string upon which the regular expression will look for matches.
207 * This function may be called any number of times, allowing the regular
208 * expression pattern to be applied to different strings.
210 * Regular expression matching operations work directly on the application's
211 * string data. No copy is made. The subject string data must not be
212 * altered after calling this function until after all regular expression
213 * operations involving this string data are completed.
215 * Zero length strings are permitted. In this case, no subsequent match
216 * operation will dereference the text string pointer.
218 * @param regexp The compiled regular expression.
219 * @param text The subject text string.
220 * @param textLength The length of the subject text, or -1 if the string
222 * @param status Receives errors detected by this function.
225 U_DRAFT
void U_EXPORT2
226 uregex_setText(URegularExpression
*regexp
,
232 * Get the subject text that is currently associated with this
233 * regular expression object. This simply returns whatever string
234 * pointer was previously supplied via uregex_setText().
236 * @param regexp The compiled regular expression.
237 * @param textLength The length of the string is returned in this output parameter.
238 * A NULL pointer may be used here if the
239 * text length is not needed, as would be the case if
240 * the text is known in advance to be a NUL terminated
242 * @param status Receives errors detected by this function.
243 * @return Pointer to the subject text string currently associated with
244 * this regular expression.
247 U_DRAFT
const UChar
* U_EXPORT2
248 uregex_getText(URegularExpression
*regexp
,
253 * Attempts to match the input string, beginning at startIndex, against the pattern.
254 * To succeed, the match must extend to the end of the input string.
256 * @param regexp The compiled regular expression.
257 * @param startIndex The input string index at which to begin matching.
258 * @param status Receives errors detected by this function.
259 * @return TRUE if there is a match
262 U_DRAFT UBool U_EXPORT2
263 uregex_matches(URegularExpression
*regexp
,
268 * Attempts to match the input string, starting from the specified index, against the pattern.
269 * The match may be of any length, and is not required to extend to the end
270 * of the input string. Contrast with uregex_matches().
272 * <p>If the match succeeds then more information can be obtained via the
273 * <code>uregex_start()</code>, <code>uregex_end()</code>,
274 * and <code>uregex_group()</code> functions.</p>
276 * @param regexp The compiled regular expression.
277 * @param startIndex The input string index at which to begin matching.
278 * @param status A reference to a UErrorCode to receive any errors.
279 * @return TRUE if there is a match.
282 U_DRAFT UBool U_EXPORT2
283 uregex_lookingAt(URegularExpression
*regexp
,
288 * Find the first matching substring of the input string that matches the pattern.
289 * The search for a match begins at the specified index.
290 * If a match is found, <code>uregex_start(), uregex_end()</code>, and
291 * <code>uregex_group()</code> will provide more information regarding the match.
293 * @param regexp The compiled regular expression.
294 * @param startIndex The position in the input string to begin the search
295 * @param status A reference to a UErrorCode to receive any errors.
296 * @return TRUE if a match is found.
299 U_DRAFT UBool U_EXPORT2
300 uregex_find(URegularExpression
*regexp
,
305 * Find the next pattern match in the input string.
306 * Begin searching the input at the location following the end of
307 * the previous match, or at the start of the string if there is no previous match.
308 * If a match is found, <code>uregex_start(), uregex_end()</code>, and
309 * <code>uregex_group()</code> will provide more information regarding the match.
311 * @param regexp The compiled regular expression.
312 * @param status A reference to a UErrorCode to receive any errors.
313 * @return TRUE if a match is found.
317 U_DRAFT UBool U_EXPORT2
318 uregex_findNext(URegularExpression
*regexp
,
322 * Get the number of capturing groups in this regular expression's pattern.
323 * @param regexp The compiled regular expression.
324 * @param status A reference to a UErrorCode to receive any errors.
325 * @return the number of capture groups
328 U_DRAFT
int32_t U_EXPORT2
329 uregex_groupCount(URegularExpression
*regexp
,
332 /** Extract the string for the specified matching expression or subexpression.
333 * Group #0 is the complete string of matched text.
334 * Group #1 is the text matched by the first set of capturing parentheses.
336 * @param regexp The compiled regular expression.
337 * @param groupNum The capture group to extract. Group 0 is the complete
338 * match. The value of this parameter must be
339 * less than or equal to the number of capture groups in
341 * @param dest Buffer to receive the matching string data
342 * @param destCapacity Capacity of the dest buffer.
343 * @param status A reference to a UErrorCode to receive any errors.
344 * @return Length of matching data,
345 * or -1 if no applicable match.
348 U_DRAFT
int32_t U_EXPORT2
349 uregex_group(URegularExpression
*regexp
,
352 int32_t destCapacity
,
357 * Returns the index in the input string of the start of the text matched by the
358 * specified capture group during the previous match operation. Return -1 if
359 * the capture group was not part of the last match.
360 * Group #0 refers to the complete range of matched text.
361 * Group #1 refers to the text matched by the first set of capturing parentheses.
363 * @param regexp The compiled regular expression.
364 * @param groupNum The capture group number
365 * @param status A reference to a UErrorCode to receive any errors.
366 * @return the starting position in the input of the text matched
367 * by the specified group.
370 U_DRAFT
int32_t U_EXPORT2
371 uregex_start(URegularExpression
*regexp
,
376 * Returns the index in the input string of the position following the end
377 * of the text matched by the specified capture group.
378 * Return -1 if the capture group was not part of the last match.
379 * Group #0 refers to the complete range of matched text.
380 * Group #1 refers to the text matched by the first set of capturing parentheses.
382 * @param regexp The compiled regular expression.
383 * @param groupNum The capture group number
384 * @param status A reference to a UErrorCode to receive any errors.
385 * @return the index of the position following the last matched character.
388 U_DRAFT
int32_t U_EXPORT2
389 uregex_end(URegularExpression
*regexp
,
394 * Reset any saved state from the previous match. Has the effect of
395 * causing uregex_findNext to begin at the specified index, and causing
396 * uregex_start(), uregex_end() and uregex_group() to return an error
397 * indicating that there is no match information available.
399 * @param regexp The compiled regular expression.
400 * @param index The position in the text at which a
401 * uregex_findNext() should begin searching.
402 * @param status A reference to a UErrorCode to receive any errors.
405 U_DRAFT
void U_EXPORT2
406 uregex_reset(URegularExpression
*regexp
,
411 * Replaces every substring of the input that matches the pattern
412 * with the given replacement string. This is a convenience function that
413 * provides a complete find-and-replace-all operation.
415 * This method scans the input string looking for matches of the pattern.
416 * Input that is not part of any match is copied unchanged to the
417 * destination buffer. Matched regions are replaced in the output
418 * buffer by the replacement string. The replacement string may contain
419 * references to capture groups; these take the form of $1, $2, etc.
421 * @param regexp The compiled regular expression.
422 * @param replacementText A string containing the replacement text.
423 * @param replacementLength The length of the replacement string, or
424 * -1 if it is NUL terminated.
425 * @param destBuf A (UChar *) buffer that will receive the result.
426 * @param destCapacity The capacity of the destination buffer.
427 * @param status A reference to a UErrorCode to receive any errors.
428 * @return The length of the string resulting from the find
429 * and replace operation. In the event that the
430 * destination capacity is inadequate, the return value
431 * is still the full length of the untruncated string.
434 U_DRAFT
int32_t U_EXPORT2
435 uregex_replaceAll(URegularExpression
*regexp
,
436 UChar
*replacementText
,
437 int32_t replacementLength
,
439 int32_t destCapacity
,
444 * Replaces the first substring of the input that matches the pattern
445 * with the given replacement string. This is a convenience function that
446 * provides a complete find-and-replace operation.
448 * This method scans the input string looking for a match of the pattern.
449 * All input that is not part of the match is copied unchanged to the
450 * destination buffer. The matched region is replaced in the output
451 * buffer by the replacement string. The replacement string may contain
452 * references to capture groups; these take the form of $1, $2, etc.
454 * @param regexp The compiled regular expression.
455 * @param replacementText A string containing the replacement text.
456 * @param replacementLength The length of the replacement string, or
457 * -1 if it is NUL terminated.
458 * @param destBuf A (UChar *) buffer that will receive the result.
459 * @param destCapacity The capacity of the destination buffer.
460 * @param status A reference to a UErrorCode to receive any errors.
461 * @return The length of the string resulting from the find
462 * and replace operation. In the event that the
463 * destination capacity is inadequate, the return value
464 * is still the full length of the untruncated string.
467 U_DRAFT
int32_t U_EXPORT2
468 uregex_replaceFirst(URegularExpression
*regexp
,
469 UChar
*replacementText
,
470 int32_t replacementLength
,
472 int32_t destCapacity
,
477 * Implements a replace operation intended to be used as part of an
478 * incremental find-and-replace.
480 * <p>The input string, starting from the end of the previous match and ending at
481 * the start of the current match, is appended to the destination string. Then the
482 * replacement string is appended to the output string,
483 * including handling any substitutions of captured text.</p>
485 * <p>A note on preflight computation of buffersize and error handling:
486 * Calls to uregex_appendReplacement() and uregex_appendTail() are
487 * designed to be chained, one after another, with the destination
488 * buffer pointer and buffer capacity updated after each in preparation
489 * for the next. If the destination buffer is exhausted partway through such a
490 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal
491 * ICU conventions are for a function to perform no action if it is
492 * called with an error status, but for this one case, uregex_appendReplacement()
493 * will operate normally so that buffer size computations will complete
496 * <p>For simple, prepackaged, non-incremental find-and-replace
497 * operations, see uregex_replaceFirst() or uregex_replaceAll().</p>
499 * @param regexp The regular expression object.
500 * @param replacementText The string that will replace the matched portion of the
501 * input string as it is copied to the destination buffer.
502 * The replacement text may contain references ($1, for
503 * example) to capture groups from the match.
504 * @param replacementLength The length of the replacement text string,
505 * or -1 if the string is NUL terminated.
506 * @param destBuf The buffer into which the results of the
507 * find-and-replace are placed. On return, this pointer
508 * will be updated to refer to the beginning of the
509 * unused portion of buffer, leaving it in position for
510 * a subsequent call to this function.
511 * @param destCapacity The size of the output buffer. On return, this
512 * parameter will be updated to reflect the space remaining
513 * unused in the output buffer.
514 * @param status A reference to a UErrorCode to receive any errors.
515 * @return The length of the result string. In the event that
516 * destCapacity is inadequate, the full length of the
517 * untruncated output string is returned.
522 U_DRAFT
int32_t U_EXPORT2
523 uregex_appendReplacement(URegularExpression
*regexp
,
524 UChar
*replacementText
,
525 int32_t replacementLength
,
527 int32_t *destCapacity
,
532 * As the final step in a find-and-replace operation, append the remainder
533 * of the input string, starting at the position following the last match,
534 * to the destination string. <code>uregex_appendTail()</code> is intended
535 * to be invoked after one or more invocations of the
536 * <code>uregex_appendReplacement()</code> function.
538 * @param regexp The regular expression object. This is needed to
539 * obtain the input string and the position
540 * of the last match within it.
541 * @param destBuf The buffer in which the results of the
542 * find-and-replace are placed. On return, the pointer
543 * will be updated to refer to the beginning of the
544 * unused portion of buffer.
545 * @param destCapacity The size of the output buffer. On return, this
546 * value will be updated to reflect the space remaining
547 * unused in the output buffer.
548 * @param status A reference to a UErrorCode to receive any errors.
549 * @return The length of the result string. In the event that
550 * destCapacity is inadequate, the full length of the
551 * untruncated output string is returned.
555 U_DRAFT
int32_t U_EXPORT2
556 uregex_appendTail(URegularExpression
*regexp
,
558 int32_t *destCapacity
,
565 * Split a string into fields. Somewhat like split() from Perl.
566 * The pattern matches identify delimiters that separate the input
567 * into fields. The input data between the matches becomes the
570 * Each of the fields is copied from the input string to the destination
571 * buffer, and then NUL terminated. The position of each field within
572 * the destination buffer is returned in the destFields array.
574 * @param regexp The compiled regular expression.
575 * @param destBuf A (UChar *) buffer to receive the fields that
576 * are extracted from the input string. These
577 * field pointers will refer to positions within the
578 * destination buffer supplied by the caller. Any
579 * extra positions within the destFields array will be
581 * @param destCapacity The capacity of the destBuf.
582 * @param requiredCapacity The actual capacity required of the destBuf.
583 * If destCapacity is too small, requiredCapacity will return
584 * the total capacity required to hold all of the output, and
585 * a U_BUFFER_OVERFLOW_ERROR will be returned.
586 * @param destFields An array to be filled with the position of each
587 * of the extracted fields within destBuf.
588 * @param destFieldsCapacity The number of elements in the destFields array.
589 * If the number of fields found is less than destFieldsCapacity,
590 * the extra destFields elements are set to zero.
591 * If destFieldsCapacity is too small, the trailing part of the
592 * input, including any field delimiters, is treated as if it
593 * were the last field - it is copied to the destBuf, and
594 * its position in the destBuf is stored in the last element
595 * of destFields. This behavior mimics that of Perl. It is not
596 * an error condition, and no error status is returned when all destField
597 * positions are used.
598 * @param status A reference to a UErrorCode to receive any errors.
599 * @return The number of fields into which the input string was split.
602 * Note: another choice for the design of this function would be to not
603 * copy the resulting fields at all, but to return indexes and
604 * lengths within the source text.
605 * Advantages would be
606 * o Faster. No Copying.
607 * o Nothing extra needed when field data may contain embedded NUL chars.
608 * o Less memory needed if working on large data.
610 * o Less consistent with C++ split, which copies into an
611 * array of UnicodeStrings.
612 * o No NUL termination, extracted fields would be less convenient
613 * to use in most cases.
614 * o Possible problems in the future, when support for Unicode Normalization
615 * could cause the fields to not correspond exactly to
616 * a range of the source text.
618 U_DRAFT
int32_t U_EXPORT2
619 uregex_split( URegularExpression
*regexp
,
621 int32_t destCapacity
,
622 int32_t *requiredCapacity
,
624 int32_t destFieldsCapacity
,
629 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
630 #endif /* UREGEX_H */