class library: PriorityQueue - implement removeValue, hide array
[supercollider.git] / external_libraries / icu / unicode / uregex.h
blobbee0e5bc692523bcc639e83cd610e2d81b7756cb
1 /*
2 **********************************************************************
3 * Copyright (C) 2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: uregex.h
7 * encoding: US-ASCII
8 * indentation:4
10 * created on: 2004mar09
11 * created by: Andy Heninger
13 * ICU Regular Expressions, API for C
16 /**
17 * \file
18 * \brief C API: Regular Expressions
20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p>
23 #ifndef UREGEX_H
24 #define UREGEX_H
26 #include "utypes.h"
28 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
30 #include "parseerr.h"
32 #ifndef U_HIDE_DRAFT_API
34 struct URegularExpression;
35 /**
36 * Structure representing a compiled regular expression, plus the results
37 * of a match operation. Only forward-declared in this header; the functions declared here take or return it by pointer.
38 * @draft ICU 3.0
40 typedef struct URegularExpression URegularExpression;
42 #endif /* U_HIDE_DRAFT_API */
45 /**
46 * Constants for Regular Expression Match Modes. Each constant is a distinct single bit, so several modes can be combined with bitwise OR.
47 * @stable ICU 2.4
49 typedef enum URegexpFlag{
50 /** Forces normalization of pattern and strings. @draft ICU 2.4 */
51 UREGEX_CANON_EQ = 128,
53 /** Enable case insensitive matching. @stable ICU 2.4 */
54 UREGEX_CASE_INSENSITIVE = 2,
56 /** Allow white space and comments within patterns. @stable ICU 2.4 */
57 UREGEX_COMMENTS = 4,
59 /** If set, '.' matches line terminators; otherwise '.' stops matching at a line end.
60 * @stable ICU 2.4 */
61 UREGEX_DOTALL = 32,
63 /** Control behavior of "$" and "^".
64 * If set, recognize line terminators within the string;
65 * otherwise, match only at start and end of input string.
66 * @stable ICU 2.4 */
67 UREGEX_MULTILINE = 8,
69 /** Unicode word boundaries.
70 * If set, \b uses the Unicode TR 29 definition of word boundaries.
71 * Warning: Unicode word boundaries are quite different from
72 * traditional regular expression word boundaries. See
73 * http://unicode.org/reports/tr29/#Word_Boundaries
74 * @draft ICU 2.8
76 UREGEX_UWORD = 256
77 } URegexpFlag;
79 /**
80 * Open (compile) an ICU regular expression. Compiles the regular expression in
81 * string form into an internal representation using the specified match mode flags.
82 * The resulting regular expression handle can then be used to perform various
83 * matching operations.
85 * @param pattern The Regular Expression pattern to be compiled.
86 * @param patternLength The length of the pattern, or -1 if the pattern is
87 * NUL terminated.
88 * @param flags Flags that alter the default matching behavior for
89 * the regular expression, UREGEX_CASE_INSENSITIVE, for
90 * example. For default behavior, set this parameter to zero.
91 * See <code>enum URegexpFlag</code>. All desired flags
92 * are bitwise-ORed together.
93 * @param pe Receives the position (line and column numbers) of any syntax
94 * error within the source regular expression string. If this
95 * information is not wanted, pass NULL for this parameter.
96 * @param status Receives errors detected by this function.
97 * @draft ICU 3.0
100 U_DRAFT URegularExpression * U_EXPORT2
101 uregex_open( const UChar *pattern,
102 int32_t patternLength,
103 uint32_t flags,
104 UParseError *pe,
105 UErrorCode *status);
108 * Open (compile) an ICU regular expression. The resulting regular expression
109 * handle can then be used to perform various matching operations.
110 * <p>
111 * This function is the same as uregex_open, except that the pattern
112 * is supplied as an 8 bit char * string in the default code page.
114 * @param pattern The Regular Expression pattern to be compiled,
115 * NUL terminated.
116 * @param flags Flags that alter the default matching behavior for
117 * the regular expression, UREGEX_CASE_INSENSITIVE, for
118 * example. For default behavior, set this parameter to zero.
119 * See <code>enum URegexpFlag</code>. All desired flags
120 * are bitwise-ORed together.
121 * @param pe Receives the position (line and column numbers) of any syntax
122 * error within the source regular expression string. If this
123 * information is not wanted, pass NULL for this parameter.
124 * @param status Receives errors detected by this function.
125 * @return The URegularExpression object representing the compiled
126 * pattern.
128 * @draft ICU 3.0
130 U_DRAFT URegularExpression * U_EXPORT2
131 uregex_openC( const char *pattern,
132 uint32_t flags,
133 UParseError *pe,
134 UErrorCode *status);
140 * Close the regular expression, recovering all resources (memory) it
141 * was holding.
143 * @param regexp The regular expression to be closed.
144 * @draft ICU 3.0
146 U_DRAFT void U_EXPORT2
147 uregex_close(URegularExpression *regexp);
150 * Make a copy of a compiled regular expression. Cloning a regular
151 * expression is faster than opening a second instance from the source
152 * form of the expression, and requires less memory.
153 * <p>
154 * Note that the current input string and the position of any matched text
155 * within it are not cloned; only the pattern itself and the
156 * match mode flags are copied.
157 * <p>
158 * Cloning can be particularly useful to threaded applications that perform
159 * multiple match operations in parallel. Each concurrent RE
160 * operation requires its own instance of a URegularExpression.
162 * @param regexp The compiled regular expression to be cloned.
163 * @param status Receives indication of any errors encountered
164 * @return the cloned copy of the compiled regular expression.
165 * @draft ICU 3.0
167 U_DRAFT URegularExpression * U_EXPORT2
168 uregex_clone(const URegularExpression *regexp, UErrorCode *status);
171 * Return a pointer to the source form of the pattern for this regular expression.
173 * @param regexp The compiled regular expression.
174 * @param patLength This output parameter will be set to the length of the
175 * pattern string. A NULL pointer may be used here if the
176 * pattern length is not needed, as would be the case if
177 * the pattern is known in advance to be a NUL terminated
178 * string.
179 * @param status Receives errors detected by this function.
180 * @return a pointer to the pattern string. The storage for the string is
181 * owned by the regular expression object, and must not be
182 * altered or deleted by the application. The returned string
183 * will remain valid until the regular expression is closed.
184 * @draft ICU 3.0
186 U_DRAFT const UChar * U_EXPORT2
187 uregex_pattern(const URegularExpression *regexp,
188 int32_t *patLength,
189 UErrorCode *status);
193 * Get the match mode flags that were specified when compiling this regular expression.
194 * @param status Receives errors detected by this function.
195 * @param regexp The compiled regular expression.
196 * @return The match mode flags
197 * @see URegexpFlag
198 * @draft ICU 3.0
200 U_DRAFT int32_t U_EXPORT2
201 uregex_flags(const URegularExpression *regexp,
202 UErrorCode *status);
206 * Set the subject text string upon which the regular expression will look for matches.
207 * This function may be called any number of times, allowing the regular
208 * expression pattern to be applied to different strings.
209 * <p>
210 * Regular expression matching operations work directly on the application's
211 * string data. No copy is made. The subject string data must not be
212 * altered after calling this function until after all regular expression
213 * operations involving this string data are completed.
214 * <p>
215 * Zero length strings are permitted. In this case, no subsequent match
216 * operation will dereference the text string pointer.
218 * @param regexp The compiled regular expression.
219 * @param text The subject text string.
220 * @param textLength The length of the subject text, or -1 if the string
221 * is NUL terminated.
222 * @param status Receives errors detected by this function.
223 * @draft ICU 3.0
225 U_DRAFT void U_EXPORT2
226 uregex_setText(URegularExpression *regexp,
227 const UChar *text,
228 int32_t textLength,
229 UErrorCode *status);
232 * Get the subject text that is currently associated with this
233 * regular expression object. This simply returns whatever string
234 * pointer was previously supplied via uregex_setText().
236 * @param regexp The compiled regular expression.
237 * @param textLength The length of the string is returned in this output parameter.
238 * A NULL pointer may be used here if the
239 * text length is not needed, as would be the case if
240 * the text is known in advance to be a NUL terminated
241 * string.
242 * @param status Receives errors detected by this function.
243 * @return Pointer to the subject text string currently associated with
244 * this regular expression.
245 * @draft ICU 3.0
247 U_DRAFT const UChar * U_EXPORT2
248 uregex_getText(URegularExpression *regexp,
249 int32_t *textLength,
250 UErrorCode *status);
253 * Attempts to match the input string, beginning at startIndex, against the pattern.
254 * To succeed, the match must extend to the end of the input string.
256 * @param regexp The compiled regular expression.
257 * @param startIndex The input string index at which to begin matching.
258 * @param status Receives errors detected by this function.
259 * @return TRUE if there is a match
260 * @draft ICU 3.0
262 U_DRAFT UBool U_EXPORT2
263 uregex_matches(URegularExpression *regexp,
264 int32_t startIndex,
265 UErrorCode *status);
268 * Attempts to match the input string, starting from the specified index, against the pattern.
269 * The match may be of any length, and is not required to extend to the end
270 * of the input string. Contrast with uregex_matches().
272 * <p>If the match succeeds then more information can be obtained via the
273 * <code>uregex_start()</code>, <code>uregex_end()</code>,
274 * and <code>uregex_group()</code> functions.</p>
276 * @param regexp The compiled regular expression.
277 * @param startIndex The input string index at which to begin matching.
278 * @param status A reference to a UErrorCode to receive any errors.
279 * @return TRUE if there is a match.
280 * @draft ICU 3.0
282 U_DRAFT UBool U_EXPORT2
283 uregex_lookingAt(URegularExpression *regexp,
284 int32_t startIndex,
285 UErrorCode *status);
288 * Find the first matching substring of the input string that matches the pattern.
289 * The search for a match begins at the specified index.
290 * If a match is found, <code>uregex_start(), uregex_end()</code>, and
291 * <code>uregex_group()</code> will provide more information regarding the match.
293 * @param regexp The compiled regular expression.
294 * @param startIndex The position in the input string to begin the search
295 * @param status A reference to a UErrorCode to receive any errors.
296 * @return TRUE if a match is found.
297 * @draft ICU 3.0
299 U_DRAFT UBool U_EXPORT2
300 uregex_find(URegularExpression *regexp,
301 int32_t startIndex,
302 UErrorCode *status);
305 * Find the next pattern match in the input string.
306 * Begin searching the input at the location following the end of
307 * the previous match, or at the start of the string if there is no previous match.
308 * If a match is found, <code>uregex_start(), uregex_end()</code>, and
309 * <code>uregex_group()</code> will provide more information regarding the match.
311 * @param regexp The compiled regular expression.
312 * @param status A reference to a UErrorCode to receive any errors.
313 * @return TRUE if a match is found.
314 * @see uregex_reset
315 * @draft ICU 3.0
317 U_DRAFT UBool U_EXPORT2
318 uregex_findNext(URegularExpression *regexp,
319 UErrorCode *status);
322 * Get the number of capturing groups in this regular expression's pattern.
323 * @param regexp The compiled regular expression.
324 * @param status A reference to a UErrorCode to receive any errors.
325 * @return the number of capture groups
326 * @draft ICU 3.0
328 U_DRAFT int32_t U_EXPORT2
329 uregex_groupCount(URegularExpression *regexp,
330 UErrorCode *status);
332 /** Extract the string for the specified matching expression or subexpression.
333 * Group #0 is the complete string of matched text.
334 * Group #1 is the text matched by the first set of capturing parentheses.
336 * @param regexp The compiled regular expression.
337 * @param groupNum The capture group to extract. Group 0 is the complete
338 * match. The value of this parameter must be
339 * less than or equal to the number of capture groups in
340 * the pattern.
341 * @param dest Buffer to receive the matching string data
342 * @param destCapacity Capacity of the dest buffer.
343 * @param status A reference to a UErrorCode to receive any errors.
344 * @return Length of matching data,
345 * or -1 if no applicable match.
346 * @draft ICU 3.0
348 U_DRAFT int32_t U_EXPORT2
349 uregex_group(URegularExpression *regexp,
350 int32_t groupNum,
351 UChar *dest,
352 int32_t destCapacity,
353 UErrorCode *status);
357 * Returns the index in the input string of the start of the text matched by the
358 * specified capture group during the previous match operation. Return -1 if
359 * the capture group was not part of the last match.
360 * Group #0 refers to the complete range of matched text.
361 * Group #1 refers to the text matched by the first set of capturing parentheses.
363 * @param regexp The compiled regular expression.
364 * @param groupNum The capture group number
365 * @param status A reference to a UErrorCode to receive any errors.
366 * @return the starting position in the input of the text matched
367 * by the specified group.
368 * @draft ICU 3.0
370 U_DRAFT int32_t U_EXPORT2
371 uregex_start(URegularExpression *regexp,
372 int32_t groupNum,
373 UErrorCode *status);
376 * Returns the index in the input string of the position following the end
377 * of the text matched by the specified capture group.
378 * Return -1 if the capture group was not part of the last match.
379 * Group #0 refers to the complete range of matched text.
380 * Group #1 refers to the text matched by the first set of capturing parentheses.
382 * @param regexp The compiled regular expression.
383 * @param groupNum The capture group number
384 * @param status A reference to a UErrorCode to receive any errors.
385 * @return the index of the position following the last matched character.
386 * @draft ICU 3.0
388 U_DRAFT int32_t U_EXPORT2
389 uregex_end(URegularExpression *regexp,
390 int32_t groupNum,
391 UErrorCode *status);
394 * Reset any saved state from the previous match. Has the effect of
395 * causing uregex_findNext to begin at the specified index, and causing
396 * uregex_start(), uregex_end() and uregex_group() to return an error
397 * indicating that there is no match information available.
399 * @param regexp The compiled regular expression.
400 * @param index The position in the text at which a
401 * uregex_findNext() should begin searching.
402 * @param status A reference to a UErrorCode to receive any errors.
403 * @draft ICU 3.0
405 U_DRAFT void U_EXPORT2
406 uregex_reset(URegularExpression *regexp,
407 int32_t index,
408 UErrorCode *status);
411 * Replaces every substring of the input that matches the pattern
412 * with the given replacement string. This is a convenience function that
413 * provides a complete find-and-replace-all operation.
415 * This method scans the input string looking for matches of the pattern.
416 * Input that is not part of any match is copied unchanged to the
417 * destination buffer. Matched regions are replaced in the output
418 * buffer by the replacement string. The replacement string may contain
419 * references to capture groups; these take the form of $1, $2, etc.
421 * @param regexp The compiled regular expression.
422 * @param replacementText A string containing the replacement text.
423 * @param replacementLength The length of the replacement string, or
424 * -1 if it is NUL terminated.
425 * @param destBuf A (UChar *) buffer that will receive the result.
426 * @param destCapacity The capacity of the destination buffer.
427 * @param status A reference to a UErrorCode to receive any errors.
428 * @return The length of the string resulting from the find
429 * and replace operation. In the event that the
430 * destination capacity is inadequate, the return value
431 * is still the full length of the untruncated string.
432 * @draft ICU 3.0
434 U_DRAFT int32_t U_EXPORT2
435 uregex_replaceAll(URegularExpression *regexp,
436 UChar *replacementText,
437 int32_t replacementLength,
438 UChar *destBuf,
439 int32_t destCapacity,
440 UErrorCode *status);
444 * Replaces the first substring of the input that matches the pattern
445 * with the given replacement string. This is a convenience function that
446 * provides a complete find-and-replace operation.
448 * This method scans the input string looking for a match of the pattern.
449 * All input that is not part of the match is copied unchanged to the
450 * destination buffer. The matched region is replaced in the output
451 * buffer by the replacement string. The replacement string may contain
452 * references to capture groups; these take the form of $1, $2, etc.
454 * @param regexp The compiled regular expression.
455 * @param replacementText A string containing the replacement text.
456 * @param replacementLength The length of the replacement string, or
457 * -1 if it is NUL terminated.
458 * @param destBuf A (UChar *) buffer that will receive the result.
459 * @param destCapacity The capacity of the destination buffer.
460 * @param status a reference to a UErrorCode to receive any errors.
461 * @return The length of the string resulting from the find
462 * and replace operation. In the event that the
463 * destination capacity is inadequate, the return value
464 * is still the full length of the untruncated string.
465 * @draft ICU 3.0
467 U_DRAFT int32_t U_EXPORT2
468 uregex_replaceFirst(URegularExpression *regexp,
469 UChar *replacementText,
470 int32_t replacementLength,
471 UChar *destBuf,
472 int32_t destCapacity,
473 UErrorCode *status);
477 * Implements a replace operation intended to be used as part of an
478 * incremental find-and-replace.
480 * <p>The input string, starting from the end of the previous match and ending at
481 * the start of the current match, is appended to the destination string. Then the
482 * replacement string is appended to the output string,
483 * including handling any substitutions of captured text.</p>
485 * <p>A note on preflight computation of buffersize and error handling:
486 * Calls to uregex_appendReplacement() and uregex_appendTail() are
487 * designed to be chained, one after another, with the destination
488 * buffer pointer and buffer capacity updated after each in preparation
489 * for the next. If the destination buffer is exhausted partway through such a
490 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal
491 * ICU conventions are for a function to perform no action if it is
492 * called with an error status, but for this one case, uregex_appendReplacement()
493 * will operate normally so that buffer size computations will complete
494 * correctly.
496 * <p>For simple, prepackaged, non-incremental find-and-replace
497 * operations, see uregex_replaceFirst() or uregex_replaceAll().</p>
499 * @param regexp The regular expression object.
500 * @param replacementText The string that will replace the matched portion of the
501 * input string as it is copied to the destination buffer.
502 * The replacement text may contain references ($1, for
503 * example) to capture groups from the match.
504 * @param replacementLength The length of the replacement text string,
505 * or -1 if the string is NUL terminated.
506 * @param destBuf The buffer into which the results of the
507 * find-and-replace are placed. On return, this pointer
508 * will be updated to refer to the beginning of the
509 * unused portion of buffer, leaving it in position for
510 * a subsequent call to this function.
511 * @param destCapacity The size of the output buffer. On return, this
512 * parameter will be updated to reflect the space remaining
513 * unused in the output buffer.
514 * @param status A reference to a UErrorCode to receive any errors.
515 * @return The length of the result string. In the event that
516 * destCapacity is inadequate, the full length of the
517 * untruncated output string is returned.
519 * @draft ICU 3.0
522 U_DRAFT int32_t U_EXPORT2
523 uregex_appendReplacement(URegularExpression *regexp,
524 UChar *replacementText,
525 int32_t replacementLength,
526 UChar **destBuf,
527 int32_t *destCapacity,
528 UErrorCode *status);
532 * As the final step in a find-and-replace operation, append the remainder
533 * of the input string, starting at the position following the last match,
534 * to the destination string. <code>uregex_appendTail()</code> is intended
535 * to be invoked after one or more invocations of the
536 * <code>uregex_appendReplacement()</code> function.
538 * @param regexp The regular expression object. This is needed to
539 * obtain the input string along with the position
540 * of the last match within it.
541 * @param destBuf The buffer in which the results of the
542 * find-and-replace are placed. On return, the pointer
543 * will be updated to refer to the beginning of the
544 * unused portion of buffer.
545 * @param destCapacity The size of the output buffer. On return, this
546 * value will be updated to reflect the space remaining
547 * unused in the output buffer.
548 * @param status A reference to a UErrorCode to receive any errors.
549 * @return The length of the result string. In the event that
550 * destCapacity is inadequate, the full length of the
551 * untruncated output string is returned.
553 * @draft ICU 3.0
555 U_DRAFT int32_t U_EXPORT2
556 uregex_appendTail(URegularExpression *regexp,
557 UChar **destBuf,
558 int32_t *destCapacity,
559 UErrorCode *status);
565 * Split a string into fields. Somewhat like split() from Perl.
566 * The pattern matches identify delimiters that separate the input
567 * into fields. The input data between the matches becomes the
568 * fields themselves.
569 * <p>
570 * Each of the fields is copied from the input string to the destination
571 * buffer, and then NUL terminated. The position of each field within
572 * the destination buffer is returned in the destFields array.
574 * @param regexp The compiled regular expression.
575 * @param destBuf A (UChar *) buffer to receive the fields that
576 * are extracted from the input string. These
577 * field pointers will refer to positions within the
578 * destination buffer supplied by the caller. Any
579 * extra positions within the destFields array will be
580 * set to NULL.
581 * @param destCapacity The capacity of the destBuf.
582 * @param requiredCapacity The actual capacity required of the destBuf.
583 * If destCapacity is too small, requiredCapacity will return
584 * the total capacity required to hold all of the output, and
585 * a U_BUFFER_OVERFLOW_ERROR will be returned.
586 * @param destFields An array to be filled with the position of each
587 * of the extracted fields within destBuf.
588 * @param destFieldsCapacity The number of elements in the destFields array.
589 * If the number of fields found is less than destFieldsCapacity,
590 * the extra destFields elements are set to zero.
591 * If destFieldsCapacity is too small, the trailing part of the
592 * input, including any field delimiters, is treated as if it
593 * were the last field - it is copied to the destBuf, and
594 * its position in the destBuf is stored in the last element
595 * of destFields. This behavior mimics that of Perl. It is not
596 * an error condition, and no error status is returned when all destField
597 * positions are used.
598 * @param status A reference to a UErrorCode to receive any errors.
599 * @return The number of fields into which the input string was split.
600 * @draft ICU 3.0
602 * Note: another choice for the design of this function would be to not
603 * copy the resulting fields at all, but to return indexes and
604 * lengths within the source text.
605 * Advantages would be
606 * o Faster. No Copying.
607 * o Nothing extra needed when field data may contain embedded NUL chars.
608 * o Less memory needed if working on large data.
609 * Disadvantages
610 * o Less consistent with C++ split, which copies into an
611 * array of UnicodeStrings.
612 * o No NUL termination, extracted fields would be less convenient
613 * to use in most cases.
614 * o Possible problems in the future, when support for Unicode Normalization
615 * could cause the fields to not correspond exactly to
616 * a range of the source text.
618 U_DRAFT int32_t U_EXPORT2
619 uregex_split( URegularExpression *regexp,
620 UChar *destBuf,
621 int32_t destCapacity,
622 int32_t *requiredCapacity,
623 UChar *destFields[],
624 int32_t destFieldsCapacity,
625 UErrorCode *status);
629 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
630 #endif /* UREGEX_H */