2 # $Id: README,v 1.1.1.1 2000/01/25 21:41:10 lukem Exp $
4 # Copyright 1997, 1998, 1999 Computing Research Labs,
5 # New Mexico State University
7 # Permission is hereby granted, free of charge, to any person obtaining a
8 # copy of this software and associated documentation files (the "Software"),
9 # to deal in the Software without restriction, including without limitation
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 # and/or sell copies of the Software, and to permit persons to whom the
12 # Software is furnished to do so, subject to the following conditions:
14 # The above copyright notice and this permission notice shall be included in
15 # all copies or substantial portions of the Software.
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 # THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
21 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
22 # OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
23 # THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 Unicode and Regular Expressions
30 This is a simple regular expression package for matching against Unicode text
31 in UCS2 form. The implementation of this URE package is a variation on the
32 RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark
33 Hopkins' algorithm had the virtue of being very simple, so it was used as a
36 ---------------------------------------------------------------------------
40 o Regular expression and text already normalized.
42 o Conversion to lower case assumes a 1-1 mapping.
46 Separator - any one of U+2028, U+2029, '\n', '\r'.
49 . - match any character.
50 * - match zero or more of the last subexpression.
51 + - match one or more of the last subexpression.
52 ? - match zero or one of the last subexpression.
53 () - subexpression grouping.
57 o The "." operator normally does not match separators, but a flag is
58 available for the ure_exec() function that will allow this operator to
61 Literals and Constants:
63 c - literal UCS2 character.
64 \x.... - hexadecimal number of up to 4 digits.
65 \X.... - hexadecimal number of up to 4 digits.
66 \u.... - hexadecimal number of up to 4 digits.
67 \U.... - hexadecimal number of up to 4 digits.
71 [...] - Character class.
72 [^...] - Negated character class.
73 \pN1,N2,...,Nn - Character properties class.
74 \PN1,N2,...,Nn - Negated character properties class.
76 POSIX character classes recognized:
92 o Character property classes are \p or \P followed by a comma separated
93 list of integers between 1 and 32. These integers are references to
94 the following character properties:
97 --------------------------
131 o Character classes can contain literals, constants, and character
132 property classes. Example:
136 ---------------------------------------------------------------------------
140 Before URE is used, two functions need to be created: one to check if a
141 character matches a set of URE character properties, and one to convert a
142 character to lower case.
144 Stubs for these functions are located in the urestubs.c file.
149 Sample pseudo-code fragment.
154 unsigned long relen, textlen;
155 unsigned long match_start, match_end;
158 * Allocate the dynamic storage needed to compile regular expressions.
160 rebuf = ure_buffer_create();
162 for each regular expression in a list {
163 re = next regular expression;
167 * Compile the regular expression with the case insensitive flag
170 dfa = ure_compile(re, relen, 1, rebuf);
173 * Look for the first match in some text. The matching will be done
174 * in a case insensitive manner because the expression was compiled
175 * with the case insensitive flag on.
177 if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
178 printf("MATCH: %lu %lu\n", match_start, match_end);
181 * Look for the first match in some text, ignoring non-spacing
184 if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
185 &match_start, &match_end))
186 printf("MATCH: %lu %lu\n", match_start, match_end);
195 * Free the dynamic storage used for compiling the expressions.
197 ure_buffer_free(rebuf);
199 ---------------------------------------------------------------------------
201 Mark Leisher <mleisher@crl.nmsu.edu>
204 ===========================================================================
210 Date : 21 September 1999
211 ==========================
212 1. Added copyright stuff and put in CVS.