update dev300-m58
[ooovba.git] / regexp / source / reclass.hxx
blob510429cb26dbadd5a7868d114d752a9e8f37306b
1 /* Definitions for data structures and routines for the regular
2 expression library, version 0.12.
3 Copyright (C) 1985,89,90,91,92,93,95,96,97,98 Free Software Foundation, Inc.
5 This file is part of the GNU C Library. Its master source is NOT part of
6 the C library, however. The master source lives in /gd/gnu/lib.
8 The GNU C Library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Library General Public License as
10 published by the Free Software Foundation; either version 2 of the
11 License, or (at your option) any later version.
13 The GNU C Library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Library General Public License for more details.
18 You should have received a copy of the GNU Library General Public
19 License along with the GNU C Library; see the file COPYING.LIB. If not,
20 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
/* Modified for OpenOffice.org to use sal_Unicode and the Transliteration service.  */
27 #ifndef INCLUDED_REGEXP_RECLASS_HXX
28 #define INCLUDED_REGEXP_RECLASS_HXX
30 #ifndef INCLUDED_I18NUTIL_UNICODE_HXX
31 #include <i18nutil/unicode.hxx>
32 #endif
33 #ifndef _COM_SUN_STAR_UTIL_SEARCHFLAGS_HPP_
34 #include <com/sun/star/util/SearchFlags.hpp>
35 #endif
36 #ifndef _COM_SUN_STAR_UTIL_SEARCHOPTIONS_HPP_
37 #include <com/sun/star/util/SearchOptions.hpp>
38 #endif
39 #ifndef _SAL_TYPES_H_
40 #include <sal/types.h>
41 #endif
42 #ifndef _COM_SUN_STAR_I18N_XEXTENDEDTRANSLITERATION_HPP_
43 #include <com/sun/star/i18n/XExtendedTransliteration.hpp>
44 #endif
/* Status and error codes used by the regex routines.

   If any error codes are removed, changed, or added, update the
   `re_error_msg' table in regex.c.  */
typedef enum
{
#ifdef _XOPEN_SOURCE
    REG_ENOSYS = -1,    /* This will never happen for this implementation.  */
#endif

    REG_NOERROR = 0,    /* Success.  */
    REG_NOMATCH,        /* Didn't find a match (for regexec).  */

    /* POSIX regcomp return error codes.  (In the order listed in the
       standard.)  */
    REG_BADPAT,         /* Invalid pattern.  */
    REG_ECOLLATE,       /* Not implemented.  */
    REG_ECTYPE,         /* Invalid character class name.  */
    REG_EESCAPE,        /* Trailing backslash.  */
    REG_ESUBREG,        /* Invalid back reference.  */
    REG_EBRACK,         /* Unmatched left bracket.  */
    REG_EPAREN,         /* Parenthesis imbalance.  */
    REG_EBRACE,         /* Unmatched \{.  */
    REG_BADBR,          /* Invalid contents of \{\}.  */
    REG_ERANGE,         /* Invalid range end.  */
    REG_ESPACE,         /* Ran out of memory.  */
    REG_BADRPT,         /* No preceding re for repetition op.  */

    /* Error codes we've added.  */
    REG_EEND,           /* Premature end.  */
    REG_ESIZE,          /* Compiled pattern bigger than 2^16 bytes.  */
    REG_ERPAREN         /* Unmatched ) or \); not returned from regcomp.  */
} reg_errcode_t;
79 /* This data structure represents a compiled pattern. Before calling
80 the pattern compiler, the fields `buffer', `allocated', `fastmap',
81 can be set. After the pattern has been
82 compiled, the `re_nsub' field is available. All other fields are
83 private to the regex routines. */
85 struct re_pattern_buffer
87 /* [[[begin pattern_buffer]]] */
88 /* Space that holds the compiled pattern. It is declared as
89 `unsigned char *' because its elements are
90 sometimes used as array indexes. */
91 sal_Unicode *buffer;
93 /* Number of bytes to which `buffer' points. */
94 sal_uInt32 allocated;
96 /* Number of bytes actually used in `buffer'. */
97 sal_uInt32 used;
99 /* Pointer to a fastmap, if any, otherwise zero. re_search uses
100 the fastmap, if there is one, to skip over impossible
101 starting points for matches. */
102 sal_Unicode *fastmap;
105 /* Number of subexpressions found by the compiler. */
106 size_t re_nsub;
108 /* Zero if this pattern cannot match the empty string, one else.
109 Well, in truth it's used only in `re_search2', to see
110 whether or not we should use the fastmap, so we don't set
111 this absolutely perfectly; see `re_compile_fastmap' (the
112 `duplicate' case). */
113 unsigned can_be_null : 1;
115 /* Set to zero when `regex_compile' compiles a pattern; set to one
116 by `re_compile_fastmap' if it updates the fastmap. */
117 unsigned fastmap_accurate : 1;
119 /* If set, a beginning-of-line anchor doesn't match at the
120 beginning of the string. */
121 unsigned not_bol : 1;
123 /* Similarly for an end-of-line anchor. */
124 unsigned not_eol : 1;
126 /* If true, an anchor at a newline matches. */
127 unsigned newline_anchor : 1;
129 /* [[[end pattern_buffer]]] */
/* These are the command codes that appear in compiled regular
   expressions.  Some opcodes are followed by argument bytes.  A
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.

   NOTE(review): "byte" is the wording of the GNU original; in this
   port the compiled pattern lives in a sal_Unicode buffer, so a "byte"
   presumably means one buffer element -- confirm against the
   implementation.  */
typedef enum
{
    no_op = 0,

    /* Succeed right away--no more backtracking.  */
    succeed,

    /* Followed by one byte giving n, then by n literal bytes.  */
    exactn,

    /* Matches any (more or less) character.  */
    anychar,

    /* Matches any one char belonging to specified set.  First
       following byte is number of bitmap bytes.  Then come bytes
       for a bitmap saying which chars are in.  Bits in each byte
       are ordered low-bit-first.  A character is in the set if its
       bit is 1.  A character too large to have a bit in the map is
       automatically not in the set.  */
    charset,

    /* Same parameters as charset, but match any character that is
       not one of those specified.  */
    charset_not,

    /* Start remembering the text that is matched, for storing in a
       register.  Followed by one byte with the register number, in
       the range 0 to one less than the pattern buffer's re_nsub
       field.  Then followed by one byte with the number of groups
       inner to this one.  (This last has to be part of the
       start_memory only because we need it in the on_failure_jump
       of re_match2.)  */
    start_memory,

    /* Stop remembering the text that is matched and store it in a
       memory register.  Followed by one byte with the register
       number, in the range 0 to one less than `re_nsub' in the
       pattern buffer, and one byte with the number of inner groups,
       just like `start_memory'.  (We need the number of inner
       groups here because we don't have any easy way of finding the
       corresponding start_memory when we're at a stop_memory.)  */
    stop_memory,

    /* Match a duplicate of something remembered.  Followed by one
       byte containing the register number.  */
    duplicate,

    /* Fail unless at beginning of line.  */
    begline,

    /* Fail unless at end of line.  */
    endline,

    /* Succeeds if at beginning of buffer (if emacs) or at beginning
       of string to be matched (if not).  */
    begbuf,

    /* Analogously, for end of buffer/string.  */
    endbuf,

    /* Followed by two-byte relative address to which to jump.  */
    jump,

    /* Same as jump, but marks the end of an alternative.  */
    jump_past_alt,

    /* Followed by two-byte relative address of place to resume at
       in case of failure.  */
    on_failure_jump,

    /* Like on_failure_jump, but pushes a placeholder instead of the
       current string position when executed.  */
    on_failure_keep_string_jump,

    /* Throw away latest failure point and then jump to following
       two-byte relative address.  */
    pop_failure_jump,

    /* Change to pop_failure_jump if know won't have to backtrack to
       match; otherwise change to jump.  This is used to jump
       back to the beginning of a repeat.  If what follows this jump
       clearly won't match what the repeat does, such that we can be
       sure that there is no use backtracking out of repetitions
       already matched, then we change it to a pop_failure_jump.
       Followed by two-byte address.  */
    maybe_pop_jump,

    /* Jump to following two-byte address, and push a dummy failure
       point.  This failure point will be thrown away if an attempt
       is made to use it for a failure.  A `+' construct makes this
       before the first repeat.  Also used as an intermediary kind
       of jump when compiling an alternative.  */
    dummy_failure_jump,

    /* Push a dummy failure point and continue.  Used at the end of
       alternatives.  */
    push_dummy_failure,

    /* Followed by two-byte relative address and two-byte number n.
       After matching N times, jump to the address upon failure.  */
    succeed_n,

    /* Followed by two-byte relative address, and two-byte number n.
       Jump to the address N times, then fail.  */
    jump_n,

    /* Set the following two-byte relative address to the
       subsequent two-byte number.  The address *includes* the two
       bytes of number.  */
    set_number_at,

    wordbeg,    /* Succeeds if at word beginning.  */
    wordend     /* Succeeds if at word end.  */

} re_opcode_t;
/* Short alias for the compiled-pattern structure.  */
typedef struct re_pattern_buffer regex_t;
254 /* Type for byte offsets within the string. POSIX mandates this. */
255 typedef sal_Int32 regoff_t;
257 /* This is the structure we store register match data in. See
258 regex.texinfo for a full description of what registers match. */
259 struct re_registers
261 sal_uInt32 num_regs;
262 sal_Int32 *start;
263 sal_Int32 *end;
264 sal_Int32 num_of_match;
267 typedef struct {
268 sal_Int32 begalt_offset;
269 sal_Int32 fixup_alt_jump;
270 sal_Int32 inner_group_offset;
271 sal_Int32 laststart_offset;
272 sal_uInt32 regnum;
273 } compile_stack_elt_t;
275 typedef struct {
276 compile_stack_elt_t *stack;
277 sal_uInt32 size;
278 sal_uInt32 avail;
279 } compile_stack_type;
281 union fail_stack_elt
283 sal_Unicode *pointer;
284 sal_Int32 integer;
/* Short alias for a failure-stack slot.  */
typedef union fail_stack_elt fail_stack_elt_t;
289 typedef struct
291 fail_stack_elt_t *stack;
292 sal_uInt32 size;
293 sal_uInt32 avail; /* Offset of next open position. */
294 } fail_stack_type;
296 typedef union
298 fail_stack_elt_t word;
299 struct
301 /* This field is one if this group can match the empty string,
302 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
303 #define MATCH_NULL_UNSET_VALUE 3
304 unsigned match_null_string_p : 2;
305 unsigned is_active : 1;
306 unsigned matched_something : 1;
307 unsigned ever_matched_something : 1;
308 } bits;
309 } register_info_type;
312 class Regexpr
314 ::com::sun::star::uno::Reference<
315 ::com::sun::star::i18n::XExtendedTransliteration > translit;
317 const sal_Unicode *line; // line to search in
318 sal_Int32 linelen; // length of search string
320 sal_Unicode *pattern; // RE pattern to match
321 sal_Int32 patsize; // Length of pattern
323 struct re_pattern_buffer *bufp;
325 sal_Bool isIgnoreCase;
327 /* Either a translate table to apply to all characters before
328 comparing them, or zero for no translation. The translation
329 is applied to a pattern when it is compiled and to a string
330 when it is matched. */
331 int translate;
333 sal_uInt32 failure_id;
334 sal_uInt32 nfailure_points_pushed;
335 sal_uInt32 nfailure_points_popped;
336 /* Counts the total number of registers pushed. */
337 sal_uInt32 num_regs_pushed;
339 sal_uInt32 re_max_failures;
341 /* Registers are set to a sentinel when they haven't yet matched. */
342 sal_Unicode reg_unset_dummy;
344 // private instance functions
345 inline void store_number( sal_Unicode * destination, sal_Int32 number );
346 inline void store_number_and_incr( sal_Unicode *& destination, sal_Int32 number );
347 inline void extract_number(sal_Int32 & dest, sal_Unicode *source);
348 inline void extract_number_and_incr(sal_Int32 & destination, sal_Unicode *& source);
350 sal_Bool group_match_null_string_p(sal_Unicode **p, sal_Unicode *end,
351 register_info_type *reg_info);
352 sal_Bool alt_match_null_string_p(sal_Unicode *p, sal_Unicode *end,
353 register_info_type *reg_info);
355 sal_Bool common_op_match_null_string_p(sal_Unicode **p, sal_Unicode *end,
356 register_info_type *reg_info);
357 sal_Int32 bcmp_translate(const sal_Unicode *s1,
358 const sal_Unicode *s2, sal_Int32 len);
360 sal_Int32 regcomp(void);
361 sal_Int32 regex_compile(void);
362 inline void store_op1(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg);
363 inline void store_op2(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg1, sal_Int32 arg2);
364 void insert_op1(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg,
365 sal_Unicode *end);
366 void insert_op2(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg1,
367 sal_Int32 arg2, sal_Unicode *end);
368 sal_Bool at_begline_loc_p(const sal_Unicode *local_pattern,
369 const sal_Unicode *p);
370 sal_Bool at_endline_loc_p(const sal_Unicode *p, const sal_Unicode *pend);
371 reg_errcode_t compile_range(sal_Unicode range_begin, sal_Unicode range_end, sal_Unicode *b);
372 sal_Bool group_in_compile_stack(compile_stack_type compile_stack,
373 sal_uInt32 regnum);
374 sal_Int32 re_match2(struct re_registers *regs, sal_Int32 pos, sal_Int32 range);
376 sal_Bool iswordbegin(const sal_Unicode *d, sal_Unicode *string, sal_Int32 ssize);
377 sal_Bool iswordend(const sal_Unicode *d, sal_Unicode *string, sal_Int32 ssize);
378 void set_list_bit(sal_Unicode c, sal_Unicode *b);
380 public:
381 // constructors
382 Regexpr( const ::com::sun::star::util::SearchOptions & rOptions,
383 ::com::sun::star::uno::Reference<
384 ::com::sun::star::i18n::XExtendedTransliteration > XTrans );
386 // destructor
387 ~Regexpr();
389 void set_line( const sal_Unicode *line, sal_Int32 len );
391 // function returning pointers to occurrences in regs
392 sal_Int32 re_search(struct re_registers *regs, sal_Int32 pOffset); // find pattern in line
395 #endif