1 /* Definitions for data structures and routines for the regular
2 expression library, version 0.12.
3 Copyright (C) 1985,89,90,91,92,93,95,96,97,98 Free Software Foundation, Inc.
5 This file is part of the GNU C Library. Its master source is NOT part of
6 the C library, however. The master source lives in /gd/gnu/lib.
8 The GNU C Library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Library General Public License as
10 published by the Free Software Foundation; either version 2 of the
11 License, or (at your option) any later version.
13 The GNU C Library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Library General Public License for more details.
18 You should have received a copy of the GNU Library General Public
19 License along with the GNU C Library; see the file COPYING.LIB. If not,
20 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
24 /* Modified for OpenOffice.org to use sal_Unicode and the Transliteration service. */
27 #ifndef INCLUDED_REGEXP_RECLASS_HXX
28 #define INCLUDED_REGEXP_RECLASS_HXX
30 #ifndef INCLUDED_I18NUTIL_UNICODE_HXX
31 #include <i18nutil/unicode.hxx>
33 #ifndef _COM_SUN_STAR_UTIL_SEARCHFLAGS_HPP_
34 #include <com/sun/star/util/SearchFlags.hpp>
36 #ifndef _COM_SUN_STAR_UTIL_SEARCHOPTIONS_HPP_
37 #include <com/sun/star/util/SearchOptions.hpp>
40 #include <sal/types.h>
42 #ifndef _COM_SUN_STAR_I18N_XEXTENDEDTRANSLITERATION_HPP_
43 #include <com/sun/star/i18n/XExtendedTransliteration.hpp>
46 /* If any error codes are removed, changed, or added, update the
47 `re_error_msg' table in regex.c. */
51 REG_ENOSYS
= -1, /* This will never happen for this implementation. */
54 REG_NOERROR
= 0, /* Success. */
55 REG_NOMATCH
, /* Didn't find a match (for regexec). */
57 /* POSIX regcomp return error codes. (In the order listed in the standard.) */
59 REG_BADPAT
, /* Invalid pattern. */
60 REG_ECOLLATE
, /* Not implemented. */
61 REG_ECTYPE
, /* Invalid character class name. */
62 REG_EESCAPE
, /* Trailing backslash. */
63 REG_ESUBREG
, /* Invalid back reference. */
64 REG_EBRACK
, /* Unmatched left bracket. */
65 REG_EPAREN
, /* Parenthesis imbalance. */
66 REG_EBRACE
, /* Unmatched \{. */
67 REG_BADBR
, /* Invalid contents of \{\}. */
68 REG_ERANGE
, /* Invalid range end. */
69 REG_ESPACE
, /* Ran out of memory. */
70 REG_BADRPT
, /* No preceding re for repetition op. */
72 /* Error codes we've added. */
73 REG_EEND
, /* Premature end. */
74 REG_ESIZE
, /* Compiled pattern bigger than 2^16 bytes. */
75 REG_ERPAREN
/* Unmatched ) or \); not returned from regcomp. */
79 /* This data structure represents a compiled pattern. Before calling
80 the pattern compiler, the fields `buffer', `allocated', and `fastmap'
81 can be set. After the pattern has been
82 compiled, the `re_nsub' field is available. All other fields are
83 private to the regex routines. */
85 struct re_pattern_buffer
87 /* [[[begin pattern_buffer]]] */
88 /* Space that holds the compiled pattern. It is declared as
89 `unsigned char *' because its elements are
90 sometimes used as array indexes. */
93 /* Number of bytes to which `buffer' points. */
96 /* Number of bytes actually used in `buffer'. */
99 /* Pointer to a fastmap, if any, otherwise zero. re_search uses
100 the fastmap, if there is one, to skip over impossible
101 starting points for matches. */
102 sal_Unicode
*fastmap
;
105 /* Number of subexpressions found by the compiler. */
108 /* Zero if this pattern cannot match the empty string, one otherwise.
109 Well, in truth it's used only in `re_search2', to see
110 whether or not we should use the fastmap, so we don't set
111 this absolutely perfectly; see `re_compile_fastmap' (the
112 `duplicate' case). */
113 unsigned can_be_null
: 1;
115 /* Set to zero when `regex_compile' compiles a pattern; set to one
116 by `re_compile_fastmap' if it updates the fastmap. */
117 unsigned fastmap_accurate
: 1;
119 /* If set, a beginning-of-line anchor doesn't match at the
120 beginning of the string. */
121 unsigned not_bol
: 1;
123 /* Similarly for an end-of-line anchor. */
124 unsigned not_eol
: 1;
126 /* If true, an anchor at a newline matches. */
127 unsigned newline_anchor
: 1;
129 /* [[[end pattern_buffer]]] */
132 /* These are the command codes that appear in compiled regular
133 expressions. Some opcodes are followed by argument bytes. A
134 command code can specify any interpretation whatsoever for its
135 arguments. Zero bytes may appear in the compiled regular expression. */
141 /* Succeed right away--no more backtracking. */
144 /* Followed by one byte giving n, then by n literal bytes. */
147 /* Matches any (more or less) character. */
150 /* Matches any one char belonging to specified set. First
151 following byte is number of bitmap bytes. Then come bytes
152 for a bitmap saying which chars are in. Bits in each byte
153 are ordered low-bit-first. A character is in the set if its
154 bit is 1. A character too large to have a bit in the map is
155 automatically not in the set. */
158 /* Same parameters as charset, but match any character that is
159 not one of those specified. */
162 /* Start remembering the text that is matched, for storing in a
163 register. Followed by one byte with the register number, in
164 the range 0 to one less than the pattern buffer's re_nsub
165 field. Then followed by one byte with the number of groups
166 inner to this one. (This last has to be part of the
167 start_memory only because we need it in the on_failure_jump of re_match2.) */
170 /* Stop remembering the text that is matched and store it in a
171 memory register. Followed by one byte with the register
172 number, in the range 0 to one less than `re_nsub' in the
173 pattern buffer, and one byte with the number of inner groups,
174 just like `start_memory'. (We need the number of inner
175 groups here because we don't have any easy way of finding the
176 corresponding start_memory when we're at a stop_memory.) */
179 /* Match a duplicate of something remembered. Followed by one
180 byte containing the register number. */
183 /* Fail unless at beginning of line. */
186 /* Fail unless at end of line. */
189 /* Succeeds if at beginning of buffer (if emacs) or at beginning
190 of string to be matched (if not). */
193 /* Analogously, for end of buffer/string. */
196 /* Followed by two-byte relative address to which to jump. */
199 /* Same as jump, but marks the end of an alternative. */
202 /* Followed by two-byte relative address of place to resume at
203 in case of failure. */
206 /* Like on_failure_jump, but pushes a placeholder instead of the
207 current string position when executed. */
208 on_failure_keep_string_jump
,
210 /* Throw away latest failure point and then jump to following
211 two-byte relative address. */
214 /* Change to pop_failure_jump if we know we won't have to backtrack to
215 match; otherwise change to jump. This is used to jump
216 back to the beginning of a repeat. If what follows this jump
217 clearly won't match what the repeat does, such that we can be
218 sure that there is no use backtracking out of repetitions
219 already matched, then we change it to a pop_failure_jump.
220 Followed by two-byte address. */
223 /* Jump to following two-byte address, and push a dummy failure
224 point. This failure point will be thrown away if an attempt
225 is made to use it for a failure. A `+' construct makes this
226 before the first repeat. Also used as an intermediary kind
227 of jump when compiling an alternative. */
230 /* Push a dummy failure point and continue. Used at the end of alternatives. */
234 /* Followed by two-byte relative address and two-byte number n.
235 After matching N times, jump to the address upon failure. */
238 /* Followed by two-byte relative address, and two-byte number n.
239 Jump to the address N times, then fail. */
242 /* Set the following two-byte relative address to the
243 subsequent two-byte number. The address *includes* the two bytes of number. */
247 wordbeg
, /* Succeeds if at word beginning. */
248 wordend
/* Succeeds if at word end. */
/* POSIX-style name for a compiled pattern: an alias for
   struct re_pattern_buffer. */
252 typedef struct re_pattern_buffer regex_t
;
254 /* Type for byte offsets within the string. POSIX mandates this. */
255 typedef sal_Int32 regoff_t
;
257 /* This is the structure we store register match data in. See
258 regex.texinfo for a full description of what registers match. */
264 sal_Int32 num_of_match
;
268 sal_Int32 begalt_offset
;
269 sal_Int32 fixup_alt_jump
;
270 sal_Int32 inner_group_offset
;
271 sal_Int32 laststart_offset
;
273 } compile_stack_elt_t
;
276 compile_stack_elt_t
*stack
;
279 } compile_stack_type
;
283 sal_Unicode
*pointer
;
287 typedef union fail_stack_elt fail_stack_elt_t
;
291 fail_stack_elt_t
*stack
;
293 sal_uInt32 avail
; /* Offset of next open position. */
298 fail_stack_elt_t word
;
301 /* This field is one if this group can match the empty string,
302 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
303 #define MATCH_NULL_UNSET_VALUE 3
304 unsigned match_null_string_p
: 2;
305 unsigned is_active
: 1;
306 unsigned matched_something
: 1;
307 unsigned ever_matched_something
: 1;
309 } register_info_type
;
314 ::com::sun::star::uno::Reference
<
315 ::com::sun::star::i18n::XExtendedTransliteration
> translit
;
317 const sal_Unicode
*line
; // line to search in
318 sal_Int32 linelen
; // length of search string
320 sal_Unicode
*pattern
; // RE pattern to match
321 sal_Int32 patsize
; // Length of pattern
323 struct re_pattern_buffer
*bufp
;
325 sal_Bool isIgnoreCase
;
327 /* Either a translate table to apply to all characters before
328 comparing them, or zero for no translation. The translation
329 is applied to a pattern when it is compiled and to a string
330 when it is matched. */
333 sal_uInt32 failure_id
;
334 sal_uInt32 nfailure_points_pushed
;
335 sal_uInt32 nfailure_points_popped
;
336 /* Counts the total number of registers pushed. */
337 sal_uInt32 num_regs_pushed
;
339 sal_uInt32 re_max_failures
;
341 /* Registers are set to a sentinel when they haven't yet matched. */
342 sal_Unicode reg_unset_dummy
;
344 // private instance functions
/* Declaration only; the body is not visible in this section.
   Takes a pointer into a sal_Unicode buffer and a 32-bit value.
   NOTE(review): presumably writes `number' at `destination' as an
   opcode operand -- confirm the encoding against the implementation. */
345 inline void store_number( sal_Unicode
* destination
, sal_Int32 number
);
/* Declaration only.  `destination' is a reference to the caller's
   pointer, so the callee is able to move that pointer.
   NOTE(review): presumably stores `number' and then advances
   `destination' past it -- confirm against the implementation. */
346 inline void store_number_and_incr( sal_Unicode
*& destination
, sal_Int32 number
);
/* Declaration only.  The result is delivered through the reference
   `dest'; `source' is passed by value, so the caller's pointer is not
   modified by the call.
   NOTE(review): presumably the inverse of store_number -- confirm. */
347 inline void extract_number(sal_Int32
& dest
, sal_Unicode
*source
);
/* Declaration only.  The result is delivered through the reference
   `destination'; `source' is a reference to the caller's pointer, so
   the callee is able to move that pointer.
   NOTE(review): presumably reads a number and advances `source' past
   it -- confirm against the implementation. */
348 inline void extract_number_and_incr(sal_Int32
& destination
, sal_Unicode
*& source
);
/* Declaration only.  Takes the address of a sal_Unicode pointer (`p'),
   an `end' pointer and a register_info_type pointer; returns sal_Bool.
   NOTE(review): by its name, presumably reports whether the group at
   *p can match the empty string, and may update *p -- confirm. */
350 sal_Bool
group_match_null_string_p(sal_Unicode
**p
, sal_Unicode
*end
,
351 register_info_type
*reg_info
);
/* Declaration only.  Unlike group_match_null_string_p, `p' is passed
   by value, so the caller's pointer is not modified.
   NOTE(review): by its name, presumably reports whether the
   alternative between `p' and `end' can match the empty string --
   confirm against the implementation. */
352 sal_Bool
alt_match_null_string_p(sal_Unicode
*p
, sal_Unicode
*end
,
353 register_info_type
*reg_info
);
/* Declaration only.  Takes the address of a sal_Unicode pointer (`p'),
   an `end' pointer and a register_info_type pointer; returns sal_Bool.
   NOTE(review): by its name, presumably the per-opcode helper shared
   by the other *_match_null_string_p routines -- confirm. */
355 sal_Bool
common_op_match_null_string_p(sal_Unicode
**p
, sal_Unicode
*end
,
356 register_info_type
*reg_info
);
/* Declaration only.  Takes two read-only sal_Unicode sequences and a
   length; returns sal_Int32.
   NOTE(review): presumably compares `len' units of `s1' and `s2'
   after translation; the meaning of the return value is not visible
   here -- confirm against the implementation. */
357 sal_Int32
bcmp_translate(const sal_Unicode
*s1
,
358 const sal_Unicode
*s2
, sal_Int32 len
);
/* Declaration only.  Takes no arguments, so it can only work on
   member state; returns sal_Int32.
   NOTE(review): presumably compiles the member `pattern' and returns
   a reg_errcode_t-style status -- confirm against the implementation. */
360 sal_Int32
regcomp(void);
/* Declaration only.  Takes no arguments, so it can only work on
   member state; returns sal_Int32.
   NOTE(review): presumably the worker that translates `pattern' into
   opcodes in `bufp' -- confirm against the implementation. */
361 sal_Int32
regex_compile(void);
/* Declaration only.  Takes an opcode, a location in a sal_Unicode
   buffer and one integer argument.
   NOTE(review): presumably writes `op' followed by `arg' at `loc' --
   confirm against the implementation. */
362 inline void store_op1(re_opcode_t op
, sal_Unicode
*loc
, sal_Int32 arg
);
/* Declaration only.  Same shape as store_op1 but with two integer
   arguments.
   NOTE(review): presumably writes `op' followed by `arg1' and `arg2'
   at `loc' -- confirm against the implementation. */
363 inline void store_op2(re_opcode_t op
, sal_Unicode
*loc
, sal_Int32 arg1
, sal_Int32 arg2
);
364 void insert_op1(re_opcode_t op
, sal_Unicode
*loc
, sal_Int32 arg
,
/* Declaration only.  Takes an opcode, a location, two integer
   arguments and an `end' pointer.
   NOTE(review): presumably makes room at `loc' by moving the data up
   to `end' and then stores `op' with both arguments -- confirm. */
366 void insert_op2(re_opcode_t op
, sal_Unicode
*loc
, sal_Int32 arg1
,
367 sal_Int32 arg2
, sal_Unicode
*end
);
/* Declaration only.  Both pointers are read-only; returns sal_Bool.
   NOTE(review): by its name, presumably tells the compiler whether
   position `p' in `local_pattern' is a place where a
   beginning-of-line anchor is valid -- confirm. */
368 sal_Bool
at_begline_loc_p(const sal_Unicode
*local_pattern
,
369 const sal_Unicode
*p
);
/* Declaration only.  Both pointers are read-only; returns sal_Bool.
   NOTE(review): by its name, presumably tells the compiler whether
   position `p' (with pattern end `pend') is a place where an
   end-of-line anchor is valid -- confirm. */
370 sal_Bool
at_endline_loc_p(const sal_Unicode
*p
, const sal_Unicode
*pend
);
/* Declaration only.  Takes the two endpoints of a character range
   and a writable buffer `b'; returns a reg_errcode_t.
   NOTE(review): presumably marks the range in the set at `b' and
   reports a bad range with REG_ERANGE ("Invalid range end") --
   confirm against the implementation. */
371 reg_errcode_t
compile_range(sal_Unicode range_begin
, sal_Unicode range_end
, sal_Unicode
*b
);
372 sal_Bool
group_in_compile_stack(compile_stack_type compile_stack
,
/* Declaration only.  Takes a re_registers pointer, a position and a
   range; returns sal_Int32.
   NOTE(review): presumably the matcher proper, filling `regs' with
   the match data; the meaning of the return value is not visible
   here -- confirm against the implementation. */
374 sal_Int32
re_match2(struct re_registers
*regs
, sal_Int32 pos
, sal_Int32 range
);
/* Declaration only.  Takes a read-only position `d', a string and
   its size; returns sal_Bool.
   NOTE(review): presumably the test behind the `wordbeg' opcode
   ("Succeeds if at word beginning") -- confirm. */
376 sal_Bool
iswordbegin(const sal_Unicode
*d
, sal_Unicode
*string
, sal_Int32 ssize
);
/* Declaration only.  Takes a read-only position `d', a string and
   its size; returns sal_Bool.
   NOTE(review): presumably the test behind the `wordend' opcode
   ("Succeeds if at word end") -- confirm. */
377 sal_Bool
iswordend(const sal_Unicode
*d
, sal_Unicode
*string
, sal_Int32 ssize
);
/* Declaration only.  Takes a character and a writable buffer `b'.
   NOTE(review): presumably sets the bit for `c' in the charset
   bitmap at `b' -- confirm against the implementation. */
378 void set_list_bit(sal_Unicode c
, sal_Unicode
*b
);
/* Constructor (declaration only).  Takes the search options by const
   reference and a UNO reference to an XExtendedTransliteration
   service by value.
   NOTE(review): presumably keeps `XTrans' in the member `translit'
   and takes the pattern from `rOptions' -- confirm. */
382 Regexpr( const ::com::sun::star::util::SearchOptions
& rOptions
,
383 ::com::sun::star::uno::Reference
<
384 ::com::sun::star::i18n::XExtendedTransliteration
> XTrans
);
/* Declaration only.  Takes a read-only text pointer and its length.
   NOTE(review): presumably assigns them to the members `line' and
   `linelen'; whether the text is copied or only referenced is not
   visible here -- confirm before freeing the caller's buffer. */
389 void set_line( const sal_Unicode
*line
, sal_Int32 len
);
391 // function returning pointers to occurrences in regs
/* Declaration only.  Searches the current line for the pattern and
   reports the occurrences through `regs'.
   NOTE(review): `pOffset' is presumably the starting offset in the
   line, and the meaning of the return value is not visible here --
   confirm against the implementation. */
392 sal_Int32
re_search(struct re_registers
*regs
, sal_Int32 pOffset
); // find pattern in line