3 #include "shotgun/lib/shotgun.h"
4 #include "shotgun/lib/tuple.h"
5 #include "shotgun/lib/string.h"
6 #include "shotgun/lib/symbol.h"
7 #include "shotgun/lib/hash.h"
/* Regexp option flags, each defined as the corresponding Oniguruma option constant. */
9 #define OPTION_IGNORECASE ONIG_OPTION_IGNORECASE
10 #define OPTION_EXTENDED ONIG_OPTION_EXTEND
11 #define OPTION_MULTILINE ONIG_OPTION_MULTILINE
/* Union of the three option flags above; regexp_options uses it to keep only these bits. */
12 #define OPTION_MASK (OPTION_IGNORECASE|OPTION_EXTENDED|OPTION_MULTILINE)
/* Union of the KCODE_* encoding flags (their definitions are not visible in this excerpt);
   regexp_new uses it to extract the kcode bits from the option value. */
19 #define KCODE_MASK (KCODE_EUC|KCODE_SJIS|KCODE_UTF8)
/* Dereferences DATA_STRUCT(k, regex_t**); presumably yields the regex_t* stored in
   object k's data area -- confirm against the DATA_STRUCT definition. */
21 #define REG(k) (*DATA_STRUCT(k, regex_t**))
/* Forward declaration of get_match_data, whose definition appears later in this file.
   Note: the fourth parameter is named `regex` here and `regexp` in the definition. */
23 OBJECT
get_match_data(STATE
, OnigRegion
*region
, OBJECT string
, OBJECT regex
, int max
);
/* Cleanup callback; regexp_init registers it via state_add_cleanup for the
   regexpdata class. The body is not visible in this excerpt. */
25 void regexp_cleanup(STATE
, OBJECT data
) {
/* Registers regexp_cleanup with state_add_cleanup for BASIC_CLASS(regexpdata). */
29 void regexp_init(STATE
) {
31 state_add_cleanup(state
, BASIC_CLASS(regexpdata
), regexp_cleanup
);
/* Returns the value of onig_version(), cast to char*. */
34 char *regexp_version(STATE
) {
35 return (char*)onig_version();
/* Per-name callback; regexp_new passes it to onig_foreach_name with a
   struct _gather_data as the user argument.
   The visible statement stores, in hash `tup`, the symbol built from `name`
   mapped to I2N(gn - 1). How `state`, `tup` and `gn` are obtained is not
   visible in this excerpt -- presumably from `gd` and `group_nums`; confirm. */
43 static int _gather_names(const UChar
*name
, const UChar
*name_end
,
44 int ngroup_num
, int *group_nums
, regex_t
*reg
, struct _gather_data
*gd
) {
54 hash_set(state
, tup
, symbol_from_cstr(state
, (char*)name
), I2N(gn
- 1));
/* Selects an Oniguruma encoding for a kcode value.
   The branch conditions are not visible in this excerpt; the visible
   assignments pick ASCII (in two branches), EUC-JP, SJIS or UTF-8. */
58 OnigEncoding
get_enc_from_kcode(int kcode
)
62 r
= ONIG_ENCODING_ASCII
;
65 r
= ONIG_ENCODING_ASCII
;
68 r
= ONIG_ENCODING_EUC_JP
;
71 r
= ONIG_ENCODING_SJIS
;
74 r
= ONIG_ENCODING_UTF8
;
/* Maps an Oniguruma encoding to its KCODE_* flag:
   ASCII -> KCODE_NONE, EUC-JP -> KCODE_EUC, SJIS -> KCODE_SJIS, UTF-8 -> KCODE_UTF8.
   The declaration, initial value and return of `r` are not visible in this excerpt. */
81 int get_kcode_from_enc(OnigEncoding enc
)
86 if (enc
== ONIG_ENCODING_ASCII
) r
= KCODE_NONE
;
87 if (enc
== ONIG_ENCODING_EUC_JP
) r
= KCODE_EUC
;
88 if (enc
== ONIG_ENCODING_SJIS
) r
= KCODE_SJIS
;
89 if (enc
== ONIG_ENCODING_UTF8
) r
= KCODE_UTF8
;
/* Compiles `pattern` with onig_new (Ruby syntax) and wraps the result in a
   newly allocated regexp object carrying the source, the compiled data and
   the named-group table.
   err_buf receives a "<oniguruma message>: <pattern>" string when compilation
   fails; it is written with a fixed limit of 1024 bytes, so the caller must
   supply a buffer at least that large.
   How `opts` is derived from `options`, and what is returned on each path,
   is not visible in this excerpt. */
93 OBJECT
regexp_new(STATE
, OBJECT pattern
, OBJECT options
, char *err_buf
) {
97 OBJECT o_regdata
, o_reg
, o_names
;
98 OnigErrorInfo err_info
;
101 int err
, num_names
, kcode
;
/* pat..end delimits the pattern bytes: the C string plus the string's byte count. */
103 pat
= (UChar
*)rbx_string_as_cstr(state
, pattern
);
104 end
= pat
+ N2I(string_get_bytes(pattern
));
106 /* Ug. What I hate about the onig API is that there is no way
107 to define how to allocate the reg, onig_new does it for you.
108 regex_t is a typedef for a pointer of the internal type.
109 So for the time being a regexp object will just store the
110 pointer to the real regex structure. */
112 NEW_STRUCT(o_regdata
, reg
, BASIC_CLASS(regexpdata
), regex_t
*);
/* The kcode bits of opts choose the encoding handed to onig_new. */
115 kcode
= opts
& KCODE_MASK
;
116 enc
= get_enc_from_kcode(kcode
);
119 err
= onig_new(reg
, pat
, end
,
120 opts
, enc
, ONIG_SYNTAX_RUBY
, &err_info
);
/* Compilation failed: format Oniguruma's message followed by the pattern text. */
122 if(err
!= ONIG_NORMAL
) {
123 UChar onig_err_buf
[ONIG_MAX_ERROR_MESSAGE_LEN
];
124 onig_error_code_to_str(onig_err_buf
, err
, &err_info
);
125 snprintf(err_buf
, 1024, "%s: %s", onig_err_buf
, pat
);
/* Build the regexp object and attach its source string and compiled data. */
129 o_reg
= regexp_allocate(state
);
130 regexp_set_source(o_reg
, pattern
);
131 regexp_set_data(o_reg
, o_regdata
);
132 num_names
= onig_number_of_names(*reg
);
/* Names are set to nil here and later to a hash filled by _gather_names;
   the condition choosing between the two is not visible (presumably a
   test on num_names -- confirm). */
134 regexp_set_names(o_reg
, Qnil
);
136 struct _gather_data gd
;
138 o_names
= hash_new(state
);
140 onig_foreach_name(*reg
, (int (*)(const OnigUChar
*, const OnigUChar
*,int,int*,OnigRegex
,void*))_gather_names
, (void*)&gd
);
141 regexp_set_names(o_reg
, o_names
);
/* Flags the regexp object as requiring cleanup. */
144 o_reg
->RequiresCleanup
= TRUE
;
/* Returns, via I2N, the regexp's Oniguruma options restricted to OPTION_MASK,
   OR'd with the KCODE_* flag for the regexp's encoding. */
149 OBJECT
regexp_options(STATE
, OBJECT regexp
)
152 OnigOptionType option
;
155 reg
= REG(regexp_get_data(regexp
));
156 option
= onig_get_options(reg
);
157 enc
= onig_get_encoding(reg
);
159 return I2N((int)(option
& OPTION_MASK
) | get_kcode_from_enc(enc
));
/* Builds a tuple with one (beg, end) pair per capture group of `region`.
   Group 0 (the whole match) is skipped, so group i lands at index i - 1 and
   the tuple has num_regs - 1 slots.
   `max` is not used in the visible code. The return statement is not
   visible in this excerpt. */
162 OBJECT
_md_region_to_tuple(STATE
, OnigRegion
*region
, int max
) {
165 tup
= tuple_new(state
, region
->num_regs
- 1);
166 for(i
= 1; i
< region
->num_regs
; i
++) {
167 sub
= tuple_new2(state
, 2, I2N(region
->beg
[i
]), I2N(region
->end
[i
]));
168 tuple_put(state
, tup
, i
- 1, sub
);
/* Allocates a matchdata object and fills it from `region`:
     source - a duplicate of `string` (string_dup)
     regexp - `regexp`
     full   - (beg[0], end[0]), the span of the whole match
     region - the per-group tuple from _md_region_to_tuple
   The return statement is not visible in this excerpt. */
173 OBJECT
get_match_data(STATE
, OnigRegion
*region
, OBJECT string
, OBJECT regexp
, int max
) {
174 OBJECT md
= matchdata_allocate(state
);
175 matchdata_set_source(md
, string_dup(state
, string
));
176 matchdata_set_regexp(md
, regexp
);
177 matchdata_set_full(md
, tuple_new2(state
, 2, I2N(region
->beg
[0]), I2N(region
->end
[0])));
178 matchdata_set_region(md
, _md_region_to_tuple(state
, region
, max
));
/* Tries to match `regexp` against `string` at byte offset `start` using
   onig_match (a match at that exact position, not a search), and builds
   match data from the resulting region when the call does not report a
   mismatch. The region is released before the function ends.
   The initial value of `md` and the return statement are not visible in
   this excerpt. */
182 OBJECT
regexp_match_start(STATE
, OBJECT regexp
, OBJECT string
, OBJECT start
) {
188 region
= onig_region_new();
190 max
= N2I(string_get_bytes(string
));
191 str
= (UChar
*)rbx_string_as_cstr(state
, string
);
193 beg
= onig_match(REG(regexp_get_data(regexp
)), str
, str
+ max
, str
+ N2I(start
), region
, ONIG_OPTION_NONE
);
/* NOTE(review): every result other than ONIG_MISMATCH takes this branch,
   including other negative Oniguruma error codes -- confirm that is intended. */
195 if(beg
!= ONIG_MISMATCH
) {
196 md
= get_match_data(state
, region
, string
, regexp
, max
);
/* Second argument 1: free the region structure itself as well as its contents. */
199 onig_region_free(region
, 1);
/* Searches `string` for `regexp` between byte offsets `start` and `end`.
   When `forward` is true the search runs from start towards end; the second
   onig_search call swaps the two bounds (range before start), which
   Oniguruma treats as a backward search. That second call is presumably the
   else branch -- the `else` line is not visible in this excerpt.
   Match data is built when the result is not ONIG_MISMATCH, and the region
   is released before the function ends. The initial value of `md` and the
   return statement are not visible. */
203 OBJECT
regexp_search_region(STATE
, OBJECT regexp
, OBJECT string
, OBJECT start
, OBJECT end
, OBJECT forward
) {
209 region
= onig_region_new();
211 max
= N2I(string_get_bytes(string
));
212 str
= (UChar
*)rbx_string_as_cstr(state
, string
);
214 if (RTEST(forward
)) {
215 beg
= onig_search(REG(regexp_get_data(regexp
)), str
, str
+ max
, str
+ N2I(start
), str
+ N2I(end
), region
, ONIG_OPTION_NONE
);
217 beg
= onig_search(REG(regexp_get_data(regexp
)), str
, str
+ max
, str
+ N2I(end
), str
+ N2I(start
), region
, ONIG_OPTION_NONE
);
/* NOTE(review): every result other than ONIG_MISMATCH takes this branch,
   including other negative Oniguruma error codes -- confirm that is intended. */
220 if (beg
!= ONIG_MISMATCH
) {
221 md
= get_match_data(state
, region
, string
, regexp
, max
);
/* Second argument 1: free the region structure itself as well as its contents. */
224 onig_region_free(region
, 1);
228 OBJECT
regexp_match(STATE
, OBJECT regexp
, OBJECT string
) {
230 const UChar
*str
, *end
, *start
, *range
;
235 region
= onig_region_new();
237 max
= N2I(string_get_bytes(string
));
238 str
= (UChar
*)rbx_string_as_cstr(state
, string
);
243 reg
= REG(regexp_get_data(regexp
));
245 err
= onig_search(reg
, str
, end
, start
, range
, region
, ONIG_OPTION_NONE
);
247 if(err
== ONIG_MISMATCH
) {
248 onig_region_free(region
, 1);
252 md
= matchdata_allocate(state
);
253 matchdata_set_source(md
, string
);
254 matchdata_set_regexp(md
, regexp
);
255 matchdata_set_full(md
, tuple_new2(state
, 2, I2N(region
->beg
[0]), I2N(region
->end
[0])));
256 matchdata_set_region(md
, _md_region_to_tuple(state
, region
, max
));
257 onig_region_free(region
, 1);