Added spec:commit task to commit changes to spec/ruby sources.
[rbx.git] / shotgun / lib / regexp.c
blobe83cd476c0b11deb77c125d7d6d5885c73f960f4
1 #include "oniguruma.h"
3 #include "shotgun/lib/shotgun.h"
4 #include "shotgun/lib/tuple.h"
5 #include "shotgun/lib/string.h"
6 #include "shotgun/lib/symbol.h"
7 #include "shotgun/lib/hash.h"
/* Regexp option bits used by this file; each one is simply an alias for the
   corresponding Oniguruma option flag. */
9 #define OPTION_IGNORECASE ONIG_OPTION_IGNORECASE
10 #define OPTION_EXTENDED ONIG_OPTION_EXTEND
11 #define OPTION_MULTILINE ONIG_OPTION_MULTILINE
/* Union of the three option bits above. regexp_new() and regexp_options()
   apply it to strip everything else (notably the KCODE bits) from an
   options integer. */
12 #define OPTION_MASK (OPTION_IGNORECASE|OPTION_EXTENDED|OPTION_MULTILINE)
/* Encoding ("kcode") selectors. They travel in the same integer as the
   OPTION_* bits and are translated to/from Oniguruma encodings by
   get_enc_from_kcode() and get_kcode_from_enc(). KCODE_ASCII (0) is the
   value get_kcode_from_enc() falls back to for an unrecognised encoding. */
14 #define KCODE_ASCII 0
15 #define KCODE_NONE 16
16 #define KCODE_EUC 32
17 #define KCODE_SJIS 48
18 #define KCODE_UTF8 64
/* 32|48|64 == 112, i.e. bits 16, 32 and 64. Because KCODE_SJIS (48) already
   contains bit 16, the mask also preserves KCODE_NONE even though that name
   is not listed explicitly. */
19 #define KCODE_MASK (KCODE_EUC|KCODE_SJIS|KCODE_UTF8)
/* Dereferences DATA_STRUCT(k, regex_t**) to obtain the regex_t* kept in
   object k (the slot that regexp_new() hands to onig_new()). */
21 #define REG(k) (*DATA_STRUCT(k, regex_t**))
/* Forward declaration; the definition appears later in this file. */
23 OBJECT get_match_data(STATE, OnigRegion *region, OBJECT string, OBJECT regex, int max);
25 void regexp_cleanup(STATE, OBJECT data) {
26 onig_free(REG(data));
29 void regexp_init(STATE) {
30 onig_init();
31 state_add_cleanup(state, BASIC_CLASS(regexpdata), regexp_cleanup);
/* Returns the string reported by onig_version(), cast to a plain char*.
   The `state` argument is not used. */
char *regexp_version(STATE) {
  char *version = (char*)onig_version();

  return version;
}
38 struct _gather_data {
39 STATE;
40 OBJECT tup;
43 static int _gather_names(const UChar *name, const UChar *name_end,
44 int ngroup_num, int *group_nums, regex_t *reg, struct _gather_data *gd) {
46 int gn;
47 STATE;
48 OBJECT tup;
50 state = gd->state;
51 tup = gd->tup;
53 gn = group_nums[0];
54 hash_set(state, tup, symbol_from_cstr(state, (char*)name), I2N(gn - 1));
55 return 0;
58 OnigEncoding get_enc_from_kcode(int kcode)
60 OnigEncoding r;
62 r = ONIG_ENCODING_ASCII;
63 switch (kcode) {
64 case KCODE_NONE:
65 r = ONIG_ENCODING_ASCII;
66 break;
67 case KCODE_EUC:
68 r = ONIG_ENCODING_EUC_JP;
69 break;
70 case KCODE_SJIS:
71 r = ONIG_ENCODING_SJIS;
72 break;
73 case KCODE_UTF8:
74 r = ONIG_ENCODING_UTF8;
75 break;
77 return r;
81 int get_kcode_from_enc(OnigEncoding enc)
83 int r;
85 r = KCODE_ASCII;
86 if (enc == ONIG_ENCODING_ASCII) r = KCODE_NONE;
87 if (enc == ONIG_ENCODING_EUC_JP) r = KCODE_EUC;
88 if (enc == ONIG_ENCODING_SJIS) r = KCODE_SJIS;
89 if (enc == ONIG_ENCODING_UTF8) r = KCODE_UTF8;
90 return r;
93 OBJECT regexp_new(STATE, OBJECT pattern, OBJECT options, char *err_buf) {
94 regex_t **reg;
95 const UChar *pat;
96 const UChar *end;
97 OBJECT o_regdata, o_reg, o_names;
98 OnigErrorInfo err_info;
99 OnigOptionType opts;
100 OnigEncoding enc;
101 int err, num_names, kcode;
103 pat = (UChar*)rbx_string_as_cstr(state, pattern);
104 end = pat + N2I(string_get_bytes(pattern));
106 /* Ug. What I hate about the onig API is that there is no way
107 to define how to allocate the reg, onig_new does it for you.
108 regex_t is a typedef for a pointer of the internal type.
109 So for the time being a regexp object will just store the
110 pointer to the real regex structure. */
112 NEW_STRUCT(o_regdata, reg, BASIC_CLASS(regexpdata), regex_t*);
114 opts = N2I(options);
115 kcode = opts & KCODE_MASK;
116 enc = get_enc_from_kcode(kcode);
117 opts &= OPTION_MASK;
119 err = onig_new(reg, pat, end,
120 opts, enc, ONIG_SYNTAX_RUBY, &err_info);
122 if(err != ONIG_NORMAL) {
123 UChar onig_err_buf[ONIG_MAX_ERROR_MESSAGE_LEN];
124 onig_error_code_to_str(onig_err_buf, err, &err_info);
125 snprintf(err_buf, 1024, "%s: %s", onig_err_buf, pat);
126 return Qnil;
129 o_reg = regexp_allocate(state);
130 regexp_set_source(o_reg, pattern);
131 regexp_set_data(o_reg, o_regdata);
132 num_names = onig_number_of_names(*reg);
133 if(num_names == 0) {
134 regexp_set_names(o_reg, Qnil);
135 } else {
136 struct _gather_data gd;
137 gd.state = state;
138 o_names = hash_new(state);
139 gd.tup = o_names;
140 onig_foreach_name(*reg, (int (*)(const OnigUChar*, const OnigUChar*,int,int*,OnigRegex,void*))_gather_names, (void*)&gd);
141 regexp_set_names(o_reg, o_names);
144 o_reg->RequiresCleanup = TRUE;
146 return o_reg;
149 OBJECT regexp_options(STATE, OBJECT regexp)
151 OnigEncoding enc;
152 OnigOptionType option;
153 regex_t* reg;
155 reg = REG(regexp_get_data(regexp));
156 option = onig_get_options(reg);
157 enc = onig_get_encoding(reg);
159 return I2N((int)(option & OPTION_MASK) | get_kcode_from_enc(enc));
162 OBJECT _md_region_to_tuple(STATE, OnigRegion *region, int max) {
163 int i;
164 OBJECT tup, sub;
165 tup = tuple_new(state, region->num_regs - 1);
166 for(i = 1; i < region->num_regs; i++) {
167 sub = tuple_new2(state, 2, I2N(region->beg[i]), I2N(region->end[i]));
168 tuple_put(state, tup, i - 1, sub);
170 return tup;
173 OBJECT get_match_data(STATE, OnigRegion *region, OBJECT string, OBJECT regexp, int max) {
174 OBJECT md = matchdata_allocate(state);
175 matchdata_set_source(md, string_dup(state, string));
176 matchdata_set_regexp(md, regexp);
177 matchdata_set_full(md, tuple_new2(state, 2, I2N(region->beg[0]), I2N(region->end[0])));
178 matchdata_set_region(md, _md_region_to_tuple(state, region, max));
179 return md;
182 OBJECT regexp_match_start(STATE, OBJECT regexp, OBJECT string, OBJECT start) {
183 int beg, max;
184 const UChar *str;
185 OnigRegion *region;
186 OBJECT md = Qnil;
188 region = onig_region_new();
190 max = N2I(string_get_bytes(string));
191 str = (UChar*)rbx_string_as_cstr(state, string);
193 beg = onig_match(REG(regexp_get_data(regexp)), str, str + max, str + N2I(start), region, ONIG_OPTION_NONE);
195 if(beg != ONIG_MISMATCH) {
196 md = get_match_data(state, region, string, regexp, max);
199 onig_region_free(region, 1);
200 return md;
203 OBJECT regexp_search_region(STATE, OBJECT regexp, OBJECT string, OBJECT start, OBJECT end, OBJECT forward) {
204 int beg, max;
205 const UChar *str;
206 OnigRegion *region;
207 OBJECT md = Qnil;
209 region = onig_region_new();
211 max = N2I(string_get_bytes(string));
212 str = (UChar*)rbx_string_as_cstr(state, string);
214 if (RTEST(forward)) {
215 beg = onig_search(REG(regexp_get_data(regexp)), str, str + max, str + N2I(start), str + N2I(end), region, ONIG_OPTION_NONE);
216 } else {
217 beg = onig_search(REG(regexp_get_data(regexp)), str, str + max, str + N2I(end), str + N2I(start), region, ONIG_OPTION_NONE);
220 if (beg != ONIG_MISMATCH) {
221 md = get_match_data(state, region, string, regexp, max);
224 onig_region_free(region, 1);
225 return md;
228 OBJECT regexp_match(STATE, OBJECT regexp, OBJECT string) {
229 int err, max;
230 const UChar *str, *end, *start, *range;
231 OnigRegion *region;
232 regex_t *reg;
233 OBJECT md;
235 region = onig_region_new();
237 max = N2I(string_get_bytes(string));
238 str = (UChar*)rbx_string_as_cstr(state, string);
239 end = str + max;
240 start = str;
241 range = end;
243 reg = REG(regexp_get_data(regexp));
245 err = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE);
247 if(err == ONIG_MISMATCH) {
248 onig_region_free(region, 1);
249 return Qnil;
252 md = matchdata_allocate(state);
253 matchdata_set_source(md, string);
254 matchdata_set_regexp(md, regexp);
255 matchdata_set_full(md, tuple_new2(state, 2, I2N(region->beg[0]), I2N(region->end[0])));
256 matchdata_set_region(md, _md_region_to_tuple(state, region, max));
257 onig_region_free(region, 1);
258 return md;