2 * @brief Implement OmegaScript $transform function.
4 /* Copyright (C) 2003,2009,2015,2022 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "transform.h"
25 #define PCRE2_CODE_UNIT_WIDTH 8
// Compiled regular expressions, keyed on the (pattern, compile options) pair.
// get_re() searches this map before compiling and inserts every pattern it
// compiles, so a pattern compiled with different option bits gets its own
// entry.
35 static map
<pair
<string
, uint32_t>, pcre2_code
*> re_cache
;
// PCRE2 match data block.  Starts out NULL; get_re() assigns it from
// pcre2_match_data_create(10, NULL), and omegascript_transform() reads the
// match offsets back out of it with pcre2_get_ovector_pointer().
36 static pcre2_match_data
* md
= NULL
;
/** Find or build the compiled PCRE2 form of a regular expression.
 *
 *  The lookup key is (pattern, options), so the same pattern text compiled
 *  with different option bits is cached separately in re_cache.
 *
 *  @param pattern  Regular expression source.  It is handed to
 *                  pcre2_compile() as pattern.data() together with an
 *                  explicit length of pattern.size().
 *  @param options  PCRE2 compile option bits; also part of the cache key.
 *
 *  NOTE(review): this listing does not show every line of the function - the
 *  return statements, the declarations of error_code and erroffset, and what
 *  is done with the error message assembled in `m` are not visible here.
 *  Check the complete file before relying on those details.
 */
39 get_re(const string
& pattern
, uint32_t options
)
41 pair
<string
, uint32_t> re_key
= make_pair(pattern
, options
);
// Search the cache for an entry with this exact pattern and option set.
42 auto re_it
= re_cache
.find(re_key
);
43 if (re_it
!= re_cache
.end()) {
48 // Create lazily - here is a good point as it's a single place we
49 // have to pass through before executing a regex.
// The 10 requests room for ten offset pairs in the match data block.
50 md
= pcre2_match_data_create(10, NULL
);
55 auto re
= pcre2_compile(PCRE2_SPTR8(pattern
.data()), pattern
.size(),
56 options
, &error_code
, &erroffset
, NULL
);
// Build a human readable message from PCRE2's text for error_code.
58 string m
= "$transform failed to compile its regular expression: ";
59 // pcre2api(3) says that "a buffer size of 120 code units is ample".
60 unsigned char buf
[120];
61 pcre2_get_error_message(error_code
, buf
, sizeof(buf
));
// buf holds 8-bit code units (PCRE2_CODE_UNIT_WIDTH is 8), hence the cast
// to char* to append it to a std::string.
62 m
+= reinterpret_cast<char*>(buf
);
// Remember the compiled pattern under its (pattern, options) key.
65 re_cache
.insert(make_pair(re_key
, re
));
/** Regular expression matching for OmegaScript's $match (the command name is
 *  taken from the error message text below).
 *
 *  @param value  Output string.  NOTE(review): how the match result is
 *                written to it is not visible in this listing.
 *  @param args   args[0] is the pattern (passed to get_re()), args[1] is the
 *                subject text (passed to pcre2_match()), and args[2], if
 *                present, is a string of option characters.
 *
 *  Matching always uses PCRE2_UTF; option characters can additionally enable
 *  PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL and PCRE2_EXTENDED.
 *  NOTE(review): the case labels mapping each character to a flag are not
 *  shown here - confirm the letters against the complete file.
 */
70 omegascript_match(string
& value
, const vector
<string
> & args
)
72 uint32_t options
= PCRE2_UTF
;
// The option string is optional: only consulted when a third argument exists.
73 if (args
.size() > 2) {
74 const string
&opts
= args
[2];
// Each character of the option string is examined independently.
75 for (char ch
: opts
) {
78 options
|= PCRE2_CASELESS
;
81 options
|= PCRE2_MULTILINE
;
84 options
|= PCRE2_DOTALL
;
87 options
|= PCRE2_EXTENDED
;
// Start of the message used for an option character that isn't recognised.
90 string m
= "Unknown $match option character: ";
// Fetch the compiled pattern for this (pattern, options) combination.
97 pcre2_code
* re
= get_re(args
[0], options
);
// The subject is passed as pointer plus explicit length.  The remaining
// arguments of this call are cut off in this listing.
98 int matches
= pcre2_match(re
, PCRE2_SPTR8(args
[1].data()), args
[1].size(),
/** Regular expression substitution for OmegaScript's $transform.
 *
 *  @param value  Output string; the result is appended to it piece by piece.
 *  @param args   args[0] is the pattern (passed to get_re()), args[1] is the
 *                replacement template, args[2] is the subject text, and
 *                args[3], if present, is a string of option characters.
 *
 *  Matching always uses PCRE2_UTF; option characters can additionally enable
 *  PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL and PCRE2_EXTENDED.
 *  The matching code sits in a loop which repeats while replace_all is true.
 *  NOTE(review): the case labels (including whichever one sets replace_all),
 *  the start of the loop, the test on `matches` and the update of `start`
 *  are not shown in this listing - confirm them against the complete file.
 *
 *  For a match, the subject text from `start` up to the beginning of the
 *  match is appended, then the replacement template is expanded: a backslash
 *  followed by a digit 1-9 refers to a capture group (expanding to nothing if
 *  the group number is not below `matches`).  After the loop, the rest of the
 *  subject from `start` onwards is appended.
 */
106 omegascript_transform(string
& value
, const vector
<string
> & args
)
108 bool replace_all
= false;
109 uint32_t options
= PCRE2_UTF
;
// The option string is optional: only consulted when a fourth argument exists.
110 if (args
.size() > 3) {
111 const string
& opts
= args
[3];
// Each character of the option string is examined independently.
112 for (char ch
: opts
) {
118 options
|= PCRE2_CASELESS
;
121 options
|= PCRE2_MULTILINE
;
124 options
|= PCRE2_DOTALL
;
127 options
|= PCRE2_EXTENDED
;
// Start of the message used for an option character that isn't recognised.
130 string m
= "Unknown $transform option character: ";
// Fetch the compiled pattern for this (pattern, options) combination.
138 pcre2_code
* re
= get_re(args
[0], options
);
// Offset into the subject (args[2]) of the text not yet copied to value.
139 PCRE2_SIZE start
= 0;
// The subject is passed as pointer plus explicit length.  The remaining
// arguments of this call are not shown in this listing.
141 int matches
= pcre2_match(re
,
142 PCRE2_SPTR8(args
[2].data()), args
[2].size(),
145 // (matches == PCRE2_ERROR_NOMATCH) is OK, otherwise this is an
146 // error. FIXME: should we report this rather than ignoring it?
150 // Substitute \1 ... \9, and \\.
// offsets[2*n] and offsets[2*n+1] are the start and end of group n, with
// n == 0 being the whole match.
151 PCRE2_SIZE
* offsets
= pcre2_get_ovector_pointer(md
);
// Copy the unmatched text which precedes this match.
152 value
.append(args
[2], start
, offsets
[0] - start
);
// Walk the replacement template one character at a time.
153 for (auto i
= args
[1].begin(); i
!= args
[1].end(); ++i
) {
// Step to the character after a backslash, checking we haven't run off the
// end of the template.
160 if (rare(++i
== args
[1].end())) {
161 // Trailing single '\'.
167 if (c
>= '1' && c
<= '9') {
169 // If there aren't that many groupings, expand to nothing.
170 if (c
>= matches
) continue;
173 if (c
!= '\\') value
+= char(c
);
// NOTE(review): offsets[] holds PCRE2_SIZE values, so storing one in an int
// narrows it.  PCRE2 also reports a group which took no part in the match
// as PCRE2_UNSET, and no check for that is visible here before append() is
// called - confirm how unset groups are meant to be handled.
177 int off_c
= offsets
[c
* 2];
// Copy the text captured by group c.
178 value
.append(args
[2], off_c
, offsets
[c
* 2 + 1] - off_c
);
// NOTE(review): confirm `start` always advances when replace_all is set,
// including after an empty match, or this loop would not terminate.
181 } while (replace_all
);
// Copy whatever remains of the subject after the last replacement.
182 value
.append(args
[2], start
, string::npos
);