Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / transform.cc
blob47ac6cc650c1e0f37efc5309813aaefe9819e7ef
1 /** @file
2 * @brief Implement OmegaScript $transform function.
3 */
4 /* Copyright (C) 2003,2009,2015,2022 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #include "transform.h"
25 #define PCRE2_CODE_UNIT_WIDTH 8
26 #include <pcre2.h>
28 #include <cstdint>
29 #include <map>
30 #include <string>
31 #include <vector>
33 using namespace std;
// Cache of compiled regular expressions, keyed on the pattern text paired
// with the PCRE2 compile options.  get_re() looks patterns up here and adds
// newly compiled ones.
35 static map<pair<string, uint32_t>, pcre2_code*> re_cache;
// Match data block passed to every pcre2_match() call in this file.  It
// starts out NULL and is created by get_re() the first time a pattern has
// to be compiled.
36 static pcre2_match_data* md = NULL;
38 static pcre2_code*
39 get_re(const string& pattern, uint32_t options)
41 pair<string, uint32_t> re_key = make_pair(pattern, options);
42 auto re_it = re_cache.find(re_key);
43 if (re_it != re_cache.end()) {
44 return re_it->second;
47 if (!md) {
48 // Create lazily - here is a good point as it's a single place we
49 // have to pass through before executing a regex.
50 md = pcre2_match_data_create(10, NULL);
53 int error_code;
54 PCRE2_SIZE erroffset;
55 auto re = pcre2_compile(PCRE2_SPTR8(pattern.data()), pattern.size(),
56 options, &error_code, &erroffset, NULL);
57 if (!re) {
58 string m = "$transform failed to compile its regular expression: ";
59 // pcre2api(3) says that "a buffer size of 120 code units is ample".
60 unsigned char buf[120];
61 pcre2_get_error_message(error_code, buf, sizeof(buf));
62 m += reinterpret_cast<char*>(buf);
63 throw m;
65 re_cache.insert(make_pair(re_key, re));
66 return re;
69 void
70 omegascript_match(string & value, const vector<string> & args)
72 uint32_t options = PCRE2_UTF;
73 if (args.size() > 2) {
74 const string &opts = args[2];
75 for (char ch : opts) {
76 switch (ch) {
77 case 'i':
78 options |= PCRE2_CASELESS;
79 break;
80 case 'm':
81 options |= PCRE2_MULTILINE;
82 break;
83 case 's':
84 options |= PCRE2_DOTALL;
85 break;
86 case 'x':
87 options |= PCRE2_EXTENDED;
88 break;
89 default: {
90 string m = "Unknown $match option character: ";
91 m += ch;
92 throw m;
97 pcre2_code* re = get_re(args[0], options);
98 int matches = pcre2_match(re, PCRE2_SPTR8(args[1].data()), args[1].size(),
99 0, 0, md, NULL);
100 if (matches > 0) {
101 value += "true";
105 void
106 omegascript_transform(string & value, const vector<string> & args)
108 bool replace_all = false;
109 uint32_t options = PCRE2_UTF;
110 if (args.size() > 3) {
111 const string & opts = args[3];
112 for (char ch : opts) {
113 switch (ch) {
114 case 'g':
115 replace_all = true;
116 break;
117 case 'i':
118 options |= PCRE2_CASELESS;
119 break;
120 case 'm':
121 options |= PCRE2_MULTILINE;
122 break;
123 case 's':
124 options |= PCRE2_DOTALL;
125 break;
126 case 'x':
127 options |= PCRE2_EXTENDED;
128 break;
129 default: {
130 string m = "Unknown $transform option character: ";
131 m += ch;
132 throw m;
138 pcre2_code* re = get_re(args[0], options);
139 PCRE2_SIZE start = 0;
140 do {
141 int matches = pcre2_match(re,
142 PCRE2_SPTR8(args[2].data()), args[2].size(),
143 start, 0, md, NULL);
144 if (matches <= 0) {
145 // (matches == PCRE_ERROR_NOMATCH) is OK, otherwise this is an
146 // error. FIXME: should we report this rather than ignoring it?
147 break;
150 // Substitute \1 ... \9, and \\.
151 PCRE2_SIZE* offsets = pcre2_get_ovector_pointer(md);
152 value.append(args[2], start, offsets[0] - start);
153 for (auto i = args[1].begin(); i != args[1].end(); ++i) {
154 char ch = *i;
155 if (ch != '\\') {
156 value += ch;
157 continue;
160 if (rare(++i == args[1].end())) {
161 // Trailing single '\'.
162 value += ch;
163 break;
166 int c = *i;
167 if (c >= '1' && c <= '9') {
168 c -= '0';
169 // If there aren't that many groupings, expand to nothing.
170 if (c >= matches) continue;
171 } else {
172 value += ch;
173 if (c != '\\') value += char(c);
174 continue;
177 int off_c = offsets[c * 2];
178 value.append(args[2], off_c, offsets[c * 2 + 1] - off_c);
180 start = offsets[1];
181 } while (replace_all);
182 value.append(args[2], start, string::npos);