Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / sort.cc
blob000d63bc5ac5ed7881f34035d19ae72d47620aae
1 /** @file
2 * @brief OmegaScript $sort function
3 */
4 /* Copyright (C) 2001,2004,2012,2016,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "sort.h"
26 // For option["decimal"].
27 #include "omega.h"
28 #include "stringutils.h"
30 #include <algorithm>
31 #include <string>
32 #include <vector>
34 using namespace std;
36 // Return -1, 0 or 1 depending on sign of floating point number starting at
37 // index i of string s. Non-numbers give 0.
38 static int
39 string_sign(const string& s, const string& decimal, size_t i = 0)
41 int r = 1;
42 if (s[i] == '-') {
43 r = -1;
44 ++i;
46 while (s[i] == '0') {
47 ++i;
49 if (s.compare(i, decimal.size(), decimal) == 0) {
50 i += decimal.size();
51 while (s[i] == '0') {
52 ++i;
55 return C_isdigit(s[i]) ? r : 0;
58 // Compare strings of two non-zero floating point numbers, without loss of
59 // precision, and ignoring their signs.
61 // The format handled is (as a Perl regex):
63 // ^[ \t]*(-?[0-9]*($decimal[0-9]*)?)
65 // where $decimal is option["decimal"].
67 // Values which are equal by the numeric test are compared as strings.
68 static int
69 ncmp(const string& a, const string& b, const string& decimal)
71 // Ignore leading blanks, an optional minus sign, and leading zeros.
72 size_t ai = a.find_first_not_of(" \t");
73 if (a[ai] == '-') ++ai;
74 while (a[ai] == '0') ++ai;
76 size_t bi = b.find_first_not_of(" \t");
77 if (b[bi] == '-') ++bi;
78 while (b[bi] == '0') ++bi;
80 while (a[ai] == b[bi] && C_isdigit(a[ai])) {
81 // Matching digits.
82 ++ai;
83 ++bi;
86 if (C_isdigit(a[ai])) {
87 if (!C_isdigit(b[bi])) {
88 // a > b
89 return 1;
92 int r = a[ai] - b[bi];
93 bool a_digit, b_digit;
94 do {
95 ++ai;
96 ++bi;
97 a_digit = C_isdigit(a[ai]);
98 b_digit = C_isdigit(b[bi]);
99 } while (a_digit && b_digit);
101 if (!a_digit && !b_digit) {
102 // Same number of digits, but not the same digits.
103 return r;
105 return a_digit ? 1 : -1;
108 if (C_isdigit(b[bi])) {
109 // a < b
110 return -1;
113 // Non-fractional parts are the same - compare any fractional parts.
114 bool a_frac = (a.compare(ai, decimal.size(), decimal) == 0);
115 if (a_frac) ai += decimal.size();
116 bool b_frac = (b.compare(bi, decimal.size(), decimal) == 0);
117 if (b_frac) bi += decimal.size();
118 if (!a_frac && !b_frac) return 0;
119 if (!b_frac) {
120 // Check if a's fractional part is zero.
121 while (a[ai] == '0') ++ai;
122 return C_isdigit(a[ai]) ? 1 : 0;
124 if (!a_frac) {
125 // Check if b's fractional part is zero.
126 while (b[bi] == '0') ++bi;
127 return C_isdigit(b[bi]) ? -1 : 0;
130 // Both have fractional parts, so compare.
131 while (a[ai] == b[bi] && C_isdigit(a[ai])) {
132 ++ai;
133 ++bi;
135 if (C_isdigit(a[ai])) return 1;
136 if (C_isdigit(b[bi])) return -1;
137 return 0;
140 static int
141 natcmp(const string& a, const string& b)
143 size_t shorter = min(a.size(), b.size());
144 size_t i = 0;
145 while (i != shorter) {
146 int cha = static_cast<unsigned char>(a[i]);
147 int chb = static_cast<unsigned char>(b[i]);
148 if (!C_isdigit(cha)) {
149 if (cha == chb) {
150 // Matching non-digits.
151 ++i;
152 continue;
155 if (C_isdigit(chb)) return 1;
156 return cha - chb;
159 // Sort embedded digit spans by numeric value and before non-digits.
160 if (!C_isdigit(chb)) return -1;
162 // Skip any leading zeros on each.
163 size_t sa = i;
164 while (a[sa] == '0') ++sa;
165 size_t sb = i;
166 while (b[sb] == '0') ++sb;
168 size_t ea = sa;
169 size_t eb = sb;
170 int res = 0;
171 while (true) {
172 if (!C_isdigit(a[ea])) {
173 if (C_isdigit(b[eb])) {
174 // Number in b is longer and so larger.
175 return -1;
177 // Digit sequences are the same length.
178 break;
180 if (a[ea] != b[eb]) {
181 if (!C_isdigit(b[eb])) {
182 // Number in a is longer and so larger.
183 return 1;
185 // Record first difference between digits.
186 if (res == 0) res = a[ea] - b[eb];
188 ++ea;
189 ++eb;
192 if (res == 0) {
193 // More leading zeros sorts first.
194 res = int(sb) - int(sa);
196 if (res) return res;
198 // Digit sequences were identical, so keep going.
199 i = ea;
201 return int(a.size()) - int(b.size());
204 void
205 omegascript_sort(const vector<string>& args,
206 string& value)
208 const string &list = args[0];
209 if (list.empty()) return;
211 // 0 => string sort, '#' => natural, 'n' => numeric.
212 char mode = 0;
213 bool uniq = false;
214 bool rev = false;
215 if (args.size() > 1) {
216 for (auto opt_ch : args[1]) {
217 switch (opt_ch) {
218 case '#':
219 case 'n':
220 if (mode != 0) {
221 string m = "Invalid $sort option combination: ";
222 m += args[1];
223 throw m;
225 mode = opt_ch;
226 break;
227 case 'r':
228 rev = true;
229 break;
230 case 'u':
231 uniq = true;
232 break;
233 default:
234 throw string("Unknown $sort option: ") + opt_ch;
238 vector<string> items;
239 string::size_type split = 0, split2;
240 do {
241 split2 = list.find('\t', split);
242 items.emplace_back(list, split, split2 - split);
243 split = UNSIGNED_OVERFLOW_OK(split2 + 1);
244 } while (split2 != string::npos);
246 if (mode != 'n') {
247 if (mode == 0) {
248 // String sort.
249 if (!rev) {
250 sort(items.begin(), items.end());
251 } else {
252 sort(items.begin(), items.end(),
253 [](const string& a, const string& b) {
254 return a > b;
257 } else {
258 // "Natural" sort - embedded natural numbers are handled specially.
259 if (!rev) {
260 sort(items.begin(), items.end(),
261 [](const string& a, const string& b) {
262 return natcmp(a, b) < 0;
264 } else {
265 sort(items.begin(), items.end(),
266 [](const string& a, const string& b) {
267 return natcmp(a, b) > 0;
272 value.reserve(list.size());
273 bool tab = false;
274 const string* prev = nullptr;
275 for (auto&& item : items) {
276 // Skip duplicates if "u" flag specified.
277 if (prev && *prev == item) {
278 continue;
280 if (uniq) {
281 prev = &item;
284 if (tab) {
285 value += '\t';
286 } else {
287 tab = true;
289 value += item;
291 return;
294 // Numeric sort.
296 // We first partition items such that all the negative values come first,
297 // then all the zero values, then all the positive values (for a forward
298 // search that is - for a reverse search the order of the partitions is
299 // reversed).
301 // Then we sort within each partition.
302 const string& decimal = option["decimal"];
303 size_t part_z = 0;
304 size_t part_f = items.size();
306 int first = rev ? 1 : -1;
307 if (!uniq) {
308 // Scan linearly through items, swapping to create the desired
309 // partitioning.
310 size_t i = 0;
311 while (i < part_f) {
312 int sign = string_sign(items[i], decimal);
313 if (sign == first) {
314 // Swap value to end of the first partition.
315 if (part_z != i)
316 swap(items[part_z], items[i]);
317 ++part_z;
318 ++i;
319 } else if (sign == 0) {
320 // Extend middle partition to include zero value.
321 ++i;
322 } else {
323 // Swap value to start of final partition.
324 swap(items[i], items[--part_f]);
327 } else {
328 // Need to preserve order within each partition.
329 auto z = stable_partition(items.begin(), items.end(),
330 [&decimal, first](const string& a) {
331 return string_sign(a, decimal) == first;
333 part_z = z - items.begin();
334 auto f = stable_partition(z, items.end(),
335 [&decimal](const string& a) {
336 return string_sign(a, decimal) == 0;
338 part_f = f - items.begin();
341 value.reserve(list.size());
342 bool tab = false;
344 const string* prev = nullptr;
345 if (part_z > 0) {
346 // Sort the first partition.
347 if (!uniq) {
348 if (!rev) {
349 sort(items.begin(), items.begin() + part_z,
350 [&decimal](const string& a, const string& b) {
351 int r = ncmp(a, b, decimal);
352 return r ? r > 0 : a < b;
354 } else {
355 sort(items.begin(), items.begin() + part_z,
356 [&decimal](const string& a, const string& b) {
357 int r = ncmp(a, b, decimal);
358 return r ? r > 0 : a > b;
361 } else {
362 stable_sort(items.begin(), items.begin() + part_z,
363 [&decimal](const string& a, const string& b) {
364 return ncmp(a, b, decimal) > 0;
367 for (size_t i = 0; i != part_z; ++i) {
368 const string& item = items[i];
369 // Skip duplicates.
370 if (prev && ncmp(*prev, item, decimal) == 0) {
371 continue;
373 prev = &item;
375 if (tab) {
376 value += '\t';
377 } else {
378 tab = true;
380 value += item;
385 if (part_z != part_f) {
386 // Handle all the zero values.
387 if (!uniq) {
388 if (!rev) {
389 sort(items.begin() + part_z, items.begin() + part_f,
390 [](const string& a, const string& b) {
391 return a < b;
393 } else {
394 sort(items.begin() + part_z, items.begin() + part_f,
395 [](const string& a, const string& b) {
396 return a > b;
399 } else {
400 if (tab) {
401 value += '\t';
402 } else {
403 tab = true;
405 // No need to actually sort this partition when uniq is true -
406 // we just take the first element.
407 value += items[part_z];
411 if (part_f != items.size()) {
412 // Sort the final partition.
413 if (!uniq) {
414 if (!rev) {
415 sort(items.begin() + part_f, items.end(),
416 [&decimal](const string& a, const string& b) {
417 int r = ncmp(a, b, decimal);
418 return r ? r < 0 : a < b;
420 } else {
421 sort(items.begin() + part_f, items.end(),
422 [&decimal](const string& a, const string& b) {
423 int r = ncmp(a, b, decimal);
424 return r ? r < 0 : a > b;
427 } else {
428 stable_sort(items.begin() + part_f, items.end(),
429 [&decimal](const string& a, const string& b) {
430 return ncmp(a, b, decimal) < 0;
433 for (size_t i = part_f; i != items.size(); ++i) {
434 const string& item = items[i];
435 // Skip duplicates.
436 if (prev && ncmp(*prev, item, decimal) == 0) {
437 continue;
439 prev = &item;
441 if (tab) {
442 value += '\t';
443 } else {
444 tab = true;
446 value += item;
451 // If uniq is true we already assembled the output in value as we processed
452 // each partition.
453 if (!uniq) {
454 for (auto&& item : items) {
455 if (tab) {
456 value += '\t';
457 } else {
458 tab = true;
460 value += item;