scriptindex: Fix weird error cases
[xapian.git] / xapian-applications / omega / scriptindex.cc
blob 528f6e630d34bc3312580cd89491a8b2f99fbf84
1 /** @file
2 * @brief index arbitrary data as described by an index script
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 Sam Liddicott
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2022 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 * USA
#include <config.h>

#include <xapian.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include <cerrno>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>

#include "commonhelp.h"
#include "hashterm.h"
#include "loadfile.h"
#include "myhtmlparse.h"
#include "parseint.h"
#include "setenv.h"
#include "str.h"
#include "stringutils.h"
#include "timegm.h"
#include "utf8truncate.h"
#include "utils.h"
#include "values.h"

#ifndef HAVE_STRPTIME
#include "portability/strptime.h"
#endif

#include "gnu_getopt.h"
64 using namespace std;
66 #define PROG_NAME "scriptindex"
67 #define PROG_DESC "index arbitrary data as described by an index script"
69 static bool verbose;
70 static int addcount;
71 static int repcount;
72 static int delcount;
74 static inline bool
75 prefix_needs_colon(const string & prefix, unsigned ch)
77 if (!C_isupper(ch) && ch != ':') return false;
78 string::size_type len = prefix.length();
79 return (len > 1 && prefix[len - 1] != ':');
// Printable name for each action code.  This table is indexed by action code
// (see DUMP_ACTION and the diagnostics which use action_names[code]), so the
// entries must be kept in the same order as the enumerators of Action::type.
const char * action_names[] = {
    // Actions used internally:
    "bad", "new",
    // Actual actions:
    "boolean", "date",
    "field",
    "gap",
    "hash", "hextobin",
    "index", "indexnopos",
    "load", "lower", "ltrim",
    "parsedate",
    "rtrim",
    "spell", "split", "squash",
    "trim", "truncate",
    "unhtml", "unique",
    "value", "valuenumeric", "valuepacked",
    "weight"
};
// For debugging: write action A to stdout as "name(string_arg,num_arg)"
// followed by a newline.
#define DUMP_ACTION(A) \
    cout << action_names[(A).get_action()] \
         << "(" << (A).get_string_arg() \
         << "," << (A).get_num_arg() \
         << ")" << endl
/** One step of an index script rule.
 *
 *  An Action records which operation to perform, an optional string argument,
 *  an optional integer argument, and the offset in the index script line at
 *  which the action appeared (used when reporting diagnostics).
 */
class Action {
  public:
    /** The operation to perform.
     *
     *  Values of this type are used as indexes into action_names[], so the
     *  two lists need to be kept in the same order.
     */
    enum type {
        // Actions used internally:
        BAD,
        NEW,
        // Actual actions:
        BOOLEAN,
        DATE,
        FIELD,
        GAP,
        HASH,
        HEXTOBIN,
        INDEX,
        INDEXNOPOS,
        LOAD,
        LOWER,
        LTRIM,
        PARSEDATE,
        RTRIM,
        SPELL,
        SPLIT,
        SQUASH,
        TRIM,
        TRUNCATE,
        UNHTML,
        UNIQUE,
        VALUE,
        VALUENUMERIC,
        VALUEPACKED,
        WEIGHT
    };

    /// Post-processing modes for SPLIT (stored in the integer argument).
    enum { SPLIT_NONE, SPLIT_DEDUP, SPLIT_SORT, SPLIT_PREFIXES };

  private:
    type action;

    int num_arg;

    std::string string_arg;

    // Offset into indexscript line.
    size_t pos;

    /** Convert the leading decimal integer in @a arg to an int.
     *
     *  Leading whitespace is skipped and parsing stops at the first
     *  character which can't be part of the number; 0 is returned if there
     *  are no digits.  Unlike atoi(), a value which doesn't fit in an int
     *  has defined behaviour: it saturates to INT_MIN or INT_MAX.
     */
    static int parse_num_arg(const std::string& arg) {
        // strtol() itself saturates to LONG_MIN/LONG_MAX on overflow.
        const long parsed = std::strtol(arg.c_str(), nullptr, 10);
        if (parsed > INT_MAX) return INT_MAX;
        if (parsed < INT_MIN) return INT_MIN;
        return static_cast<int>(parsed);
    }

  public:
    /// Action with no arguments (integer argument is 0, string is empty).
    Action(type action_, size_t pos_)
        : action(action_), num_arg(0), pos(pos_) { }

    /** Action with a string argument.
     *
     *  The integer argument is set by parsing @a arg as a decimal integer
     *  (0 if it doesn't start with one).
     */
    Action(type action_, size_t pos_, const std::string & arg)
        : action(action_), num_arg(parse_num_arg(arg)), string_arg(arg),
          pos(pos_) { }

    /// Action with independent string and integer arguments.
    Action(type action_, size_t pos_, const std::string & arg, int num)
        : action(action_), num_arg(num), string_arg(arg), pos(pos_) { }

    type get_action() const { return action; }

    int get_num_arg() const { return num_arg; }

    void set_num_arg(int num) { num_arg = num; }

    const std::string & get_string_arg() const { return string_arg; }

    size_t get_pos() const { return pos; }
};
171 // These allow searching for an Action with a particular Action::type using
172 // std::find().
174 inline bool
175 operator==(const Action& a, Action::type t) { return a.get_action() == t; }
177 inline bool
178 operator==(Action::type t, const Action& a) { return a.get_action() == t; }
180 inline bool
181 operator!=(const Action& a, Action::type t) { return !(a == t); }
183 inline bool
184 operator!=(Action::type t, const Action& a) { return !(t == a); }
/** Remove leading characters from @a s which appear in @a chars.
 *
 *  If every character of @a s is in @a chars then @a s becomes empty.
 */
static void
ltrim(std::string& s, const std::string& chars)
{
    // find_first_not_of() returns npos when nothing is to be kept, and
    // erase(0, npos) then removes everything; erase(0, 0) is a no-op.
    s.erase(0, s.find_first_not_of(chars));
}
/** Remove trailing characters from @a s which appear in @a chars.
 *
 *  If every character of @a s is in @a chars then @a s becomes empty.
 */
static void
rtrim(std::string& s, const std::string& chars)
{
    const std::string::size_type last_kept = s.find_last_not_of(chars);
    if (last_kept == std::string::npos) {
        // Nothing survives the trim.
        s.clear();
    } else {
        s.erase(last_kept + 1);
    }
}
/** Collapse runs of characters from @a chars in @a s.
 *
 *  Each maximal run of characters from @a chars which lies between two other
 *  characters is replaced by a single space; leading and trailing runs are
 *  removed entirely.
 */
static void
squash(std::string& s, const std::string& chars)
{
    std::string result;
    result.reserve(s.size());

    std::string::size_type start = s.find_first_not_of(chars);
    while (start != std::string::npos) {
        // [start, stop) is the next run of characters to keep.
        std::string::size_type stop = s.find_first_of(chars, start);
        if (stop == std::string::npos) stop = s.size();

        if (!result.empty()) result += ' ';
        result.append(s, start, stop - start);

        start = s.find_first_not_of(chars, stop);
    }

    s = std::move(result);
}
// Severity of a diagnostic, as passed to report_location().
enum diag_type {
    DIAG_ERROR,
    DIAG_WARN,
    DIAG_NOTE
};

// Count of DIAG_ERROR diagnostics reported so far (report_location()
// increments this each time it is called with DIAG_ERROR).
static unsigned error_count = 0;
218 static void
219 report_location(enum diag_type type,
220 const string& filename,
221 size_t line = 0,
222 size_t pos = string::npos)
224 cerr << filename;
225 if (line != 0) {
226 cerr << ':' << line;
227 if (pos != string::npos) {
228 // The first column is numbered 1.
229 cerr << ':' << pos + 1;
232 switch (type) {
233 case DIAG_ERROR:
234 cerr << ": error: ";
235 ++error_count;
236 break;
237 case DIAG_WARN:
238 cerr << ": warning: ";
239 break;
240 case DIAG_NOTE:
241 cerr << ": note: ";
242 break;
246 static void
247 report_useless_action(const string &file, size_t line, size_t pos,
248 const string &action)
250 report_location(DIAG_WARN, file, line, pos);
251 cerr << "Index action '" << action << "' has no effect" << endl;
253 static bool given_left_to_right_warning = false;
254 if (!given_left_to_right_warning) {
255 given_left_to_right_warning = true;
256 report_location(DIAG_NOTE, file, line, pos);
257 cerr << "Actions are executed from left to right" << endl;
261 static map<string, vector<Action>> index_spec;
263 static void
264 parse_index_script(const string &filename)
266 ifstream script(filename.c_str());
267 if (!script.is_open()) {
268 report_location(DIAG_ERROR, filename);
269 cerr << strerror(errno) << endl;
270 exit(1);
272 string line;
273 size_t line_no = 0;
274 // Line number where we saw a `unique` action, or 0 if we haven't.
275 int unique_line_no = 0;
276 // Offset into line unique_line_no where the `unique` action was.
277 size_t unique_pos = 0;
278 while (getline(script, line)) {
279 ++line_no;
280 vector<string> fields;
281 vector<Action> actions;
282 string::const_iterator i, j;
283 const string &s = line;
284 i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
285 if (i == s.end() || *i == '#') {
286 // Blank line or comment.
287 continue;
289 while (true) {
290 if (!C_isalnum(*i)) {
291 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
292 cerr << "field name must start with alphanumeric" << endl;
294 j = find_if(i + 1, s.end(),
295 [](char ch) { return !C_isalnum(ch) && ch != '_'; });
296 fields.push_back(string(i, j));
297 i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
298 if (i == s.end()) break;
299 if (*i == ':') {
300 ++i;
301 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
302 break;
304 if (i == j) {
305 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
306 cerr << "bad character '" << *i << "' in field name" << endl;
307 ++i;
308 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
309 if (i == s.end()) break;
312 Xapian::termcount weight = 1;
313 size_t useless_weight_pos = string::npos;
314 map<string, Action::type> boolmap;
315 j = i;
316 while (j != s.end()) {
317 size_t action_pos = j - s.begin();
318 i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
319 string action(s, j - s.begin(), i - j);
320 Action::type code = Action::BAD;
321 unsigned min_args = 0, max_args = 0;
322 bool takes_integer_argument = false;
323 if (!action.empty()) {
324 switch (action[0]) {
325 case 'b':
326 if (action == "boolean") {
327 code = Action::BOOLEAN;
328 max_args = 1;
330 break;
331 case 'd':
332 if (action == "date") {
333 code = Action::DATE;
334 min_args = max_args = 1;
336 break;
337 case 'f':
338 if (action == "field") {
339 code = Action::FIELD;
340 max_args = 1;
342 break;
343 case 'g':
344 if (action == "gap") {
345 code = Action::GAP;
346 max_args = 1;
347 takes_integer_argument = true;
349 break;
350 case 'h':
351 if (action == "hash") {
352 code = Action::HASH;
353 max_args = 1;
354 takes_integer_argument = true;
355 } else if (action == "hextobin") {
356 code = Action::HEXTOBIN;
358 break;
359 case 'i':
360 if (action == "index") {
361 code = Action::INDEX;
362 max_args = 1;
363 } else if (action == "indexnopos") {
364 code = Action::INDEXNOPOS;
365 max_args = 1;
367 break;
368 case 'l':
369 if (action == "lower") {
370 code = Action::LOWER;
371 } else if (action == "load") {
372 code = Action::LOAD;
373 } else if (action == "ltrim") {
374 code = Action::LTRIM;
375 max_args = 1;
377 break;
378 case 'p':
379 if (action == "parsedate") {
380 code = Action::PARSEDATE;
381 min_args = max_args = 1;
383 break;
384 case 'r':
385 if (action == "rtrim") {
386 code = Action::RTRIM;
387 max_args = 1;
389 break;
390 case 's':
391 if (action == "spell") {
392 code = Action::SPELL;
393 } else if (action == "split") {
394 code = Action::SPLIT;
395 min_args = 1;
396 max_args = 2;
397 } else if (action == "squash") {
398 code = Action::SQUASH;
399 max_args = 1;
401 break;
402 case 't':
403 if (action == "truncate") {
404 code = Action::TRUNCATE;
405 min_args = max_args = 1;
406 takes_integer_argument = true;
407 } else if (action == "trim") {
408 code = Action::TRIM;
409 max_args = 1;
411 break;
412 case 'u':
413 if (action == "unhtml") {
414 code = Action::UNHTML;
415 } else if (action == "unique") {
416 code = Action::UNIQUE;
417 min_args = max_args = 1;
419 break;
420 case 'v':
421 if (action == "value") {
422 code = Action::VALUE;
423 min_args = max_args = 1;
424 takes_integer_argument = true;
425 } else if (action == "valuenumeric") {
426 code = Action::VALUENUMERIC;
427 min_args = max_args = 1;
428 takes_integer_argument = true;
429 } else if (action == "valuepacked") {
430 code = Action::VALUEPACKED;
431 min_args = max_args = 1;
432 takes_integer_argument = true;
434 break;
435 case 'w':
436 if (action == "weight") {
437 code = Action::WEIGHT;
438 min_args = max_args = 1;
439 // Don't set takes_integer_argument since we parse
440 // it with parse_unsigned() and issue an error there
441 // - setting takes_integer_argument would give a
442 // double error for arguments with a decimal point.
444 break;
447 if (code == Action::BAD) {
448 report_location(DIAG_ERROR, filename, line_no, action_pos);
449 cerr << "Unknown index action '" << action << "'" << endl;
451 auto i_after_action = i;
452 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
454 if (i != s.end() && *i == '=') {
455 if (i != i_after_action) {
456 report_location(DIAG_WARN, filename, line_no,
457 i_after_action - s.begin());
458 cerr << "putting spaces between the action and '=' is "
459 "deprecated" << endl;
462 if (max_args == 0) {
463 report_location(DIAG_ERROR, filename, line_no,
464 i - s.begin());
465 cerr << "Index action '" << action
466 << "' doesn't take an argument" << endl;
469 ++i;
470 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
471 if (i != j) {
472 report_location(DIAG_WARN, filename, line_no,
473 i - s.begin());
474 cerr << "putting spaces between '=' and the argument is "
475 "deprecated" << endl;
478 vector<string> vals;
479 while (true) {
480 if (j != s.end() && *j == '"') {
481 // Quoted argument.
482 ++j;
483 string arg;
484 while (true) {
485 i = find_if(j, s.end(),
486 [](char ch) {
487 return ch == '"' || ch == '\\';
489 if (i == s.end()) {
490 report_location(DIAG_ERROR, filename, line_no,
491 s.size());
492 cerr << "No closing quote" << endl;
493 break;
495 arg.append(j, i);
496 if (*i++ == '"')
497 break;
499 // Escape sequence.
500 if (i == s.end()) {
501 bad_escaping:
502 report_location(DIAG_ERROR, filename, line_no,
503 i - s.begin());
504 cerr << "Bad escaping in quoted action argument"
505 << endl;
506 break;
509 char ch = *i;
510 switch (ch) {
511 case '\\':
512 case '"':
513 break;
514 case '0':
515 ch = '\0';
516 break;
517 case 'n':
518 ch = '\n';
519 break;
520 case 'r':
521 ch = '\r';
522 break;
523 case 't':
524 ch = '\t';
525 break;
526 case 'x': {
527 if (++i == s.end())
528 goto bad_escaping;
529 char ch1 = *i;
530 if (!C_isxdigit(ch1)) {
531 bad_hex_digit:
532 report_location(DIAG_ERROR, filename,
533 line_no, i - s.begin());
534 cerr << "Bad hex digit in escaping\n";
535 --i;
536 break;
538 if (++i == s.end())
539 goto bad_escaping;
540 char ch2 = *i;
541 if (!C_isxdigit(ch2)) {
542 goto bad_hex_digit;
544 ch = hex_digit(ch1) << 4 |
545 hex_digit(ch2);
546 break;
548 default:
549 report_location(DIAG_ERROR, filename,
550 line_no, i - s.begin());
551 cerr << "Bad escape sequence '\\" << ch
552 << "'\n";
553 break;
555 arg += ch;
556 j = i + 1;
558 vals.emplace_back(std::move(arg));
559 if (i == s.end() || C_isspace(*i)) break;
560 if (*i == ',') {
561 ++i;
562 } else {
563 report_location(DIAG_ERROR, filename, line_no,
564 i - s.begin());
565 cerr << "Unexpected character '" << *i
566 << "' after closing quote" << endl;
567 do {
568 ++i;
569 } while (i != s.end() && *i != ',' && !C_isspace(*i));
570 if (*i != ',') break;
571 ++i;
573 } else if (max_args > 1) {
574 // Unquoted argument, split on comma.
575 i = find_if(j, s.end(),
576 [](char ch) {
577 return C_isspace(ch) || ch == ',';
579 vals.emplace_back(j, i);
580 if (*i != ',') break;
581 ++i;
582 } else {
583 // Unquoted argument, including any commas.
584 i = find_if(j, s.end(),
585 [](char ch) { return C_isspace(ch); });
586 vals.emplace_back(j, i);
587 break;
589 j = i;
591 if (vals.size() == max_args) {
592 report_location(DIAG_ERROR, filename, line_no,
593 i - s.begin());
594 cerr << "Index action '" << action
595 << "' takes at most " << max_args << " arguments"
596 << endl;
600 if (vals.size() < min_args) {
601 report_location(DIAG_ERROR, filename, line_no,
602 i - s.begin());
603 if (min_args == max_args) {
604 cerr << "Index action '" << action
605 << "' requires " << min_args << " arguments"
606 << endl;
607 } else {
608 cerr << "Index action '" << action
609 << "' requires at least " << min_args << " arguments"
610 << endl;
612 // Allow action handling code to assume there are min_args
613 // arguments.
614 vals.resize(min_args);
617 string val;
618 if (!vals.empty()) {
619 val = vals.front();
622 if (takes_integer_argument) {
623 auto dot = val.find('.');
624 if (dot != string::npos) {
625 report_location(DIAG_ERROR, filename, line_no,
626 j - s.begin() + dot);
627 cerr << "Index action '" << action
628 << "' takes an integer argument" << endl;
631 switch (code) {
632 case Action::DATE:
633 if (val != "unix" &&
634 val != "unixutc" &&
635 val != "yyyymmdd") {
636 report_location(DIAG_ERROR, filename, line_no,
637 j - s.begin());
638 cerr << "Invalid parameter '" << val << "' for "
639 "action 'date'" << endl;
641 actions.emplace_back(code, action_pos, val);
642 break;
643 case Action::INDEX:
644 case Action::INDEXNOPOS:
645 actions.emplace_back(code, action_pos, val, weight);
646 useless_weight_pos = string::npos;
647 break;
648 case Action::WEIGHT:
649 // We don't push an Action for WEIGHT - instead we
650 // store it ready to use in the INDEX and INDEXNOPOS
651 // Actions.
652 if (!parse_unsigned(val.c_str(), weight)) {
653 report_location(DIAG_ERROR, filename, line_no,
654 j - s.begin());
655 cerr << "Index action 'weight' takes a "
656 "non-negative integer argument" << endl;
657 weight = 0;
659 if (useless_weight_pos != string::npos) {
660 report_useless_action(filename, line_no,
661 useless_weight_pos, action);
663 useless_weight_pos = action_pos;
664 break;
665 case Action::PARSEDATE: {
666 auto bad_code = val.find("%Z");
667 if (bad_code != val.npos) {
668 report_location(DIAG_ERROR, filename, line_no,
669 j - s.begin() + bad_code);
670 cerr << "Parsing timezone names with %Z is not supported" << endl;
672 #ifndef HAVE_STRUCT_TM_TM_GMTOFF
673 bad_code = val.find("%z");
674 if (bad_code != val.npos) {
675 report_location(DIAG_ERROR, filename, line_no,
676 j - s.begin() + bad_code);
677 cerr << "Parsing timezone offsets with %z is not supported on "
678 "this platform" << endl;
680 #endif
681 actions.emplace_back(code, action_pos, val);
682 break;
684 case Action::SPLIT: {
685 if (val.empty()) {
686 report_location(DIAG_ERROR, filename, line_no,
687 j - s.begin());
688 cerr << "Split delimiter can't be empty" << endl;
690 int operation = Action::SPLIT_NONE;
691 if (vals.size() >= 2) {
692 if (vals[1] == "dedup") {
693 operation = Action::SPLIT_DEDUP;
694 } else if (vals[1] == "sort") {
695 operation = Action::SPLIT_SORT;
696 } else if (vals[1] == "none") {
697 operation = Action::SPLIT_NONE;
698 } else if (vals[1] == "prefixes") {
699 operation = Action::SPLIT_PREFIXES;
700 } else {
701 // FIXME: Column should be for where the `op`
702 // parameter starts, which this isn't if the
703 // value is quoted, contains escape sequences,
704 // etc.
705 report_location(DIAG_ERROR, filename, line_no,
706 i - s.begin() - vals[1].size());
707 cerr << "Bad split operation '" << vals[1]
708 << "'" << endl;
711 actions.emplace_back(code, action_pos, val, operation);
712 break;
714 case Action::TRUNCATE:
715 if (!actions.empty() &&
716 actions.back().get_action() == Action::LOAD) {
717 /* Turn "load truncate=n" into "load" with
718 * num_arg n, so that we don't needlessly
719 * allocate memory and read data we're just
720 * going to ignore.
722 actions.pop_back();
723 code = Action::LOAD;
725 actions.emplace_back(code, action_pos, val);
726 break;
727 case Action::UNIQUE:
728 if (unique_line_no) {
729 report_location(DIAG_ERROR, filename, line_no,
730 action_pos);
731 cerr << "Index action 'unique' used more than once"
732 << endl;
733 report_location(DIAG_NOTE, filename,
734 unique_line_no, unique_pos);
735 cerr << "Previously used here" << endl;
737 unique_line_no = line_no;
738 unique_pos = action_pos;
739 if (boolmap.find(val) == boolmap.end())
740 boolmap[val] = Action::UNIQUE;
741 actions.emplace_back(code, action_pos, val);
742 break;
743 case Action::GAP: {
744 actions.emplace_back(code, action_pos, val);
745 auto& obj = actions.back();
746 auto gap_size = obj.get_num_arg();
747 if (gap_size <= 0) {
748 report_location(DIAG_ERROR, filename, line_no,
749 obj.get_pos() + 3 + 1);
750 cerr << "Index action 'gap' takes a strictly "
751 "positive integer argument" << endl;
753 break;
755 case Action::HASH: {
756 actions.emplace_back(code, action_pos, val);
757 auto& obj = actions.back();
758 auto max_length = obj.get_num_arg();
759 if (max_length < 6) {
760 report_location(DIAG_ERROR, filename, line_no,
761 obj.get_pos() + 4 + 1);
762 cerr << "Index action 'hash' takes an integer "
763 "argument which must be at least 6" << endl;
765 break;
767 case Action::LTRIM:
768 case Action::RTRIM:
769 case Action::SQUASH:
770 case Action::TRIM:
771 for (unsigned char ch : val) {
772 if (ch >= 0x80) {
773 auto column = actions.back().get_pos() +
774 strlen(action_names[code]) + 1;
775 report_location(DIAG_ERROR, filename, line_no,
776 column);
777 cerr << "Index action '" << action_names[code]
778 << "' only support ASCII characters "
779 "currently\n";
782 actions.emplace_back(code, action_pos, val);
783 break;
784 case Action::BOOLEAN:
785 boolmap[val] = Action::BOOLEAN;
786 /* FALLTHRU */
787 default:
788 actions.emplace_back(code, action_pos, val);
790 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
791 } else {
792 if (min_args > 0) {
793 report_location(DIAG_ERROR, filename, line_no,
794 i_after_action - s.begin());
795 if (min_args == max_args) {
796 cerr << "Index action '" << action << "' requires "
797 << min_args << " arguments" << endl;
798 } else {
799 cerr << "Index action '" << action << "' requires at least "
800 << min_args << " arguments" << endl;
803 switch (code) {
804 case Action::INDEX:
805 case Action::INDEXNOPOS:
806 useless_weight_pos = string::npos;
807 actions.emplace_back(code, action_pos, "", weight);
808 break;
809 case Action::GAP:
810 actions.emplace_back(code, action_pos, "", 100);
811 break;
812 case Action::HASH:
813 actions.emplace_back(code, action_pos, "",
814 MAX_SAFE_TERM_LENGTH - 1);
815 break;
816 case Action::LTRIM:
817 case Action::RTRIM:
818 case Action::SQUASH:
819 case Action::TRIM:
820 actions.emplace_back(code, action_pos, " \t\f\v\r\n");
821 break;
822 default:
823 actions.emplace_back(code, action_pos);
824 break;
827 j = i;
830 if (useless_weight_pos != string::npos) {
831 report_useless_action(filename, line_no, useless_weight_pos,
832 "weight");
835 while (!actions.empty()) {
836 bool done = true;
837 Action::type action = actions.back().get_action();
838 switch (action) {
839 case Action::HASH:
840 case Action::HEXTOBIN:
841 case Action::LOWER:
842 case Action::LTRIM:
843 case Action::PARSEDATE:
844 case Action::RTRIM:
845 case Action::SPELL:
846 case Action::SQUASH:
847 case Action::TRIM:
848 case Action::TRUNCATE:
849 case Action::UNHTML:
850 done = false;
851 report_useless_action(filename, line_no,
852 actions.back().get_pos(),
853 action_names[action]);
854 actions.pop_back();
855 break;
856 default:
857 break;
859 if (done) break;
862 map<string, Action::type>::const_iterator boolpfx;
863 for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
864 if (boolpfx->second == Action::UNIQUE) {
865 report_location(DIAG_WARN, filename, unique_line_no,
866 unique_pos);
867 cerr << "Index action 'unique=" << boolpfx->first
868 << "' without 'boolean=" << boolpfx->first << "'" << endl;
869 static bool given_doesnt_imply_boolean_warning = false;
870 if (!given_doesnt_imply_boolean_warning) {
871 given_doesnt_imply_boolean_warning = true;
872 report_location(DIAG_NOTE, filename, unique_line_no,
873 unique_pos);
874 cerr << "'unique' doesn't implicitly add a boolean term"
875 << endl;
880 vector<string>::const_iterator field;
881 for (field = fields.begin(); field != fields.end(); ++field) {
882 vector<Action> &v = index_spec[*field];
883 if (v.empty()) {
884 if (fields.size() == 1) {
885 // Optimise common case where there's only one fieldname
886 // for a list of actions.
887 v = std::move(actions);
888 } else {
889 v = actions;
891 } else {
892 v.emplace_back(Action::NEW, string::npos);
893 v.insert(v.end(), actions.begin(), actions.end());
898 if (index_spec.empty()) {
899 report_location(DIAG_ERROR, filename, line_no);
900 cerr << "No rules found in index script" << endl;
903 if (error_count) {
904 exit(1);
908 static bool
909 run_actions(vector<Action>::const_iterator action_it,
910 vector<Action>::const_iterator action_end,
911 Xapian::WritableDatabase& database,
912 Xapian::TermGenerator& indexer,
913 const string& old_value,
914 bool& this_field_is_content, Xapian::Document& doc,
915 map<string, list<string>>& fields,
916 string& field, const char* fname,
917 size_t line_no, Xapian::docid& docid)
919 string value = old_value;
920 while (action_it != action_end) {
921 auto& action = *action_it++;
922 switch (action.get_action()) {
923 case Action::BAD:
924 abort();
925 case Action::NEW:
926 value = old_value;
927 // We're processing the same field again - give it a reprieve.
928 this_field_is_content = true;
929 break;
930 case Action::FIELD:
931 if (!value.empty()) {
932 string f = action.get_string_arg();
933 if (f.empty()) f = field;
934 // replace newlines with spaces
935 string s = value;
936 string::size_type j = 0;
937 while ((j = s.find('\n', j)) != string::npos)
938 s[j] = ' ';
939 fields[f].push_back(s);
941 break;
942 case Action::INDEX:
943 indexer.index_text(value,
944 action.get_num_arg(),
945 action.get_string_arg());
946 break;
947 case Action::INDEXNOPOS:
948 // No positional information so phrase searching won't work.
949 // However, the database will use much less diskspace.
950 indexer.index_text_without_positions(value,
951 action.get_num_arg(),
952 action.get_string_arg());
953 break;
954 case Action::BOOLEAN: {
955 // Do nothing if there's no text.
956 if (value.empty()) break;
958 string term = action.get_string_arg();
959 if (prefix_needs_colon(term, value[0])) term += ':';
960 term += value;
962 doc.add_boolean_term(term);
963 break;
965 case Action::GAP:
966 indexer.increase_termpos(action.get_num_arg());
967 break;
968 case Action::HASH: {
969 unsigned int max_length = action.get_num_arg();
970 if (value.length() > max_length)
971 value = hash_long_term(value, max_length);
972 break;
974 case Action::HEXTOBIN: {
975 size_t len = value.length();
976 if (len & 1) {
977 report_location(DIAG_ERROR, fname, line_no);
978 cerr << "hextobin: input must have even length"
979 << endl;
980 exit(1);
983 string output;
984 output.reserve(len / 2);
985 for (size_t j = 0; j < len; j += 2) {
986 char a = value[j];
987 char b = value[j + 1];
988 if (!C_isxdigit(a) || !C_isxdigit(b)) {
989 report_location(DIAG_ERROR, fname, line_no);
990 cerr << "hextobin: input must be all hex digits\n";
991 exit(1);
993 char r = (hex_digit(a) << 4) | hex_digit(b);
994 output.push_back(r);
996 value = std::move(output);
997 break;
999 case Action::LOWER:
1000 value = Xapian::Unicode::tolower(value);
1001 break;
1002 case Action::LTRIM:
1003 ltrim(value, action.get_string_arg());
1004 break;
1005 case Action::RTRIM:
1006 rtrim(value, action.get_string_arg());
1007 break;
1008 case Action::TRIM:
1009 rtrim(value, action.get_string_arg());
1010 ltrim(value, action.get_string_arg());
1011 break;
1012 case Action::SQUASH:
1013 squash(value, action.get_string_arg());
1014 break;
1015 case Action::LOAD: {
1016 // If there's no input, just issue a warning.
1017 if (value.empty()) {
1018 report_location(DIAG_WARN, fname, line_no);
1019 cerr << "Empty filename in LOAD action" << endl;
1020 break;
1022 bool truncated = false;
1023 string filename = std::move(value);
1024 // FIXME: Use NOATIME if we own the file or are root.
1025 if (!load_file(filename, action.get_num_arg(), NOCACHE,
1026 value, truncated)) {
1027 report_location(DIAG_ERROR, fname, line_no);
1028 cerr << "Couldn't load file '" << filename << "': "
1029 << strerror(errno) << endl;
1030 exit(1);
1032 if (!truncated) break;
1034 /* FALLTHRU */
1035 case Action::TRUNCATE:
1036 utf8_truncate(value, action.get_num_arg());
1037 break;
1038 case Action::SPELL:
1039 indexer.set_flags(indexer.FLAG_SPELLING);
1040 break;
1041 case Action::SPLIT: {
1042 // Find the end of the actions which split should execute.
1043 auto split_end = find(action_it, action_end, Action::NEW);
1045 int split_type = action.get_num_arg();
1046 if (value.empty()) {
1047 // Nothing to do.
1048 } else if (split_type != Action::SPLIT_SORT) {
1049 // Generate split as we consume it.
1050 const string& delimiter = action.get_string_arg();
1052 unique_ptr<unordered_set<string>> seen;
1053 if (split_type == Action::SPLIT_DEDUP) {
1054 seen.reset(new unordered_set<string>);
1057 if (delimiter.size() == 1) {
1058 // Special case for common single character delimiter.
1059 char ch = delimiter[0];
1060 string::size_type i = 0;
1061 while (true) {
1062 string::size_type j = value.find(ch, i);
1063 if (split_type == Action::SPLIT_PREFIXES) {
1064 if (j > 0) {
1065 string val(value, 0, j);
1066 run_actions(action_it, split_end,
1067 database, indexer,
1068 val,
1069 this_field_is_content, doc,
1070 fields,
1071 field, fname, line_no,
1072 docid);
1074 } else if (i != j) {
1075 string val(value, i, j - i);
1076 if (!seen.get() || seen->insert(val).second) {
1077 run_actions(action_it, split_end,
1078 database, indexer,
1079 val,
1080 this_field_is_content, doc,
1081 fields,
1082 field, fname, line_no,
1083 docid);
1086 if (j == string::npos) break;
1087 i = j + 1;
1089 } else {
1090 string::size_type i = 0;
1091 while (true) {
1092 string::size_type j = value.find(delimiter, i);
1093 if (split_type == Action::SPLIT_PREFIXES) {
1094 if (j > 0) {
1095 string val(value, 0, j);
1096 run_actions(action_it, split_end,
1097 database, indexer,
1098 val,
1099 this_field_is_content, doc,
1100 fields,
1101 field, fname, line_no,
1102 docid);
1104 } else if (i != j) {
1105 string val(value, i, j - i);
1106 if (!seen.get() || seen->insert(val).second) {
1107 run_actions(action_it, split_end,
1108 database, indexer,
1109 val,
1110 this_field_is_content, doc,
1111 fields,
1112 field, fname, line_no,
1113 docid);
1116 if (j == string::npos) break;
1117 i = j + delimiter.size();
1120 } else {
1121 vector<string> split_values;
1122 const string& delimiter = action.get_string_arg();
1123 if (delimiter.size() == 1) {
1124 // Special case for common single character delimiter.
1125 char ch = delimiter[0];
1126 string::size_type i = 0;
1127 while (true) {
1128 string::size_type j = value.find(ch, i);
1129 if (i != j) {
1130 split_values.emplace_back(value, i, j - i);
1132 if (j == string::npos) break;
1133 i = j + 1;
1135 } else {
1136 string::size_type i = 0;
1137 while (true) {
1138 string::size_type j = value.find(delimiter, i);
1139 if (i != j) {
1140 split_values.emplace_back(value, i, j - i);
1142 if (j == string::npos) break;
1143 i = j + delimiter.size();
1147 sort(split_values.begin(), split_values.end());
1149 for (auto&& val : split_values) {
1150 run_actions(action_it, split_end,
1151 database, indexer, val,
1152 this_field_is_content, doc, fields,
1153 field, fname, line_no,
1154 docid);
1158 action_it = split_end;
1159 break;
1161 case Action::UNHTML: {
1162 MyHtmlParser p;
1163 try {
1164 // Default HTML character set is latin 1, though
1165 // not specifying one is deprecated these days.
1166 p.parse_html(value, "iso-8859-1", false);
1167 } catch (const string & newcharset) {
1168 p.reset();
1169 p.parse_html(value, newcharset, true);
1171 if (p.indexing_allowed)
1172 value = p.dump;
1173 else
1174 value = "";
1175 break;
1177 case Action::UNIQUE: {
1178 // If there's no text, just issue a warning.
1179 if (value.empty()) {
1180 report_location(DIAG_WARN, fname, line_no);
1181 cerr << "Ignoring UNIQUE action on empty text"
1182 << endl;
1183 break;
1186 // Ensure that the value of this field is unique.
1187 // If a record already exists with the same value,
1188 // it will be replaced with the new record.
1190 // Unique fields aren't considered content - if
1191 // there are no other fields in the document, the
1192 // document is to be deleted.
1193 this_field_is_content = false;
1195 // Argument is the prefix to add to the field value
1196 // to get the unique term.
1197 string t = action.get_string_arg();
1198 if (prefix_needs_colon(t, value[0])) t += ':';
1199 t += value;
1200 Xapian::PostingIterator p = database.postlist_begin(t);
1201 if (p != database.postlist_end(t)) {
1202 docid = *p;
1204 break;
1206 case Action::VALUE:
1207 if (!value.empty())
1208 doc.add_value(action.get_num_arg(), value);
1209 break;
1210 case Action::VALUENUMERIC: {
1211 if (value.empty()) break;
1212 char * end;
1213 double dbl = strtod(value.c_str(), &end);
1214 if (*end) {
1215 report_location(DIAG_WARN, fname, line_no);
1216 cerr << "Trailing characters in VALUENUMERIC: '"
1217 << value << "'" << endl;
1219 doc.add_value(action.get_num_arg(),
1220 Xapian::sortable_serialise(dbl));
1221 break;
1223 case Action::VALUEPACKED: {
1224 uint32_t word = 0;
1225 if (value.empty() || !C_isdigit(value[0])) {
1226 // strtoul() accepts leading whitespace and negated
1227 // values, neither of which we want to allow.
1228 errno = EINVAL;
1229 } else {
1230 errno = 0;
1231 char* q;
1232 word = strtoul(value.c_str(), &q, 10);
1233 if (!errno && *q != '\0') {
1234 // Trailing characters after converted value.
1235 errno = EINVAL;
1238 if (errno) {
1239 report_location(DIAG_WARN, fname, line_no);
1240 cerr << "valuepacked \"" << value << "\" ";
1241 if (errno == ERANGE) {
1242 cerr << "out of range";
1243 } else {
1244 cerr << "not an unsigned integer";
1246 cerr << endl;
1248 int valueslot = action.get_num_arg();
1249 doc.add_value(valueslot, int_to_binary_string(word));
1250 break;
1252 case Action::DATE: {
1253 // Do nothing for empty input.
1254 if (value.empty()) break;
1256 const string & type = action.get_string_arg();
1257 string yyyymmdd;
1258 if (type == "unix") {
1259 time_t t;
1260 if (!parse_signed(value.c_str(), t)) {
1261 report_location(DIAG_WARN, fname, line_no);
1262 cerr << "Date value (in secs) for action DATE "
1263 "must be an integer - ignoring" << endl;
1264 break;
1266 struct tm *tm = localtime(&t);
1267 int y = tm->tm_year + 1900;
1268 int m = tm->tm_mon + 1;
1269 yyyymmdd = date_to_string(y, m, tm->tm_mday);
1270 } else if (type == "unixutc") {
1271 time_t t;
1272 if (!parse_signed(value.c_str(), t)) {
1273 report_location(DIAG_WARN, fname, line_no);
1274 cerr << "Date value (in secs) for action DATE "
1275 "must be an integer - ignoring" << endl;
1276 break;
1278 struct tm *tm = gmtime(&t);
1279 int y = tm->tm_year + 1900;
1280 int m = tm->tm_mon + 1;
1281 yyyymmdd = date_to_string(y, m, tm->tm_mday);
1282 } else if (type == "yyyymmdd") {
1283 if (value.length() != 8) {
1284 report_location(DIAG_WARN, fname, line_no);
1285 cerr << "date=yyyymmdd expects an 8 character value "
1286 "- ignoring" << endl;
1287 break;
1289 yyyymmdd = value;
1292 // Date (YYYYMMDD)
1293 doc.add_boolean_term("D" + yyyymmdd);
1294 yyyymmdd.resize(6);
1295 // Month (YYYYMM)
1296 doc.add_boolean_term("M" + yyyymmdd);
1297 yyyymmdd.resize(4);
1298 // Year (YYYY)
1299 doc.add_boolean_term("Y" + yyyymmdd);
1300 break;
1302 case Action::PARSEDATE: {
1303 string dateformat = action.get_string_arg();
1304 struct tm tm;
1305 memset(&tm, 0, sizeof(tm));
1306 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
1307 if (ret == NULL) {
1308 report_location(DIAG_WARN, fname, line_no);
1309 cerr << "\"" << value << "\" doesn't match format "
1310 "\"" << dateformat << '\"' << endl;
1311 break;
1314 if (*ret != '\0') {
1315 report_location(DIAG_WARN, fname, line_no);
1316 cerr << "\"" << value << "\" not fully matched by "
1317 "format \"" << dateformat << "\" "
1318 "(\"" << ret << "\" left over) but "
1319 "indexing anyway" << endl;
1321 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1322 auto gmtoff = tm.tm_gmtoff;
1323 #endif
1324 auto secs_since_epoch = timegm(&tm);
1325 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1326 secs_since_epoch -= gmtoff;
1327 #endif
1328 value = str(secs_since_epoch);
1329 break;
1331 default:
1332 /* Empty default case to avoid "unhandled enum value"
1333 * warnings. */
1334 break;
1337 return true;
1340 static void
1341 index_file(const char *fname, istream &stream,
1342 Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
1344 string line;
1345 size_t line_no = 0;
1346 while (!stream.eof() && getline(stream, line)) {
1347 ++line_no;
1348 Xapian::Document doc;
1349 indexer.set_document(doc);
1350 Xapian::docid docid = 0;
1351 map<string, list<string>> fields;
1352 bool seen_content = false;
1353 while (!line.empty()) {
1354 // Cope with files from MS Windows (\r\n end of lines).
1355 // Trim multiple \r characters, since that seems the best way
1356 // to handle that case.
1357 string::size_type last = line.find_last_not_of('\r');
1358 if (last == string::npos) break;
1359 line.resize(last + 1);
1361 string::size_type eq = line.find('=');
1362 if (eq == string::npos && !line.empty()) {
1363 report_location(DIAG_ERROR, fname, line_no, line.size());
1364 cerr << "expected = somewhere in this line" << endl;
1365 exit(1);
1367 string field(line, 0, eq);
1368 string value(line, eq + 1, string::npos);
1369 line.clear();
1370 while (getline(stream, line)) {
1371 ++line_no;
1372 if (line.empty() || line[0] != '=') break;
1373 // Cope with files from MS Windows (\r\n end of lines).
1374 // Trim multiple \r characters, since that seems the best way
1375 // to handle that case.
1376 last = line.find_last_not_of('\r');
1377 // line[0] == '=', so last != string::npos.
1378 // Replace the '=' with a '\n' so we don't have to use substr.
1379 line[0] = '\n';
1380 line.resize(last + 1);
1381 value += line;
1384 // Default to not indexing spellings.
1385 indexer.set_flags(Xapian::TermGenerator::flags(0));
1387 bool this_field_is_content = true;
1388 const vector<Action>& v = index_spec[field];
1389 run_actions(v.begin(), v.end(),
1390 database, indexer, value,
1391 this_field_is_content, doc, fields,
1392 field, fname, line_no,
1393 docid);
1394 if (this_field_is_content) seen_content = true;
1397 // If we haven't seen any fields (other than unique identifiers)
1398 // the document is to be deleted.
1399 if (!seen_content) {
1400 if (docid) {
1401 database.delete_document(docid);
1402 if (verbose) cout << "Del: " << docid << endl;
1403 ++delcount;
1405 } else {
1406 string data;
1407 for (auto&& i : fields) {
1408 for (auto&& field_val : i.second) {
1409 data += i.first;
1410 data += '=';
1411 data += field_val;
1412 data += '\n';
1416 // Put the data in the document
1417 doc.set_data(data);
1419 // Add the document to the database
1420 if (docid) {
1421 database.replace_document(docid, doc);
1422 if (verbose) cout << "Replace: " << docid << endl;
1423 ++repcount;
1424 } else {
1425 docid = database.add_document(doc);
1426 if (verbose) cout << "Add: " << docid << endl;
1427 ++addcount;
1432 // Commit after each file to make sure all changes from that file make it
1433 // in.
1434 if (verbose) cout << "Committing: " << endl;
1435 database.commit();
1438 static void
1439 show_help(int exit_code)
1441 cout << PROG_NAME " - " PROG_DESC "\n"
1442 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1443 "\n"
1444 "Creates or updates a Xapian database with the data from the input files listed\n"
1445 "on the command line. If no files are specified, data is read from stdin.\n"
1446 "\n"
1447 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1448 "format for INDEXER_SCRIPT.\n"
1449 "\n"
1450 "Options:\n"
1451 " -v, --verbose display additional messages to aid debugging\n"
1452 " --overwrite create the database anew (the default is to update if\n"
1453 " the database already exists)\n";
1454 print_stemmer_help("");
1455 print_help_and_version_help("");
1456 exit(exit_code);
1460 main(int argc, char **argv)
1461 try {
1462 // If the database already exists, default to updating not overwriting.
1463 int database_mode = Xapian::DB_CREATE_OR_OPEN;
1464 verbose = false;
1465 Xapian::Stem stemmer("english");
1467 // Without this, strptime() seems to treat formats without a timezone as
1468 // being local time, including %s.
1469 setenv("TZ", "UTC", 1);
1471 constexpr auto NO_ARG = no_argument;
1472 constexpr auto REQ_ARG = required_argument;
1473 static const struct option longopts[] = {
1474 { "help", NO_ARG, NULL, 'h' },
1475 { "version", NO_ARG, NULL, 'V' },
1476 { "stemmer", REQ_ARG, NULL, 's' },
1477 { "overwrite", NO_ARG, NULL, 'o' },
1478 { "verbose", NO_ARG, NULL, 'v' },
1479 { 0, 0, NULL, 0 }
1482 int getopt_ret;
1483 while ((getopt_ret = gnu_getopt_long(argc, argv, "vs:hV",
1484 longopts, NULL)) != -1) {
1485 switch (getopt_ret) {
1486 default:
1487 show_help(1);
1488 break;
1489 case 'h': // --help
1490 show_help(0);
1491 break;
1492 case 'V': // --version
1493 print_package_info(PROG_NAME);
1494 return 0;
1495 case 'o': // --overwrite
1496 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
1497 break;
1498 case 'v':
1499 verbose = true;
1500 break;
1501 case 's':
1502 try {
1503 stemmer = Xapian::Stem(optarg);
1504 } catch (const Xapian::InvalidArgumentError &) {
1505 cerr << "Unknown stemming language '" << optarg << "'.\n";
1506 cerr << "Available language names are: "
1507 << Xapian::Stem::get_available_languages() << endl;
1508 return 1;
1510 break;
1514 argv += optind;
1515 argc -= optind;
1516 if (argc < 2) {
1517 show_help(1);
1520 parse_index_script(argv[1]);
1522 // Open the database. If another process is currently updating the
1523 // database, wait for the lock to become available.
1524 auto flags = database_mode | Xapian::DB_RETRY_LOCK;
1525 Xapian::WritableDatabase database(argv[0], flags);
1527 Xapian::TermGenerator indexer;
1528 indexer.set_stemmer(stemmer);
1529 // Set the database for spellings to be added to by the "spell" action.
1530 indexer.set_database(database);
1532 addcount = 0;
1533 repcount = 0;
1534 delcount = 0;
1536 if (argc == 2) {
1537 // Read from stdin.
1538 index_file("<stdin>", cin, database, indexer);
1539 } else {
1540 // Read file(s) listed on the command line.
1541 for (int i = 2; i < argc; ++i) {
1542 ifstream stream(argv[i]);
1543 if (stream) {
1544 index_file(argv[i], stream, database, indexer);
1545 } else {
1546 cerr << "Can't open file " << argv[i] << endl;
1551 cout << "records (added, replaced, deleted) = (" << addcount << ", "
1552 << repcount << ", " << delcount << ")" << endl;
1553 } catch (const Xapian::Error &error) {
1554 cerr << "Exception: " << error.get_description() << endl;
1555 exit(1);
1556 } catch (const std::bad_alloc &) {
1557 cerr << "Exception: std::bad_alloc" << endl;
1558 exit(1);
1559 } catch (...) {
1560 cerr << "Unknown Exception" << endl;
1561 exit(1);