2 * @brief index arbitrary data as described by an index script
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 Sam Liddicott
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2022 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
36 #include <unordered_set>
45 #include "commonhelp.h"
48 #include "myhtmlparse.h"
52 #include "stringutils.h"
54 #include "utf8truncate.h"
59 #include "portability/strptime.h"
62 #include "gnu_getopt.h"
66 #define PROG_NAME "scriptindex"
67 #define PROG_DESC "index arbitrary data as described by an index script"
75 prefix_needs_colon(const string
& prefix
, unsigned ch
)
77 if (!C_isupper(ch
) && ch
!= ':') return false;
78 string::size_type len
= prefix
.length();
79 return (len
> 1 && prefix
[len
- 1] != ':');
82 const char * action_names
[] = {
83 // Actions used internally:
114 #define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" << endl
119 // Actions used internally:
/// Post-processing mode for the `split` action; stored as that action's
/// numeric argument and read back with get_num_arg().
enum {
    SPLIT_NONE,     ///< No extra processing of the split pieces.
    SPLIT_DEDUP,    ///< Skip a piece if an identical piece was already handled.
    SPLIT_SORT,     ///< Collect all the pieces and sort them before use.
    SPLIT_PREFIXES  ///< Use the leading part of the value up to each delimiter.
};
153 // Offset into indexscript line.
156 Action(type action_
, size_t pos_
)
157 : action(action_
), num_arg(0), pos(pos_
) { }
158 Action(type action_
, size_t pos_
, const string
& arg
)
159 : action(action_
), string_arg(arg
), pos(pos_
) {
160 num_arg
= atoi(string_arg
.c_str());
162 Action(type action_
, size_t pos_
, const string
& arg
, int num
)
163 : action(action_
), num_arg(num
), string_arg(arg
), pos(pos_
) { }
164 type
get_action() const { return action
; }
165 int get_num_arg() const { return num_arg
; }
166 void set_num_arg(int num
) { num_arg
= num
; }
167 const string
& get_string_arg() const { return string_arg
; }
168 size_t get_pos() const { return pos
; }
171 // These allow searching for an Action with a particular Action::type using
175 operator==(const Action
& a
, Action::type t
) { return a
.get_action() == t
; }
178 operator==(Action::type t
, const Action
& a
) { return a
.get_action() == t
; }
181 operator!=(const Action
& a
, Action::type t
) { return !(a
== t
); }
184 operator!=(Action::type t
, const Action
& a
) { return !(t
== a
); }
/** Strip leading characters from @a s which appear in @a chars.
 *
 *  @param s      String to modify in place.
 *  @param chars  Set of characters to remove from the start of @a s.
 *
 *  If every character of @a s is in @a chars, @a s ends up empty.
 */
static void
ltrim(std::string& s, const std::string& chars)
{
    // find_first_not_of() gives npos when nothing is to be kept, and
    // erase(0, npos) then removes everything.
    const auto keep_from = s.find_first_not_of(chars);
    if (keep_from != 0) s.erase(0, keep_from);
}
/** Strip trailing characters from @a s which appear in @a chars.
 *
 *  @param s      String to modify in place.
 *  @param chars  Set of characters to remove from the end of @a s.
 *
 *  If every character of @a s is in @a chars, @a s ends up empty.
 */
static void
rtrim(std::string& s, const std::string& chars)
{
    // find_last_not_of() gives npos when nothing is to be kept; npos + 1
    // wraps round to 0, so the string is then truncated to empty.
    s.resize(s.find_last_not_of(chars) + 1);
}
/** Collapse runs of characters from @a chars in @a s.
 *
 *  @param s      String to modify in place.
 *  @param chars  Set of characters treated as separators.
 *
 *  Leading and trailing runs are removed entirely; each interior run is
 *  replaced by a single space (even if the run contained no space).
 */
static void
squash(std::string& s, const std::string& chars)
{
    std::string output;
    output.reserve(s.size());
    std::string::size_type i = 0;
    // i is the start of the next piece to keep, j the end of that piece.
    while ((i = s.find_first_not_of(chars, i)) != std::string::npos) {
        const auto j = s.find_first_of(chars, i);
        if (!output.empty()) output += ' ';
        // If j is npos this appends through to the end of s.
        output.append(s, i, j - i);
        // Resume scanning after the piece; with j == npos the next
        // find_first_not_of() returns npos and the loop finishes.
        i = j;
    }
    s = std::move(output);
}
/// Kind of diagnostic being reported via report_location().
enum diag_type { DIAG_ERROR, DIAG_WARN, DIAG_NOTE };
/// Number of errors counted so far.
// NOTE(review): presumably incremented when a DIAG_ERROR is reported - the
// code which updates it isn't visible here, so confirm.
static unsigned error_count = 0;
219 report_location(enum diag_type type
,
220 const string
& filename
,
222 size_t pos
= string::npos
)
227 if (pos
!= string::npos
) {
228 // The first column is numbered 1.
229 cerr
<< ':' << pos
+ 1;
238 cerr
<< ": warning: ";
247 report_useless_action(const string
&file
, size_t line
, size_t pos
,
248 const string
&action
)
250 report_location(DIAG_WARN
, file
, line
, pos
);
251 cerr
<< "Index action '" << action
<< "' has no effect" << endl
;
253 static bool given_left_to_right_warning
= false;
254 if (!given_left_to_right_warning
) {
255 given_left_to_right_warning
= true;
256 report_location(DIAG_NOTE
, file
, line
, pos
);
257 cerr
<< "Actions are executed from left to right" << endl
;
261 static map
<string
, vector
<Action
>> index_spec
;
264 parse_index_script(const string
&filename
)
266 ifstream
script(filename
.c_str());
267 if (!script
.is_open()) {
268 report_location(DIAG_ERROR
, filename
);
269 cerr
<< strerror(errno
) << endl
;
274 // Line number where we saw a `unique` action, or 0 if we haven't.
275 int unique_line_no
= 0;
276 // Offset into line unique_line_no where the `unique` action was.
277 size_t unique_pos
= 0;
278 while (getline(script
, line
)) {
280 vector
<string
> fields
;
281 vector
<Action
> actions
;
282 string::const_iterator i
, j
;
283 const string
&s
= line
;
284 i
= find_if(s
.begin(), s
.end(), [](char ch
) { return !C_isspace(ch
); });
285 if (i
== s
.end() || *i
== '#') {
286 // Blank line or comment.
290 if (!C_isalnum(*i
)) {
291 report_location(DIAG_ERROR
, filename
, line_no
, i
- s
.begin());
292 cerr
<< "field name must start with alphanumeric" << endl
;
294 j
= find_if(i
+ 1, s
.end(),
295 [](char ch
) { return !C_isalnum(ch
) && ch
!= '_'; });
296 fields
.push_back(string(i
, j
));
297 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
298 if (i
== s
.end()) break;
301 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
305 report_location(DIAG_ERROR
, filename
, line_no
, i
- s
.begin());
306 cerr
<< "bad character '" << *i
<< "' in field name" << endl
;
308 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
309 if (i
== s
.end()) break;
312 Xapian::termcount weight
= 1;
313 size_t useless_weight_pos
= string::npos
;
314 map
<string
, Action::type
> boolmap
;
316 while (j
!= s
.end()) {
317 size_t action_pos
= j
- s
.begin();
318 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isalnum(ch
); });
319 string
action(s
, j
- s
.begin(), i
- j
);
320 Action::type code
= Action::BAD
;
321 unsigned min_args
= 0, max_args
= 0;
322 bool takes_integer_argument
= false;
323 if (!action
.empty()) {
326 if (action
== "boolean") {
327 code
= Action::BOOLEAN
;
332 if (action
== "date") {
334 min_args
= max_args
= 1;
338 if (action
== "field") {
339 code
= Action::FIELD
;
344 if (action
== "gap") {
347 takes_integer_argument
= true;
351 if (action
== "hash") {
354 takes_integer_argument
= true;
355 } else if (action
== "hextobin") {
356 code
= Action::HEXTOBIN
;
360 if (action
== "index") {
361 code
= Action::INDEX
;
363 } else if (action
== "indexnopos") {
364 code
= Action::INDEXNOPOS
;
369 if (action
== "lower") {
370 code
= Action::LOWER
;
371 } else if (action
== "load") {
373 } else if (action
== "ltrim") {
374 code
= Action::LTRIM
;
379 if (action
== "parsedate") {
380 code
= Action::PARSEDATE
;
381 min_args
= max_args
= 1;
385 if (action
== "rtrim") {
386 code
= Action::RTRIM
;
391 if (action
== "spell") {
392 code
= Action::SPELL
;
393 } else if (action
== "split") {
394 code
= Action::SPLIT
;
397 } else if (action
== "squash") {
398 code
= Action::SQUASH
;
403 if (action
== "truncate") {
404 code
= Action::TRUNCATE
;
405 min_args
= max_args
= 1;
406 takes_integer_argument
= true;
407 } else if (action
== "trim") {
413 if (action
== "unhtml") {
414 code
= Action::UNHTML
;
415 } else if (action
== "unique") {
416 code
= Action::UNIQUE
;
417 min_args
= max_args
= 1;
421 if (action
== "value") {
422 code
= Action::VALUE
;
423 min_args
= max_args
= 1;
424 takes_integer_argument
= true;
425 } else if (action
== "valuenumeric") {
426 code
= Action::VALUENUMERIC
;
427 min_args
= max_args
= 1;
428 takes_integer_argument
= true;
429 } else if (action
== "valuepacked") {
430 code
= Action::VALUEPACKED
;
431 min_args
= max_args
= 1;
432 takes_integer_argument
= true;
436 if (action
== "weight") {
437 code
= Action::WEIGHT
;
438 min_args
= max_args
= 1;
439 // Don't set takes_integer_argument since we parse
440 // it with parse_unsigned() and issue an error there
441 // - setting takes_integer_argument would give a
442 // double error for arguments with a decimal point.
447 if (code
== Action::BAD
) {
448 report_location(DIAG_ERROR
, filename
, line_no
, action_pos
);
449 cerr
<< "Unknown index action '" << action
<< "'" << endl
;
451 auto i_after_action
= i
;
452 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
454 if (i
!= s
.end() && *i
== '=') {
455 if (i
!= i_after_action
) {
456 report_location(DIAG_WARN
, filename
, line_no
,
457 i_after_action
- s
.begin());
458 cerr
<< "putting spaces between the action and '=' is "
459 "deprecated" << endl
;
463 report_location(DIAG_ERROR
, filename
, line_no
,
465 cerr
<< "Index action '" << action
466 << "' doesn't take an argument" << endl
;
470 j
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
472 report_location(DIAG_WARN
, filename
, line_no
,
474 cerr
<< "putting spaces between '=' and the argument is "
475 "deprecated" << endl
;
480 if (j
!= s
.end() && *j
== '"') {
485 i
= find_if(j
, s
.end(),
487 return ch
== '"' || ch
== '\\';
490 report_location(DIAG_ERROR
, filename
, line_no
,
492 cerr
<< "No closing quote" << endl
;
502 report_location(DIAG_ERROR
, filename
, line_no
,
504 cerr
<< "Bad escaping in quoted action argument"
530 if (!C_isxdigit(ch1
)) {
532 report_location(DIAG_ERROR
, filename
,
533 line_no
, i
- s
.begin());
534 cerr
<< "Bad hex digit in escaping\n";
541 if (!C_isxdigit(ch2
)) {
544 ch
= hex_digit(ch1
) << 4 |
549 report_location(DIAG_ERROR
, filename
,
550 line_no
, i
- s
.begin());
551 cerr
<< "Bad escape sequence '\\" << ch
558 vals
.emplace_back(std::move(arg
));
559 if (i
== s
.end() || C_isspace(*i
)) break;
563 report_location(DIAG_ERROR
, filename
, line_no
,
565 cerr
<< "Unexpected character '" << *i
566 << "' after closing quote" << endl
;
569 } while (i
!= s
.end() && *i
!= ',' && !C_isspace(*i
));
570 if (*i
!= ',') break;
573 } else if (max_args
> 1) {
574 // Unquoted argument, split on comma.
575 i
= find_if(j
, s
.end(),
577 return C_isspace(ch
) || ch
== ',';
579 vals
.emplace_back(j
, i
);
580 if (*i
!= ',') break;
583 // Unquoted argument, including any commas.
584 i
= find_if(j
, s
.end(),
585 [](char ch
) { return C_isspace(ch
); });
586 vals
.emplace_back(j
, i
);
591 if (vals
.size() == max_args
) {
592 report_location(DIAG_ERROR
, filename
, line_no
,
594 cerr
<< "Index action '" << action
595 << "' takes at most " << max_args
<< " arguments"
600 if (vals
.size() < min_args
) {
601 report_location(DIAG_ERROR
, filename
, line_no
,
603 if (min_args
== max_args
) {
604 cerr
<< "Index action '" << action
605 << "' requires " << min_args
<< " arguments"
608 cerr
<< "Index action '" << action
609 << "' requires at least " << min_args
<< " arguments"
612 // Allow action handling code to assume there are min_args
614 vals
.resize(min_args
);
622 if (takes_integer_argument
) {
623 auto dot
= val
.find('.');
624 if (dot
!= string::npos
) {
625 report_location(DIAG_ERROR
, filename
, line_no
,
626 j
- s
.begin() + dot
);
627 cerr
<< "Index action '" << action
628 << "' takes an integer argument" << endl
;
636 report_location(DIAG_ERROR
, filename
, line_no
,
638 cerr
<< "Invalid parameter '" << val
<< "' for "
639 "action 'date'" << endl
;
641 actions
.emplace_back(code
, action_pos
, val
);
644 case Action::INDEXNOPOS
:
645 actions
.emplace_back(code
, action_pos
, val
, weight
);
646 useless_weight_pos
= string::npos
;
649 // We don't push an Action for WEIGHT - instead we
650 // store it ready to use in the INDEX and INDEXNOPOS
652 if (!parse_unsigned(val
.c_str(), weight
)) {
653 report_location(DIAG_ERROR
, filename
, line_no
,
655 cerr
<< "Index action 'weight' takes a "
656 "non-negative integer argument" << endl
;
659 if (useless_weight_pos
!= string::npos
) {
660 report_useless_action(filename
, line_no
,
661 useless_weight_pos
, action
);
663 useless_weight_pos
= action_pos
;
665 case Action::PARSEDATE
: {
666 auto bad_code
= val
.find("%Z");
667 if (bad_code
!= val
.npos
) {
668 report_location(DIAG_ERROR
, filename
, line_no
,
669 j
- s
.begin() + bad_code
);
670 cerr
<< "Parsing timezone names with %Z is not supported" << endl
;
672 #ifndef HAVE_STRUCT_TM_TM_GMTOFF
673 bad_code
= val
.find("%z");
674 if (bad_code
!= val
.npos
) {
675 report_location(DIAG_ERROR
, filename
, line_no
,
676 j
- s
.begin() + bad_code
);
677 cerr
<< "Parsing timezone offsets with %z is not supported on "
678 "this platform" << endl
;
681 actions
.emplace_back(code
, action_pos
, val
);
684 case Action::SPLIT
: {
686 report_location(DIAG_ERROR
, filename
, line_no
,
688 cerr
<< "Split delimiter can't be empty" << endl
;
690 int operation
= Action::SPLIT_NONE
;
691 if (vals
.size() >= 2) {
692 if (vals
[1] == "dedup") {
693 operation
= Action::SPLIT_DEDUP
;
694 } else if (vals
[1] == "sort") {
695 operation
= Action::SPLIT_SORT
;
696 } else if (vals
[1] == "none") {
697 operation
= Action::SPLIT_NONE
;
698 } else if (vals
[1] == "prefixes") {
699 operation
= Action::SPLIT_PREFIXES
;
701 // FIXME: Column should be for where the `op`
702 // parameter starts, which this isn't if the
703 // value is quoted, contains escape sequences,
705 report_location(DIAG_ERROR
, filename
, line_no
,
706 i
- s
.begin() - vals
[1].size());
707 cerr
<< "Bad split operation '" << vals
[1]
711 actions
.emplace_back(code
, action_pos
, val
, operation
);
714 case Action::TRUNCATE
:
715 if (!actions
.empty() &&
716 actions
.back().get_action() == Action::LOAD
) {
717 /* Turn "load truncate=n" into "load" with
718 * num_arg n, so that we don't needlessly
719 * allocate memory and read data we're just
725 actions
.emplace_back(code
, action_pos
, val
);
728 if (unique_line_no
) {
729 report_location(DIAG_ERROR
, filename
, line_no
,
731 cerr
<< "Index action 'unique' used more than once"
733 report_location(DIAG_NOTE
, filename
,
734 unique_line_no
, unique_pos
);
735 cerr
<< "Previously used here" << endl
;
737 unique_line_no
= line_no
;
738 unique_pos
= action_pos
;
739 if (boolmap
.find(val
) == boolmap
.end())
740 boolmap
[val
] = Action::UNIQUE
;
741 actions
.emplace_back(code
, action_pos
, val
);
744 actions
.emplace_back(code
, action_pos
, val
);
745 auto& obj
= actions
.back();
746 auto gap_size
= obj
.get_num_arg();
748 report_location(DIAG_ERROR
, filename
, line_no
,
749 obj
.get_pos() + 3 + 1);
750 cerr
<< "Index action 'gap' takes a strictly "
751 "positive integer argument" << endl
;
756 actions
.emplace_back(code
, action_pos
, val
);
757 auto& obj
= actions
.back();
758 auto max_length
= obj
.get_num_arg();
759 if (max_length
< 6) {
760 report_location(DIAG_ERROR
, filename
, line_no
,
761 obj
.get_pos() + 4 + 1);
762 cerr
<< "Index action 'hash' takes an integer "
763 "argument which must be at least 6" << endl
;
771 for (unsigned char ch
: val
) {
773 auto column
= actions
.back().get_pos() +
774 strlen(action_names
[code
]) + 1;
775 report_location(DIAG_ERROR
, filename
, line_no
,
777 cerr
<< "Index action '" << action_names
[code
]
778 << "' only support ASCII characters "
782 actions
.emplace_back(code
, action_pos
, val
);
784 case Action::BOOLEAN
:
785 boolmap
[val
] = Action::BOOLEAN
;
788 actions
.emplace_back(code
, action_pos
, val
);
790 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
793 report_location(DIAG_ERROR
, filename
, line_no
,
794 i_after_action
- s
.begin());
795 if (min_args
== max_args
) {
796 cerr
<< "Index action '" << action
<< "' requires "
797 << min_args
<< " arguments" << endl
;
799 cerr
<< "Index action '" << action
<< "' requires at least "
800 << min_args
<< " arguments" << endl
;
805 case Action::INDEXNOPOS
:
806 useless_weight_pos
= string::npos
;
807 actions
.emplace_back(code
, action_pos
, "", weight
);
810 actions
.emplace_back(code
, action_pos
, "", 100);
813 actions
.emplace_back(code
, action_pos
, "",
814 MAX_SAFE_TERM_LENGTH
- 1);
820 actions
.emplace_back(code
, action_pos
, " \t\f\v\r\n");
823 actions
.emplace_back(code
, action_pos
);
830 if (useless_weight_pos
!= string::npos
) {
831 report_useless_action(filename
, line_no
, useless_weight_pos
,
835 while (!actions
.empty()) {
837 Action::type action
= actions
.back().get_action();
840 case Action::HEXTOBIN
:
843 case Action::PARSEDATE
:
848 case Action::TRUNCATE
:
851 report_useless_action(filename
, line_no
,
852 actions
.back().get_pos(),
853 action_names
[action
]);
862 map
<string
, Action::type
>::const_iterator boolpfx
;
863 for (boolpfx
= boolmap
.begin(); boolpfx
!= boolmap
.end(); ++boolpfx
) {
864 if (boolpfx
->second
== Action::UNIQUE
) {
865 report_location(DIAG_WARN
, filename
, unique_line_no
,
867 cerr
<< "Index action 'unique=" << boolpfx
->first
868 << "' without 'boolean=" << boolpfx
->first
<< "'" << endl
;
869 static bool given_doesnt_imply_boolean_warning
= false;
870 if (!given_doesnt_imply_boolean_warning
) {
871 given_doesnt_imply_boolean_warning
= true;
872 report_location(DIAG_NOTE
, filename
, unique_line_no
,
874 cerr
<< "'unique' doesn't implicitly add a boolean term"
880 vector
<string
>::const_iterator field
;
881 for (field
= fields
.begin(); field
!= fields
.end(); ++field
) {
882 vector
<Action
> &v
= index_spec
[*field
];
884 if (fields
.size() == 1) {
885 // Optimise common case where there's only one fieldname
886 // for a list of actions.
887 v
= std::move(actions
);
892 v
.emplace_back(Action::NEW
, string::npos
);
893 v
.insert(v
.end(), actions
.begin(), actions
.end());
898 if (index_spec
.empty()) {
899 report_location(DIAG_ERROR
, filename
, line_no
);
900 cerr
<< "No rules found in index script" << endl
;
909 run_actions(vector
<Action
>::const_iterator action_it
,
910 vector
<Action
>::const_iterator action_end
,
911 Xapian::WritableDatabase
& database
,
912 Xapian::TermGenerator
& indexer
,
913 const string
& old_value
,
914 bool& this_field_is_content
, Xapian::Document
& doc
,
915 map
<string
, list
<string
>>& fields
,
916 string
& field
, const char* fname
,
917 size_t line_no
, Xapian::docid
& docid
)
919 string value
= old_value
;
920 while (action_it
!= action_end
) {
921 auto& action
= *action_it
++;
922 switch (action
.get_action()) {
927 // We're processing the same field again - give it a reprieve.
928 this_field_is_content
= true;
931 if (!value
.empty()) {
932 string f
= action
.get_string_arg();
933 if (f
.empty()) f
= field
;
934 // replace newlines with spaces
936 string::size_type j
= 0;
937 while ((j
= s
.find('\n', j
)) != string::npos
)
939 fields
[f
].push_back(s
);
943 indexer
.index_text(value
,
944 action
.get_num_arg(),
945 action
.get_string_arg());
947 case Action::INDEXNOPOS
:
948 // No positional information so phrase searching won't work.
949 // However, the database will use much less diskspace.
950 indexer
.index_text_without_positions(value
,
951 action
.get_num_arg(),
952 action
.get_string_arg());
954 case Action::BOOLEAN
: {
955 // Do nothing if there's no text.
956 if (value
.empty()) break;
958 string term
= action
.get_string_arg();
959 if (prefix_needs_colon(term
, value
[0])) term
+= ':';
962 doc
.add_boolean_term(term
);
966 indexer
.increase_termpos(action
.get_num_arg());
969 unsigned int max_length
= action
.get_num_arg();
970 if (value
.length() > max_length
)
971 value
= hash_long_term(value
, max_length
);
974 case Action::HEXTOBIN
: {
975 size_t len
= value
.length();
977 report_location(DIAG_ERROR
, fname
, line_no
);
978 cerr
<< "hextobin: input must have even length"
984 output
.reserve(len
/ 2);
985 for (size_t j
= 0; j
< len
; j
+= 2) {
987 char b
= value
[j
+ 1];
988 if (!C_isxdigit(a
) || !C_isxdigit(b
)) {
989 report_location(DIAG_ERROR
, fname
, line_no
);
990 cerr
<< "hextobin: input must be all hex digits\n";
993 char r
= (hex_digit(a
) << 4) | hex_digit(b
);
996 value
= std::move(output
);
1000 value
= Xapian::Unicode::tolower(value
);
1003 ltrim(value
, action
.get_string_arg());
1006 rtrim(value
, action
.get_string_arg());
1009 rtrim(value
, action
.get_string_arg());
1010 ltrim(value
, action
.get_string_arg());
1012 case Action::SQUASH
:
1013 squash(value
, action
.get_string_arg());
1015 case Action::LOAD
: {
1016 // If there's no input, just issue a warning.
1017 if (value
.empty()) {
1018 report_location(DIAG_WARN
, fname
, line_no
);
1019 cerr
<< "Empty filename in LOAD action" << endl
;
1022 bool truncated
= false;
1023 string filename
= std::move(value
);
1024 // FIXME: Use NOATIME if we own the file or are root.
1025 if (!load_file(filename
, action
.get_num_arg(), NOCACHE
,
1026 value
, truncated
)) {
1027 report_location(DIAG_ERROR
, fname
, line_no
);
1028 cerr
<< "Couldn't load file '" << filename
<< "': "
1029 << strerror(errno
) << endl
;
1032 if (!truncated
) break;
1035 case Action::TRUNCATE
:
1036 utf8_truncate(value
, action
.get_num_arg());
1039 indexer
.set_flags(indexer
.FLAG_SPELLING
);
1041 case Action::SPLIT
: {
1042 // Find the end of the actions which split should execute.
1043 auto split_end
= find(action_it
, action_end
, Action::NEW
);
1045 int split_type
= action
.get_num_arg();
1046 if (value
.empty()) {
1048 } else if (split_type
!= Action::SPLIT_SORT
) {
1049 // Generate split as we consume it.
1050 const string
& delimiter
= action
.get_string_arg();
1052 unique_ptr
<unordered_set
<string
>> seen
;
1053 if (split_type
== Action::SPLIT_DEDUP
) {
1054 seen
.reset(new unordered_set
<string
>);
1057 if (delimiter
.size() == 1) {
1058 // Special case for common single character delimiter.
1059 char ch
= delimiter
[0];
1060 string::size_type i
= 0;
1062 string::size_type j
= value
.find(ch
, i
);
1063 if (split_type
== Action::SPLIT_PREFIXES
) {
1065 string
val(value
, 0, j
);
1066 run_actions(action_it
, split_end
,
1069 this_field_is_content
, doc
,
1071 field
, fname
, line_no
,
1074 } else if (i
!= j
) {
1075 string
val(value
, i
, j
- i
);
1076 if (!seen
.get() || seen
->insert(val
).second
) {
1077 run_actions(action_it
, split_end
,
1080 this_field_is_content
, doc
,
1082 field
, fname
, line_no
,
1086 if (j
== string::npos
) break;
1090 string::size_type i
= 0;
1092 string::size_type j
= value
.find(delimiter
, i
);
1093 if (split_type
== Action::SPLIT_PREFIXES
) {
1095 string
val(value
, 0, j
);
1096 run_actions(action_it
, split_end
,
1099 this_field_is_content
, doc
,
1101 field
, fname
, line_no
,
1104 } else if (i
!= j
) {
1105 string
val(value
, i
, j
- i
);
1106 if (!seen
.get() || seen
->insert(val
).second
) {
1107 run_actions(action_it
, split_end
,
1110 this_field_is_content
, doc
,
1112 field
, fname
, line_no
,
1116 if (j
== string::npos
) break;
1117 i
= j
+ delimiter
.size();
1121 vector
<string
> split_values
;
1122 const string
& delimiter
= action
.get_string_arg();
1123 if (delimiter
.size() == 1) {
1124 // Special case for common single character delimiter.
1125 char ch
= delimiter
[0];
1126 string::size_type i
= 0;
1128 string::size_type j
= value
.find(ch
, i
);
1130 split_values
.emplace_back(value
, i
, j
- i
);
1132 if (j
== string::npos
) break;
1136 string::size_type i
= 0;
1138 string::size_type j
= value
.find(delimiter
, i
);
1140 split_values
.emplace_back(value
, i
, j
- i
);
1142 if (j
== string::npos
) break;
1143 i
= j
+ delimiter
.size();
1147 sort(split_values
.begin(), split_values
.end());
1149 for (auto&& val
: split_values
) {
1150 run_actions(action_it
, split_end
,
1151 database
, indexer
, val
,
1152 this_field_is_content
, doc
, fields
,
1153 field
, fname
, line_no
,
1158 action_it
= split_end
;
1161 case Action::UNHTML
: {
1164 // Default HTML character set is latin 1, though
1165 // not specifying one is deprecated these days.
1166 p
.parse_html(value
, "iso-8859-1", false);
1167 } catch (const string
& newcharset
) {
1169 p
.parse_html(value
, newcharset
, true);
1171 if (p
.indexing_allowed
)
1177 case Action::UNIQUE
: {
1178 // If there's no text, just issue a warning.
1179 if (value
.empty()) {
1180 report_location(DIAG_WARN
, fname
, line_no
);
1181 cerr
<< "Ignoring UNIQUE action on empty text"
1186 // Ensure that the value of this field is unique.
1187 // If a record already exists with the same value,
1188 // it will be replaced with the new record.
1190 // Unique fields aren't considered content - if
1191 // there are no other fields in the document, the
1192 // document is to be deleted.
1193 this_field_is_content
= false;
1195 // Argument is the prefix to add to the field value
1196 // to get the unique term.
1197 string t
= action
.get_string_arg();
1198 if (prefix_needs_colon(t
, value
[0])) t
+= ':';
1200 Xapian::PostingIterator p
= database
.postlist_begin(t
);
1201 if (p
!= database
.postlist_end(t
)) {
1208 doc
.add_value(action
.get_num_arg(), value
);
1210 case Action::VALUENUMERIC
: {
1211 if (value
.empty()) break;
1213 double dbl
= strtod(value
.c_str(), &end
);
1215 report_location(DIAG_WARN
, fname
, line_no
);
1216 cerr
<< "Trailing characters in VALUENUMERIC: '"
1217 << value
<< "'" << endl
;
1219 doc
.add_value(action
.get_num_arg(),
1220 Xapian::sortable_serialise(dbl
));
1223 case Action::VALUEPACKED
: {
1225 if (value
.empty() || !C_isdigit(value
[0])) {
1226 // strtoul() accepts leading whitespace and negated
1227 // values, neither of which we want to allow.
1232 word
= strtoul(value
.c_str(), &q
, 10);
1233 if (!errno
&& *q
!= '\0') {
1234 // Trailing characters after converted value.
1239 report_location(DIAG_WARN
, fname
, line_no
);
1240 cerr
<< "valuepacked \"" << value
<< "\" ";
1241 if (errno
== ERANGE
) {
1242 cerr
<< "out of range";
1244 cerr
<< "not an unsigned integer";
1248 int valueslot
= action
.get_num_arg();
1249 doc
.add_value(valueslot
, int_to_binary_string(word
));
1252 case Action::DATE
: {
1253 // Do nothing for empty input.
1254 if (value
.empty()) break;
1256 const string
& type
= action
.get_string_arg();
1258 if (type
== "unix") {
1260 if (!parse_signed(value
.c_str(), t
)) {
1261 report_location(DIAG_WARN
, fname
, line_no
);
1262 cerr
<< "Date value (in secs) for action DATE "
1263 "must be an integer - ignoring" << endl
;
1266 struct tm
*tm
= localtime(&t
);
1267 int y
= tm
->tm_year
+ 1900;
1268 int m
= tm
->tm_mon
+ 1;
1269 yyyymmdd
= date_to_string(y
, m
, tm
->tm_mday
);
1270 } else if (type
== "unixutc") {
1272 if (!parse_signed(value
.c_str(), t
)) {
1273 report_location(DIAG_WARN
, fname
, line_no
);
1274 cerr
<< "Date value (in secs) for action DATE "
1275 "must be an integer - ignoring" << endl
;
1278 struct tm
*tm
= gmtime(&t
);
1279 int y
= tm
->tm_year
+ 1900;
1280 int m
= tm
->tm_mon
+ 1;
1281 yyyymmdd
= date_to_string(y
, m
, tm
->tm_mday
);
1282 } else if (type
== "yyyymmdd") {
1283 if (value
.length() != 8) {
1284 report_location(DIAG_WARN
, fname
, line_no
);
1285 cerr
<< "date=yyyymmdd expects an 8 character value "
1286 "- ignoring" << endl
;
1293 doc
.add_boolean_term("D" + yyyymmdd
);
1296 doc
.add_boolean_term("M" + yyyymmdd
);
1299 doc
.add_boolean_term("Y" + yyyymmdd
);
1302 case Action::PARSEDATE
: {
1303 string dateformat
= action
.get_string_arg();
1305 memset(&tm
, 0, sizeof(tm
));
1306 auto ret
= strptime(value
.c_str(), dateformat
.c_str(), &tm
);
1308 report_location(DIAG_WARN
, fname
, line_no
);
1309 cerr
<< "\"" << value
<< "\" doesn't match format "
1310 "\"" << dateformat
<< '\"' << endl
;
1315 report_location(DIAG_WARN
, fname
, line_no
);
1316 cerr
<< "\"" << value
<< "\" not fully matched by "
1317 "format \"" << dateformat
<< "\" "
1318 "(\"" << ret
<< "\" left over) but "
1319 "indexing anyway" << endl
;
1321 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1322 auto gmtoff
= tm
.tm_gmtoff
;
1324 auto secs_since_epoch
= timegm(&tm
);
1325 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1326 secs_since_epoch
-= gmtoff
;
1328 value
= str(secs_since_epoch
);
1332 /* Empty default case to avoid "unhandled enum value"
1341 index_file(const char *fname
, istream
&stream
,
1342 Xapian::WritableDatabase
&database
, Xapian::TermGenerator
&indexer
)
1346 while (!stream
.eof() && getline(stream
, line
)) {
1348 Xapian::Document doc
;
1349 indexer
.set_document(doc
);
1350 Xapian::docid docid
= 0;
1351 map
<string
, list
<string
>> fields
;
1352 bool seen_content
= false;
1353 while (!line
.empty()) {
1354 // Cope with files from MS Windows (\r\n end of lines).
1355 // Trim multiple \r characters, since that seems the best way
1356 // to handle that case.
1357 string::size_type last
= line
.find_last_not_of('\r');
1358 if (last
== string::npos
) break;
1359 line
.resize(last
+ 1);
1361 string::size_type eq
= line
.find('=');
1362 if (eq
== string::npos
&& !line
.empty()) {
1363 report_location(DIAG_ERROR
, fname
, line_no
, line
.size());
1364 cerr
<< "expected = somewhere in this line" << endl
;
1367 string
field(line
, 0, eq
);
1368 string
value(line
, eq
+ 1, string::npos
);
1370 while (getline(stream
, line
)) {
1372 if (line
.empty() || line
[0] != '=') break;
1373 // Cope with files from MS Windows (\r\n end of lines).
1374 // Trim multiple \r characters, since that seems the best way
1375 // to handle that case.
1376 last
= line
.find_last_not_of('\r');
1377 // line[0] == '=', so last != string::npos.
1378 // Replace the '=' with a '\n' so we don't have to use substr.
1380 line
.resize(last
+ 1);
1384 // Default to not indexing spellings.
1385 indexer
.set_flags(Xapian::TermGenerator::flags(0));
1387 bool this_field_is_content
= true;
1388 const vector
<Action
>& v
= index_spec
[field
];
1389 run_actions(v
.begin(), v
.end(),
1390 database
, indexer
, value
,
1391 this_field_is_content
, doc
, fields
,
1392 field
, fname
, line_no
,
1394 if (this_field_is_content
) seen_content
= true;
1397 // If we haven't seen any fields (other than unique identifiers)
1398 // the document is to be deleted.
1399 if (!seen_content
) {
1401 database
.delete_document(docid
);
1402 if (verbose
) cout
<< "Del: " << docid
<< endl
;
1407 for (auto&& i
: fields
) {
1408 for (auto&& field_val
: i
.second
) {
1416 // Put the data in the document
1419 // Add the document to the database
1421 database
.replace_document(docid
, doc
);
1422 if (verbose
) cout
<< "Replace: " << docid
<< endl
;
1425 docid
= database
.add_document(doc
);
1426 if (verbose
) cout
<< "Add: " << docid
<< endl
;
1432 // Commit after each file to make sure all changes from that file make it
1434 if (verbose
) cout
<< "Committing: " << endl
;
1439 show_help(int exit_code
)
1441 cout
<< PROG_NAME
" - " PROG_DESC
"\n"
1442 "Usage: " PROG_NAME
" [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1444 "Creates or updates a Xapian database with the data from the input files listed\n"
1445 "on the command line. If no files are specified, data is read from stdin.\n"
1447 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1448 "format for INDEXER_SCRIPT.\n"
1451 " -v, --verbose display additional messages to aid debugging\n"
1452 " --overwrite create the database anew (the default is to update if\n"
1453 " the database already exists)\n";
1454 print_stemmer_help("");
1455 print_help_and_version_help("");
1460 main(int argc
, char **argv
)
1462 // If the database already exists, default to updating not overwriting.
1463 int database_mode
= Xapian::DB_CREATE_OR_OPEN
;
1465 Xapian::Stem
stemmer("english");
1467 // Without this, strptime() seems to treat formats without a timezone as
1468 // being local time, including %s.
1469 setenv("TZ", "UTC", 1);
1471 constexpr auto NO_ARG
= no_argument
;
1472 constexpr auto REQ_ARG
= required_argument
;
1473 static const struct option longopts
[] = {
1474 { "help", NO_ARG
, NULL
, 'h' },
1475 { "version", NO_ARG
, NULL
, 'V' },
1476 { "stemmer", REQ_ARG
, NULL
, 's' },
1477 { "overwrite", NO_ARG
, NULL
, 'o' },
1478 { "verbose", NO_ARG
, NULL
, 'v' },
1483 while ((getopt_ret
= gnu_getopt_long(argc
, argv
, "vs:hV",
1484 longopts
, NULL
)) != -1) {
1485 switch (getopt_ret
) {
1492 case 'V': // --version
1493 print_package_info(PROG_NAME
);
1495 case 'o': // --overwrite
1496 database_mode
= Xapian::DB_CREATE_OR_OVERWRITE
;
1503 stemmer
= Xapian::Stem(optarg
);
1504 } catch (const Xapian::InvalidArgumentError
&) {
1505 cerr
<< "Unknown stemming language '" << optarg
<< "'.\n";
1506 cerr
<< "Available language names are: "
1507 << Xapian::Stem::get_available_languages() << endl
;
1520 parse_index_script(argv
[1]);
1522 // Open the database. If another process is currently updating the
1523 // database, wait for the lock to become available.
1524 auto flags
= database_mode
| Xapian::DB_RETRY_LOCK
;
1525 Xapian::WritableDatabase
database(argv
[0], flags
);
1527 Xapian::TermGenerator indexer
;
1528 indexer
.set_stemmer(stemmer
);
1529 // Set the database for spellings to be added to by the "spell" action.
1530 indexer
.set_database(database
);
1538 index_file("<stdin>", cin
, database
, indexer
);
1540 // Read file(s) listed on the command line.
1541 for (int i
= 2; i
< argc
; ++i
) {
1542 ifstream
stream(argv
[i
]);
1544 index_file(argv
[i
], stream
, database
, indexer
);
1546 cerr
<< "Can't open file " << argv
[i
] << endl
;
1551 cout
<< "records (added, replaced, deleted) = (" << addcount
<< ", "
1552 << repcount
<< ", " << delcount
<< ")" << endl
;
1553 } catch (const Xapian::Error
&error
) {
1554 cerr
<< "Exception: " << error
.get_description() << endl
;
1556 } catch (const std::bad_alloc
&) {
1557 cerr
<< "Exception: std::bad_alloc" << endl
;
1560 cerr
<< "Unknown Exception" << endl
;