2 * @brief index arbitrary data as described by an index script
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 Sam Liddicott
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2023 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
28 // Needed to get setenv() and strptime() declared.
41 #include <unordered_set>
50 #include "commonhelp.h"
52 #include "genericxmlparser.h"
54 #include "htmlparser.h"
59 #include "stringutils.h"
61 #include "utf8truncate.h"
65 #include "portability/strptime.h"
68 #include "gnu_getopt.h"
72 #define PROG_NAME "scriptindex"
73 #define PROG_DESC "index arbitrary data as described by an index script"
81 /** What to do if there's a UNIQUE action but a record doesn't use it.
89 } unique_missing
= UNIQUE_ERROR
;
91 /// Track if UNIQUE action is unused in the current record.
92 static bool unique_unused
;
94 /// Track if the current record is being skipped.
95 static bool skipping_record
= false;
98 prefix_needs_colon(const string
& prefix
, unsigned ch
)
100 if (!C_isupper(ch
) && ch
!= ':') return false;
101 string::size_type len
= prefix
.length();
102 return (len
> 1 && prefix
[len
- 1] != ':');
105 const char * action_names
[] = {
106 // Actions used internally:
138 #define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")\n"
143 // Actions used internally:
173 enum { SPLIT_NONE
, SPLIT_DEDUP
, SPLIT_SORT
, SPLIT_PREFIXES
};
178 // Offset into indexscript line.
181 Action(type action_
, size_t pos_
)
182 : action(action_
), pos(pos_
) { }
183 Action(type action_
, size_t pos_
, const string
& arg
)
184 : action(action_
), string_arg(arg
), pos(pos_
) {
185 num_arg
= atoi(string_arg
.c_str());
187 Action(type action_
, size_t pos_
, const string
& arg
, int num
)
188 : action(action_
), num_arg(num
), string_arg(arg
), pos(pos_
) { }
189 type
get_action() const { return action
; }
190 int get_num_arg() const { return num_arg
; }
191 void set_num_arg(int num
) { num_arg
= num
; }
192 const string
& get_string_arg() const { return string_arg
; }
193 size_t get_pos() const { return pos
; }
196 // These allow searching for an Action with a particular Action::type using
200 operator==(const Action
& a
, Action::type t
) { return a
.get_action() == t
; }
203 operator==(Action::type t
, const Action
& a
) { return a
.get_action() == t
; }
206 operator!=(const Action
& a
, Action::type t
) { return !(a
== t
); }
209 operator!=(Action::type t
, const Action
& a
) { return !(t
== a
); }
/** Strip leading characters from a string, in place.
 *
 *  @param s      String to modify.
 *  @param chars  Set of characters to remove from the start of @a s.
 *
 *  If @a s consists entirely of characters in @a chars it ends up empty.
 */
static void
ltrim(std::string& s, const std::string& chars)
{
    const std::string::size_type keep_from = s.find_first_not_of(chars);
    if (keep_from == 0) {
        // Nothing to strip.
        return;
    }
    // When no character is kept, keep_from is npos and erase() removes
    // everything.
    s.erase(0, keep_from);
}
/** Strip trailing characters from a string, in place.
 *
 *  @param s      String to modify.
 *  @param chars  Set of characters to remove from the end of @a s.
 *
 *  If @a s is empty or consists entirely of characters in @a chars it ends
 *  up empty.
 */
static void
rtrim(std::string& s, const std::string& chars)
{
    const std::string::size_type last_kept = s.find_last_not_of(chars);
    if (last_kept == std::string::npos) {
        // Handle "nothing kept" explicitly instead of relying on npos + 1
        // wrapping round to 0.
        s.clear();
    } else {
        s.resize(last_kept + 1);
    }
}
/** Collapse runs of separator characters in a string, in place.
 *
 *  @param s      String to modify.
 *  @param chars  Set of characters treated as separators.
 *
 *  Each maximal run of characters not in @a chars is kept, and consecutive
 *  kept runs are joined with a single space.  Leading and trailing
 *  separators are dropped entirely.
 */
static void
squash(std::string& s, const std::string& chars)
{
    std::string squashed;
    squashed.reserve(s.size());

    std::string::size_type word_start = s.find_first_not_of(chars);
    while (word_start != std::string::npos) {
        const std::string::size_type word_end =
            s.find_first_of(chars, word_start);
        if (!squashed.empty()) squashed.push_back(' ');
        if (word_end == std::string::npos) {
            // Last run extends to the end of the input.
            squashed.append(s, word_start, std::string::npos);
            break;
        }
        squashed.append(s, word_start, word_end - word_start);
        word_start = s.find_first_not_of(chars, word_end);
    }

    s = std::move(squashed);
}
239 enum diag_type
{ DIAG_ERROR
, DIAG_WARN
, DIAG_NOTE
};
241 static unsigned error_count
= 0;
244 report_location(enum diag_type type
,
245 const string
& filename
,
247 size_t pos
= string::npos
)
252 if (pos
!= string::npos
) {
253 // The first column is numbered 1.
254 cerr
<< ':' << pos
+ 1;
263 cerr
<< ": warning: ";
272 report_useless_action(const string
&file
, size_t line
, size_t pos
,
273 const string
&action
)
275 report_location(DIAG_WARN
, file
, line
, pos
);
276 cerr
<< "Index action '" << action
<< "' has no effect\n";
278 static bool given_left_to_right_warning
= false;
279 if (!given_left_to_right_warning
) {
280 given_left_to_right_warning
= true;
281 report_location(DIAG_NOTE
, file
, line
, pos
);
282 cerr
<< "Actions are executed from left to right\n";
/** Check whether we can support %z on the current platform.
 *
 *  @return true if strptime() parses a "%z" timezone offset and reports it
 *          via tm_gmtoff; false otherwise (always false when
 *          HAVE_STRUCT_TM_TM_GMTOFF isn't defined).
 */
static bool
parsedate_supports_z()
{
#ifdef HAVE_STRUCT_TM_TM_GMTOFF
    // Perform a simple run-time test, once, and remember the answer.
    static const bool supported = [] {
        struct tm probe;
        memset(&probe, 0, sizeof(probe));
        const char* rest = strptime("+1245", "%z", &probe);
        if (!rest || *rest != '\0') return false;
        return probe.tm_gmtoff == (12 * 60 + 45) * 60;
    }();
    return supported;
#else
    // Without tm_gmtoff we aren't going to get the timezone information
    // from strptime().
    return false;
#endif
}
306 static bool index_spec_uses_unique
= false;
308 static map
<string
, vector
<Action
>> index_spec
;
/** Read one line like std::getline(), but handle "\r\n" line endings too.
 *
 *  @param stream  Stream to read from.
 *  @param line    Receives the line, with every trailing '\r' removed.
 *
 *  @return @a stream, so the call can be tested directly in a condition.
 */
static std::istream&
getline_portable(std::istream& stream, std::string& line)
{
    std::getline(stream, line);
    // Trim multiple trailing '\r' characters rather than just one.
    const std::string::size_type last_kept = line.find_last_not_of('\r');
    line.resize(last_kept == std::string::npos ? 0 : last_kept + 1);
    return stream;
}
322 parse_index_script(const string
&filename
)
324 ifstream
script(filename
.c_str());
325 if (!script
.is_open()) {
326 report_location(DIAG_ERROR
, filename
);
327 cerr
<< strerror(errno
) << '\n';
332 // Line number where we saw a `unique` action, or 0 if we haven't.
333 int unique_line_no
= 0;
334 // Offset into line unique_line_no where the `unique` action was.
335 size_t unique_pos
= 0;
336 while (getline(script
, line
)) {
338 vector
<string
> fields
;
339 vector
<Action
> actions
;
340 string::const_iterator i
, j
;
341 const string
&s
= line
;
342 i
= find_if(s
.begin(), s
.end(), [](char ch
) { return !C_isspace(ch
); });
343 if (i
== s
.end() || *i
== '#') {
344 // Blank line or comment.
348 if (!C_isalnum(*i
)) {
349 report_location(DIAG_ERROR
, filename
, line_no
, i
- s
.begin());
350 cerr
<< "field name must start with alphanumeric\n";
352 j
= find_if(i
+ 1, s
.end(),
353 [](char ch
) { return !C_isalnum(ch
) && ch
!= '_'; });
354 fields
.push_back(string(i
, j
));
355 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
356 if (i
== s
.end()) break;
359 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
363 report_location(DIAG_ERROR
, filename
, line_no
, i
- s
.begin());
364 cerr
<< "bad character '" << *i
<< "' in field name\n";
366 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
367 if (i
== s
.end()) break;
370 Xapian::termcount weight
= 1;
371 size_t useless_weight_pos
= string::npos
;
372 map
<string
, Action::type
> boolmap
;
374 while (j
!= s
.end()) {
375 size_t action_pos
= j
- s
.begin();
376 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isalnum(ch
); });
377 string
action(s
, j
- s
.begin(), i
- j
);
378 Action::type code
= Action::BAD
;
379 unsigned min_args
= 0, max_args
= 0;
380 bool takes_integer_argument
= false;
381 if (!action
.empty()) {
384 if (action
== "boolean") {
385 code
= Action::BOOLEAN
;
390 if (action
== "date") {
392 min_args
= max_args
= 1;
396 if (action
== "field") {
397 code
= Action::FIELD
;
402 if (action
== "gap") {
405 takes_integer_argument
= true;
409 if (action
== "hash") {
412 takes_integer_argument
= true;
413 } else if (action
== "hextobin") {
414 code
= Action::HEXTOBIN
;
418 if (action
== "index") {
419 code
= Action::INDEX
;
421 } else if (action
== "indexnopos") {
422 code
= Action::INDEXNOPOS
;
427 if (action
== "lower") {
428 code
= Action::LOWER
;
429 } else if (action
== "load") {
431 } else if (action
== "ltrim") {
432 code
= Action::LTRIM
;
437 if (action
== "parsedate") {
438 code
= Action::PARSEDATE
;
439 min_args
= max_args
= 1;
443 if (action
== "rtrim") {
444 code
= Action::RTRIM
;
449 if (action
== "spell") {
450 code
= Action::SPELL
;
451 } else if (action
== "split") {
452 code
= Action::SPLIT
;
455 } else if (action
== "squash") {
456 code
= Action::SQUASH
;
461 if (action
== "truncate") {
462 code
= Action::TRUNCATE
;
463 min_args
= max_args
= 1;
464 takes_integer_argument
= true;
465 } else if (action
== "trim") {
471 if (action
== "unhtml") {
472 code
= Action::UNHTML
;
473 } else if (action
== "unique") {
474 code
= Action::UNIQUE
;
477 } else if (action
== "unxml") {
478 code
= Action::UNXML
;
482 if (action
== "value") {
483 code
= Action::VALUE
;
484 min_args
= max_args
= 1;
485 takes_integer_argument
= true;
486 } else if (action
== "valuenumeric") {
487 code
= Action::VALUENUMERIC
;
488 min_args
= max_args
= 1;
489 takes_integer_argument
= true;
490 } else if (action
== "valuepacked") {
491 code
= Action::VALUEPACKED
;
492 min_args
= max_args
= 1;
493 takes_integer_argument
= true;
497 if (action
== "weight") {
498 code
= Action::WEIGHT
;
499 min_args
= max_args
= 1;
500 // Don't set takes_integer_argument since we parse
501 // it with parse_unsigned() and issue an error there
502 // - setting takes_integer_argument would give a
503 // double error for arguments with a decimal point.
508 if (code
== Action::BAD
) {
509 report_location(DIAG_ERROR
, filename
, line_no
, action_pos
);
510 if (action
.empty()) {
511 i
= find_if(i
, s
.end(), C_isspace
);
512 cerr
<< "Expected index action, found '"
513 << string(s
, j
- s
.begin(), i
- j
) << "'\n";
515 cerr
<< "Unknown index action '" << action
<< "'\n";
518 auto i_after_action
= i
;
519 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
521 if (i
!= s
.end() && *i
== '=') {
522 if (i
!= i_after_action
) {
523 report_location(DIAG_WARN
, filename
, line_no
,
524 i_after_action
- s
.begin());
525 cerr
<< "putting spaces between the action and '=' is "
530 report_location(DIAG_ERROR
, filename
, line_no
,
532 cerr
<< "Index action '" << action
533 << "' doesn't take an argument\n";
537 j
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
539 report_location(DIAG_WARN
, filename
, line_no
,
541 cerr
<< "putting spaces between '=' and the argument is "
547 if (j
!= s
.end() && *j
== '"') {
552 i
= find_if(j
, s
.end(),
554 return ch
== '"' || ch
== '\\';
557 report_location(DIAG_ERROR
, filename
, line_no
,
559 cerr
<< "No closing quote\n";
569 report_location(DIAG_ERROR
, filename
, line_no
,
571 cerr
<< "Bad escaping in quoted action "
597 if (!C_isxdigit(ch1
)) {
599 report_location(DIAG_ERROR
, filename
,
600 line_no
, i
- s
.begin());
601 cerr
<< "Bad hex digit in escaping\n";
608 if (!C_isxdigit(ch2
)) {
611 ch
= hex_decode(ch1
, ch2
);
615 report_location(DIAG_ERROR
, filename
,
616 line_no
, i
- s
.begin());
617 cerr
<< "Bad escape sequence '\\" << ch
624 vals
.emplace_back(std::move(arg
));
625 if (i
== s
.end() || C_isspace(*i
)) break;
629 report_location(DIAG_ERROR
, filename
, line_no
,
631 cerr
<< "Unexpected character '" << *i
632 << "' after closing quote\n";
635 } while (i
!= s
.end() && *i
!= ',' && !C_isspace(*i
));
636 if (*i
!= ',') break;
639 } else if (max_args
> 1) {
640 // Unquoted argument, split on comma.
641 i
= find_if(j
, s
.end(),
643 return C_isspace(ch
) || ch
== ',';
645 vals
.emplace_back(j
, i
);
646 if (*i
!= ',') break;
649 // Unquoted argument, including any commas.
650 i
= find_if(j
, s
.end(),
651 [](char ch
) { return C_isspace(ch
); });
652 vals
.emplace_back(j
, i
);
657 if (vals
.size() == max_args
) {
658 report_location(DIAG_ERROR
, filename
, line_no
,
660 cerr
<< "Index action '" << action
<< "' takes at most "
661 << max_args
<< " arguments\n";
665 if (vals
.size() < min_args
) {
666 report_location(DIAG_ERROR
, filename
, line_no
,
668 if (min_args
== max_args
) {
669 cerr
<< "Index action '" << action
<< "' requires "
670 << min_args
<< " arguments\n";
672 cerr
<< "Index action '" << action
<< "' requires "
673 "at least " << min_args
<< " arguments\n";
675 // Allow action handling code to assume there are min_args
677 vals
.resize(min_args
);
685 if (takes_integer_argument
) {
686 auto dot
= val
.find('.');
687 if (dot
!= string::npos
) {
688 report_location(DIAG_ERROR
, filename
, line_no
,
689 j
- s
.begin() + dot
);
690 cerr
<< "Index action '" << action
691 << "' takes an integer argument\n";
699 report_location(DIAG_ERROR
, filename
, line_no
,
701 cerr
<< "Invalid parameter '" << val
702 << "' for action 'date'\n";
704 actions
.emplace_back(code
, action_pos
, val
);
707 case Action::INDEXNOPOS
:
708 actions
.emplace_back(code
, action_pos
, val
, weight
);
709 useless_weight_pos
= string::npos
;
712 // We don't push an Action for WEIGHT - instead we
713 // store it ready to use in the INDEX and INDEXNOPOS
715 if (!parse_unsigned(val
.c_str(), weight
)) {
716 report_location(DIAG_ERROR
, filename
, line_no
,
718 cerr
<< "Index action 'weight' takes a "
719 "non-negative integer argument\n";
722 if (useless_weight_pos
!= string::npos
) {
723 report_useless_action(filename
, line_no
,
724 useless_weight_pos
, action
);
726 useless_weight_pos
= action_pos
;
728 case Action::PARSEDATE
: {
729 auto bad_code
= val
.find("%Z");
730 if (bad_code
!= val
.npos
) {
731 report_location(DIAG_ERROR
, filename
, line_no
,
732 j
- s
.begin() + bad_code
);
733 cerr
<< "Parsing timezone names with %Z is not "
736 bad_code
= val
.find("%z");
737 if (bad_code
!= val
.npos
&& !parsedate_supports_z()) {
738 report_location(DIAG_ERROR
, filename
, line_no
,
739 j
- s
.begin() + bad_code
);
740 cerr
<< "Parsing timezone offsets with %z is not "
741 "supported on this platform\n";
743 actions
.emplace_back(code
, action_pos
, val
);
746 case Action::SPLIT
: {
748 report_location(DIAG_ERROR
, filename
, line_no
,
750 cerr
<< "Split delimiter can't be empty\n";
752 int operation
= Action::SPLIT_NONE
;
753 if (vals
.size() >= 2) {
754 if (vals
[1] == "dedup") {
755 operation
= Action::SPLIT_DEDUP
;
756 } else if (vals
[1] == "sort") {
757 operation
= Action::SPLIT_SORT
;
758 } else if (vals
[1] == "none") {
759 operation
= Action::SPLIT_NONE
;
760 } else if (vals
[1] == "prefixes") {
761 operation
= Action::SPLIT_PREFIXES
;
763 // FIXME: Column should be for where the `op`
764 // parameter starts, which this isn't if the
765 // value is quoted, contains escape sequences,
767 report_location(DIAG_ERROR
, filename
, line_no
,
768 i
- s
.begin() - vals
[1].size());
769 cerr
<< "Bad split operation '" << vals
[1]
773 actions
.emplace_back(code
, action_pos
, val
, operation
);
776 case Action::TRUNCATE
:
777 if (!actions
.empty() &&
778 actions
.back().get_action() == Action::LOAD
) {
779 /* Turn "load truncate=n" into "load" with
780 * num_arg n, so that we don't needlessly
781 * allocate memory and read data we're just
787 actions
.emplace_back(code
, action_pos
, val
);
790 if (unique_line_no
) {
791 report_location(DIAG_ERROR
, filename
, line_no
,
793 cerr
<< "Index action 'unique' used more than "
795 report_location(DIAG_NOTE
, filename
,
796 unique_line_no
, unique_pos
);
797 cerr
<< "Previously used here\n";
799 unique_line_no
= line_no
;
800 unique_pos
= action_pos
;
801 if (boolmap
.find(val
) == boolmap
.end())
802 boolmap
[val
] = Action::UNIQUE
;
803 if (vals
.size() >= 2) {
804 if (vals
[1] == "missing=error") {
805 unique_missing
= UNIQUE_ERROR
;
806 } else if (vals
[1] == "missing=new") {
807 unique_missing
= UNIQUE_NEW
;
808 } else if (vals
[1] == "missing=warn+new") {
809 unique_missing
= UNIQUE_WARN_NEW
;
810 } else if (vals
[1] == "missing=skip") {
811 unique_missing
= UNIQUE_SKIP
;
812 } else if (vals
[1] == "missing=warn+skip") {
813 unique_missing
= UNIQUE_WARN_SKIP
;
815 report_location(DIAG_ERROR
, filename
, line_no
);
816 cerr
<< "Bad unique parameter '" << vals
[1]
820 actions
.emplace_back(code
, action_pos
, val
);
823 actions
.emplace_back(code
, action_pos
, val
);
824 auto& obj
= actions
.back();
825 auto gap_size
= obj
.get_num_arg();
827 report_location(DIAG_ERROR
, filename
, line_no
,
828 obj
.get_pos() + 3 + 1);
829 cerr
<< "Index action 'gap' takes a strictly "
830 "positive integer argument\n";
835 actions
.emplace_back(code
, action_pos
, val
);
836 auto& obj
= actions
.back();
837 auto max_length
= obj
.get_num_arg();
838 if (max_length
< 6) {
839 report_location(DIAG_ERROR
, filename
, line_no
,
840 obj
.get_pos() + 4 + 1);
841 cerr
<< "Index action 'hash' takes an integer "
842 "argument which must be at least 6\n";
850 for (unsigned char ch
: val
) {
852 auto column
= actions
.back().get_pos() +
853 strlen(action_names
[code
]) + 1;
854 report_location(DIAG_ERROR
, filename
, line_no
,
856 cerr
<< "Index action '" << action_names
[code
]
857 << "' only support ASCII characters "
861 actions
.emplace_back(code
, action_pos
, val
);
863 case Action::BOOLEAN
:
864 boolmap
[val
] = Action::BOOLEAN
;
867 actions
.emplace_back(code
, action_pos
, val
);
869 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
872 report_location(DIAG_ERROR
, filename
, line_no
,
873 i_after_action
- s
.begin());
874 if (min_args
== max_args
) {
875 cerr
<< "Index action '" << action
<< "' requires "
876 << min_args
<< " arguments\n";
878 cerr
<< "Index action '" << action
<< "' requires "
879 "at least " << min_args
<< " arguments\n";
884 case Action::INDEXNOPOS
:
885 useless_weight_pos
= string::npos
;
886 actions
.emplace_back(code
, action_pos
, "", weight
);
889 actions
.emplace_back(code
, action_pos
, "", 100);
892 actions
.emplace_back(code
, action_pos
, "",
893 MAX_SAFE_TERM_LENGTH
- 1);
899 actions
.emplace_back(code
, action_pos
, " \t\f\v\r\n");
902 actions
.emplace_back(code
, action_pos
);
909 if (useless_weight_pos
!= string::npos
) {
910 report_useless_action(filename
, line_no
, useless_weight_pos
,
914 while (!actions
.empty()) {
916 Action::type action
= actions
.back().get_action();
919 case Action::HEXTOBIN
:
922 case Action::PARSEDATE
:
927 case Action::TRUNCATE
:
931 report_useless_action(filename
, line_no
,
932 actions
.back().get_pos(),
933 action_names
[action
]);
942 map
<string
, Action::type
>::const_iterator boolpfx
;
943 for (boolpfx
= boolmap
.begin(); boolpfx
!= boolmap
.end(); ++boolpfx
) {
944 if (boolpfx
->second
== Action::UNIQUE
) {
945 report_location(DIAG_WARN
, filename
, unique_line_no
,
947 cerr
<< "Index action 'unique=" << boolpfx
->first
948 << "' without 'boolean=" << boolpfx
->first
<< "'\n";
949 static bool given_doesnt_imply_boolean_warning
= false;
950 if (!given_doesnt_imply_boolean_warning
) {
951 given_doesnt_imply_boolean_warning
= true;
952 report_location(DIAG_NOTE
, filename
, unique_line_no
,
954 cerr
<< "'unique' doesn't implicitly add a boolean term\n";
959 vector
<string
>::const_iterator field
;
960 for (field
= fields
.begin(); field
!= fields
.end(); ++field
) {
961 vector
<Action
> &v
= index_spec
[*field
];
963 if (fields
.size() == 1) {
964 // Optimise common case where there's only one fieldname
965 // for a list of actions.
966 v
= std::move(actions
);
971 v
.emplace_back(Action::NEW
, string::npos
);
972 v
.insert(v
.end(), actions
.begin(), actions
.end());
977 if (index_spec
.empty()) {
978 report_location(DIAG_ERROR
, filename
, line_no
);
979 cerr
<< "No rules found in index script\n";
986 index_spec_uses_unique
= (unique_line_no
> 0);
990 run_actions(vector
<Action
>::const_iterator action_it
,
991 vector
<Action
>::const_iterator action_end
,
992 Xapian::WritableDatabase
& database
,
993 Xapian::TermGenerator
& indexer
,
994 const string
& old_value
,
995 bool& this_field_is_content
, Xapian::Document
& doc
,
996 map
<string
, list
<string
>>& fields
,
997 string
& field
, const char* fname
,
998 size_t line_no
, Xapian::docid
& docid
)
1000 string value
= old_value
;
1001 while (action_it
!= action_end
) {
1002 auto& action
= *action_it
++;
1003 switch (action
.get_action()) {
1010 if (!value
.empty()) {
1011 string f
= action
.get_string_arg();
1012 if (f
.empty()) f
= field
;
1013 // replace newlines with spaces
1015 string::size_type j
= 0;
1016 while ((j
= s
.find('\n', j
)) != string::npos
)
1018 fields
[f
].push_back(s
);
1022 indexer
.index_text(value
,
1023 action
.get_num_arg(),
1024 action
.get_string_arg());
1026 case Action::INDEXNOPOS
:
1027 // No positional information so phrase searching won't work.
1028 // However, the database will use much less diskspace.
1029 indexer
.index_text_without_positions(value
,
1030 action
.get_num_arg(),
1031 action
.get_string_arg());
1033 case Action::BOOLEAN
: {
1034 // Do nothing if there's no text.
1035 if (value
.empty()) break;
1037 string term
= action
.get_string_arg();
1038 if (prefix_needs_colon(term
, value
[0])) term
+= ':';
1041 doc
.add_boolean_term(term
);
1045 indexer
.increase_termpos(action
.get_num_arg());
1047 case Action::HASH
: {
1048 unsigned int max_length
= action
.get_num_arg();
1049 if (value
.length() > max_length
)
1050 value
= hash_long_term(value
, max_length
);
1053 case Action::HEXTOBIN
: {
1054 size_t len
= value
.length();
1056 report_location(DIAG_ERROR
, fname
, line_no
);
1057 cerr
<< "hextobin: input must have even length\n";
1062 output
.reserve(len
/ 2);
1063 for (size_t j
= 0; j
< len
; j
+= 2) {
1065 char b
= value
[j
+ 1];
1066 if (!C_isxdigit(a
) || !C_isxdigit(b
)) {
1067 report_location(DIAG_ERROR
, fname
, line_no
);
1068 cerr
<< "hextobin: input must be all hex digits\n";
1071 char r
= hex_decode(a
, b
);
1072 output
.push_back(r
);
1074 value
= std::move(output
);
1078 value
= Xapian::Unicode::tolower(value
);
1081 ltrim(value
, action
.get_string_arg());
1084 rtrim(value
, action
.get_string_arg());
1087 rtrim(value
, action
.get_string_arg());
1088 ltrim(value
, action
.get_string_arg());
1090 case Action::SQUASH
:
1091 squash(value
, action
.get_string_arg());
1093 case Action::LOAD
: {
1094 // If there's no input, just issue a warning.
1095 if (value
.empty()) {
1096 report_location(DIAG_WARN
, fname
, line_no
);
1097 cerr
<< "Empty filename in LOAD action\n";
1100 bool truncated
= false;
1101 string filename
= std::move(value
);
1102 // FIXME: Use NOATIME if we own the file or are root.
1103 if (!load_file(filename
, action
.get_num_arg(), NOCACHE
,
1104 value
, truncated
)) {
1105 report_location(DIAG_ERROR
, fname
, line_no
);
1106 cerr
<< "Couldn't load file '" << filename
<< "': "
1107 << strerror(errno
) << '\n';
1110 if (!truncated
) break;
1113 case Action::TRUNCATE
:
1114 utf8_truncate(value
, action
.get_num_arg());
1117 indexer
.set_flags(indexer
.FLAG_SPELLING
);
1119 case Action::SPLIT
: {
1120 // Find the end of the actions which split should execute.
1121 auto split_end
= find(action_it
, action_end
, Action::NEW
);
1123 int split_type
= action
.get_num_arg();
1124 if (value
.empty()) {
1126 } else if (split_type
!= Action::SPLIT_SORT
) {
1127 // Generate split as we consume it.
1128 const string
& delimiter
= action
.get_string_arg();
1130 unique_ptr
<unordered_set
<string
>> seen
;
1131 if (split_type
== Action::SPLIT_DEDUP
) {
1132 seen
.reset(new unordered_set
<string
>);
1135 if (delimiter
.size() == 1) {
1136 // Special case for common single character delimiter.
1137 char ch
= delimiter
[0];
1138 string::size_type i
= 0;
1140 string::size_type j
= value
.find(ch
, i
);
1141 if (split_type
== Action::SPLIT_PREFIXES
) {
1143 string
val(value
, 0, j
);
1144 run_actions(action_it
, split_end
,
1147 this_field_is_content
, doc
,
1149 field
, fname
, line_no
,
1152 } else if (i
!= j
) {
1153 string
val(value
, i
, j
- i
);
1154 if (!seen
.get() || seen
->insert(val
).second
) {
1155 run_actions(action_it
, split_end
,
1158 this_field_is_content
, doc
,
1160 field
, fname
, line_no
,
1164 if (j
== string::npos
) break;
1168 string::size_type i
= 0;
1170 string::size_type j
= value
.find(delimiter
, i
);
1171 if (split_type
== Action::SPLIT_PREFIXES
) {
1173 string
val(value
, 0, j
);
1174 run_actions(action_it
, split_end
,
1177 this_field_is_content
, doc
,
1179 field
, fname
, line_no
,
1182 } else if (i
!= j
) {
1183 string
val(value
, i
, j
- i
);
1184 if (!seen
.get() || seen
->insert(val
).second
) {
1185 run_actions(action_it
, split_end
,
1188 this_field_is_content
, doc
,
1190 field
, fname
, line_no
,
1194 if (j
== string::npos
) break;
1195 i
= j
+ delimiter
.size();
1199 vector
<string
> split_values
;
1200 const string
& delimiter
= action
.get_string_arg();
1201 if (delimiter
.size() == 1) {
1202 // Special case for common single character delimiter.
1203 char ch
= delimiter
[0];
1204 string::size_type i
= 0;
1206 string::size_type j
= value
.find(ch
, i
);
1208 split_values
.emplace_back(value
, i
, j
- i
);
1210 if (j
== string::npos
) break;
1214 string::size_type i
= 0;
1216 string::size_type j
= value
.find(delimiter
, i
);
1218 split_values
.emplace_back(value
, i
, j
- i
);
1220 if (j
== string::npos
) break;
1221 i
= j
+ delimiter
.size();
1225 sort(split_values
.begin(), split_values
.end());
1227 for (auto&& val
: split_values
) {
1228 run_actions(action_it
, split_end
,
1229 database
, indexer
, val
,
1230 this_field_is_content
, doc
, fields
,
1231 field
, fname
, line_no
,
1236 action_it
= split_end
;
1239 case Action::UNHTML
: {
1242 // Default HTML character set is latin 1, though
1243 // not specifying one is deprecated these days.
1244 p
.parse(value
, "iso-8859-1", false);
1245 } catch (const string
& newcharset
) {
1247 p
.parse(value
, newcharset
, true);
1249 if (p
.indexing_allowed
)
1255 case Action::UNXML
: {
1258 value
= std::move(p
.dump
);
1261 case Action::UNIQUE
: {
1262 unique_unused
= false;
1264 if (value
.empty()) {
1265 enum diag_type diag
= DIAG_WARN
;
1266 switch (unique_missing
) {
1270 case UNIQUE_WARN_NEW
:
1271 case UNIQUE_WARN_SKIP
:
1272 report_location(diag
, fname
, line_no
);
1273 cerr
<< "UNIQUE action on empty text\n";
1277 switch (unique_missing
) {
1281 case UNIQUE_WARN_SKIP
:
1282 skipping_record
= true;
1285 case UNIQUE_WARN_NEW
:
1291 // Ensure that the value of this field is unique.
1292 // If a record already exists with the same value,
1293 // it will be replaced with the new record.
1295 // Unique fields aren't considered content - if
1296 // there are no other fields in the document, the
1297 // document is to be deleted.
1298 this_field_is_content
= false;
1300 // Argument is the prefix to add to the field value
1301 // to get the unique term.
1302 string t
= action
.get_string_arg();
1303 if (prefix_needs_colon(t
, value
[0])) t
+= ':';
1305 Xapian::PostingIterator p
= database
.postlist_begin(t
);
1306 if (p
!= database
.postlist_end(t
)) {
1313 doc
.add_value(action
.get_num_arg(), value
);
1315 case Action::VALUENUMERIC
: {
1316 if (value
.empty()) break;
1318 double dbl
= strtod(value
.c_str(), &end
);
1320 report_location(DIAG_WARN
, fname
, line_no
);
1321 cerr
<< "Trailing characters in VALUENUMERIC: '"
1324 doc
.add_value(action
.get_num_arg(),
1325 Xapian::sortable_serialise(dbl
));
1328 case Action::VALUEPACKED
: {
1330 if (value
.empty() || !C_isdigit(value
[0])) {
1331 // strtoul() accepts leading whitespace and negated
1332 // values, neither of which we want to allow.
1337 word
= strtoul(value
.c_str(), &q
, 10);
1338 if (!errno
&& *q
!= '\0') {
1339 // Trailing characters after converted value.
1344 report_location(DIAG_WARN
, fname
, line_no
);
1345 cerr
<< "valuepacked \"" << value
<< "\" ";
1346 if (errno
== ERANGE
) {
1347 cerr
<< "out of range\n";
1349 cerr
<< "not an unsigned integer\n";
1352 int valueslot
= action
.get_num_arg();
1353 doc
.add_value(valueslot
, int_to_binary_string(word
));
1356 case Action::DATE
: {
1357 // Do nothing for empty input.
1358 if (value
.empty()) break;
1360 const string
& type
= action
.get_string_arg();
1362 if (type
== "unix") {
1364 if (!parse_signed(value
.c_str(), t
)) {
1365 report_location(DIAG_WARN
, fname
, line_no
);
1366 cerr
<< "Date value (in secs) for action DATE "
1367 "must be an integer - ignoring\n";
1370 struct tm
*tm
= localtime(&t
);
1371 int y
= tm
->tm_year
+ 1900;
1372 int m
= tm
->tm_mon
+ 1;
1373 yyyymmdd
= date_to_string(y
, m
, tm
->tm_mday
);
1374 } else if (type
== "unixutc") {
1376 if (!parse_signed(value
.c_str(), t
)) {
1377 report_location(DIAG_WARN
, fname
, line_no
);
1378 cerr
<< "Date value (in secs) for action DATE "
1379 "must be an integer - ignoring\n";
1382 struct tm
*tm
= gmtime(&t
);
1383 int y
= tm
->tm_year
+ 1900;
1384 int m
= tm
->tm_mon
+ 1;
1385 yyyymmdd
= date_to_string(y
, m
, tm
->tm_mday
);
1386 } else if (type
== "yyyymmdd") {
1387 if (value
.length() != 8) {
1388 report_location(DIAG_WARN
, fname
, line_no
);
1389 cerr
<< "date=yyyymmdd expects an 8 character value "
1397 doc
.add_boolean_term("D" + yyyymmdd
);
1400 doc
.add_boolean_term("M" + yyyymmdd
);
1403 doc
.add_boolean_term("Y" + yyyymmdd
);
1406 case Action::PARSEDATE
: {
1407 string dateformat
= action
.get_string_arg();
1409 memset(&tm
, 0, sizeof(tm
));
1410 auto ret
= strptime(value
.c_str(), dateformat
.c_str(), &tm
);
1412 report_location(DIAG_WARN
, fname
, line_no
);
1413 cerr
<< "\"" << value
<< "\" doesn't match format "
1414 "\"" << dateformat
<< '\"' << '\n';
1419 report_location(DIAG_WARN
, fname
, line_no
);
1420 cerr
<< "\"" << value
<< "\" not fully matched by "
1421 "format \"" << dateformat
<< "\" "
1422 "(\"" << ret
<< "\" left over) but "
1423 "indexing anyway\n";
1425 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1426 auto gmtoff
= tm
.tm_gmtoff
;
1428 auto secs_since_epoch
= timegm(&tm
);
1429 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1430 secs_since_epoch
-= gmtoff
;
1432 value
= str(secs_since_epoch
);
1436 /* Empty default case to avoid "unhandled enum value"
1445 index_file(const char *fname
, istream
&stream
,
1446 Xapian::WritableDatabase
&database
, Xapian::TermGenerator
&indexer
)
1450 while (!stream
.eof() && getline_portable(stream
, line
)) {
1452 // Allow blank lines before the first record and multiple blank lines
1454 if (line
.empty()) continue;
1456 Xapian::Document doc
;
1457 indexer
.set_document(doc
);
1458 Xapian::docid docid
= 0;
1459 map
<string
, list
<string
>> fields
;
1460 bool seen_content
= false;
1461 skipping_record
= false;
1462 unique_unused
= index_spec_uses_unique
;
1463 while (!line
.empty()) {
1464 string::size_type eq
= line
.find('=');
1465 if (eq
== string::npos
&& !line
.empty()) {
1466 report_location(DIAG_ERROR
, fname
, line_no
);
1467 cerr
<< "Expected = somewhere in this line\n";
1470 string
field(line
, 0, eq
);
1471 string
value(line
, eq
+ 1, string::npos
);
1473 while (getline_portable(stream
, line
)) {
1475 if (line
.empty() || line
[0] != '=') break;
1476 // Replace the '=' with a '\n'.
1482 if (skipping_record
) continue;
1484 // Default to not indexing spellings.
1485 indexer
.set_flags(Xapian::TermGenerator::flags(0));
1487 bool this_field_is_content
= true;
1488 const vector
<Action
>& v
= index_spec
[field
];
1489 run_actions(v
.begin(), v
.end(),
1490 database
, indexer
, value
,
1491 this_field_is_content
, doc
, fields
,
1492 field
, fname
, line_no
,
1494 if (this_field_is_content
) seen_content
= true;
1497 if (unique_unused
) {
1498 enum diag_type diag
= DIAG_WARN
;
1499 switch (unique_missing
) {
1503 case UNIQUE_WARN_NEW
:
1504 case UNIQUE_WARN_SKIP
:
1505 report_location(diag
, fname
, line_no
);
1506 cerr
<< "UNIQUE action unused in this record\n";
1510 switch (unique_missing
) {
1514 case UNIQUE_WARN_SKIP
:
1515 skipping_record
= true;
1518 case UNIQUE_WARN_NEW
:
1523 if (skipping_record
) {
1525 } else if (!seen_content
) {
1526 // We haven't seen any fields (other than unique identifiers)
1527 // so the document is to be deleted.
1529 database
.delete_document(docid
);
1530 if (verbose
) cout
<< "Del: " << docid
<< '\n';
1535 for (auto&& i
: fields
) {
1536 for (auto&& field_val
: i
.second
) {
1544 // Put the data in the document
1547 // Add the document to the database
1549 database
.replace_document(docid
, doc
);
1550 if (verbose
) cout
<< "Replace: " << docid
<< '\n';
1553 docid
= database
.add_document(doc
);
1554 if (verbose
) cout
<< "Add: " << docid
<< '\n';
1560 // Commit after each file to make sure all changes from that file make it
1562 if (verbose
) cout
<< "Committing\n";
1568 show_help(int exit_code
)
1570 cout
<< PROG_NAME
" - " PROG_DESC
"\n"
1571 "Usage: " PROG_NAME
" [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1573 "Creates or updates a Xapian database with the data from the input files listed\n"
1574 "on the command line. If no files are specified, data is read from stdin.\n"
1576 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1577 "format for INDEXER_SCRIPT.\n"
1580 " -v, --verbose display additional messages to aid debugging\n"
1581 " --overwrite create the database anew (the default is to update if\n"
1582 " the database already exists)\n";
1583 print_stemmer_help("");
1584 print_help_and_version_help("");
1589 main(int argc
, char **argv
)
1591 // If the database already exists, default to updating not overwriting.
1592 int database_mode
= Xapian::DB_CREATE_OR_OPEN
;
1594 Xapian::Stem
stemmer("english");
1596 // Without this, strptime() seems to treat formats without a timezone as
1597 // being local time, including %s.
1598 setenv("TZ", "UTC", 1);
1600 constexpr auto NO_ARG
= no_argument
;
1601 constexpr auto REQ_ARG
= required_argument
;
1602 static const struct option longopts
[] = {
1603 { "help", NO_ARG
, NULL
, 'h' },
1604 { "version", NO_ARG
, NULL
, 'V' },
1605 { "stemmer", REQ_ARG
, NULL
, 's' },
1606 { "overwrite", NO_ARG
, NULL
, 'o' },
1607 { "verbose", NO_ARG
, NULL
, 'v' },
1612 while ((getopt_ret
= gnu_getopt_long(argc
, argv
, "vs:hV",
1613 longopts
, NULL
)) != -1) {
1614 switch (getopt_ret
) {
1621 case 'V': // --version
1622 print_package_info(PROG_NAME
);
1624 case 'o': // --overwrite
1625 database_mode
= Xapian::DB_CREATE_OR_OVERWRITE
;
1632 stemmer
= Xapian::Stem(optarg
);
1633 } catch (const Xapian::InvalidArgumentError
&) {
1634 cerr
<< "Unknown stemming language '" << optarg
<< "'.\n";
1635 cerr
<< "Available language names are: "
1636 << Xapian::Stem::get_available_languages() << '\n';
1649 parse_index_script(argv
[1]);
1651 // Open the database. If another process is currently updating the
1652 // database, wait for the lock to become available.
1653 auto flags
= database_mode
| Xapian::DB_RETRY_LOCK
;
1654 Xapian::WritableDatabase
database(argv
[0], flags
);
1656 Xapian::TermGenerator indexer
;
1657 indexer
.set_stemmer(stemmer
);
1658 // Set the database for spellings to be added to by the "spell" action.
1659 indexer
.set_database(database
);
1668 index_file("<stdin>", cin
, database
, indexer
);
1670 // Read file(s) listed on the command line.
1671 for (int i
= 2; i
< argc
; ++i
) {
1672 ifstream
stream(argv
[i
]);
1674 index_file(argv
[i
], stream
, database
, indexer
);
1676 cerr
<< "Can't open file " << argv
[i
] << '\n';
1681 cout
<< "records (added, replaced, deleted, skipped) = ("
1685 << skipcount
<< ")\n";
1686 } catch (const Xapian::Error
&error
) {
1687 cerr
<< "Exception: " << error
.get_description() << '\n';
1689 } catch (const std::bad_alloc
&) {
1690 cerr
<< "Exception: std::bad_alloc\n";
1693 cerr
<< "Unknown Exception\n";