2 * @brief index arbitrary data as described by an index script
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 Sam Liddicott
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2022 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
36 #include <unordered_set>
45 #include "commonhelp.h"
48 #include "myhtmlparse.h"
52 #include "stringutils.h"
54 #include "utf8truncate.h"
59 #include "portability/strptime.h"
62 #include "gnu_getopt.h"
66 #define PROG_NAME "scriptindex"
67 #define PROG_DESC "index arbitrary data as described by an index script"
75 /** What to do if there's a UNIQUE action but a record doesn't use it.
83 } unique_missing
= UNIQUE_WARN_NEW
;
85 /// Track if UNIQUE action is unused in the current record.
86 static bool unique_unused
;
88 /// Track if the current record is being skipped.
89 static bool skipping_record
= false;
92 prefix_needs_colon(const string
& prefix
, unsigned ch
)
94 if (!C_isupper(ch
) && ch
!= ':') return false;
95 string::size_type len
= prefix
.length();
96 return (len
> 1 && prefix
[len
- 1] != ':');
99 const char * action_names
[] = {
100 // Actions used internally:
131 #define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")\n"
136 // Actions used internally:
165 enum { SPLIT_NONE
, SPLIT_DEDUP
, SPLIT_SORT
, SPLIT_PREFIXES
};
170 // Offset into indexscript line.
173 Action(type action_
, size_t pos_
)
174 : action(action_
), num_arg(0), pos(pos_
) { }
175 Action(type action_
, size_t pos_
, const string
& arg
)
176 : action(action_
), string_arg(arg
), pos(pos_
) {
177 num_arg
= atoi(string_arg
.c_str());
179 Action(type action_
, size_t pos_
, const string
& arg
, int num
)
180 : action(action_
), num_arg(num
), string_arg(arg
), pos(pos_
) { }
181 type
get_action() const { return action
; }
182 int get_num_arg() const { return num_arg
; }
183 void set_num_arg(int num
) { num_arg
= num
; }
184 const string
& get_string_arg() const { return string_arg
; }
185 size_t get_pos() const { return pos
; }
188 // These allow searching for an Action with a particular Action::type using
192 operator==(const Action
& a
, Action::type t
) { return a
.get_action() == t
; }
195 operator==(Action::type t
, const Action
& a
) { return a
.get_action() == t
; }
198 operator!=(const Action
& a
, Action::type t
) { return !(a
== t
); }
201 operator!=(Action::type t
, const Action
& a
) { return !(t
== a
); }
/** Remove leading characters from a string, in place.
 *
 *  @param s      The string to modify.
 *  @param chars  The set of characters to strip from the start of @a s.
 *
 *  If @a s consists only of characters from @a chars it is left empty.
 */
static void
ltrim(std::string& s, const std::string& chars)
{
    const std::string::size_type first_kept = s.find_first_not_of(chars);
    // When nothing is kept first_kept is npos, and erase(0, npos) removes
    // the whole string, so that case needs no special handling.
    if (first_kept != 0) s.erase(0, first_kept);
}
/** Remove trailing characters from a string, in place.
 *
 *  @param s      The string to modify.
 *  @param chars  The set of characters to strip from the end of @a s.
 *
 *  If @a s consists only of characters from @a chars it is left empty.
 */
static void
rtrim(std::string& s, const std::string& chars)
{
    // find_last_not_of() returns npos when no character is kept; adding one
    // wraps npos round to zero, which truncates the string to empty.
    const std::string::size_type last_kept = s.find_last_not_of(chars);
    s.resize(last_kept + 1);
}
/** Collapse runs of separator characters in a string, in place.
 *
 *  @param s      The string to modify.
 *  @param chars  The set of characters treated as separators.
 *
 *  Leading and trailing runs of characters from @a chars are removed, and
 *  each interior run is replaced by a single space.
 */
static void
squash(std::string& s, const std::string& chars)
{
    std::string output;
    output.reserve(s.size());
    std::string::size_type i = 0;
    // Each iteration copies one maximal run of non-separator characters.
    while ((i = s.find_first_not_of(chars, i)) != std::string::npos) {
        const std::string::size_type j = s.find_first_of(chars, i);
        if (!output.empty()) output += ' ';
        // If j is npos the count is clamped to the rest of the string.
        output.append(s, i, j - i);
        // Resume after this run; if j is npos the next search also returns
        // npos, which ends the loop.
        i = j;
    }
    s = std::move(output);
}
231 enum diag_type
{ DIAG_ERROR
, DIAG_WARN
, DIAG_NOTE
};
233 static unsigned error_count
= 0;
236 report_location(enum diag_type type
,
237 const string
& filename
,
239 size_t pos
= string::npos
)
244 if (pos
!= string::npos
) {
245 // The first column is numbered 1.
246 cerr
<< ':' << pos
+ 1;
255 cerr
<< ": warning: ";
264 report_useless_action(const string
&file
, size_t line
, size_t pos
,
265 const string
&action
)
267 report_location(DIAG_WARN
, file
, line
, pos
);
268 cerr
<< "Index action '" << action
<< "' has no effect\n";
270 static bool given_left_to_right_warning
= false;
271 if (!given_left_to_right_warning
) {
272 given_left_to_right_warning
= true;
273 report_location(DIAG_NOTE
, file
, line
, pos
);
274 cerr
<< "Actions are executed from left to right\n";
278 static bool index_spec_uses_unique
= false;
280 static map
<string
, vector
<Action
>> index_spec
;
/** Like std::getline() but handle \r\n line endings too.
 *
 *  @param stream  Stream to read a line from.
 *  @param line    Receives the line, with any trailing \r characters removed.
 *
 *  @return @a stream, so the call can be used directly as a loop condition.
 */
static std::istream&
getline_portable(std::istream& stream, std::string& line)
{
    std::istream& result = std::getline(stream, line);
    // Strip every trailing \r rather than just one, so a line ending with
    // several \r characters before the \n is handled too.  If there is no
    // other character, npos + 1 wraps to zero and the line becomes empty.
    line.resize(line.find_last_not_of('\r') + 1);
    return result;
}
294 parse_index_script(const string
&filename
)
296 ifstream
script(filename
.c_str());
297 if (!script
.is_open()) {
298 report_location(DIAG_ERROR
, filename
);
299 cerr
<< strerror(errno
) << '\n';
304 // Line number where we saw a `unique` action, or 0 if we haven't.
305 int unique_line_no
= 0;
306 // Offset into line unique_line_no where the `unique` action was.
307 size_t unique_pos
= 0;
308 while (getline(script
, line
)) {
310 vector
<string
> fields
;
311 vector
<Action
> actions
;
312 string::const_iterator i
, j
;
313 const string
&s
= line
;
314 i
= find_if(s
.begin(), s
.end(), [](char ch
) { return !C_isspace(ch
); });
315 if (i
== s
.end() || *i
== '#') {
316 // Blank line or comment.
320 if (!C_isalnum(*i
)) {
321 report_location(DIAG_ERROR
, filename
, line_no
, i
- s
.begin());
322 cerr
<< "field name must start with alphanumeric\n";
324 j
= find_if(i
+ 1, s
.end(),
325 [](char ch
) { return !C_isalnum(ch
) && ch
!= '_'; });
326 fields
.push_back(string(i
, j
));
327 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
328 if (i
== s
.end()) break;
331 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
335 report_location(DIAG_ERROR
, filename
, line_no
, i
- s
.begin());
336 cerr
<< "bad character '" << *i
<< "' in field name\n";
338 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
339 if (i
== s
.end()) break;
342 Xapian::termcount weight
= 1;
343 size_t useless_weight_pos
= string::npos
;
344 map
<string
, Action::type
> boolmap
;
346 while (j
!= s
.end()) {
347 size_t action_pos
= j
- s
.begin();
348 i
= find_if(j
, s
.end(), [](char ch
) { return !C_isalnum(ch
); });
349 string
action(s
, j
- s
.begin(), i
- j
);
350 Action::type code
= Action::BAD
;
351 unsigned min_args
= 0, max_args
= 0;
352 bool takes_integer_argument
= false;
353 if (!action
.empty()) {
356 if (action
== "boolean") {
357 code
= Action::BOOLEAN
;
362 if (action
== "date") {
364 min_args
= max_args
= 1;
368 if (action
== "field") {
369 code
= Action::FIELD
;
374 if (action
== "gap") {
377 takes_integer_argument
= true;
381 if (action
== "hash") {
384 takes_integer_argument
= true;
385 } else if (action
== "hextobin") {
386 code
= Action::HEXTOBIN
;
390 if (action
== "index") {
391 code
= Action::INDEX
;
393 } else if (action
== "indexnopos") {
394 code
= Action::INDEXNOPOS
;
399 if (action
== "lower") {
400 code
= Action::LOWER
;
401 } else if (action
== "load") {
403 } else if (action
== "ltrim") {
404 code
= Action::LTRIM
;
409 if (action
== "parsedate") {
410 code
= Action::PARSEDATE
;
411 min_args
= max_args
= 1;
415 if (action
== "rtrim") {
416 code
= Action::RTRIM
;
421 if (action
== "spell") {
422 code
= Action::SPELL
;
423 } else if (action
== "split") {
424 code
= Action::SPLIT
;
427 } else if (action
== "squash") {
428 code
= Action::SQUASH
;
433 if (action
== "truncate") {
434 code
= Action::TRUNCATE
;
435 min_args
= max_args
= 1;
436 takes_integer_argument
= true;
437 } else if (action
== "trim") {
443 if (action
== "unhtml") {
444 code
= Action::UNHTML
;
445 } else if (action
== "unique") {
446 code
= Action::UNIQUE
;
452 if (action
== "value") {
453 code
= Action::VALUE
;
454 min_args
= max_args
= 1;
455 takes_integer_argument
= true;
456 } else if (action
== "valuenumeric") {
457 code
= Action::VALUENUMERIC
;
458 min_args
= max_args
= 1;
459 takes_integer_argument
= true;
460 } else if (action
== "valuepacked") {
461 code
= Action::VALUEPACKED
;
462 min_args
= max_args
= 1;
463 takes_integer_argument
= true;
467 if (action
== "weight") {
468 code
= Action::WEIGHT
;
469 min_args
= max_args
= 1;
470 // Don't set takes_integer_argument since we parse
471 // it with parse_unsigned() and issue an error there
472 // - setting takes_integer_argument would give a
473 // double error for arguments with a decimal point.
478 if (code
== Action::BAD
) {
479 report_location(DIAG_ERROR
, filename
, line_no
, action_pos
);
480 cerr
<< "Unknown index action '" << action
<< "'\n";
482 auto i_after_action
= i
;
483 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
485 if (i
!= s
.end() && *i
== '=') {
486 if (i
!= i_after_action
) {
487 report_location(DIAG_WARN
, filename
, line_no
,
488 i_after_action
- s
.begin());
489 cerr
<< "putting spaces between the action and '=' is "
494 report_location(DIAG_ERROR
, filename
, line_no
,
496 cerr
<< "Index action '" << action
497 << "' doesn't take an argument\n";
501 j
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
503 report_location(DIAG_WARN
, filename
, line_no
,
505 cerr
<< "putting spaces between '=' and the argument is "
511 if (j
!= s
.end() && *j
== '"') {
516 i
= find_if(j
, s
.end(),
518 return ch
== '"' || ch
== '\\';
521 report_location(DIAG_ERROR
, filename
, line_no
,
523 cerr
<< "No closing quote\n";
533 report_location(DIAG_ERROR
, filename
, line_no
,
535 cerr
<< "Bad escaping in quoted action "
561 if (!C_isxdigit(ch1
)) {
563 report_location(DIAG_ERROR
, filename
,
564 line_no
, i
- s
.begin());
565 cerr
<< "Bad hex digit in escaping\n";
572 if (!C_isxdigit(ch2
)) {
575 ch
= hex_digit(ch1
) << 4 |
580 report_location(DIAG_ERROR
, filename
,
581 line_no
, i
- s
.begin());
582 cerr
<< "Bad escape sequence '\\" << ch
589 vals
.emplace_back(std::move(arg
));
590 if (i
== s
.end() || C_isspace(*i
)) break;
594 report_location(DIAG_ERROR
, filename
, line_no
,
596 cerr
<< "Unexpected character '" << *i
597 << "' after closing quote\n";
600 } while (i
!= s
.end() && *i
!= ',' && !C_isspace(*i
));
601 if (*i
!= ',') break;
604 } else if (max_args
> 1) {
605 // Unquoted argument, split on comma.
606 i
= find_if(j
, s
.end(),
608 return C_isspace(ch
) || ch
== ',';
610 vals
.emplace_back(j
, i
);
611 if (*i
!= ',') break;
614 // Unquoted argument, including any commas.
615 i
= find_if(j
, s
.end(),
616 [](char ch
) { return C_isspace(ch
); });
617 vals
.emplace_back(j
, i
);
622 if (vals
.size() == max_args
) {
623 report_location(DIAG_ERROR
, filename
, line_no
,
625 cerr
<< "Index action '" << action
<< "' takes at most "
626 << max_args
<< " arguments\n";
630 if (vals
.size() < min_args
) {
631 report_location(DIAG_ERROR
, filename
, line_no
,
633 if (min_args
== max_args
) {
634 cerr
<< "Index action '" << action
<< "' requires "
635 << min_args
<< " arguments\n";
637 cerr
<< "Index action '" << action
<< "' requires "
638 "at least " << min_args
<< " arguments\n";
640 // Allow action handling code to assume there are min_args
642 vals
.resize(min_args
);
650 if (takes_integer_argument
) {
651 auto dot
= val
.find('.');
652 if (dot
!= string::npos
) {
653 report_location(DIAG_ERROR
, filename
, line_no
,
654 j
- s
.begin() + dot
);
655 cerr
<< "Index action '" << action
656 << "' takes an integer argument\n";
664 report_location(DIAG_ERROR
, filename
, line_no
,
666 cerr
<< "Invalid parameter '" << val
667 << "' for action 'date'\n";
669 actions
.emplace_back(code
, action_pos
, val
);
672 case Action::INDEXNOPOS
:
673 actions
.emplace_back(code
, action_pos
, val
, weight
);
674 useless_weight_pos
= string::npos
;
677 // We don't push an Action for WEIGHT - instead we
678 // store it ready to use in the INDEX and INDEXNOPOS
680 if (!parse_unsigned(val
.c_str(), weight
)) {
681 report_location(DIAG_ERROR
, filename
, line_no
,
683 cerr
<< "Index action 'weight' takes a "
684 "non-negative integer argument\n";
687 if (useless_weight_pos
!= string::npos
) {
688 report_useless_action(filename
, line_no
,
689 useless_weight_pos
, action
);
691 useless_weight_pos
= action_pos
;
693 case Action::PARSEDATE
: {
694 auto bad_code
= val
.find("%Z");
695 if (bad_code
!= val
.npos
) {
696 report_location(DIAG_ERROR
, filename
, line_no
,
697 j
- s
.begin() + bad_code
);
698 cerr
<< "Parsing timezone names with %Z is not "
701 #ifndef HAVE_STRUCT_TM_TM_GMTOFF
702 bad_code
= val
.find("%z");
703 if (bad_code
!= val
.npos
) {
704 report_location(DIAG_ERROR
, filename
, line_no
,
705 j
- s
.begin() + bad_code
);
706 cerr
<< "Parsing timezone offsets with %z is not "
707 "supported on this platform\n";
710 actions
.emplace_back(code
, action_pos
, val
);
713 case Action::SPLIT
: {
715 report_location(DIAG_ERROR
, filename
, line_no
,
717 cerr
<< "Split delimiter can't be empty\n";
719 int operation
= Action::SPLIT_NONE
;
720 if (vals
.size() >= 2) {
721 if (vals
[1] == "dedup") {
722 operation
= Action::SPLIT_DEDUP
;
723 } else if (vals
[1] == "sort") {
724 operation
= Action::SPLIT_SORT
;
725 } else if (vals
[1] == "none") {
726 operation
= Action::SPLIT_NONE
;
727 } else if (vals
[1] == "prefixes") {
728 operation
= Action::SPLIT_PREFIXES
;
730 // FIXME: Column should be for where the `op`
731 // parameter starts, which this isn't if the
732 // value is quoted, contains escape sequences,
734 report_location(DIAG_ERROR
, filename
, line_no
,
735 i
- s
.begin() - vals
[1].size());
736 cerr
<< "Bad split operation '" << vals
[1]
740 actions
.emplace_back(code
, action_pos
, val
, operation
);
743 case Action::TRUNCATE
:
744 if (!actions
.empty() &&
745 actions
.back().get_action() == Action::LOAD
) {
746 /* Turn "load truncate=n" into "load" with
747 * num_arg n, so that we don't needlessly
748 * allocate memory and read data we're just
754 actions
.emplace_back(code
, action_pos
, val
);
757 if (unique_line_no
) {
758 report_location(DIAG_ERROR
, filename
, line_no
,
760 cerr
<< "Index action 'unique' used more than "
762 report_location(DIAG_NOTE
, filename
,
763 unique_line_no
, unique_pos
);
764 cerr
<< "Previously used here\n";
766 unique_line_no
= line_no
;
767 unique_pos
= action_pos
;
768 if (boolmap
.find(val
) == boolmap
.end())
769 boolmap
[val
] = Action::UNIQUE
;
770 if (vals
.size() >= 2) {
771 if (vals
[1] == "missing=error") {
772 unique_missing
= UNIQUE_ERROR
;
773 } else if (vals
[1] == "missing=new") {
774 unique_missing
= UNIQUE_NEW
;
775 } else if (vals
[1] == "missing=warn+new") {
776 unique_missing
= UNIQUE_WARN_NEW
;
777 } else if (vals
[1] == "missing=skip") {
778 unique_missing
= UNIQUE_SKIP
;
779 } else if (vals
[1] == "missing=warn+skip") {
780 unique_missing
= UNIQUE_WARN_SKIP
;
782 report_location(DIAG_ERROR
, filename
, line_no
);
783 cerr
<< "Bad unique parameter '" << vals
[1]
787 actions
.emplace_back(code
, action_pos
, val
);
790 actions
.emplace_back(code
, action_pos
, val
);
791 auto& obj
= actions
.back();
792 auto gap_size
= obj
.get_num_arg();
794 report_location(DIAG_ERROR
, filename
, line_no
,
795 obj
.get_pos() + 3 + 1);
796 cerr
<< "Index action 'gap' takes a strictly "
797 "positive integer argument\n";
802 actions
.emplace_back(code
, action_pos
, val
);
803 auto& obj
= actions
.back();
804 auto max_length
= obj
.get_num_arg();
805 if (max_length
< 6) {
806 report_location(DIAG_ERROR
, filename
, line_no
,
807 obj
.get_pos() + 4 + 1);
808 cerr
<< "Index action 'hash' takes an integer "
809 "argument which must be at least 6\n";
817 for (unsigned char ch
: val
) {
819 auto column
= actions
.back().get_pos() +
820 strlen(action_names
[code
]) + 1;
821 report_location(DIAG_ERROR
, filename
, line_no
,
823 cerr
<< "Index action '" << action_names
[code
]
824 << "' only support ASCII characters "
828 actions
.emplace_back(code
, action_pos
, val
);
830 case Action::BOOLEAN
:
831 boolmap
[val
] = Action::BOOLEAN
;
834 actions
.emplace_back(code
, action_pos
, val
);
836 i
= find_if(i
, s
.end(), [](char ch
) { return !C_isspace(ch
); });
839 report_location(DIAG_ERROR
, filename
, line_no
,
840 i_after_action
- s
.begin());
841 if (min_args
== max_args
) {
842 cerr
<< "Index action '" << action
<< "' requires "
843 << min_args
<< " arguments\n";
845 cerr
<< "Index action '" << action
<< "' requires "
846 "at least " << min_args
<< " arguments\n";
851 case Action::INDEXNOPOS
:
852 useless_weight_pos
= string::npos
;
853 actions
.emplace_back(code
, action_pos
, "", weight
);
856 actions
.emplace_back(code
, action_pos
, "", 100);
859 actions
.emplace_back(code
, action_pos
, "",
860 MAX_SAFE_TERM_LENGTH
- 1);
866 actions
.emplace_back(code
, action_pos
, " \t\f\v\r\n");
869 actions
.emplace_back(code
, action_pos
);
876 if (useless_weight_pos
!= string::npos
) {
877 report_useless_action(filename
, line_no
, useless_weight_pos
,
881 while (!actions
.empty()) {
883 Action::type action
= actions
.back().get_action();
886 case Action::HEXTOBIN
:
889 case Action::PARSEDATE
:
894 case Action::TRUNCATE
:
897 report_useless_action(filename
, line_no
,
898 actions
.back().get_pos(),
899 action_names
[action
]);
908 map
<string
, Action::type
>::const_iterator boolpfx
;
909 for (boolpfx
= boolmap
.begin(); boolpfx
!= boolmap
.end(); ++boolpfx
) {
910 if (boolpfx
->second
== Action::UNIQUE
) {
911 report_location(DIAG_WARN
, filename
, unique_line_no
,
913 cerr
<< "Index action 'unique=" << boolpfx
->first
914 << "' without 'boolean=" << boolpfx
->first
<< "'\n";
915 static bool given_doesnt_imply_boolean_warning
= false;
916 if (!given_doesnt_imply_boolean_warning
) {
917 given_doesnt_imply_boolean_warning
= true;
918 report_location(DIAG_NOTE
, filename
, unique_line_no
,
920 cerr
<< "'unique' doesn't implicitly add a boolean term\n";
925 vector
<string
>::const_iterator field
;
926 for (field
= fields
.begin(); field
!= fields
.end(); ++field
) {
927 vector
<Action
> &v
= index_spec
[*field
];
929 if (fields
.size() == 1) {
930 // Optimise common case where there's only one fieldname
931 // for a list of actions.
932 v
= std::move(actions
);
937 v
.emplace_back(Action::NEW
, string::npos
);
938 v
.insert(v
.end(), actions
.begin(), actions
.end());
943 if (index_spec
.empty()) {
944 report_location(DIAG_ERROR
, filename
, line_no
);
945 cerr
<< "No rules found in index script\n";
952 index_spec_uses_unique
= (unique_line_no
> 0);
956 run_actions(vector
<Action
>::const_iterator action_it
,
957 vector
<Action
>::const_iterator action_end
,
958 Xapian::WritableDatabase
& database
,
959 Xapian::TermGenerator
& indexer
,
960 const string
& old_value
,
961 bool& this_field_is_content
, Xapian::Document
& doc
,
962 map
<string
, list
<string
>>& fields
,
963 string
& field
, const char* fname
,
964 size_t line_no
, Xapian::docid
& docid
)
966 string value
= old_value
;
967 while (action_it
!= action_end
) {
968 auto& action
= *action_it
++;
969 switch (action
.get_action()) {
974 // We're processing the same field again - give it a reprieve.
975 this_field_is_content
= true;
978 if (!value
.empty()) {
979 string f
= action
.get_string_arg();
980 if (f
.empty()) f
= field
;
981 // replace newlines with spaces
983 string::size_type j
= 0;
984 while ((j
= s
.find('\n', j
)) != string::npos
)
986 fields
[f
].push_back(s
);
990 indexer
.index_text(value
,
991 action
.get_num_arg(),
992 action
.get_string_arg());
994 case Action::INDEXNOPOS
:
995 // No positional information so phrase searching won't work.
996 // However, the database will use much less diskspace.
997 indexer
.index_text_without_positions(value
,
998 action
.get_num_arg(),
999 action
.get_string_arg());
1001 case Action::BOOLEAN
: {
1002 // Do nothing if there's no text.
1003 if (value
.empty()) break;
1005 string term
= action
.get_string_arg();
1006 if (prefix_needs_colon(term
, value
[0])) term
+= ':';
1009 doc
.add_boolean_term(term
);
1013 indexer
.increase_termpos(action
.get_num_arg());
1015 case Action::HASH
: {
1016 unsigned int max_length
= action
.get_num_arg();
1017 if (value
.length() > max_length
)
1018 value
= hash_long_term(value
, max_length
);
1021 case Action::HEXTOBIN
: {
1022 size_t len
= value
.length();
1024 report_location(DIAG_ERROR
, fname
, line_no
);
1025 cerr
<< "hextobin: input must have even length\n";
1030 output
.reserve(len
/ 2);
1031 for (size_t j
= 0; j
< len
; j
+= 2) {
1033 char b
= value
[j
+ 1];
1034 if (!C_isxdigit(a
) || !C_isxdigit(b
)) {
1035 report_location(DIAG_ERROR
, fname
, line_no
);
1036 cerr
<< "hextobin: input must be all hex digits\n";
1039 char r
= (hex_digit(a
) << 4) | hex_digit(b
);
1040 output
.push_back(r
);
1042 value
= std::move(output
);
1046 value
= Xapian::Unicode::tolower(value
);
1049 ltrim(value
, action
.get_string_arg());
1052 rtrim(value
, action
.get_string_arg());
1055 rtrim(value
, action
.get_string_arg());
1056 ltrim(value
, action
.get_string_arg());
1058 case Action::SQUASH
:
1059 squash(value
, action
.get_string_arg());
1061 case Action::LOAD
: {
1062 // If there's no input, just issue a warning.
1063 if (value
.empty()) {
1064 report_location(DIAG_WARN
, fname
, line_no
);
1065 cerr
<< "Empty filename in LOAD action\n";
1068 bool truncated
= false;
1069 string filename
= std::move(value
);
1070 // FIXME: Use NOATIME if we own the file or are root.
1071 if (!load_file(filename
, action
.get_num_arg(), NOCACHE
,
1072 value
, truncated
)) {
1073 report_location(DIAG_ERROR
, fname
, line_no
);
1074 cerr
<< "Couldn't load file '" << filename
<< "': "
1075 << strerror(errno
) << '\n';
1078 if (!truncated
) break;
1081 case Action::TRUNCATE
:
1082 utf8_truncate(value
, action
.get_num_arg());
1085 indexer
.set_flags(indexer
.FLAG_SPELLING
);
1087 case Action::SPLIT
: {
1088 // Find the end of the actions which split should execute.
1089 auto split_end
= find(action_it
, action_end
, Action::NEW
);
1091 int split_type
= action
.get_num_arg();
1092 if (value
.empty()) {
1094 } else if (split_type
!= Action::SPLIT_SORT
) {
1095 // Generate split as we consume it.
1096 const string
& delimiter
= action
.get_string_arg();
1098 unique_ptr
<unordered_set
<string
>> seen
;
1099 if (split_type
== Action::SPLIT_DEDUP
) {
1100 seen
.reset(new unordered_set
<string
>);
1103 if (delimiter
.size() == 1) {
1104 // Special case for common single character delimiter.
1105 char ch
= delimiter
[0];
1106 string::size_type i
= 0;
1108 string::size_type j
= value
.find(ch
, i
);
1109 if (split_type
== Action::SPLIT_PREFIXES
) {
1111 string
val(value
, 0, j
);
1112 run_actions(action_it
, split_end
,
1115 this_field_is_content
, doc
,
1117 field
, fname
, line_no
,
1120 } else if (i
!= j
) {
1121 string
val(value
, i
, j
- i
);
1122 if (!seen
.get() || seen
->insert(val
).second
) {
1123 run_actions(action_it
, split_end
,
1126 this_field_is_content
, doc
,
1128 field
, fname
, line_no
,
1132 if (j
== string::npos
) break;
1136 string::size_type i
= 0;
1138 string::size_type j
= value
.find(delimiter
, i
);
1139 if (split_type
== Action::SPLIT_PREFIXES
) {
1141 string
val(value
, 0, j
);
1142 run_actions(action_it
, split_end
,
1145 this_field_is_content
, doc
,
1147 field
, fname
, line_no
,
1150 } else if (i
!= j
) {
1151 string
val(value
, i
, j
- i
);
1152 if (!seen
.get() || seen
->insert(val
).second
) {
1153 run_actions(action_it
, split_end
,
1156 this_field_is_content
, doc
,
1158 field
, fname
, line_no
,
1162 if (j
== string::npos
) break;
1163 i
= j
+ delimiter
.size();
1167 vector
<string
> split_values
;
1168 const string
& delimiter
= action
.get_string_arg();
1169 if (delimiter
.size() == 1) {
1170 // Special case for common single character delimiter.
1171 char ch
= delimiter
[0];
1172 string::size_type i
= 0;
1174 string::size_type j
= value
.find(ch
, i
);
1176 split_values
.emplace_back(value
, i
, j
- i
);
1178 if (j
== string::npos
) break;
1182 string::size_type i
= 0;
1184 string::size_type j
= value
.find(delimiter
, i
);
1186 split_values
.emplace_back(value
, i
, j
- i
);
1188 if (j
== string::npos
) break;
1189 i
= j
+ delimiter
.size();
1193 sort(split_values
.begin(), split_values
.end());
1195 for (auto&& val
: split_values
) {
1196 run_actions(action_it
, split_end
,
1197 database
, indexer
, val
,
1198 this_field_is_content
, doc
, fields
,
1199 field
, fname
, line_no
,
1204 action_it
= split_end
;
1207 case Action::UNHTML
: {
1210 // Default HTML character set is latin 1, though
1211 // not specifying one is deprecated these days.
1212 p
.parse_html(value
, "iso-8859-1", false);
1213 } catch (const string
& newcharset
) {
1215 p
.parse_html(value
, newcharset
, true);
1217 if (p
.indexing_allowed
)
1223 case Action::UNIQUE
: {
1224 unique_unused
= false;
1226 if (value
.empty()) {
1227 enum diag_type diag
= DIAG_WARN
;
1228 switch (unique_missing
) {
1232 case UNIQUE_WARN_NEW
:
1233 case UNIQUE_WARN_SKIP
:
1234 report_location(diag
, fname
, line_no
);
1235 cerr
<< "UNIQUE action on empty text\n";
1239 switch (unique_missing
) {
1243 case UNIQUE_WARN_SKIP
:
1244 skipping_record
= true;
1247 case UNIQUE_WARN_NEW
:
1253 // Ensure that the value of this field is unique.
1254 // If a record already exists with the same value,
1255 // it will be replaced with the new record.
1257 // Unique fields aren't considered content - if
1258 // there are no other fields in the document, the
1259 // document is to be deleted.
1260 this_field_is_content
= false;
1262 // Argument is the prefix to add to the field value
1263 // to get the unique term.
1264 string t
= action
.get_string_arg();
1265 if (prefix_needs_colon(t
, value
[0])) t
+= ':';
1267 Xapian::PostingIterator p
= database
.postlist_begin(t
);
1268 if (p
!= database
.postlist_end(t
)) {
1275 doc
.add_value(action
.get_num_arg(), value
);
1277 case Action::VALUENUMERIC
: {
1278 if (value
.empty()) break;
1280 double dbl
= strtod(value
.c_str(), &end
);
1282 report_location(DIAG_WARN
, fname
, line_no
);
1283 cerr
<< "Trailing characters in VALUENUMERIC: '"
1286 doc
.add_value(action
.get_num_arg(),
1287 Xapian::sortable_serialise(dbl
));
1290 case Action::VALUEPACKED
: {
1292 if (value
.empty() || !C_isdigit(value
[0])) {
1293 // strtoul() accepts leading whitespace and negated
1294 // values, neither of which we want to allow.
1299 word
= strtoul(value
.c_str(), &q
, 10);
1300 if (!errno
&& *q
!= '\0') {
1301 // Trailing characters after converted value.
1306 report_location(DIAG_WARN
, fname
, line_no
);
1307 cerr
<< "valuepacked \"" << value
<< "\" ";
1308 if (errno
== ERANGE
) {
1309 cerr
<< "out of range\n";
1311 cerr
<< "not an unsigned integer\n";
1314 int valueslot
= action
.get_num_arg();
1315 doc
.add_value(valueslot
, int_to_binary_string(word
));
1318 case Action::DATE
: {
1319 // Do nothing for empty input.
1320 if (value
.empty()) break;
1322 const string
& type
= action
.get_string_arg();
1324 if (type
== "unix") {
1326 if (!parse_signed(value
.c_str(), t
)) {
1327 report_location(DIAG_WARN
, fname
, line_no
);
1328 cerr
<< "Date value (in secs) for action DATE "
1329 "must be an integer - ignoring\n";
1332 struct tm
*tm
= localtime(&t
);
1333 int y
= tm
->tm_year
+ 1900;
1334 int m
= tm
->tm_mon
+ 1;
1335 yyyymmdd
= date_to_string(y
, m
, tm
->tm_mday
);
1336 } else if (type
== "unixutc") {
1338 if (!parse_signed(value
.c_str(), t
)) {
1339 report_location(DIAG_WARN
, fname
, line_no
);
1340 cerr
<< "Date value (in secs) for action DATE "
1341 "must be an integer - ignoring\n";
1344 struct tm
*tm
= gmtime(&t
);
1345 int y
= tm
->tm_year
+ 1900;
1346 int m
= tm
->tm_mon
+ 1;
1347 yyyymmdd
= date_to_string(y
, m
, tm
->tm_mday
);
1348 } else if (type
== "yyyymmdd") {
1349 if (value
.length() != 8) {
1350 report_location(DIAG_WARN
, fname
, line_no
);
1351 cerr
<< "date=yyyymmdd expects an 8 character value "
1359 doc
.add_boolean_term("D" + yyyymmdd
);
1362 doc
.add_boolean_term("M" + yyyymmdd
);
1365 doc
.add_boolean_term("Y" + yyyymmdd
);
1368 case Action::PARSEDATE
: {
1369 string dateformat
= action
.get_string_arg();
1371 memset(&tm
, 0, sizeof(tm
));
1372 auto ret
= strptime(value
.c_str(), dateformat
.c_str(), &tm
);
1374 report_location(DIAG_WARN
, fname
, line_no
);
1375 cerr
<< "\"" << value
<< "\" doesn't match format "
1376 "\"" << dateformat
<< '\"' << '\n';
1381 report_location(DIAG_WARN
, fname
, line_no
);
1382 cerr
<< "\"" << value
<< "\" not fully matched by "
1383 "format \"" << dateformat
<< "\" "
1384 "(\"" << ret
<< "\" left over) but "
1385 "indexing anyway\n";
1387 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1388 auto gmtoff
= tm
.tm_gmtoff
;
1390 auto secs_since_epoch
= timegm(&tm
);
1391 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1392 secs_since_epoch
-= gmtoff
;
1394 value
= str(secs_since_epoch
);
1398 /* Empty default case to avoid "unhandled enum value"
1407 index_file(const char *fname
, istream
&stream
,
1408 Xapian::WritableDatabase
&database
, Xapian::TermGenerator
&indexer
)
1412 while (!stream
.eof() && getline_portable(stream
, line
)) {
1414 // Allow blank lines before the first record and multiple blank lines
1416 if (line
.empty()) continue;
1418 Xapian::Document doc
;
1419 indexer
.set_document(doc
);
1420 Xapian::docid docid
= 0;
1421 map
<string
, list
<string
>> fields
;
1422 bool seen_content
= false;
1423 skipping_record
= false;
1424 unique_unused
= index_spec_uses_unique
;
1425 while (!line
.empty()) {
1426 string::size_type eq
= line
.find('=');
1427 if (eq
== string::npos
&& !line
.empty()) {
1428 report_location(DIAG_ERROR
, fname
, line_no
);
1429 cerr
<< "Expected = somewhere in this line\n";
1432 string
field(line
, 0, eq
);
1433 string
value(line
, eq
+ 1, string::npos
);
1435 while (getline_portable(stream
, line
)) {
1437 if (line
.empty() || line
[0] != '=') break;
1438 // Replace the '=' with a '\n'.
1443 if (skipping_record
) continue;
1445 // Default to not indexing spellings.
1446 indexer
.set_flags(Xapian::TermGenerator::flags(0));
1448 bool this_field_is_content
= true;
1449 const vector
<Action
>& v
= index_spec
[field
];
1450 run_actions(v
.begin(), v
.end(),
1451 database
, indexer
, value
,
1452 this_field_is_content
, doc
, fields
,
1453 field
, fname
, line_no
,
1455 if (this_field_is_content
) seen_content
= true;
1458 if (unique_unused
) {
1459 enum diag_type diag
= DIAG_WARN
;
1460 switch (unique_missing
) {
1464 case UNIQUE_WARN_NEW
:
1465 case UNIQUE_WARN_SKIP
:
1466 report_location(diag
, fname
, line_no
);
1467 cerr
<< "UNIQUE action unused in this record\n";
1471 switch (unique_missing
) {
1475 case UNIQUE_WARN_SKIP
:
1476 skipping_record
= true;
1479 case UNIQUE_WARN_NEW
:
1484 if (skipping_record
) {
1486 } else if (!seen_content
) {
1487 // We haven't seen any fields (other than unique identifiers)
1488 // so the document is to be deleted.
1490 database
.delete_document(docid
);
1491 if (verbose
) cout
<< "Del: " << docid
<< '\n';
1496 for (auto&& i
: fields
) {
1497 for (auto&& field_val
: i
.second
) {
1505 // Put the data in the document
1508 // Add the document to the database
1510 database
.replace_document(docid
, doc
);
1511 if (verbose
) cout
<< "Replace: " << docid
<< '\n';
1514 docid
= database
.add_document(doc
);
1515 if (verbose
) cout
<< "Add: " << docid
<< '\n';
1521 // Commit after each file to make sure all changes from that file make it
1523 if (verbose
) cout
<< "Committing\n";
1528 show_help(int exit_code
)
1530 cout
<< PROG_NAME
" - " PROG_DESC
"\n"
1531 "Usage: " PROG_NAME
" [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1533 "Creates or updates a Xapian database with the data from the input files listed\n"
1534 "on the command line. If no files are specified, data is read from stdin.\n"
1536 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1537 "format for INDEXER_SCRIPT.\n"
1540 " -v, --verbose display additional messages to aid debugging\n"
1541 " --overwrite create the database anew (the default is to update if\n"
1542 " the database already exists)\n";
1543 print_stemmer_help("");
1544 print_help_and_version_help("");
1549 main(int argc
, char **argv
)
1551 // If the database already exists, default to updating not overwriting.
1552 int database_mode
= Xapian::DB_CREATE_OR_OPEN
;
1554 Xapian::Stem
stemmer("english");
1556 // Without this, strptime() seems to treat formats without a timezone as
1557 // being local time, including %s.
1558 setenv("TZ", "UTC", 1);
1560 constexpr auto NO_ARG
= no_argument
;
1561 constexpr auto REQ_ARG
= required_argument
;
1562 static const struct option longopts
[] = {
1563 { "help", NO_ARG
, NULL
, 'h' },
1564 { "version", NO_ARG
, NULL
, 'V' },
1565 { "stemmer", REQ_ARG
, NULL
, 's' },
1566 { "overwrite", NO_ARG
, NULL
, 'o' },
1567 { "verbose", NO_ARG
, NULL
, 'v' },
1572 while ((getopt_ret
= gnu_getopt_long(argc
, argv
, "vs:hV",
1573 longopts
, NULL
)) != -1) {
1574 switch (getopt_ret
) {
1581 case 'V': // --version
1582 print_package_info(PROG_NAME
);
1584 case 'o': // --overwrite
1585 database_mode
= Xapian::DB_CREATE_OR_OVERWRITE
;
1592 stemmer
= Xapian::Stem(optarg
);
1593 } catch (const Xapian::InvalidArgumentError
&) {
1594 cerr
<< "Unknown stemming language '" << optarg
<< "'.\n";
1595 cerr
<< "Available language names are: "
1596 << Xapian::Stem::get_available_languages() << '\n';
1609 parse_index_script(argv
[1]);
1611 // Open the database. If another process is currently updating the
1612 // database, wait for the lock to become available.
1613 auto flags
= database_mode
| Xapian::DB_RETRY_LOCK
;
1614 Xapian::WritableDatabase
database(argv
[0], flags
);
1616 Xapian::TermGenerator indexer
;
1617 indexer
.set_stemmer(stemmer
);
1618 // Set the database for spellings to be added to by the "spell" action.
1619 indexer
.set_database(database
);
1628 index_file("<stdin>", cin
, database
, indexer
);
1630 // Read file(s) listed on the command line.
1631 for (int i
= 2; i
< argc
; ++i
) {
1632 ifstream
stream(argv
[i
]);
1634 index_file(argv
[i
], stream
, database
, indexer
);
1636 cerr
<< "Can't open file " << argv
[i
] << '\n';
1641 cout
<< "records (added, replaced, deleted, skipped) = ("
1645 << skipcount
<< ")\n";
1646 } catch (const Xapian::Error
&error
) {
1647 cerr
<< "Exception: " << error
.get_description() << '\n';
1649 } catch (const std::bad_alloc
&) {
1650 cerr
<< "Exception: std::bad_alloc\n";
1653 cerr
<< "Unknown Exception\n";