xapian-applications/omega/scriptindex.cc

   1 /** @file
   2  * @brief index arbitrary data as described by an index script
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001 Sam Liddicott
   6  * Copyright 2001,2002 Ananova Ltd
   7  * Copyright 2002-2023 Olly Betts
   8  *
   9  * This program is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU General Public License as
  11  * published by the Free Software Foundation; either version 2 of the
  12  * License, or (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  22  * USA
  23  */
  24
  25 #include <config.h>
  26
  27 #ifdef __CYGWIN__
  28 // Needed to get setenv() and strptime() declared.
  29 # define _GNU_SOURCE
  30 #endif
  31
  32 #include <xapian.h>
  33
#include <algorithm>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include <cerrno>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
  49
  50 #include "commonhelp.h"
  51 #include "datetime.h"
  52 #include "genericxmlparser.h"
  53 #include "hashterm.h"
  54 #include "htmlparser.h"
  55 #include "loadfile.h"
  56 #include "parseint.h"
  57 #include "setenv.h"
  58 #include "str.h"
  59 #include "stringutils.h"
  60 #include "timegm.h"
  61 #include "utf8truncate.h"
  62 #include "values.h"
  63
  64 #ifndef HAVE_STRPTIME
  65 #include "portability/strptime.h"
  66 #endif
  67
  68 #include "gnu_getopt.h"
  69
  70 using namespace std;
  71
  72 #define PROG_NAME "scriptindex"
  73 #define PROG_DESC "index arbitrary data as described by an index script"
  74
// File-level state.
//
// NOTE(review): none of these variables is read or written in the part of the
// file reviewed here, so the descriptions marked "presumably" are inferred
// from the names only and should be confirmed against the code using them.

/// Presumably enables more detailed output - confirm.
static bool verbose;
/// Presumably the number of documents added - confirm.
static int addcount;
/// Presumably the number of documents replaced - confirm.
static int repcount;
/// Presumably the number of documents deleted - confirm.
static int delcount;
/// Presumably the number of records skipped - confirm.
static int skipcount;

/** What to do if there's a UNIQUE action but a record doesn't use it.
 *
 *  The default is UNIQUE_ERROR.
 */
static enum {
    UNIQUE_ERROR,
    UNIQUE_WARN_NEW,
    UNIQUE_NEW,
    UNIQUE_WARN_SKIP,
    UNIQUE_SKIP
} unique_missing = UNIQUE_ERROR;

/// Track if UNIQUE action is unused in the current record.
static bool unique_unused;

/// Track if the current record is being skipped.
static bool skipping_record = false;
  96
  97 static inline bool
  98 prefix_needs_colon(const string & prefix, unsigned ch)
  99 {
 100     if (!C_isupper(ch) && ch != ':') return false;
 101     string::size_type len = prefix.length();
 102     return (len > 1 && prefix[len - 1] != ':');
 103 }
 104
/** Printable names for the values of Action::type.
 *
 *  DUMP_ACTION indexes this array with the value returned by
 *  Action::get_action(), so the entries must be kept in exactly the same
 *  order as the enumerators of Action::type.
 */
const char * action_names[] = {
    // Actions used internally:
    "bad",
    "new",
    // Actual actions:
    "boolean",
    "date",
    "field",
    "gap",
    "hash",
    "hextobin",
    "index",
    "indexnopos",
    "load",
    "lower",
    "ltrim",
    "parsedate",
    "rtrim",
    "spell",
    "split",
    "squash",
    "trim",
    "truncate",
    "unhtml",
    "unique",
    "unxml",
    "value",
    "valuenumeric",
    "valuepacked",
    "weight"
};

// For debugging:
//
// Writes Action A to stdout as "name(string_arg,num_arg)" followed by a
// newline, looking the name up in action_names[].
#define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")\n"
 139
 140 class Action {
 141   public:
 142     typedef enum {
 143         // Actions used internally:
 144         BAD,
 145         NEW,
 146         // Actual actions:
 147         BOOLEAN,
 148         DATE,
 149         FIELD,
 150         GAP,
 151         HASH,
 152         HEXTOBIN,
 153         INDEX,
 154         INDEXNOPOS,
 155         LOAD,
 156         LOWER,
 157         LTRIM,
 158         PARSEDATE,
 159         RTRIM,
 160         SPELL,
 161         SPLIT,
 162         SQUASH,
 163         TRIM,
 164         TRUNCATE,
 165         UNHTML,
 166         UNIQUE,
 167         UNXML,
 168         VALUE,
 169         VALUENUMERIC,
 170         VALUEPACKED,
 171         WEIGHT
 172     } type;
 173     enum { SPLIT_NONE, SPLIT_DEDUP, SPLIT_SORT, SPLIT_PREFIXES };
 174   private:
 175     type action;
 176     int num_arg = 0;
 177     string string_arg;
 178     // Offset into indexscript line.
 179     size_t pos;
 180   public:
 181     Action(type action_, size_t pos_)
 182         : action(action_), pos(pos_) { }
 183     Action(type action_, size_t pos_, const string & arg)
 184         : action(action_), string_arg(arg), pos(pos_) {
 185         num_arg = atoi(string_arg.c_str());
 186     }
 187     Action(type action_, size_t pos_, const string & arg, int num)
 188         : action(action_), num_arg(num), string_arg(arg), pos(pos_) { }
 189     type get_action() const { return action; }
 190     int get_num_arg() const { return num_arg; }
 191     void set_num_arg(int num) { num_arg = num; }
 192     const string & get_string_arg() const { return string_arg; }
 193     size_t get_pos() const { return pos; }
 194 };
 195
 196 // These allow searching for an Action with a particular Action::type using
 197 // std::find().
 198
 199 inline bool
 200 operator==(const Action& a, Action::type t) { return a.get_action() == t; }
 201
 202 inline bool
 203 operator==(Action::type t, const Action& a) { return a.get_action() == t; }
 204
 205 inline bool
 206 operator!=(const Action& a, Action::type t) { return !(a == t); }
 207
 208 inline bool
 209 operator!=(Action::type t, const Action& a) { return !(t == a); }
 210
 211 static void
 212 ltrim(string& s, const string& chars)
 213 {
 214     auto i = s.find_first_not_of(chars);
 215     if (i) s.erase(0, i);
 216 }
 217
 218 static void
 219 rtrim(string& s, const string& chars)
 220 {
 221     s.resize(s.find_last_not_of(chars) + 1);
 222 }
 223
 224 static void
 225 squash(string& s, const string& chars)
 226 {
 227     string output;
 228     output.reserve(s.size());
 229     string::size_type i = 0;
 230     while ((i = s.find_first_not_of(chars, i)) != string::npos) {
 231         auto j = s.find_first_of(chars, i);
 232         if (!output.empty()) output += ' ';
 233         output.append(s, i, j - i);
 234         i = j;
 235     }
 236     s = std::move(output);
 237 }
 238
 239 enum diag_type { DIAG_ERROR, DIAG_WARN, DIAG_NOTE };
 240
 241 static unsigned error_count = 0;
 242
 243 static void
 244 report_location(enum diag_type type,
 245                 const string& filename,
 246                 size_t line = 0,
 247                 size_t pos = string::npos)
 248 {
 249     cerr << filename;
 250     if (line != 0) {
 251         cerr << ':' << line;
 252         if (pos != string::npos) {
 253             // The first column is numbered 1.
 254             cerr << ':' << pos + 1;
 255         }
 256     }
 257     switch (type) {
 258         case DIAG_ERROR:
 259             cerr << ": error: ";
 260             ++error_count;
 261             break;
 262         case DIAG_WARN:
 263             cerr << ": warning: ";
 264             break;
 265         case DIAG_NOTE:
 266             cerr << ": note: ";
 267             break;
 268     }
 269 }
 270
 271 static void
 272 report_useless_action(const string &file, size_t line, size_t pos,
 273                       const string &action)
 274 {
 275     report_location(DIAG_WARN, file, line, pos);
 276     cerr << "Index action '" << action << "' has no effect\n";
 277
 278     static bool given_left_to_right_warning = false;
 279     if (!given_left_to_right_warning) {
 280         given_left_to_right_warning = true;
 281         report_location(DIAG_NOTE, file, line, pos);
 282         cerr << "Actions are executed from left to right\n";
 283     }
 284 }
 285
 286 // Return true if we can support %z on the current platform.
 287 static inline bool
 288 parsedate_supports_z()
 289 {
 290 #ifndef HAVE_STRUCT_TM_TM_GMTOFF
 291     // Without tm_gmtoff we aren't going to get the timezone information from
 292     // strptime().
 293     return false;
 294 #else
 295     // Perform a simple run-time test to check if %z is suitably supported.
 296     static bool cached_result = ([]() {
 297         struct tm tm;
 298         memset(&tm, 0, sizeof(tm));
 299         auto ret = strptime("+1245", "%z", &tm);
 300         return ret && *ret == '\0' && tm.tm_gmtoff == (12 * 60 + 45) * 60;
 301     })();
 302     return cached_result;
 303 #endif
 304 }
 305
// NOTE(review): neither of these is assigned in the part of the file reviewed
// here.  Presumably this flag records whether the index script contains a
// `unique` action - confirm against the code which sets it.
static bool index_spec_uses_unique = false;

// Lists of Actions, looked up by a string key.  Presumably the key is a field
// name from the index script and the value is the actions to apply to that
// field - confirm against the code which fills this in.
static map<string, vector<Action>> index_spec;
 309
 310 // Like std::getline() but handle \r\n line endings too.
 311 static istream&
 312 getline_portable(istream& stream, string& line)
 313 {
 314     istream& result = getline(stream, line);
 315     // Trim multiple \r characters, since that seems the best way to handle
 316     // that case.
 317     line.resize(UNSIGNED_OVERFLOW_OK(line.find_last_not_of('\r') + 1));
 318     return result;
 319 }
 320
 321 static void
 322 parse_index_script(const string &filename)
 323 {
 324     ifstream script(filename.c_str());
 325     if (!script.is_open()) {
 326         report_location(DIAG_ERROR, filename);
 327         cerr << strerror(errno) << '\n';
 328         exit(1);
 329     }
 330     string line;
 331     size_t line_no = 0;
 332     // Line number where we saw a `unique` action, or 0 if we haven't.
 333     int unique_line_no = 0;
 334     // Offset into line unique_line_no where the `unique` action was.
 335     size_t unique_pos = 0;
 336     while (getline(script, line)) {
 337         ++line_no;
 338         vector<string> fields;
 339         vector<Action> actions;
 340         string::const_iterator i, j;
 341         const string &s = line;
 342         i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
 343         if (i == s.end() || *i == '#') {
 344             // Blank line or comment.
 345             continue;
 346         }
 347         while (true) {
 348             if (!C_isalnum(*i)) {
 349                 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
 350                 cerr << "field name must start with alphanumeric\n";
 351             }
 352             j = find_if(i + 1, s.end(),
 353                         [](char ch) { return !C_isalnum(ch) && ch != '_'; });
 354             fields.push_back(string(i, j));
 355             i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
 356             if (i == s.end()) break;
 357             if (*i == ':') {
 358                 ++i;
 359                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 360                 break;
 361             }
 362             if (i == j) {
 363                 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
 364                 cerr << "bad character '" << *i << "' in field name\n";
 365                 ++i;
 366                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 367                 if (i == s.end()) break;
 368             }
 369         }
 370         Xapian::termcount weight = 1;
 371         size_t useless_weight_pos = string::npos;
 372         map<string, Action::type> boolmap;
 373         j = i;
 374         while (j != s.end()) {
 375             size_t action_pos = j - s.begin();
 376             i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
 377             string action(s, j - s.begin(), i - j);
 378             Action::type code = Action::BAD;
 379             unsigned min_args = 0, max_args = 0;
 380             bool takes_integer_argument = false;
 381             if (!action.empty()) {
 382                 switch (action[0]) {
 383                     case 'b':
 384                         if (action == "boolean") {
 385                             code = Action::BOOLEAN;
 386                             max_args = 1;
 387                         }
 388                         break;
 389                     case 'd':
 390                         if (action == "date") {
 391                             code = Action::DATE;
 392                             min_args = max_args = 1;
 393                         }
 394                         break;
 395                     case 'f':
 396                         if (action == "field") {
 397                             code = Action::FIELD;
 398                             max_args = 1;
 399                         }
 400                         break;
 401                     case 'g':
 402                         if (action == "gap") {
 403                             code = Action::GAP;
 404                             max_args = 1;
 405                             takes_integer_argument = true;
 406                         }
 407                         break;
 408                     case 'h':
 409                         if (action == "hash") {
 410                             code = Action::HASH;
 411                             max_args = 1;
 412                             takes_integer_argument = true;
 413                         } else if (action == "hextobin") {
 414                             code = Action::HEXTOBIN;
 415                         }
 416                         break;
 417                     case 'i':
 418                         if (action == "index") {
 419                             code = Action::INDEX;
 420                             max_args = 1;
 421                         } else if (action == "indexnopos") {
 422                             code = Action::INDEXNOPOS;
 423                             max_args = 1;
 424                         }
 425                         break;
 426                     case 'l':
 427                         if (action == "lower") {
 428                             code = Action::LOWER;
 429                         } else if (action == "load") {
 430                             code = Action::LOAD;
 431                         } else if (action == "ltrim") {
 432                             code = Action::LTRIM;
 433                             max_args = 1;
 434                         }
 435                         break;
 436                     case 'p':
 437                         if (action == "parsedate") {
 438                             code = Action::PARSEDATE;
 439                             min_args = max_args = 1;
 440                         }
 441                         break;
 442                     case 'r':
 443                         if (action == "rtrim") {
 444                             code = Action::RTRIM;
 445                             max_args = 1;
 446                         }
 447                         break;
 448                     case 's':
 449                         if (action == "spell") {
 450                             code = Action::SPELL;
 451                         } else if (action == "split") {
 452                             code = Action::SPLIT;
 453                             min_args = 1;
 454                             max_args = 2;
 455                         } else if (action == "squash") {
 456                             code = Action::SQUASH;
 457                             max_args = 1;
 458                         }
 459                         break;
 460                     case 't':
 461                         if (action == "truncate") {
 462                             code = Action::TRUNCATE;
 463                             min_args = max_args = 1;
 464                             takes_integer_argument = true;
 465                         } else if (action == "trim") {
 466                             code = Action::TRIM;
 467                             max_args = 1;
 468                         }
 469                         break;
 470                     case 'u':
 471                         if (action == "unhtml") {
 472                             code = Action::UNHTML;
 473                         } else if (action == "unique") {
 474                             code = Action::UNIQUE;
 475                             min_args = 1;
 476                             max_args = 2;
 477                         } else if (action == "unxml") {
 478                             code = Action::UNXML;
 479                         }
 480                         break;
 481                     case 'v':
 482                         if (action == "value") {
 483                             code = Action::VALUE;
 484                             min_args = max_args = 1;
 485                             takes_integer_argument = true;
 486                         } else if (action == "valuenumeric") {
 487                             code = Action::VALUENUMERIC;
 488                             min_args = max_args = 1;
 489                             takes_integer_argument = true;
 490                         } else if (action == "valuepacked") {
 491                             code = Action::VALUEPACKED;
 492                             min_args = max_args = 1;
 493                             takes_integer_argument = true;
 494                         }
 495                         break;
 496                     case 'w':
 497                         if (action == "weight") {
 498                             code = Action::WEIGHT;
 499                             min_args = max_args = 1;
 500                             // Don't set takes_integer_argument since we parse
 501                             // it with parse_unsigned() and issue an error there
 502                             // - setting takes_integer_argument would give a
 503                             // double error for arguments with a decimal point.
 504                         }
 505                         break;
 506                 }
 507             }
 508             if (code == Action::BAD) {
 509                 report_location(DIAG_ERROR, filename, line_no, action_pos);
 510                 if (action.empty()) {
 511                     i = find_if(i, s.end(), C_isspace);
 512                     cerr << "Expected index action, found '"
 513                          << string(s, j - s.begin(), i - j) << "'\n";
 514                 } else {
 515                     cerr << "Unknown index action '" << action << "'\n";
 516                 }
 517             }
 518             auto i_after_action = i;
 519             i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 520
 521             if (i != s.end() && *i == '=') {
 522                 if (i != i_after_action) {
 523                     report_location(DIAG_WARN, filename, line_no,
 524                                     i_after_action - s.begin());
 525                     cerr << "putting spaces between the action and '=' is "
 526                             "deprecated\n";
 527                 }
 528
 529                 if (max_args == 0) {
 530                     report_location(DIAG_ERROR, filename, line_no,
 531                                     i - s.begin());
 532                     cerr << "Index action '" << action
 533                          << "' doesn't take an argument\n";
 534                 }
 535
 536                 ++i;
 537                 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 538                 if (i != j) {
 539                     report_location(DIAG_WARN, filename, line_no,
 540                                     i - s.begin());
 541                     cerr << "putting spaces between '=' and the argument is "
 542                             "deprecated\n";
 543                 }
 544
 545                 vector<string> vals;
 546                 while (true) {
 547                     if (j != s.end() && *j == '"') {
 548                         // Quoted argument.
 549                         ++j;
 550                         string arg;
 551                         while (true) {
 552                             i = find_if(j, s.end(),
 553                                         [](char ch) {
 554                                             return ch == '"' || ch == '\\';
 555                                         });
 556                             if (i == s.end()) {
 557                                 report_location(DIAG_ERROR, filename, line_no,
 558                                                 s.size());
 559                                 cerr << "No closing quote\n";
 560                                 break;
 561                             }
 562                             arg.append(j, i);
 563                             if (*i++ == '"')
 564                                 break;
 565
 566                             // Escape sequence.
 567                             if (i == s.end()) {
 568 bad_escaping:
 569                                 report_location(DIAG_ERROR, filename, line_no,
 570                                                 i - s.begin());
 571                                 cerr << "Bad escaping in quoted action "
 572                                         "argument\n";
 573                                 break;
 574                             }
 575
 576                             char ch = *i;
 577                             switch (ch) {
 578                                 case '\\':
 579                                 case '"':
 580                                     break;
 581                                 case '0':
 582                                     ch = '\0';
 583                                     break;
 584                                 case 'n':
 585                                     ch = '\n';
 586                                     break;
 587                                 case 'r':
 588                                     ch = '\r';
 589                                     break;
 590                                 case 't':
 591                                     ch = '\t';
 592                                     break;
 593                                 case 'x': {
 594                                     if (++i == s.end())
 595                                         goto bad_escaping;
 596                                     char ch1 = *i;
 597                                     if (!C_isxdigit(ch1)) {
 598 bad_hex_digit:
 599                                         report_location(DIAG_ERROR, filename,
 600                                                         line_no, i - s.begin());
 601                                         cerr << "Bad hex digit in escaping\n";
 602                                         --i;
 603                                         break;
 604                                     }
 605                                     if (++i == s.end())
 606                                         goto bad_escaping;
 607                                     char ch2 = *i;
 608                                     if (!C_isxdigit(ch2)) {
 609                                         goto bad_hex_digit;
 610                                     }
 611                                     ch = hex_decode(ch1, ch2);
 612                                     break;
 613                                 }
 614                                 default:
 615                                     report_location(DIAG_ERROR, filename,
 616                                                     line_no, i - s.begin());
 617                                     cerr << "Bad escape sequence '\\" << ch
 618                                          << "'\n";
 619                                     break;
 620                             }
 621                             arg += ch;
 622                             j = i + 1;
 623                         }
 624                         vals.emplace_back(std::move(arg));
 625                         if (i == s.end() || C_isspace(*i)) break;
 626                         if (*i == ',') {
 627                             ++i;
 628                         } else {
 629                             report_location(DIAG_ERROR, filename, line_no,
 630                                             i - s.begin());
 631                             cerr << "Unexpected character '" << *i
 632                                  << "' after closing quote\n";
 633                             do {
 634                                 ++i;
 635                             } while (i != s.end() && *i != ',' && !C_isspace(*i));
 636                             if (*i != ',') break;
 637                             ++i;
 638                         }
 639                     } else if (max_args > 1) {
 640                         // Unquoted argument, split on comma.
 641                         i = find_if(j, s.end(),
 642                                     [](char ch) {
 643                                         return C_isspace(ch) || ch == ',';
 644                                     });
 645                         vals.emplace_back(j, i);
 646                         if (*i != ',') break;
 647                         ++i;
 648                     } else {
 649                         // Unquoted argument, including any commas.
 650                         i = find_if(j, s.end(),
 651                                     [](char ch) { return C_isspace(ch); });
 652                         vals.emplace_back(j, i);
 653                         break;
 654                     }
 655                     j = i;
 656
 657                     if (vals.size() == max_args) {
 658                         report_location(DIAG_ERROR, filename, line_no,
 659                                         i - s.begin());
 660                         cerr << "Index action '" << action << "' takes at most "
 661                              << max_args << " arguments\n";
 662                     }
 663                 }
 664
 665                 if (vals.size() < min_args) {
 666                     report_location(DIAG_ERROR, filename, line_no,
 667                                     i - s.begin());
 668                     if (min_args == max_args) {
 669                         cerr << "Index action '" << action << "' requires "
 670                              << min_args << " arguments\n";
 671                     } else {
 672                         cerr << "Index action '" << action << "' requires "
 673                                 "at least " << min_args << " arguments\n";
 674                     }
 675                     // Allow action handling code to assume there are min_args
 676                     // arguments.
 677                     vals.resize(min_args);
 678                 }
 679
 680                 string val;
 681                 if (!vals.empty()) {
 682                     val = vals.front();
 683                 }
 684
 685                 if (takes_integer_argument) {
 686                     auto dot = val.find('.');
 687                     if (dot != string::npos) {
 688                         report_location(DIAG_ERROR, filename, line_no,
 689                                         j - s.begin() + dot);
 690                         cerr << "Index action '" << action
 691                              << "' takes an integer argument\n";
 692                     }
 693                 }
 694                 switch (code) {
 695                     case Action::DATE:
 696                         if (val != "unix" &&
 697                             val != "unixutc" &&
 698                             val != "yyyymmdd") {
 699                             report_location(DIAG_ERROR, filename, line_no,
 700                                             j - s.begin());
 701                             cerr << "Invalid parameter '" << val
 702                                  << "' for action 'date'\n";
 703                         }
 704                         actions.emplace_back(code, action_pos, val);
 705                         break;
 706                     case Action::INDEX:
 707                     case Action::INDEXNOPOS:
 708                         actions.emplace_back(code, action_pos, val, weight);
 709                         useless_weight_pos = string::npos;
 710                         break;
 711                     case Action::WEIGHT:
 712                         // We don't push an Action for WEIGHT - instead we
 713                         // store it ready to use in the INDEX and INDEXNOPOS
 714                         // Actions.
 715                         if (!parse_unsigned(val.c_str(), weight)) {
 716                             report_location(DIAG_ERROR, filename, line_no,
 717                                             j - s.begin());
 718                             cerr << "Index action 'weight' takes a "
 719                                     "non-negative integer argument\n";
 720                             weight = 0;
 721                         }
 722                         if (useless_weight_pos != string::npos) {
 723                             report_useless_action(filename, line_no,
 724                                                   useless_weight_pos, action);
 725                         }
 726                         useless_weight_pos = action_pos;
 727                         break;
 728                     case Action::PARSEDATE: {
 729                         auto bad_code = val.find("%Z");
 730                         if (bad_code != val.npos) {
 731                             report_location(DIAG_ERROR, filename, line_no,
 732                                             j - s.begin() + bad_code);
 733                             cerr << "Parsing timezone names with %Z is not "
 734                                     "supported\n";
 735                         }
 736                         bad_code = val.find("%z");
 737                         if (bad_code != val.npos && !parsedate_supports_z()) {
 738                             report_location(DIAG_ERROR, filename, line_no,
 739                                             j - s.begin() + bad_code);
 740                             cerr << "Parsing timezone offsets with %z is not "
 741                                     "supported on this platform\n";
 742                         }
 743                         actions.emplace_back(code, action_pos, val);
 744                         break;
 745                     }
 746                     case Action::SPLIT: {
 747                         if (val.empty()) {
 748                             report_location(DIAG_ERROR, filename, line_no,
 749                                             j - s.begin());
 750                             cerr << "Split delimiter can't be empty\n";
 751                         }
 752                         int operation = Action::SPLIT_NONE;
 753                         if (vals.size() >= 2) {
 754                             if (vals[1] == "dedup") {
 755                                 operation = Action::SPLIT_DEDUP;
 756                             } else if (vals[1] == "sort") {
 757                                 operation = Action::SPLIT_SORT;
 758                             } else if (vals[1] == "none") {
 759                                 operation = Action::SPLIT_NONE;
 760                             } else if (vals[1] == "prefixes") {
 761                                 operation = Action::SPLIT_PREFIXES;
 762                             } else {
 763                                 // FIXME: Column should be for where the `op`
 764                                 // parameter starts, which this isn't if the
 765                                 // value is quoted, contains escape sequences,
 766                                 // etc.
 767                                 report_location(DIAG_ERROR, filename, line_no,
 768                                                 i - s.begin() - vals[1].size());
 769                                 cerr << "Bad split operation '" << vals[1]
 770                                      << "'\n";
 771                             }
 772                         }
 773                         actions.emplace_back(code, action_pos, val, operation);
 774                         break;
 775                     }
 776                     case Action::TRUNCATE:
 777                         if (!actions.empty() &&
 778                             actions.back().get_action() == Action::LOAD) {
 779                             /* Turn "load truncate=n" into "load" with
 780                              * num_arg n, so that we don't needlessly
 781                              * allocate memory and read data we're just
 782                              * going to ignore.
 783                              */
 784                             actions.pop_back();
 785                             code = Action::LOAD;
 786                         }
 787                         actions.emplace_back(code, action_pos, val);
 788                         break;
 789                     case Action::UNIQUE:
 790                         if (unique_line_no) {
 791                             report_location(DIAG_ERROR, filename, line_no,
 792                                             action_pos);
 793                             cerr << "Index action 'unique' used more than "
 794                                     "once\n";
 795                             report_location(DIAG_NOTE, filename,
 796                                             unique_line_no, unique_pos);
 797                             cerr << "Previously used here\n";
 798                         }
 799                         unique_line_no = line_no;
 800                         unique_pos = action_pos;
 801                         if (boolmap.find(val) == boolmap.end())
 802                             boolmap[val] = Action::UNIQUE;
 803                         if (vals.size() >= 2) {
 804                             if (vals[1] == "missing=error") {
 805                                 unique_missing = UNIQUE_ERROR;
 806                             } else if (vals[1] == "missing=new") {
 807                                 unique_missing = UNIQUE_NEW;
 808                             } else if (vals[1] == "missing=warn+new") {
 809                                 unique_missing = UNIQUE_WARN_NEW;
 810                             } else if (vals[1] == "missing=skip") {
 811                                 unique_missing = UNIQUE_SKIP;
 812                             } else if (vals[1] == "missing=warn+skip") {
 813                                 unique_missing = UNIQUE_WARN_SKIP;
 814                             } else {
 815                                 report_location(DIAG_ERROR, filename, line_no);
 816                                 cerr << "Bad unique parameter '" << vals[1]
 817                                      << "'\n";
 818                             }
 819                         }
 820                         actions.emplace_back(code, action_pos, val);
 821                         break;
 822                     case Action::GAP: {
 823                         actions.emplace_back(code, action_pos, val);
 824                         auto& obj = actions.back();
 825                         auto gap_size = obj.get_num_arg();
 826                         if (gap_size <= 0) {
 827                             report_location(DIAG_ERROR, filename, line_no,
 828                                             obj.get_pos() + 3 + 1);
 829                             cerr << "Index action 'gap' takes a strictly "
 830                                     "positive integer argument\n";
 831                         }
 832                         break;
 833                     }
 834                     case Action::HASH: {
 835                         actions.emplace_back(code, action_pos, val);
 836                         auto& obj = actions.back();
 837                         auto max_length = obj.get_num_arg();
 838                         if (max_length < 6) {
 839                             report_location(DIAG_ERROR, filename, line_no,
 840                                             obj.get_pos() + 4 + 1);
 841                             cerr << "Index action 'hash' takes an integer "
 842                                     "argument which must be at least 6\n";
 843                         }
 844                         break;
 845                     }
 846                     case Action::LTRIM:
 847                     case Action::RTRIM:
 848                     case Action::SQUASH:
 849                     case Action::TRIM:
 850                         for (unsigned char ch : val) {
 851                             if (ch >= 0x80) {
 852                                 auto column = actions.back().get_pos() +
 853                                               strlen(action_names[code]) + 1;
 854                                 report_location(DIAG_ERROR, filename, line_no,
 855                                                 column);
 856                                 cerr << "Index action '" << action_names[code]
 857                                      << "' only support ASCII characters "
 858                                         "currently\n";
 859                             }
 860                         }
 861                         actions.emplace_back(code, action_pos, val);
 862                         break;
 863                     case Action::BOOLEAN:
 864                         boolmap[val] = Action::BOOLEAN;
 865                         /* FALLTHRU */
 866                     default:
 867                         actions.emplace_back(code, action_pos, val);
 868                 }
 869                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 870             } else {
 871                 if (min_args > 0) {
 872                     report_location(DIAG_ERROR, filename, line_no,
 873                                     i_after_action - s.begin());
 874                     if (min_args == max_args) {
 875                         cerr << "Index action '" << action << "' requires "
 876                              << min_args << " arguments\n";
 877                     } else {
 878                         cerr << "Index action '" << action << "' requires "
 879                                 "at least " << min_args << " arguments\n";
 880                     }
 881                 }
 882                 switch (code) {
 883                     case Action::INDEX:
 884                     case Action::INDEXNOPOS:
 885                         useless_weight_pos = string::npos;
 886                         actions.emplace_back(code, action_pos, "", weight);
 887                         break;
 888                     case Action::GAP:
 889                         actions.emplace_back(code, action_pos, "", 100);
 890                         break;
 891                     case Action::HASH:
 892                         actions.emplace_back(code, action_pos, "",
 893                                              MAX_SAFE_TERM_LENGTH - 1);
 894                         break;
 895                     case Action::LTRIM:
 896                     case Action::RTRIM:
 897                     case Action::SQUASH:
 898                     case Action::TRIM:
 899                         actions.emplace_back(code, action_pos, " \t\f\v\r\n");
 900                         break;
 901                     default:
 902                         actions.emplace_back(code, action_pos);
 903                         break;
 904                 }
 905             }
 906             j = i;
 907         }
 908
 909         if (useless_weight_pos != string::npos) {
 910             report_useless_action(filename, line_no, useless_weight_pos,
 911                                   "weight");
 912         }
 913
 914         while (!actions.empty()) {
 915             bool done = true;
 916             Action::type action = actions.back().get_action();
 917             switch (action) {
 918                 case Action::HASH:
 919                 case Action::HEXTOBIN:
 920                 case Action::LOWER:
 921                 case Action::LTRIM:
 922                 case Action::PARSEDATE:
 923                 case Action::RTRIM:
 924                 case Action::SPELL:
 925                 case Action::SQUASH:
 926                 case Action::TRIM:
 927                 case Action::TRUNCATE:
 928                 case Action::UNHTML:
 929                 case Action::UNXML:
 930                     done = false;
 931                     report_useless_action(filename, line_no,
 932                                           actions.back().get_pos(),
 933                                           action_names[action]);
 934                     actions.pop_back();
 935                     break;
 936                 default:
 937                     break;
 938             }
 939             if (done) break;
 940         }
 941
 942         map<string, Action::type>::const_iterator boolpfx;
 943         for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
 944             if (boolpfx->second == Action::UNIQUE) {
 945                 report_location(DIAG_WARN, filename, unique_line_no,
 946                                 unique_pos);
 947                 cerr << "Index action 'unique=" << boolpfx->first
 948                      << "' without 'boolean=" << boolpfx->first << "'\n";
 949                 static bool given_doesnt_imply_boolean_warning = false;
 950                 if (!given_doesnt_imply_boolean_warning) {
 951                     given_doesnt_imply_boolean_warning = true;
 952                     report_location(DIAG_NOTE, filename, unique_line_no,
 953                                     unique_pos);
 954                     cerr << "'unique' doesn't implicitly add a boolean term\n";
 955                 }
 956             }
 957         }
 958
 959         vector<string>::const_iterator field;
 960         for (field = fields.begin(); field != fields.end(); ++field) {
 961             vector<Action> &v = index_spec[*field];
 962             if (v.empty()) {
 963                 if (fields.size() == 1) {
 964                     // Optimise common case where there's only one fieldname
 965                     // for a list of actions.
 966                     v = std::move(actions);
 967                 } else {
 968                     v = actions;
 969                 }
 970             } else {
 971                 v.emplace_back(Action::NEW, string::npos);
 972                 v.insert(v.end(), actions.begin(), actions.end());
 973             }
 974         }
 975     }
 976
 977     if (index_spec.empty()) {
 978         report_location(DIAG_ERROR, filename, line_no);
 979         cerr << "No rules found in index script\n";
 980     }
 981
 982     if (error_count) {
 983         exit(1);
 984     }
 985
 986     index_spec_uses_unique = (unique_line_no > 0);
 987 }
 988
 989 static bool
 990 run_actions(vector<Action>::const_iterator action_it,
 991             vector<Action>::const_iterator action_end,
 992             Xapian::WritableDatabase& database,
 993             Xapian::TermGenerator& indexer,
 994             const string& old_value,
 995             bool& this_field_is_content, Xapian::Document& doc,
 996             map<string, list<string>>& fields,
 997             string& field, const char* fname,
 998             size_t line_no, Xapian::docid& docid)
 999 {
1000     string value = old_value;
1001     while (action_it != action_end) {
1002         auto& action = *action_it++;
1003         switch (action.get_action()) {
1004             case Action::BAD:
1005                 abort();
1006             case Action::NEW:
1007                 value = old_value;
1008                 break;
1009             case Action::FIELD:
1010                 if (!value.empty()) {
1011                     string f = action.get_string_arg();
1012                     if (f.empty()) f = field;
1013                     // replace newlines with spaces
1014                     string s = value;
1015                     string::size_type j = 0;
1016                     while ((j = s.find('\n', j)) != string::npos)
1017                         s[j] = ' ';
1018                     fields[f].push_back(s);
1019                 }
1020                 break;
1021             case Action::INDEX:
1022                 indexer.index_text(value,
1023                                    action.get_num_arg(),
1024                                    action.get_string_arg());
1025                 break;
1026             case Action::INDEXNOPOS:
1027                 // No positional information so phrase searching won't work.
1028                 // However, the database will use much less diskspace.
1029                 indexer.index_text_without_positions(value,
1030                                                      action.get_num_arg(),
1031                                                      action.get_string_arg());
1032                 break;
1033             case Action::BOOLEAN: {
1034                 // Do nothing if there's no text.
1035                 if (value.empty()) break;
1036
1037                 string term = action.get_string_arg();
1038                 if (prefix_needs_colon(term, value[0])) term += ':';
1039                 term += value;
1040
1041                 doc.add_boolean_term(term);
1042                 break;
1043             }
1044             case Action::GAP:
1045                 indexer.increase_termpos(action.get_num_arg());
1046                 break;
1047             case Action::HASH: {
1048                 unsigned int max_length = action.get_num_arg();
1049                 if (value.length() > max_length)
1050                     value = hash_long_term(value, max_length);
1051                 break;
1052             }
1053             case Action::HEXTOBIN: {
1054                 size_t len = value.length();
1055                 if (len & 1) {
1056                     report_location(DIAG_ERROR, fname, line_no);
1057                     cerr << "hextobin: input must have even length\n";
1058                     exit(1);
1059                 }
1060
1061                 string output;
1062                 output.reserve(len / 2);
1063                 for (size_t j = 0; j < len; j += 2) {
1064                     char a = value[j];
1065                     char b = value[j + 1];
1066                     if (!C_isxdigit(a) || !C_isxdigit(b)) {
1067                         report_location(DIAG_ERROR, fname, line_no);
1068                         cerr << "hextobin: input must be all hex digits\n";
1069                         exit(1);
1070                     }
1071                     char r = hex_decode(a, b);
1072                     output.push_back(r);
1073                 }
1074                 value = std::move(output);
1075                 break;
1076             }
1077             case Action::LOWER:
1078                 value = Xapian::Unicode::tolower(value);
1079                 break;
1080             case Action::LTRIM:
1081                 ltrim(value, action.get_string_arg());
1082                 break;
1083             case Action::RTRIM:
1084                 rtrim(value, action.get_string_arg());
1085                 break;
1086             case Action::TRIM:
1087                 rtrim(value, action.get_string_arg());
1088                 ltrim(value, action.get_string_arg());
1089                 break;
1090             case Action::SQUASH:
1091                 squash(value, action.get_string_arg());
1092                 break;
1093             case Action::LOAD: {
1094                 // If there's no input, just issue a warning.
1095                 if (value.empty()) {
1096                     report_location(DIAG_WARN, fname, line_no);
1097                     cerr << "Empty filename in LOAD action\n";
1098                     break;
1099                 }
1100                 bool truncated = false;
1101                 string filename = std::move(value);
1102                 // FIXME: Use NOATIME if we own the file or are root.
1103                 if (!load_file(filename, action.get_num_arg(), NOCACHE,
1104                                value, truncated)) {
1105                     report_location(DIAG_ERROR, fname, line_no);
1106                     cerr << "Couldn't load file '" << filename << "': "
1107                          << strerror(errno) << '\n';
1108                     exit(1);
1109                 }
1110                 if (!truncated) break;
1111             }
1112             /* FALLTHRU */
1113             case Action::TRUNCATE:
1114                 utf8_truncate(value, action.get_num_arg());
1115                 break;
1116             case Action::SPELL:
1117                 indexer.set_flags(indexer.FLAG_SPELLING);
1118                 break;
1119             case Action::SPLIT: {
1120                 // Find the end of the actions which split should execute.
1121                 auto split_end = find(action_it, action_end, Action::NEW);
1122
1123                 int split_type = action.get_num_arg();
1124                 if (value.empty()) {
1125                     // Nothing to do.
1126                 } else if (split_type != Action::SPLIT_SORT) {
1127                     // Generate split as we consume it.
1128                     const string& delimiter = action.get_string_arg();
1129
1130                     unique_ptr<unordered_set<string>> seen;
1131                     if (split_type == Action::SPLIT_DEDUP) {
1132                         seen.reset(new unordered_set<string>);
1133                     }
1134
1135                     if (delimiter.size() == 1) {
1136                         // Special case for common single character delimiter.
1137                         char ch = delimiter[0];
1138                         string::size_type i = 0;
1139                         while (true) {
1140                             string::size_type j = value.find(ch, i);
1141                             if (split_type == Action::SPLIT_PREFIXES) {
1142                                 if (j > 0) {
1143                                     string val(value, 0, j);
1144                                     run_actions(action_it, split_end,
1145                                                 database, indexer,
1146                                                 val,
1147                                                 this_field_is_content, doc,
1148                                                 fields,
1149                                                 field, fname, line_no,
1150                                                 docid);
1151                                 }
1152                             } else if (i != j) {
1153                                 string val(value, i, j - i);
1154                                 if (!seen.get() || seen->insert(val).second) {
1155                                     run_actions(action_it, split_end,
1156                                                 database, indexer,
1157                                                 val,
1158                                                 this_field_is_content, doc,
1159                                                 fields,
1160                                                 field, fname, line_no,
1161                                                 docid);
1162                                 }
1163                             }
1164                             if (j == string::npos) break;
1165                             i = j + 1;
1166                         }
1167                     } else {
1168                         string::size_type i = 0;
1169                         while (true) {
1170                             string::size_type j = value.find(delimiter, i);
1171                             if (split_type == Action::SPLIT_PREFIXES) {
1172                                 if (j > 0) {
1173                                     string val(value, 0, j);
1174                                     run_actions(action_it, split_end,
1175                                                 database, indexer,
1176                                                 val,
1177                                                 this_field_is_content, doc,
1178                                                 fields,
1179                                                 field, fname, line_no,
1180                                                 docid);
1181                                 }
1182                             } else if (i != j) {
1183                                 string val(value, i, j - i);
1184                                 if (!seen.get() || seen->insert(val).second) {
1185                                     run_actions(action_it, split_end,
1186                                                 database, indexer,
1187                                                 val,
1188                                                 this_field_is_content, doc,
1189                                                 fields,
1190                                                 field, fname, line_no,
1191                                                 docid);
1192                                 }
1193                             }
1194                             if (j == string::npos) break;
1195                             i = j + delimiter.size();
1196                         }
1197                     }
1198                 } else {
1199                     vector<string> split_values;
1200                     const string& delimiter = action.get_string_arg();
1201                     if (delimiter.size() == 1) {
1202                         // Special case for common single character delimiter.
1203                         char ch = delimiter[0];
1204                         string::size_type i = 0;
1205                         while (true) {
1206                             string::size_type j = value.find(ch, i);
1207                             if (i != j) {
1208                                 split_values.emplace_back(value, i, j - i);
1209                             }
1210                             if (j == string::npos) break;
1211                             i = j + 1;
1212                         }
1213                     } else {
1214                         string::size_type i = 0;
1215                         while (true) {
1216                             string::size_type j = value.find(delimiter, i);
1217                             if (i != j) {
1218                                 split_values.emplace_back(value, i, j - i);
1219                             }
1220                             if (j == string::npos) break;
1221                             i = j + delimiter.size();
1222                         }
1223                     }
1224
1225                     sort(split_values.begin(), split_values.end());
1226
1227                     for (auto&& val : split_values) {
1228                         run_actions(action_it, split_end,
1229                                     database, indexer, val,
1230                                     this_field_is_content, doc, fields,
1231                                     field, fname, line_no,
1232                                     docid);
1233                     }
1234                 }
1235
1236                 action_it = split_end;
1237                 break;
1238             }
1239             case Action::UNHTML: {
1240                 HtmlParser p;
1241                 try {
1242                     // Default HTML character set is latin 1, though
1243                     // not specifying one is deprecated these days.
1244                     p.parse(value, "iso-8859-1", false);
1245                 } catch (const string & newcharset) {
1246                     p.reset();
1247                     p.parse(value, newcharset, true);
1248                 }
1249                 if (p.indexing_allowed)
1250                     value = p.dump;
1251                 else
1252                     value = "";
1253                 break;
1254             }
1255             case Action::UNXML: {
1256                 GenericXmlParser p;
1257                 p.parse(value);
1258                 value = std::move(p.dump);
1259                 break;
1260             }
1261             case Action::UNIQUE: {
1262                 unique_unused = false;
1263
1264                 if (value.empty()) {
1265                     enum diag_type diag = DIAG_WARN;
1266                     switch (unique_missing) {
1267                       case UNIQUE_ERROR:
1268                         diag = DIAG_ERROR;
1269                         /* FALLTHRU */
1270                       case UNIQUE_WARN_NEW:
1271                       case UNIQUE_WARN_SKIP:
1272                         report_location(diag, fname, line_no);
1273                         cerr << "UNIQUE action on empty text\n";
1274                       default:
1275                         break;
1276                     }
1277                     switch (unique_missing) {
1278                       case UNIQUE_ERROR:
1279                         exit(1);
1280                       case UNIQUE_SKIP:
1281                       case UNIQUE_WARN_SKIP:
1282                         skipping_record = true;
1283                         break;
1284                       case UNIQUE_NEW:
1285                       case UNIQUE_WARN_NEW:
1286                         break;
1287                     }
1288                     break;
1289                 }
1290
1291                 // Ensure that the value of this field is unique.
1292                 // If a record already exists with the same value,
1293                 // it will be replaced with the new record.
1294
1295                 // Unique fields aren't considered content - if
1296                 // there are no other fields in the document, the
1297                 // document is to be deleted.
1298                 this_field_is_content = false;
1299
1300                 // Argument is the prefix to add to the field value
1301                 // to get the unique term.
1302                 string t = action.get_string_arg();
1303                 if (prefix_needs_colon(t, value[0])) t += ':';
1304                 t += value;
1305                 Xapian::PostingIterator p = database.postlist_begin(t);
1306                 if (p != database.postlist_end(t)) {
1307                     docid = *p;
1308                 }
1309                 break;
1310             }
1311             case Action::VALUE:
1312                 if (!value.empty())
1313                     doc.add_value(action.get_num_arg(), value);
1314                 break;
1315             case Action::VALUENUMERIC: {
1316                 if (value.empty()) break;
1317                 char * end;
1318                 double dbl = strtod(value.c_str(), &end);
1319                 if (*end) {
1320                     report_location(DIAG_WARN, fname, line_no);
1321                     cerr << "Trailing characters in VALUENUMERIC: '"
1322                          << value << "'\n";
1323                 }
1324                 doc.add_value(action.get_num_arg(),
1325                               Xapian::sortable_serialise(dbl));
1326                 break;
1327             }
1328             case Action::VALUEPACKED: {
1329                 uint32_t word = 0;
1330                 if (value.empty() || !C_isdigit(value[0])) {
1331                     // strtoul() accepts leading whitespace and negated
1332                     // values, neither of which we want to allow.
1333                     errno = EINVAL;
1334                 } else {
1335                     errno = 0;
1336                     char* q;
1337                     word = strtoul(value.c_str(), &q, 10);
1338                     if (!errno && *q != '\0') {
1339                         // Trailing characters after converted value.
1340                         errno = EINVAL;
1341                     }
1342                 }
1343                 if (errno) {
1344                     report_location(DIAG_WARN, fname, line_no);
1345                     cerr << "valuepacked \"" << value << "\" ";
1346                     if (errno == ERANGE) {
1347                         cerr << "out of range\n";
1348                     } else {
1349                         cerr << "not an unsigned integer\n";
1350                     }
1351                 }
1352                 int valueslot = action.get_num_arg();
1353                 doc.add_value(valueslot, int_to_binary_string(word));
1354                 break;
1355             }
1356             case Action::DATE: {
1357                 // Do nothing for empty input.
1358                 if (value.empty()) break;
1359
1360                 const string & type = action.get_string_arg();
1361                 string yyyymmdd;
1362                 if (type == "unix") {
1363                     time_t t;
1364                     if (!parse_signed(value.c_str(), t)) {
1365                         report_location(DIAG_WARN, fname, line_no);
1366                         cerr << "Date value (in secs) for action DATE "
1367                                 "must be an integer - ignoring\n";
1368                         break;
1369                     }
1370                     struct tm *tm = localtime(&t);
1371                     int y = tm->tm_year + 1900;
1372                     int m = tm->tm_mon + 1;
1373                     yyyymmdd = date_to_string(y, m, tm->tm_mday);
1374                 } else if (type == "unixutc") {
1375                     time_t t;
1376                     if (!parse_signed(value.c_str(), t)) {
1377                         report_location(DIAG_WARN, fname, line_no);
1378                         cerr << "Date value (in secs) for action DATE "
1379                                 "must be an integer - ignoring\n";
1380                         break;
1381                     }
1382                     struct tm *tm = gmtime(&t);
1383                     int y = tm->tm_year + 1900;
1384                     int m = tm->tm_mon + 1;
1385                     yyyymmdd = date_to_string(y, m, tm->tm_mday);
1386                 } else if (type == "yyyymmdd") {
1387                     if (value.length() != 8) {
1388                         report_location(DIAG_WARN, fname, line_no);
1389                         cerr << "date=yyyymmdd expects an 8 character value "
1390                                 "- ignoring\n";
1391                         break;
1392                     }
1393                     yyyymmdd = value;
1394                 }
1395
1396                 // Date (YYYYMMDD)
1397                 doc.add_boolean_term("D" + yyyymmdd);
1398                 yyyymmdd.resize(6);
1399                 // Month (YYYYMM)
1400                 doc.add_boolean_term("M" + yyyymmdd);
1401                 yyyymmdd.resize(4);
1402                 // Year (YYYY)
1403                 doc.add_boolean_term("Y" + yyyymmdd);
1404                 break;
1405             }
1406             case Action::PARSEDATE: {
1407                 string dateformat = action.get_string_arg();
1408                 struct tm tm;
1409                 memset(&tm, 0, sizeof(tm));
1410                 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
1411                 if (ret == NULL) {
1412                     report_location(DIAG_WARN, fname, line_no);
1413                     cerr << "\"" << value << "\" doesn't match format "
1414                             "\"" << dateformat << '\"' << '\n';
1415                     break;
1416                 }
1417
1418                 if (*ret != '\0') {
1419                     report_location(DIAG_WARN, fname, line_no);
1420                     cerr << "\"" << value << "\" not fully matched by "
1421                             "format \"" << dateformat << "\" "
1422                             "(\"" << ret << "\" left over) but "
1423                             "indexing anyway\n";
1424                 }
1425 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1426                 auto gmtoff = tm.tm_gmtoff;
1427 #endif
1428                 auto secs_since_epoch = timegm(&tm);
1429 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1430                 secs_since_epoch -= gmtoff;
1431 #endif
1432                 value = str(secs_since_epoch);
1433                 break;
1434             }
1435             default:
1436                 /* Empty default case to avoid "unhandled enum value"
1437                  * warnings. */
1438                 break;
1439         }
1440     }
1441     return true;
1442 }
1443
1444 static void
1445 index_file(const char *fname, istream &stream,
1446            Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
1447 {
1448     string line;
1449     size_t line_no = 0;
1450     while (!stream.eof() && getline_portable(stream, line)) {
1451         ++line_no;
1452         // Allow blank lines before the first record and multiple blank lines
1453         // between records.
1454         if (line.empty()) continue;
1455
1456         Xapian::Document doc;
1457         indexer.set_document(doc);
1458         Xapian::docid docid = 0;
1459         map<string, list<string>> fields;
1460         bool seen_content = false;
1461         skipping_record = false;
1462         unique_unused = index_spec_uses_unique;
1463         while (!line.empty()) {
1464             string::size_type eq = line.find('=');
1465             if (eq == string::npos && !line.empty()) {
1466                 report_location(DIAG_ERROR, fname, line_no);
1467                 cerr << "Expected = somewhere in this line\n";
1468                 exit(1);
1469             }
1470             string field(line, 0, eq);
1471             string value(line, eq + 1, string::npos);
1472             line.clear();
1473             while (getline_portable(stream, line)) {
1474                 ++line_no;
1475                 if (line.empty() || line[0] != '=') break;
1476                 // Replace the '=' with a '\n'.
1477                 line[0] = '\n';
1478                 value += line;
1479                 line.erase();
1480             }
1481
1482             if (skipping_record) continue;
1483
1484             // Default to not indexing spellings.
1485             indexer.set_flags(Xapian::TermGenerator::flags(0));
1486
1487             bool this_field_is_content = true;
1488             const vector<Action>& v = index_spec[field];
1489             run_actions(v.begin(), v.end(),
1490                         database, indexer, value,
1491                         this_field_is_content, doc, fields,
1492                         field, fname, line_no,
1493                         docid);
1494             if (this_field_is_content) seen_content = true;
1495         }
1496
1497         if (unique_unused) {
1498             enum diag_type diag = DIAG_WARN;
1499             switch (unique_missing) {
1500               case UNIQUE_ERROR:
1501                 diag = DIAG_ERROR;
1502                 /* FALLTHRU */
1503               case UNIQUE_WARN_NEW:
1504               case UNIQUE_WARN_SKIP:
1505                 report_location(diag, fname, line_no);
1506                 cerr << "UNIQUE action unused in this record\n";
1507               default:
1508                 break;
1509             }
1510             switch (unique_missing) {
1511               case UNIQUE_ERROR:
1512                 exit(1);
1513               case UNIQUE_SKIP:
1514               case UNIQUE_WARN_SKIP:
1515                 skipping_record = true;
1516                 break;
1517               case UNIQUE_NEW:
1518               case UNIQUE_WARN_NEW:
1519                 break;
1520             }
1521         }
1522
1523         if (skipping_record) {
1524             ++skipcount;
1525         } else if (!seen_content) {
1526             // We haven't seen any fields (other than unique identifiers)
1527             // so the document is to be deleted.
1528             if (docid) {
1529                 database.delete_document(docid);
1530                 if (verbose) cout << "Del: " << docid << '\n';
1531                 ++delcount;
1532             }
1533         } else {
1534             string data;
1535             for (auto&& i : fields) {
1536                 for (auto&& field_val : i.second) {
1537                     data += i.first;
1538                     data += '=';
1539                     data += field_val;
1540                     data += '\n';
1541                 }
1542             }
1543
1544             // Put the data in the document
1545             doc.set_data(data);
1546
1547             // Add the document to the database
1548             if (docid) {
1549                 database.replace_document(docid, doc);
1550                 if (verbose) cout << "Replace: " << docid << '\n';
1551                 ++repcount;
1552             } else {
1553                 docid = database.add_document(doc);
1554                 if (verbose) cout << "Add: " << docid << '\n';
1555                 ++addcount;
1556             }
1557         }
1558     }
1559
1560     // Commit after each file to make sure all changes from that file make it
1561     // in.
1562     if (verbose) cout << "Committing\n";
1563     database.commit();
1564 }
1565
1566 [[noreturn]]
1567 static void
1568 show_help(int exit_code)
1569 {
1570     cout << PROG_NAME " - " PROG_DESC "\n"
1571 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1572 "\n"
1573 "Creates or updates a Xapian database with the data from the input files listed\n"
1574 "on the command line.  If no files are specified, data is read from stdin.\n"
1575 "\n"
1576 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1577 "format for INDEXER_SCRIPT.\n"
1578 "\n"
1579 "Options:\n"
1580 "  -v, --verbose       display additional messages to aid debugging\n"
1581 "      --overwrite     create the database anew (the default is to update if\n"
1582 "                      the database already exists)\n";
1583     print_stemmer_help("");
1584     print_help_and_version_help("");
1585     exit(exit_code);
1586 }
1587
1588 int
1589 main(int argc, char **argv)
1590 try {
1591     // If the database already exists, default to updating not overwriting.
1592     int database_mode = Xapian::DB_CREATE_OR_OPEN;
1593     verbose = false;
1594     Xapian::Stem stemmer("english");
1595
1596     // Without this, strptime() seems to treat formats without a timezone as
1597     // being local time, including %s.
1598     setenv("TZ", "UTC", 1);
1599
1600     constexpr auto NO_ARG = no_argument;
1601     constexpr auto REQ_ARG = required_argument;
1602     static const struct option longopts[] = {
1603         { "help",       NO_ARG,         NULL, 'h' },
1604         { "version",    NO_ARG,         NULL, 'V' },
1605         { "stemmer",    REQ_ARG,        NULL, 's' },
1606         { "overwrite",  NO_ARG,         NULL, 'o' },
1607         { "verbose",    NO_ARG,         NULL, 'v' },
1608         { 0, 0, NULL, 0 }
1609     };
1610
1611     int getopt_ret;
1612     while ((getopt_ret = gnu_getopt_long(argc, argv, "vs:hV",
1613                                          longopts, NULL)) != -1) {
1614         switch (getopt_ret) {
1615             default:
1616                 show_help(1);
1617                 break;
1618             case 'h': // --help
1619                 show_help(0);
1620                 break;
1621             case 'V': // --version
1622                 print_package_info(PROG_NAME);
1623                 return 0;
1624             case 'o': // --overwrite
1625                 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
1626                 break;
1627             case 'v':
1628                 verbose = true;
1629                 break;
1630             case 's':
1631                 try {
1632                     stemmer = Xapian::Stem(optarg);
1633                 } catch (const Xapian::InvalidArgumentError &) {
1634                     cerr << "Unknown stemming language '" << optarg << "'.\n";
1635                     cerr << "Available language names are: "
1636                          << Xapian::Stem::get_available_languages() << '\n';
1637                     return 1;
1638                 }
1639                 break;
1640         }
1641     }
1642
1643     argv += optind;
1644     argc -= optind;
1645     if (argc < 2) {
1646         show_help(1);
1647     }
1648
1649     parse_index_script(argv[1]);
1650
1651     // Open the database.  If another process is currently updating the
1652     // database, wait for the lock to become available.
1653     auto flags = database_mode | Xapian::DB_RETRY_LOCK;
1654     Xapian::WritableDatabase database(argv[0], flags);
1655
1656     Xapian::TermGenerator indexer;
1657     indexer.set_stemmer(stemmer);
1658     // Set the database for spellings to be added to by the "spell" action.
1659     indexer.set_database(database);
1660
1661     addcount = 0;
1662     repcount = 0;
1663     delcount = 0;
1664     skipcount = 0;
1665
1666     if (argc == 2) {
1667         // Read from stdin.
1668         index_file("<stdin>", cin, database, indexer);
1669     } else {
1670         // Read file(s) listed on the command line.
1671         for (int i = 2; i < argc; ++i) {
1672             ifstream stream(argv[i]);
1673             if (stream) {
1674                 index_file(argv[i], stream, database, indexer);
1675             } else {
1676                 cerr << "Can't open file " << argv[i] << '\n';
1677             }
1678         }
1679     }
1680
1681     cout << "records (added, replaced, deleted, skipped) = ("
1682          << addcount << ", "
1683          << repcount << ", "
1684          << delcount << ", "
1685          << skipcount << ")\n";
1686 } catch (const Xapian::Error &error) {
1687     cerr << "Exception: " << error.get_description() << '\n';
1688     exit(1);
1689 } catch (const std::bad_alloc &) {
1690     cerr << "Exception: std::bad_alloc\n";
1691     exit(1);
1692 } catch (...) {
1693     cerr << "Unknown Exception\n";
1694     exit(1);
1695 }