xapian-applications/omega/scriptindex.cc

   1 /** @file
   2  * @brief index arbitrary data as described by an index script
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001 Sam Liddicott
   6  * Copyright 2001,2002 Ananova Ltd
   7  * Copyright 2002-2022 Olly Betts
   8  *
   9  * This program is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU General Public License as
  11  * published by the Free Software Foundation; either version 2 of the
  12  * License, or (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  22  * USA
  23  */
  24
  25 #include <config.h>
  26
  27 #include <xapian.h>
  28
  29 #include <algorithm>
  30 #include <fstream>
  31 #include <iostream>
  32 #include <list>
  33 #include <map>
  34 #include <memory>
  35 #include <string>
  36 #include <unordered_set>
  37 #include <vector>
  38 #include <cstring>
  39
  40 #include <cerrno>
  41 #include <cstdio>
  42 #include <cstdlib>
  43 #include <ctime>
  44
  45 #include "commonhelp.h"
  46 #include "hashterm.h"
  47 #include "loadfile.h"
  48 #include "myhtmlparse.h"
  49 #include "parseint.h"
  50 #include "setenv.h"
  51 #include "str.h"
  52 #include "stringutils.h"
  53 #include "timegm.h"
  54 #include "utf8truncate.h"
  55 #include "utils.h"
  56 #include "values.h"
  57
  58 #ifndef HAVE_STRPTIME
  59 #include "portability/strptime.h"
  60 #endif
  61
  62 #include "gnu_getopt.h"
  63
  64 using namespace std;
  65
  66 #define PROG_NAME "scriptindex"
  67 #define PROG_DESC "index arbitrary data as described by an index script"
  68
  // File-local run-time state.  The explicit initialisers only spell out the
  // values that static storage duration would give these objects anyway.
  static bool verbose = false;
  // Counters - judging by the names these tally documents added, replaced and
  // deleted, but the code which updates them is outside this part of the file.
  static int addcount = 0;
  static int repcount = 0;
  static int delcount = 0;
  73
  74 static inline bool
  75 prefix_needs_colon(const string & prefix, unsigned ch)
  76 {
  77     if (!C_isupper(ch) && ch != ':') return false;
  78     string::size_type len = prefix.length();
  79     return (len > 1 && prefix[len - 1] != ':');
  80 }
  81
  // Printable name for each action, indexed by the numeric value of the
  // corresponding Action::type enumerator (DUMP_ACTION relies on this), so the
  // order of the entries here has to mirror the order of that enum.
  const char * action_names[] = {
      // Pseudo-actions which only exist internally:
      "bad", "new",
      // Actions which can appear in an index script:
      "boolean", "date", "field", "gap",
      "hash", "hextobin",
      "index", "indexnopos",
      "load", "lower", "ltrim",
      "parsedate",
      "rtrim",
      "spell", "split", "squash",
      "trim", "truncate",
      "unhtml", "unique",
      "value", "valuenumeric", "valuepacked",
      "weight"
  };
 112
 // Debugging aid: writes action A to stdout as NAME(STRING_ARG,NUM_ARG) and
 // ends the line.  Expands to a single expression, so the caller supplies the
 // terminating semicolon.
 #define DUMP_ACTION(A) \
     std::cout << action_names[(A).get_action()] \
               << "(" << (A).get_string_arg() \
               << "," << (A).get_num_arg() \
               << ")" << std::endl
 115
 /** A single action from an index script line.
  *
  *  Each Action records which action it is, an optional string argument, an
  *  integer argument, and the offset within the index script line at which the
  *  action appeared.
  */
 class Action {
   public:
     /// The kinds of action.  The numeric values index action_names[].
     enum type {
         // Pseudo-actions which only exist internally:
         BAD,
         NEW,
         // Actions which can appear in an index script:
         BOOLEAN,
         DATE,
         FIELD,
         GAP,
         HASH,
         HEXTOBIN,
         INDEX,
         INDEXNOPOS,
         LOAD,
         LOWER,
         LTRIM,
         PARSEDATE,
         RTRIM,
         SPELL,
         SPLIT,
         SQUASH,
         TRIM,
         TRUNCATE,
         UNHTML,
         UNIQUE,
         VALUE,
         VALUENUMERIC,
         VALUEPACKED,
         WEIGHT
     };

     /// Values which can be stored as the integer argument of a SPLIT action.
     enum {
         SPLIT_NONE,
         SPLIT_DEDUP,
         SPLIT_SORT,
         SPLIT_PREFIXES
     };

   private:
     /// Which action this is.
     type action;

     /// Integer argument (0 unless a constructor or set_num_arg() says otherwise).
     int num_arg = 0;

     /// String argument (empty if none was given).
     std::string string_arg;

     /// Offset into the index script line.
     size_t pos;

   public:
     /// Construct an action with no argument: num_arg is 0, string_arg empty.
     Action(type action_, size_t pos_)
         : action(action_), pos(pos_) { }

     /** Construct an action with a string argument.
      *
      *  The integer argument is obtained by running atoi() over @a arg, so it
      *  is 0 when @a arg doesn't start with a number.
      */
     Action(type action_, size_t pos_, const std::string & arg)
         : action(action_),
           num_arg(std::atoi(arg.c_str())),
           string_arg(arg),
           pos(pos_) { }

     /// Construct an action with independent string and integer arguments.
     Action(type action_, size_t pos_, const std::string & arg, int num)
         : action(action_), num_arg(num), string_arg(arg), pos(pos_) { }

     /// Which action this is.
     type get_action() const { return action; }

     /// The integer argument.
     int get_num_arg() const { return num_arg; }

     /// Replace the integer argument (the string argument is left alone).
     void set_num_arg(int num) { num_arg = num; }

     /// The string argument.
     const std::string & get_string_arg() const { return string_arg; }

     /// Offset into the index script line.
     size_t get_pos() const { return pos; }
 };
 170
 171 // These allow searching for an Action with a particular Action::type using
 172 // std::find().
 173
 174 inline bool
 175 operator==(const Action& a, Action::type t) { return a.get_action() == t; }
 176
 177 inline bool
 178 operator==(Action::type t, const Action& a) { return a.get_action() == t; }
 179
 180 inline bool
 181 operator!=(const Action& a, Action::type t) { return !(a == t); }
 182
 183 inline bool
 184 operator!=(Action::type t, const Action& a) { return !(t == a); }
 185
 /** Strip leading characters from @a s in place.
  *
  *  Every character at the start of @a s which occurs in @a chars is removed.
  *  If @a s consists entirely of such characters it ends up empty.
  */
 static void
 ltrim(std::string& s, const std::string& chars)
 {
     const std::string::size_type keep_from = s.find_first_not_of(chars);
     // Nothing to strip if the very first character is one we keep.
     if (keep_from == 0) return;
     // If nothing is kept then keep_from is npos, which erases everything.
     s.erase(0, keep_from);
 }
 192
 /** Strip trailing characters from @a s in place.
  *
  *  Every character at the end of @a s which occurs in @a chars is removed.
  *  If @a s consists entirely of such characters it ends up empty.
  */
 static void
 rtrim(std::string& s, const std::string& chars)
 {
     const std::string::size_type last_kept = s.find_last_not_of(chars);
     if (last_kept == std::string::npos) {
         // No character survives.
         s.clear();
     } else {
         s.erase(last_kept + 1);
     }
 }
 198
 /** Collapse runs of separator characters in @a s, in place.
  *
  *  @a s is split into maximal runs of characters which don't occur in
  *  @a chars; those runs are then joined back together with a single space
  *  between each pair.  Leading and trailing characters from @a chars are
  *  therefore dropped entirely.
  */
 static void
 squash(std::string& s, const std::string& chars)
 {
     std::string squashed;
     squashed.reserve(s.size());

     std::string::size_type start = s.find_first_not_of(chars);
     while (start != std::string::npos) {
         // The joiner is always a space, whatever was in chars.
         if (!squashed.empty()) squashed += ' ';

         const std::string::size_type end = s.find_first_of(chars, start);
         if (end == std::string::npos) {
             // Final run - it extends to the end of the input.
             squashed.append(s, start, std::string::npos);
             break;
         }
         squashed.append(s, start, end - start);
         start = s.find_first_not_of(chars, end);
     }

     s = std::move(squashed);
 }
 213
 /// Severity of a diagnostic message.
 enum diag_type {
     DIAG_ERROR,
     DIAG_WARN,
     DIAG_NOTE
 };

 /// How many DIAG_ERROR diagnostics report_location() has been asked to start.
 static unsigned error_count = 0;

 /** Write the leading part of a diagnostic message to stderr.
  *
  *  The output has the form FILENAME[:LINE[:COLUMN]] followed by ": error: ",
  *  ": warning: " or ": note: ".  No newline is written - the caller is
  *  expected to follow up with the message text itself.
  *
  *  @param type      Severity; DIAG_ERROR also increments error_count.
  *  @param filename  Name of the file the diagnostic relates to.
  *  @param line      Line number, or 0 to omit both line and column.
  *  @param pos       Zero-based offset within the line (reported as a
  *                   one-based column), or string::npos to omit the column.
  */
 static void
 report_location(enum diag_type type,
                 const std::string& filename,
                 size_t line = 0,
                 size_t pos = std::string::npos)
 {
     std::cerr << filename;

     const bool have_line = (line != 0);
     if (have_line) {
         std::cerr << ':' << line;
     }
     // A column is only meaningful when there's a line number too.
     if (have_line && pos != std::string::npos) {
         // The first column is numbered 1.
         std::cerr << ':' << pos + 1;
     }

     if (type == DIAG_ERROR) {
         std::cerr << ": error: ";
         ++error_count;
     } else if (type == DIAG_WARN) {
         std::cerr << ": warning: ";
     } else if (type == DIAG_NOTE) {
         std::cerr << ": note: ";
     }
 }
 245
 246 static void
 247 report_useless_action(const string &file, size_t line, size_t pos,
 248                       const string &action)
 249 {
 250     report_location(DIAG_WARN, file, line, pos);
 251     cerr << "Index action '" << action << "' has no effect" << endl;
 252
 253     static bool given_left_to_right_warning = false;
 254     if (!given_left_to_right_warning) {
 255         given_left_to_right_warning = true;
 256         report_location(DIAG_NOTE, file, line, pos);
 257         cerr << "Actions are executed from left to right" << endl;
 258     }
 259 }
 260
 261 static map<string, vector<Action>> index_spec;
 262
 263 static void
 264 parse_index_script(const string &filename)
 265 {
 266     ifstream script(filename.c_str());
 267     if (!script.is_open()) {
 268         report_location(DIAG_ERROR, filename);
 269         cerr << strerror(errno) << endl;
 270         exit(1);
 271     }
 272     string line;
 273     size_t line_no = 0;
 274     // Line number where we saw a `unique` action, or 0 if we haven't.
 275     int unique_line_no = 0;
 276     // Offset into line unique_line_no where the `unique` action was.
 277     size_t unique_pos = 0;
 278     while (getline(script, line)) {
 279         ++line_no;
 280         vector<string> fields;
 281         vector<Action> actions;
 282         string::const_iterator i, j;
 283         const string &s = line;
 284         i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
 285         if (i == s.end() || *i == '#') {
 286             // Blank line or comment.
 287             continue;
 288         }
 289         while (true) {
 290             if (!C_isalnum(*i)) {
 291                 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
 292                 cerr << "field name must start with alphanumeric" << endl;
 293             }
 294             j = find_if(i + 1, s.end(),
 295                         [](char ch) { return !C_isalnum(ch) && ch != '_'; });
 296             fields.push_back(string(i, j));
 297             i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
 298             if (i == s.end()) break;
 299             if (*i == ':') {
 300                 ++i;
 301                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 302                 break;
 303             }
 304             if (i == j) {
 305                 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
 306                 cerr << "bad character '" << *i << "' in field name" << endl;
 307                 ++i;
 308                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 309                 if (i == s.end()) break;
 310             }
 311         }
 312         Xapian::termcount weight = 1;
 313         size_t useless_weight_pos = string::npos;
 314         map<string, Action::type> boolmap;
 315         j = i;
 316         while (j != s.end()) {
 317             size_t action_pos = j - s.begin();
 318             i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
 319             string action(s, j - s.begin(), i - j);
 320             Action::type code = Action::BAD;
 321             unsigned min_args = 0, max_args = 0;
 322             bool takes_integer_argument = false;
 323             if (!action.empty()) {
 324                 switch (action[0]) {
 325                     case 'b':
 326                         if (action == "boolean") {
 327                             code = Action::BOOLEAN;
 328                             max_args = 1;
 329                         }
 330                         break;
 331                     case 'd':
 332                         if (action == "date") {
 333                             code = Action::DATE;
 334                             min_args = max_args = 1;
 335                         }
 336                         break;
 337                     case 'f':
 338                         if (action == "field") {
 339                             code = Action::FIELD;
 340                             max_args = 1;
 341                         }
 342                         break;
 343                     case 'g':
 344                         if (action == "gap") {
 345                             code = Action::GAP;
 346                             max_args = 1;
 347                             takes_integer_argument = true;
 348                         }
 349                         break;
 350                     case 'h':
 351                         if (action == "hash") {
 352                             code = Action::HASH;
 353                             max_args = 1;
 354                             takes_integer_argument = true;
 355                         } else if (action == "hextobin") {
 356                             code = Action::HEXTOBIN;
 357                         }
 358                         break;
 359                     case 'i':
 360                         if (action == "index") {
 361                             code = Action::INDEX;
 362                             max_args = 1;
 363                         } else if (action == "indexnopos") {
 364                             code = Action::INDEXNOPOS;
 365                             max_args = 1;
 366                         }
 367                         break;
 368                     case 'l':
 369                         if (action == "lower") {
 370                             code = Action::LOWER;
 371                         } else if (action == "load") {
 372                             code = Action::LOAD;
 373                         } else if (action == "ltrim") {
 374                             code = Action::LTRIM;
 375                             max_args = 1;
 376                         }
 377                         break;
 378                     case 'p':
 379                         if (action == "parsedate") {
 380                             code = Action::PARSEDATE;
 381                             min_args = max_args = 1;
 382                         }
 383                         break;
 384                     case 'r':
 385                         if (action == "rtrim") {
 386                             code = Action::RTRIM;
 387                             max_args = 1;
 388                         }
 389                         break;
 390                     case 's':
 391                         if (action == "spell") {
 392                             code = Action::SPELL;
 393                         } else if (action == "split") {
 394                             code = Action::SPLIT;
 395                             min_args = 1;
 396                             max_args = 2;
 397                         } else if (action == "squash") {
 398                             code = Action::SQUASH;
 399                             max_args = 1;
 400                         }
 401                         break;
 402                     case 't':
 403                         if (action == "truncate") {
 404                             code = Action::TRUNCATE;
 405                             min_args = max_args = 1;
 406                             takes_integer_argument = true;
 407                         } else if (action == "trim") {
 408                             code = Action::TRIM;
 409                             max_args = 1;
 410                         }
 411                         break;
 412                     case 'u':
 413                         if (action == "unhtml") {
 414                             code = Action::UNHTML;
 415                         } else if (action == "unique") {
 416                             code = Action::UNIQUE;
 417                             min_args = max_args = 1;
 418                         }
 419                         break;
 420                     case 'v':
 421                         if (action == "value") {
 422                             code = Action::VALUE;
 423                             min_args = max_args = 1;
 424                             takes_integer_argument = true;
 425                         } else if (action == "valuenumeric") {
 426                             code = Action::VALUENUMERIC;
 427                             min_args = max_args = 1;
 428                             takes_integer_argument = true;
 429                         } else if (action == "valuepacked") {
 430                             code = Action::VALUEPACKED;
 431                             min_args = max_args = 1;
 432                             takes_integer_argument = true;
 433                         }
 434                         break;
 435                     case 'w':
 436                         if (action == "weight") {
 437                             code = Action::WEIGHT;
 438                             min_args = max_args = 1;
 439                             // Don't set takes_integer_argument since we parse
 440                             // it with parse_unsigned() and issue an error there
 441                             // - setting takes_integer_argument would give a
 442                             // double error for arguments with a decimal point.
 443                         }
 444                         break;
 445                 }
 446             }
 447             if (code == Action::BAD) {
 448                 report_location(DIAG_ERROR, filename, line_no, action_pos);
 449                 cerr << "Unknown index action '" << action << "'" << endl;
 450             }
 451             auto i_after_action = i;
 452             i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 453
 454             if (i != s.end() && *i == '=') {
 455                 if (i != i_after_action) {
 456                     report_location(DIAG_WARN, filename, line_no,
 457                                     i_after_action - s.begin());
 458                     cerr << "putting spaces between the action and '=' is "
 459                             "deprecated" << endl;
 460                 }
 461
 462                 if (max_args == 0) {
 463                     report_location(DIAG_ERROR, filename, line_no,
 464                                     i - s.begin());
 465                     cerr << "Index action '" << action
 466                          << "' doesn't take an argument" << endl;
 467                 }
 468
 469                 ++i;
 470                 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 471                 if (i != j) {
 472                     report_location(DIAG_WARN, filename, line_no,
 473                                     i - s.begin());
 474                     cerr << "putting spaces between '=' and the argument is "
 475                             "deprecated" << endl;
 476                 }
 477
 478                 vector<string> vals;
 479                 while (true) {
 480                     if (j != s.end() && *j == '"') {
 481                         // Quoted argument.
 482                         ++j;
 483                         string arg;
 484                         while (true) {
 485                             i = find_if(j, s.end(),
 486                                         [](char ch) {
 487                                             return ch == '"' || ch == '\\';
 488                                         });
 489                             if (i == s.end()) {
 490                                 report_location(DIAG_ERROR, filename, line_no,
 491                                                 s.size());
 492                                 cerr << "No closing quote" << endl;
 493                                 break;
 494                             }
 495                             arg.append(j, i);
 496                             if (*i++ == '"')
 497                                 break;
 498
 499                             // Escape sequence.
 500                             if (i == s.end()) {
 501 bad_escaping:
 502                                 report_location(DIAG_ERROR, filename, line_no,
 503                                                 i - s.begin());
 504                                 cerr << "Bad escaping in quoted action argument"
 505                                      << endl;
 506                                 break;
 507                             }
 508
 509                             char ch = *i;
 510                             switch (ch) {
 511                                 case '\\':
 512                                 case '"':
 513                                     break;
 514                                 case '0':
 515                                     ch = '\0';
 516                                     break;
 517                                 case 'n':
 518                                     ch = '\n';
 519                                     break;
 520                                 case 'r':
 521                                     ch = '\r';
 522                                     break;
 523                                 case 't':
 524                                     ch = '\t';
 525                                     break;
 526                                 case 'x': {
 527                                     if (++i == s.end())
 528                                         goto bad_escaping;
 529                                     char ch1 = *i;
 530                                     if (!C_isxdigit(ch1)) {
 531 bad_hex_digit:
 532                                         report_location(DIAG_ERROR, filename,
 533                                                         line_no, i - s.begin());
 534                                         cerr << "Bad hex digit in escaping\n";
 535                                         --i;
 536                                         break;
 537                                     }
 538                                     if (++i == s.end())
 539                                         goto bad_escaping;
 540                                     char ch2 = *i;
 541                                     if (!C_isxdigit(ch2)) {
 542                                         goto bad_hex_digit;
 543                                     }
 544                                     ch = hex_digit(ch1) << 4 |
 545                                          hex_digit(ch2);
 546                                     break;
 547                                 }
 548                                 default:
 549                                     report_location(DIAG_ERROR, filename,
 550                                                     line_no, i - s.begin());
 551                                     cerr << "Bad escape sequence '\\" << ch
 552                                          << "'\n";
 553                                     break;
 554                             }
 555                             arg += ch;
 556                             j = i + 1;
 557                         }
 558                         vals.emplace_back(std::move(arg));
 559                         if (i == s.end() || C_isspace(*i)) break;
 560                         if (*i == ',') {
 561                             ++i;
 562                         } else {
 563                             report_location(DIAG_ERROR, filename, line_no,
 564                                             i - s.begin());
 565                             cerr << "Unexpected character '" << *i
 566                                  << "' after closing quote" << endl;
 567                             do {
 568                                 ++i;
 569                             } while (i != s.end() && *i != ',' && !C_isspace(*i));
 570                             if (*i != ',') break;
 571                             ++i;
 572                         }
 573                     } else if (max_args > 1) {
 574                         // Unquoted argument, split on comma.
 575                         i = find_if(j, s.end(),
 576                                     [](char ch) {
 577                                         return C_isspace(ch) || ch == ',';
 578                                     });
 579                         vals.emplace_back(j, i);
 580                         if (*i != ',') break;
 581                         ++i;
 582                     } else {
 583                         // Unquoted argument, including any commas.
 584                         i = find_if(j, s.end(),
 585                                     [](char ch) { return C_isspace(ch); });
 586                         vals.emplace_back(j, i);
 587                         break;
 588                     }
 589                     j = i;
 590
 591                     if (vals.size() == max_args) {
 592                         report_location(DIAG_ERROR, filename, line_no,
 593                                         i - s.begin());
 594                         cerr << "Index action '" << action
 595                              << "' takes at most " << max_args << " arguments"
 596                              << endl;
 597                     }
 598                 }
 599
 600                 if (vals.size() < min_args) {
 601                     report_location(DIAG_ERROR, filename, line_no,
 602                                     i - s.begin());
 603                     if (min_args == max_args) {
 604                         cerr << "Index action '" << action
 605                              << "' requires " << min_args << " arguments"
 606                              << endl;
 607                     } else {
 608                         cerr << "Index action '" << action
 609                              << "' requires at least " << min_args << " arguments"
 610                              << endl;
 611                     }
 612                     // Allow action handling code to assume there are min_args
 613                     // arguments.
 614                     vals.resize(min_args);
 615                 }
 616
 617                 string val;
 618                 if (!vals.empty()) {
 619                     val = vals.front();
 620                 }
 621
 622                 if (takes_integer_argument) {
 623                     auto dot = val.find('.');
 624                     if (dot != string::npos) {
 625                         report_location(DIAG_ERROR, filename, line_no,
 626                                         j - s.begin() + dot);
 627                         cerr << "Index action '" << action
 628                              << "' takes an integer argument" << endl;
 629                     }
 630                 }
 631                 switch (code) {
 632                     case Action::DATE:
 633                         if (val != "unix" &&
 634                             val != "unixutc" &&
 635                             val != "yyyymmdd") {
 636                             report_location(DIAG_ERROR, filename, line_no,
 637                                             j - s.begin());
 638                             cerr << "Invalid parameter '" << val << "' for "
 639                                     "action 'date'" << endl;
 640                         }
 641                         actions.emplace_back(code, action_pos, val);
 642                         break;
 643                     case Action::INDEX:
 644                     case Action::INDEXNOPOS:
 645                         actions.emplace_back(code, action_pos, val, weight);
 646                         useless_weight_pos = string::npos;
 647                         break;
 648                     case Action::WEIGHT:
 649                         // We don't push an Action for WEIGHT - instead we
 650                         // store it ready to use in the INDEX and INDEXNOPOS
 651                         // Actions.
 652                         if (!parse_unsigned(val.c_str(), weight)) {
 653                             report_location(DIAG_ERROR, filename, line_no,
 654                                             j - s.begin());
 655                             cerr << "Index action 'weight' takes a "
 656                                     "non-negative integer argument" << endl;
 657                             weight = 0;
 658                         }
 659                         if (useless_weight_pos != string::npos) {
 660                             report_useless_action(filename, line_no,
 661                                                   useless_weight_pos, action);
 662                         }
 663                         useless_weight_pos = action_pos;
 664                         break;
 665                     case Action::PARSEDATE: {
 666                         auto bad_code = val.find("%Z");
 667                         if (bad_code != val.npos) {
 668                             report_location(DIAG_ERROR, filename, line_no,
 669                                             j - s.begin() + bad_code);
 670                             cerr << "Parsing timezone names with %Z is not supported" << endl;
 671                         }
 672 #ifndef HAVE_STRUCT_TM_TM_GMTOFF
 673                         bad_code = val.find("%z");
 674                         if (bad_code != val.npos) {
 675                             report_location(DIAG_ERROR, filename, line_no,
 676                                             j - s.begin() + bad_code);
 677                             cerr << "Parsing timezone offsets with %z is not supported on "
 678                                     "this platform" << endl;
 679                         }
 680 #endif
 681                         actions.emplace_back(code, action_pos, val);
 682                         break;
 683                     }
 684                     case Action::SPLIT: {
 685                         if (val.empty()) {
 686                             report_location(DIAG_ERROR, filename, line_no,
 687                                             j - s.begin());
 688                             cerr << "Split delimiter can't be empty" << endl;
 689                         }
 690                         int operation = Action::SPLIT_NONE;
 691                         if (vals.size() >= 2) {
 692                             if (vals[1] == "dedup") {
 693                                 operation = Action::SPLIT_DEDUP;
 694                             } else if (vals[1] == "sort") {
 695                                 operation = Action::SPLIT_SORT;
 696                             } else if (vals[1] == "none") {
 697                                 operation = Action::SPLIT_NONE;
 698                             } else if (vals[1] == "prefixes") {
 699                                 operation = Action::SPLIT_PREFIXES;
 700                             } else {
 701                                 // FIXME: Column should be for where the `op`
 702                                 // parameter starts, which this isn't if the
 703                                 // value is quoted, contains escape sequences,
 704                                 // etc.
 705                                 report_location(DIAG_ERROR, filename, line_no,
 706                                                 i - s.begin() - vals[1].size());
 707                                 cerr << "Bad split operation '" << vals[1]
 708                                      << "'" << endl;
 709                             }
 710                         }
 711                         actions.emplace_back(code, action_pos, val, operation);
 712                         break;
 713                     }
 714                     case Action::TRUNCATE:
 715                         if (!actions.empty() &&
 716                             actions.back().get_action() == Action::LOAD) {
 717                             /* Turn "load truncate=n" into "load" with
 718                              * num_arg n, so that we don't needlessly
 719                              * allocate memory and read data we're just
 720                              * going to ignore.
 721                              */
 722                             actions.pop_back();
 723                             code = Action::LOAD;
 724                         }
 725                         actions.emplace_back(code, action_pos, val);
 726                         break;
 727                     case Action::UNIQUE:
 728                         if (unique_line_no) {
 729                             report_location(DIAG_ERROR, filename, line_no,
 730                                             action_pos);
 731                             cerr << "Index action 'unique' used more than once"
 732                                  << endl;
 733                             report_location(DIAG_NOTE, filename,
 734                                             unique_line_no, unique_pos);
 735                             cerr << "Previously used here" << endl;
 736                         }
 737                         unique_line_no = line_no;
 738                         unique_pos = action_pos;
 739                         if (boolmap.find(val) == boolmap.end())
 740                             boolmap[val] = Action::UNIQUE;
 741                         actions.emplace_back(code, action_pos, val);
 742                         break;
 743                     case Action::GAP: {
 744                         actions.emplace_back(code, action_pos, val);
 745                         auto& obj = actions.back();
 746                         auto gap_size = obj.get_num_arg();
 747                         if (gap_size <= 0) {
 748                             report_location(DIAG_ERROR, filename, line_no,
 749                                             obj.get_pos() + 3 + 1);
 750                             cerr << "Index action 'gap' takes a strictly "
 751                                     "positive integer argument" << endl;
 752                         }
 753                         break;
 754                     }
 755                     case Action::HASH: {
 756                         actions.emplace_back(code, action_pos, val);
 757                         auto& obj = actions.back();
 758                         auto max_length = obj.get_num_arg();
 759                         if (max_length < 6) {
 760                             report_location(DIAG_ERROR, filename, line_no,
 761                                             obj.get_pos() + 4 + 1);
 762                             cerr << "Index action 'hash' takes an integer "
 763                                     "argument which must be at least 6" << endl;
 764                         }
 765                         break;
 766                     }
 767                     case Action::LTRIM:
 768                     case Action::RTRIM:
 769                     case Action::SQUASH:
 770                     case Action::TRIM:
 771                         for (unsigned char ch : val) {
 772                             if (ch >= 0x80) {
 773                                 auto column = actions.back().get_pos() +
 774                                               strlen(action_names[code]) + 1;
 775                                 report_location(DIAG_ERROR, filename, line_no,
 776                                                 column);
 777                                 cerr << "Index action '" << action_names[code]
 778                                      << "' only support ASCII characters "
 779                                         "currently\n";
 780                             }
 781                         }
 782                         actions.emplace_back(code, action_pos, val);
 783                         break;
 784                     case Action::BOOLEAN:
 785                         boolmap[val] = Action::BOOLEAN;
 786                         /* FALLTHRU */
 787                     default:
 788                         actions.emplace_back(code, action_pos, val);
 789                 }
 790                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 791             } else {
 792                 if (min_args > 0) {
 793                     report_location(DIAG_ERROR, filename, line_no,
 794                                     i_after_action - s.begin());
 795                     if (min_args == max_args) {
 796                         cerr << "Index action '" << action << "' requires "
 797                              << min_args << " arguments" << endl;
 798                     } else {
 799                         cerr << "Index action '" << action << "' requires at least "
 800                              << min_args << " arguments" << endl;
 801                     }
 802                 }
 803                 switch (code) {
 804                     case Action::INDEX:
 805                     case Action::INDEXNOPOS:
 806                         useless_weight_pos = string::npos;
 807                         actions.emplace_back(code, action_pos, "", weight);
 808                         break;
 809                     case Action::GAP:
 810                         actions.emplace_back(code, action_pos, "", 100);
 811                         break;
 812                     case Action::HASH:
 813                         actions.emplace_back(code, action_pos, "",
 814                                              MAX_SAFE_TERM_LENGTH - 1);
 815                         break;
 816                     case Action::LTRIM:
 817                     case Action::RTRIM:
 818                     case Action::SQUASH:
 819                     case Action::TRIM:
 820                         actions.emplace_back(code, action_pos, " \t\f\v\r\n");
 821                         break;
 822                     default:
 823                         actions.emplace_back(code, action_pos);
 824                         break;
 825                 }
 826             }
 827             j = i;
 828         }
 829
 830         if (useless_weight_pos != string::npos) {
 831             report_useless_action(filename, line_no, useless_weight_pos,
 832                                   "weight");
 833         }
 834
 835         while (!actions.empty()) {
 836             bool done = true;
 837             Action::type action = actions.back().get_action();
 838             switch (action) {
 839                 case Action::HASH:
 840                 case Action::HEXTOBIN:
 841                 case Action::LOWER:
 842                 case Action::LTRIM:
 843                 case Action::PARSEDATE:
 844                 case Action::RTRIM:
 845                 case Action::SPELL:
 846                 case Action::SQUASH:
 847                 case Action::TRIM:
 848                 case Action::TRUNCATE:
 849                 case Action::UNHTML:
 850                     done = false;
 851                     report_useless_action(filename, line_no,
 852                                           actions.back().get_pos(),
 853                                           action_names[action]);
 854                     actions.pop_back();
 855                     break;
 856                 default:
 857                     break;
 858             }
 859             if (done) break;
 860         }
 861
 862         map<string, Action::type>::const_iterator boolpfx;
 863         for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
 864             if (boolpfx->second == Action::UNIQUE) {
 865                 report_location(DIAG_WARN, filename, unique_line_no,
 866                                 unique_pos);
 867                 cerr << "Index action 'unique=" << boolpfx->first
 868                      << "' without 'boolean=" << boolpfx->first << "'" << endl;
 869                 static bool given_doesnt_imply_boolean_warning = false;
 870                 if (!given_doesnt_imply_boolean_warning) {
 871                     given_doesnt_imply_boolean_warning = true;
 872                     report_location(DIAG_NOTE, filename, unique_line_no,
 873                                     unique_pos);
 874                     cerr << "'unique' doesn't implicitly add a boolean term"
 875                          << endl;
 876                 }
 877             }
 878         }
 879
 880         vector<string>::const_iterator field;
 881         for (field = fields.begin(); field != fields.end(); ++field) {
 882             vector<Action> &v = index_spec[*field];
 883             if (v.empty()) {
 884                 if (fields.size() == 1) {
 885                     // Optimise common case where there's only one fieldname
 886                     // for a list of actions.
 887                     v = std::move(actions);
 888                 } else {
 889                     v = actions;
 890                 }
 891             } else {
 892                 v.emplace_back(Action::NEW, string::npos);
 893                 v.insert(v.end(), actions.begin(), actions.end());
 894             }
 895         }
 896     }
 897
 898     if (index_spec.empty()) {
 899         report_location(DIAG_ERROR, filename, line_no);
 900         cerr << "No rules found in index script" << endl;
 901     }
 902
 903     if (error_count) {
 904         exit(1);
 905     }
 906 }
 907
 908 static bool
 909 run_actions(vector<Action>::const_iterator action_it,
 910             vector<Action>::const_iterator action_end,
 911             Xapian::WritableDatabase& database,
 912             Xapian::TermGenerator& indexer,
 913             const string& old_value,
 914             bool& this_field_is_content, Xapian::Document& doc,
 915             map<string, list<string>>& fields,
 916             string& field, const char* fname,
 917             size_t line_no, Xapian::docid& docid)
 918 {
 919     string value = old_value;
 920     while (action_it != action_end) {
 921         auto& action = *action_it++;
 922         switch (action.get_action()) {
 923             case Action::BAD:
 924                 abort();
 925             case Action::NEW:
 926                 value = old_value;
 927                 // We're processing the same field again - give it a reprieve.
 928                 this_field_is_content = true;
 929                 break;
 930             case Action::FIELD:
 931                 if (!value.empty()) {
 932                     string f = action.get_string_arg();
 933                     if (f.empty()) f = field;
 934                     // replace newlines with spaces
 935                     string s = value;
 936                     string::size_type j = 0;
 937                     while ((j = s.find('\n', j)) != string::npos)
 938                         s[j] = ' ';
 939                     fields[f].push_back(s);
 940                 }
 941                 break;
 942             case Action::INDEX:
 943                 indexer.index_text(value,
 944                                    action.get_num_arg(),
 945                                    action.get_string_arg());
 946                 break;
 947             case Action::INDEXNOPOS:
 948                 // No positional information so phrase searching won't work.
 949                 // However, the database will use much less diskspace.
 950                 indexer.index_text_without_positions(value,
 951                                                      action.get_num_arg(),
 952                                                      action.get_string_arg());
 953                 break;
 954             case Action::BOOLEAN: {
 955                 // Do nothing if there's no text.
 956                 if (value.empty()) break;
 957
 958                 string term = action.get_string_arg();
 959                 if (prefix_needs_colon(term, value[0])) term += ':';
 960                 term += value;
 961
 962                 doc.add_boolean_term(term);
 963                 break;
 964             }
 965             case Action::GAP:
 966                 indexer.increase_termpos(action.get_num_arg());
 967                 break;
 968             case Action::HASH: {
 969                 unsigned int max_length = action.get_num_arg();
 970                 if (value.length() > max_length)
 971                     value = hash_long_term(value, max_length);
 972                 break;
 973             }
 974             case Action::HEXTOBIN: {
 975                 size_t len = value.length();
 976                 if (len & 1) {
 977                     report_location(DIAG_ERROR, fname, line_no);
 978                     cerr << "hextobin: input must have even length"
 979                          << endl;
 980                     exit(1);
 981                 }
 982
 983                 string output;
 984                 output.reserve(len / 2);
 985                 for (size_t j = 0; j < len; j += 2) {
 986                     char a = value[j];
 987                     char b = value[j + 1];
 988                     if (!C_isxdigit(a) || !C_isxdigit(b)) {
 989                         report_location(DIAG_ERROR, fname, line_no);
 990                         cerr << "hextobin: input must be all hex digits\n";
 991                         exit(1);
 992                     }
 993                     char r = (hex_digit(a) << 4) | hex_digit(b);
 994                     output.push_back(r);
 995                 }
 996                 value = std::move(output);
 997                 break;
 998             }
 999             case Action::LOWER:
1000                 value = Xapian::Unicode::tolower(value);
1001                 break;
1002             case Action::LTRIM:
1003                 ltrim(value, action.get_string_arg());
1004                 break;
1005             case Action::RTRIM:
1006                 rtrim(value, action.get_string_arg());
1007                 break;
1008             case Action::TRIM:
1009                 rtrim(value, action.get_string_arg());
1010                 ltrim(value, action.get_string_arg());
1011                 break;
1012             case Action::SQUASH:
1013                 squash(value, action.get_string_arg());
1014                 break;
1015             case Action::LOAD: {
1016                 // If there's no input, just issue a warning.
1017                 if (value.empty()) {
1018                     report_location(DIAG_WARN, fname, line_no);
1019                     cerr << "Empty filename in LOAD action" << endl;
1020                     break;
1021                 }
1022                 bool truncated = false;
1023                 string filename = std::move(value);
1024                 // FIXME: Use NOATIME if we own the file or are root.
1025                 if (!load_file(filename, action.get_num_arg(), NOCACHE,
1026                                value, truncated)) {
1027                     report_location(DIAG_ERROR, fname, line_no);
1028                     cerr << "Couldn't load file '" << filename << "': "
1029                          << strerror(errno) << endl;
1030                     exit(1);
1031                 }
1032                 if (!truncated) break;
1033             }
1034             /* FALLTHRU */
1035             case Action::TRUNCATE:
1036                 utf8_truncate(value, action.get_num_arg());
1037                 break;
1038             case Action::SPELL:
1039                 indexer.set_flags(indexer.FLAG_SPELLING);
1040                 break;
1041             case Action::SPLIT: {
1042                 // Find the end of the actions which split should execute.
1043                 auto split_end = find(action_it, action_end, Action::NEW);
1044
1045                 int split_type = action.get_num_arg();
1046                 if (value.empty()) {
1047                     // Nothing to do.
1048                 } else if (split_type != Action::SPLIT_SORT) {
1049                     // Generate split as we consume it.
1050                     const string& delimiter = action.get_string_arg();
1051
1052                     unique_ptr<unordered_set<string>> seen;
1053                     if (split_type == Action::SPLIT_DEDUP) {
1054                         seen.reset(new unordered_set<string>);
1055                     }
1056
1057                     if (delimiter.size() == 1) {
1058                         // Special case for common single character delimiter.
1059                         char ch = delimiter[0];
1060                         string::size_type i = 0;
1061                         while (true) {
1062                             string::size_type j = value.find(ch, i);
1063                             if (split_type == Action::SPLIT_PREFIXES) {
1064                                 if (j > 0) {
1065                                     string val(value, 0, j);
1066                                     run_actions(action_it, split_end,
1067                                                 database, indexer,
1068                                                 val,
1069                                                 this_field_is_content, doc,
1070                                                 fields,
1071                                                 field, fname, line_no,
1072                                                 docid);
1073                                 }
1074                             } else if (i != j) {
1075                                 string val(value, i, j - i);
1076                                 if (!seen.get() || seen->insert(val).second) {
1077                                     run_actions(action_it, split_end,
1078                                                 database, indexer,
1079                                                 val,
1080                                                 this_field_is_content, doc,
1081                                                 fields,
1082                                                 field, fname, line_no,
1083                                                 docid);
1084                                 }
1085                             }
1086                             if (j == string::npos) break;
1087                             i = j + 1;
1088                         }
1089                     } else {
1090                         string::size_type i = 0;
1091                         while (true) {
1092                             string::size_type j = value.find(delimiter, i);
1093                             if (split_type == Action::SPLIT_PREFIXES) {
1094                                 if (j > 0) {
1095                                     string val(value, 0, j);
1096                                     run_actions(action_it, split_end,
1097                                                 database, indexer,
1098                                                 val,
1099                                                 this_field_is_content, doc,
1100                                                 fields,
1101                                                 field, fname, line_no,
1102                                                 docid);
1103                                 }
1104                             } else if (i != j) {
1105                                 string val(value, i, j - i);
1106                                 if (!seen.get() || seen->insert(val).second) {
1107                                     run_actions(action_it, split_end,
1108                                                 database, indexer,
1109                                                 val,
1110                                                 this_field_is_content, doc,
1111                                                 fields,
1112                                                 field, fname, line_no,
1113                                                 docid);
1114                                 }
1115                             }
1116                             if (j == string::npos) break;
1117                             i = j + delimiter.size();
1118                         }
1119                     }
1120                 } else {
1121                     vector<string> split_values;
1122                     const string& delimiter = action.get_string_arg();
1123                     if (delimiter.size() == 1) {
1124                         // Special case for common single character delimiter.
1125                         char ch = delimiter[0];
1126                         string::size_type i = 0;
1127                         while (true) {
1128                             string::size_type j = value.find(ch, i);
1129                             if (i != j) {
1130                                 split_values.emplace_back(value, i, j - i);
1131                             }
1132                             if (j == string::npos) break;
1133                             i = j + 1;
1134                         }
1135                     } else {
1136                         string::size_type i = 0;
1137                         while (true) {
1138                             string::size_type j = value.find(delimiter, i);
1139                             if (i != j) {
1140                                 split_values.emplace_back(value, i, j - i);
1141                             }
1142                             if (j == string::npos) break;
1143                             i = j + delimiter.size();
1144                         }
1145                     }
1146
1147                     sort(split_values.begin(), split_values.end());
1148
1149                     for (auto&& val : split_values) {
1150                         run_actions(action_it, split_end,
1151                                     database, indexer, val,
1152                                     this_field_is_content, doc, fields,
1153                                     field, fname, line_no,
1154                                     docid);
1155                     }
1156                 }
1157
1158                 action_it = split_end;
1159                 break;
1160             }
1161             case Action::UNHTML: {
1162                 MyHtmlParser p;
1163                 try {
1164                     // Default HTML character set is latin 1, though
1165                     // not specifying one is deprecated these days.
1166                     p.parse_html(value, "iso-8859-1", false);
1167                 } catch (const string & newcharset) {
1168                     p.reset();
1169                     p.parse_html(value, newcharset, true);
1170                 }
1171                 if (p.indexing_allowed)
1172                     value = p.dump;
1173                 else
1174                     value = "";
1175                 break;
1176             }
1177             case Action::UNIQUE: {
1178                 // If there's no text, just issue a warning.
1179                 if (value.empty()) {
1180                     report_location(DIAG_WARN, fname, line_no);
1181                     cerr << "Ignoring UNIQUE action on empty text"
1182                          << endl;
1183                     break;
1184                 }
1185
1186                 // Ensure that the value of this field is unique.
1187                 // If a record already exists with the same value,
1188                 // it will be replaced with the new record.
1189
1190                 // Unique fields aren't considered content - if
1191                 // there are no other fields in the document, the
1192                 // document is to be deleted.
1193                 this_field_is_content = false;
1194
1195                 // Argument is the prefix to add to the field value
1196                 // to get the unique term.
1197                 string t = action.get_string_arg();
1198                 if (prefix_needs_colon(t, value[0])) t += ':';
1199                 t += value;
1200                 Xapian::PostingIterator p = database.postlist_begin(t);
1201                 if (p != database.postlist_end(t)) {
1202                     docid = *p;
1203                 }
1204                 break;
1205             }
1206             case Action::VALUE:
1207                 if (!value.empty())
1208                     doc.add_value(action.get_num_arg(), value);
1209                 break;
1210             case Action::VALUENUMERIC: {
1211                 if (value.empty()) break;
1212                 char * end;
1213                 double dbl = strtod(value.c_str(), &end);
1214                 if (*end) {
1215                     report_location(DIAG_WARN, fname, line_no);
1216                     cerr << "Trailing characters in VALUENUMERIC: '"
1217                          << value << "'" << endl;
1218                 }
1219                 doc.add_value(action.get_num_arg(),
1220                               Xapian::sortable_serialise(dbl));
1221                 break;
1222             }
1223             case Action::VALUEPACKED: {
1224                 uint32_t word = 0;
1225                 if (value.empty() || !C_isdigit(value[0])) {
1226                     // strtoul() accepts leading whitespace and negated
1227                     // values, neither of which we want to allow.
1228                     errno = EINVAL;
1229                 } else {
1230                     errno = 0;
1231                     char* q;
1232                     word = strtoul(value.c_str(), &q, 10);
1233                     if (!errno && *q != '\0') {
1234                         // Trailing characters after converted value.
1235                         errno = EINVAL;
1236                     }
1237                 }
1238                 if (errno) {
1239                     report_location(DIAG_WARN, fname, line_no);
1240                     cerr << "valuepacked \"" << value << "\" ";
1241                     if (errno == ERANGE) {
1242                         cerr << "out of range";
1243                     } else {
1244                         cerr << "not an unsigned integer";
1245                     }
1246                     cerr << endl;
1247                 }
1248                 int valueslot = action.get_num_arg();
1249                 doc.add_value(valueslot, int_to_binary_string(word));
1250                 break;
1251             }
1252             case Action::DATE: {
1253                 // Do nothing for empty input.
1254                 if (value.empty()) break;
1255
1256                 const string & type = action.get_string_arg();
1257                 string yyyymmdd;
1258                 if (type == "unix") {
1259                     time_t t;
1260                     if (!parse_signed(value.c_str(), t)) {
1261                         report_location(DIAG_WARN, fname, line_no);
1262                         cerr << "Date value (in secs) for action DATE "
1263                                 "must be an integer - ignoring" << endl;
1264                         break;
1265                     }
1266                     struct tm *tm = localtime(&t);
1267                     int y = tm->tm_year + 1900;
1268                     int m = tm->tm_mon + 1;
1269                     yyyymmdd = date_to_string(y, m, tm->tm_mday);
1270                 } else if (type == "unixutc") {
1271                     time_t t;
1272                     if (!parse_signed(value.c_str(), t)) {
1273                         report_location(DIAG_WARN, fname, line_no);
1274                         cerr << "Date value (in secs) for action DATE "
1275                                 "must be an integer - ignoring" << endl;
1276                         break;
1277                     }
1278                     struct tm *tm = gmtime(&t);
1279                     int y = tm->tm_year + 1900;
1280                     int m = tm->tm_mon + 1;
1281                     yyyymmdd = date_to_string(y, m, tm->tm_mday);
1282                 } else if (type == "yyyymmdd") {
1283                     if (value.length() != 8) {
1284                         report_location(DIAG_WARN, fname, line_no);
1285                         cerr << "date=yyyymmdd expects an 8 character value "
1286                                 "- ignoring" << endl;
1287                         break;
1288                     }
1289                     yyyymmdd = value;
1290                 }
1291
1292                 // Date (YYYYMMDD)
1293                 doc.add_boolean_term("D" + yyyymmdd);
1294                 yyyymmdd.resize(6);
1295                 // Month (YYYYMM)
1296                 doc.add_boolean_term("M" + yyyymmdd);
1297                 yyyymmdd.resize(4);
1298                 // Year (YYYY)
1299                 doc.add_boolean_term("Y" + yyyymmdd);
1300                 break;
1301             }
1302             case Action::PARSEDATE: {
1303                 string dateformat = action.get_string_arg();
1304                 struct tm tm;
1305                 memset(&tm, 0, sizeof(tm));
1306                 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
1307                 if (ret == NULL) {
1308                     report_location(DIAG_WARN, fname, line_no);
1309                     cerr << "\"" << value << "\" doesn't match format "
1310                             "\"" << dateformat << '\"' << endl;
1311                     break;
1312                 }
1313
1314                 if (*ret != '\0') {
1315                     report_location(DIAG_WARN, fname, line_no);
1316                     cerr << "\"" << value << "\" not fully matched by "
1317                             "format \"" << dateformat << "\" "
1318                             "(\"" << ret << "\" left over) but "
1319                             "indexing anyway" << endl;
1320                 }
1321 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1322                 auto gmtoff = tm.tm_gmtoff;
1323 #endif
1324                 auto secs_since_epoch = timegm(&tm);
1325 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1326                 secs_since_epoch -= gmtoff;
1327 #endif
1328                 value = str(secs_since_epoch);
1329                 break;
1330             }
1331             default:
1332                 /* Empty default case to avoid "unhandled enum value"
1333                  * warnings. */
1334                 break;
1335         }
1336     }
1337     return true;
1338 }
1339
1340 static void
1341 index_file(const char *fname, istream &stream,
1342            Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
1343 {
1344     string line;
1345     size_t line_no = 0;
1346     while (!stream.eof() && getline(stream, line)) {
1347         ++line_no;
1348         Xapian::Document doc;
1349         indexer.set_document(doc);
1350         Xapian::docid docid = 0;
1351         map<string, list<string>> fields;
1352         bool seen_content = false;
1353         while (!line.empty()) {
1354             // Cope with files from MS Windows (\r\n end of lines).
1355             // Trim multiple \r characters, since that seems the best way
1356             // to handle that case.
1357             string::size_type last = line.find_last_not_of('\r');
1358             if (last == string::npos) break;
1359             line.resize(last + 1);
1360
1361             string::size_type eq = line.find('=');
1362             if (eq == string::npos && !line.empty()) {
1363                 report_location(DIAG_ERROR, fname, line_no, line.size());
1364                 cerr << "expected = somewhere in this line" << endl;
1365                 exit(1);
1366             }
1367             string field(line, 0, eq);
1368             string value(line, eq + 1, string::npos);
1369             line.clear();
1370             while (getline(stream, line)) {
1371                 ++line_no;
1372                 if (line.empty() || line[0] != '=') break;
1373                 // Cope with files from MS Windows (\r\n end of lines).
1374                 // Trim multiple \r characters, since that seems the best way
1375                 // to handle that case.
1376                 last = line.find_last_not_of('\r');
1377                 // line[0] == '=', so last != string::npos.
1378                 // Replace the '=' with a '\n' so we don't have to use substr.
1379                 line[0] = '\n';
1380                 line.resize(last + 1);
1381                 value += line;
1382             }
1383
1384             // Default to not indexing spellings.
1385             indexer.set_flags(Xapian::TermGenerator::flags(0));
1386
1387             bool this_field_is_content = true;
1388             const vector<Action>& v = index_spec[field];
1389             run_actions(v.begin(), v.end(),
1390                         database, indexer, value,
1391                         this_field_is_content, doc, fields,
1392                         field, fname, line_no,
1393                         docid);
1394             if (this_field_is_content) seen_content = true;
1395         }
1396
1397         // If we haven't seen any fields (other than unique identifiers)
1398         // the document is to be deleted.
1399         if (!seen_content) {
1400             if (docid) {
1401                 database.delete_document(docid);
1402                 if (verbose) cout << "Del: " << docid << endl;
1403                 ++delcount;
1404             }
1405         } else {
1406             string data;
1407             for (auto&& i : fields) {
1408                 for (auto&& field_val : i.second) {
1409                     data += i.first;
1410                     data += '=';
1411                     data += field_val;
1412                     data += '\n';
1413                 }
1414             }
1415
1416             // Put the data in the document
1417             doc.set_data(data);
1418
1419             // Add the document to the database
1420             if (docid) {
1421                 database.replace_document(docid, doc);
1422                 if (verbose) cout << "Replace: " << docid << endl;
1423                 ++repcount;
1424             } else {
1425                 docid = database.add_document(doc);
1426                 if (verbose) cout << "Add: " << docid << endl;
1427                 ++addcount;
1428             }
1429         }
1430     }
1431
1432     // Commit after each file to make sure all changes from that file make it
1433     // in.
1434     if (verbose) cout << "Committing: " << endl;
1435     database.commit();
1436 }
1437
1438 static void
1439 show_help(int exit_code)
1440 {
1441     cout << PROG_NAME " - " PROG_DESC "\n"
1442 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1443 "\n"
1444 "Creates or updates a Xapian database with the data from the input files listed\n"
1445 "on the command line.  If no files are specified, data is read from stdin.\n"
1446 "\n"
1447 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1448 "format for INDEXER_SCRIPT.\n"
1449 "\n"
1450 "Options:\n"
1451 "  -v, --verbose       display additional messages to aid debugging\n"
1452 "      --overwrite     create the database anew (the default is to update if\n"
1453 "                      the database already exists)\n";
1454     print_stemmer_help("");
1455     print_help_and_version_help("");
1456     exit(exit_code);
1457 }
1458
1459 int
1460 main(int argc, char **argv)
1461 try {
1462     // If the database already exists, default to updating not overwriting.
1463     int database_mode = Xapian::DB_CREATE_OR_OPEN;
1464     verbose = false;
1465     Xapian::Stem stemmer("english");
1466
1467     // Without this, strptime() seems to treat formats without a timezone as
1468     // being local time, including %s.
1469     setenv("TZ", "UTC", 1);
1470
1471     constexpr auto NO_ARG = no_argument;
1472     constexpr auto REQ_ARG = required_argument;
1473     static const struct option longopts[] = {
1474         { "help",       NO_ARG,         NULL, 'h' },
1475         { "version",    NO_ARG,         NULL, 'V' },
1476         { "stemmer",    REQ_ARG,        NULL, 's' },
1477         { "overwrite",  NO_ARG,         NULL, 'o' },
1478         { "verbose",    NO_ARG,         NULL, 'v' },
1479         { 0, 0, NULL, 0 }
1480     };
1481
1482     int getopt_ret;
1483     while ((getopt_ret = gnu_getopt_long(argc, argv, "vs:hV",
1484                                          longopts, NULL)) != -1) {
1485         switch (getopt_ret) {
1486             default:
1487                 show_help(1);
1488                 break;
1489             case 'h': // --help
1490                 show_help(0);
1491                 break;
1492             case 'V': // --version
1493                 print_package_info(PROG_NAME);
1494                 return 0;
1495             case 'o': // --overwrite
1496                 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
1497                 break;
1498             case 'v':
1499                 verbose = true;
1500                 break;
1501             case 's':
1502                 try {
1503                     stemmer = Xapian::Stem(optarg);
1504                 } catch (const Xapian::InvalidArgumentError &) {
1505                     cerr << "Unknown stemming language '" << optarg << "'.\n";
1506                     cerr << "Available language names are: "
1507                          << Xapian::Stem::get_available_languages() << endl;
1508                     return 1;
1509                 }
1510                 break;
1511         }
1512     }
1513
1514     argv += optind;
1515     argc -= optind;
1516     if (argc < 2) {
1517         show_help(1);
1518     }
1519
1520     parse_index_script(argv[1]);
1521
1522     // Open the database.  If another process is currently updating the
1523     // database, wait for the lock to become available.
1524     auto flags = database_mode | Xapian::DB_RETRY_LOCK;
1525     Xapian::WritableDatabase database(argv[0], flags);
1526
1527     Xapian::TermGenerator indexer;
1528     indexer.set_stemmer(stemmer);
1529     // Set the database for spellings to be added to by the "spell" action.
1530     indexer.set_database(database);
1531
1532     addcount = 0;
1533     repcount = 0;
1534     delcount = 0;
1535
1536     if (argc == 2) {
1537         // Read from stdin.
1538         index_file("<stdin>", cin, database, indexer);
1539     } else {
1540         // Read file(s) listed on the command line.
1541         for (int i = 2; i < argc; ++i) {
1542             ifstream stream(argv[i]);
1543             if (stream) {
1544                 index_file(argv[i], stream, database, indexer);
1545             } else {
1546                 cerr << "Can't open file " << argv[i] << endl;
1547             }
1548         }
1549     }
1550
1551     cout << "records (added, replaced, deleted) = (" << addcount << ", "
1552          << repcount << ", " << delcount << ")" << endl;
1553 } catch (const Xapian::Error &error) {
1554     cerr << "Exception: " << error.get_description() << endl;
1555     exit(1);
1556 } catch (const std::bad_alloc &) {
1557     cerr << "Exception: std::bad_alloc" << endl;
1558     exit(1);
1559 } catch (...) {
1560     cerr << "Unknown Exception" << endl;
1561     exit(1);
1562 }