xapian-applications/omega/scriptindex.cc

   1 /** @file
   2  * @brief index arbitrary data as described by an index script
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001 Sam Liddicott
   6  * Copyright 2001,2002 Ananova Ltd
   7  * Copyright 2002-2022 Olly Betts
   8  *
   9  * This program is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU General Public License as
  11  * published by the Free Software Foundation; either version 2 of the
  12  * License, or (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  22  * USA
  23  */
  24
  25 #include <config.h>
  26
  27 #include <xapian.h>
  28
  29 #include <algorithm>
  30 #include <fstream>
  31 #include <iostream>
  32 #include <list>
  33 #include <map>
  34 #include <memory>
  35 #include <string>
  36 #include <unordered_set>
  37 #include <vector>
  38 #include <cstring>
  39
  40 #include <cerrno>
  41 #include <cstdio>
  42 #include <cstdlib>
  43 #include <ctime>
  44
  45 #include "commonhelp.h"
  46 #include "hashterm.h"
  47 #include "loadfile.h"
  48 #include "myhtmlparse.h"
  49 #include "parseint.h"
  50 #include "setenv.h"
  51 #include "str.h"
  52 #include "stringutils.h"
  53 #include "timegm.h"
  54 #include "utf8truncate.h"
  55 #include "utils.h"
  56 #include "values.h"
  57
  58 #ifndef HAVE_STRPTIME
  59 #include "portability/strptime.h"
  60 #endif
  61
  62 #include "gnu_getopt.h"
  63
  64 using namespace std;
  65
  66 #define PROG_NAME "scriptindex"
  67 #define PROG_DESC "index arbitrary data as described by an index script"
  68
  69 static bool verbose;
  70 static int addcount;
  71 static int repcount;
  72 static int delcount;
  73 static int skipcount;
  74
  75 /** What to do if there's a UNIQUE action but a record doesn't use it.
  76  */
  77 static enum {
  78     UNIQUE_ERROR,
  79     UNIQUE_WARN_NEW,
  80     UNIQUE_NEW,
  81     UNIQUE_WARN_SKIP,
  82     UNIQUE_SKIP
  83 } unique_missing = UNIQUE_WARN_NEW;
  84
  85 /// Track if UNIQUE action is unused in the current record.
  86 static bool unique_unused;
  87
  88 /// Track if the current record is being skipped.
  89 static bool skipping_record = false;
  90
  91 static inline bool
  92 prefix_needs_colon(const string & prefix, unsigned ch)
  93 {
  94     if (!C_isupper(ch) && ch != ':') return false;
  95     string::size_type len = prefix.length();
  96     return (len > 1 && prefix[len - 1] != ':');
  97 }
  98
  99 const char * action_names[] = {
 100     // Actions used internally:
 101     "bad",
 102     "new",
 103     // Actual actions:
 104     "boolean",
 105     "date",
 106     "field",
 107     "gap",
 108     "hash",
 109     "hextobin",
 110     "index",
 111     "indexnopos",
 112     "load",
 113     "lower",
 114     "ltrim",
 115     "parsedate",
 116     "rtrim",
 117     "spell",
 118     "split",
 119     "squash",
 120     "trim",
 121     "truncate",
 122     "unhtml",
 123     "unique",
 124     "value",
 125     "valuenumeric",
 126     "valuepacked",
 127     "weight"
 128 };
 129
 130 // For debugging:
 131 #define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")\n"
 132
 133 class Action {
 134   public:
 135     typedef enum {
 136         // Actions used internally:
 137         BAD,
 138         NEW,
 139         // Actual actions:
 140         BOOLEAN,
 141         DATE,
 142         FIELD,
 143         GAP,
 144         HASH,
 145         HEXTOBIN,
 146         INDEX,
 147         INDEXNOPOS,
 148         LOAD,
 149         LOWER,
 150         LTRIM,
 151         PARSEDATE,
 152         RTRIM,
 153         SPELL,
 154         SPLIT,
 155         SQUASH,
 156         TRIM,
 157         TRUNCATE,
 158         UNHTML,
 159         UNIQUE,
 160         VALUE,
 161         VALUENUMERIC,
 162         VALUEPACKED,
 163         WEIGHT
 164     } type;
 165     enum { SPLIT_NONE, SPLIT_DEDUP, SPLIT_SORT, SPLIT_PREFIXES };
 166   private:
 167     type action;
 168     int num_arg;
 169     string string_arg;
 170     // Offset into indexscript line.
 171     size_t pos;
 172   public:
 173     Action(type action_, size_t pos_)
 174         : action(action_), num_arg(0), pos(pos_) { }
 175     Action(type action_, size_t pos_, const string & arg)
 176         : action(action_), string_arg(arg), pos(pos_) {
 177         num_arg = atoi(string_arg.c_str());
 178     }
 179     Action(type action_, size_t pos_, const string & arg, int num)
 180         : action(action_), num_arg(num), string_arg(arg), pos(pos_) { }
 181     type get_action() const { return action; }
 182     int get_num_arg() const { return num_arg; }
 183     void set_num_arg(int num) { num_arg = num; }
 184     const string & get_string_arg() const { return string_arg; }
 185     size_t get_pos() const { return pos; }
 186 };
 187
 188 // These allow searching for an Action with a particular Action::type using
 189 // std::find().
 190
 191 inline bool
 192 operator==(const Action& a, Action::type t) { return a.get_action() == t; }
 193
 194 inline bool
 195 operator==(Action::type t, const Action& a) { return a.get_action() == t; }
 196
 197 inline bool
 198 operator!=(const Action& a, Action::type t) { return !(a == t); }
 199
 200 inline bool
 201 operator!=(Action::type t, const Action& a) { return !(t == a); }
 202
 203 static void
 204 ltrim(string& s, const string& chars)
 205 {
 206     auto i = s.find_first_not_of(chars);
 207     if (i) s.erase(0, i);
 208 }
 209
 210 static void
 211 rtrim(string& s, const string& chars)
 212 {
 213     s.resize(s.find_last_not_of(chars) + 1);
 214 }
 215
 216 static void
 217 squash(string& s, const string& chars)
 218 {
 219     string output;
 220     output.reserve(s.size());
 221     string::size_type i = 0;
 222     while ((i = s.find_first_not_of(chars, i)) != string::npos) {
 223         auto j = s.find_first_of(chars, i);
 224         if (!output.empty()) output += ' ';
 225         output.append(s, i, j - i);
 226         i = j;
 227     }
 228     s = std::move(output);
 229 }
 230
 231 enum diag_type { DIAG_ERROR, DIAG_WARN, DIAG_NOTE };
 232
 233 static unsigned error_count = 0;
 234
 235 static void
 236 report_location(enum diag_type type,
 237                 const string& filename,
 238                 size_t line = 0,
 239                 size_t pos = string::npos)
 240 {
 241     cerr << filename;
 242     if (line != 0) {
 243         cerr << ':' << line;
 244         if (pos != string::npos) {
 245             // The first column is numbered 1.
 246             cerr << ':' << pos + 1;
 247         }
 248     }
 249     switch (type) {
 250         case DIAG_ERROR:
 251             cerr << ": error: ";
 252             ++error_count;
 253             break;
 254         case DIAG_WARN:
 255             cerr << ": warning: ";
 256             break;
 257         case DIAG_NOTE:
 258             cerr << ": note: ";
 259             break;
 260     }
 261 }
 262
 263 static void
 264 report_useless_action(const string &file, size_t line, size_t pos,
 265                       const string &action)
 266 {
 267     report_location(DIAG_WARN, file, line, pos);
 268     cerr << "Index action '" << action << "' has no effect\n";
 269
 270     static bool given_left_to_right_warning = false;
 271     if (!given_left_to_right_warning) {
 272         given_left_to_right_warning = true;
 273         report_location(DIAG_NOTE, file, line, pos);
 274         cerr << "Actions are executed from left to right\n";
 275     }
 276 }
 277
 278 static bool index_spec_uses_unique = false;
 279
 280 static map<string, vector<Action>> index_spec;
 281
 282 // Like std::getline() but handle \r\n line endings too.
 283 static istream&
 284 getline_portable(istream& stream, string& line)
 285 {
 286     istream& result = getline(stream, line);
 287     // Trim multiple \r characters, since that seems the best way to handle
 288     // that case.
 289     line.resize(line.find_last_not_of('\r') + 1);
 290     return result;
 291 }
 292
 293 static void
 294 parse_index_script(const string &filename)
 295 {
 296     ifstream script(filename.c_str());
 297     if (!script.is_open()) {
 298         report_location(DIAG_ERROR, filename);
 299         cerr << strerror(errno) << '\n';
 300         exit(1);
 301     }
 302     string line;
 303     size_t line_no = 0;
 304     // Line number where we saw a `unique` action, or 0 if we haven't.
 305     int unique_line_no = 0;
 306     // Offset into line unique_line_no where the `unique` action was.
 307     size_t unique_pos = 0;
 308     while (getline(script, line)) {
 309         ++line_no;
 310         vector<string> fields;
 311         vector<Action> actions;
 312         string::const_iterator i, j;
 313         const string &s = line;
 314         i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
 315         if (i == s.end() || *i == '#') {
 316             // Blank line or comment.
 317             continue;
 318         }
 319         while (true) {
 320             if (!C_isalnum(*i)) {
 321                 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
 322                 cerr << "field name must start with alphanumeric\n";
 323             }
 324             j = find_if(i + 1, s.end(),
 325                         [](char ch) { return !C_isalnum(ch) && ch != '_'; });
 326             fields.push_back(string(i, j));
 327             i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
 328             if (i == s.end()) break;
 329             if (*i == ':') {
 330                 ++i;
 331                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 332                 break;
 333             }
 334             if (i == j) {
 335                 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
 336                 cerr << "bad character '" << *i << "' in field name\n";
 337                 ++i;
 338                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 339                 if (i == s.end()) break;
 340             }
 341         }
 342         Xapian::termcount weight = 1;
 343         size_t useless_weight_pos = string::npos;
 344         map<string, Action::type> boolmap;
 345         j = i;
 346         while (j != s.end()) {
 347             size_t action_pos = j - s.begin();
 348             i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
 349             string action(s, j - s.begin(), i - j);
 350             Action::type code = Action::BAD;
 351             unsigned min_args = 0, max_args = 0;
 352             bool takes_integer_argument = false;
 353             if (!action.empty()) {
 354                 switch (action[0]) {
 355                     case 'b':
 356                         if (action == "boolean") {
 357                             code = Action::BOOLEAN;
 358                             max_args = 1;
 359                         }
 360                         break;
 361                     case 'd':
 362                         if (action == "date") {
 363                             code = Action::DATE;
 364                             min_args = max_args = 1;
 365                         }
 366                         break;
 367                     case 'f':
 368                         if (action == "field") {
 369                             code = Action::FIELD;
 370                             max_args = 1;
 371                         }
 372                         break;
 373                     case 'g':
 374                         if (action == "gap") {
 375                             code = Action::GAP;
 376                             max_args = 1;
 377                             takes_integer_argument = true;
 378                         }
 379                         break;
 380                     case 'h':
 381                         if (action == "hash") {
 382                             code = Action::HASH;
 383                             max_args = 1;
 384                             takes_integer_argument = true;
 385                         } else if (action == "hextobin") {
 386                             code = Action::HEXTOBIN;
 387                         }
 388                         break;
 389                     case 'i':
 390                         if (action == "index") {
 391                             code = Action::INDEX;
 392                             max_args = 1;
 393                         } else if (action == "indexnopos") {
 394                             code = Action::INDEXNOPOS;
 395                             max_args = 1;
 396                         }
 397                         break;
 398                     case 'l':
 399                         if (action == "lower") {
 400                             code = Action::LOWER;
 401                         } else if (action == "load") {
 402                             code = Action::LOAD;
 403                         } else if (action == "ltrim") {
 404                             code = Action::LTRIM;
 405                             max_args = 1;
 406                         }
 407                         break;
 408                     case 'p':
 409                         if (action == "parsedate") {
 410                             code = Action::PARSEDATE;
 411                             min_args = max_args = 1;
 412                         }
 413                         break;
 414                     case 'r':
 415                         if (action == "rtrim") {
 416                             code = Action::RTRIM;
 417                             max_args = 1;
 418                         }
 419                         break;
 420                     case 's':
 421                         if (action == "spell") {
 422                             code = Action::SPELL;
 423                         } else if (action == "split") {
 424                             code = Action::SPLIT;
 425                             min_args = 1;
 426                             max_args = 2;
 427                         } else if (action == "squash") {
 428                             code = Action::SQUASH;
 429                             max_args = 1;
 430                         }
 431                         break;
 432                     case 't':
 433                         if (action == "truncate") {
 434                             code = Action::TRUNCATE;
 435                             min_args = max_args = 1;
 436                             takes_integer_argument = true;
 437                         } else if (action == "trim") {
 438                             code = Action::TRIM;
 439                             max_args = 1;
 440                         }
 441                         break;
 442                     case 'u':
 443                         if (action == "unhtml") {
 444                             code = Action::UNHTML;
 445                         } else if (action == "unique") {
 446                             code = Action::UNIQUE;
 447                             min_args = 1;
 448                             max_args = 2;
 449                         }
 450                         break;
 451                     case 'v':
 452                         if (action == "value") {
 453                             code = Action::VALUE;
 454                             min_args = max_args = 1;
 455                             takes_integer_argument = true;
 456                         } else if (action == "valuenumeric") {
 457                             code = Action::VALUENUMERIC;
 458                             min_args = max_args = 1;
 459                             takes_integer_argument = true;
 460                         } else if (action == "valuepacked") {
 461                             code = Action::VALUEPACKED;
 462                             min_args = max_args = 1;
 463                             takes_integer_argument = true;
 464                         }
 465                         break;
 466                     case 'w':
 467                         if (action == "weight") {
 468                             code = Action::WEIGHT;
 469                             min_args = max_args = 1;
 470                             // Don't set takes_integer_argument since we parse
 471                             // it with parse_unsigned() and issue an error there
 472                             // - setting takes_integer_argument would give a
 473                             // double error for arguments with a decimal point.
 474                         }
 475                         break;
 476                 }
 477             }
 478             if (code == Action::BAD) {
 479                 report_location(DIAG_ERROR, filename, line_no, action_pos);
 480                 cerr << "Unknown index action '" << action << "'\n";
 481             }
 482             auto i_after_action = i;
 483             i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 484
 485             if (i != s.end() && *i == '=') {
 486                 if (i != i_after_action) {
 487                     report_location(DIAG_WARN, filename, line_no,
 488                                     i_after_action - s.begin());
 489                     cerr << "putting spaces between the action and '=' is "
 490                             "deprecated\n";
 491                 }
 492
 493                 if (max_args == 0) {
 494                     report_location(DIAG_ERROR, filename, line_no,
 495                                     i - s.begin());
 496                     cerr << "Index action '" << action
 497                          << "' doesn't take an argument\n";
 498                 }
 499
 500                 ++i;
 501                 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 502                 if (i != j) {
 503                     report_location(DIAG_WARN, filename, line_no,
 504                                     i - s.begin());
 505                     cerr << "putting spaces between '=' and the argument is "
 506                             "deprecated\n";
 507                 }
 508
 509                 vector<string> vals;
 510                 while (true) {
 511                     if (j != s.end() && *j == '"') {
 512                         // Quoted argument.
 513                         ++j;
 514                         string arg;
 515                         while (true) {
 516                             i = find_if(j, s.end(),
 517                                         [](char ch) {
 518                                             return ch == '"' || ch == '\\';
 519                                         });
 520                             if (i == s.end()) {
 521                                 report_location(DIAG_ERROR, filename, line_no,
 522                                                 s.size());
 523                                 cerr << "No closing quote\n";
 524                                 break;
 525                             }
 526                             arg.append(j, i);
 527                             if (*i++ == '"')
 528                                 break;
 529
 530                             // Escape sequence.
 531                             if (i == s.end()) {
 532 bad_escaping:
 533                                 report_location(DIAG_ERROR, filename, line_no,
 534                                                 i - s.begin());
 535                                 cerr << "Bad escaping in quoted action "
 536                                         "argument\n";
 537                                 break;
 538                             }
 539
 540                             char ch = *i;
 541                             switch (ch) {
 542                                 case '\\':
 543                                 case '"':
 544                                     break;
 545                                 case '0':
 546                                     ch = '\0';
 547                                     break;
 548                                 case 'n':
 549                                     ch = '\n';
 550                                     break;
 551                                 case 'r':
 552                                     ch = '\r';
 553                                     break;
 554                                 case 't':
 555                                     ch = '\t';
 556                                     break;
 557                                 case 'x': {
 558                                     if (++i == s.end())
 559                                         goto bad_escaping;
 560                                     char ch1 = *i;
 561                                     if (!C_isxdigit(ch1)) {
 562 bad_hex_digit:
 563                                         report_location(DIAG_ERROR, filename,
 564                                                         line_no, i - s.begin());
 565                                         cerr << "Bad hex digit in escaping\n";
 566                                         --i;
 567                                         break;
 568                                     }
 569                                     if (++i == s.end())
 570                                         goto bad_escaping;
 571                                     char ch2 = *i;
 572                                     if (!C_isxdigit(ch2)) {
 573                                         goto bad_hex_digit;
 574                                     }
 575                                     ch = hex_digit(ch1) << 4 |
 576                                          hex_digit(ch2);
 577                                     break;
 578                                 }
 579                                 default:
 580                                     report_location(DIAG_ERROR, filename,
 581                                                     line_no, i - s.begin());
 582                                     cerr << "Bad escape sequence '\\" << ch
 583                                          << "'\n";
 584                                     break;
 585                             }
 586                             arg += ch;
 587                             j = i + 1;
 588                         }
 589                         vals.emplace_back(std::move(arg));
 590                         if (i == s.end() || C_isspace(*i)) break;
 591                         if (*i == ',') {
 592                             ++i;
 593                         } else {
 594                             report_location(DIAG_ERROR, filename, line_no,
 595                                             i - s.begin());
 596                             cerr << "Unexpected character '" << *i
 597                                  << "' after closing quote\n";
 598                             do {
 599                                 ++i;
 600                             } while (i != s.end() && *i != ',' && !C_isspace(*i));
 601                             if (*i != ',') break;
 602                             ++i;
 603                         }
 604                     } else if (max_args > 1) {
 605                         // Unquoted argument, split on comma.
 606                         i = find_if(j, s.end(),
 607                                     [](char ch) {
 608                                         return C_isspace(ch) || ch == ',';
 609                                     });
 610                         vals.emplace_back(j, i);
 611                         if (*i != ',') break;
 612                         ++i;
 613                     } else {
 614                         // Unquoted argument, including any commas.
 615                         i = find_if(j, s.end(),
 616                                     [](char ch) { return C_isspace(ch); });
 617                         vals.emplace_back(j, i);
 618                         break;
 619                     }
 620                     j = i;
 621
 622                     if (vals.size() == max_args) {
 623                         report_location(DIAG_ERROR, filename, line_no,
 624                                         i - s.begin());
 625                         cerr << "Index action '" << action << "' takes at most "
 626                              << max_args << " arguments\n";
 627                     }
 628                 }
 629
 630                 if (vals.size() < min_args) {
 631                     report_location(DIAG_ERROR, filename, line_no,
 632                                     i - s.begin());
 633                     if (min_args == max_args) {
 634                         cerr << "Index action '" << action << "' requires "
 635                              << min_args << " arguments\n";
 636                     } else {
 637                         cerr << "Index action '" << action << "' requires "
 638                                 "at least " << min_args << " arguments\n";
 639                     }
 640                     // Allow action handling code to assume there are min_args
 641                     // arguments.
 642                     vals.resize(min_args);
 643                 }
 644
 645                 string val;
 646                 if (!vals.empty()) {
 647                     val = vals.front();
 648                 }
 649
 650                 if (takes_integer_argument) {
 651                     auto dot = val.find('.');
 652                     if (dot != string::npos) {
 653                         report_location(DIAG_ERROR, filename, line_no,
 654                                         j - s.begin() + dot);
 655                         cerr << "Index action '" << action
 656                              << "' takes an integer argument\n";
 657                     }
 658                 }
 659                 switch (code) {
 660                     case Action::DATE:
 661                         if (val != "unix" &&
 662                             val != "unixutc" &&
 663                             val != "yyyymmdd") {
 664                             report_location(DIAG_ERROR, filename, line_no,
 665                                             j - s.begin());
 666                             cerr << "Invalid parameter '" << val
 667                                  << "' for action 'date'\n";
 668                         }
 669                         actions.emplace_back(code, action_pos, val);
 670                         break;
 671                     case Action::INDEX:
 672                     case Action::INDEXNOPOS:
 673                         actions.emplace_back(code, action_pos, val, weight);
 674                         useless_weight_pos = string::npos;
 675                         break;
 676                     case Action::WEIGHT:
 677                         // We don't push an Action for WEIGHT - instead we
 678                         // store it ready to use in the INDEX and INDEXNOPOS
 679                         // Actions.
 680                         if (!parse_unsigned(val.c_str(), weight)) {
 681                             report_location(DIAG_ERROR, filename, line_no,
 682                                             j - s.begin());
 683                             cerr << "Index action 'weight' takes a "
 684                                     "non-negative integer argument\n";
 685                             weight = 0;
 686                         }
 687                         if (useless_weight_pos != string::npos) {
 688                             report_useless_action(filename, line_no,
 689                                                   useless_weight_pos, action);
 690                         }
 691                         useless_weight_pos = action_pos;
 692                         break;
 693                     case Action::PARSEDATE: {
 694                         auto bad_code = val.find("%Z");
 695                         if (bad_code != val.npos) {
 696                             report_location(DIAG_ERROR, filename, line_no,
 697                                             j - s.begin() + bad_code);
 698                             cerr << "Parsing timezone names with %Z is not "
 699                                     "supported\n";
 700                         }
 701 #ifndef HAVE_STRUCT_TM_TM_GMTOFF
 702                         bad_code = val.find("%z");
 703                         if (bad_code != val.npos) {
 704                             report_location(DIAG_ERROR, filename, line_no,
 705                                             j - s.begin() + bad_code);
 706                             cerr << "Parsing timezone offsets with %z is not "
 707                                     "supported on this platform\n";
 708                         }
 709 #endif
 710                         actions.emplace_back(code, action_pos, val);
 711                         break;
 712                     }
 713                     case Action::SPLIT: {
 714                         if (val.empty()) {
 715                             report_location(DIAG_ERROR, filename, line_no,
 716                                             j - s.begin());
 717                             cerr << "Split delimiter can't be empty\n";
 718                         }
 719                         int operation = Action::SPLIT_NONE;
 720                         if (vals.size() >= 2) {
 721                             if (vals[1] == "dedup") {
 722                                 operation = Action::SPLIT_DEDUP;
 723                             } else if (vals[1] == "sort") {
 724                                 operation = Action::SPLIT_SORT;
 725                             } else if (vals[1] == "none") {
 726                                 operation = Action::SPLIT_NONE;
 727                             } else if (vals[1] == "prefixes") {
 728                                 operation = Action::SPLIT_PREFIXES;
 729                             } else {
 730                                 // FIXME: Column should be for where the `op`
 731                                 // parameter starts, which this isn't if the
 732                                 // value is quoted, contains escape sequences,
 733                                 // etc.
 734                                 report_location(DIAG_ERROR, filename, line_no,
 735                                                 i - s.begin() - vals[1].size());
 736                                 cerr << "Bad split operation '" << vals[1]
 737                                      << "'\n";
 738                             }
 739                         }
 740                         actions.emplace_back(code, action_pos, val, operation);
 741                         break;
 742                     }
 743                     case Action::TRUNCATE:
 744                         if (!actions.empty() &&
 745                             actions.back().get_action() == Action::LOAD) {
 746                             /* Turn "load truncate=n" into "load" with
 747                              * num_arg n, so that we don't needlessly
 748                              * allocate memory and read data we're just
 749                              * going to ignore.
 750                              */
 751                             actions.pop_back();
 752                             code = Action::LOAD;
 753                         }
 754                         actions.emplace_back(code, action_pos, val);
 755                         break;
 756                     case Action::UNIQUE:
 757                         if (unique_line_no) {
 758                             report_location(DIAG_ERROR, filename, line_no,
 759                                             action_pos);
 760                             cerr << "Index action 'unique' used more than "
 761                                     "once\n";
 762                             report_location(DIAG_NOTE, filename,
 763                                             unique_line_no, unique_pos);
 764                             cerr << "Previously used here\n";
 765                         }
 766                         unique_line_no = line_no;
 767                         unique_pos = action_pos;
 768                         if (boolmap.find(val) == boolmap.end())
 769                             boolmap[val] = Action::UNIQUE;
 770                         if (vals.size() >= 2) {
 771                             if (vals[1] == "missing=error") {
 772                                 unique_missing = UNIQUE_ERROR;
 773                             } else if (vals[1] == "missing=new") {
 774                                 unique_missing = UNIQUE_NEW;
 775                             } else if (vals[1] == "missing=warn+new") {
 776                                 unique_missing = UNIQUE_WARN_NEW;
 777                             } else if (vals[1] == "missing=skip") {
 778                                 unique_missing = UNIQUE_SKIP;
 779                             } else if (vals[1] == "missing=warn+skip") {
 780                                 unique_missing = UNIQUE_WARN_SKIP;
 781                             } else {
 782                                 report_location(DIAG_ERROR, filename, line_no);
 783                                 cerr << "Bad unique parameter '" << vals[1]
 784                                      << "'\n";
 785                             }
 786                         }
 787                         actions.emplace_back(code, action_pos, val);
 788                         break;
 789                     case Action::GAP: {
 790                         actions.emplace_back(code, action_pos, val);
 791                         auto& obj = actions.back();
 792                         auto gap_size = obj.get_num_arg();
 793                         if (gap_size <= 0) {
 794                             report_location(DIAG_ERROR, filename, line_no,
 795                                             obj.get_pos() + 3 + 1);
 796                             cerr << "Index action 'gap' takes a strictly "
 797                                     "positive integer argument\n";
 798                         }
 799                         break;
 800                     }
 801                     case Action::HASH: {
 802                         actions.emplace_back(code, action_pos, val);
 803                         auto& obj = actions.back();
 804                         auto max_length = obj.get_num_arg();
 805                         if (max_length < 6) {
 806                             report_location(DIAG_ERROR, filename, line_no,
 807                                             obj.get_pos() + 4 + 1);
 808                             cerr << "Index action 'hash' takes an integer "
 809                                     "argument which must be at least 6\n";
 810                         }
 811                         break;
 812                     }
 813                     case Action::LTRIM:
 814                     case Action::RTRIM:
 815                     case Action::SQUASH:
 816                     case Action::TRIM:
 817                         for (unsigned char ch : val) {
 818                             if (ch >= 0x80) {
 819                                 auto column = actions.back().get_pos() +
 820                                               strlen(action_names[code]) + 1;
 821                                 report_location(DIAG_ERROR, filename, line_no,
 822                                                 column);
 823                                 cerr << "Index action '" << action_names[code]
 824                                      << "' only support ASCII characters "
 825                                         "currently\n";
 826                             }
 827                         }
 828                         actions.emplace_back(code, action_pos, val);
 829                         break;
 830                     case Action::BOOLEAN:
 831                         boolmap[val] = Action::BOOLEAN;
 832                         /* FALLTHRU */
 833                     default:
 834                         actions.emplace_back(code, action_pos, val);
 835                 }
 836                 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
 837             } else {
 838                 if (min_args > 0) {
 839                     report_location(DIAG_ERROR, filename, line_no,
 840                                     i_after_action - s.begin());
 841                     if (min_args == max_args) {
 842                         cerr << "Index action '" << action << "' requires "
 843                              << min_args << " arguments\n";
 844                     } else {
 845                         cerr << "Index action '" << action << "' requires "
 846                                 "at least " << min_args << " arguments\n";
 847                     }
 848                 }
 849                 switch (code) {
 850                     case Action::INDEX:
 851                     case Action::INDEXNOPOS:
 852                         useless_weight_pos = string::npos;
 853                         actions.emplace_back(code, action_pos, "", weight);
 854                         break;
 855                     case Action::GAP:
 856                         actions.emplace_back(code, action_pos, "", 100);
 857                         break;
 858                     case Action::HASH:
 859                         actions.emplace_back(code, action_pos, "",
 860                                              MAX_SAFE_TERM_LENGTH - 1);
 861                         break;
 862                     case Action::LTRIM:
 863                     case Action::RTRIM:
 864                     case Action::SQUASH:
 865                     case Action::TRIM:
 866                         actions.emplace_back(code, action_pos, " \t\f\v\r\n");
 867                         break;
 868                     default:
 869                         actions.emplace_back(code, action_pos);
 870                         break;
 871                 }
 872             }
 873             j = i;
 874         }
 875
 876         if (useless_weight_pos != string::npos) {
 877             report_useless_action(filename, line_no, useless_weight_pos,
 878                                   "weight");
 879         }
 880
 881         while (!actions.empty()) {
 882             bool done = true;
 883             Action::type action = actions.back().get_action();
 884             switch (action) {
 885                 case Action::HASH:
 886                 case Action::HEXTOBIN:
 887                 case Action::LOWER:
 888                 case Action::LTRIM:
 889                 case Action::PARSEDATE:
 890                 case Action::RTRIM:
 891                 case Action::SPELL:
 892                 case Action::SQUASH:
 893                 case Action::TRIM:
 894                 case Action::TRUNCATE:
 895                 case Action::UNHTML:
 896                     done = false;
 897                     report_useless_action(filename, line_no,
 898                                           actions.back().get_pos(),
 899                                           action_names[action]);
 900                     actions.pop_back();
 901                     break;
 902                 default:
 903                     break;
 904             }
 905             if (done) break;
 906         }
 907
 908         map<string, Action::type>::const_iterator boolpfx;
 909         for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
 910             if (boolpfx->second == Action::UNIQUE) {
 911                 report_location(DIAG_WARN, filename, unique_line_no,
 912                                 unique_pos);
 913                 cerr << "Index action 'unique=" << boolpfx->first
 914                      << "' without 'boolean=" << boolpfx->first << "'\n";
 915                 static bool given_doesnt_imply_boolean_warning = false;
 916                 if (!given_doesnt_imply_boolean_warning) {
 917                     given_doesnt_imply_boolean_warning = true;
 918                     report_location(DIAG_NOTE, filename, unique_line_no,
 919                                     unique_pos);
 920                     cerr << "'unique' doesn't implicitly add a boolean term\n";
 921                 }
 922             }
 923         }
 924
 925         vector<string>::const_iterator field;
 926         for (field = fields.begin(); field != fields.end(); ++field) {
 927             vector<Action> &v = index_spec[*field];
 928             if (v.empty()) {
 929                 if (fields.size() == 1) {
 930                     // Optimise common case where there's only one fieldname
 931                     // for a list of actions.
 932                     v = std::move(actions);
 933                 } else {
 934                     v = actions;
 935                 }
 936             } else {
 937                 v.emplace_back(Action::NEW, string::npos);
 938                 v.insert(v.end(), actions.begin(), actions.end());
 939             }
 940         }
 941     }
 942
 943     if (index_spec.empty()) {
 944         report_location(DIAG_ERROR, filename, line_no);
 945         cerr << "No rules found in index script\n";
 946     }
 947
 948     if (error_count) {
 949         exit(1);
 950     }
 951
 952     index_spec_uses_unique = (unique_line_no > 0);
 953 }
 954
 955 static bool
 956 run_actions(vector<Action>::const_iterator action_it,
 957             vector<Action>::const_iterator action_end,
 958             Xapian::WritableDatabase& database,
 959             Xapian::TermGenerator& indexer,
 960             const string& old_value,
 961             bool& this_field_is_content, Xapian::Document& doc,
 962             map<string, list<string>>& fields,
 963             string& field, const char* fname,
 964             size_t line_no, Xapian::docid& docid)
 965 {
 966     string value = old_value;
 967     while (action_it != action_end) {
 968         auto& action = *action_it++;
 969         switch (action.get_action()) {
 970             case Action::BAD:
 971                 abort();
 972             case Action::NEW:
 973                 value = old_value;
 974                 // We're processing the same field again - give it a reprieve.
 975                 this_field_is_content = true;
 976                 break;
 977             case Action::FIELD:
 978                 if (!value.empty()) {
 979                     string f = action.get_string_arg();
 980                     if (f.empty()) f = field;
 981                     // replace newlines with spaces
 982                     string s = value;
 983                     string::size_type j = 0;
 984                     while ((j = s.find('\n', j)) != string::npos)
 985                         s[j] = ' ';
 986                     fields[f].push_back(s);
 987                 }
 988                 break;
 989             case Action::INDEX:
 990                 indexer.index_text(value,
 991                                    action.get_num_arg(),
 992                                    action.get_string_arg());
 993                 break;
 994             case Action::INDEXNOPOS:
 995                 // No positional information so phrase searching won't work.
 996                 // However, the database will use much less diskspace.
 997                 indexer.index_text_without_positions(value,
 998                                                      action.get_num_arg(),
 999                                                      action.get_string_arg());
1000                 break;
1001             case Action::BOOLEAN: {
1002                 // Do nothing if there's no text.
1003                 if (value.empty()) break;
1004
1005                 string term = action.get_string_arg();
1006                 if (prefix_needs_colon(term, value[0])) term += ':';
1007                 term += value;
1008
1009                 doc.add_boolean_term(term);
1010                 break;
1011             }
1012             case Action::GAP:
1013                 indexer.increase_termpos(action.get_num_arg());
1014                 break;
1015             case Action::HASH: {
1016                 unsigned int max_length = action.get_num_arg();
1017                 if (value.length() > max_length)
1018                     value = hash_long_term(value, max_length);
1019                 break;
1020             }
1021             case Action::HEXTOBIN: {
1022                 size_t len = value.length();
1023                 if (len & 1) {
1024                     report_location(DIAG_ERROR, fname, line_no);
1025                     cerr << "hextobin: input must have even length\n";
1026                     exit(1);
1027                 }
1028
1029                 string output;
1030                 output.reserve(len / 2);
1031                 for (size_t j = 0; j < len; j += 2) {
1032                     char a = value[j];
1033                     char b = value[j + 1];
1034                     if (!C_isxdigit(a) || !C_isxdigit(b)) {
1035                         report_location(DIAG_ERROR, fname, line_no);
1036                         cerr << "hextobin: input must be all hex digits\n";
1037                         exit(1);
1038                     }
1039                     char r = (hex_digit(a) << 4) | hex_digit(b);
1040                     output.push_back(r);
1041                 }
1042                 value = std::move(output);
1043                 break;
1044             }
1045             case Action::LOWER:
1046                 value = Xapian::Unicode::tolower(value);
1047                 break;
1048             case Action::LTRIM:
1049                 ltrim(value, action.get_string_arg());
1050                 break;
1051             case Action::RTRIM:
1052                 rtrim(value, action.get_string_arg());
1053                 break;
1054             case Action::TRIM:
1055                 rtrim(value, action.get_string_arg());
1056                 ltrim(value, action.get_string_arg());
1057                 break;
1058             case Action::SQUASH:
1059                 squash(value, action.get_string_arg());
1060                 break;
1061             case Action::LOAD: {
1062                 // If there's no input, just issue a warning.
1063                 if (value.empty()) {
1064                     report_location(DIAG_WARN, fname, line_no);
1065                     cerr << "Empty filename in LOAD action\n";
1066                     break;
1067                 }
1068                 bool truncated = false;
1069                 string filename = std::move(value);
1070                 // FIXME: Use NOATIME if we own the file or are root.
1071                 if (!load_file(filename, action.get_num_arg(), NOCACHE,
1072                                value, truncated)) {
1073                     report_location(DIAG_ERROR, fname, line_no);
1074                     cerr << "Couldn't load file '" << filename << "': "
1075                          << strerror(errno) << '\n';
1076                     exit(1);
1077                 }
1078                 if (!truncated) break;
1079             }
1080             /* FALLTHRU */
1081             case Action::TRUNCATE:
1082                 utf8_truncate(value, action.get_num_arg());
1083                 break;
1084             case Action::SPELL:
1085                 indexer.set_flags(indexer.FLAG_SPELLING);
1086                 break;
1087             case Action::SPLIT: {
1088                 // Find the end of the actions which split should execute.
1089                 auto split_end = find(action_it, action_end, Action::NEW);
1090
1091                 int split_type = action.get_num_arg();
1092                 if (value.empty()) {
1093                     // Nothing to do.
1094                 } else if (split_type != Action::SPLIT_SORT) {
1095                     // Generate split as we consume it.
1096                     const string& delimiter = action.get_string_arg();
1097
1098                     unique_ptr<unordered_set<string>> seen;
1099                     if (split_type == Action::SPLIT_DEDUP) {
1100                         seen.reset(new unordered_set<string>);
1101                     }
1102
1103                     if (delimiter.size() == 1) {
1104                         // Special case for common single character delimiter.
1105                         char ch = delimiter[0];
1106                         string::size_type i = 0;
1107                         while (true) {
1108                             string::size_type j = value.find(ch, i);
1109                             if (split_type == Action::SPLIT_PREFIXES) {
1110                                 if (j > 0) {
1111                                     string val(value, 0, j);
1112                                     run_actions(action_it, split_end,
1113                                                 database, indexer,
1114                                                 val,
1115                                                 this_field_is_content, doc,
1116                                                 fields,
1117                                                 field, fname, line_no,
1118                                                 docid);
1119                                 }
1120                             } else if (i != j) {
1121                                 string val(value, i, j - i);
1122                                 if (!seen.get() || seen->insert(val).second) {
1123                                     run_actions(action_it, split_end,
1124                                                 database, indexer,
1125                                                 val,
1126                                                 this_field_is_content, doc,
1127                                                 fields,
1128                                                 field, fname, line_no,
1129                                                 docid);
1130                                 }
1131                             }
1132                             if (j == string::npos) break;
1133                             i = j + 1;
1134                         }
1135                     } else {
1136                         string::size_type i = 0;
1137                         while (true) {
1138                             string::size_type j = value.find(delimiter, i);
1139                             if (split_type == Action::SPLIT_PREFIXES) {
1140                                 if (j > 0) {
1141                                     string val(value, 0, j);
1142                                     run_actions(action_it, split_end,
1143                                                 database, indexer,
1144                                                 val,
1145                                                 this_field_is_content, doc,
1146                                                 fields,
1147                                                 field, fname, line_no,
1148                                                 docid);
1149                                 }
1150                             } else if (i != j) {
1151                                 string val(value, i, j - i);
1152                                 if (!seen.get() || seen->insert(val).second) {
1153                                     run_actions(action_it, split_end,
1154                                                 database, indexer,
1155                                                 val,
1156                                                 this_field_is_content, doc,
1157                                                 fields,
1158                                                 field, fname, line_no,
1159                                                 docid);
1160                                 }
1161                             }
1162                             if (j == string::npos) break;
1163                             i = j + delimiter.size();
1164                         }
1165                     }
1166                 } else {
1167                     vector<string> split_values;
1168                     const string& delimiter = action.get_string_arg();
1169                     if (delimiter.size() == 1) {
1170                         // Special case for common single character delimiter.
1171                         char ch = delimiter[0];
1172                         string::size_type i = 0;
1173                         while (true) {
1174                             string::size_type j = value.find(ch, i);
1175                             if (i != j) {
1176                                 split_values.emplace_back(value, i, j - i);
1177                             }
1178                             if (j == string::npos) break;
1179                             i = j + 1;
1180                         }
1181                     } else {
1182                         string::size_type i = 0;
1183                         while (true) {
1184                             string::size_type j = value.find(delimiter, i);
1185                             if (i != j) {
1186                                 split_values.emplace_back(value, i, j - i);
1187                             }
1188                             if (j == string::npos) break;
1189                             i = j + delimiter.size();
1190                         }
1191                     }
1192
1193                     sort(split_values.begin(), split_values.end());
1194
1195                     for (auto&& val : split_values) {
1196                         run_actions(action_it, split_end,
1197                                     database, indexer, val,
1198                                     this_field_is_content, doc, fields,
1199                                     field, fname, line_no,
1200                                     docid);
1201                     }
1202                 }
1203
1204                 action_it = split_end;
1205                 break;
1206             }
1207             case Action::UNHTML: {
1208                 MyHtmlParser p;
1209                 try {
1210                     // Default HTML character set is latin 1, though
1211                     // not specifying one is deprecated these days.
1212                     p.parse_html(value, "iso-8859-1", false);
1213                 } catch (const string & newcharset) {
1214                     p.reset();
1215                     p.parse_html(value, newcharset, true);
1216                 }
1217                 if (p.indexing_allowed)
1218                     value = p.dump;
1219                 else
1220                     value = "";
1221                 break;
1222             }
1223             case Action::UNIQUE: {
1224                 unique_unused = false;
1225
1226                 if (value.empty()) {
1227                     enum diag_type diag = DIAG_WARN;
1228                     switch (unique_missing) {
1229                       case UNIQUE_ERROR:
1230                         diag = DIAG_ERROR;
1231                         /* FALLTHRU */
1232                       case UNIQUE_WARN_NEW:
1233                       case UNIQUE_WARN_SKIP:
1234                         report_location(diag, fname, line_no);
1235                         cerr << "UNIQUE action on empty text\n";
1236                       default:
1237                         break;
1238                     }
1239                     switch (unique_missing) {
1240                       case UNIQUE_ERROR:
1241                         exit(1);
1242                       case UNIQUE_SKIP:
1243                       case UNIQUE_WARN_SKIP:
1244                         skipping_record = true;
1245                         break;
1246                       case UNIQUE_NEW:
1247                       case UNIQUE_WARN_NEW:
1248                         break;
1249                     }
1250                     break;
1251                 }
1252
1253                 // Ensure that the value of this field is unique.
1254                 // If a record already exists with the same value,
1255                 // it will be replaced with the new record.
1256
1257                 // Unique fields aren't considered content - if
1258                 // there are no other fields in the document, the
1259                 // document is to be deleted.
1260                 this_field_is_content = false;
1261
1262                 // Argument is the prefix to add to the field value
1263                 // to get the unique term.
1264                 string t = action.get_string_arg();
1265                 if (prefix_needs_colon(t, value[0])) t += ':';
1266                 t += value;
1267                 Xapian::PostingIterator p = database.postlist_begin(t);
1268                 if (p != database.postlist_end(t)) {
1269                     docid = *p;
1270                 }
1271                 break;
1272             }
1273             case Action::VALUE:
1274                 if (!value.empty())
1275                     doc.add_value(action.get_num_arg(), value);
1276                 break;
1277             case Action::VALUENUMERIC: {
1278                 if (value.empty()) break;
1279                 char * end;
1280                 double dbl = strtod(value.c_str(), &end);
1281                 if (*end) {
1282                     report_location(DIAG_WARN, fname, line_no);
1283                     cerr << "Trailing characters in VALUENUMERIC: '"
1284                          << value << "'\n";
1285                 }
1286                 doc.add_value(action.get_num_arg(),
1287                               Xapian::sortable_serialise(dbl));
1288                 break;
1289             }
1290             case Action::VALUEPACKED: {
1291                 uint32_t word = 0;
1292                 if (value.empty() || !C_isdigit(value[0])) {
1293                     // strtoul() accepts leading whitespace and negated
1294                     // values, neither of which we want to allow.
1295                     errno = EINVAL;
1296                 } else {
1297                     errno = 0;
1298                     char* q;
1299                     word = strtoul(value.c_str(), &q, 10);
1300                     if (!errno && *q != '\0') {
1301                         // Trailing characters after converted value.
1302                         errno = EINVAL;
1303                     }
1304                 }
1305                 if (errno) {
1306                     report_location(DIAG_WARN, fname, line_no);
1307                     cerr << "valuepacked \"" << value << "\" ";
1308                     if (errno == ERANGE) {
1309                         cerr << "out of range\n";
1310                     } else {
1311                         cerr << "not an unsigned integer\n";
1312                     }
1313                 }
1314                 int valueslot = action.get_num_arg();
1315                 doc.add_value(valueslot, int_to_binary_string(word));
1316                 break;
1317             }
1318             case Action::DATE: {
1319                 // Do nothing for empty input.
1320                 if (value.empty()) break;
1321
1322                 const string & type = action.get_string_arg();
1323                 string yyyymmdd;
1324                 if (type == "unix") {
1325                     time_t t;
1326                     if (!parse_signed(value.c_str(), t)) {
1327                         report_location(DIAG_WARN, fname, line_no);
1328                         cerr << "Date value (in secs) for action DATE "
1329                                 "must be an integer - ignoring\n";
1330                         break;
1331                     }
1332                     struct tm *tm = localtime(&t);
1333                     int y = tm->tm_year + 1900;
1334                     int m = tm->tm_mon + 1;
1335                     yyyymmdd = date_to_string(y, m, tm->tm_mday);
1336                 } else if (type == "unixutc") {
1337                     time_t t;
1338                     if (!parse_signed(value.c_str(), t)) {
1339                         report_location(DIAG_WARN, fname, line_no);
1340                         cerr << "Date value (in secs) for action DATE "
1341                                 "must be an integer - ignoring\n";
1342                         break;
1343                     }
1344                     struct tm *tm = gmtime(&t);
1345                     int y = tm->tm_year + 1900;
1346                     int m = tm->tm_mon + 1;
1347                     yyyymmdd = date_to_string(y, m, tm->tm_mday);
1348                 } else if (type == "yyyymmdd") {
1349                     if (value.length() != 8) {
1350                         report_location(DIAG_WARN, fname, line_no);
1351                         cerr << "date=yyyymmdd expects an 8 character value "
1352                                 "- ignoring\n";
1353                         break;
1354                     }
1355                     yyyymmdd = value;
1356                 }
1357
1358                 // Date (YYYYMMDD)
1359                 doc.add_boolean_term("D" + yyyymmdd);
1360                 yyyymmdd.resize(6);
1361                 // Month (YYYYMM)
1362                 doc.add_boolean_term("M" + yyyymmdd);
1363                 yyyymmdd.resize(4);
1364                 // Year (YYYY)
1365                 doc.add_boolean_term("Y" + yyyymmdd);
1366                 break;
1367             }
1368             case Action::PARSEDATE: {
1369                 string dateformat = action.get_string_arg();
1370                 struct tm tm;
1371                 memset(&tm, 0, sizeof(tm));
1372                 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
1373                 if (ret == NULL) {
1374                     report_location(DIAG_WARN, fname, line_no);
1375                     cerr << "\"" << value << "\" doesn't match format "
1376                             "\"" << dateformat << '\"' << '\n';
1377                     break;
1378                 }
1379
1380                 if (*ret != '\0') {
1381                     report_location(DIAG_WARN, fname, line_no);
1382                     cerr << "\"" << value << "\" not fully matched by "
1383                             "format \"" << dateformat << "\" "
1384                             "(\"" << ret << "\" left over) but "
1385                             "indexing anyway\n";
1386                 }
1387 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1388                 auto gmtoff = tm.tm_gmtoff;
1389 #endif
1390                 auto secs_since_epoch = timegm(&tm);
1391 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1392                 secs_since_epoch -= gmtoff;
1393 #endif
1394                 value = str(secs_since_epoch);
1395                 break;
1396             }
1397             default:
1398                 /* Empty default case to avoid "unhandled enum value"
1399                  * warnings. */
1400                 break;
1401         }
1402     }
1403     return true;
1404 }
1405
1406 static void
1407 index_file(const char *fname, istream &stream,
1408            Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
1409 {
1410     string line;
1411     size_t line_no = 0;
1412     while (!stream.eof() && getline_portable(stream, line)) {
1413         ++line_no;
1414         // Allow blank lines before the first record and multiple blank lines
1415         // between records.
1416         if (line.empty()) continue;
1417
1418         Xapian::Document doc;
1419         indexer.set_document(doc);
1420         Xapian::docid docid = 0;
1421         map<string, list<string>> fields;
1422         bool seen_content = false;
1423         skipping_record = false;
1424         unique_unused = index_spec_uses_unique;
1425         while (!line.empty()) {
1426             string::size_type eq = line.find('=');
1427             if (eq == string::npos && !line.empty()) {
1428                 report_location(DIAG_ERROR, fname, line_no);
1429                 cerr << "Expected = somewhere in this line\n";
1430                 exit(1);
1431             }
1432             string field(line, 0, eq);
1433             string value(line, eq + 1, string::npos);
1434             line.clear();
1435             while (getline_portable(stream, line)) {
1436                 ++line_no;
1437                 if (line.empty() || line[0] != '=') break;
1438                 // Replace the '=' with a '\n'.
1439                 line[0] = '\n';
1440                 value += line;
1441             }
1442
1443             if (skipping_record) continue;
1444
1445             // Default to not indexing spellings.
1446             indexer.set_flags(Xapian::TermGenerator::flags(0));
1447
1448             bool this_field_is_content = true;
1449             const vector<Action>& v = index_spec[field];
1450             run_actions(v.begin(), v.end(),
1451                         database, indexer, value,
1452                         this_field_is_content, doc, fields,
1453                         field, fname, line_no,
1454                         docid);
1455             if (this_field_is_content) seen_content = true;
1456         }
1457
1458         if (unique_unused) {
1459             enum diag_type diag = DIAG_WARN;
1460             switch (unique_missing) {
1461               case UNIQUE_ERROR:
1462                 diag = DIAG_ERROR;
1463                 /* FALLTHRU */
1464               case UNIQUE_WARN_NEW:
1465               case UNIQUE_WARN_SKIP:
1466                 report_location(diag, fname, line_no);
1467                 cerr << "UNIQUE action unused in this record\n";
1468               default:
1469                 break;
1470             }
1471             switch (unique_missing) {
1472               case UNIQUE_ERROR:
1473                 exit(1);
1474               case UNIQUE_SKIP:
1475               case UNIQUE_WARN_SKIP:
1476                 skipping_record = true;
1477                 break;
1478               case UNIQUE_NEW:
1479               case UNIQUE_WARN_NEW:
1480                 break;
1481             }
1482         }
1483
1484         if (skipping_record) {
1485             ++skipcount;
1486         } else if (!seen_content) {
1487             // We haven't seen any fields (other than unique identifiers)
1488             // so the document is to be deleted.
1489             if (docid) {
1490                 database.delete_document(docid);
1491                 if (verbose) cout << "Del: " << docid << '\n';
1492                 ++delcount;
1493             }
1494         } else {
1495             string data;
1496             for (auto&& i : fields) {
1497                 for (auto&& field_val : i.second) {
1498                     data += i.first;
1499                     data += '=';
1500                     data += field_val;
1501                     data += '\n';
1502                 }
1503             }
1504
1505             // Put the data in the document
1506             doc.set_data(data);
1507
1508             // Add the document to the database
1509             if (docid) {
1510                 database.replace_document(docid, doc);
1511                 if (verbose) cout << "Replace: " << docid << '\n';
1512                 ++repcount;
1513             } else {
1514                 docid = database.add_document(doc);
1515                 if (verbose) cout << "Add: " << docid << '\n';
1516                 ++addcount;
1517             }
1518         }
1519     }
1520
1521     // Commit after each file to make sure all changes from that file make it
1522     // in.
1523     if (verbose) cout << "Committing\n";
1524     database.commit();
1525 }
1526
1527 static void
1528 show_help(int exit_code)
1529 {
1530     cout << PROG_NAME " - " PROG_DESC "\n"
1531 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1532 "\n"
1533 "Creates or updates a Xapian database with the data from the input files listed\n"
1534 "on the command line.  If no files are specified, data is read from stdin.\n"
1535 "\n"
1536 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1537 "format for INDEXER_SCRIPT.\n"
1538 "\n"
1539 "Options:\n"
1540 "  -v, --verbose       display additional messages to aid debugging\n"
1541 "      --overwrite     create the database anew (the default is to update if\n"
1542 "                      the database already exists)\n";
1543     print_stemmer_help("");
1544     print_help_and_version_help("");
1545     exit(exit_code);
1546 }
1547
1548 int
1549 main(int argc, char **argv)
1550 try {
1551     // If the database already exists, default to updating not overwriting.
1552     int database_mode = Xapian::DB_CREATE_OR_OPEN;
1553     verbose = false;
1554     Xapian::Stem stemmer("english");
1555
1556     // Without this, strptime() seems to treat formats without a timezone as
1557     // being local time, including %s.
1558     setenv("TZ", "UTC", 1);
1559
1560     constexpr auto NO_ARG = no_argument;
1561     constexpr auto REQ_ARG = required_argument;
1562     static const struct option longopts[] = {
1563         { "help",       NO_ARG,         NULL, 'h' },
1564         { "version",    NO_ARG,         NULL, 'V' },
1565         { "stemmer",    REQ_ARG,        NULL, 's' },
1566         { "overwrite",  NO_ARG,         NULL, 'o' },
1567         { "verbose",    NO_ARG,         NULL, 'v' },
1568         { 0, 0, NULL, 0 }
1569     };
1570
1571     int getopt_ret;
1572     while ((getopt_ret = gnu_getopt_long(argc, argv, "vs:hV",
1573                                          longopts, NULL)) != -1) {
1574         switch (getopt_ret) {
1575             default:
1576                 show_help(1);
1577                 break;
1578             case 'h': // --help
1579                 show_help(0);
1580                 break;
1581             case 'V': // --version
1582                 print_package_info(PROG_NAME);
1583                 return 0;
1584             case 'o': // --overwrite
1585                 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
1586                 break;
1587             case 'v':
1588                 verbose = true;
1589                 break;
1590             case 's':
1591                 try {
1592                     stemmer = Xapian::Stem(optarg);
1593                 } catch (const Xapian::InvalidArgumentError &) {
1594                     cerr << "Unknown stemming language '" << optarg << "'.\n";
1595                     cerr << "Available language names are: "
1596                          << Xapian::Stem::get_available_languages() << '\n';
1597                     return 1;
1598                 }
1599                 break;
1600         }
1601     }
1602
1603     argv += optind;
1604     argc -= optind;
1605     if (argc < 2) {
1606         show_help(1);
1607     }
1608
1609     parse_index_script(argv[1]);
1610
1611     // Open the database.  If another process is currently updating the
1612     // database, wait for the lock to become available.
1613     auto flags = database_mode | Xapian::DB_RETRY_LOCK;
1614     Xapian::WritableDatabase database(argv[0], flags);
1615
1616     Xapian::TermGenerator indexer;
1617     indexer.set_stemmer(stemmer);
1618     // Set the database for spellings to be added to by the "spell" action.
1619     indexer.set_database(database);
1620
1621     addcount = 0;
1622     repcount = 0;
1623     delcount = 0;
1624     skipcount = 0;
1625
1626     if (argc == 2) {
1627         // Read from stdin.
1628         index_file("<stdin>", cin, database, indexer);
1629     } else {
1630         // Read file(s) listed on the command line.
1631         for (int i = 2; i < argc; ++i) {
1632             ifstream stream(argv[i]);
1633             if (stream) {
1634                 index_file(argv[i], stream, database, indexer);
1635             } else {
1636                 cerr << "Can't open file " << argv[i] << '\n';
1637             }
1638         }
1639     }
1640
1641     cout << "records (added, replaced, deleted, skipped) = ("
1642          << addcount << ", "
1643          << repcount << ", "
1644          << delcount << ", "
1645          << skipcount << ")\n";
1646 } catch (const Xapian::Error &error) {
1647     cerr << "Exception: " << error.get_description() << '\n';
1648     exit(1);
1649 } catch (const std::bad_alloc &) {
1650     cerr << "Exception: std::bad_alloc\n";
1651     exit(1);
1652 } catch (...) {
1653     cerr << "Unknown Exception\n";
1654     exit(1);
1655 }