xapian-applications/omega/query.cc

   1 /** @file
   2  * @brief query executor for omega
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001 James Aylett
   6  * Copyright 2001,2002 Ananova Ltd
   7  * Copyright 2002 Intercede 1749 Ltd
   8  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017,2018,2019,2020,2021 Olly Betts
   9  * Copyright 2008 Thomas Viehmann
  10  *
  11  * This program is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU General Public License as
  13  * published by the Free Software Foundation; either version 2 of the
  14  * License, or (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  24  * USA
  25  */
  26
  27 #include <config.h>
  28
  29 #include <algorithm>
  30 #include <iostream>
  31 #include <map>
  32 #include <random>
  33 #include <set>
  34 #include <unordered_map>
  35 #include <unordered_set>
  36 #include <vector>
  37
  38 #include <cassert>
  39 #include <cctype>
  40 #include <cerrno>
  41 #include <stdio.h>
  42 #include <cstdlib>
  43 #include <cstring>
  44 #include "strcasecmp.h"
  45 #include <ctime>
  46
  47 #include "safeunistd.h"
  48 #include <sys/types.h>
  49 #include "safesysstat.h"
  50 #include "safefcntl.h"
  51
  52 #include "realtime.h"
  53
  54 #include <cdb.h>
  55
  56 #include "csvescape.h"
  57 #include "date.h"
  58 #include "datevalue.h"
  59 #include "fields.h"
  60 #include "jsonescape.h"
  61 #include "utils.h"
  62 #include "omega.h"
  63 #include "query.h"
  64 #include "cgiparam.h"
  65 #include "loadfile.h"
  66 #include "sample.h"
  67 #include "sort.h"
  68 #include "str.h"
  69 #include "stringutils.h"
  70 #include "transform.h"
  71 #include "urldecode.h"
  72 #include "urlencode.h"
  73 #include "unixperm.h"
  74 #include "values.h"
  75 #include "weight.h"
  76 #include "expand.h"
  77 #include "md5wrap.h"
  78 #include "parseint.h"
  79 #include <xapian.h>
  80
  81 using namespace std;
  82
  83 using Xapian::Utf8Iterator;
  84
  85 using Xapian::Unicode::is_wordchar;
  86
/// Map shard to DB parameter value and stats to allow docid mapping.
vector<SubDB> subdbs;

// NOTE(review): these two flags presumably record whether
// ensure_query_parsed() and ensure_match() have already done their work -
// those definitions are not visible from this part of the file, so confirm
// there.
static bool query_parsed = false;
static bool done_query = false;
static Xapian::docid last = 0;
// Added to hits_per_page in run_query() when working out how many matches to
// request.
static Xapian::docid topdoc = 0;

// Match results; assigned by run_query().
static Xapian::MSet mset;
// Relevance set which run_query() passes to Enquire::get_mset().
static Xapian::RSet rset;

static map<Xapian::docid, bool> ticked;

static void ensure_query_parsed();
static void ensure_match();

// The query to run: parse_queries() builds it from the query strings and
// run_query() then combines it with any filters before running the match.
static Xapian::Query query;
//static string url_query_string;
Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode

// Maintain an explicit date_filter_set flag - date_filter.empty() will also
// be true if a date filter is specified which simplifies to
// Query::MatchNothing at construction time.
static bool date_filter_set = false;
static Xapian::Query date_filter;

// Query parser configured and used by parse_queries().
static Xapian::QueryParser qp;
// Range processor for the "size:" prefix; created on first use in
// parse_queries().
static Xapian::NumberRangeProcessor * size_rp = NULL;
// Stemmer used by html_highlight(); created on first use there.
static Xapian::Stem *stemmer = NULL;

static string eval_file(const string& fmtfile, bool* p_not_found = nullptr);

// Terms from the parsed query; parse_queries() adds to this.
static set<string> termset;

// Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
static map<string, string> termprefix_to_userprefix;

// Tab-separated list of the terms in termset, in the order parse_queries()
// first saw them.
static string queryterms;

// Set by parse_queries() when parsing fails; run_query() won't run the match
// while this is non-empty.
static string error_msg;

static double secs = -1;

// Tab-separated template for a log entry.  NOTE(review): the name suggests
// this is the fallback used when no log format is configured - the code using
// it is not visible from this part of the file, so confirm there.
static const char DEFAULT_LOG_ENTRY[] =
        "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
        "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
        "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
        "$dbname\t"
        "$query\t"
        "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
 137
 138 class MyStopper : public Xapian::Stopper {
 139   public:
 140     bool operator()(const string& t) const override {
 141         switch (t[0]) {
 142             case 'a':
 143                 return (t == "a" || t == "about" || t == "an" || t == "and" ||
 144                         t == "are" || t == "as" || t == "at");
 145             case 'b':
 146                 return (t == "be" || t == "by");
 147             case 'e':
 148                 return (t == "en");
 149             case 'f':
 150                 return (t == "for" || t == "from");
 151             case 'h':
 152                 return (t == "how");
 153             case 'i':
 154                 return (t == "i" || t == "in" || t == "is" || t == "it");
 155             case 'o':
 156                 return (t == "of" || t == "on" || t == "or");
 157             case 't':
 158                 return (t == "that" || t == "the" || t == "this" || t == "to");
 159             case 'w':
 160                 return (t == "was" || t == "what" || t == "when" ||
 161                         t == "where" || t == "which" || t == "who" ||
 162                         t == "why" || t == "will" || t == "with");
 163             case 'y':
 164                 return (t == "you" || t == "your");
 165             default:
 166                 return false;
 167         }
 168     }
 169 };
 170
 171 static size_t
 172 prefix_from_term(string* prefix, const string& term)
 173 {
 174     if (!term.empty()) {
 175         if (term[0] == 'X') {
 176             const string::const_iterator begin = term.begin();
 177             string::const_iterator i = begin + 1;
 178             while (i != term.end() && C_isupper(*i))
 179                 ++i;
 180             if (prefix)
 181                 prefix->assign(begin, i);
 182             if (i != term.end() && *i == ':')
 183                 ++i;
 184             return i - begin;
 185         }
 186
 187         if (C_isupper(term[0])) {
 188             if (prefix)
 189                 *prefix = term[0];
 190             return 1;
 191         }
 192     }
 193
 194     if (prefix)
 195         prefix->resize(0);
 196     return 0;
 197 }
 198
 199 // Don't allow ".." in format names, log file names, etc as this would allow
 200 // people to open a format "../../etc/passwd" or similar.
 201 // FIXME: make this check more exact ("foo..bar" is safe)
 202 // FIXME: log when this check fails
 203 static bool
 204 vet_filename(const string &filename)
 205 {
 206     string::size_type i = filename.find("..");
 207     return (i == string::npos);
 208 }
 209
 210 // Heuristics:
 211 // * If any terms have been removed, it's a "fresh query" so we discard any
 212 //   relevance judgements
 213 // * If all previous terms are there but more have been added then we keep
 214 //   the relevance judgements, but return the first page of hits
 215 //
 216 // NEW_QUERY entirely new query
 217 // SAME_QUERY unchanged query
 218 // EXTENDED_QUERY new query, but based on the old one
 219 // BAD_QUERY parse error (message in error_msg)
 220 typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
 221
 222 static multimap<string, string> query_strings;
 223
 224 void
 225 add_query_string(const string& prefix, const string& s)
 226 {
 227     string query_string = s;
 228     // Strip leading and trailing whitespace from query_string.
 229     trim(query_string);
 230     if (!query_string.empty())
 231         query_strings.insert(make_pair(prefix, query_string));
 232 }
 233
 234 static unsigned
 235 read_qp_flags(const string & opt_pfx, unsigned f)
 236 {
 237     map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
 238     for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
 239         unsigned mask = 0;
 240         const char * s = i->first.c_str() + opt_pfx.size();
 241         switch (s[0]) {
 242             case 'a':
 243                 // Note that the ``Xapian::QueryParser::FLAG_ACCUMULATE`` flag
 244                 // is or-ed in below because it's needed for ``$stoplist`` and
 245                 // ``$unstem`` to work correctly, and so is deliberately not
 246                 // available to specify here.
 247                 if (strcmp(s, "auto_multiword_synonyms") == 0) {
 248                     mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 249                     break;
 250                 }
 251                 if (strcmp(s, "auto_synonyms") == 0) {
 252                     mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
 253                     break;
 254                 }
 255                 break;
 256             case 'b':
 257                 if (strcmp(s, "boolean") == 0) {
 258                     mask = Xapian::QueryParser::FLAG_BOOLEAN;
 259                     break;
 260                 }
 261                 if (strcmp(s, "boolean_any_case") == 0) {
 262                     mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
 263                     break;
 264                 }
 265                 break;
 266             case 'c':
 267                 if (strcmp(s, "cjk_ngram") == 0) {
 268                     mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
 269                     break;
 270                 }
 271                 break;
 272             case 'd':
 273                 if (strcmp(s, "default") == 0) {
 274                     mask = Xapian::QueryParser::FLAG_DEFAULT;
 275                     break;
 276                 }
 277                 break;
 278             case 'f':
 279                 if (strcmp(s, "fuzzy") == 0) {
 280                     mask = Xapian::QueryParser::FLAG_FUZZY;
 281                     break;
 282                 }
 283                 break;
 284             case 'l':
 285                 if (strcmp(s, "lovehate") == 0) {
 286                     mask = Xapian::QueryParser::FLAG_LOVEHATE;
 287                     break;
 288                 }
 289                 break;
 290             case 'n':
 291                 if (strcmp(s, "no_positions") == 0) {
 292                     mask = Xapian::QueryParser::FLAG_NO_POSITIONS;
 293                     break;
 294                 }
 295                 if (strcmp(s, "ngrams") == 0) {
 296                     mask = Xapian::QueryParser::FLAG_NGRAMS;
 297                     break;
 298                 }
 299                 break;
 300             case 'p':
 301                 if (strcmp(s, "partial") == 0) {
 302                     mask = Xapian::QueryParser::FLAG_PARTIAL;
 303                     break;
 304                 }
 305                 if (strcmp(s, "phrase") == 0) {
 306                     mask = Xapian::QueryParser::FLAG_PHRASE;
 307                     break;
 308                 }
 309                 if (strcmp(s, "pure_not") == 0) {
 310                     mask = Xapian::QueryParser::FLAG_PURE_NOT;
 311                     break;
 312                 }
 313                 break;
 314             case 's':
 315                 if (strcmp(s, "spelling_correction") == 0) {
 316                     mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
 317                     break;
 318                 }
 319                 if (strcmp(s, "synonym") == 0) {
 320                     mask = Xapian::QueryParser::FLAG_SYNONYM;
 321                     break;
 322                 }
 323                 break;
 324             case 'w':
 325                 if (strcmp(s, "wildcard") == 0) {
 326                     mask = Xapian::QueryParser::FLAG_WILDCARD;
 327                     break;
 328                 }
 329 #if XAPIAN_AT_LEAST(1,5,0)
 330                 if (strcmp(s, "wildcard_glob") == 0) {
 331                     mask = Xapian::QueryParser::FLAG_WILDCARD_GLOB;
 332                     break;
 333                 }
 334                 if (strcmp(s, "wildcard_multi") == 0) {
 335                     mask = Xapian::QueryParser::FLAG_WILDCARD_MULTI;
 336                     break;
 337                 }
 338                 if (strcmp(s, "wildcard_single") == 0) {
 339                     mask = Xapian::QueryParser::FLAG_WILDCARD_SINGLE;
 340                     break;
 341                 }
 342                 if (strcmp(s, "word_breaks") == 0) {
 343                     mask = Xapian::QueryParser::FLAG_WORD_BREAKS;
 344                     break;
 345                 }
 346 #endif
 347                 break;
 348         }
 349
 350         if (i->second.empty()) {
 351             f &= ~mask;
 352         } else {
 353             f |= mask;
 354         }
 355     }
 356     // Always enable FLAG_ACCUMULATE so that $stoplist and $unstem report
 357     // values accumulated over all query strings parsed as part of a query, not
 358     // just the last one parsed.
 359     return f | Xapian::QueryParser::FLAG_ACCUMULATE;
 360 }
 361
 362 static querytype
 363 parse_queries(const string& oldp)
 364 {
 365     // Parse the query string.
 366     auto opt_it = option.find("stem_strategy");
 367     if (opt_it != option.end()) {
 368         if (opt_it->second == "all") {
 369             qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
 370         } else if (opt_it->second == "all_z") {
 371             qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL_Z);
 372         } else if (opt_it->second == "none") {
 373             qp.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
 374         } else if (opt_it->second == "some") {
 375             qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
 376         } else if (opt_it->second == "some_full_pos") {
 377             qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME_FULL_POS);
 378         }
 379     } else {
 380         opt_it = option.find("stem_all");
 381         if (opt_it != option.end() && opt_it->second == "true") {
 382             qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
 383         }
 384     }
 385     qp.set_stopper((new MyStopper())->release());
 386     qp.set_default_op(default_op);
 387     qp.set_database(db);
 388     // FIXME: provide a custom RP which handles size:10..20K, etc.
 389     if (!size_rp)
 390         size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
 391     qp.add_rangeprocessor(size_rp);
 392     map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
 393     for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
 394         string user_prefix(pfx->first, 7);
 395         const string & term_pfx_list = pfx->second;
 396         string::size_type i = 0;
 397         do {
 398             string::size_type i0 = i;
 399             i = term_pfx_list.find('\t', i);
 400             const string & term_pfx = term_pfx_list.substr(i0, i - i0);
 401             qp.add_prefix(user_prefix, term_pfx);
 402             // std::map::insert() won't overwrite an existing entry, so we'll
 403             // prefer the first user_prefix for which a particular term prefix
 404             // is specified.
 405             termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
 406         } while (UNSIGNED_OVERFLOW_OK(++i));
 407     }
 408     pfx = option.lower_bound("boolprefix,");
 409     for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
 410         string user_prefix(pfx->first, 11, string::npos);
 411         auto it = option.find("nonexclusiveprefix," + pfx->second);
 412         bool exclusive = (it == option.end() || it->second.empty());
 413         qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
 414         termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
 415     }
 416
 417     try {
 418         unsigned default_flags = read_qp_flags("flag_", 0);
 419
 420         vector<Xapian::Query> queries;
 421         queries.reserve(query_strings.size());
 422
 423         for (auto& j : query_strings) {
 424             const string& prefix = j.first;
 425             const string& query_string = j.second;
 426
 427             // Choose the stemmer to use for this input.
 428             string stemlang = option[prefix + ":stemmer"];
 429             if (stemlang.empty())
 430                 stemlang = option["stemmer"];
 431             qp.set_stemmer(Xapian::Stem(stemlang));
 432
 433             // Work out the flags to use for this input.
 434             unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
 435
 436             Xapian::Query q = qp.parse_query(query_string, f, prefix);
 437             if (!q.empty())
 438                 queries.push_back(q);
 439         }
 440
 441         Xapian::Query::op intra_query_op = Xapian::Query::OP_AND;
 442         if (queries.size() > 1) {
 443             // Determine operator to use to combine multiple P and P.<prefix>
 444             // parameters.  Note that we only need to bother if there are two
 445             // or more query strings, since for one or none the operator
 446             // specified isn't actually used.
 447             opt_it = option.find("intra_query_op");
 448             if (opt_it != option.end()) {
 449                 const string& v = opt_it->second;
 450                 if (v == "OR" || v == "or") {
 451                     intra_query_op = Xapian::Query::OP_OR;
 452                 }
 453             }
 454         }
 455         query = Xapian::Query(intra_query_op, queries.begin(), queries.end());
 456     } catch (Xapian::QueryParserError &e) {
 457         error_msg = e.get_msg();
 458         return BAD_QUERY;
 459     }
 460
 461     Xapian::termcount n_new_terms = 0;
 462     for (Xapian::TermIterator i = query.get_terms_begin();
 463          i != query.get_terms_end(); ++i) {
 464         if (termset.find(*i) == termset.end()) {
 465             termset.insert(*i);
 466             if (!queryterms.empty()) queryterms += '\t';
 467             queryterms += *i;
 468         }
 469         n_new_terms++;
 470     }
 471
 472     // Check new query against the previous one
 473     if (oldp.empty()) {
 474         // If oldp was empty that means there were no parsed query terms
 475         // before, so if there are now this is a new query.
 476         return n_new_terms ? NEW_QUERY : SAME_QUERY;
 477     }
 478
 479     // The terms in oldp are separated by tabs.
 480     const char oldp_separator = '\t';
 481     size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
 482
 483     // short-cut: if the new query has fewer terms, it must be a new one
 484     if (n_new_terms < n_old_terms) return NEW_QUERY;
 485
 486     const char *term = oldp.c_str();
 487     const char *pend;
 488     while ((pend = strchr(term, oldp_separator)) != NULL) {
 489         if (termset.find(string(term, pend - term)) == termset.end())
 490             return NEW_QUERY;
 491         term = pend + 1;
 492     }
 493     if (*term) {
 494         if (termset.find(string(term)) == termset.end())
 495             return NEW_QUERY;
 496     }
 497
 498     // Use termset.size() rather than n_new_terms so we correctly handle
 499     // the case when the query has repeated terms.
 500     // This works wrongly in the case when the user extends the query
 501     // by adding a term already in it, but that's unlikely and the behaviour
 502     // isn't too bad (we just don't reset page 1).  We also mishandle a few
 503     // other obscure cases e.g. adding quotes to turn a query into a phrase.
 504     if (termset.size() > n_old_terms) return EXTENDED_QUERY;
 505     return SAME_QUERY;
 506 }
 507
 508 static multimap<string, string> filter_map;
 509 static set<string> neg_filters;
 510
 511 void add_bterm(const string &term) {
 512     string prefix;
 513     if (prefix_from_term(&prefix, term) > 0)
 514         filter_map.insert(multimap<string, string>::value_type(prefix, term));
 515 }
 516
 517 void add_nterm(const string &term) {
 518     if (!term.empty())
 519         neg_filters.insert(term);
 520 }
 521
 522 void
 523 add_date_filter(const string& date_start,
 524                 const string& date_end,
 525                 const string& date_span,
 526                 Xapian::valueno date_value_slot)
 527 {
 528     if (date_start.empty() && date_end.empty() && date_span.empty())
 529         return;
 530
 531     Xapian::Query q;
 532     if (date_value_slot != Xapian::BAD_VALUENO) {
 533         // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
 534         // latter the sort order just works correctly between different
 535         // precisions).
 536         bool as_time_t =
 537             db.get_value_lower_bound(date_value_slot).size() == 4 &&
 538             db.get_value_upper_bound(date_value_slot).size() == 4;
 539         q = date_value_range(as_time_t, date_value_slot,
 540                              date_start, date_end,
 541                              date_span);
 542     } else {
 543         q = date_range_filter(date_start, date_end, date_span);
 544         q |= Xapian::Query("Dlatest");
 545     }
 546
 547     if (date_filter_set) {
 548         date_filter &= q;
 549     } else {
 550         date_filter_set = true;
 551         date_filter = q;
 552     }
 553 }
 554
 555 static void
 556 run_query()
 557 {
 558     string scheme;
 559     bool force_boolean = false;
 560     if (!filter_map.empty()) {
 561         // OR together filters with the same prefix (or AND for non-exclusive
 562         // prefixes), then AND together the resultant groups.
 563         vector<Xapian::Query> filter_vec;
 564         vector<string> same_vec;
 565         string current;
 566         for (auto i = filter_map.begin(); ; ++i) {
 567             bool over = (i == filter_map.end());
 568             if (over || i->first != current) {
 569                 switch (same_vec.size()) {
 570                     case 0:
 571                         break;
 572                     case 1:
 573                         filter_vec.push_back(Xapian::Query(same_vec[0]));
 574                         break;
 575                     default: {
 576                         Xapian::Query::op op = Xapian::Query::OP_OR;
 577                         auto it = option.find("nonexclusiveprefix," + current);
 578                         if (it != option.end() && !it->second.empty()) {
 579                             op = Xapian::Query::OP_AND;
 580                         }
 581                         filter_vec.push_back(Xapian::Query(op,
 582                                                            same_vec.begin(),
 583                                                            same_vec.end()));
 584                         break;
 585                     }
 586                 }
 587                 same_vec.clear();
 588                 if (over) break;
 589                 current = i->first;
 590             }
 591             same_vec.push_back(i->second);
 592         }
 593
 594         Xapian::Query filter(Xapian::Query::OP_AND,
 595                              filter_vec.begin(), filter_vec.end());
 596
 597         if (query.empty()) {
 598             // If no query strings were provided then promote the filters
 599             // to be THE query - filtering an empty query will give no
 600             // matches.
 601             std::swap(query, filter);
 602             auto&& it = option.find("weightingpurefilter");
 603             if (it != option.end() && !it->second.empty()) {
 604                 scheme = it->second;
 605             } else {
 606                 force_boolean = true;
 607             }
 608         } else {
 609             query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
 610         }
 611     }
 612
 613     if (date_filter_set) {
 614         // If no query strings were provided then promote the daterange
 615         // filter to be THE query instead of filtering an empty query.
 616         if (query.empty()) {
 617             query = date_filter;
 618             force_boolean = true;
 619         } else {
 620             query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
 621         }
 622     }
 623
 624     if (!neg_filters.empty()) {
 625         // OR together all negated filters.
 626         Xapian::Query filter(Xapian::Query::OP_OR,
 627                              neg_filters.begin(), neg_filters.end());
 628
 629         if (query.empty() && !date_filter_set) {
 630             // If we only have a negative filter for the query, use MatchAll as
 631             // the query to apply the filters to.
 632             query = Xapian::Query::MatchAll;
 633             force_boolean = true;
 634         }
 635         query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
 636     }
 637
 638     if (!enquire || !error_msg.empty()) return;
 639
 640     if (!force_boolean && scheme.empty()) {
 641         auto&& it = option.find("weighting");
 642         if (it != option.end()) scheme = it->second;
 643     }
 644     set_weighting_scheme(*enquire, scheme, force_boolean);
 645
 646     enquire->set_cutoff(threshold);
 647
 648     if (sort_keymaker) {
 649         if (sort_after) {
 650             enquire->set_sort_by_relevance_then_key(sort_keymaker,
 651                                                     reverse_sort);
 652         } else {
 653             enquire->set_sort_by_key_then_relevance(sort_keymaker,
 654                                                     reverse_sort);
 655         }
 656     } else if (sort_key != Xapian::BAD_VALUENO) {
 657         if (sort_after) {
 658             enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
 659         } else {
 660             enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
 661         }
 662     }
 663
 664     enquire->set_docid_order(docid_order);
 665
 666     if (collapse) {
 667         enquire->set_collapse_key(collapse_key);
 668     }
 669
 670     if (!query.empty()) {
 671 #if 0
 672         // FIXME: If we start doing permissions checks based on $REMOTE_USER
 673         // we're going to break some existing setups if users upgrade.  We
 674         // probably want a way to set this from OmegaScript.
 675         const char * remote_user = getenv("REMOTE_USER");
 676         if (remote_user)
 677             apply_unix_permissions(query, remote_user);
 678 #endif
 679
 680         enquire->set_query(query);
 681         // We could use the value of topdoc as first parameter, but we
 682         // need to know the first few items in the mset to fake a
 683         // relevance set for topterms.
 684         //
 685         // If min_hits isn't set, check at least one extra result so we
 686         // know if we've reached the end of the matches or not - then we
 687         // can avoid offering a "next" button which leads to an empty page.
 688         mset = enquire->get_mset(0, topdoc + hits_per_page,
 689                                  topdoc + max(hits_per_page + 1, min_hits),
 690                                  &rset);
 691     }
 692 }
 693
 694 string
 695 html_escape(const string &str)
 696 {
 697     string res;
 698     string::size_type p = 0;
 699     while (p < str.size()) {
 700         char ch = str[p++];
 701         switch (ch) {
 702             case '<':
 703                 res += "&lt;";
 704                 continue;
 705             case '>':
 706                 res += "&gt;";
 707                 continue;
 708             case '&':
 709                 res += "&amp;";
 710                 continue;
 711             case '"':
 712                 res += "&quot;";
 713                 continue;
 714             default:
 715                 res += ch;
 716         }
 717     }
 718     return res;
 719 }
 720
 721 static string
 722 html_strip(const string &str)
 723 {
 724     string res;
 725     string::size_type p = 0;
 726     bool skip = false;
 727     while (p < str.size()) {
 728         char ch = str[p++];
 729         switch (ch) {
 730             case '<':
 731                 skip = true;
 732                 continue;
 733             case '>':
 734                 skip = false;
 735                 continue;
 736             default:
 737                 if (!skip) res += ch;
 738         }
 739     }
 740     return res;
 741 }
 742
 743 class WordList {
 744     static string prev_list;
 745     static unordered_map<string, int> word_to_occurrence;
 746   public:
 747     void build_word_map(const string& list) {
 748         // Don't build map again if passed list of terms is same as before.
 749         if (prev_list == list) return;
 750         word_to_occurrence.clear();
 751         string::size_type split = 0, split2;
 752         int word_index = 0;
 753         string word;
 754         while ((split2 = list.find('\t', split)) != string::npos) {
 755             word = list.substr(split, split2 - split);
 756             if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 757                 ++word_index;
 758             split = split2 + 1;
 759         }
 760         word = list.substr(split, list.size() - split);
 761         if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 762             ++word_index;
 763         prev_list = list;
 764     }
 765
 766     int word_in_list(const string& word) {
 767         auto it = word_to_occurrence.find(word);
 768         if (it == word_to_occurrence.end()) return -1;
 769         return it->second;
 770     }
 771 };
 772
 773 string WordList::prev_list;
 774 unordered_map<string, int> WordList::word_to_occurrence;
 775
 776 // Not a character in an identifier
 777 static inline bool
 778 p_notid(unsigned int c)
 779 {
 780     return !C_isalnum(c) && c != '_';
 781 }
 782
 783 // Not a character in an HTML tag name
 784 static inline bool
 785 p_nottag(unsigned int c)
 786 {
 787     return !C_isalnum(c) && c != '.' && c != '-';
 788 }
 789
// FIXME: shares algorithm with indextext.cc!
/** HTML-escape @a s, marking up the words which are found in @a list.
 *
 *  The text is scanned word by word.  Each word is lower-cased and looked up
 *  via WordList; if that fails, "Z" followed by the stemmed form is looked up
 *  instead.  Matching words are wrapped in markup, everything else is just
 *  passed through html_escape().
 *
 *  @param s     Text to escape and highlight.
 *  @param list  Word list handed to WordList::build_word_map().
 *  @param bra   Markup to emit before a matching word.  If empty, a <b> tag
 *               with a background colour chosen from a built-in palette is
 *               used instead.
 *  @param ket   Markup to emit after a matching word.  Only used when @a bra
 *               is non-empty; otherwise "</b>" is emitted.
 *
 *  @return  The escaped text with highlighting markup inserted.
 */
static string
html_highlight(const string &s, const string &list,
               const string &bra, const string &ket)
{
    // Create the stemmer on first use.
    if (!stemmer) {
        stemmer = new Xapian::Stem(option["stemmer"]);
    }

    string res;

    Utf8Iterator j(s);
    const Utf8Iterator s_end;
    while (true) {
        // Skip ahead to the start of the next word; j stays at the start of
        // the text which hasn't been output yet.
        Utf8Iterator first = j;
        while (first != s_end && !is_wordchar(*first)) ++first;
        if (first == s_end) break;
        Utf8Iterator term_end;
        string term;
        string word;
        // Start of the not-yet-output text (non-word characters before the
        // word, then the word itself).
        const char *l = j.raw();
        if (*first < 128 && C_isupper(*first)) {
            // Try to read an acronym: ASCII capitals separated by single
            // dots.  Only the letters are collected into term.
            j = first;
            Xapian::Unicode::append_utf8(term, *j);
            while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
            }
            // Reject it if there's only one letter or it runs straight into
            // another word character; the general case below then re-parses
            // from first.
            if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
                term.resize(0);
            }
            term_end = j;
        }
        if (term.empty()) {
            // General case: a run of word characters, in which '&' and '\''
            // are kept only when directly followed by another word character.
            j = first;
            while (is_wordchar(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
                ++j;
                if (j == s_end) break;
                if (*j == '&' || *j == '\'') {
                    Utf8Iterator next = j;
                    ++next;
                    if (next == s_end || !is_wordchar(*next)) break;
                    term += *j;
                    j = next;
                }
            }
            term_end = j;
            // Optional suffix: either a run of '#' (recorded in term as a
            // single '#') or a run of '+' and '-' characters.
            if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
                string::size_type len = term.length();
                if (*j == '#') {
                    term += '#';
                    do { ++j; } while (j != s_end && *j == '#');
                } else {
                    while (j != s_end && (*j == '+' || *j == '-')) {
                        Xapian::Unicode::append_utf8(term, *j);
                        ++j;
                    }
                }
                // Drop the suffix again if it added more than 3 characters
                // to term or is directly followed by a word character.
                if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
                    term.resize(len);
                } else {
                    term_end = j;
                }
            }
        }
        j = term_end;
        term = Xapian::Unicode::tolower(term);
        // NOTE(review): build_word_map() is invoked for every word of s;
        // presumably WordList avoids rebuilding for an unchanged list -
        // confirm in the WordList class.
        WordList w;
        w.build_word_map(list);
        int match = w.word_in_list(term);
        if (match == -1) {
            // No match for the word itself, so try its stem with a "Z"
            // prefix.
            string stem = "Z";
            stem += (*stemmer)(term);
            match = w.word_in_list(stem);
        }
        if (match >= 0) {
            // Output the text before the word, then the marked up word.
            res += html_escape(string(l, first.raw() - l));
            if (!bra.empty()) {
                res += bra;
            } else {
                // Default markup: the match value selects one of these
                // background colours (wrapping round).
                static const char * colours[] = {
                    "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
                    "990000", "009900", "996600", "006699", "990099"
                };
                size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
                const char * bg = colours[idx];
                // Palette entries containing an 'f' get black text, the
                // others get white text.
                if (strchr(bg, 'f')) {
                    res += "<b style=\"color:black;background-color:#";
                } else {
                    res += "<b style=\"color:white;background-color:#";
                }
                res += bg;
                res += "\">";
            }
            // Output the word exactly as it appears in s (not the
            // lower-cased term).
            word.assign(first.raw(), j.raw() - first.raw());
            res += html_escape(word);
            if (!bra.empty()) {
                res += ket;
            } else {
                res += "</b>";
            }
        } else {
            // No match: output everything up to the end of the word as is.
            res += html_escape(string(l, j.raw() - l));
        }
    }
    // Output any text left over after the last word.
    if (j != s_end) res += html_escape(string(j.raw(), j.left()));
    return res;
}
 898
#if 0
// This function is currently compiled out by the enclosing #if 0.
//
// Writes url_query_string to cout.  If after starts with "&B=", every
// parameter of the form "&B=<c>..." (where <c> is the character after[3])
// is left out of what gets written; otherwise the whole string is written.
static void
print_query_string(const char *after)
{
    if (after && strncmp(after, "&B=", 3) == 0) {
        // Character which must follow "B=" for a parameter to be dropped.
        char prefix = after[3];
        // start: beginning of the text not yet written.
        // amp: position to resume searching for '&' from.
        string::size_type start = 0, amp = 0;
        while (true) {
            amp = url_query_string.find('&', amp);
            if (amp == string::npos) {
                // No more parameters - write out the remainder.
                cout << url_query_string.substr(start);
                return;
            }
            amp++;
            // Handle a run of consecutive parameters which are to be
            // dropped.
            while (url_query_string[amp] == 'B' &&
                   url_query_string[amp + 1] == '=' &&
                   url_query_string[amp + 2] == prefix) {
                // Write the text before the '&' which introduces this
                // parameter, then skip to the next '&' (if there is one).
                cout << url_query_string.substr(start, amp - start - 1);
                start = url_query_string.find('&', amp + 3);
                if (start == string::npos) return;
                amp = start + 1;
            }
        }
    }
    cout << url_query_string;
}
#endif
 926
 927 class CachedFields : private Fields {
 928     Xapian::docid did_cached = 0;
 929
 930   public:
 931     CachedFields() {}
 932
 933     const string& get_field(Xapian::docid did, const string& name) {
 934         if (did != did_cached) {
 935             did_cached = did;
 936             auto it = option.find("fieldnames");
 937             Fields::parse_fields(db.get_document(did).get_data(),
 938                                  it == option.end() ? nullptr : &it->second);
 939         }
 940         return Fields::get_field(name);
 941     }
 942 };
 943
// Cache of parsed document fields, used to implement $field.
static CachedFields fields;
// Docid used by $allterms and $field when no explicit docid argument is
// given.
static Xapian::docid q0;
// NOTE(review): hit_no, percent and weight are neither set nor read in this
// part of the file; presumably they hold the values reported by $hit,
// $percentage/$score and $weight for the current hit - confirm against the
// code which sets them.
static Xapian::doccount hit_no;
static int percent;
static double weight;
// Value reported by $collapsed.
static Xapian::doccount collapsed;

// Forward declaration; the definition is not in this part of the file.
static string print_caption(const string& fmt, vector<string>& param);
 952
/** Tags identifying each OmegaScript command.
 *
 *  eval() dispatches on these values.  The T() macro used to build func_tab
 *  forms each tag by pasting the command name onto "CMD_", so a command's
 *  enumerator must be named CMD_<command name>.
 *
 *  CMD_MACRO must stay last: $def gives each user-defined macro the tag
 *  CMD_MACRO + (number of macros defined before it), so every value from
 *  CMD_MACRO upwards is taken to be a macro.
 */
enum tagval {
CMD_,
CMD_add,
CMD_addfilter,
CMD_allterms,
CMD_and,
CMD_base64,
CMD_cgi,
CMD_cgilist,
CMD_cgiparams,
CMD_chr,
CMD_collapsed,
CMD_cond,
CMD_contains,
CMD_csv,
CMD_date,
CMD_dbname,
CMD_dbsize,
CMD_def,
CMD_defaultop,
CMD_div,
CMD_emptydocs,
CMD_env,
CMD_eq,
CMD_error,
CMD_field,
CMD_filesize,
CMD_filters,
CMD_filterterms,
CMD_find,
CMD_fmt,
CMD_foreach,
CMD_freq,
CMD_ge,
CMD_gt,
CMD_hash,
CMD_highlight,
CMD_hit,
CMD_hitlist,
CMD_hitsperpage,
CMD_hostname,
CMD_html,
CMD_htmlstrip,
CMD_httpheader,
CMD_id,
CMD_if,
CMD_include,
CMD_json,
CMD_jsonarray,
CMD_jsonbool,
CMD_jsonobject,
CMD_jsonobject2,
CMD_keys,
CMD_last,
CMD_lastpage,
CMD_le,
CMD_length,
CMD_list,
CMD_log,
CMD_lookup,
CMD_lower,
CMD_lt,
CMD_map,
CMD_match,
CMD_max,
CMD_min,
CMD_mod,
CMD_msize,
CMD_msizeexact,
CMD_msizelower,
CMD_msizeupper,
CMD_mul,
CMD_muldiv,
CMD_ne,
CMD_nice,
CMD_not,
CMD_now,
CMD_opt,
CMD_or,
CMD_ord,
CMD_pack,
CMD_percentage,
CMD_prettyterm,
CMD_prettyurl,
CMD_query,
CMD_querydescription,
CMD_queryterms,
CMD_random,
CMD_range,
CMD_record,
CMD_relevant,
CMD_relevants,
CMD_score,
CMD_set,
CMD_seterror,
CMD_setmap,
CMD_setrelevant,
CMD_slice,
CMD_snippet,
CMD_sort,
CMD_sortableunserialise,
CMD_split,
CMD_srandom,
CMD_stoplist,
CMD_sub,
CMD_subdb,
CMD_subid,
CMD_substr,
CMD_suggestion,
CMD_switch,
CMD_termprefix,
CMD_terms,
CMD_thispage,
CMD_time,
CMD_topdoc,
CMD_topterms,
CMD_transform,
CMD_truncate,
CMD_uniq,
CMD_unique,
CMD_unpack,
CMD_unprefix,
CMD_unstem,
CMD_upper,
CMD_url,
CMD_value,
CMD_valuelowerbound,
CMD_valueupperbound,
CMD_version,
CMD_weight,
CMD_MACRO // special tag for macro evaluation
};
1085
/** Attributes of an OmegaScript command, as consulted by eval().
 *
 *  This is an aggregate which is brace-initialised in func_tab, so the
 *  order of the members matters.
 */
struct func_attrib {
    /// Command tag: a CMD_* value (CMD_MACRO or above for a $def'd macro).
    int tag;

    /// Fewest arguments allowed; -1 means the argument text is not split at
    /// commas and no argument checking or evaluation is done.
    int minargs;

    /// Most arguments allowed; -1 means there is no upper limit.
    int maxargs;

    /// Number of leading arguments eval() evaluates before running the
    /// command; -1 means all of them.
    int evalargs;

    /// 'Q' if the query must have been parsed first, 'M' if the match must
    /// also have been run first, otherwise 0.
    char ensure;
};
1091
// T(F, A, B, C, D) expands to the func_desc initialiser for command F: the
// command name passed through STRINGIZE(), paired with a func_attrib holding
// tag CMD_F, minargs A, maxargs B, evalargs C and ensure D.
#define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
/** An entry in func_tab: a command name together with its attributes. */
struct func_desc {
    /// Command name (NULL in the entry which terminates func_tab).
    const char *name;
    /// Attributes of the command; eval() stores a pointer to this in its
    /// name lookup map.
    struct func_attrib a;
};
1097
// Shorthand for use in func_tab below.  N is a sentinel whose meaning
// depends on the column: minargs N means don't split or check the
// arguments, maxargs N means no upper limit, and evalargs N means evaluate
// all arguments.  M and Q are values for the ensure column: Q makes eval()
// call ensure_query_parsed() before running the command, and M makes it
// call ensure_query_parsed() and then ensure_match().
#define N (-1)
#define M 'M'
#define Q 'Q'
// NB when adding a new command which ensures M or Q, update the list in
// docs/omegascript.rst
//
// Table of built-in commands, terminated by an entry with a NULL name.
// eval() copies it into a name lookup map the first time it is called.
static const struct func_desc func_tab[] = {
//name minargs maxargs evalargs ensure
{"",{CMD_,         N, N, 0, 0}},// commented out code
T(add,             0, N, N, 0), // add a list of numbers
T(addfilter,       1, 2, N, 0), // add filter term
T(allterms,        0, 1, N, 0), // list of all terms matching document
T(and,             1, N, 0, 0), // logical shortcutting and of a list of values
T(base64,          1, 1, N, 0), // base64 encode
T(cgi,             1, 1, N, 0), // return cgi parameter value
T(cgilist,         1, 1, N, 0), // return list of values for cgi parameter
T(cgiparams,       0, 0, N, 0), // return list of cgi parameter names
T(chr,             1, 1, N, 0), // return UTF-8 for given Unicode codepoint
T(collapsed,       0, 0, N, 0), // return number of hits collapsed into this
T(cond,            2, N, 0, 0), // cascaded conditionals
T(contains,        2, 2, N, 0), // return position of substring, or empty string
T(csv,             1, 2, N, 0), // CSV string escaping
T(date,            1, 2, N, 0), // convert time_t to strftime format
                                // (default: YYYY-MM-DD)
T(dbname,          0, 0, N, 0), // database name
T(dbsize,          0, 0, N, 0), // database size (# of documents)
T(def,             2, 2, 1, 0), // define a macro
T(defaultop,       0, 0, N, 0), // default operator: "and" or "or"
T(div,             2, 2, N, 0), // integer divide
T(emptydocs,       0, 1, N, 0), // list of empty documents
T(env,             1, 1, N, 0), // environment variable
T(eq,              2, 2, N, 0), // test equality
T(error,           0, 0, N, 0), // error message
T(field,           1, 2, N, 0), // lookup field in record
T(filesize,        1, 1, N, 0), // pretty printed filesize
T(filters,         0, 1, N, 0), // serialisation of current filters
T(filterterms,     1, 1, N, 0), // list of terms with a given prefix
T(find,            2, 2, N, 0), // find entry in list
T(fmt,             0, 0, N, 0), // name of current format
T(foreach,         2, 2, 1, 0), // evaluate something for every entry in a list
T(freq,            1, 1, N, 0), // frequency of a term
T(ge,              2, 2, N, 0), // test >=
T(gt,              2, 2, N, 0), // test >
T(hash,            2, 2, N, 0), // hash a string using the specified hash function
T(highlight,       2, 4, N, 0), // html escape and highlight words from list
T(hit,             0, 0, N, 0), // hit number of current mset entry (0-based)
T(hitlist,         1, 1, 0, M), // display hitlist using format in argument
T(hitsperpage,     0, 0, N, 0), // hits per page
T(hostname,        1, 1, N, 0), // extract hostname from URL
T(html,            1, 1, N, 0), // html escape string (<>&")
T(htmlstrip,       1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
T(httpheader,      2, 2, N, 0), // arbitrary HTTP header
T(id,              0, 0, N, 0), // docid of current doc
T(if,              1, 3, 1, 0), // conditional
T(include,         1, 2, 1, 0), // include another file
T(json,            1, 1, N, 0), // JSON string escaping
T(jsonarray,       1, 2, 1, 0), // Format list as a JSON array
T(jsonbool,        1, 1, 1, 0), // Format value as a JSON bool
T(jsonobject,      1, 3, 1, 0), // Format map as JSON object
T(jsonobject2,     2, 4, 2, 0), // Format 2 lists as JSON object
T(keys,            1, 1, N, 0), // list of keys from a map
T(last,            0, 0, N, M), // hit number one beyond end of current page
T(lastpage,        0, 0, N, M), // number of last hit page
T(le,              2, 2, N, 0), // test <=
T(length,          1, 1, N, 0), // length of list
T(list,            2, 5, N, 0), // pretty print list
T(log,             1, 2, 1, 0), // create a log entry
T(lookup,          2, 2, N, 0), // lookup in named cdb file
T(lower,           1, 1, N, 0), // convert string to lower case
T(lt,              2, 2, N, 0), // test <
T(map,             2, 2, 1, 0), // map a list into another list
T(match,           2, 3, N, 0), // regex match
T(max,             1, N, N, 0), // maximum of a list of values
T(min,             1, N, N, 0), // minimum of a list of values
T(mod,             2, 2, N, 0), // integer modulus
T(msize,           0, 0, N, M), // number of matches (estimated)
T(msizeexact,      0, 0, N, M), // is $msize exact?
T(msizelower,      0, 0, N, M), // number of matches (lower bound)
T(msizeupper,      0, 0, N, M), // number of matches (upper bound)
T(mul,             2, N, N, 0), // multiply a list of numbers
T(muldiv,          3, 3, N, 0), // calculate A*B/C
T(ne,              2, 2, N, 0), // test not equal
T(nice,            1, 1, N, 0), // pretty print integer (with thousands sep)
T(not,             1, 1, N, 0), // logical not
T(now,             0, 0, N, 0), // current date/time as a time_t
T(opt,             1, 2, N, 0), // lookup an option value
T(or,              1, N, 0, 0), // logical shortcutting or of a list of values
T(ord,             1, 1, N, 0), // return codepoint for first character of UTF-8 string
T(pack,            1, 1, N, 0), // convert a number to a 4 byte big endian binary string
T(percentage,      0, 0, N, 0), // percentage score of current hit
T(prettyterm,      1, 1, N, Q), // pretty print term name
T(prettyurl,       1, 1, N, 0), // pretty version of URL
T(query,           0, 1, N, Q), // query
T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
T(queryterms,      0, 0, N, Q), // list of query terms
T(random,          1, 1, N, 0), // return a random number
T(range,           2, 2, N, 0), // return list of values between start and end
T(record,          0, 1, N, 0), // record contents of document
T(relevant,        0, 1, N, Q), // is document relevant?
T(relevants,       0, 0, N, Q), // return list of relevant documents
T(score,           0, 0, N, 0), // score (0-10) of current hit
T(set,             2, 2, N, 0), // set option value
T(seterror,        1, 1, N, 0), // set error_msg, setting it early stops query execution
T(setmap,          1, N, N, 0), // set map of option values
T(setrelevant,     1, 1, N, Q), // set rset
T(slice,           2, 2, N, 0), // slice a list using a second list
T(snippet,         1, 6, N, M), // generate snippet from text
T(sort,            1, 2, N, 0), // alpha sort a list
T(sortableunserialise,
                   1, 1, N, 0), // decode with Xapian::sortable_unserialise
T(split,           1, 2, N, 0), // split a string to give a list
T(srandom,         1, 1, N, 0), // seed for random number
T(stoplist,        0, 0, N, Q), // return list of stopped terms
T(sub,             2, 2, N, 0), // subtract
T(subdb,           0, 1, N, 0), // name of subdb docid is in
T(subid,           0, 1, N, 0), // docid in the subdb#
T(substr,          2, 3, N, 0), // substring
T(suggestion,      0, 0, N, Q), // misspelled word correction suggestion
T(switch,          3, N, 1, 0), // multi-way conditional on the first argument
                                // (only that argument is evaluated up front)
T(termprefix,      1, 1, N, 0), // get any prefix from a term
T(terms,           0, 1, N, M), // list of matching terms
T(thispage,        0, 0, N, M), // page number of current page
T(time,            0, 0, N, M), // how long the match took (in seconds)
T(topdoc,          0, 0, N, M), // first document on current page of hit list
                                // (counting from 0)
T(topterms,        0, 1, N, M), // list of up to N top relevance feedback terms
                                // (default 16)
T(transform,       3, 4, N, 0), // transform with a regexp
T(truncate,        2, 4, N, 0), // truncate after a word
T(uniq,            1, 1, N, 0), // remove duplicates from a sorted list
T(unique,          1, 1, N, 0), // remove duplicates from any list
T(unpack,          1, 1, N, 0), // convert 4 byte big endian binary string to a number
T(unprefix,        1, 1, N, 0), // remove any prefix from a term
T(unstem,          1, 1, N, Q), // return list of terms from the parsed query
                                // which stemmed to this term
T(upper,           1, 1, N, 0), // convert string to upper case
T(url,             1, 1, N, 0), // url encode argument
T(value,           1, 2, N, 0), // return document value
T(valuelowerbound, 1, 1, N, 0), // return value slot lower bound
T(valueupperbound, 1, 1, N, 0), // return value slot upper bound
T(version,         0, 0, N, 0), // omega version string
T(weight,          0, 0, N, 0), // weight of the current hit
{ NULL,{0,         0, 0, 0, 0}}
};

#undef T // Leaving T defined screws up Sun's C++ compiler!
1243
// Bodies of the macros defined with $def, in order of definition.  $def
// gives a new macro the tag CMD_MACRO + macros.size() just before appending
// its body here, so (tag - CMD_MACRO) is that body's index in this vector.
static vector<string> macros;
1245
1246 // Call write() repeatedly until all data is written or we get a
1247 // non-recoverable error.
1248 static ssize_t
1249 write_all(int fd, const char * buf, size_t count)
1250 {
1251     while (count) {
1252         ssize_t r = write(fd, buf, count);
1253         if (rare(r < 0)) {
1254             if (errno == EINTR) continue;
1255             return r;
1256         }
1257         buf += r;
1258         count -= r;
1259     }
1260     return 0;
1261 }
1262
// mersenne twister for RNG
static mt19937 rng;
// NOTE(review): seed_set isn't used in this part of the file; presumably it
// records whether rng has been given a seed yet - confirm where $random and
// $srandom are implemented.
static bool seed_set = false;

// Forward declaration: eval() and foreach() (below) call each other.
static string eval(const string& fmt, vector<string>& param);
1268
1269 /** Implements $foreach{} and $map{}. */
1270 static string
1271 foreach(const string& list,
1272         const string& pat,
1273         vector<string>& param,
1274         char sep = '\0')
1275 {
1276     string result;
1277     string saved_arg0 = std::move(param[0]);
1278     string::size_type i = 0, j;
1279     while (true) {
1280         j = list.find('\t', i);
1281         param[0].assign(list, i, j - i);
1282         result += eval(pat, param);
1283         if (j == string::npos) break;
1284         if (sep) result += sep;
1285         i = j + 1;
1286     }
1287     param[0] = std::move(saved_arg0);
1288     return result;
1289 }
1290
1291 static string
1292 eval(const string& fmt, vector<string>& param)
1293 {
1294     static map<string, const struct func_attrib *> func_map;
1295     if (func_map.empty()) {
1296         for (auto p = func_tab; p->name != NULL; ++p) {
1297             func_map[string(p->name)] = &(p->a);
1298         }
1299     }
1300     string res;
1301     string::size_type p = 0, q;
1302     while ((q = fmt.find('$', p)) != string::npos) try {
1303         res.append(fmt, p, q - p);
1304         string::size_type code_start = q; // note down for error reporting
1305         q++;
1306         if (q >= fmt.size()) break;
1307         unsigned char ch = fmt[q];
1308         switch (ch) {
1309             // Magic sequences:
1310             // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1311             case '$':
1312                 res += '$';
1313                 p = q + 1;
1314                 continue;
1315             case '(':
1316                 res += '{';
1317                 p = q + 1;
1318                 continue;
1319             case ')':
1320                 res += '}';
1321                 p = q + 1;
1322                 continue;
1323             case '.':
1324                 res += ',';
1325                 p = q + 1;
1326                 continue;
1327             case '_':
1328                 ch = '0';
1329                 // FALL THRU
1330             case '1': case '2': case '3': case '4': case '5':
1331             case '6': case '7': case '8': case '9':
1332                 ch -= '0';
1333                 if (ch < param.size()) res += param[ch];
1334                 p = q + 1;
1335                 continue;
1336             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1337             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1338             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1339             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1340             case 'y': case 'z':
1341             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1342             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1343             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1344             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1345             case 'Y': case 'Z':
1346             case '{':
1347                 break;
1348             default:
1349                 string msg = "Unknown $ code in: $";
1350                 msg.append(fmt, q, string::npos);
1351                 throw msg;
1352         }
1353         p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1354         string var(fmt, q, p - q);
1355         map<string, const struct func_attrib *>::const_iterator func;
1356         func = func_map.find(var);
1357         if (func == func_map.end()) {
1358             throw "Unknown function '" + var + "'";
1359         }
1360         vector<string> args;
1361         if (fmt[p] == '{') {
1362             q = p + 1;
1363             int nest = 1;
1364             while (true) {
1365                 p = fmt.find_first_of(",{}", p + 1);
1366                 if (p == string::npos)
1367                     throw "missing } in " + fmt.substr(code_start);
1368                 if (fmt[p] == '{') {
1369                     ++nest;
1370                 } else {
1371                     if (nest == 1) {
1372                         // should we split the args
1373                         if (func->second->minargs != N) {
1374                             args.push_back(fmt.substr(q, p - q));
1375                             q = p + 1;
1376                         }
1377                     }
1378                     if (fmt[p] == '}' && --nest == 0) break;
1379                 }
1380             }
1381             if (func->second->minargs == N)
1382                 args.push_back(fmt.substr(q, p - q));
1383             ++p;
1384         }
1385
1386         if (func->second->minargs != N) {
1387             if (int(args.size()) < func->second->minargs)
1388                 throw "too few arguments to $" + var;
1389             if (func->second->maxargs != N &&
1390                 int(args.size()) > func->second->maxargs)
1391                 throw "too many arguments to $" + var;
1392
1393             vector<string>::size_type n;
1394             if (func->second->evalargs != N)
1395                 n = func->second->evalargs;
1396             else
1397                 n = args.size();
1398
1399             for (vector<string>::size_type j = 0; j < n; ++j)
1400                 args[j] = eval(args[j], param);
1401         }
1402         if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1403             ensure_query_parsed();
1404         if (func->second->ensure == 'M') ensure_match();
1405         string value;
1406         switch (func->second->tag) {
1407             case CMD_:
1408                 break;
1409             case CMD_add: {
1410                 int total = 0;
1411                 for (auto&& arg : args)
1412                     total += string_to_int(arg);
1413                 value = str(total);
1414                 break;
1415             }
1416             case CMD_addfilter:
1417                 if (args.size() == 1 || args[1].empty() || args[1] == "B") {
1418                     add_bterm(args[0]);
1419                 } else if (args[1] == "N") {
1420                     add_nterm(args[0]);
1421                 } else {
1422                     string msg = "Invalid $addfilter type '";
1423                     msg += args[1];
1424                     msg += "'";
1425                     throw msg;
1426                 }
1427                 break;
1428             case CMD_allterms: {
1429                 // list of all terms indexing document
1430                 Xapian::docid id = q0;
1431                 if (!args.empty() &&
1432                     (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
1433                     throw "Document id for command allterms should be > 0";
1434                 }
1435                 for (Xapian::TermIterator term = db.termlist_begin(id);
1436                      term != db.termlist_end(id); ++term) {
1437                     value += *term;
1438                     value += '\t';
1439                 }
1440
1441                 if (!value.empty()) value.erase(value.size() - 1);
1442                 break;
1443             }
1444             case CMD_and: {
1445                 value = "true";
1446                 for (auto&& arg : args) {
1447                     if (eval(arg, param).empty()) {
1448                         value.resize(0);
1449                         break;
1450                     }
1451                 }
1452                 break;
1453             }
1454             case CMD_base64: {
1455                 const static char encode[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdef"
1456                                              "ghijklmnopqrstuvwxyz0123456789+/";
1457                 const char pad = '=';
1458                 const string& input = args[0];
1459                 value.reserve((input.size() + 2) / 3 * 4);
1460                 auto it = input.begin();
1461                 auto n = input.size() / 3;
1462                 for ( ; n; --n) {
1463                     uint32_t v = uint8_t(*it++);
1464                     v = (v << 8) | uint8_t(*it++);
1465                     v = (v << 8) | uint8_t(*it++);
1466                     value += encode[v >> 18];
1467                     value += encode[(v >> 12) & 63];
1468                     value += encode[(v >> 6) & 63];
1469                     value += encode[v & 63];
1470                 }
1471                 switch (input.size() % 3) {
1472                     case 2: {
1473                         uint32_t v = uint8_t(*it++);
1474                         v = (v << 8) | uint8_t(*it++);
1475                         value += encode[v >> 10];
1476                         value += encode[(v >> 4) & 63];
1477                         value += encode[(v << 2) & 63];
1478                         value += pad;
1479                         break;
1480                     }
1481                     case 1: {
1482                         uint32_t v = uint8_t(*it++);
1483                         value += encode[v >> 2];
1484                         value += encode[(v << 4) & 63];
1485                         value += pad;
1486                         value += pad;
1487                         break;
1488                     }
1489                 }
1490                 break;
1491             }
1492             case CMD_cgi: {
1493                 auto i = cgi_params.find(args[0]);
1494                 if (i != cgi_params.end()) value = i->second;
1495                 break;
1496             }
1497             case CMD_cgilist: {
1498                 auto g = cgi_params.equal_range(args[0]);
1499                 for (auto i = g.first; i != g.second; ++i) {
1500                     value += i->second;
1501                     value += '\t';
1502                 }
1503                 if (!value.empty()) value.erase(value.size() - 1);
1504                 break;
1505             }
1506             case CMD_cgiparams: {
1507                 const string* prev = NULL;
1508                 for (auto&& i : cgi_params) {
1509                     if (prev && i.first == *prev) continue;
1510                     value += i.first;
1511                     value += '\t';
1512                     prev = &i.first;
1513                 }
1514                 if (!value.empty()) value.erase(value.size() - 1);
1515                 break;
1516             }
1517             case CMD_chr: {
1518                 unsigned int codepoint;
1519                 if (!parse_unsigned(args[0].c_str(), codepoint)) {
1520                     throw "Unicode codepoint for command chr should be >= 0";
1521                 }
1522                 Xapian::Unicode::append_utf8(value, codepoint);
1523                 break;
1524             }
1525             case CMD_collapsed: {
1526                 value = str(collapsed);
1527                 break;
1528             }
1529             case CMD_cond:
1530                 for (size_t i = 0; i < args.size(); i += 2) {
1531                     if (i == args.size() - 1) {
1532                         // Handle optional "else" value.
1533                         value = eval(args[i], param);
1534                         break;
1535                     }
1536                     if (!eval(args[i], param).empty()) {
1537                         value = eval(args[i + 1], param);
1538                         break;
1539                     }
1540                 }
1541                 break;
1542             case CMD_contains: {
1543                 size_t pos = args[1].find(args[0]);
1544                 if (pos != string::npos) {
1545                     value = str(pos);
1546                 }
1547                 break;
1548             }
1549             case CMD_csv:
1550                 value = args[0];
1551                 if (args.size() > 1 && !args[1].empty()) {
1552                     csv_escape_always(value);
1553                 } else {
1554                     csv_escape(value);
1555                 }
1556                 break;
1557             case CMD_date:
1558                 value = args[0];
1559                 if (!value.empty()) {
1560                     char buf[64] = "";
1561                     time_t date;
1562                     if (!parse_signed(value.c_str(), date)) {
1563                         throw "Date (in secs) for command date should "
1564                               "be an integer";
1565                     }
1566                     if (date != static_cast<time_t>(-1)) {
1567                         struct tm *then;
1568                         then = gmtime(&date);
1569                         string date_fmt = "%Y-%m-%d";
1570                         if (args.size() > 1) date_fmt = eval(args[1], param);
1571                         strftime(buf, sizeof buf, date_fmt.c_str(), then);
1572                     }
1573                     value = buf;
1574                 }
1575                 break;
1576             case CMD_dbname:
1577                 value = dbname;
1578                 break;
1579             case CMD_dbsize: {
1580                 static Xapian::doccount dbsize;
1581                 if (!dbsize) dbsize = db.get_doccount();
1582                 value = str(dbsize);
1583                 break;
1584             }
1585             case CMD_def: {
1586                 func_attrib *fa = new func_attrib;
1587                 fa->tag = CMD_MACRO + macros.size();
1588                 fa->minargs = 0;
1589                 fa->maxargs = 9;
1590                 fa->evalargs = N; // FIXME: or 0?
1591                 fa->ensure = 0;
1592
1593                 macros.push_back(args[1]);
1594                 func_map[args[0]] = fa;
1595                 break;
1596             }
1597             case CMD_defaultop:
1598                 if (default_op == Xapian::Query::OP_AND) {
1599                     value = "and";
1600                 } else {
1601                     value = "or";
1602                 }
1603                 break;
1604             case CMD_div: {
1605                 int denom = string_to_int(args[1]);
1606                 if (denom == 0) {
1607                     value = "divide by 0";
1608                 } else {
1609                     value = str(string_to_int(args[0]) / denom);
1610                 }
1611                 break;
1612             }
1613             case CMD_emptydocs: {
1614                 string t;
1615                 if (!args.empty())
1616                     t = args[0];
1617                 Xapian::PostingIterator i;
1618                 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1619                     if (i.get_doclength() != 0) continue;
1620                     if (!value.empty()) value += '\t';
1621                     value += str(*i);
1622                 }
1623                 break;
1624             }
1625             case CMD_env: {
1626                 char *env = getenv(args[0].c_str());
1627                 if (env != NULL) value = env;
1628                 break;
1629             }
1630             case CMD_eq:
1631                 if (args[0] == args[1]) value = "true";
1632                 break;
1633             case CMD_error:
1634                 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1635                     error_msg = "Database '" + dbname + "' couldn't be opened";
1636                 }
1637                 value = error_msg;
1638                 break;
1639             case CMD_field: {
1640                 Xapian::docid did = q0;
1641                 if (args.size() > 1 &&
1642                     (!parse_unsigned(args[1].c_str(), did) || did == 0)) {
1643                     throw "Document id for command field should be > 0";
1644                 }
1645                 value = fields.get_field(did, args[0]);
1646                 break;
1647             }
1648             case CMD_filesize: {
1649                 if (args[0].empty()) break;
1650                 // FIXME: rounding?  i18n?
1651                 int size;
1652                 if (!parse_signed(args[0].c_str(), size)) {
1653                     throw "Filesize must be an integer";
1654                 }
1655                 int intpart = size;
1656                 int fraction = -1;
1657                 const char * format = 0;
1658                 if (size < 0) {
1659                     // Negative size -> empty result.
1660                 } else if (size == 1) {
1661                     format = "%d byte";
1662                 } else if (size < 1024) {
1663                     format = "%d bytes";
1664                 } else {
1665                     if (size < 1024 * 1024) {
1666                         format = "%d.%cK";
1667                     } else {
1668                         size /= 1024;
1669                         if (size < 1024 * 1024) {
1670                             format = "%d.%cM";
1671                         } else {
1672                             size /= 1024;
1673                             format = "%d.%cG";
1674                         }
1675                     }
1676                     intpart = unsigned(size) / 1024;
1677                     fraction = unsigned(size) % 1024;
1678                 }
1679                 if (format) {
1680                     char buf[200];
1681                     int len;
1682                     if (fraction == -1) {
1683                         len = snprintf(buf, sizeof(buf), format, intpart);
1684                     } else {
1685                         fraction = (fraction * 10 / 1024) + '0';
1686                         len = snprintf(buf, sizeof(buf), format, intpart, fraction);
1687                     }
1688                     if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1689                     value.assign(buf, len);
1690                 }
1691                 break;
1692             }
1693             case CMD_filters:
1694                 value = args.size() ? old_filters : filters;
1695                 break;
1696             case CMD_filterterms: {
1697                 Xapian::TermIterator term = db.allterms_begin();
1698                 term.skip_to(args[0]);
1699                 while (term != db.allterms_end()) {
1700                     string t = *term;
1701                     if (!startswith(t, args[0])) break;
1702                     value += t;
1703                     value += '\t';
1704                     ++term;
1705                 }
1706
1707                 if (!value.empty()) value.erase(value.size() - 1);
1708                 break;
1709             }
1710             case CMD_find: {
1711                 string l = args[0], s = args[1];
1712                 string::size_type i = 0, j = 0;
1713                 size_t count = 0;
1714                 while (j != l.size()) {
1715                     j = l.find('\t', i);
1716                     if (j == string::npos) j = l.size();
1717                     if (j - i == s.length()) {
1718                         if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1719                             value = str(count);
1720                             break;
1721                         }
1722                     }
1723                     ++count;
1724                     i = j + 1;
1725                 }
1726                 break;
1727             }
1728             case CMD_fmt:
1729                 value = fmtname;
1730                 break;
1731             case CMD_foreach:
1732                 if (!args[0].empty()) {
1733                     value = foreach(args[0], args[1], param);
1734                 }
1735                 break;
1736             case CMD_freq: {
1737                 const string& term = args[0];
1738                 Xapian::doccount termfreq = 0;
1739                 if (done_query) {
1740                     termfreq = mset.get_termfreq(term);
1741                 }
1742                 if (termfreq == 0) {
1743                     // We want $freq to work before the match is run, and we
1744                     // don't want using it to force the match to run.
1745                     termfreq = db.get_termfreq(term);
1746                 }
1747                 value = str(termfreq);
1748                 break;
1749             }
1750             case CMD_ge:
1751                 if (string_to_int(args[0]) >= string_to_int(args[1]))
1752                     value = "true";
1753                 break;
1754             case CMD_gt:
1755                 if (string_to_int(args[0]) > string_to_int(args[1]))
1756                     value = "true";
1757                 break;
1758             case CMD_hash: {
1759                 const string& data = args[0];
1760                 const string& hash = args[1];
1761                 if (hash == "md5") {
1762                     string md5;
1763                     md5_string(data, md5);
1764                     value.reserve(md5.size() * 2);
1765                     for (unsigned char byte : md5) {
1766                         value += "0123456789abcdef"[byte >> 4];
1767                         value += "0123456789abcdef"[byte & 0x0f];
1768                     }
1769                 } else {
1770                     throw "Unknown hash function: " + hash;
1771                 }
1772                 break;
1773             }
1774             case CMD_highlight: {
1775                 string bra, ket;
1776                 if (args.size() > 2) {
1777                     bra = args[2];
1778                     if (args.size() > 3) {
1779                         ket = args[3];
1780                     } else {
1781                         string::const_iterator i;
1782                         i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1783                         ket = "</";
1784                         ket.append(bra, 1, i - bra.begin() - 1);
1785                         ket += '>';
1786                     }
1787                 }
1788
1789                 value = html_highlight(args[0], args[1], bra, ket);
1790                 break;
1791             }
1792             case CMD_hit:
1793                 // 0-based mset index
1794                 value = str(hit_no);
1795                 break;
1796             case CMD_hitlist: {
1797 #if 0
1798                 url_query_string = "?DB=";
1799                 url_query_string += dbname;
1800                 for (auto& j : query_strings) {
1801                     if (j.first.empty()) {
1802                         url_query_string += "&P=";
1803                     } else {
1804                         url_query_string += "&P."
1805                         url_query_string += j.first;
1806                         url_query_string += '=';
1807                     }
1808                     const char *q = j.second.c_str();
1809                     int ch;
1810                     while ((ch = *q++) != '\0') {
1811                         switch (ch) {
1812                           case '+':
1813                             url_query_string += "%2b";
1814                             break;
1815                           case '"':
1816                             url_query_string += "%22";
1817                             break;
1818                           case '%':
1819                             url_query_string += "%25";
1820                             break;
1821                           case '&':
1822                             url_query_string += "%26";
1823                             break;
1824                           case ' ':
1825                             ch = '+';
1826                             /* fall through */
1827                           default:
1828                             url_query_string += ch;
1829                         }
1830                     }
1831                 }
1832                 // add any boolean terms
1833                 for (auto i = filter_map.begin(); i != filter_map.end(); ++i) {
1834                     url_query_string += "&B=";
1835                     url_query_string += i->second;
1836                 }
1837 #endif
1838                 auto save_hit_no = hit_no;
1839                 for (hit_no = topdoc; hit_no < last; ++hit_no)
1840                     value += print_caption(args[0], param);
1841                 hit_no = save_hit_no;
1842                 break;
1843             }
1844             case CMD_hitsperpage:
1845                 value = str(hits_per_page);
1846                 break;
1847             case CMD_hostname: {
1848                 value = args[0];
1849                 // remove URL scheme and/or path
1850                 string::size_type i = value.find("://");
1851                 if (i == string::npos) i = 0; else i += 3;
1852                 value = value.substr(i, value.find('/', i) - i);
1853                 // remove user@ or user:password@
1854                 i = value.find('@');
1855                 if (i != string::npos) value.erase(0, i + 1);
1856                 // remove :port
1857                 i = value.find(':');
1858                 if (i != string::npos) value.resize(i);
1859                 break;
1860             }
1861             case CMD_html:
1862                 value = html_escape(args[0]);
1863                 break;
1864             case CMD_htmlstrip:
1865                 value = html_strip(args[0]);
1866                 break;
1867             case CMD_httpheader:
1868                 if (!suppress_http_headers) {
1869                     cout << args[0] << ": " << args[1] << endl;
1870                     if (!set_content_type && args[0].length() == 12 &&
1871                             strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1872                         set_content_type = true;
1873                     }
1874                 }
1875                 break;
1876             case CMD_id:
1877                 // document id
1878                 value = str(q0);
1879                 break;
1880             case CMD_if:
1881                 if (args.size() > 1 && !args[0].empty())
1882                     value = eval(args[1], param);
1883                 else if (args.size() > 2)
1884                     value = eval(args[2], param);
1885                 break;
1886             case CMD_include: {
1887                 if (args.size() == 1) {
1888                     value = eval_file(args[0]);
1889                 } else {
1890                     bool fallback = false;
1891                     value = eval_file(args[0], &fallback);
1892                     if (fallback) {
1893                         value = eval(args[1], param);
1894                     }
1895                 }
1896                 break;
1897             }
1898             case CMD_json:
1899                 value = args[0];
1900                 json_escape(value);
1901                 break;
1902             case CMD_jsonarray: {
1903                 const string & l = args[0];
1904                 string::size_type i = 0, j;
1905                 if (l.empty()) {
1906                     value = "[]";
1907                     break;
1908                 }
1909                 vector<string> new_args(1);
1910                 value = "[";
1911                 while (true) {
1912                     j = l.find('\t', i);
1913                     string elt(l, i, j - i);
1914                     if (args.size() == 1) {
1915                         value += '"';
1916                         json_escape(elt);
1917                         value += elt;
1918                         value += '"';
1919                     } else {
1920                         new_args[0] = std::move(elt);
1921                         value += eval(args[1], new_args);
1922                     }
1923                     if (j == string::npos) break;
1924                     value += ',';
1925                     i = j + 1;
1926                 }
1927                 value += ']';
1928                 break;
1929             }
1930             case CMD_jsonbool:
1931                 value = args[0].empty() ? "false" : "true";
1932                 break;
1933             case CMD_jsonobject: {
1934                 vector<string> new_args(1);
1935
1936                 class map_range {
1937                     typedef map<string, string>::const_iterator iterator;
1938                     iterator b, e;
1939
1940                   public:
1941                     map_range(iterator b_, iterator e_) : b(b_), e(e_) {}
1942
1943                     iterator begin() const { return b; }
1944                     iterator end() const { return e; }
1945                 };
1946
1947                 string prefix = args[0] + ',';
1948                 auto b = option.lower_bound(prefix);
1949                 ++prefix.back();
1950                 auto e = option.lower_bound(prefix);
1951                 value = to_json(map_range(b, e),
1952                                 [&](const string& k) {
1953                                     string key(k, prefix.size());
1954                                     if (args.size() > 1 && !args[1].empty()) {
1955                                         new_args[0] = std::move(key);
1956                                         key = eval(args[1], new_args);
1957                                     }
1958                                     return key;
1959                                 },
1960                                 [&](const string& v) {
1961                                     if (args.size() > 2 && !args[2].empty()) {
1962                                         new_args[0] = v;
1963                                         return eval(args[2], new_args);
1964                                     }
1965                                     string r(1, '"');
1966                                     string elt = v;
1967                                     json_escape(elt);
1968                                     r += elt;
1969                                     r += '"';
1970                                     return r;
1971                                 });
1972                 break;
1973             }
1974             case CMD_jsonobject2: {
1975                 vector<string> new_args(1);
1976
1977                 static string dummy;
1978
1979                 class list_range {
1980                     const string& keys;
1981                     const string& values;
1982
1983                   public:
1984                     class iterator {
1985                         const string& keys;
1986                         const string& values;
1987                         string::size_type ki = 0;
1988                         string::size_type kj;
1989                         string::size_type vi = 0;
1990                         string::size_type vj;
1991
1992                       public:
1993                         iterator()
1994                             : keys(dummy), values(dummy),
1995                               ki(string::npos), vi(string::npos) {}
1996
1997                         iterator(const string& k, const string& v)
1998                             : keys(k), values(v) {
1999                             if (keys.empty() && values.empty()) {
2000                                 // Don't treat this as: { "": "" }
2001                                 ki = kj = vi = vj = string::npos;
2002                             } else {
2003                                 kj = keys.find('\t');
2004                                 vj = values.find('\t');
2005                             }
2006                         }
2007
2008                         pair<string, string> operator*() const {
2009                             return {keys.substr(ki, kj - ki),
2010                                     values.substr(vi, vj - vi)};
2011                         }
2012
2013                         iterator& operator++() {
2014                             ki = kj;
2015                             if (ki != string::npos) {
2016                                 ++ki;
2017                                 kj = keys.find('\t', ki);
2018                             }
2019                             vi = vj;
2020                             if (vi != string::npos) {
2021                                 ++vi;
2022                                 vj = values.find('\t', vi);
2023                             }
2024                             if ((ki == string::npos) !=
2025                                 (vi == string::npos)) {
2026                                 throw "$jsonobject2: Different number of keys "
2027                                       "and values";
2028                             }
2029                             return *this;
2030                         }
2031
2032                         iterator operator++(int) {
2033                             iterator r = *this;
2034                             operator++();
2035                             return r;
2036                         }
2037
2038                         bool operator==(const iterator& o) const {
2039                             return ki == o.ki && vi == o.vi;
2040                         }
2041
2042                         bool operator!=(const iterator& o) const {
2043                             return !(*this == o);
2044                         }
2045                     };
2046
2047                     list_range(const string& k, const string& v)
2048                         : keys(k), values(v) { }
2049
2050                     iterator begin() const { return iterator(keys, values); }
2051                     iterator end() const { return iterator(); }
2052                 };
2053
2054                 value = to_json(list_range(args[0], args[1]),
2055                                 [&](const string& k) {
2056                                     string key = k;
2057                                     if (args.size() > 2 && !args[2].empty()) {
2058                                         new_args[0] = std::move(key);
2059                                         key = eval(args[2], new_args);
2060                                     }
2061                                     return key;
2062                                 },
2063                                 [&](const string& v) {
2064                                     if (args.size() > 3 && !args[3].empty()) {
2065                                         new_args[0] = v;
2066                                         return eval(args[3], new_args);
2067                                     }
2068                                     string r(1, '"');
2069                                     string elt = v;
2070                                     json_escape(elt);
2071                                     r += elt;
2072                                     r += '"';
2073                                     return r;
2074                                 });
2075                 break;
2076             }
2077             case CMD_keys: {
2078                 string prefix = args[0] + ',';
2079                 auto i = option.lower_bound(prefix);
2080                 for (; i != option.end() && startswith(i->first, prefix); ++i) {
2081                     const string& key = i->first;
2082                     if (!value.empty()) value += '\t';
2083                     value.append(key, prefix.size(), string::npos);
2084                 }
2085                 break;
2086             }
2087             case CMD_last:
2088                 value = str(last);
2089                 break;
2090             case CMD_lastpage: {
2091                 int l = mset.get_matches_estimated();
2092                 if (l > 0) l = (l - 1) / hits_per_page + 1;
2093                 value = str(l);
2094                 break;
2095             }
2096             case CMD_le:
2097                 if (string_to_int(args[0]) <= string_to_int(args[1]))
2098                     value = "true";
2099                 break;
2100             case CMD_length:
2101                 if (args[0].empty()) {
2102                     value = "0";
2103                 } else {
2104                     size_t length = count(args[0].begin(), args[0].end(), '\t');
2105                     value = str(length + 1);
2106                 }
2107                 break;
2108             case CMD_list: {
2109                 if (!args[0].empty()) {
2110                     string pre, inter, interlast, post;
2111                     switch (args.size()) {
2112                      case 2:
2113                         inter = interlast = args[1];
2114                         break;
2115                      case 3:
2116                         inter = args[1];
2117                         interlast = args[2];
2118                         break;
2119                      case 4:
2120                         pre = args[1];
2121                         inter = interlast = args[2];
2122                         post = args[3];
2123                         break;
2124                      case 5:
2125                         pre = args[1];
2126                         inter = args[2];
2127                         interlast = args[3];
2128                         post = args[4];
2129                         break;
2130                     }
2131                     value += pre;
2132                     string list = args[0];
2133                     string::size_type split = 0, split2;
2134                     while ((split2 = list.find('\t', split)) != string::npos) {
2135                         if (split) value += inter;
2136                         value.append(list, split, split2 - split);
2137                         split = split2 + 1;
2138                     }
2139                     if (split) value += interlast;
2140                     value.append(list, split, string::npos);
2141                     value += post;
2142                 }
2143                 break;
2144             }
2145             case CMD_log: {
2146                 if (!vet_filename(args[0])) {
2147                     value = "filename can't contain \"..\"";
2148                     break;
2149                 }
2150                 string logfile = log_dir + args[0];
2151                 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
2152                 if (fd == -1) {
2153                     value = "open failed: ";
2154                     value += strerror(errno);
2155                     break;
2156                 }
2157                 vector<string> noargs;
2158                 noargs.resize(1);
2159                 string line;
2160                 if (args.size() > 1) {
2161                     line = args[1];
2162                 } else {
2163                     line = DEFAULT_LOG_ENTRY;
2164                 }
2165                 line = eval(line, noargs);
2166                 line += '\n';
2167                 if (write_all(fd, line.data(), line.length()) < 0) {
2168                     value = "write failed: ";
2169                     value += strerror(errno);
2170                 }
2171                 close(fd);
2172                 break;
2173             }
2174             case CMD_lookup: {
2175                 if (!vet_filename(args[0])) break;
2176                 string cdbfile = cdb_dir + args[0];
2177                 int fd = open(cdbfile.c_str(), O_RDONLY);
2178                 if (fd == -1) break;
2179
2180                 struct cdb cdb;
2181                 if (cdb_init(&cdb, fd) < 0) {
2182                     close(fd);
2183                     break;
2184                 }
2185
2186                 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
2187                     size_t datalen = cdb_datalen(&cdb);
2188                     const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
2189                     if (dat) {
2190                         value.assign(static_cast<const char *>(dat), datalen);
2191                     }
2192                 }
2193
2194                 cdb_free(&cdb);
2195                 close(fd); // FIXME: cache fds?
2196                 break;
2197             }
2198             case CMD_lower:
2199                 value = Xapian::Unicode::tolower(args[0]);
2200                 break;
2201             case CMD_lt:
2202                 if (string_to_int(args[0]) < string_to_int(args[1]))
2203                     value = "true";
2204                 break;
2205             case CMD_map:
2206                 if (!args[0].empty()) {
2207                     value = foreach(args[0], args[1], param, '\t');
2208                 }
2209                 break;
2210             case CMD_match:
2211                 omegascript_match(value, args);
2212                 break;
2213             case CMD_max: {
2214                 vector<string>::const_iterator i = args.begin();
2215                 int val = string_to_int(*i++);
2216                 for (; i != args.end(); ++i) {
2217                     int x = string_to_int(*i);
2218                     if (x > val) val = x;
2219                 }
2220                 value = str(val);
2221                 break;
2222             }
2223             case CMD_min: {
2224                 vector<string>::const_iterator i = args.begin();
2225                 int val = string_to_int(*i++);
2226                 for (; i != args.end(); ++i) {
2227                     int x = string_to_int(*i);
2228                     if (x < val) val = x;
2229                 }
2230                 value = str(val);
2231                 break;
2232             }
2233             case CMD_mod: {
2234                 int denom = string_to_int(args[1]);
2235                 if (denom == 0) {
2236                     value = "divide by 0";
2237                 } else {
2238                     value = str(string_to_int(args[0]) % denom);
2239                 }
2240                 break;
2241             }
2242             case CMD_msize:
2243                 // Estimated number of matches.
2244                 value = str(mset.get_matches_estimated());
2245                 break;
2246             case CMD_msizeexact:
2247                 // Is msize exact?
2248                 if (mset.get_matches_lower_bound()
2249                     == mset.get_matches_upper_bound())
2250                     value = "true";
2251                 break;
2252             case CMD_msizelower:
2253                 // Lower bound on number of matches.
2254                 value = str(mset.get_matches_lower_bound());
2255                 break;
2256             case CMD_msizeupper:
2257                 // Upper bound on number of matches.
2258                 value = str(mset.get_matches_upper_bound());
2259                 break;
2260             case CMD_mul: {
2261                 vector<string>::const_iterator i = args.begin();
2262                 int total = string_to_int(*i++);
2263                 while (i != args.end())
2264                     total *= string_to_int(*i++);
2265                 value = str(total);
2266                 break;
2267             }
2268             case CMD_muldiv: {
2269                 int denom = string_to_int(args[2]);
2270                 if (denom == 0) {
2271                     value = "divide by 0";
2272                 } else {
2273                     int num = string_to_int(args[0]) * string_to_int(args[1]);
2274                     value = str(num / denom);
2275                 }
2276                 break;
2277             }
2278             case CMD_ne:
2279                 if (args[0] != args[1]) value = "true";
2280                 break;
2281             case CMD_nice: {
2282                 string::const_iterator i = args[0].begin();
2283                 int len = args[0].length();
2284                 while (len) {
2285                     value += *i++;
2286                     if (--len && len % 3 == 0) value += option["thousand"];
2287                 }
2288                 break;
2289             }
2290             case CMD_not:
2291                 if (args[0].empty()) value = "true";
2292                 break;
2293             case CMD_now:
2294                 value = str(static_cast<unsigned long>(time(NULL)));
2295                 break;
2296             case CMD_opt:
2297                 if (args.size() == 2) {
2298                     value = option[args[0] + "," + args[1]];
2299                 } else {
2300                     value = option[args[0]];
2301                 }
2302                 break;
2303             case CMD_or: {
2304                 for (auto&& arg : args) {
2305                     value = eval(arg, param);
2306                     if (!value.empty()) break;
2307                 }
2308                 break;
2309             }
2310             case CMD_ord: {
2311                 if (!args[0].empty()) {
2312                     Utf8Iterator it(args[0]);
2313                     value = str(*it);
2314                 }
2315                 break;
2316             }
2317             case CMD_pack: {
2318                 int number;
2319                 if (!parse_signed(args[0].c_str(), number)) {
2320                     throw "NUMBER parameter for pack command "
2321                           "must be an integer";
2322                 }
2323                 value = int_to_binary_string(number);
2324                 break;
2325             }
2326             case CMD_percentage:
2327                 // percentage score
2328                 value = str(percent);
2329                 break;
2330             case CMD_prettyterm:
2331                 value = pretty_term(args[0]);
2332                 break;
2333             case CMD_prettyurl:
2334                 value = args[0];
2335                 url_prettify(value);
2336                 break;
2337             case CMD_query: {
2338                 auto r = query_strings.equal_range(args.empty() ?
2339                                                    string() : args[0]);
2340                 for (auto j = r.first; j != r.second; ++j) {
2341                     if (!value.empty()) value += '\t';
2342                     const string & s = j->second;
2343                     size_t start = 0, tab;
2344                     while ((tab = s.find('\t', start)) != string::npos) {
2345                         value.append(s, start, tab - start);
2346                         value += ' ';
2347                         start = tab + 1;
2348                     }
2349                     value.append(s, start, string::npos);
2350                 }
2351                 break;
2352             }
2353             case CMD_querydescription:
2354                 value = query.get_description();
2355                 break;
2356             case CMD_queryterms:
2357                 value = queryterms;
2358                 break;
2359             case CMD_random: {
2360                 if (!seed_set) {
2361                     random_device rd;
2362                     rng.seed(rd());
2363                     seed_set = true;
2364                 }
2365                 uniform_int_distribution<int>
2366                     distr(0, string_to_int(args[0]));
2367                 value = str(distr(rng));
2368                 break;
2369             }
2370             case CMD_range: {
2371                 int start, end;
2372                 if (!parse_signed(args[0].c_str(), start)) {
2373                     throw "Start value for range command "
2374                           "must be an integer";
2375                 }
2376                 if (!parse_signed(args[1].c_str(), end)) {
2377                     throw "End value for range command "
2378                           "must be an integer";
2379                 }
2380                 while (start <= end) {
2381                     value += str(start);
2382                     if (start < end) value += '\t';
2383                     start++;
2384                 }
2385                 break;
2386             }
2387             case CMD_record: {
2388                 Xapian::docid id = q0;
2389                 if (!args.empty() &&
2390                     (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
2391                     throw "Document id for command record should be > 0";
2392                 }
2393                 value = db.get_document(id).get_data();
2394                 break;
2395             }
2396             case CMD_relevant: {
2397                 // document id if relevant; empty otherwise
2398                 Xapian::docid id = q0;
2399                 if (!args.empty() &&
2400                     (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
2401                     throw "Document id for command relevant should be > 0";
2402                 }
2403                 auto i = ticked.find(id);
2404                 if (i != ticked.end()) {
2405                     i->second = false; // icky side-effect
2406                     value = str(id);
2407                 }
2408                 break;
2409             }
2410             case CMD_relevants: {
2411                 for (auto i : ticked) {
2412                     if (i.second) {
2413                         value += str(i.first);
2414                         value += '\t';
2415                     }
2416                 }
2417                 if (!value.empty()) value.erase(value.size() - 1);
2418                 break;
2419             }
2420             case CMD_score:
2421                 // Score (0 to 10)
2422                 value = str(percent / 10);
2423                 break;
2424             case CMD_set:
2425                 option[args[0]] = args[1];
2426                 break;
2427             case CMD_seterror:
2428                 error_msg = args[0];
2429                 break;
2430             case CMD_setmap: {
2431                 string base = args[0] + ',';
2432                 if (args.size() % 2 != 1)
2433                     throw string("$setmap requires an odd number of arguments");
2434                 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2435                     option[base + args[i]] = args[i + 1];
2436                 }
2437                 break;
2438             }
2439             case CMD_setrelevant: {
2440                 string::size_type i = 0, j;
2441                 while (true) {
2442                     j = args[0].find_first_not_of("0123456789", i);
2443                     Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2444                     if (id) {
2445                         rset.add_document(id);
2446                         ticked[id] = true;
2447                     }
2448                     if (j == string::npos) break;
2449                     i = j + 1;
2450                 }
2451                 break;
2452             }
2453             case CMD_slice: {
2454                 string list = args[0], pos = args[1];
2455                 vector<string> items;
2456                 string::size_type i = 0, j;
2457                 while (true) {
2458                     j = list.find('\t', i);
2459                     items.push_back(list.substr(i, j - i));
2460                     if (j == string::npos) break;
2461                     i = j + 1;
2462                 }
2463                 i = 0;
2464                 bool have_added = false;
2465                 while (true) {
2466                     j = pos.find('\t', i);
2467                     int item = string_to_int(pos.substr(i, j - i));
2468                     if (item >= 0 && size_t(item) < items.size()) {
2469                         if (have_added) value += '\t';
2470                         value += items[item];
2471                         have_added = true;
2472                     }
2473                     if (j == string::npos) break;
2474                     i = j + 1;
2475                 }
2476                 break;
2477             }
2478             case CMD_snippet: {
2479                 size_t length = 200;
2480                 if (args.size() > 1 && !args[1].empty()) {
2481                     if (!parse_unsigned(args[1].c_str(), length)) {
2482                         throw "Snippet length must be >= 0";
2483                     }
2484                 }
2485                 unsigned flags = mset.SNIPPET_BACKGROUND_MODEL |
2486                                  mset.SNIPPET_EXHAUSTIVE;
2487                 if (args.size() > 2 && !args[2].empty()) {
2488                     flags = 0;
2489                     const string& s = args[2];
2490                     size_t i = 0;
2491                     while (true) {
2492                         size_t j = s.find('|', i);
2493                         string flag(s, i, j - i);
2494                         for (char& c : flag) {
2495                             c = C_tolower(c);
2496                         }
2497                         if (startswith(flag, "snippet_")) {
2498                             flag.erase(0, CONST_STRLEN("snippet_"));
2499                         }
2500                         if (flag == "background_model") {
2501                             flags |= mset.SNIPPET_BACKGROUND_MODEL;
2502                         } else if (flag == "cjk_ngram") {
2503                             flags |= mset.SNIPPET_CJK_NGRAM;
2504                         } else if (flag == "empty_without_match") {
2505                             flags |= mset.SNIPPET_EMPTY_WITHOUT_MATCH;
2506                         } else if (flag == "exhaustive") {
2507                             flags |= mset.SNIPPET_EXHAUSTIVE;
2508                         } else if (flag == "ngrams") {
2509                             flags |= mset.SNIPPET_NGRAMS;
2510                         } else if (flag == "word_breaks") {
2511                             flags |= mset.SNIPPET_WORD_BREAKS;
2512                         } else {
2513                             throw "Unknown $snippet flag '" + flag + "'";
2514                         }
2515                         if (j == string::npos) break;
2516                         i = j + 1;
2517                     }
2518                 }
2519                 string bra, ket, gap;
2520                 if (args.size() > 3) {
2521                     bra = args[3];
2522                 } else {
2523                     bra = "<strong>";
2524                 }
2525                 if (args.size() > 4) {
2526                     ket = args[4];
2527                 } else {
2528                     ket = "</strong>";
2529                 }
2530                 if (args.size() > 5) {
2531                     gap = args[5];
2532                 } else {
2533                     gap = "...";
2534                 }
2535                 if (!stemmer)
2536                     stemmer = new Xapian::Stem(option["stemmer"]);
2537                 value = mset.snippet(args[0], length, *stemmer, flags,
2538                                      bra, ket, gap);
2539                 break;
2540             }
2541             case CMD_sort:
2542                 omegascript_sort(args, value);
2543                 break;
2544             case CMD_sortableunserialise:
2545                 // FIXME: This uses printf %f - maybe we want more than 6
2546                 // decimal places in some cases though...
2547                 value = double_to_string(Xapian::sortable_unserialise(args[0]));
2548                 break;
2549             case CMD_split: {
2550                 string split;
2551                 if (args.size() == 1) {
2552                     split = " ";
2553                     value = args[0];
2554                 } else {
2555                     split = args[0];
2556                     value = args[1];
2557                 }
2558                 string::size_type i = 0;
2559                 while (true) {
2560                     if (split.empty()) {
2561                         ++i;
2562                         if (i >= value.size()) break;
2563                     } else {
2564                         i = value.find(split, i);
2565                         if (i == string::npos) break;
2566                     }
2567                     value.replace(i, split.size(), 1, '\t');
2568                     ++i;
2569                 }
2570                 break;
2571             }
2572             case CMD_srandom: {
2573                 int seed = string_to_int(args[0]);
2574                 rng.seed(seed);
2575                 seed_set = true;
2576                 break;
2577             }
2578             case CMD_stoplist: {
2579                 Xapian::TermIterator i = qp.stoplist_begin();
2580                 Xapian::TermIterator end = qp.stoplist_end();
2581                 while (i != end) {
2582                     if (!value.empty()) value += '\t';
2583                     value += *i;
2584                     ++i;
2585                 }
2586                 break;
2587             }
2588             case CMD_sub:
2589                 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2590                 break;
2591             case CMD_subdb: {
2592                 Xapian::docid id = q0;
2593                 if (args.size() > 0 &&
2594                     (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
2595                     throw "Document id of the subdb command should be > 0";
2596                 }
2597                 value = subdbs[(id - 1) % subdbs.size()].get_name();
2598                 break;
2599             }
2600             case CMD_subid: {
2601                 Xapian::docid id = q0;
2602                 if (args.size() > 0 &&
2603                     (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
2604                     throw "Document id of the subid command should be > 0";
2605                 }
2606                 // This is the docid in the single shard.
2607                 Xapian::docid shard_did = (id - 1) / subdbs.size() + 1;
2608                 // We now need to map this back to the docid in the collection
2609                 // of shards specified by the DB parameter value which $subdb
2610                 // returns.
2611                 const SubDB& subdb = subdbs[(id - 1) % subdbs.size()];
2612                 value = str(subdb.map_docid(shard_did));
2613                 break;
2614             }
2615             case CMD_substr: {
2616                 int start;
2617                 if (!parse_signed(args[1].c_str(), start)) {
2618                     throw "Start value for substr command "
2619                           "must be an integer";
2620                 }
2621                 if (start < 0) {
2622                     if (static_cast<size_t>(-start) >= args[0].size()) {
2623                         start = 0;
2624                     } else {
2625                         start = static_cast<int>(args[0].size()) + start;
2626                     }
2627                 } else {
2628                     if (static_cast<size_t>(start) >= args[0].size()) break;
2629                 }
2630                 size_t len = string::npos;
2631                 if (args.size() > 2) {
2632                     int int_len;
2633                     if (!parse_signed(args[2].c_str(), int_len)) {
2634                         throw "Length value for substr command "
2635                               "must be an integer";
2636                     }
2637                     if (int_len >= 0) {
2638                         len = size_t(int_len);
2639                     } else {
2640                         len = args[0].size() - start;
2641                         if (static_cast<size_t>(-int_len) >= len) {
2642                             len = 0;
2643                         } else {
2644                             len -= static_cast<size_t>(-int_len);
2645                         }
2646                     }
2647                 }
2648                 value.assign(args[0], start, len);
2649                 break;
2650             }
2651             case CMD_suggestion:
2652                 value = qp.get_corrected_query_string();
2653                 break;
2654             case CMD_switch: {
2655                 const string& val = args[0];
2656                 for (size_t i = 1; i < args.size(); i += 2) {
2657                     if (i == args.size() - 1) {
2658                         // Handle optional "else" value.
2659                         value = eval(args[i], param);
2660                         break;
2661                     }
2662                     if (val == eval(args[i], param)) {
2663                         value = eval(args[i + 1], param);
2664                         break;
2665                     }
2666                 }
2667                 break;
2668             }
2669             case CMD_termprefix:
2670                 (void)prefix_from_term(&value, args[0]);
2671                 break;
2672             case CMD_terms: {
2673                 // list of matching terms
2674                 if (!enquire) break;
2675                 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2676                 if (args.empty()) {
2677                     while (term != enquire->get_matching_terms_end(q0)) {
2678                         // check term was in the typed query so we ignore
2679                         // boolean filter terms
2680                         const string & t = *term;
2681                         if (termset.find(t) != termset.end()) {
2682                             value += t;
2683                             value += '\t';
2684                         }
2685                         ++term;
2686                     }
2687                 } else {
2688                     // Return matching terms with specified prefix.  We can't
2689                     // use skip_to() as the terms aren't ordered by termname.
2690                     const string & pfx = args[0];
2691                     while (term != enquire->get_matching_terms_end(q0)) {
2692                         const string & t = *term;
2693                         if (startswith(t, pfx)) {
2694                             value += t;
2695                             value += '\t';
2696                         }
2697                         ++term;
2698                     }
2699                 }
2700
2701                 if (!value.empty()) value.erase(value.size() - 1);
2702                 break;
2703             }
2704             case CMD_thispage:
2705                 value = str(topdoc / hits_per_page + 1);
2706                 break;
2707             case CMD_time:
2708                 if (secs >= 0) {
2709                     char buf[64];
2710                     snprintf(buf, sizeof(buf), "%.6f", secs);
2711                     // MSVC's snprintf omits the zero byte if the string is
2712                     // sizeof(buf) long.
2713                     buf[sizeof(buf) - 1] = '\0';
2714                     value = buf;
2715                 }
2716                 break;
2717             case CMD_topdoc:
2718                 // first document on current page of hit list (counting from 0)
2719                 value = str(topdoc);
2720                 break;
2721             case CMD_topterms:
2722                 if (enquire) {
2723                     int howmany = 16;
2724                     if (!args.empty()) {
2725                         if (!parse_signed(args[0].c_str(), howmany)) {
2726                             throw "Number of terms for command "
2727                                   "topterms must be an integer";
2728                         }
2729                     }
2730                     if (howmany < 0) howmany = 0;
2731                     // List of expand terms
2732                     Xapian::ESet eset;
2733                     OmegaExpandDecider decider(db, &termset);
2734
2735                     if (!rset.empty()) {
2736                         set_expansion_scheme(*enquire, option);
2737                         eset = enquire->get_eset(howmany * 2, rset, &decider);
2738                     } else if (mset.size()) {
2739                         // invent an rset
2740                         Xapian::RSet tmp;
2741
2742                         int c = 5;
2743                         // FIXME: what if mset does not start at first match?
2744                         for (Xapian::docid did : mset) {
2745                             tmp.add_document(did);
2746                             if (--c == 0) break;
2747                         }
2748
2749                         set_expansion_scheme(*enquire, option);
2750                         eset = enquire->get_eset(howmany * 2, tmp, &decider);
2751                     }
2752
2753                     // Don't show more than one word with the same stem.
2754                     set<string> stems;
2755                     Xapian::ESetIterator i;
2756                     for (i = eset.begin(); i != eset.end(); ++i) {
2757                         string term(*i);
2758                         string stem = (*stemmer)(term);
2759                         if (stems.find(stem) != stems.end()) continue;
2760                         stems.insert(stem);
2761                         value += term;
2762                         value += '\t';
2763                         if (--howmany == 0) break;
2764                     }
2765                     if (!value.empty()) value.erase(value.size() - 1);
2766                 }
2767                 break;
2768             case CMD_transform:
2769                 omegascript_transform(value, args);
2770                 break;
2771             case CMD_truncate: {
2772                 unsigned int length;
2773                 if (!parse_unsigned(args[1].c_str(), length)) {
2774                     throw "Length for truncate command must be >= 0";
2775                 }
2776                 value = generate_sample(args[0],
2777                                         length,
2778                                         args.size() > 2 ? args[2] : string(),
2779                                         args.size() > 3 ? args[3] : string());
2780                 break;
2781             }
2782             case CMD_uniq: {
2783                 const string &list = args[0];
2784                 if (list.empty()) break;
2785                 string::size_type split = 0, split2;
2786                 string prev;
2787                 do {
2788                     split2 = list.find('\t', split);
2789                     string item(list, split, split2 - split);
2790                     if (split == 0) {
2791                         value = item;
2792                     } else if (item != prev) {
2793                         value += '\t';
2794                         value += item;
2795                     }
2796                     prev = item;
2797                     split = UNSIGNED_OVERFLOW_OK(split2 + 1);
2798                 } while (split2 != string::npos);
2799                 break;
2800             }
2801             case CMD_unique: {
2802                 unordered_set<string> seen;
2803                 const string &list = args[0];
2804                 if (list.empty()) break;
2805                 string::size_type split = 0, split2;
2806                 do {
2807                     split2 = list.find('\t', split);
2808                     string item(list, split, split2 - split);
2809                     if (seen.insert(item).second) {
2810                         if (split != 0)
2811                             value += '\t';
2812                         value += item;
2813                     }
2814                     split = UNSIGNED_OVERFLOW_OK(split2 + 1);
2815                 } while (split2 != string::npos);
2816                 break;
2817             }
2818             case CMD_unpack:
2819                 value = str(binary_string_to_int(args[0]));
2820                 break;
2821             case CMD_unprefix: {
2822                 size_t prefix_len = prefix_from_term(NULL, args[0]);
2823                 value.assign(args[0], prefix_len, string::npos);
2824                 break;
2825             }
2826             case CMD_unstem: {
2827                 const string &term = args[0];
2828                 Xapian::TermIterator i = qp.unstem_begin(term);
2829                 Xapian::TermIterator end = qp.unstem_end(term);
2830                 while (i != end) {
2831                     if (!value.empty()) value += '\t';
2832                     value += *i;
2833                     ++i;
2834                 }
2835                 break;
2836             }
2837             case CMD_upper:
2838                 value = Xapian::Unicode::toupper(args[0]);
2839                 break;
2840             case CMD_url:
2841                 url_encode(value, args[0]);
2842                 break;
2843             case CMD_value: {
2844                 Xapian::docid id = q0;
2845                 Xapian::valueno slot;
2846                 if (!parse_unsigned(args[0].c_str(), slot)) {
2847                     throw "Value slot number should be >= 0";
2848                 }
2849                 if (args.size() > 1 &&
2850                     (!parse_unsigned(args[1].c_str(), id) || id == 0)) {
2851                     throw "Document id for value command must be > 0";
2852                 }
2853                 value = db.get_document(id).get_value(slot);
2854                 break;
2855             }
2856             case CMD_valuelowerbound: {
2857                 Xapian::valueno slot;
2858                 if (!parse_unsigned(args[0].c_str(), slot)) {
2859                     throw "Value slot number should be >= 0";
2860                 }
2861                 value = db.get_value_lower_bound(slot);
2862                 break;
2863             }
2864             case CMD_valueupperbound: {
2865                 Xapian::valueno slot;
2866                 if (!parse_unsigned(args[0].c_str(), slot)) {
2867                     throw "Value slot number should be >= 0";
2868                 }
2869                 value = db.get_value_upper_bound(slot);
2870                 break;
2871             }
2872             case CMD_version:
2873                 value = PACKAGE_STRING;
2874                 break;
2875             case CMD_weight:
2876                 value = double_to_string(weight);
2877                 break;
2878             default: {
2879                 args.insert(args.begin(), param[0]);
2880                 int macro_no = func->second->tag - CMD_MACRO;
2881                 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2882                 // throw "Unknown function '" + var + "'";
2883                 value = eval(macros[macro_no], args);
2884                 break;
2885             }
2886         }
2887         res += value;
2888     } catch (const Xapian::Error & e) {
2889         // FIXME: this means we only see the most recent error in $error
2890         // - is that the best approach?
2891         error_msg = e.get_description();
2892     }
2893
2894     res.append(fmt, p, string::npos);
2895     return res;
2896 }
2897
2898 static string
2899 eval_file(const string& fmtfile, bool* p_not_found)
2900 {
2901     // Use -1 to indicate vet_filename() failed.
2902     int eno = -1;
2903     if (vet_filename(fmtfile)) {
2904         string file = template_dir + fmtfile;
2905         string fmt;
2906         errno = 0;
2907         if (load_file(file, fmt)) {
2908             vector<string> noargs;
2909             noargs.resize(1);
2910             return eval(fmt, noargs);
2911         }
2912         eno = errno;
2913     }
2914
2915     if (p_not_found) {
2916         *p_not_found = true;
2917         return string();
2918     }
2919
2920     // FIXME: report why!
2921     string msg = string("Couldn't read format template '") + fmtfile + '\'';
2922     if (eno) {
2923         msg += " (";
2924         msg += (eno < 0 ? "name contains '..'" : strerror(eno));
2925         msg += ')';
2926     }
2927     throw msg;
2928 }
2929
2930 extern string
2931 pretty_term(string term)
2932 {
2933     // Just leave empty strings and single characters alone.
2934     if (term.length() <= 1) return term;
2935
2936     // Assume unprefixed terms are unstemmed.
2937     if (!C_isupper(term[0])) return term;
2938
2939     // Handle stemmed terms.
2940     bool stemmed = (term[0] == 'Z');
2941     if (stemmed) {
2942         // First of all, check if a term in the query stemmed to this one.
2943         Xapian::TermIterator u = qp.unstem_begin(term);
2944         // There might be multiple words with the same stem, but we only want
2945         // one so just take the first.
2946         if (u != qp.unstem_end(term)) return *u;
2947
2948         // Remove the 'Z'.
2949         term.erase(0, 1);
2950     }
2951
2952     bool add_quotes = false;
2953
2954     // Check if the term has a prefix.
2955     if (C_isupper(term[0])) {
2956         // See if we have this prefix in the termprefix_to_userprefix map.  If
2957         // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2958         string prefix;
2959         size_t prefix_len = prefix_from_term(&prefix, term);
2960
2961         map<string, string>::const_iterator i;
2962         i = termprefix_to_userprefix.find(prefix);
2963         if (i != termprefix_to_userprefix.end()) {
2964             string user_prefix = i->second;
2965             user_prefix += ':';
2966             term.replace(0, prefix_len, user_prefix);
2967         } else {
2968             // We don't have a prefix mapping for this, so just set a flag to
2969             // add quotes around the term.
2970             add_quotes = true;
2971         }
2972     }
2973
2974     if (stemmed) term += '.';
2975
2976     if (add_quotes) {
2977         term.insert(0, "\"");
2978         term.append("\"");
2979     }
2980
2981     return term;
2982 }
2983
2984 static string
2985 print_caption(const string& fmt, vector<string>& param)
2986 {
2987     q0 = *(mset[hit_no]);
2988
2989     weight = mset[hit_no].get_weight();
2990     percent = mset.convert_to_percent(mset[hit_no]);
2991     collapsed = mset[hit_no].get_collapse_count();
2992
2993     return eval(fmt, param);
2994 }
2995
2996 void
2997 parse_omegascript()
2998 {
2999     try {
3000         string output = eval_file(fmtname);
3001         if (!set_content_type && !suppress_http_headers) {
3002             cout << "Content-Type: text/html" << endl;
3003             set_content_type = true;
3004         }
3005         if (!suppress_http_headers) cout << endl;
3006         cout << output;
3007     } catch (...) {
3008         // Ensure the headers have been output so that any exception gets
3009         // reported rather than giving a server error.
3010         if (!set_content_type && !suppress_http_headers) {
3011             cout << "Content-Type: text/html" << endl;
3012             set_content_type = true;
3013         }
3014         if (!suppress_http_headers) cout << endl;
3015         throw;
3016     }
3017 }
3018
3019 static void
3020 ensure_query_parsed()
3021 {
3022     if (query_parsed) return;
3023     query_parsed = true;
3024
3025     // Should we discard the existing R-set recorded in R CGI parameters?
3026     bool discard_rset = false;
3027
3028     // Should we force the first page of hits (and ignore [ > < # and TOPDOC
3029     // CGI parameters)?
3030     bool force_first_page = false;
3031
3032     string v;
3033     // get list of terms from previous iteration of query
3034     auto val = cgi_params.find("xP");
3035     if (val != cgi_params.end()) {
3036         v = val->second;
3037         // If xP given, default to discarding any RSet and forcing the first
3038         // page of results.  If the query is the same, or an extension of
3039         // the previous query, we adjust these again below.
3040         discard_rset = true;
3041         force_first_page = true;
3042     }
3043     querytype result = parse_queries(v);
3044     switch (result) {
3045         case BAD_QUERY:
3046             break;
3047         case NEW_QUERY:
3048             break;
3049         case SAME_QUERY:
3050         case EXTENDED_QUERY:
3051             // If we've changed database, force the first page of hits
3052             // and discard the R-set (since the docids will have changed)
3053             val = cgi_params.find("xDB");
3054             if (val != cgi_params.end() && val->second != dbname) break;
3055             if (result == SAME_QUERY && force_first_page) {
3056                 val = cgi_params.find("xFILTERS");
3057                 if (val != cgi_params.end() && val->second != filters &&
3058                     val->second != old_filters) {
3059                     // Filters have changed since last query.
3060                 } else {
3061                     force_first_page = false;
3062                 }
3063             }
3064             discard_rset = false;
3065             break;
3066     }
3067
3068     if (!force_first_page) {
3069         // Work out which mset element is the first hit we want
3070         // to display
3071         val = cgi_params.find("TOPDOC");
3072         if (val != cgi_params.end()) {
3073             if (!parse_unsigned(val->second.c_str(), topdoc)) {
3074                 throw "TOPDOC parameter must be >= 0";
3075             }
3076         }
3077
3078         // Handle next, previous, and page links
3079         if (cgi_params.find(">") != cgi_params.end()) {
3080             topdoc += hits_per_page;
3081         } else if (cgi_params.find("<") != cgi_params.end()) {
3082             if (topdoc >= hits_per_page)
3083                 topdoc -= hits_per_page;
3084             else
3085                 topdoc = 0;
3086         } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
3087                    (val = cgi_params.find("#")) != cgi_params.end()) {
3088             if (!C_isdigit(val->second[0])) {
3089                 throw "Page parameter must be >= 0";
3090             }
3091             long page = atol(val->second.c_str());
3092             // Do something sensible for page 0 (we count pages from 1).
3093             if (page == 0) page = 1;
3094             topdoc = (page - 1) * hits_per_page;
3095         }
3096
3097         // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
3098         // Normally we snap TOPDOC like this so that things work nicely if
3099         // HITSPERPAGE is in a <select> or on radio buttons.  If we're
3100         // postprocessing the output of omega and want variable sized pages,
3101         // this is unhelpful.
3102         bool raw_search = false;
3103         val = cgi_params.find("RAWSEARCH");
3104         if (val != cgi_params.end()) {
3105             unsigned int temp;
3106             if (!parse_unsigned(val->second.c_str(), temp)) {
3107                 throw "RAWSEARCH parameter must be >= 0";
3108             }
3109             raw_search = bool(temp);
3110         }
3111
3112         if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
3113     }
3114
3115     if (!discard_rset) {
3116         // put documents marked as relevant into the rset
3117         auto g = cgi_params.equal_range("R");
3118         for (auto i = g.first; i != g.second; ++i) {
3119             const string & value = i->second;
3120             for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
3121                 while (value[j] == '.') ++j;
3122                 Xapian::docid d;
3123                 if (!parse_unsigned(value.c_str() + j, d) || d == 0) {
3124                     throw "Document id for 'R' parameter must be > 0";
3125                 }
3126                 if (d) {
3127                     rset.add_document(d);
3128                     ticked[d] = true;
3129                 }
3130             }
3131         }
3132     }
3133 }
3134
3135 // run query if we haven't already
3136 static void
3137 ensure_match()
3138 {
3139     if (done_query) return;
3140
3141     secs = RealTime::now();
3142     run_query();
3143     if (secs != -1)
3144         secs = RealTime::now() - secs;
3145
3146     done_query = true;
3147     last = mset.get_matches_lower_bound();
3148     if (last == 0) {
3149         // Otherwise topdoc ends up being -6 if it's non-zero!
3150         topdoc = 0;
3151     } else {
3152         if (topdoc >= last)
3153             topdoc = ((last - 1) / hits_per_page) * hits_per_page;
3154         // last is the count of documents up to the end of the current page
3155         // (as returned by $last)
3156         if (topdoc + hits_per_page < last)
3157             last = topdoc + hits_per_page;
3158     }
3159 }
3160
3161 // OmegaExpandDecider methods.
3162
3163 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
3164                                        set<string> * querytermset)
3165     : db(db_)
3166 {
3167     // We'll want the stemmer for testing matches anyway.
3168     if (!stemmer)
3169         stemmer = new Xapian::Stem(option["stemmer"]);
3170     if (querytermset) {
3171         set<string>::const_iterator i;
3172         for (i = querytermset->begin(); i != querytermset->end(); ++i) {
3173             string term(*i);
3174             if (term.empty()) continue;
3175
3176             unsigned char ch = term[0];
3177             bool stemmed = (ch == 'Z');
3178             if (stemmed) {
3179                 term.erase(0, 1);
3180                 if (term.empty()) continue;
3181                 ch = term[0];
3182             }
3183
3184             if (C_isupper(ch)) {
3185                 size_t prefix_len = prefix_from_term(NULL, term);
3186                 term.erase(0, prefix_len);
3187             }
3188
3189             if (!stemmed) term = (*stemmer)(term);
3190
3191             exclude_stems.insert(term);
3192         }
3193     }
3194 }
3195
3196 bool
3197 OmegaExpandDecider::operator()(const string & term) const
3198 {
3199     unsigned char ch = term[0];
3200
3201     // Reject terms with a prefix.
3202     if (C_isupper(ch)) return false;
3203
3204     {
3205         MyStopper stopper;
3206         // Don't suggest stopwords.
3207         if (stopper(term)) return false;
3208     }
3209
3210     // Reject small numbers.
3211     if (term.size() < 4 && C_isdigit(ch)) return false;
3212
3213     // Reject terms containing a space.
3214     if (term.find(' ') != string::npos) return false;
3215
3216     // Skip terms with stems in the exclude_stems set, to avoid suggesting
3217     // terms which are already in the query in some form.
3218     string stem = (*stemmer)(term);
3219     if (exclude_stems.find(stem) != exclude_stems.end())
3220         return false;
3221
3222     // Ignore terms that only occur once (hapaxes) since they aren't
3223     // useful for finding related documents - they only occur in a
3224     // document that's already been marked as relevant.
3225     // FIXME: add an expand option to ignore terms where
3226     // termfreq == rtermfreq.
3227     if (db.get_termfreq(term) <= 1) return false;
3228
3229     return true;
3230 }