xapian-applications/omega/query.cc

   1 /** @file
   2  * @brief query executor for omega
   3  */
   4 /* Copyright 1999,2000,2001 BrightStation PLC
   5  * Copyright 2001 James Aylett
   6  * Copyright 2001,2002 Ananova Ltd
   7  * Copyright 2002 Intercede 1749 Ltd
   8  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017,2018,2019,2020,2021 Olly Betts
   9  * Copyright 2008 Thomas Viehmann
  10  *
  11  * This program is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU General Public License as
  13  * published by the Free Software Foundation; either version 2 of the
  14  * License, or (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  24  * USA
  25  */
  26
  27 #include <config.h>
  28
  29 #include <algorithm>
  30 #include <iostream>
  31 #include <map>
  32 #include <set>
  33 #include <unordered_map>
  34 #include <unordered_set>
  35 #include <vector>
  36
  37 #include <cassert>
  38 #include <cctype>
  39 #include <cerrno>
  40 #include <stdio.h>
  41 #include <cstdlib>
  42 #include <cstring>
  43 #include "strcasecmp.h"
  44 #include <ctime>
  45
  46 #include "safeunistd.h"
  47 #include <sys/types.h>
  48 #include "safesysstat.h"
  49 #include "safefcntl.h"
  50
  51 #include "realtime.h"
  52
  53 #include <cdb.h>
  54
  55 #include "csvescape.h"
  56 #include "date.h"
  57 #include "datevalue.h"
  58 #include "fields.h"
  59 #include "jsonescape.h"
  60 #include "utils.h"
  61 #include "omega.h"
  62 #include "query.h"
  63 #include "cgiparam.h"
  64 #include "loadfile.h"
  65 #include "sample.h"
  66 #include "sort.h"
  67 #include "str.h"
  68 #include "stringutils.h"
  69 #include "transform.h"
  70 #include "urldecode.h"
  71 #include "urlencode.h"
  72 #include "unixperm.h"
  73 #include "values.h"
  74 #include "weight.h"
  75 #include "expand.h"
  76
  77 #include <xapian.h>
  78
  79 using namespace std;
  80
  81 using Xapian::Utf8Iterator;
  82
  83 using Xapian::Unicode::is_wordchar;
  84
  85 #ifndef SNPRINTF
  86 #include <cstdarg>
  87
  88 static int my_snprintf(char *str, size_t size, const char *format, ...)
  89 {
  90     int res;
  91     va_list ap;
  92     va_start(ap, format);
  93     str[size - 1] = '\0';
  94     res = vsprintf(str, format, ap);
  95     if (str[size - 1] || res < 0 || size_t(res) >= size)
  96         abort(); /* Overflowed! */
  97     va_end(ap);
  98     return res;
  99 }
 100 #else
 101 #define my_snprintf SNPRINTF
 102 #endif
 103
/// Map shard to DB parameter value and stats to allow docid mapping.
vector<SubDB> subdbs;

// NOTE(review): these two flags presumably record whether
// ensure_query_parsed() and ensure_match() have already done their work; the
// definitions of those functions aren't in this part of the file - confirm.
static bool query_parsed = false;
static bool done_query = false;
static Xapian::docid last = 0;
// Added to hits_per_page by run_query() when working out how many matches to
// request from Xapian::Enquire::get_mset().
static Xapian::docid topdoc = 0;

// Result of the match; assigned by run_query().
static Xapian::MSet mset;
// Relevance set; run_query() passes its address to get_mset().
static Xapian::RSet rset;

static map<Xapian::docid, bool> ticked;

static void ensure_query_parsed();
static void ensure_match();

// The query to run: built by parse_queries() and then combined with any
// filters by run_query().
static Xapian::Query query;
//static string url_query_string;
Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode

// Maintain an explicit date_filter_set flag - date_filter.empty() will also
// be true if a date filter is specified which simplifies to
// Query::MatchNothing at construction time.
static bool date_filter_set = false;
static Xapian::Query date_filter;

// Query parser, configured from the option map by parse_queries().
static Xapian::QueryParser qp;
// Range processor for "size:" ranges; created on first use by
// parse_queries().
static Xapian::NumberRangeProcessor * size_rp = NULL;
// Stemmer used when highlighting; created on first use by html_highlight().
static Xapian::Stem *stemmer = NULL;

static string eval_file(const string& fmtfile, bool* p_not_found = nullptr);

// Terms from the parsed query; parse_queries() adds each distinct term once.
static set<string> termset;

// Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
static map<string, string> termprefix_to_userprefix;

// Tab-separated list of the distinct query terms, in the order
// parse_queries() first encountered them.
static string queryterms;

// Set by parse_queries() to the parser's message when a query string fails
// to parse; run_query() doesn't run the match if this is non-empty.
static string error_msg;

static double secs = -1;
 146
// Default template for a log entry.  It consists of tab-separated fields
// written using $-commands: remote host (or address, or "-"), a timestamp,
// the kind of request (add, morelike or query), the database name, the query,
// the value of $msize and - only when HTTP_REFERER is set - the referer.
static const char DEFAULT_LOG_ENTRY[] =
        "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
        "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
        "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
        "$dbname\t"
        "$query\t"
        "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
 154
 155 class MyStopper : public Xapian::Stopper {
 156   public:
 157     bool operator()(const string &t) const {
 158         switch (t[0]) {
 159             case 'a':
 160                 return (t == "a" || t == "about" || t == "an" || t == "and" ||
 161                         t == "are" || t == "as" || t == "at");
 162             case 'b':
 163                 return (t == "be" || t == "by");
 164             case 'e':
 165                 return (t == "en");
 166             case 'f':
 167                 return (t == "for" || t == "from");
 168             case 'h':
 169                 return (t == "how");
 170             case 'i':
 171                 return (t == "i" || t == "in" || t == "is" || t == "it");
 172             case 'o':
 173                 return (t == "of" || t == "on" || t == "or");
 174             case 't':
 175                 return (t == "that" || t == "the" || t == "this" || t == "to");
 176             case 'w':
 177                 return (t == "was" || t == "what" || t == "when" ||
 178                         t == "where" || t == "which" || t == "who" ||
 179                         t == "why" || t == "will" || t == "with");
 180             case 'y':
 181                 return (t == "you" || t == "your");
 182             default:
 183                 return false;
 184         }
 185     }
 186 };
 187
 188 static size_t
 189 prefix_from_term(string* prefix, const string& term)
 190 {
 191     if (!term.empty()) {
 192         if (term[0] == 'X') {
 193             const string::const_iterator begin = term.begin();
 194             string::const_iterator i = begin + 1;
 195             while (i != term.end() && C_isupper(*i))
 196                 ++i;
 197             if (prefix)
 198                 prefix->assign(begin, i);
 199             if (i != term.end() && *i == ':')
 200                 ++i;
 201             return i - begin;
 202         }
 203
 204         if (C_isupper(term[0])) {
 205             if (prefix)
 206                 *prefix = term[0];
 207             return 1;
 208         }
 209     }
 210
 211     if (prefix)
 212         prefix->resize(0);
 213     return 0;
 214 }
 215
 216 // Don't allow ".." in format names, log file names, etc as this would allow
 217 // people to open a format "../../etc/passwd" or similar.
 218 // FIXME: make this check more exact ("foo..bar" is safe)
 219 // FIXME: log when this check fails
 220 static bool
 221 vet_filename(const string &filename)
 222 {
 223     string::size_type i = filename.find("..");
 224     return (i == string::npos);
 225 }
 226
 227 // Heuristics:
 228 // * If any terms have been removed, it's a "fresh query" so we discard any
 229 //   relevance judgements
 230 // * If all previous terms are there but more have been added then we keep
 231 //   the relevance judgements, but return the first page of hits
 232 //
 233 // NEW_QUERY entirely new query
 234 // SAME_QUERY unchanged query
 235 // EXTENDED_QUERY new query, but based on the old one
 236 // BAD_QUERY parse error (message in error_msg)
 237 typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
 238
 239 static multimap<string, string> query_strings;
 240
 241 void
 242 add_query_string(const string& prefix, const string& s)
 243 {
 244     string query_string = s;
 245     // Strip leading and trailing whitespace from query_string.
 246     trim(query_string);
 247     if (!query_string.empty())
 248         query_strings.insert(make_pair(prefix, query_string));
 249 }
 250
 251 static unsigned
 252 read_qp_flags(const string & opt_pfx, unsigned f)
 253 {
 254     map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
 255     for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
 256         unsigned mask = 0;
 257         const char * s = i->first.c_str() + opt_pfx.size();
 258         switch (s[0]) {
 259             case 'a':
 260                 // Note that the ``Xapian::QueryParser::FLAG_ACCUMULATE`` flag
 261                 // is or-ed in below because it's needed for ``$stoplist`` and
 262                 // ``$unstem`` to work correctly, and so is deliberately not
 263                 // available to specify here.
 264                 if (strcmp(s, "auto_multiword_synonyms") == 0) {
 265                     mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
 266                     break;
 267                 }
 268                 if (strcmp(s, "auto_synonyms") == 0) {
 269                     mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
 270                     break;
 271                 }
 272                 break;
 273             case 'b':
 274                 if (strcmp(s, "boolean") == 0) {
 275                     mask = Xapian::QueryParser::FLAG_BOOLEAN;
 276                     break;
 277                 }
 278                 if (strcmp(s, "boolean_any_case") == 0) {
 279                     mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
 280                     break;
 281                 }
 282                 break;
 283             case 'c':
 284                 if (strcmp(s, "cjk_ngram") == 0) {
 285                     mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
 286                     break;
 287                 }
 288                 break;
 289             case 'd':
 290                 if (strcmp(s, "default") == 0) {
 291                     mask = Xapian::QueryParser::FLAG_DEFAULT;
 292                     break;
 293                 }
 294                 break;
 295             case 'l':
 296                 if (strcmp(s, "lovehate") == 0) {
 297                     mask = Xapian::QueryParser::FLAG_LOVEHATE;
 298                     break;
 299                 }
 300                 break;
 301             case 'n':
 302                 if (strcmp(s, "no_positions") == 0) {
 303                     mask = Xapian::QueryParser::FLAG_NO_POSITIONS;
 304                     break;
 305                 }
 306                 break;
 307             case 'p':
 308                 if (strcmp(s, "partial") == 0) {
 309                     mask = Xapian::QueryParser::FLAG_PARTIAL;
 310                     break;
 311                 }
 312                 if (strcmp(s, "phrase") == 0) {
 313                     mask = Xapian::QueryParser::FLAG_PHRASE;
 314                     break;
 315                 }
 316                 if (strcmp(s, "pure_not") == 0) {
 317                     mask = Xapian::QueryParser::FLAG_PURE_NOT;
 318                     break;
 319                 }
 320                 break;
 321             case 's':
 322                 if (strcmp(s, "spelling_correction") == 0) {
 323                     mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
 324                     break;
 325                 }
 326                 if (strcmp(s, "synonym") == 0) {
 327                     mask = Xapian::QueryParser::FLAG_SYNONYM;
 328                     break;
 329                 }
 330                 break;
 331             case 'w':
 332                 if (strcmp(s, "wildcard") == 0) {
 333                     mask = Xapian::QueryParser::FLAG_WILDCARD;
 334                     break;
 335                 }
 336                 break;
 337         }
 338
 339         if (i->second.empty()) {
 340             f &= ~mask;
 341         } else {
 342             f |= mask;
 343         }
 344     }
 345     // Always enable FLAG_ACCUMULATE so that $stoplist and $unstem report
 346     // values accumulated over all query strings parsed as part of a query, not
 347     // just the last one parsed.
 348     return f | Xapian::QueryParser::FLAG_ACCUMULATE;
 349 }
 350
 351 static querytype
 352 parse_queries(const string& oldp)
 353 {
 354     // Parse the query string.
 355     auto opt_it = option.find("stem_strategy");
 356     if (opt_it != option.end()) {
 357         if (opt_it->second == "all") {
 358             qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
 359         } else if (opt_it->second == "all_z") {
 360             qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL_Z);
 361         } else if (opt_it->second == "none") {
 362             qp.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
 363         } else if (opt_it->second == "some") {
 364             qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
 365         } else if (opt_it->second == "some_full_pos") {
 366             qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME_FULL_POS);
 367         }
 368     } else {
 369         opt_it = option.find("stem_all");
 370         if (opt_it != option.end() && opt_it->second == "true") {
 371             qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
 372         }
 373     }
 374     qp.set_stopper((new MyStopper())->release());
 375     qp.set_default_op(default_op);
 376     qp.set_database(db);
 377     // FIXME: provide a custom RP which handles size:10..20K, etc.
 378     if (!size_rp)
 379         size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
 380     qp.add_rangeprocessor(size_rp);
 381     map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
 382     for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
 383         string user_prefix(pfx->first, 7);
 384         const string & term_pfx_list = pfx->second;
 385         string::size_type i = 0;
 386         do {
 387             string::size_type i0 = i;
 388             i = term_pfx_list.find('\t', i);
 389             const string & term_pfx = term_pfx_list.substr(i0, i - i0);
 390             qp.add_prefix(user_prefix, term_pfx);
 391             // std::map::insert() won't overwrite an existing entry, so we'll
 392             // prefer the first user_prefix for which a particular term prefix
 393             // is specified.
 394             termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
 395         } while (++i);
 396     }
 397     pfx = option.lower_bound("boolprefix,");
 398     for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
 399         string user_prefix(pfx->first, 11, string::npos);
 400         auto it = option.find("nonexclusiveprefix," + pfx->second);
 401         bool exclusive = (it == option.end() || it->second.empty());
 402         qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
 403         termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
 404     }
 405
 406     try {
 407         unsigned default_flags = read_qp_flags("flag_", 0);
 408         if (option["spelling"] == "true")
 409             default_flags |= qp.FLAG_SPELLING_CORRECTION;
 410
 411         vector<Xapian::Query> queries;
 412         queries.reserve(query_strings.size());
 413
 414         for (auto& j : query_strings) {
 415             const string& prefix = j.first;
 416             const string& query_string = j.second;
 417
 418             // Choose the stemmer to use for this input.
 419             string stemlang = option[prefix + ":stemmer"];
 420             if (stemlang.empty())
 421                 stemlang = option["stemmer"];
 422             qp.set_stemmer(Xapian::Stem(stemlang));
 423
 424             // Work out the flags to use for this input.
 425             unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
 426
 427             Xapian::Query q = qp.parse_query(query_string, f, prefix);
 428             if (!q.empty())
 429                 queries.push_back(q);
 430         }
 431         query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
 432     } catch (Xapian::QueryParserError &e) {
 433         error_msg = e.get_msg();
 434         return BAD_QUERY;
 435     }
 436
 437     Xapian::termcount n_new_terms = 0;
 438     for (Xapian::TermIterator i = query.get_terms_begin();
 439          i != query.get_terms_end(); ++i) {
 440         if (termset.find(*i) == termset.end()) {
 441             termset.insert(*i);
 442             if (!queryterms.empty()) queryterms += '\t';
 443             queryterms += *i;
 444         }
 445         n_new_terms++;
 446     }
 447
 448     // Check new query against the previous one
 449     if (oldp.empty()) {
 450         // If oldp was empty that means there were no parsed query terms
 451         // before, so if there are now this is a new query.
 452         return n_new_terms ? NEW_QUERY : SAME_QUERY;
 453     }
 454
 455     // The terms in oldp are separated by tabs.
 456     const char oldp_separator = '\t';
 457     size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
 458
 459     // short-cut: if the new query has fewer terms, it must be a new one
 460     if (n_new_terms < n_old_terms) return NEW_QUERY;
 461
 462     const char *term = oldp.c_str();
 463     const char *pend;
 464     while ((pend = strchr(term, oldp_separator)) != NULL) {
 465         if (termset.find(string(term, pend - term)) == termset.end())
 466             return NEW_QUERY;
 467         term = pend + 1;
 468     }
 469     if (*term) {
 470         if (termset.find(string(term)) == termset.end())
 471             return NEW_QUERY;
 472     }
 473
 474     // Use termset.size() rather than n_new_terms so we correctly handle
 475     // the case when the query has repeated terms.
 476     // This works wrongly in the case when the user extends the query
 477     // by adding a term already in it, but that's unlikely and the behaviour
 478     // isn't too bad (we just don't reset page 1).  We also mishandle a few
 479     // other obscure cases e.g. adding quotes to turn a query into a phrase.
 480     if (termset.size() > n_old_terms) return EXTENDED_QUERY;
 481     return SAME_QUERY;
 482 }
 483
 484 static multimap<string, string> filter_map;
 485 static set<string> neg_filters;
 486
 487 void add_bterm(const string &term) {
 488     string prefix;
 489     if (prefix_from_term(&prefix, term) > 0)
 490         filter_map.insert(multimap<string, string>::value_type(prefix, term));
 491 }
 492
 493 void add_nterm(const string &term) {
 494     if (!term.empty())
 495         neg_filters.insert(term);
 496 }
 497
 498 void
 499 add_date_filter(const string& date_start,
 500                 const string& date_end,
 501                 const string& date_span,
 502                 Xapian::valueno date_value_slot)
 503 {
 504     if (date_start.empty() && date_end.empty() && date_span.empty())
 505         return;
 506
 507     Xapian::Query q;
 508     if (date_value_slot != Xapian::BAD_VALUENO) {
 509         // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
 510         // latter the sort order just works correctly between different
 511         // precisions).
 512         bool as_time_t =
 513             db.get_value_lower_bound(date_value_slot).size() == 4 &&
 514             db.get_value_upper_bound(date_value_slot).size() == 4;
 515         q = date_value_range(as_time_t, date_value_slot,
 516                              date_start, date_end,
 517                              date_span);
 518     } else {
 519         q = date_range_filter(date_start, date_end, date_span);
 520         q |= Xapian::Query("Dlatest");
 521     }
 522
 523     if (date_filter_set) {
 524         date_filter &= q;
 525     } else {
 526         date_filter_set = true;
 527         date_filter = q;
 528     }
 529 }
 530
 531 static void
 532 run_query()
 533 {
 534     string scheme;
 535     bool force_boolean = false;
 536     if (!filter_map.empty()) {
 537         // OR together filters with the same prefix (or AND for non-exclusive
 538         // prefixes), then AND together the resultant groups.
 539         vector<Xapian::Query> filter_vec;
 540         vector<string> same_vec;
 541         string current;
 542         for (auto i = filter_map.begin(); ; ++i) {
 543             bool over = (i == filter_map.end());
 544             if (over || i->first != current) {
 545                 switch (same_vec.size()) {
 546                     case 0:
 547                         break;
 548                     case 1:
 549                         filter_vec.push_back(Xapian::Query(same_vec[0]));
 550                         break;
 551                     default: {
 552                         Xapian::Query::op op = Xapian::Query::OP_OR;
 553                         auto it = option.find("nonexclusiveprefix," + current);
 554                         if (it != option.end() && !it->second.empty()) {
 555                             op = Xapian::Query::OP_AND;
 556                         }
 557                         filter_vec.push_back(Xapian::Query(op,
 558                                                            same_vec.begin(),
 559                                                            same_vec.end()));
 560                         break;
 561                     }
 562                 }
 563                 same_vec.clear();
 564                 if (over) break;
 565                 current = i->first;
 566             }
 567             same_vec.push_back(i->second);
 568         }
 569
 570         Xapian::Query filter(Xapian::Query::OP_AND,
 571                              filter_vec.begin(), filter_vec.end());
 572
 573         if (query.empty()) {
 574             // If no query strings were provided then promote the filters
 575             // to be THE query - filtering an empty query will give no
 576             // matches.
 577             std::swap(query, filter);
 578             auto&& it = option.find("weightingpurefilter");
 579             if (it != option.end() && !it->second.empty()) {
 580                 scheme = it->second;
 581             } else {
 582                 force_boolean = true;
 583             }
 584         } else {
 585             query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
 586         }
 587     }
 588
 589     if (date_filter_set) {
 590         // If no query strings were provided then promote the daterange
 591         // filter to be THE query instead of filtering an empty query.
 592         if (query.empty()) {
 593             query = date_filter;
 594             force_boolean = true;
 595         } else {
 596             query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
 597         }
 598     }
 599
 600     if (!neg_filters.empty()) {
 601         // OR together all negated filters.
 602         Xapian::Query filter(Xapian::Query::OP_OR,
 603                              neg_filters.begin(), neg_filters.end());
 604
 605         if (query.empty() && !date_filter_set) {
 606             // If we only have a negative filter for the query, use MatchAll as
 607             // the query to apply the filters to.
 608             query = Xapian::Query::MatchAll;
 609             force_boolean = true;
 610         }
 611         query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
 612     }
 613
 614     if (!enquire || !error_msg.empty()) return;
 615
 616     if (!force_boolean && scheme.empty()) {
 617         auto&& it = option.find("weighting");
 618         if (it != option.end()) scheme = it->second;
 619     }
 620     set_weighting_scheme(*enquire, scheme, force_boolean);
 621
 622     enquire->set_cutoff(threshold);
 623
 624     if (sort_keymaker) {
 625         if (sort_after) {
 626             enquire->set_sort_by_relevance_then_key(sort_keymaker,
 627                                                     reverse_sort);
 628         } else {
 629             enquire->set_sort_by_key_then_relevance(sort_keymaker,
 630                                                     reverse_sort);
 631         }
 632     } else if (sort_key != Xapian::BAD_VALUENO) {
 633         if (sort_after) {
 634             enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
 635         } else {
 636             enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
 637         }
 638     }
 639
 640     enquire->set_docid_order(docid_order);
 641
 642     if (collapse) {
 643         enquire->set_collapse_key(collapse_key);
 644     }
 645
 646     if (!query.empty()) {
 647 #if 0
 648         // FIXME: If we start doing permissions checks based on $REMOTE_USER
 649         // we're going to break some existing setups if users upgrade.  We
 650         // probably want a way to set this from OmegaScript.
 651         const char * remote_user = getenv("REMOTE_USER");
 652         if (remote_user)
 653             apply_unix_permissions(query, remote_user);
 654 #endif
 655
 656         enquire->set_query(query);
 657         // We could use the value of topdoc as first parameter, but we
 658         // need to know the first few items in the mset to fake a
 659         // relevance set for topterms.
 660         //
 661         // If min_hits isn't set, check at least one extra result so we
 662         // know if we've reached the end of the matches or not - then we
 663         // can avoid offering a "next" button which leads to an empty page.
 664         mset = enquire->get_mset(0, topdoc + hits_per_page,
 665                                  topdoc + max(hits_per_page + 1, min_hits),
 666                                  &rset);
 667     }
 668 }
 669
 670 string
 671 html_escape(const string &str)
 672 {
 673     string res;
 674     string::size_type p = 0;
 675     while (p < str.size()) {
 676         char ch = str[p++];
 677         switch (ch) {
 678             case '<':
 679                 res += "&lt;";
 680                 continue;
 681             case '>':
 682                 res += "&gt;";
 683                 continue;
 684             case '&':
 685                 res += "&amp;";
 686                 continue;
 687             case '"':
 688                 res += "&quot;";
 689                 continue;
 690             default:
 691                 res += ch;
 692         }
 693     }
 694     return res;
 695 }
 696
 697 static string
 698 html_strip(const string &str)
 699 {
 700     string res;
 701     string::size_type p = 0;
 702     bool skip = false;
 703     while (p < str.size()) {
 704         char ch = str[p++];
 705         switch (ch) {
 706             case '<':
 707                 skip = true;
 708                 continue;
 709             case '>':
 710                 skip = false;
 711                 continue;
 712             default:
 713                 if (!skip) res += ch;
 714         }
 715     }
 716     return res;
 717 }
 718
 719 class WordList {
 720     static string prev_list;
 721     static unordered_map<string, int> word_to_occurrence;
 722   public:
 723     void build_word_map(const string& list) {
 724         // Don't build map again if passed list of terms is same as before.
 725         if (prev_list == list) return;
 726         word_to_occurrence.clear();
 727         string::size_type split = 0, split2;
 728         int word_index = 0;
 729         string word;
 730         while ((split2 = list.find('\t', split)) != string::npos) {
 731             word = list.substr(split, split2 - split);
 732             if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 733                 ++word_index;
 734             split = split2 + 1;
 735         }
 736         word = list.substr(split, list.size() - split);
 737         if (word_to_occurrence.emplace(make_pair(word, word_index)).second)
 738             ++word_index;
 739         prev_list = list;
 740     }
 741
 742     int word_in_list(const string& word) {
 743         auto it = word_to_occurrence.find(word);
 744         if (it == word_to_occurrence.end()) return -1;
 745         return it->second;
 746     }
 747 };
 748
 749 string WordList::prev_list;
 750 unordered_map<string, int> WordList::word_to_occurrence;
 751
 752 // Not a character in an identifier
 753 static inline bool
 754 p_notid(unsigned int c)
 755 {
 756     return !C_isalnum(c) && c != '_';
 757 }
 758
 759 // Not a character in an HTML tag name
 760 static inline bool
 761 p_nottag(unsigned int c)
 762 {
 763     return !C_isalnum(c) && c != '.' && c != '-';
 764 }
 765
 766 // FIXME: shares algorithm with indextext.cc!
 767 static string
 768 html_highlight(const string &s, const string &list,
 769                const string &bra, const string &ket)
 770 {
 771     if (!stemmer) {
 772         stemmer = new Xapian::Stem(option["stemmer"]);
 773     }
 774
 775     string res;
 776
 777     Utf8Iterator j(s);
 778     const Utf8Iterator s_end;
 779     while (true) {
 780         Utf8Iterator first = j;
 781         while (first != s_end && !is_wordchar(*first)) ++first;
 782         if (first == s_end) break;
 783         Utf8Iterator term_end;
 784         string term;
 785         string word;
 786         const char *l = j.raw();
 787         if (*first < 128 && C_isupper(*first)) {
 788             j = first;
 789             Xapian::Unicode::append_utf8(term, *j);
 790             while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
 791                 Xapian::Unicode::append_utf8(term, *j);
 792             }
 793             if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
 794                 term.resize(0);
 795             }
 796             term_end = j;
 797         }
 798         if (term.empty()) {
 799             j = first;
 800             while (is_wordchar(*j)) {
 801                 Xapian::Unicode::append_utf8(term, *j);
 802                 ++j;
 803                 if (j == s_end) break;
 804                 if (*j == '&' || *j == '\'') {
 805                     Utf8Iterator next = j;
 806                     ++next;
 807                     if (next == s_end || !is_wordchar(*next)) break;
 808                     term += *j;
 809                     j = next;
 810                 }
 811             }
 812             term_end = j;
 813             if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
 814                 string::size_type len = term.length();
 815                 if (*j == '#') {
 816                     term += '#';
 817                     do { ++j; } while (j != s_end && *j == '#');
 818                 } else {
 819                     while (j != s_end && (*j == '+' || *j == '-')) {
 820                         Xapian::Unicode::append_utf8(term, *j);
 821                         ++j;
 822                     }
 823                 }
 824                 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
 825                     term.resize(len);
 826                 } else {
 827                     term_end = j;
 828                 }
 829             }
 830         }
 831         j = term_end;
 832         term = Xapian::Unicode::tolower(term);
 833         WordList w;
 834         w.build_word_map(list);
 835         int match = w.word_in_list(term);
 836         if (match == -1) {
 837             string stem = "Z";
 838             stem += (*stemmer)(term);
 839             match = w.word_in_list(stem);
 840         }
 841         if (match >= 0) {
 842             res += html_escape(string(l, first.raw() - l));
 843             if (!bra.empty()) {
 844                 res += bra;
 845             } else {
 846                 static const char * colours[] = {
 847                     "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
 848                     "990000", "009900", "996600", "006699", "990099"
 849                 };
 850                 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
 851                 const char * bg = colours[idx];
 852                 if (strchr(bg, 'f')) {
 853                     res += "<b style=\"color:black;background-color:#";
 854                 } else {
 855                     res += "<b style=\"color:white;background-color:#";
 856                 }
 857                 res += bg;
 858                 res += "\">";
 859             }
 860             word.assign(first.raw(), j.raw() - first.raw());
 861             res += html_escape(word);
 862             if (!bra.empty()) {
 863                 res += ket;
 864             } else {
 865                 res += "</b>";
 866             }
 867         } else {
 868             res += html_escape(string(l, j.raw() - l));
 869         }
 870     }
 871     if (j != s_end) res += html_escape(string(j.raw(), j.left()));
 872     return res;
 873 }
 874
#if 0
// NOTE: this function is currently compiled out by the enclosing #if 0.
//
// Write url_query_string to cout, optionally leaving out some "B=" parameters.
//
// If `after` starts with "&B=", the character following that (after[3]) is
// taken as a prefix, and every parameter of the form "B=<prefix>..." which
// follows an '&' in url_query_string is omitted from the output.  Otherwise
// (including when `after` is NULL) url_query_string is written unchanged.
// The contents of `after` are never written by this function.
static void
print_query_string(const char *after)
{
    if (after && strncmp(after, "&B=", 3) == 0) {
        char prefix = after[3];
        // start: offset of the first character not yet written.
        // amp: offset to resume searching for '&' from.
        string::size_type start = 0, amp = 0;
        while (true) {
            amp = url_query_string.find('&', amp);
            if (amp == string::npos) {
                // No more parameters - write whatever is left.
                cout << url_query_string.substr(start);
                return;
            }
            // Step over the '&' so amp is the start of the parameter name.
            amp++;
            // The && chain stops at the first mismatch, so each index is only
            // read once the character before it has matched.
            while (url_query_string[amp] == 'B' &&
                   url_query_string[amp + 1] == '=' &&
                   url_query_string[amp + 2] == prefix) {
                // Write everything pending up to (but not including) the '&'
                // which introduces the parameter being dropped.
                cout << url_query_string.substr(start, amp - start - 1);
                // Resume output at the '&' which ends the dropped parameter.
                start = url_query_string.find('&', amp + 3);
                if (start == string::npos) return;
                // Loop again in case the next parameter also needs dropping.
                amp = start + 1;
            }
        }
    }
    cout << url_query_string;
}
#endif
 902
 903 class CachedFields : private Fields {
 904     Xapian::docid did_cached = 0;
 905
 906   public:
 907     CachedFields() {}
 908
 909     const string& get_field(Xapian::docid did, const string& name) {
 910         if (did != did_cached) {
 911             did_cached = did;
 912             auto it = option.find("fieldnames");
 913             Fields::parse_fields(db.get_document(did).get_data(),
 914                                  it == option.end() ? nullptr : &it->second);
 915         }
 916         return Fields::get_field(name);
 917     }
 918 };
 919
// Field lookup (with a one-document cache) used to implement $field.
static CachedFields fields;
// Document id which $allterms and $field operate on when they aren't passed
// an explicit docid argument.
static Xapian::docid q0;
// NOTE(review): presumably the value reported by $hit - confirm.
static Xapian::doccount hit_no;
// NOTE(review): presumably the value reported by $percentage - confirm.
static int percent;
// NOTE(review): presumably the value reported by $weight - confirm.
static double weight;
// Value reported by $collapsed.
static Xapian::doccount collapsed;

// Forward declaration - being static, the definition has to appear elsewhere
// in this translation unit.
static string print_caption(const string& fmt, vector<string>& param);
 928
 929 enum tagval {
 930 CMD_,
 931 CMD_add,
 932 CMD_addfilter,
 933 CMD_allterms,
 934 CMD_and,
 935 CMD_base64,
 936 CMD_cgi,
 937 CMD_cgilist,
 938 CMD_cgiparams,
 939 CMD_chr,
 940 CMD_collapsed,
 941 CMD_cond,
 942 CMD_contains,
 943 CMD_csv,
 944 CMD_date,
 945 CMD_dbname,
 946 CMD_dbsize,
 947 CMD_def,
 948 CMD_defaultop,
 949 CMD_div,
 950 CMD_emptydocs,
 951 CMD_env,
 952 CMD_eq,
 953 CMD_error,
 954 CMD_field,
 955 CMD_filesize,
 956 CMD_filters,
 957 CMD_filterterms,
 958 CMD_find,
 959 CMD_fmt,
 960 CMD_foreach,
 961 CMD_freq,
 962 CMD_ge,
 963 CMD_gt,
 964 CMD_highlight,
 965 CMD_hit,
 966 CMD_hitlist,
 967 CMD_hitsperpage,
 968 CMD_hostname,
 969 CMD_html,
 970 CMD_htmlstrip,
 971 CMD_httpheader,
 972 CMD_id,
 973 CMD_if,
 974 CMD_include,
 975 CMD_json,
 976 CMD_jsonarray,
 977 CMD_jsonbool,
 978 CMD_jsonobject,
 979 CMD_keys,
 980 CMD_last,
 981 CMD_lastpage,
 982 CMD_le,
 983 CMD_length,
 984 CMD_list,
 985 CMD_log,
 986 CMD_lookup,
 987 CMD_lower,
 988 CMD_lt,
 989 CMD_map,
 990 CMD_match,
 991 CMD_max,
 992 CMD_min,
 993 CMD_mod,
 994 CMD_msize,
 995 CMD_msizeexact,
 996 CMD_msizelower,
 997 CMD_msizeupper,
 998 CMD_mul,
 999 CMD_muldiv,
1000 CMD_ne,
1001 CMD_nice,
1002 CMD_not,
1003 CMD_now,
1004 CMD_opt,
1005 CMD_or,
1006 CMD_ord,
1007 CMD_pack,
1008 CMD_percentage,
1009 CMD_prettyterm,
1010 CMD_prettyurl,
1011 CMD_query,
1012 CMD_querydescription,
1013 CMD_queryterms,
1014 CMD_range,
1015 CMD_record,
1016 CMD_relevant,
1017 CMD_relevants,
1018 CMD_score,
1019 CMD_set,
1020 CMD_seterror,
1021 CMD_setmap,
1022 CMD_setrelevant,
1023 CMD_slice,
1024 CMD_snippet,
1025 CMD_sort,
1026 CMD_split,
1027 CMD_stoplist,
1028 CMD_sub,
1029 CMD_subdb,
1030 CMD_subid,
1031 CMD_substr,
1032 CMD_suggestion,
1033 CMD_switch,
1034 CMD_termprefix,
1035 CMD_terms,
1036 CMD_thispage,
1037 CMD_time,
1038 CMD_topdoc,
1039 CMD_topterms,
1040 CMD_transform,
1041 CMD_truncate,
1042 CMD_uniq,
1043 CMD_unique,
1044 CMD_unpack,
1045 CMD_unprefix,
1046 CMD_unstem,
1047 CMD_upper,
1048 CMD_url,
1049 CMD_value,
1050 CMD_version,
1051 CMD_weight,
1052 CMD_MACRO // special tag for macro evaluation
1053 };
1054
/** Attributes of an OmegaScript command, as consulted by eval(). */
struct func_attrib {
    /// Identifies the command: a tagval, or CMD_MACRO + index for a macro.
    int tag;

    /** Fewest arguments accepted.
     *
     *  -1 means the argument text isn't split at commas and no argument
     *  count checking or evaluation is done.
     */
    int minargs;

    /// Most arguments accepted, or -1 for no upper limit.
    int maxargs;

    /** How many leading arguments to evaluate before running the command.
     *
     *  -1 means evaluate all of them.
     */
    int evalargs;

    /** What has to be done before the command runs.
     *
     *  'Q' to ensure the query has been parsed, 'M' to ensure the query has
     *  been parsed and the match run, or 0 for no requirement.
     */
    char ensure;
};
1060
1061 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
1062 struct func_desc {
1063     const char *name;
1064     struct func_attrib a;
1065 };
1066
// Shorthand for special values in the func_tab columns (eval() also uses N):
//
// N as minargs: the text between { and } is passed as one argument without
//               being split at commas or evaluated.
// N as maxargs: there's no upper limit on the number of arguments.
// N as evalargs: every argument is evaluated before the command runs (any
//               other value is the number of leading arguments to evaluate).
// M as ensure:  the query must have been parsed and the match run first.
// Q as ensure:  the query must have been parsed first.
#define N (-1)
#define M 'M'
#define Q 'Q'
// Table of the built-in commands, terminated by an entry with a NULL name.
// eval() loads this into a map from command name to attributes on first use.
//
// NB when adding a new command which ensures M or Q, update the list in
// docs/omegascript.rst
static const struct func_desc func_tab[] = {
//name minargs maxargs evalargs ensure
{"",{CMD_,         N, N, 0, 0}},// commented out code
T(add,             0, N, N, 0), // add a list of numbers
T(addfilter,       1, 2, N, 0), // add filter term
T(allterms,        0, 1, N, 0), // list of all terms matching document
T(and,             1, N, 0, 0), // logical shortcutting and of a list of values
T(base64,          1, 1, N, 0), // base64 encode
T(cgi,             1, 1, N, 0), // return cgi parameter value
T(cgilist,         1, 1, N, 0), // return list of values for cgi parameter
T(cgiparams,       0, 0, N, 0), // return list of cgi parameter names
T(chr,             1, 1, N, 0), // return UTF-8 for given Unicode codepoint
T(collapsed,       0, 0, N, 0), // return number of hits collapsed into this
T(cond,            2, N, 0, 0), // cascaded conditionals
T(contains,        2, 2, N, 0), // return position of substring, or empty string
T(csv,             1, 2, N, 0), // CSV string escaping
T(date,            1, 2, N, 0), // convert time_t to strftime format
                                // (default: YYYY-MM-DD)
T(dbname,          0, 0, N, 0), // database name
T(dbsize,          0, 0, N, 0), // database size (# of documents)
T(def,             2, 2, 1, 0), // define a macro
T(defaultop,       0, 0, N, 0), // default operator: "and" or "or"
T(div,             2, 2, N, 0), // integer divide
T(emptydocs,       0, 1, N, 0), // list of empty documents
T(env,             1, 1, N, 0), // environment variable
T(eq,              2, 2, N, 0), // test equality
T(error,           0, 0, N, 0), // error message
T(field,           1, 2, N, 0), // lookup field in record
T(filesize,        1, 1, N, 0), // pretty printed filesize
T(filters,         0, 0, N, 0), // serialisation of current filters
T(filterterms,     1, 1, N, 0), // list of terms with a given prefix
T(find,            2, 2, N, 0), // find entry in list
T(fmt,             0, 0, N, 0), // name of current format
T(foreach,         2, 2, 1, 0), // evaluate something for every entry in a list
T(freq,            1, 1, N, 0), // frequency of a term
T(ge,              2, 2, N, 0), // test >=
T(gt,              2, 2, N, 0), // test >
T(highlight,       2, 4, N, 0), // html escape and highlight words from list
T(hit,             0, 0, N, 0), // hit number of current mset entry (0-based)
T(hitlist,         1, 1, 0, M), // display hitlist using format in argument
T(hitsperpage,     0, 0, N, 0), // hits per page
T(hostname,        1, 1, N, 0), // extract hostname from URL
T(html,            1, 1, N, 0), // html escape string (<>&")
T(htmlstrip,       1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
T(httpheader,      2, 2, N, 0), // arbitrary HTTP header
T(id,              0, 0, N, 0), // docid of current doc
T(if,              1, 3, 1, 0), // conditional
T(include,         1, 2, 1, 0), // include another file
T(json,            1, 1, N, 0), // JSON string escaping
T(jsonarray,       1, 2, 1, 0), // Format list as a JSON array
T(jsonbool,        1, 1, 1, 0), // Format value as a JSON bool
T(jsonobject,      1, 3, 1, 0), // Format map as JSON object
T(keys,            1, 1, N, 0), // list of keys from a map
T(last,            0, 0, N, M), // hit number one beyond end of current page
T(lastpage,        0, 0, N, M), // number of last hit page
T(le,              2, 2, N, 0), // test <=
T(length,          1, 1, N, 0), // length of list
T(list,            2, 5, N, 0), // pretty print list
T(log,             1, 2, 1, 0), // create a log entry
T(lookup,          2, 2, N, 0), // lookup in named cdb file
T(lower,           1, 1, N, 0), // convert string to lower case
T(lt,              2, 2, N, 0), // test <
T(map,             2, 2, 1, 0), // map a list into another list
T(match,           2, 3, N, 0), // regex match
T(max,             1, N, N, 0), // maximum of a list of values
T(min,             1, N, N, 0), // minimum of a list of values
T(mod,             2, 2, N, 0), // integer modulus
T(msize,           0, 0, N, M), // number of matches (estimated)
T(msizeexact,      0, 0, N, M), // is $msize exact?
T(msizelower,      0, 0, N, M), // number of matches (lower bound)
T(msizeupper,      0, 0, N, M), // number of matches (upper bound)
T(mul,             2, N, N, 0), // multiply a list of numbers
T(muldiv,          3, 3, N, 0), // calculate A*B/C
T(ne,              2, 2, N, 0), // test not equal
T(nice,            1, 1, N, 0), // pretty print integer (with thousands sep)
T(not,             1, 1, N, 0), // logical not
T(now,             0, 0, N, 0), // current date/time as a time_t
T(opt,             1, 2, N, 0), // lookup an option value
T(or,              1, N, 0, 0), // logical shortcutting or of a list of values
T(ord,             1, 1, N, 0), // return codepoint for first character of UTF-8 string
T(pack,            1, 1, N, 0), // convert a number to a 4 byte big endian binary string
T(percentage,      0, 0, N, 0), // percentage score of current hit
T(prettyterm,      1, 1, N, Q), // pretty print term name
T(prettyurl,       1, 1, N, 0), // pretty version of URL
T(query,           0, 1, N, Q), // query
T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
T(queryterms,      0, 0, N, Q), // list of query terms
T(range,           2, 2, N, 0), // return list of values between start and end
T(record,          0, 1, N, 0), // record contents of document
T(relevant,        0, 1, N, Q), // is document relevant?
T(relevants,       0, 0, N, Q), // return list of relevant documents
T(score,           0, 0, N, 0), // score (0-10) of current hit
T(set,             2, 2, N, 0), // set option value
T(seterror,        1, 1, N, 0), // set error_msg, setting it early stops query execution
T(setmap,          1, N, N, 0), // set map of option values
T(setrelevant,     1, 1, N, Q), // set rset
T(slice,           2, 2, N, 0), // slice a list using a second list
T(snippet,         1, 2, N, M), // generate snippet from text
T(sort,            1, 2, N, 0), // alpha sort a list
T(split,           1, 2, N, 0), // split a string to give a list
T(stoplist,        0, 0, N, Q), // return list of stopped terms
T(sub,             2, 2, N, 0), // subtract
T(subdb,           0, 1, N, 0), // name of subdb docid is in
T(subid,           0, 1, N, 0), // docid within the subdb
T(substr,          2, 3, N, 0), // substring
T(suggestion,      0, 0, N, Q), // misspelled word correction suggestion
T(switch,          3, N, 1, 0), // multi-way conditional: pick the value for
                                // the case matching an expression
T(termprefix,      1, 1, N, 0), // get any prefix from a term
T(terms,           0, 1, N, M), // list of matching terms
T(thispage,        0, 0, N, M), // page number of current page
T(time,            0, 0, N, M), // how long the match took (in seconds)
T(topdoc,          0, 0, N, M), // first document on current page of hit list
                                // (counting from 0)
T(topterms,        0, 1, N, M), // list of up to N top relevance feedback terms
                                // (default 16)
T(transform,       3, 4, N, 0), // transform with a regexp
T(truncate,        2, 4, N, 0), // truncate after a word
T(uniq,            1, 1, N, 0), // remove duplicates from a sorted list
T(unique,          1, 1, N, 0), // remove duplicates from any list
T(unpack,          1, 1, N, 0), // convert 4 byte big endian binary string to a number
T(unprefix,        1, 1, N, 0), // remove any prefix from a term
T(unstem,          1, 1, N, Q), // return list of terms from the parsed query
                                // which stemmed to this term
T(upper,           1, 1, N, 0), // convert string to upper case
T(url,             1, 1, N, 0), // url encode argument
T(value,           1, 2, N, 0), // return document value
T(version,         0, 0, N, 0), // omega version string
T(weight,          0, 0, N, 0), // weight of the current hit
{ NULL,{0,         0, 0, 0, 0}}
};

#undef T // Leaving T defined screws up Sun's C++ compiler!
1204
// Bodies of the macros defined with $def, in the order they were defined.
// The macro stored at index i is given the tag CMD_MACRO + i.
static vector<string> macros;
1206
1207 // Call write() repeatedly until all data is written or we get a
1208 // non-recoverable error.
1209 static ssize_t
1210 write_all(int fd, const char * buf, size_t count)
1211 {
1212     while (count) {
1213         ssize_t r = write(fd, buf, count);
1214         if (rare(r < 0)) {
1215             if (errno == EINTR) continue;
1216             return r;
1217         }
1218         buf += r;
1219         count -= r;
1220     }
1221     return 0;
1222 }
1223
1224 static string eval(const string& fmt, vector<string>& param);
1225
1226 /** Implements $foreach{} and $map{}. */
1227 static string
1228 foreach(const string& list,
1229         const string& pat,
1230         vector<string>& param,
1231         char sep = '\0')
1232 {
1233     string result;
1234     string saved_arg0 = std::move(param[0]);
1235     string::size_type i = 0, j;
1236     while (true) {
1237         j = list.find('\t', i);
1238         param[0].assign(list, i, j - i);
1239         result += eval(pat, param);
1240         if (j == string::npos) break;
1241         if (sep) result += sep;
1242         i = j + 1;
1243     }
1244     param[0] = std::move(saved_arg0);
1245     return result;
1246 }
1247
1248 static string
1249 eval(const string& fmt, vector<string>& param)
1250 {
1251     static map<string, const struct func_attrib *> func_map;
1252     if (func_map.empty()) {
1253         for (auto p = func_tab; p->name != NULL; ++p) {
1254             func_map[string(p->name)] = &(p->a);
1255         }
1256     }
1257     string res;
1258     string::size_type p = 0, q;
1259     while ((q = fmt.find('$', p)) != string::npos) try {
1260         res.append(fmt, p, q - p);
1261         string::size_type code_start = q; // note down for error reporting
1262         q++;
1263         if (q >= fmt.size()) break;
1264         unsigned char ch = fmt[q];
1265         switch (ch) {
1266             // Magic sequences:
1267             // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1268             case '$':
1269                 res += '$';
1270                 p = q + 1;
1271                 continue;
1272             case '(':
1273                 res += '{';
1274                 p = q + 1;
1275                 continue;
1276             case ')':
1277                 res += '}';
1278                 p = q + 1;
1279                 continue;
1280             case '.':
1281                 res += ',';
1282                 p = q + 1;
1283                 continue;
1284             case '_':
1285                 ch = '0';
1286                 // FALL THRU
1287             case '1': case '2': case '3': case '4': case '5':
1288             case '6': case '7': case '8': case '9':
1289                 ch -= '0';
1290                 if (ch < param.size()) res += param[ch];
1291                 p = q + 1;
1292                 continue;
1293             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1294             case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1295             case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1296             case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1297             case 'y': case 'z':
1298             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1299             case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1300             case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1301             case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1302             case 'Y': case 'Z':
1303             case '{':
1304                 break;
1305             default:
1306                 string msg = "Unknown $ code in: $";
1307                 msg.append(fmt, q, string::npos);
1308                 throw msg;
1309         }
1310         p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1311         string var(fmt, q, p - q);
1312         map<string, const struct func_attrib *>::const_iterator func;
1313         func = func_map.find(var);
1314         if (func == func_map.end()) {
1315             throw "Unknown function '" + var + "'";
1316         }
1317         vector<string> args;
1318         if (fmt[p] == '{') {
1319             q = p + 1;
1320             int nest = 1;
1321             while (true) {
1322                 p = fmt.find_first_of(",{}", p + 1);
1323                 if (p == string::npos)
1324                     throw "missing } in " + fmt.substr(code_start);
1325                 if (fmt[p] == '{') {
1326                     ++nest;
1327                 } else {
1328                     if (nest == 1) {
1329                         // should we split the args
1330                         if (func->second->minargs != N) {
1331                             args.push_back(fmt.substr(q, p - q));
1332                             q = p + 1;
1333                         }
1334                     }
1335                     if (fmt[p] == '}' && --nest == 0) break;
1336                 }
1337             }
1338             if (func->second->minargs == N)
1339                 args.push_back(fmt.substr(q, p - q));
1340             ++p;
1341         }
1342
1343         if (func->second->minargs != N) {
1344             if (int(args.size()) < func->second->minargs)
1345                 throw "too few arguments to $" + var;
1346             if (func->second->maxargs != N &&
1347                 int(args.size()) > func->second->maxargs)
1348                 throw "too many arguments to $" + var;
1349
1350             vector<string>::size_type n;
1351             if (func->second->evalargs != N)
1352                 n = func->second->evalargs;
1353             else
1354                 n = args.size();
1355
1356             for (vector<string>::size_type j = 0; j < n; ++j)
1357                 args[j] = eval(args[j], param);
1358         }
1359         if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1360             ensure_query_parsed();
1361         if (func->second->ensure == 'M') ensure_match();
1362         string value;
1363         switch (func->second->tag) {
1364             case CMD_:
1365                 break;
1366             case CMD_add: {
1367                 int total = 0;
1368                 for (auto&& arg : args)
1369                     total += string_to_int(arg);
1370                 value = str(total);
1371                 break;
1372             }
1373             case CMD_addfilter:
1374                 if (args.size() == 1 || args[1].empty() || args[1] == "B") {
1375                     add_bterm(args[0]);
1376                 } else if (args[1] == "N") {
1377                     add_nterm(args[0]);
1378                 } else {
1379                     string msg = "Invalid $addfilter type '";
1380                     msg += args[1];
1381                     msg += "'";
1382                     throw msg;
1383                 }
1384                 break;
1385             case CMD_allterms: {
1386                 // list of all terms indexing document
1387                 Xapian::docid id = q0;
1388                 if (!args.empty()) id = string_to_int(args[0]);
1389                 for (Xapian::TermIterator term = db.termlist_begin(id);
1390                      term != db.termlist_end(id); ++term) {
1391                     value += *term;
1392                     value += '\t';
1393                 }
1394
1395                 if (!value.empty()) value.erase(value.size() - 1);
1396                 break;
1397             }
1398             case CMD_and: {
1399                 value = "true";
1400                 for (auto&& arg : args) {
1401                     if (eval(arg, param).empty()) {
1402                         value.resize(0);
1403                         break;
1404                     }
1405                 }
1406                 break;
1407             }
1408             case CMD_base64: {
1409                 const static char encode[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdef"
1410                                              "ghijklmnopqrstuvwxyz0123456789+/";
1411                 const char pad = '=';
1412                 const string& input = args[0];
1413                 value.reserve((input.size() + 2) / 3 * 4);
1414                 auto it = input.begin();
1415                 auto n = input.size() / 3;
1416                 while (n--) {
1417                     uint32_t v = uint8_t(*it++);
1418                     v = (v << 8) | uint8_t(*it++);
1419                     v = (v << 8) | uint8_t(*it++);
1420                     value += encode[v >> 18];
1421                     value += encode[(v >> 12) & 63];
1422                     value += encode[(v >> 6) & 63];
1423                     value += encode[v & 63];
1424                 }
1425                 switch (input.size() % 3) {
1426                     case 2: {
1427                         uint32_t v = uint8_t(*it++);
1428                         v = (v << 8) | uint8_t(*it++);
1429                         value += encode[v >> 10];
1430                         value += encode[(v >> 4) & 63];
1431                         value += encode[(v << 2) & 63];
1432                         value += pad;
1433                         break;
1434                     }
1435                     case 1: {
1436                         uint32_t v = uint8_t(*it++);
1437                         value += encode[v >> 2];
1438                         value += encode[(v << 4) & 63];
1439                         value += pad;
1440                         value += pad;
1441                         break;
1442                     }
1443                 }
1444                 break;
1445             }
1446             case CMD_cgi: {
1447                 auto i = cgi_params.find(args[0]);
1448                 if (i != cgi_params.end()) value = i->second;
1449                 break;
1450             }
1451             case CMD_cgilist: {
1452                 auto g = cgi_params.equal_range(args[0]);
1453                 for (auto i = g.first; i != g.second; ++i) {
1454                     value += i->second;
1455                     value += '\t';
1456                 }
1457                 if (!value.empty()) value.erase(value.size() - 1);
1458                 break;
1459             }
1460             case CMD_cgiparams: {
1461                 const string* prev = NULL;
1462                 for (auto&& i : cgi_params) {
1463                     if (prev && i.first == *prev) continue;
1464                     value += i.first;
1465                     value += '\t';
1466                     prev = &i.first;
1467                 }
1468                 if (!value.empty()) value.erase(value.size() - 1);
1469                 break;
1470             }
1471             case CMD_chr:
1472                 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1473                 break;
1474             case CMD_collapsed: {
1475                 value = str(collapsed);
1476                 break;
1477             }
1478             case CMD_cond:
1479                 for (size_t i = 0; i < args.size(); i += 2) {
1480                     if (i == args.size() - 1) {
1481                         // Handle optional "else" value.
1482                         value = eval(args[i], param);
1483                         break;
1484                     }
1485                     if (!eval(args[i], param).empty()) {
1486                         value = eval(args[i + 1], param);
1487                         break;
1488                     }
1489                 }
1490                 break;
1491             case CMD_contains: {
1492                 size_t pos = args[1].find(args[0]);
1493                 if (pos != string::npos) {
1494                     value = str(pos);
1495                 }
1496                 break;
1497             }
1498             case CMD_csv:
1499                 value = args[0];
1500                 if (args.size() > 1 && !args[1].empty()) {
1501                     csv_escape_always(value);
1502                 } else {
1503                     csv_escape(value);
1504                 }
1505                 break;
1506             case CMD_date:
1507                 value = args[0];
1508                 if (!value.empty()) {
1509                     char buf[64] = "";
1510                     time_t date = string_to_int(value);
1511                     if (date != static_cast<time_t>(-1)) {
1512                         struct tm *then;
1513                         then = gmtime(&date);
1514                         string date_fmt = "%Y-%m-%d";
1515                         if (args.size() > 1) date_fmt = eval(args[1], param);
1516                         strftime(buf, sizeof buf, date_fmt.c_str(), then);
1517                     }
1518                     value = buf;
1519                 }
1520                 break;
1521             case CMD_dbname:
1522                 value = dbname;
1523                 break;
1524             case CMD_dbsize: {
1525                 static Xapian::doccount dbsize;
1526                 if (!dbsize) dbsize = db.get_doccount();
1527                 value = str(dbsize);
1528                 break;
1529             }
1530             case CMD_def: {
1531                 func_attrib *fa = new func_attrib;
1532                 fa->tag = CMD_MACRO + macros.size();
1533                 fa->minargs = 0;
1534                 fa->maxargs = 9;
1535                 fa->evalargs = N; // FIXME: or 0?
1536                 fa->ensure = 0;
1537
1538                 macros.push_back(args[1]);
1539                 func_map[args[0]] = fa;
1540                 break;
1541             }
1542             case CMD_defaultop:
1543                 if (default_op == Xapian::Query::OP_AND) {
1544                     value = "and";
1545                 } else {
1546                     value = "or";
1547                 }
1548                 break;
1549             case CMD_div: {
1550                 int denom = string_to_int(args[1]);
1551                 if (denom == 0) {
1552                     value = "divide by 0";
1553                 } else {
1554                     value = str(string_to_int(args[0]) / denom);
1555                 }
1556                 break;
1557             }
1558             case CMD_emptydocs: {
1559                 string t;
1560                 if (!args.empty())
1561                     t = args[0];
1562                 Xapian::PostingIterator i;
1563                 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1564                     if (i.get_doclength() != 0) continue;
1565                     if (!value.empty()) value += '\t';
1566                     value += str(*i);
1567                 }
1568                 break;
1569             }
1570             case CMD_env: {
1571                 char *env = getenv(args[0].c_str());
1572                 if (env != NULL) value = env;
1573                 break;
1574             }
1575             case CMD_eq:
1576                 if (args[0] == args[1]) value = "true";
1577                 break;
1578             case CMD_error:
1579                 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1580                     error_msg = "Database '" + dbname + "' couldn't be opened";
1581                 }
1582                 value = error_msg;
1583                 break;
1584             case CMD_field: {
1585                 Xapian::docid did = q0;
1586                 if (args.size() > 1) did = string_to_int(args[1]);
1587                 value = fields.get_field(did, args[0]);
1588                 break;
1589             }
1590             case CMD_filesize: {
1591                 // FIXME: rounding?  i18n?
1592                 int size = string_to_int(args[0]);
1593                 int intpart = size;
1594                 int fraction = -1;
1595                 const char * format = 0;
1596                 if (size < 0) {
1597                     // Negative size -> empty result.
1598                 } else if (size == 1) {
1599                     format = "%d byte";
1600                 } else if (size < 1024) {
1601                     format = "%d bytes";
1602                 } else {
1603                     if (size < 1024 * 1024) {
1604                         format = "%d.%cK";
1605                     } else {
1606                         size /= 1024;
1607                         if (size < 1024 * 1024) {
1608                             format = "%d.%cM";
1609                         } else {
1610                             size /= 1024;
1611                             format = "%d.%cG";
1612                         }
1613                     }
1614                     intpart = unsigned(size) / 1024;
1615                     fraction = unsigned(size) % 1024;
1616                 }
1617                 if (format) {
1618                     char buf[200];
1619                     int len;
1620                     if (fraction == -1) {
1621                         len = my_snprintf(buf, sizeof(buf), format, intpart);
1622                     } else {
1623                         fraction = (fraction * 10 / 1024) + '0';
1624                         len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1625                     }
1626                     if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1627                     value.assign(buf, len);
1628                 }
1629                 break;
1630             }
1631             case CMD_filters:
1632                 value = filters;
1633                 break;
1634             case CMD_filterterms: {
1635                 Xapian::TermIterator term = db.allterms_begin();
1636                 term.skip_to(args[0]);
1637                 while (term != db.allterms_end()) {
1638                     string t = *term;
1639                     if (!startswith(t, args[0])) break;
1640                     value += t;
1641                     value += '\t';
1642                     ++term;
1643                 }
1644
1645                 if (!value.empty()) value.erase(value.size() - 1);
1646                 break;
1647             }
1648             case CMD_find: {
1649                 string l = args[0], s = args[1];
1650                 string::size_type i = 0, j = 0;
1651                 size_t count = 0;
1652                 while (j != l.size()) {
1653                     j = l.find('\t', i);
1654                     if (j == string::npos) j = l.size();
1655                     if (j - i == s.length()) {
1656                         if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1657                             value = str(count);
1658                             break;
1659                         }
1660                     }
1661                     ++count;
1662                     i = j + 1;
1663                 }
1664                 break;
1665             }
1666             case CMD_fmt:
1667                 value = fmtname;
1668                 break;
1669             case CMD_foreach:
1670                 if (!args[0].empty()) {
1671                     value = foreach(args[0], args[1], param);
1672                 }
1673                 break;
1674             case CMD_freq: {
1675                 const string& term = args[0];
1676                 Xapian::doccount termfreq = 0;
1677                 if (done_query) {
1678                     try {
1679                         termfreq = mset.get_termfreq(term);
1680                     } catch (const Xapian::InvalidOperationError&) {
1681                         // In 1.4.x and earlier, InvalidOperationError is
1682                         // thrown if the MSet is empty and not associated with
1683                         // an Enquire object.  In 1.5.0 and later, a termfreq
1684                         // of 0 is returned for this case.
1685                     }
1686                 }
1687                 if (termfreq == 0) {
1688                     // We want $freq to work before the match is run, and we
1689                     // don't want using it to force the match to run.
1690                     termfreq = db.get_termfreq(term);
1691                 }
1692                 value = str(termfreq);
1693                 break;
1694             }
1695             case CMD_ge:
1696                 if (string_to_int(args[0]) >= string_to_int(args[1]))
1697                     value = "true";
1698                 break;
1699             case CMD_gt:
1700                 if (string_to_int(args[0]) > string_to_int(args[1]))
1701                     value = "true";
1702                 break;
1703             case CMD_highlight: {
1704                 string bra, ket;
1705                 if (args.size() > 2) {
1706                     bra = args[2];
1707                     if (args.size() > 3) {
1708                         ket = args[3];
1709                     } else {
1710                         string::const_iterator i;
1711                         i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1712                         ket = "</";
1713                         ket.append(bra, 1, i - bra.begin() - 1);
1714                         ket += '>';
1715                     }
1716                 }
1717
1718                 value = html_highlight(args[0], args[1], bra, ket);
1719                 break;
1720             }
1721             case CMD_hit:
1722                 // 0-based mset index
1723                 value = str(hit_no);
1724                 break;
1725             case CMD_hitlist: {
1726 #if 0
1727                 url_query_string = "?DB=";
1728                 url_query_string += dbname;
1729                 for (auto& j : query_strings) {
1730                     if (j.first.empty()) {
1731                         url_query_string += "&P=";
1732                     } else {
1733                         url_query_string += "&P."
1734                         url_query_string += j.first;
1735                         url_query_string += '=';
1736                     }
1737                     const char *q = j.second.c_str();
1738                     int ch;
1739                     while ((ch = *q++) != '\0') {
1740                         switch (ch) {
1741                           case '+':
1742                             url_query_string += "%2b";
1743                             break;
1744                           case '"':
1745                             url_query_string += "%22";
1746                             break;
1747                           case '%':
1748                             url_query_string += "%25";
1749                             break;
1750                           case '&':
1751                             url_query_string += "%26";
1752                             break;
1753                           case ' ':
1754                             ch = '+';
1755                             /* fall through */
1756                           default:
1757                             url_query_string += ch;
1758                         }
1759                     }
1760                 }
1761                 // add any boolean terms
1762                 for (auto i = filter_map.begin(); i != filter_map.end(); ++i) {
1763                     url_query_string += "&B=";
1764                     url_query_string += i->second;
1765                 }
1766 #endif
1767                 auto save_hit_no = hit_no;
1768                 for (hit_no = topdoc; hit_no < last; ++hit_no)
1769                     value += print_caption(args[0], param);
1770                 hit_no = save_hit_no;
1771                 break;
1772             }
1773             case CMD_hitsperpage:
1774                 value = str(hits_per_page);
1775                 break;
1776             case CMD_hostname: {
1777                 value = args[0];
1778                 // remove URL scheme and/or path
1779                 string::size_type i = value.find("://");
1780                 if (i == string::npos) i = 0; else i += 3;
1781                 value = value.substr(i, value.find('/', i) - i);
1782                 // remove user@ or user:password@
1783                 i = value.find('@');
1784                 if (i != string::npos) value.erase(0, i + 1);
1785                 // remove :port
1786                 i = value.find(':');
1787                 if (i != string::npos) value.resize(i);
1788                 break;
1789             }
1790             case CMD_html:
1791                 value = html_escape(args[0]);
1792                 break;
1793             case CMD_htmlstrip:
1794                 value = html_strip(args[0]);
1795                 break;
1796             case CMD_httpheader:
1797                 if (!suppress_http_headers) {
1798                     cout << args[0] << ": " << args[1] << endl;
1799                     if (!set_content_type && args[0].length() == 12 &&
1800                             strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1801                         set_content_type = true;
1802                     }
1803                 }
1804                 break;
1805             case CMD_id:
1806                 // document id
1807                 value = str(q0);
1808                 break;
1809             case CMD_if:
1810                 if (args.size() > 1 && !args[0].empty())
1811                     value = eval(args[1], param);
1812                 else if (args.size() > 2)
1813                     value = eval(args[2], param);
1814                 break;
1815             case CMD_include: {
1816                 if (args.size() == 1) {
1817                     value = eval_file(args[0]);
1818                 } else {
1819                     bool fallback = false;
1820                     value = eval_file(args[0], &fallback);
1821                     if (fallback) {
1822                         value = eval(args[1], param);
1823                     }
1824                 }
1825                 break;
1826             }
1827             case CMD_json:
1828                 value = args[0];
1829                 json_escape(value);
1830                 break;
1831             case CMD_jsonarray: {
1832                 const string & l = args[0];
1833                 string::size_type i = 0, j;
1834                 if (l.empty()) {
1835                     value = "[]";
1836                     break;
1837                 }
1838                 vector<string> new_args(1);
1839                 value = "[";
1840                 while (true) {
1841                     j = l.find('\t', i);
1842                     string elt(l, i, j - i);
1843                     if (args.size() == 1) {
1844                         value += '"';
1845                         json_escape(elt);
1846                         value += elt;
1847                         value += '"';
1848                     } else {
1849                         new_args[0] = std::move(elt);
1850                         value += eval(args[1], new_args);
1851                     }
1852                     if (j == string::npos) break;
1853                     value += ',';
1854                     i = j + 1;
1855                 }
1856                 value += ']';
1857                 break;
1858             }
1859             case CMD_jsonbool:
1860                 value = args[0].empty() ? "false" : "true";
1861                 break;
            case CMD_jsonobject: {
                // Build JSON from the entries of the option map whose keys
                // are of the form "<args[0]>,<sub-key>".  args[1] and args[2],
                // when present and non-empty, are evaluated to format each
                // sub-key and each value respectively.
                vector<string> new_args;
                new_args.push_back(string());

                // Minimal range adaptor so that a pair of map iterators can be
                // handed to to_json() as a single object with begin()/end().
                class map_range {
                    typedef map<string, string>::const_iterator iterator;
                    iterator b, e;

                  public:
                    map_range(iterator b_, iterator e_) : b(b_), e(e_) {}

                    iterator begin() const { return b; }
                    iterator end() const { return e; }
                };

                string prefix = args[0] + ',';
                auto b = option.lower_bound(prefix);
                // Bump the trailing ',' to the next character value so that
                // the second lower_bound() lands just past the last key which
                // starts with the original prefix.  The length of prefix is
                // unchanged, so prefix.size() is still usable below.
                ++prefix.back();
                auto e = option.lower_bound(prefix);
                value = to_json(map_range(b, e),
                                [&](const string& k) {
                                    // Strip "<args[0]>," to leave the sub-key.
                                    string key(k, prefix.size());
                                    if (args.size() > 1 && !args[1].empty()) {
                                        new_args[0] = std::move(key);
                                        key = eval(args[1], new_args);
                                    }
                                    return key;
                                },
                                [&](const string& v) {
                                    if (args.size() > 2 && !args[2].empty()) {
                                        new_args[0] = v;
                                        return eval(args[2], new_args);
                                    }
                                    // Default: emit the value as a quoted,
                                    // escaped JSON string.
                                    string r(1, '"');
                                    string elt = v;
                                    json_escape(elt);
                                    r += elt;
                                    r += '"';
                                    return r;
                                });
                break;
            }
1904             case CMD_keys: {
1905                 string prefix = args[0] + ',';
1906                 auto i = option.lower_bound(prefix);
1907                 for (; i != option.end() && startswith(i->first, prefix); ++i) {
1908                     const string& key = i->first;
1909                     if (!value.empty()) value += '\t';
1910                     value.append(key, prefix.size(), string::npos);
1911                 }
1912                 break;
1913             }
1914             case CMD_last:
1915                 value = str(last);
1916                 break;
1917             case CMD_lastpage: {
1918                 int l = mset.get_matches_estimated();
1919                 if (l > 0) l = (l - 1) / hits_per_page + 1;
1920                 value = str(l);
1921                 break;
1922             }
1923             case CMD_le:
1924                 if (string_to_int(args[0]) <= string_to_int(args[1]))
1925                     value = "true";
1926                 break;
1927             case CMD_length:
1928                 if (args[0].empty()) {
1929                     value = "0";
1930                 } else {
1931                     size_t length = count(args[0].begin(), args[0].end(), '\t');
1932                     value = str(length + 1);
1933                 }
1934                 break;
1935             case CMD_list: {
1936                 if (!args[0].empty()) {
1937                     string pre, inter, interlast, post;
1938                     switch (args.size()) {
1939                      case 2:
1940                         inter = interlast = args[1];
1941                         break;
1942                      case 3:
1943                         inter = args[1];
1944                         interlast = args[2];
1945                         break;
1946                      case 4:
1947                         pre = args[1];
1948                         inter = interlast = args[2];
1949                         post = args[3];
1950                         break;
1951                      case 5:
1952                         pre = args[1];
1953                         inter = args[2];
1954                         interlast = args[3];
1955                         post = args[4];
1956                         break;
1957                     }
1958                     value += pre;
1959                     string list = args[0];
1960                     string::size_type split = 0, split2;
1961                     while ((split2 = list.find('\t', split)) != string::npos) {
1962                         if (split) value += inter;
1963                         value.append(list, split, split2 - split);
1964                         split = split2 + 1;
1965                     }
1966                     if (split) value += interlast;
1967                     value.append(list, split, string::npos);
1968                     value += post;
1969                 }
1970                 break;
1971             }
1972             case CMD_log: {
1973                 if (!vet_filename(args[0])) break;
1974                 string logfile = log_dir + args[0];
1975                 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1976                 if (fd == -1) break;
1977                 vector<string> noargs;
1978                 noargs.resize(1);
1979                 string line;
1980                 if (args.size() > 1) {
1981                     line = args[1];
1982                 } else {
1983                     line = DEFAULT_LOG_ENTRY;
1984                 }
1985                 line = eval(line, noargs);
1986                 line += '\n';
1987                 (void)write_all(fd, line.data(), line.length());
1988                 close(fd);
1989                 break;
1990             }
1991             case CMD_lookup: {
1992                 if (!vet_filename(args[0])) break;
1993                 string cdbfile = cdb_dir + args[0];
1994                 int fd = open(cdbfile.c_str(), O_RDONLY);
1995                 if (fd == -1) break;
1996
1997                 struct cdb cdb;
1998                 if (cdb_init(&cdb, fd) < 0) {
1999                     close(fd);
2000                     break;
2001                 }
2002
2003                 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
2004                     size_t datalen = cdb_datalen(&cdb);
2005                     const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
2006                     if (dat) {
2007                         value.assign(static_cast<const char *>(dat), datalen);
2008                     }
2009                 }
2010
2011                 cdb_free(&cdb);
2012                 close(fd); // FIXME: cache fds?
2013                 break;
2014             }
2015             case CMD_lower:
2016                 value = Xapian::Unicode::tolower(args[0]);
2017                 break;
2018             case CMD_lt:
2019                 if (string_to_int(args[0]) < string_to_int(args[1]))
2020                     value = "true";
2021                 break;
2022             case CMD_map:
2023                 if (!args[0].empty()) {
2024                     value = foreach(args[0], args[1], param, '\t');
2025                 }
2026                 break;
2027             case CMD_match:
2028                 omegascript_match(value, args);
2029                 break;
2030             case CMD_max: {
2031                 vector<string>::const_iterator i = args.begin();
2032                 int val = string_to_int(*i++);
2033                 for (; i != args.end(); ++i) {
2034                     int x = string_to_int(*i);
2035                     if (x > val) val = x;
2036                 }
2037                 value = str(val);
2038                 break;
2039             }
2040             case CMD_min: {
2041                 vector<string>::const_iterator i = args.begin();
2042                 int val = string_to_int(*i++);
2043                 for (; i != args.end(); ++i) {
2044                     int x = string_to_int(*i);
2045                     if (x < val) val = x;
2046                 }
2047                 value = str(val);
2048                 break;
2049             }
2050             case CMD_mod: {
2051                 int denom = string_to_int(args[1]);
2052                 if (denom == 0) {
2053                     value = "divide by 0";
2054                 } else {
2055                     value = str(string_to_int(args[0]) % denom);
2056                 }
2057                 break;
2058             }
2059             case CMD_msize:
2060                 // Estimated number of matches.
2061                 value = str(mset.get_matches_estimated());
2062                 break;
2063             case CMD_msizeexact:
2064                 // Is msize exact?
2065                 if (mset.get_matches_lower_bound()
2066                     == mset.get_matches_upper_bound())
2067                     value = "true";
2068                 break;
2069             case CMD_msizelower:
2070                 // Lower bound on number of matches.
2071                 value = str(mset.get_matches_lower_bound());
2072                 break;
2073             case CMD_msizeupper:
2074                 // Upper bound on number of matches.
2075                 value = str(mset.get_matches_upper_bound());
2076                 break;
2077             case CMD_mul: {
2078                 vector<string>::const_iterator i = args.begin();
2079                 int total = string_to_int(*i++);
2080                 while (i != args.end())
2081                     total *= string_to_int(*i++);
2082                 value = str(total);
2083                 break;
2084             }
2085             case CMD_muldiv: {
2086                 int denom = string_to_int(args[2]);
2087                 if (denom == 0) {
2088                     value = "divide by 0";
2089                 } else {
2090                     int num = string_to_int(args[0]) * string_to_int(args[1]);
2091                     value = str(num / denom);
2092                 }
2093                 break;
2094             }
2095             case CMD_ne:
2096                 if (args[0] != args[1]) value = "true";
2097                 break;
2098             case CMD_nice: {
2099                 string::const_iterator i = args[0].begin();
2100                 int len = args[0].length();
2101                 while (len) {
2102                     value += *i++;
2103                     if (--len && len % 3 == 0) value += option["thousand"];
2104                 }
2105                 break;
2106             }
2107             case CMD_not:
2108                 if (args[0].empty()) value = "true";
2109                 break;
2110             case CMD_now:
2111                 value = str(static_cast<unsigned long>(time(NULL)));
2112                 break;
2113             case CMD_opt:
2114                 if (args.size() == 2) {
2115                     value = option[args[0] + "," + args[1]];
2116                 } else {
2117                     value = option[args[0]];
2118                 }
2119                 break;
2120             case CMD_or: {
2121                 for (auto&& arg : args) {
2122                     value = eval(arg, param);
2123                     if (!value.empty()) break;
2124                 }
2125                 break;
2126             }
2127             case CMD_ord: {
2128                 if (!args[0].empty()) {
2129                     Utf8Iterator it(args[0]);
2130                     value = str(*it);
2131                 }
2132                 break;
2133             }
2134             case CMD_pack:
2135                 value = int_to_binary_string(string_to_int(args[0]));
2136                 break;
2137             case CMD_percentage:
2138                 // percentage score
2139                 value = str(percent);
2140                 break;
2141             case CMD_prettyterm:
2142                 value = pretty_term(args[0]);
2143                 break;
2144             case CMD_prettyurl:
2145                 value = args[0];
2146                 url_prettify(value);
2147                 break;
2148             case CMD_query: {
2149                 auto r = query_strings.equal_range(args.empty() ?
2150                                                    string() : args[0]);
2151                 for (auto j = r.first; j != r.second; ++j) {
2152                     if (!value.empty()) value += '\t';
2153                     const string & s = j->second;
2154                     size_t start = 0, tab;
2155                     while ((tab = s.find('\t', start)) != string::npos) {
2156                         value.append(s, start, tab - start);
2157                         value += ' ';
2158                         start = tab + 1;
2159                     }
2160                     value.append(s, start, string::npos);
2161                 }
2162                 break;
2163             }
2164             case CMD_querydescription:
2165                 value = query.get_description();
2166                 break;
2167             case CMD_queryterms:
2168                 value = queryterms;
2169                 break;
2170             case CMD_range: {
2171                 int start = string_to_int(args[0]);
2172                 int end = string_to_int(args[1]);
2173                 while (start <= end) {
2174                     value += str(start);
2175                     if (start < end) value += '\t';
2176                     start++;
2177                 }
2178                 break;
2179             }
2180             case CMD_record: {
2181                 Xapian::docid id = q0;
2182                 if (!args.empty()) id = string_to_int(args[0]);
2183                 value = db.get_document(id).get_data();
2184                 break;
2185             }
2186             case CMD_relevant: {
2187                 // document id if relevant; empty otherwise
2188                 Xapian::docid id = q0;
2189                 if (!args.empty()) id = string_to_int(args[0]);
2190                 auto i = ticked.find(id);
2191                 if (i != ticked.end()) {
2192                     i->second = false; // icky side-effect
2193                     value = str(id);
2194                 }
2195                 break;
2196             }
2197             case CMD_relevants: {
2198                 for (auto i : ticked) {
2199                     if (i.second) {
2200                         value += str(i.first);
2201                         value += '\t';
2202                     }
2203                 }
2204                 if (!value.empty()) value.erase(value.size() - 1);
2205                 break;
2206             }
2207             case CMD_score:
2208                 // Score (0 to 10)
2209                 value = str(percent / 10);
2210                 break;
2211             case CMD_set:
2212                 option[args[0]] = args[1];
2213                 break;
2214             case CMD_seterror:
2215                 error_msg = args[0];
2216                 break;
2217             case CMD_setmap: {
2218                 string base = args[0] + ',';
2219                 if (args.size() % 2 != 1)
2220                     throw string("$setmap requires an odd number of arguments");
2221                 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2222                     option[base + args[i]] = args[i + 1];
2223                 }
2224                 break;
2225             }
2226             case CMD_setrelevant: {
2227                 string::size_type i = 0, j;
2228                 while (true) {
2229                     j = args[0].find_first_not_of("0123456789", i);
2230                     Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2231                     if (id) {
2232                         rset.add_document(id);
2233                         ticked[id] = true;
2234                     }
2235                     if (j == string::npos) break;
2236                     i = j + 1;
2237                 }
2238                 break;
2239             }
2240             case CMD_slice: {
2241                 string list = args[0], pos = args[1];
2242                 vector<string> items;
2243                 string::size_type i = 0, j;
2244                 while (true) {
2245                     j = list.find('\t', i);
2246                     items.push_back(list.substr(i, j - i));
2247                     if (j == string::npos) break;
2248                     i = j + 1;
2249                 }
2250                 i = 0;
2251                 bool have_added = false;
2252                 while (true) {
2253                     j = pos.find('\t', i);
2254                     int item = string_to_int(pos.substr(i, j - i));
2255                     if (item >= 0 && size_t(item) < items.size()) {
2256                         if (have_added) value += '\t';
2257                         value += items[item];
2258                         have_added = true;
2259                     }
2260                     if (j == string::npos) break;
2261                     i = j + 1;
2262                 }
2263                 break;
2264             }
2265             case CMD_snippet: {
2266                 size_t length = 200;
2267                 if (args.size() > 1) {
2268                     length = string_to_int(args[1]);
2269                 }
2270                 if (!stemmer)
2271                     stemmer = new Xapian::Stem(option["stemmer"]);
2272                 // FIXME: Allow start and end highlight and omit to be specified.
2273                 value = mset.snippet(args[0], length, *stemmer,
2274                                      mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2275                                      "<strong>", "</strong>", "...");
2276                 break;
2277             }
2278             case CMD_sort:
2279                 omegascript_sort(args, value);
2280                 break;
            case CMD_split: {
                // Turn a string into a tab-separated list.  With one argument
                // the string is args[0] and the separator is a single space;
                // with more, the separator is args[0] and the string args[1].
                string split;
                if (args.size() == 1) {
                    split = " ";
                    value = args[0];
                } else {
                    split = args[0];
                    value = args[1];
                }
                string::size_type i = 0;
                while (true) {
                    if (split.empty()) {
                        // An empty separator splits between every character:
                        // step past one and stop once the end is reached.
                        ++i;
                        if (i >= value.size()) break;
                    } else {
                        i = value.find(split, i);
                        if (i == string::npos) break;
                    }
                    // Swap the separator (zero-length in the empty case) for
                    // a single tab, then carry on just after that tab.
                    value.replace(i, split.size(), 1, '\t');
                    ++i;
                }
                break;
            }
2304             case CMD_stoplist: {
2305                 Xapian::TermIterator i = qp.stoplist_begin();
2306                 Xapian::TermIterator end = qp.stoplist_end();
2307                 while (i != end) {
2308                     if (!value.empty()) value += '\t';
2309                     value += *i;
2310                     ++i;
2311                 }
2312                 break;
2313             }
2314             case CMD_sub:
2315                 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2316                 break;
2317             case CMD_subdb: {
2318                 Xapian::docid id = q0;
2319                 if (args.size() > 0) id = string_to_int(args[0]);
2320                 value = subdbs[(id - 1) % subdbs.size()].get_name();
2321                 break;
2322             }
2323             case CMD_subid: {
2324                 Xapian::docid id = q0;
2325                 if (args.size() > 0) id = string_to_int(args[0]);
2326                 // This is the docid in the single shard.
2327                 Xapian::docid shard_did = (id - 1) / subdbs.size() + 1;
2328                 // We now need to map this back to the docid in the collection
2329                 // of shards specified by the DB parameter value which $subdb
2330                 // returns.
2331                 const SubDB& subdb = subdbs[(id - 1) % subdbs.size()];
2332                 value = str(subdb.map_docid(shard_did));
2333                 break;
2334             }
            case CMD_substr: {
                // Extract part of args[0], starting at position args[1] and
                // optionally limited by the length in args[2].  Negative
                // values for either count back from the end of the string.
                int start = string_to_int(args[1]);
                if (start < 0) {
                    // Negative start: offset from the end, clamped so that it
                    // can't fall before the beginning of the string.
                    if (static_cast<size_t>(-start) >= args[0].size()) {
                        start = 0;
                    } else {
                        start = static_cast<int>(args[0].size()) + start;
                    }
                } else {
                    // A start at or beyond the end gives an empty result.
                    if (static_cast<size_t>(start) >= args[0].size()) break;
                }
                // With no length argument, take everything from start onwards.
                size_t len = string::npos;
                if (args.size() > 2) {
                    int int_len = string_to_int(args[2]);
                    if (int_len >= 0) {
                        len = size_t(int_len);
                    } else {
                        // Negative length: leave that many characters off the
                        // end, giving an empty result if nothing would remain
                        // after start.
                        len = args[0].size() - start;
                        if (static_cast<size_t>(-int_len) >= len) {
                            len = 0;
                        } else {
                            len -= static_cast<size_t>(-int_len);
                        }
                    }
                }
                value.assign(args[0], start, len);
                break;
            }
2363             case CMD_suggestion:
2364                 value = qp.get_corrected_query_string();
2365                 break;
2366             case CMD_switch: {
2367                 const string& val = args[0];
2368                 for (size_t i = 1; i < args.size(); i += 2) {
2369                     if (i == args.size() - 1) {
2370                         // Handle optional "else" value.
2371                         value = eval(args[i], param);
2372                         break;
2373                     }
2374                     if (val == eval(args[i], param)) {
2375                         value = eval(args[i + 1], param);
2376                         break;
2377                     }
2378                 }
2379                 break;
2380             }
2381             case CMD_termprefix:
2382                 (void)prefix_from_term(&value, args[0]);
2383                 break;
2384             case CMD_terms: {
2385                 // list of matching terms
2386                 if (!enquire) break;
2387                 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2388                 if (args.empty()) {
2389                     while (term != enquire->get_matching_terms_end(q0)) {
2390                         // check term was in the typed query so we ignore
2391                         // boolean filter terms
2392                         const string & t = *term;
2393                         if (termset.find(t) != termset.end()) {
2394                             value += t;
2395                             value += '\t';
2396                         }
2397                         ++term;
2398                     }
2399                 } else {
2400                     // Return matching terms with specified prefix.  We can't
2401                     // use skip_to() as the terms aren't ordered by termname.
2402                     const string & pfx = args[0];
2403                     while (term != enquire->get_matching_terms_end(q0)) {
2404                         const string & t = *term;
2405                         if (startswith(t, pfx)) {
2406                             value += t;
2407                             value += '\t';
2408                         }
2409                         ++term;
2410                     }
2411                 }
2412
2413                 if (!value.empty()) value.erase(value.size() - 1);
2414                 break;
2415             }
2416             case CMD_thispage:
2417                 value = str(topdoc / hits_per_page + 1);
2418                 break;
2419             case CMD_time:
2420                 if (secs >= 0) {
2421                     char buf[64];
2422                     my_snprintf(buf, sizeof(buf), "%.6f", secs);
2423                     // MSVC's snprintf omits the zero byte if the string is
2424                     // sizeof(buf) long.
2425                     buf[sizeof(buf) - 1] = '\0';
2426                     value = buf;
2427                 }
2428                 break;
2429             case CMD_topdoc:
2430                 // first document on current page of hit list (counting from 0)
2431                 value = str(topdoc);
2432                 break;
2433             case CMD_topterms:
2434                 if (enquire) {
2435                     int howmany = 16;
2436                     if (!args.empty()) howmany = string_to_int(args[0]);
2437                     if (howmany < 0) howmany = 0;
2438
2439                     // List of expand terms
2440                     Xapian::ESet eset;
2441                     OmegaExpandDecider decider(db, &termset);
2442
2443                     if (!rset.empty()) {
2444                         set_expansion_scheme(*enquire, option);
2445                         eset = enquire->get_eset(howmany * 2, rset, &decider);
2446                     } else if (mset.size()) {
2447                         // invent an rset
2448                         Xapian::RSet tmp;
2449
2450                         int c = 5;
2451                         // FIXME: what if mset does not start at first match?
2452                         for (Xapian::docid did : mset) {
2453                             tmp.add_document(did);
2454                             if (--c == 0) break;
2455                         }
2456
2457                         set_expansion_scheme(*enquire, option);
2458                         eset = enquire->get_eset(howmany * 2, tmp, &decider);
2459                     }
2460
2461                     // Don't show more than one word with the same stem.
2462                     set<string> stems;
2463                     Xapian::ESetIterator i;
2464                     for (i = eset.begin(); i != eset.end(); ++i) {
2465                         string term(*i);
2466                         string stem = (*stemmer)(term);
2467                         if (stems.find(stem) != stems.end()) continue;
2468                         stems.insert(stem);
2469                         value += term;
2470                         value += '\t';
2471                         if (--howmany == 0) break;
2472                     }
2473                     if (!value.empty()) value.erase(value.size() - 1);
2474                 }
2475                 break;
2476             case CMD_transform:
2477                 omegascript_transform(value, args);
2478                 break;
2479             case CMD_truncate:
2480                 value = generate_sample(args[0],
2481                                         string_to_int(args[1]),
2482                                         args.size() > 2 ? args[2] : string(),
2483                                         args.size() > 3 ? args[3] : string());
2484                 break;
2485             case CMD_uniq: {
2486                 const string &list = args[0];
2487                 if (list.empty()) break;
2488                 string::size_type split = 0, split2;
2489                 string prev;
2490                 do {
2491                     split2 = list.find('\t', split);
2492                     string item(list, split, split2 - split);
2493                     if (split == 0) {
2494                         value = item;
2495                     } else if (item != prev) {
2496                         value += '\t';
2497                         value += item;
2498                     }
2499                     prev = item;
2500                     split = split2 + 1;
2501                 } while (split2 != string::npos);
2502                 break;
2503             }
2504             case CMD_unique: {
2505                 unordered_set<string> seen;
2506                 const string &list = args[0];
2507                 if (list.empty()) break;
2508                 string::size_type split = 0, split2;
2509                 do {
2510                     split2 = list.find('\t', split);
2511                     string item(list, split, split2 - split);
2512                     if (seen.insert(item).second) {
2513                         if (split != 0)
2514                             value += '\t';
2515                         value += item;
2516                     }
2517                     split = split2 + 1;
2518                 } while (split2 != string::npos);
2519                 break;
2520             }
2521             case CMD_unpack:
2522                 value = str(binary_string_to_int(args[0]));
2523                 break;
2524             case CMD_unprefix: {
2525                 size_t prefix_len = prefix_from_term(NULL, args[0]);
2526                 value.assign(args[0], prefix_len, string::npos);
2527                 break;
2528             }
2529             case CMD_unstem: {
2530                 const string &term = args[0];
2531                 Xapian::TermIterator i = qp.unstem_begin(term);
2532                 Xapian::TermIterator end = qp.unstem_end(term);
2533                 while (i != end) {
2534                     if (!value.empty()) value += '\t';
2535                     value += *i;
2536                     ++i;
2537                 }
2538                 break;
2539             }
2540             case CMD_upper:
2541                 value = Xapian::Unicode::toupper(args[0]);
2542                 break;
2543             case CMD_url:
2544                 url_encode(value, args[0]);
2545                 break;
2546             case CMD_value: {
2547                 Xapian::docid id = q0;
2548                 Xapian::valueno value_no = string_to_int(args[0]);
2549                 if (args.size() > 1) id = string_to_int(args[1]);
2550                 value = db.get_document(id).get_value(value_no);
2551                 break;
2552             }
2553             case CMD_version:
2554                 value = PACKAGE_STRING;
2555                 break;
2556             case CMD_weight:
2557                 value = double_to_string(weight);
2558                 break;
2559             default: {
2560                 args.insert(args.begin(), param[0]);
2561                 int macro_no = func->second->tag - CMD_MACRO;
2562                 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2563                 // throw "Unknown function '" + var + "'";
2564                 value = eval(macros[macro_no], args);
2565                 break;
2566             }
2567         }
2568         res += value;
2569     } catch (const Xapian::Error & e) {
2570         // FIXME: this means we only see the most recent error in $error
2571         // - is that the best approach?
2572         error_msg = e.get_description();
2573     }
2574
2575     res.append(fmt, p, string::npos);
2576     return res;
2577 }
2578
2579 static string
2580 eval_file(const string& fmtfile, bool* p_not_found)
2581 {
2582     // Use -1 to indicate vet_filename() failed.
2583     int eno = -1;
2584     if (vet_filename(fmtfile)) {
2585         string file = template_dir + fmtfile;
2586         string fmt;
2587         errno = 0;
2588         if (load_file(file, fmt)) {
2589             vector<string> noargs;
2590             noargs.resize(1);
2591             return eval(fmt, noargs);
2592         }
2593         eno = errno;
2594     }
2595
2596     if (p_not_found) {
2597         *p_not_found = true;
2598         return string();
2599     }
2600
2601     // FIXME: report why!
2602     string msg = string("Couldn't read format template '") + fmtfile + '\'';
2603     if (eno) {
2604         msg += " (";
2605         msg += (eno < 0 ? "name contains '..'" : strerror(eno));
2606         msg += ')';
2607     }
2608     throw msg;
2609 }
2610
2611 extern string
2612 pretty_term(string term)
2613 {
2614     // Just leave empty strings and single characters alone.
2615     if (term.length() <= 1) return term;
2616
2617     // Assume unprefixed terms are unstemmed.
2618     if (!C_isupper(term[0])) return term;
2619
2620     // Handle stemmed terms.
2621     bool stemmed = (term[0] == 'Z');
2622     if (stemmed) {
2623         // First of all, check if a term in the query stemmed to this one.
2624         Xapian::TermIterator u = qp.unstem_begin(term);
2625         // There might be multiple words with the same stem, but we only want
2626         // one so just take the first.
2627         if (u != qp.unstem_end(term)) return *u;
2628
2629         // Remove the 'Z'.
2630         term.erase(0, 1);
2631     }
2632
2633     bool add_quotes = false;
2634
2635     // Check if the term has a prefix.
2636     if (C_isupper(term[0])) {
2637         // See if we have this prefix in the termprefix_to_userprefix map.  If
2638         // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2639         string prefix;
2640         size_t prefix_len = prefix_from_term(&prefix, term);
2641
2642         map<string, string>::const_iterator i;
2643         i = termprefix_to_userprefix.find(prefix);
2644         if (i != termprefix_to_userprefix.end()) {
2645             string user_prefix = i->second;
2646             user_prefix += ':';
2647             term.replace(0, prefix_len, user_prefix);
2648         } else {
2649             // We don't have a prefix mapping for this, so just set a flag to
2650             // add quotes around the term.
2651             add_quotes = true;
2652         }
2653     }
2654
2655     if (stemmed) term += '.';
2656
2657     if (add_quotes) {
2658         term.insert(0, "\"");
2659         term.append("\"");
2660     }
2661
2662     return term;
2663 }
2664
2665 static string
2666 print_caption(const string& fmt, vector<string>& param)
2667 {
2668     q0 = *(mset[hit_no]);
2669
2670     weight = mset[hit_no].get_weight();
2671     percent = mset.convert_to_percent(mset[hit_no]);
2672     collapsed = mset[hit_no].get_collapse_count();
2673
2674     return eval(fmt, param);
2675 }
2676
2677 void
2678 parse_omegascript()
2679 {
2680     try {
2681         string output = eval_file(fmtname);
2682         if (!set_content_type && !suppress_http_headers) {
2683             cout << "Content-Type: text/html" << endl;
2684             set_content_type = true;
2685         }
2686         if (!suppress_http_headers) cout << endl;
2687         cout << output;
2688     } catch (...) {
2689         // Ensure the headers have been output so that any exception gets
2690         // reported rather than giving a server error.
2691         if (!set_content_type && !suppress_http_headers) {
2692             cout << "Content-Type: text/html" << endl;
2693             set_content_type = true;
2694         }
2695         if (!suppress_http_headers) cout << endl;
2696         throw;
2697     }
2698 }
2699
2700 static void
2701 ensure_query_parsed()
2702 {
2703     if (query_parsed) return;
2704     query_parsed = true;
2705
2706     // Should we discard the existing R-set recorded in R CGI parameters?
2707     bool discard_rset = false;
2708
2709     // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2710     // CGI parameters)?
2711     bool force_first_page = false;
2712
2713     string v;
2714     // get list of terms from previous iteration of query
2715     auto val = cgi_params.find("xP");
2716     if (val != cgi_params.end()) {
2717         v = val->second;
2718         // If xP given, default to discarding any RSet and forcing the first
2719         // page of results.  If the query is the same, or an extension of
2720         // the previous query, we adjust these again below.
2721         discard_rset = true;
2722         force_first_page = true;
2723     }
2724     querytype result = parse_queries(v);
2725     switch (result) {
2726         case BAD_QUERY:
2727             break;
2728         case NEW_QUERY:
2729             break;
2730         case SAME_QUERY:
2731         case EXTENDED_QUERY:
2732             // If we've changed database, force the first page of hits
2733             // and discard the R-set (since the docids will have changed)
2734             val = cgi_params.find("xDB");
2735             if (val != cgi_params.end() && val->second != dbname) break;
2736             if (result == SAME_QUERY && force_first_page) {
2737                 val = cgi_params.find("xFILTERS");
2738                 if (val != cgi_params.end() && val->second != filters &&
2739                     val->second != old_filters) {
2740                     // Filters have changed since last query.
2741                 } else {
2742                     force_first_page = false;
2743                 }
2744             }
2745             discard_rset = false;
2746             break;
2747     }
2748
2749     if (!force_first_page) {
2750         // Work out which mset element is the first hit we want
2751         // to display
2752         val = cgi_params.find("TOPDOC");
2753         if (val != cgi_params.end()) {
2754             topdoc = atol(val->second.c_str());
2755         }
2756
2757         // Handle next, previous, and page links
2758         if (cgi_params.find(">") != cgi_params.end()) {
2759             topdoc += hits_per_page;
2760         } else if (cgi_params.find("<") != cgi_params.end()) {
2761             if (topdoc >= hits_per_page)
2762                 topdoc -= hits_per_page;
2763             else
2764                 topdoc = 0;
2765         } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2766                    (val = cgi_params.find("#")) != cgi_params.end()) {
2767             long page = atol(val->second.c_str());
2768             // Do something sensible for page 0 (we count pages from 1).
2769             if (page == 0) page = 1;
2770             topdoc = (page - 1) * hits_per_page;
2771         }
2772
2773         // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2774         // Normally we snap TOPDOC like this so that things work nicely if
2775         // HITSPERPAGE is in a <select> or on radio buttons.  If we're
2776         // postprocessing the output of omega and want variable sized pages,
2777         // this is unhelpful.
2778         bool raw_search = false;
2779         val = cgi_params.find("RAWSEARCH");
2780         if (val != cgi_params.end()) {
2781             raw_search = bool(atol(val->second.c_str()));
2782         }
2783
2784         if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2785     }
2786
2787     if (!discard_rset) {
2788         // put documents marked as relevant into the rset
2789         auto g = cgi_params.equal_range("R");
2790         for (auto i = g.first; i != g.second; ++i) {
2791             const string & value = i->second;
2792             for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2793                 while (value[j] == '.') ++j;
2794                 Xapian::docid d = atoi(value.c_str() + j);
2795                 if (d) {
2796                     rset.add_document(d);
2797                     ticked[d] = true;
2798                 }
2799             }
2800         }
2801     }
2802 }
2803
2804 // run query if we haven't already
2805 static void
2806 ensure_match()
2807 {
2808     if (done_query) return;
2809
2810     secs = RealTime::now();
2811     run_query();
2812     if (secs != -1)
2813         secs = RealTime::now() - secs;
2814
2815     done_query = true;
2816     last = mset.get_matches_lower_bound();
2817     if (last == 0) {
2818         // Otherwise topdoc ends up being -6 if it's non-zero!
2819         topdoc = 0;
2820     } else {
2821         if (topdoc >= last)
2822             topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2823         // last is the count of documents up to the end of the current page
2824         // (as returned by $last)
2825         if (topdoc + hits_per_page < last)
2826             last = topdoc + hits_per_page;
2827     }
2828 }
2829
2830 // OmegaExpandDecider methods.
2831
2832 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2833                                        set<string> * querytermset)
2834     : db(db_)
2835 {
2836     // We'll want the stemmer for testing matches anyway.
2837     if (!stemmer)
2838         stemmer = new Xapian::Stem(option["stemmer"]);
2839     if (querytermset) {
2840         set<string>::const_iterator i;
2841         for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2842             string term(*i);
2843             if (term.empty()) continue;
2844
2845             unsigned char ch = term[0];
2846             bool stemmed = (ch == 'Z');
2847             if (stemmed) {
2848                 term.erase(0, 1);
2849                 if (term.empty()) continue;
2850                 ch = term[0];
2851             }
2852
2853             if (C_isupper(ch)) {
2854                 size_t prefix_len = prefix_from_term(NULL, term);
2855                 term.erase(0, prefix_len);
2856             }
2857
2858             if (!stemmed) term = (*stemmer)(term);
2859
2860             exclude_stems.insert(term);
2861         }
2862     }
2863 }
2864
2865 bool
2866 OmegaExpandDecider::operator()(const string & term) const
2867 {
2868     unsigned char ch = term[0];
2869
2870     // Reject terms with a prefix.
2871     if (C_isupper(ch)) return false;
2872
2873     {
2874         MyStopper stopper;
2875         // Don't suggest stopwords.
2876         if (stopper(term)) return false;
2877     }
2878
2879     // Reject small numbers.
2880     if (term.size() < 4 && C_isdigit(ch)) return false;
2881
2882     // Reject terms containing a space.
2883     if (term.find(' ') != string::npos) return false;
2884
2885     // Skip terms with stems in the exclude_stems set, to avoid suggesting
2886     // terms which are already in the query in some form.
2887     string stem = (*stemmer)(term);
2888     if (exclude_stems.find(stem) != exclude_stems.end())
2889         return false;
2890
2891     // Ignore terms that only occur once (hapaxes) since they aren't
2892     // useful for finding related documents - they only occur in a
2893     // document that's already been marked as relevant.
2894     // FIXME: add an expand option to ignore terms where
2895     // termfreq == rtermfreq.
2896     if (db.get_termfreq(term) <= 1) return false;
2897
2898     return true;
2899 }