Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / query.cc
blobc02e21fed6dafcf6e2b7d66f8c49a2aa064f1afd
1 /** @file
2 * @brief query executor for omega
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002 Intercede 1749 Ltd
8 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017,2018,2019,2020,2021 Olly Betts
9 * Copyright 2008 Thomas Viehmann
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the
14 * License, or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 * USA
27 #include <config.h>
29 #include <algorithm>
30 #include <iostream>
31 #include <map>
32 #include <random>
33 #include <set>
34 #include <unordered_map>
35 #include <unordered_set>
36 #include <vector>
38 #include <cassert>
39 #include <cctype>
40 #include <cerrno>
41 #include <stdio.h>
42 #include <cstdlib>
43 #include <cstring>
44 #include "strcasecmp.h"
45 #include <ctime>
47 #include "safeunistd.h"
48 #include <sys/types.h>
49 #include "safesysstat.h"
50 #include "safefcntl.h"
52 #include "realtime.h"
54 #include <cdb.h>
56 #include "csvescape.h"
57 #include "date.h"
58 #include "datevalue.h"
59 #include "fields.h"
60 #include "jsonescape.h"
61 #include "utils.h"
62 #include "omega.h"
63 #include "query.h"
64 #include "cgiparam.h"
65 #include "loadfile.h"
66 #include "sample.h"
67 #include "sort.h"
68 #include "str.h"
69 #include "stringutils.h"
70 #include "transform.h"
71 #include "urldecode.h"
72 #include "urlencode.h"
73 #include "unixperm.h"
74 #include "values.h"
75 #include "weight.h"
76 #include "expand.h"
77 #include "md5wrap.h"
78 #include "parseint.h"
79 #include <xapian.h>
81 using namespace std;
83 using Xapian::Utf8Iterator;
85 using Xapian::Unicode::is_wordchar;
87 /// Map shard to DB parameter value and stats to allow docid mapping.
88 vector<SubDB> subdbs;
// NOTE(review): query_parsed and done_query presumably record whether
// ensure_query_parsed() and ensure_match() have already done their work; the
// definitions of those functions are outside this part of the file - confirm.
90 static bool query_parsed = false;
91 static bool done_query = false;
// NOTE(review): `last` is not used in this part of the file.  `topdoc` is the
// base offset used when run_query() computes how many hits to ask for.
92 static Xapian::docid last = 0;
93 static Xapian::docid topdoc = 0;
// Result of the match - assigned in run_query().
95 static Xapian::MSet mset;
// Relevance set - passed to get_mset() in run_query().
96 static Xapian::RSet rset;
// NOTE(review): not used in this part of the file.
98 static map<Xapian::docid, bool> ticked;
100 static void ensure_query_parsed();
101 static void ensure_match();
// The query being built: set by parse_queries(), then combined with the
// filters in run_query().
103 static Xapian::Query query;
104 //static string url_query_string;
105 Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode
107 // Maintain an explicit date_filter_set flag - date_filter.empty() will also
108 // be true if a date filter is specified which simplifies to
109 // Query::MatchNothing at construction time.
110 static bool date_filter_set = false;
111 static Xapian::Query date_filter;
// Query parser shared by all query strings - configured in parse_queries().
113 static Xapian::QueryParser qp;
// Range processor for "size:" - created on first use in parse_queries().
114 static Xapian::NumberRangeProcessor * size_rp = NULL;
// Stemmer used for highlighting - created on first use in html_highlight().
115 static Xapian::Stem *stemmer = NULL;
117 static string eval_file(const string& fmtfile, bool* p_not_found = nullptr);
// Terms of the parsed query (filled in by parse_queries()).
119 static set<string> termset;
121 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
122 static map<string, string> termprefix_to_userprefix;
// The terms in termset, tab-separated, in the order they were first seen.
124 static string queryterms;
// Query parser error message (set when parse_queries() returns BAD_QUERY).
126 static string error_msg;
// NOTE(review): not used in this part of the file.
128 static double secs = -1;
// Default OmegaScript for a log line: tab-separated fields giving the remote
// host (or address, or "-"), the date, the kind of request, the database
// name, the query, the match size and (if set) the HTTP referer.
130 static const char DEFAULT_LOG_ENTRY[] =
131 "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
132 "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
133 "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
134 "$dbname\t"
135 "$query\t"
136 "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
138 class MyStopper : public Xapian::Stopper {
139 public:
140 bool operator()(const string& t) const override {
141 switch (t[0]) {
142 case 'a':
143 return (t == "a" || t == "about" || t == "an" || t == "and" ||
144 t == "are" || t == "as" || t == "at");
145 case 'b':
146 return (t == "be" || t == "by");
147 case 'e':
148 return (t == "en");
149 case 'f':
150 return (t == "for" || t == "from");
151 case 'h':
152 return (t == "how");
153 case 'i':
154 return (t == "i" || t == "in" || t == "is" || t == "it");
155 case 'o':
156 return (t == "of" || t == "on" || t == "or");
157 case 't':
158 return (t == "that" || t == "the" || t == "this" || t == "to");
159 case 'w':
160 return (t == "was" || t == "what" || t == "when" ||
161 t == "where" || t == "which" || t == "who" ||
162 t == "why" || t == "will" || t == "with");
163 case 'y':
164 return (t == "you" || t == "your");
165 default:
166 return false;
171 static size_t
172 prefix_from_term(string* prefix, const string& term)
174 if (!term.empty()) {
175 if (term[0] == 'X') {
176 const string::const_iterator begin = term.begin();
177 string::const_iterator i = begin + 1;
178 while (i != term.end() && C_isupper(*i))
179 ++i;
180 if (prefix)
181 prefix->assign(begin, i);
182 if (i != term.end() && *i == ':')
183 ++i;
184 return i - begin;
187 if (C_isupper(term[0])) {
188 if (prefix)
189 *prefix = term[0];
190 return 1;
194 if (prefix)
195 prefix->resize(0);
196 return 0;
// Don't allow ".." in format names, log file names, etc as this would allow
// people to open a format "../../etc/passwd" or similar.
// FIXME: make this check more exact ("foo..bar" is safe)
// FIXME: log when this check fails
/** Check a filename is acceptable.
 *
 *  @return true if @a filename doesn't contain ".." anywhere.
 */
static bool
vet_filename(const std::string &filename)
{
    const bool has_dotdot = (filename.find("..") != std::string::npos);
    return !has_dotdot;
}
// Heuristics:
// * If any terms have been removed, it's a "fresh query" so we discard any
//   relevance judgements
// * If all previous terms are there but more have been added then we keep
//   the relevance judgements, but return the first page of hits
//
// NEW_QUERY       entirely new query
// SAME_QUERY      unchanged query
// EXTENDED_QUERY  new query, but based on the old one
// BAD_QUERY       parse error (message in error_msg)
enum querytype {
    NEW_QUERY,
    SAME_QUERY,
    EXTENDED_QUERY,
    BAD_QUERY
};
// Query strings to parse, keyed by prefix (filled in by add_query_string(),
// consumed by parse_queries()).
222 static multimap<string, string> query_strings;
224 void
225 add_query_string(const string& prefix, const string& s)
227 string query_string = s;
228 // Strip leading and trailing whitespace from query_string.
229 trim(query_string);
230 if (!query_string.empty())
231 query_strings.insert(make_pair(prefix, query_string));
234 static unsigned
235 read_qp_flags(const string & opt_pfx, unsigned f)
237 map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
238 for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
239 unsigned mask = 0;
240 const char * s = i->first.c_str() + opt_pfx.size();
241 switch (s[0]) {
242 case 'a':
243 // Note that the ``Xapian::QueryParser::FLAG_ACCUMULATE`` flag
244 // is or-ed in below because it's needed for ``$stoplist`` and
245 // ``$unstem`` to work correctly, and so is deliberately not
246 // available to specify here.
247 if (strcmp(s, "auto_multiword_synonyms") == 0) {
248 mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
249 break;
251 if (strcmp(s, "auto_synonyms") == 0) {
252 mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
253 break;
255 break;
256 case 'b':
257 if (strcmp(s, "boolean") == 0) {
258 mask = Xapian::QueryParser::FLAG_BOOLEAN;
259 break;
261 if (strcmp(s, "boolean_any_case") == 0) {
262 mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
263 break;
265 break;
266 case 'c':
267 if (strcmp(s, "cjk_ngram") == 0) {
268 mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
269 break;
271 break;
272 case 'd':
273 if (strcmp(s, "default") == 0) {
274 mask = Xapian::QueryParser::FLAG_DEFAULT;
275 break;
277 break;
278 case 'f':
279 if (strcmp(s, "fuzzy") == 0) {
280 mask = Xapian::QueryParser::FLAG_FUZZY;
281 break;
283 break;
284 case 'l':
285 if (strcmp(s, "lovehate") == 0) {
286 mask = Xapian::QueryParser::FLAG_LOVEHATE;
287 break;
289 break;
290 case 'n':
291 if (strcmp(s, "no_positions") == 0) {
292 mask = Xapian::QueryParser::FLAG_NO_POSITIONS;
293 break;
295 if (strcmp(s, "ngrams") == 0) {
296 mask = Xapian::QueryParser::FLAG_NGRAMS;
297 break;
299 break;
300 case 'p':
301 if (strcmp(s, "partial") == 0) {
302 mask = Xapian::QueryParser::FLAG_PARTIAL;
303 break;
305 if (strcmp(s, "phrase") == 0) {
306 mask = Xapian::QueryParser::FLAG_PHRASE;
307 break;
309 if (strcmp(s, "pure_not") == 0) {
310 mask = Xapian::QueryParser::FLAG_PURE_NOT;
311 break;
313 break;
314 case 's':
315 if (strcmp(s, "spelling_correction") == 0) {
316 mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
317 break;
319 if (strcmp(s, "synonym") == 0) {
320 mask = Xapian::QueryParser::FLAG_SYNONYM;
321 break;
323 break;
324 case 'w':
325 if (strcmp(s, "wildcard") == 0) {
326 mask = Xapian::QueryParser::FLAG_WILDCARD;
327 break;
329 #if XAPIAN_AT_LEAST(1,5,0)
330 if (strcmp(s, "wildcard_glob") == 0) {
331 mask = Xapian::QueryParser::FLAG_WILDCARD_GLOB;
332 break;
334 if (strcmp(s, "wildcard_multi") == 0) {
335 mask = Xapian::QueryParser::FLAG_WILDCARD_MULTI;
336 break;
338 if (strcmp(s, "wildcard_single") == 0) {
339 mask = Xapian::QueryParser::FLAG_WILDCARD_SINGLE;
340 break;
342 if (strcmp(s, "word_breaks") == 0) {
343 mask = Xapian::QueryParser::FLAG_WORD_BREAKS;
344 break;
346 #endif
347 break;
350 if (i->second.empty()) {
351 f &= ~mask;
352 } else {
353 f |= mask;
356 // Always enable FLAG_ACCUMULATE so that $stoplist and $unstem report
357 // values accumulated over all query strings parsed as part of a query, not
358 // just the last one parsed.
359 return f | Xapian::QueryParser::FLAG_ACCUMULATE;
// Parse all the query strings recorded by add_query_string() into `query`.
//
// The query parser is configured from the options ("stem_strategy" or
// "stem_all", "prefix,*", "boolprefix,*", "nonexclusiveprefix,*"), then each
// query string is parsed with the stemmer and flags selected for its prefix.
// The non-empty results are combined with AND, or with OR if option
// "intra_query_op" is "OR" or "or".  Terms of the resulting query which
// aren't yet in termset are added to it and appended to queryterms.
//
// oldp is the tab-separated list of terms from the previous query.
//
// Returns BAD_QUERY if the parser reports an error (the message is stored in
// error_msg), otherwise NEW_QUERY, SAME_QUERY or EXTENDED_QUERY according to
// how the terms compare to those in oldp.
362 static querytype
363 parse_queries(const string& oldp)
365 // Parse the query string.
366 auto opt_it = option.find("stem_strategy");
367 if (opt_it != option.end()) {
368 if (opt_it->second == "all") {
369 qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
370 } else if (opt_it->second == "all_z") {
371 qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL_Z);
372 } else if (opt_it->second == "none") {
373 qp.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
374 } else if (opt_it->second == "some") {
375 qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
376 } else if (opt_it->second == "some_full_pos") {
377 qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME_FULL_POS);
379 } else {
// Option "stem_all" is only consulted when "stem_strategy" isn't set.
380 opt_it = option.find("stem_all");
381 if (opt_it != option.end() && opt_it->second == "true") {
382 qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
385 qp.set_stopper((new MyStopper())->release());
386 qp.set_default_op(default_op);
387 qp.set_database(db);
388 // FIXME: provide a custom RP which handles size:10..20K, etc.
389 if (!size_rp)
390 size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
391 qp.add_rangeprocessor(size_rp);
392 map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
393 for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
// The user prefix is the option name with the leading "prefix," (7
// characters) removed.
394 string user_prefix(pfx->first, 7);
// The option value is a tab-separated list of term prefixes.
395 const string & term_pfx_list = pfx->second;
396 string::size_type i = 0;
397 do {
398 string::size_type i0 = i;
399 i = term_pfx_list.find('\t', i);
400 const string & term_pfx = term_pfx_list.substr(i0, i - i0);
401 qp.add_prefix(user_prefix, term_pfx);
402 // std::map::insert() won't overwrite an existing entry, so we'll
403 // prefer the first user_prefix for which a particular term prefix
404 // is specified.
405 termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
// When there are no more tabs find() returns npos, and incrementing that
// wraps to 0 which ends the loop.
406 } while (UNSIGNED_OVERFLOW_OK(++i));
408 pfx = option.lower_bound("boolprefix,");
409 for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
// The user prefix is the option name with the leading "boolprefix," (11
// characters) removed.
410 string user_prefix(pfx->first, 11, string::npos);
// A boolean prefix is exclusive unless a non-empty "nonexclusiveprefix,"
// option is set for its term prefix.
411 auto it = option.find("nonexclusiveprefix," + pfx->second);
412 bool exclusive = (it == option.end() || it->second.empty());
413 qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
414 termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
417 try {
418 unsigned default_flags = read_qp_flags("flag_", 0);
420 vector<Xapian::Query> queries;
421 queries.reserve(query_strings.size());
423 for (auto& j : query_strings) {
424 const string& prefix = j.first;
425 const string& query_string = j.second;
427 // Choose the stemmer to use for this input.
428 string stemlang = option[prefix + ":stemmer"];
429 if (stemlang.empty())
430 stemlang = option["stemmer"];
431 qp.set_stemmer(Xapian::Stem(stemlang));
433 // Work out the flags to use for this input.
434 unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
436 Xapian::Query q = qp.parse_query(query_string, f, prefix);
437 if (!q.empty())
438 queries.push_back(q);
441 Xapian::Query::op intra_query_op = Xapian::Query::OP_AND;
442 if (queries.size() > 1) {
443 // Determine operator to use to combine multiple P and P.<prefix>
444 // parameters. Note that we only need to bother if there are two
445 // or more query strings, since for one or none the operator
446 // specified isn't actually used.
447 opt_it = option.find("intra_query_op");
448 if (opt_it != option.end()) {
449 const string& v = opt_it->second;
450 if (v == "OR" || v == "or") {
451 intra_query_op = Xapian::Query::OP_OR;
455 query = Xapian::Query(intra_query_op, queries.begin(), queries.end());
456 } catch (Xapian::QueryParserError &e) {
457 error_msg = e.get_msg();
458 return BAD_QUERY;
// Count the terms in the query, recording any not seen before in termset and
// queryterms.
461 Xapian::termcount n_new_terms = 0;
462 for (Xapian::TermIterator i = query.get_terms_begin();
463 i != query.get_terms_end(); ++i) {
464 if (termset.find(*i) == termset.end()) {
465 termset.insert(*i);
466 if (!queryterms.empty()) queryterms += '\t';
467 queryterms += *i;
469 n_new_terms++;
472 // Check new query against the previous one
473 if (oldp.empty()) {
474 // If oldp was empty that means there were no parsed query terms
475 // before, so if there are now this is a new query.
476 return n_new_terms ? NEW_QUERY : SAME_QUERY;
479 // The terms in oldp are separated by tabs.
480 const char oldp_separator = '\t';
481 size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
483 // short-cut: if the new query has fewer terms, it must be a new one
484 if (n_new_terms < n_old_terms) return NEW_QUERY;
// If any term from the old query is missing from the new one, it's a new
// query.
486 const char *term = oldp.c_str();
487 const char *pend;
488 while ((pend = strchr(term, oldp_separator)) != NULL) {
489 if (termset.find(string(term, pend - term)) == termset.end())
490 return NEW_QUERY;
491 term = pend + 1;
// Check the final term, which has no separator after it.
493 if (*term) {
494 if (termset.find(string(term)) == termset.end())
495 return NEW_QUERY;
498 // Use termset.size() rather than n_new_terms so we correctly handle
499 // the case when the query has repeated terms.
500 // This works wrongly in the case when the user extends the query
501 // by adding a term already in it, but that's unlikely and the behaviour
502 // isn't too bad (we just don't reset page 1). We also mishandle a few
503 // other obscure cases e.g. adding quotes to turn a query into a phrase.
504 if (termset.size() > n_old_terms) return EXTENDED_QUERY;
505 return SAME_QUERY;
// Boolean filter terms keyed by their term prefix (filled in by add_bterm()).
508 static multimap<string, string> filter_map;
// Terms for negated filters (filled in by add_nterm()).
509 static set<string> neg_filters;
511 void add_bterm(const string &term) {
512 string prefix;
513 if (prefix_from_term(&prefix, term) > 0)
514 filter_map.insert(multimap<string, string>::value_type(prefix, term));
517 void add_nterm(const string &term) {
518 if (!term.empty())
519 neg_filters.insert(term);
522 void
523 add_date_filter(const string& date_start,
524 const string& date_end,
525 const string& date_span,
526 Xapian::valueno date_value_slot)
528 if (date_start.empty() && date_end.empty() && date_span.empty())
529 return;
531 Xapian::Query q;
532 if (date_value_slot != Xapian::BAD_VALUENO) {
533 // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
534 // latter the sort order just works correctly between different
535 // precisions).
536 bool as_time_t =
537 db.get_value_lower_bound(date_value_slot).size() == 4 &&
538 db.get_value_upper_bound(date_value_slot).size() == 4;
539 q = date_value_range(as_time_t, date_value_slot,
540 date_start, date_end,
541 date_span);
542 } else {
543 q = date_range_filter(date_start, date_end, date_span);
544 q |= Xapian::Query("Dlatest");
547 if (date_filter_set) {
548 date_filter &= q;
549 } else {
550 date_filter_set = true;
551 date_filter = q;
// Combine the parsed query with the filters, configure the Enquire object and
// run the match, storing the result in mset.
//
// The stages are:
// * apply the boolean filters from filter_map
// * apply the date filter, if one was set
// * exclude documents matching the negated filters in neg_filters
// * set the weighting scheme, cutoff, sort order, docid order and collapse key
// * if the resulting query isn't empty, run it
555 static void
556 run_query()
// Weighting scheme name - if left empty, option "weighting" is used below
// unless force_boolean is set.
558 string scheme;
// Set when the query is built purely from filters - passed to
// set_weighting_scheme() below.
559 bool force_boolean = false;
560 if (!filter_map.empty()) {
561 // OR together filters with the same prefix (or AND for non-exclusive
562 // prefixes), then AND together the resultant groups.
563 vector<Xapian::Query> filter_vec;
564 vector<string> same_vec;
565 string current;
// The loop deliberately runs one step past the last entry so that the final
// group of terms gets flushed into filter_vec.
566 for (auto i = filter_map.begin(); ; ++i) {
567 bool over = (i == filter_map.end());
568 if (over || i->first != current) {
569 switch (same_vec.size()) {
570 case 0:
571 break;
572 case 1:
573 filter_vec.push_back(Xapian::Query(same_vec[0]));
574 break;
575 default: {
576 Xapian::Query::op op = Xapian::Query::OP_OR;
577 auto it = option.find("nonexclusiveprefix," + current);
578 if (it != option.end() && !it->second.empty()) {
579 op = Xapian::Query::OP_AND;
581 filter_vec.push_back(Xapian::Query(op,
582 same_vec.begin(),
583 same_vec.end()));
584 break;
587 same_vec.clear();
588 if (over) break;
589 current = i->first;
591 same_vec.push_back(i->second);
594 Xapian::Query filter(Xapian::Query::OP_AND,
595 filter_vec.begin(), filter_vec.end());
597 if (query.empty()) {
598 // If no query strings were provided then promote the filters
599 // to be THE query - filtering an empty query will give no
600 // matches.
601 std::swap(query, filter);
// A non-empty option "weightingpurefilter" names the weighting scheme to use
// for a pure filter query; otherwise the match is forced to be boolean.
602 auto&& it = option.find("weightingpurefilter");
603 if (it != option.end() && !it->second.empty()) {
604 scheme = it->second;
605 } else {
606 force_boolean = true;
608 } else {
609 query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
613 if (date_filter_set) {
614 // If no query strings were provided then promote the daterange
615 // filter to be THE query instead of filtering an empty query.
616 if (query.empty()) {
617 query = date_filter;
618 force_boolean = true;
619 } else {
620 query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
624 if (!neg_filters.empty()) {
625 // OR together all negated filters.
626 Xapian::Query filter(Xapian::Query::OP_OR,
627 neg_filters.begin(), neg_filters.end());
629 if (query.empty() && !date_filter_set) {
630 // If we only have a negative filter for the query, use MatchAll as
631 // the query to apply the filters to.
632 query = Xapian::Query::MatchAll;
633 force_boolean = true;
635 query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
// Don't go on to run a match if there's no Enquire object or an error has
// been recorded.
638 if (!enquire || !error_msg.empty()) return;
640 if (!force_boolean && scheme.empty()) {
641 auto&& it = option.find("weighting");
642 if (it != option.end()) scheme = it->second;
644 set_weighting_scheme(*enquire, scheme, force_boolean);
646 enquire->set_cutoff(threshold);
// A key maker takes precedence over a sort value slot.
648 if (sort_keymaker) {
649 if (sort_after) {
650 enquire->set_sort_by_relevance_then_key(sort_keymaker,
651 reverse_sort);
652 } else {
653 enquire->set_sort_by_key_then_relevance(sort_keymaker,
654 reverse_sort);
656 } else if (sort_key != Xapian::BAD_VALUENO) {
657 if (sort_after) {
658 enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
659 } else {
660 enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
664 enquire->set_docid_order(docid_order);
666 if (collapse) {
667 enquire->set_collapse_key(collapse_key);
670 if (!query.empty()) {
671 #if 0
672 // FIXME: If we start doing permissions checks based on $REMOTE_USER
673 // we're going to break some existing setups if users upgrade. We
674 // probably want a way to set this from OmegaScript.
675 const char * remote_user = getenv("REMOTE_USER");
676 if (remote_user)
677 apply_unix_permissions(query, remote_user);
678 #endif
680 enquire->set_query(query);
681 // We could use the value of topdoc as first parameter, but we
682 // need to know the first few items in the mset to fake a
683 // relevance set for topterms.
685 // If min_hits isn't set, check at least one extra result so we
686 // know if we've reached the end of the matches or not - then we
687 // can avoid offering a "next" button which leads to an empty page.
688 mset = enquire->get_mset(0, topdoc + hits_per_page,
689 topdoc + max(hits_per_page + 1, min_hits),
690 &rset);
/** Escape a string for inclusion in HTML.
 *
 *  @param str  The string to escape.
 *
 *  @return A copy of @a str with '<', '>', '&' and '"' replaced by the
 *          entities "&lt;", "&gt;", "&amp;" and "&quot;" respectively.
 */
std::string
html_escape(const std::string &str)
{
    std::string escaped;
    for (const char c : str) {
        if (c == '<') {
            escaped += "&lt;";
        } else if (c == '>') {
            escaped += "&gt;";
        } else if (c == '&') {
            escaped += "&amp;";
        } else if (c == '"') {
            escaped += "&quot;";
        } else {
            escaped += c;
        }
    }
    return escaped;
}
/** Strip HTML tags from a string.
 *
 *  Everything from a '<' up to and including the next '>' is removed (or up
 *  to the end of the string if there's no '>').  A '>' which isn't inside a
 *  tag is removed too.
 *
 *  @param str  The string to strip tags from.
 *
 *  @return A copy of @a str with the tags removed.
 */
static std::string
html_strip(const std::string &str)
{
    std::string stripped;
    bool in_tag = false;
    for (const char c : str) {
        if (c == '<') {
            in_tag = true;
        } else if (c == '>') {
            in_tag = false;
        } else if (!in_tag) {
            stripped += c;
        }
    }
    return stripped;
}
/** Map from the words in a tab-separated list to their positions in it.
 *
 *  The state is held in static members, so it is shared by all WordList
 *  objects and the map is only rebuilt when a different list is passed.
 */
class WordList {
    /// The list the current map was built from.
    static std::string prev_list;

    /// Maps each distinct word to its 0-based index among the distinct words.
    static std::unordered_map<std::string, int> word_to_occurrence;

  public:
    /** Build the map for a tab-separated list of words.
     *
     *  Distinct words are numbered from 0 in order of first appearance.
     */
    void build_word_map(const std::string& list) {
        // Don't build map again if passed list of terms is same as before.
        if (list == prev_list) return;

        word_to_occurrence.clear();
        int next_index = 0;
        std::string::size_type start = 0;
        for (;;) {
            const std::string::size_type tab = list.find('\t', start);
            // For the last word tab is npos, and substr() then just takes
            // the rest of the string.
            if (word_to_occurrence.emplace(list.substr(start, tab - start),
                                           next_index).second) {
                ++next_index;
            }
            if (tab == std::string::npos) break;
            start = tab + 1;
        }
        prev_list = list;
    }

    /// Return the index assigned to @a word, or -1 if it isn't in the list.
    int word_in_list(const std::string& word) {
        const auto found = word_to_occurrence.find(word);
        return found == word_to_occurrence.end() ? -1 : found->second;
    }
};

std::string WordList::prev_list;
std::unordered_map<std::string, int> WordList::word_to_occurrence;
776 // Not a character in an identifier
777 static inline bool
778 p_notid(unsigned int c)
780 return !C_isalnum(c) && c != '_';
783 // Not a character in an HTML tag name
784 static inline bool
785 p_nottag(unsigned int c)
787 return !C_isalnum(c) && c != '.' && c != '-';
790 // FIXME: shares algorithm with indextext.cc!
// HTML-escape s, marking up the words in it which match an entry in list.
//
// s is split into words, and each word is lower-cased and looked up in the
// tab-separated list; if it isn't found, "Z" followed by its stem is looked
// up instead.  A matching word is wrapped in bra and ket, or if bra is empty
// in a <b> element with a background colour chosen by the position of the
// matching entry in list.  Note that ket is only used when bra is non-empty.
//
// Returns the escaped and marked-up text.
791 static string
792 html_highlight(const string &s, const string &list,
793 const string &bra, const string &ket)
// The stemmer is created on first use, using the language from option
// "stemmer".
795 if (!stemmer) {
796 stemmer = new Xapian::Stem(option["stemmer"]);
799 string res;
801 Utf8Iterator j(s);
802 const Utf8Iterator s_end;
803 while (true) {
// Find the start of the next word.
804 Utf8Iterator first = j;
805 while (first != s_end && !is_wordchar(*first)) ++first;
806 if (first == s_end) break;
807 Utf8Iterator term_end;
808 string term;
809 string word;
// l points to the start of the text not yet added to res.
810 const char *l = j.raw();
// Try to read a sequence of ASCII upper case letters separated by '.'
// characters, which gives a term made of just the letters.
811 if (*first < 128 && C_isupper(*first)) {
812 j = first;
813 Xapian::Unicode::append_utf8(term, *j);
814 while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
815 Xapian::Unicode::append_utf8(term, *j);
// Discard the result if it's only one letter or runs straight into another
// word character.
817 if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
818 term.resize(0);
820 term_end = j;
// Otherwise read an ordinary word.
822 if (term.empty()) {
823 j = first;
824 while (is_wordchar(*j)) {
825 Xapian::Unicode::append_utf8(term, *j);
826 ++j;
827 if (j == s_end) break;
// An '&' or apostrophe is kept as part of the word only if a word character
// follows it.
828 if (*j == '&' || *j == '\'') {
829 Utf8Iterator next = j;
830 ++next;
831 if (next == s_end || !is_wordchar(*next)) break;
832 term += *j;
833 j = next;
836 term_end = j;
// Allow a short suffix of '+' and '-' characters, or of '#' characters (a run
// of '#' adds a single '#' to the term).
837 if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
838 string::size_type len = term.length();
839 if (*j == '#') {
840 term += '#';
841 do { ++j; } while (j != s_end && *j == '#');
842 } else {
843 while (j != s_end && (*j == '+' || *j == '-')) {
844 Xapian::Unicode::append_utf8(term, *j);
845 ++j;
// Drop the suffix again if it added more than 3 characters to the term or is
// followed by a word character.
848 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
849 term.resize(len);
850 } else {
851 term_end = j;
855 j = term_end;
856 term = Xapian::Unicode::tolower(term);
857 WordList w;
858 w.build_word_map(list);
859 int match = w.word_in_list(term);
// If the word itself isn't in the list, try its stemmed form.
860 if (match == -1) {
861 string stem = "Z";
862 stem += (*stemmer)(term);
863 match = w.word_in_list(stem);
865 if (match >= 0) {
// Add the text before the word, then the opening markup.
866 res += html_escape(string(l, first.raw() - l));
867 if (!bra.empty()) {
868 res += bra;
869 } else {
870 static const char * colours[] = {
871 "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
872 "990000", "009900", "996600", "006699", "990099"
874 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
875 const char * bg = colours[idx];
// Backgrounds containing an 'f' digit get black text, the others get white.
876 if (strchr(bg, 'f')) {
877 res += "<b style=\"color:black;background-color:#";
878 } else {
879 res += "<b style=\"color:white;background-color:#";
881 res += bg;
882 res += "\">";
884 word.assign(first.raw(), j.raw() - first.raw());
885 res += html_escape(word);
886 if (!bra.empty()) {
887 res += ket;
888 } else {
889 res += "</b>";
891 } else {
892 res += html_escape(string(l, j.raw() - l));
// Add any text left after the last word.
895 if (j != s_end) res += html_escape(string(j.raw(), j.left()));
896 return res;
899 #if 0
// Disabled code: this uses url_query_string, whose declaration near the top
// of the file is commented out.
//
// Write url_query_string to cout.  If after starts with "&B=", parameters
// which start "B=" followed by the character after "&B=" are left out.
900 static void
901 print_query_string(const char *after)
903 if (after && strncmp(after, "&B=", 3) == 0) {
904 char prefix = after[3];
905 string::size_type start = 0, amp = 0;
906 while (true) {
907 amp = url_query_string.find('&', amp);
908 if (amp == string::npos) {
909 cout << url_query_string.substr(start);
910 return;
912 amp++;
913 while (url_query_string[amp] == 'B' &&
914 url_query_string[amp + 1] == '=' &&
915 url_query_string[amp + 2] == prefix) {
916 cout << url_query_string.substr(start, amp - start - 1);
917 start = url_query_string.find('&', amp + 3);
918 if (start == string::npos) return;
919 amp = start + 1;
923 cout << url_query_string;
925 #endif
927 class CachedFields : private Fields {
928 Xapian::docid did_cached = 0;
930 public:
931 CachedFields() {}
933 const string& get_field(Xapian::docid did, const string& name) {
934 if (did != did_cached) {
935 did_cached = did;
936 auto it = option.find("fieldnames");
937 Fields::parse_fields(db.get_document(did).get_data(),
938 it == option.end() ? nullptr : &it->second);
940 return Fields::get_field(name);
// Cache of the fields of the most recently accessed document.
944 static CachedFields fields;
// NOTE(review): the variables below aren't used in this part of the file -
// the names suggest they hold state for the hit currently being processed
// (docid, hit number, percentage score, weight, number of collapsed hits);
// confirm against the code which sets them.
945 static Xapian::docid q0;
946 static Xapian::doccount hit_no;
947 static int percent;
948 static double weight;
949 static Xapian::doccount collapsed;
951 static string print_caption(const string& fmt, vector<string>& param);
/** Tags identifying the commands listed in func_tab.
 *
 *  The values are assigned sequentially from 0, so the order of the
 *  enumerators must be kept when adding new ones.
 */
enum tagval {
    CMD_ = 0,
    // a
    CMD_add, CMD_addfilter, CMD_allterms, CMD_and,
    // b
    CMD_base64,
    // c
    CMD_cgi, CMD_cgilist, CMD_cgiparams, CMD_chr, CMD_collapsed, CMD_cond,
    CMD_contains, CMD_csv,
    // d
    CMD_date, CMD_dbname, CMD_dbsize, CMD_def, CMD_defaultop, CMD_div,
    // e
    CMD_emptydocs, CMD_env, CMD_eq, CMD_error,
    // f
    CMD_field, CMD_filesize, CMD_filters, CMD_filterterms, CMD_find, CMD_fmt,
    CMD_foreach, CMD_freq,
    // g
    CMD_ge, CMD_gt,
    // h
    CMD_hash, CMD_highlight, CMD_hit, CMD_hitlist, CMD_hitsperpage,
    CMD_hostname, CMD_html, CMD_htmlstrip, CMD_httpheader,
    // i
    CMD_id, CMD_if, CMD_include,
    // j
    CMD_json, CMD_jsonarray, CMD_jsonbool, CMD_jsonobject, CMD_jsonobject2,
    // k
    CMD_keys,
    // l
    CMD_last, CMD_lastpage, CMD_le, CMD_length, CMD_list, CMD_log,
    CMD_lookup, CMD_lower, CMD_lt,
    // m
    CMD_map, CMD_match, CMD_max, CMD_min, CMD_mod, CMD_msize,
    CMD_msizeexact, CMD_msizelower, CMD_msizeupper, CMD_mul, CMD_muldiv,
    // n
    CMD_ne, CMD_nice, CMD_not, CMD_now,
    // o
    CMD_opt, CMD_or, CMD_ord,
    // p
    CMD_pack, CMD_percentage, CMD_prettyterm, CMD_prettyurl,
    // q
    CMD_query, CMD_querydescription, CMD_queryterms,
    // r
    CMD_random, CMD_range, CMD_record, CMD_relevant, CMD_relevants,
    // s
    CMD_score, CMD_set, CMD_seterror, CMD_setmap, CMD_setrelevant,
    CMD_slice, CMD_snippet, CMD_sort, CMD_sortableunserialise, CMD_split,
    CMD_srandom, CMD_stoplist, CMD_sub, CMD_subdb, CMD_subid, CMD_substr,
    CMD_suggestion, CMD_switch,
    // t
    CMD_termprefix, CMD_terms, CMD_thispage, CMD_time, CMD_topdoc,
    CMD_topterms, CMD_transform, CMD_truncate,
    // u
    CMD_uniq, CMD_unique, CMD_unpack, CMD_unprefix, CMD_unstem, CMD_upper,
    CMD_url,
    // v
    CMD_value, CMD_valuelowerbound, CMD_valueupperbound, CMD_version,
    // w
    CMD_weight,
    CMD_MACRO // special tag for macro evaluation
};
/// Attributes of a command (see the column headings in func_tab).
struct func_attrib {
    /// Tag identifying the command.
    int tag;
    /// Minimum number of arguments.
    int minargs;
    /// Maximum number of arguments.
    int maxargs;
    /// The "evalargs" column of func_tab.
    int evalargs;
    /// The "ensure" column of func_tab.
    char ensure;
};
1092 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
1093 struct func_desc {
1094 const char *name;
1095 struct func_attrib a;
1098 #define N (-1)
1099 #define M 'M'
1100 #define Q 'Q'
1101 // NB when adding a new command which ensures M or Q, update the list in
1102 // docs/omegascript.rst
1103 static const struct func_desc func_tab[] = {
1104 //name minargs maxargs evalargs ensure
1105 {"",{CMD_, N, N, 0, 0}},// commented out code
1106 T(add, 0, N, N, 0), // add a list of numbers
1107 T(addfilter, 1, 2, N, 0), // add filter term
1108 T(allterms, 0, 1, N, 0), // list of all terms matching document
1109 T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
1110 T(base64, 1, 1, N, 0), // base64 encode
1111 T(cgi, 1, 1, N, 0), // return cgi parameter value
1112 T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
1113 T(cgiparams, 0, 0, N, 0), // return list of cgi parameter names
1114 T(chr, 1, 1, N, 0), // return UTF-8 for given Unicode codepoint
1115 T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
1116 T(cond, 2, N, 0, 0), // cascaded conditionals
1117 T(contains, 2, 2, N, 0), // return position of substring, or empty string
1118 T(csv, 1, 2, N, 0), // CSV string escaping
1119 T(date, 1, 2, N, 0), // convert time_t to strftime format
1120 // (default: YYYY-MM-DD)
1121 T(dbname, 0, 0, N, 0), // database name
1122 T(dbsize, 0, 0, N, 0), // database size (# of documents)
1123 T(def, 2, 2, 1, 0), // define a macro
1124 T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
1125 T(div, 2, 2, N, 0), // integer divide
1126 T(emptydocs, 0, 1, N, 0), // list of empty documents
1127 T(env, 1, 1, N, 0), // environment variable
1128 T(eq, 2, 2, N, 0), // test equality
1129 T(error, 0, 0, N, 0), // error message
1130 T(field, 1, 2, N, 0), // lookup field in record
1131 T(filesize, 1, 1, N, 0), // pretty printed filesize
1132 T(filters, 0, 1, N, 0), // serialisation of current filters
1133 T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
1134 T(find, 2, 2, N, 0), // find entry in list
1135 T(fmt, 0, 0, N, 0), // name of current format
1136 T(foreach, 2, 2, 1, 0), // evaluate something for every entry in a list
1137 T(freq, 1, 1, N, 0), // frequency of a term
1138 T(ge, 2, 2, N, 0), // test >=
1139 T(gt, 2, 2, N, 0), // test >
1140 T(hash, 2, 2, N, 0), // hash a string using the specified hash function
1141 T(highlight, 2, 4, N, 0), // html escape and highlight words from list
1142 T(hit, 0, 0, N, 0), // hit number of current mset entry (0-based)
1143 T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
1144 T(hitsperpage, 0, 0, N, 0), // hits per page
1145 T(hostname, 1, 1, N, 0), // extract hostname from URL
1146 T(html, 1, 1, N, 0), // html escape string (<>&")
1147 T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
1148 T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
1149 T(id, 0, 0, N, 0), // docid of current doc
1150 T(if, 1, 3, 1, 0), // conditional
1151 T(include, 1, 2, 1, 0), // include another file
1152 T(json, 1, 1, N, 0), // JSON string escaping
1153 T(jsonarray, 1, 2, 1, 0), // Format list as a JSON array
1154 T(jsonbool, 1, 1, 1, 0), // Format list as a JSON bool
1155 T(jsonobject, 1, 3, 1, 0), // Format map as JSON object
1156 T(jsonobject2, 2, 4, 2, 0), // Format 2 lists as JSON object
1157 T(keys, 1, 1, N, 0), // list of keys from a map
1158 T(last, 0, 0, N, M), // hit number one beyond end of current page
1159 T(lastpage, 0, 0, N, M), // number of last hit page
1160 T(le, 2, 2, N, 0), // test <=
1161 T(length, 1, 1, N, 0), // length of list
1162 T(list, 2, 5, N, 0), // pretty print list
1163 T(log, 1, 2, 1, 0), // create a log entry
1164 T(lookup, 2, 2, N, 0), // lookup in named cdb file
1165 T(lower, 1, 1, N, 0), // convert string to lower case
1166 T(lt, 2, 2, N, 0), // test <
1167 T(map, 2, 2, 1, 0), // map a list into another list
1168 T(match, 2, 3, N, 0), // regex match
1169 T(max, 1, N, N, 0), // maximum of a list of values
1170 T(min, 1, N, N, 0), // minimum of a list of values
1171 T(mod, 2, 2, N, 0), // integer modulus
1172 T(msize, 0, 0, N, M), // number of matches (estimated)
1173 T(msizeexact, 0, 0, N, M), // is $msize exact?
1174 T(msizelower, 0, 0, N, M), // number of matches (lower bound)
1175 T(msizeupper, 0, 0, N, M), // number of matches (upper bound)
1176 T(mul, 2, N, N, 0), // multiply a list of numbers
1177 T(muldiv, 3, 3, N, 0), // calculate A*B/C
1178 T(ne, 2, 2, N, 0), // test not equal
1179 T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
1180 T(not, 1, 1, N, 0), // logical not
1181 T(now, 0, 0, N, 0), // current date/time as a time_t
1182 T(opt, 1, 2, N, 0), // lookup an option value
1183 T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
1184 T(ord, 1, 1, N, 0), // return codepoint for first character of UTF-8 string
1185 T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
1186 T(percentage, 0, 0, N, 0), // percentage score of current hit
1187 T(prettyterm, 1, 1, N, Q), // pretty print term name
1188 T(prettyurl, 1, 1, N, 0), // pretty version of URL
1189 T(query, 0, 1, N, Q), // query
1190 T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
1191 T(queryterms, 0, 0, N, Q), // list of query terms
1192 T(random, 1, 1, N, 0), // return a random number
1193 T(range, 2, 2, N, 0), // return list of values between start and end
1194 T(record, 0, 1, N, 0), // record contents of document
1195 T(relevant, 0, 1, N, Q), // is document relevant?
1196 T(relevants, 0, 0, N, Q), // return list of relevant documents
1197 T(score, 0, 0, N, 0), // score (0-10) of current hit
1198 T(set, 2, 2, N, 0), // set option value
1199 T(seterror, 1, 1, N, 0), // set error_msg, setting it early stops query execution
1200 T(setmap, 1, N, N, 0), // set map of option values
1201 T(setrelevant, 1, 1, N, Q), // set rset
1202 T(slice, 2, 2, N, 0), // slice a list using a second list
1203 T(snippet, 1, 6, N, M), // generate snippet from text
1204 T(sort, 1, 2, N, 0), // alpha sort a list
1205 T(sortableunserialise,
1206 1, 1, N, 0), // decode with Xapian::sortable_unserialise
1207 T(split, 1, 2, N, 0), // split a string to give a list
1208 T(srandom, 1, 1, N, 0), // seed for random number
1209 T(stoplist, 0, 0, N, Q), // return list of stopped terms
1210 T(sub, 2, 2, N, 0), // subtract
1211 T(subdb, 0, 1, N, 0), // name of subdb docid is in
1212 T(subid, 0, 1, N, 0), // docid in the subdb#
1213 T(substr, 2, 3, N, 0), // substring
1214 T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
1215 T(switch, 3, N, 1, 0), // return position of substring, or empty string
1216 T(termprefix, 1, 1, N, 0), // get any prefix from a term
1217 T(terms, 0, 1, N, M), // list of matching terms
1218 T(thispage, 0, 0, N, M), // page number of current page
1219 T(time, 0, 0, N, M), // how long the match took (in seconds)
1220 T(topdoc, 0, 0, N, M), // first document on current page of hit list
1221 // (counting from 0)
1222 T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
1223 // (default 16)
1224 T(transform, 3, 4, N, 0), // transform with a regexp
1225 T(truncate, 2, 4, N, 0), // truncate after a word
1226 T(uniq, 1, 1, N, 0), // removed duplicates from a sorted list
1227 T(unique, 1, 1, N, 0), // removed duplicates from any list
1228 T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
1229 T(unprefix, 1, 1, N, 0), // remove any prefix from a term
1230 T(unstem, 1, 1, N, Q), // return list of terms from the parsed query
1231 // which stemmed to this term
1232 T(upper, 1, 1, N, 0), // convert string to upper case
1233 T(url, 1, 1, N, 0), // url encode argument
1234 T(value, 1, 2, N, 0), // return document value
1235 T(valuelowerbound, 1, 1, N, 0), // return value slot lower bound
1236 T(valueupperbound, 1, 1, N, 0), // return value slot upper bound
1237 T(version, 0, 0, N, 0), // omega version string
1238 T(weight, 0, 0, N, 0), // weight of the current hit
1239 { NULL,{0, 0, 0, 0, 0}}
1242 #undef T // Leaving T defined screws up Sun's C++ compiler!
1244 static vector<string> macros; // bodies of macros defined via $def, in definition order ($def tags each one as CMD_MACRO + its index here)
/** Write the whole of a buffer to a file descriptor.
 *
 *  write() may accept fewer bytes than requested, so it is called
 *  repeatedly until all the data has been written or it reports a
 *  non-recoverable error.
 *
 *  @param fd     File descriptor to write to.
 *  @param buf    Start of the data to write.
 *  @param count  Number of bytes to write (0 is a no-op).
 *
 *  @return 0 once all @a count bytes have been written; otherwise the
 *          negative value returned by the failing write() call, with
 *          errno left as that call set it.
 */
static ssize_t
write_all(int fd, const char * buf, size_t count)
{
    while (count) {
        ssize_t r = write(fd, buf, count);
        if (rare(r < 0)) {
            // EINTR just means a signal interrupted the call, so retry
            // with the same arguments; any other error is final.
            if (errno == EINTR) continue;
            return r;
        }
        // Possibly a short write: step past what was accepted and go
        // round again for the remainder.
        buf += r;
        count -= r;
    }
    return 0;
}
1263 // Mersenne Twister engine used as the RNG for $random
1264 static mt19937 rng;
1265 static bool seed_set = false; // true once rng has been seeded; while false, $random seeds it from random_device on first use
1267 static string eval(const string& fmt, vector<string>& param); // forward declaration, needed because foreach() below calls eval()
1269 /** Implements $foreach{} and $map{}. */
1270 static string
1271 foreach(const string& list,
1272 const string& pat,
1273 vector<string>& param,
1274 char sep = '\0')
1276 string result;
1277 string saved_arg0 = std::move(param[0]);
1278 string::size_type i = 0, j;
1279 while (true) {
1280 j = list.find('\t', i);
1281 param[0].assign(list, i, j - i);
1282 result += eval(pat, param);
1283 if (j == string::npos) break;
1284 if (sep) result += sep;
1285 i = j + 1;
1287 param[0] = std::move(saved_arg0);
1288 return result;
1291 static string
1292 eval(const string& fmt, vector<string>& param)
1294 static map<string, const struct func_attrib *> func_map;
1295 if (func_map.empty()) {
1296 for (auto p = func_tab; p->name != NULL; ++p) {
1297 func_map[string(p->name)] = &(p->a);
1300 string res;
1301 string::size_type p = 0, q;
1302 while ((q = fmt.find('$', p)) != string::npos) try {
1303 res.append(fmt, p, q - p);
1304 string::size_type code_start = q; // note down for error reporting
1305 q++;
1306 if (q >= fmt.size()) break;
1307 unsigned char ch = fmt[q];
1308 switch (ch) {
1309 // Magic sequences:
1310 // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1311 case '$':
1312 res += '$';
1313 p = q + 1;
1314 continue;
1315 case '(':
1316 res += '{';
1317 p = q + 1;
1318 continue;
1319 case ')':
1320 res += '}';
1321 p = q + 1;
1322 continue;
1323 case '.':
1324 res += ',';
1325 p = q + 1;
1326 continue;
1327 case '_':
1328 ch = '0';
1329 // FALL THRU
1330 case '1': case '2': case '3': case '4': case '5':
1331 case '6': case '7': case '8': case '9':
1332 ch -= '0';
1333 if (ch < param.size()) res += param[ch];
1334 p = q + 1;
1335 continue;
1336 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1337 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1338 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1339 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1340 case 'y': case 'z':
1341 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1342 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1343 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1344 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1345 case 'Y': case 'Z':
1346 case '{':
1347 break;
1348 default:
1349 string msg = "Unknown $ code in: $";
1350 msg.append(fmt, q, string::npos);
1351 throw msg;
1353 p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1354 string var(fmt, q, p - q);
1355 map<string, const struct func_attrib *>::const_iterator func;
1356 func = func_map.find(var);
1357 if (func == func_map.end()) {
1358 throw "Unknown function '" + var + "'";
1360 vector<string> args;
1361 if (fmt[p] == '{') {
1362 q = p + 1;
1363 int nest = 1;
1364 while (true) {
1365 p = fmt.find_first_of(",{}", p + 1);
1366 if (p == string::npos)
1367 throw "missing } in " + fmt.substr(code_start);
1368 if (fmt[p] == '{') {
1369 ++nest;
1370 } else {
1371 if (nest == 1) {
1372 // should we split the args
1373 if (func->second->minargs != N) {
1374 args.push_back(fmt.substr(q, p - q));
1375 q = p + 1;
1378 if (fmt[p] == '}' && --nest == 0) break;
1381 if (func->second->minargs == N)
1382 args.push_back(fmt.substr(q, p - q));
1383 ++p;
1386 if (func->second->minargs != N) {
1387 if (int(args.size()) < func->second->minargs)
1388 throw "too few arguments to $" + var;
1389 if (func->second->maxargs != N &&
1390 int(args.size()) > func->second->maxargs)
1391 throw "too many arguments to $" + var;
1393 vector<string>::size_type n;
1394 if (func->second->evalargs != N)
1395 n = func->second->evalargs;
1396 else
1397 n = args.size();
1399 for (vector<string>::size_type j = 0; j < n; ++j)
1400 args[j] = eval(args[j], param);
1402 if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1403 ensure_query_parsed();
1404 if (func->second->ensure == 'M') ensure_match();
1405 string value;
1406 switch (func->second->tag) {
1407 case CMD_:
1408 break;
1409 case CMD_add: {
1410 int total = 0;
1411 for (auto&& arg : args)
1412 total += string_to_int(arg);
1413 value = str(total);
1414 break;
1416 case CMD_addfilter:
1417 if (args.size() == 1 || args[1].empty() || args[1] == "B") {
1418 add_bterm(args[0]);
1419 } else if (args[1] == "N") {
1420 add_nterm(args[0]);
1421 } else {
1422 string msg = "Invalid $addfilter type '";
1423 msg += args[1];
1424 msg += "'";
1425 throw msg;
1427 break;
1428 case CMD_allterms: {
1429 // list of all terms indexing document
1430 Xapian::docid id = q0;
1431 if (!args.empty() &&
1432 (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
1433 throw "Document id for command allterms should be > 0";
1435 for (Xapian::TermIterator term = db.termlist_begin(id);
1436 term != db.termlist_end(id); ++term) {
1437 value += *term;
1438 value += '\t';
1441 if (!value.empty()) value.erase(value.size() - 1);
1442 break;
1444 case CMD_and: {
1445 value = "true";
1446 for (auto&& arg : args) {
1447 if (eval(arg, param).empty()) {
1448 value.resize(0);
1449 break;
1452 break;
1454 case CMD_base64: {
1455 const static char encode[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdef"
1456 "ghijklmnopqrstuvwxyz0123456789+/";
1457 const char pad = '=';
1458 const string& input = args[0];
1459 value.reserve((input.size() + 2) / 3 * 4);
1460 auto it = input.begin();
1461 auto n = input.size() / 3;
1462 for ( ; n; --n) {
1463 uint32_t v = uint8_t(*it++);
1464 v = (v << 8) | uint8_t(*it++);
1465 v = (v << 8) | uint8_t(*it++);
1466 value += encode[v >> 18];
1467 value += encode[(v >> 12) & 63];
1468 value += encode[(v >> 6) & 63];
1469 value += encode[v & 63];
1471 switch (input.size() % 3) {
1472 case 2: {
1473 uint32_t v = uint8_t(*it++);
1474 v = (v << 8) | uint8_t(*it++);
1475 value += encode[v >> 10];
1476 value += encode[(v >> 4) & 63];
1477 value += encode[(v << 2) & 63];
1478 value += pad;
1479 break;
1481 case 1: {
1482 uint32_t v = uint8_t(*it++);
1483 value += encode[v >> 2];
1484 value += encode[(v << 4) & 63];
1485 value += pad;
1486 value += pad;
1487 break;
1490 break;
1492 case CMD_cgi: {
1493 auto i = cgi_params.find(args[0]);
1494 if (i != cgi_params.end()) value = i->second;
1495 break;
1497 case CMD_cgilist: {
1498 auto g = cgi_params.equal_range(args[0]);
1499 for (auto i = g.first; i != g.second; ++i) {
1500 value += i->second;
1501 value += '\t';
1503 if (!value.empty()) value.erase(value.size() - 1);
1504 break;
1506 case CMD_cgiparams: {
1507 const string* prev = NULL;
1508 for (auto&& i : cgi_params) {
1509 if (prev && i.first == *prev) continue;
1510 value += i.first;
1511 value += '\t';
1512 prev = &i.first;
1514 if (!value.empty()) value.erase(value.size() - 1);
1515 break;
1517 case CMD_chr: {
1518 unsigned int codepoint;
1519 if (!parse_unsigned(args[0].c_str(), codepoint)) {
1520 throw "Unicode codepoint for command chr should be >= 0";
1522 Xapian::Unicode::append_utf8(value, codepoint);
1523 break;
1525 case CMD_collapsed: {
1526 value = str(collapsed);
1527 break;
1529 case CMD_cond:
1530 for (size_t i = 0; i < args.size(); i += 2) {
1531 if (i == args.size() - 1) {
1532 // Handle optional "else" value.
1533 value = eval(args[i], param);
1534 break;
1536 if (!eval(args[i], param).empty()) {
1537 value = eval(args[i + 1], param);
1538 break;
1541 break;
1542 case CMD_contains: {
1543 size_t pos = args[1].find(args[0]);
1544 if (pos != string::npos) {
1545 value = str(pos);
1547 break;
1549 case CMD_csv:
1550 value = args[0];
1551 if (args.size() > 1 && !args[1].empty()) {
1552 csv_escape_always(value);
1553 } else {
1554 csv_escape(value);
1556 break;
1557 case CMD_date:
1558 value = args[0];
1559 if (!value.empty()) {
1560 char buf[64] = "";
1561 time_t date;
1562 if (!parse_signed(value.c_str(), date)) {
1563 throw "Date (in secs) for command date should "
1564 "be an integer";
1566 if (date != static_cast<time_t>(-1)) {
1567 struct tm *then;
1568 then = gmtime(&date);
1569 string date_fmt = "%Y-%m-%d";
1570 if (args.size() > 1) date_fmt = eval(args[1], param);
1571 strftime(buf, sizeof buf, date_fmt.c_str(), then);
1573 value = buf;
1575 break;
1576 case CMD_dbname:
1577 value = dbname;
1578 break;
1579 case CMD_dbsize: {
1580 static Xapian::doccount dbsize;
1581 if (!dbsize) dbsize = db.get_doccount();
1582 value = str(dbsize);
1583 break;
1585 case CMD_def: {
1586 func_attrib *fa = new func_attrib;
1587 fa->tag = CMD_MACRO + macros.size();
1588 fa->minargs = 0;
1589 fa->maxargs = 9;
1590 fa->evalargs = N; // FIXME: or 0?
1591 fa->ensure = 0;
1593 macros.push_back(args[1]);
1594 func_map[args[0]] = fa;
1595 break;
1597 case CMD_defaultop:
1598 if (default_op == Xapian::Query::OP_AND) {
1599 value = "and";
1600 } else {
1601 value = "or";
1603 break;
1604 case CMD_div: {
1605 int denom = string_to_int(args[1]);
1606 if (denom == 0) {
1607 value = "divide by 0";
1608 } else {
1609 value = str(string_to_int(args[0]) / denom);
1611 break;
1613 case CMD_emptydocs: {
1614 string t;
1615 if (!args.empty())
1616 t = args[0];
1617 Xapian::PostingIterator i;
1618 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1619 if (i.get_doclength() != 0) continue;
1620 if (!value.empty()) value += '\t';
1621 value += str(*i);
1623 break;
1625 case CMD_env: {
1626 char *env = getenv(args[0].c_str());
1627 if (env != NULL) value = env;
1628 break;
1630 case CMD_eq:
1631 if (args[0] == args[1]) value = "true";
1632 break;
1633 case CMD_error:
1634 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1635 error_msg = "Database '" + dbname + "' couldn't be opened";
1637 value = error_msg;
1638 break;
1639 case CMD_field: {
1640 Xapian::docid did = q0;
1641 if (args.size() > 1 &&
1642 (!parse_unsigned(args[1].c_str(), did) || did == 0)) {
1643 throw "Document id for command field should be > 0";
1645 value = fields.get_field(did, args[0]);
1646 break;
1648 case CMD_filesize: {
1649 if (args[0].empty()) break;
1650 // FIXME: rounding? i18n?
1651 int size;
1652 if (!parse_signed(args[0].c_str(), size)) {
1653 throw "Filesize must be an integer";
1655 int intpart = size;
1656 int fraction = -1;
1657 const char * format = 0;
1658 if (size < 0) {
1659 // Negative size -> empty result.
1660 } else if (size == 1) {
1661 format = "%d byte";
1662 } else if (size < 1024) {
1663 format = "%d bytes";
1664 } else {
1665 if (size < 1024 * 1024) {
1666 format = "%d.%cK";
1667 } else {
1668 size /= 1024;
1669 if (size < 1024 * 1024) {
1670 format = "%d.%cM";
1671 } else {
1672 size /= 1024;
1673 format = "%d.%cG";
1676 intpart = unsigned(size) / 1024;
1677 fraction = unsigned(size) % 1024;
1679 if (format) {
1680 char buf[200];
1681 int len;
1682 if (fraction == -1) {
1683 len = snprintf(buf, sizeof(buf), format, intpart);
1684 } else {
1685 fraction = (fraction * 10 / 1024) + '0';
1686 len = snprintf(buf, sizeof(buf), format, intpart, fraction);
1688 if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1689 value.assign(buf, len);
1691 break;
1693 case CMD_filters:
1694 value = args.size() ? old_filters : filters;
1695 break;
1696 case CMD_filterterms: {
1697 Xapian::TermIterator term = db.allterms_begin();
1698 term.skip_to(args[0]);
1699 while (term != db.allterms_end()) {
1700 string t = *term;
1701 if (!startswith(t, args[0])) break;
1702 value += t;
1703 value += '\t';
1704 ++term;
1707 if (!value.empty()) value.erase(value.size() - 1);
1708 break;
1710 case CMD_find: {
1711 string l = args[0], s = args[1];
1712 string::size_type i = 0, j = 0;
1713 size_t count = 0;
1714 while (j != l.size()) {
1715 j = l.find('\t', i);
1716 if (j == string::npos) j = l.size();
1717 if (j - i == s.length()) {
1718 if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1719 value = str(count);
1720 break;
1723 ++count;
1724 i = j + 1;
1726 break;
1728 case CMD_fmt:
1729 value = fmtname;
1730 break;
1731 case CMD_foreach:
1732 if (!args[0].empty()) {
1733 value = foreach(args[0], args[1], param);
1735 break;
1736 case CMD_freq: {
1737 const string& term = args[0];
1738 Xapian::doccount termfreq = 0;
1739 if (done_query) {
1740 termfreq = mset.get_termfreq(term);
1742 if (termfreq == 0) {
1743 // We want $freq to work before the match is run, and we
1744 // don't want using it to force the match to run.
1745 termfreq = db.get_termfreq(term);
1747 value = str(termfreq);
1748 break;
1750 case CMD_ge:
1751 if (string_to_int(args[0]) >= string_to_int(args[1]))
1752 value = "true";
1753 break;
1754 case CMD_gt:
1755 if (string_to_int(args[0]) > string_to_int(args[1]))
1756 value = "true";
1757 break;
1758 case CMD_hash: {
1759 const string& data = args[0];
1760 const string& hash = args[1];
1761 if (hash == "md5") {
1762 string md5;
1763 md5_string(data, md5);
1764 value.reserve(md5.size() * 2);
1765 for (unsigned char byte : md5) {
1766 value += "0123456789abcdef"[byte >> 4];
1767 value += "0123456789abcdef"[byte & 0x0f];
1769 } else {
1770 throw "Unknown hash function: " + hash;
1772 break;
1774 case CMD_highlight: {
1775 string bra, ket;
1776 if (args.size() > 2) {
1777 bra = args[2];
1778 if (args.size() > 3) {
1779 ket = args[3];
1780 } else {
1781 string::const_iterator i;
1782 i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1783 ket = "</";
1784 ket.append(bra, 1, i - bra.begin() - 1);
1785 ket += '>';
1789 value = html_highlight(args[0], args[1], bra, ket);
1790 break;
1792 case CMD_hit:
1793 // 0-based mset index
1794 value = str(hit_no);
1795 break;
1796 case CMD_hitlist: {
1797 #if 0
1798 url_query_string = "?DB=";
1799 url_query_string += dbname;
1800 for (auto& j : query_strings) {
1801 if (j.first.empty()) {
1802 url_query_string += "&P=";
1803 } else {
1804 url_query_string += "&P."
1805 url_query_string += j.first;
1806 url_query_string += '=';
1808 const char *q = j.second.c_str();
1809 int ch;
1810 while ((ch = *q++) != '\0') {
1811 switch (ch) {
1812 case '+':
1813 url_query_string += "%2b";
1814 break;
1815 case '"':
1816 url_query_string += "%22";
1817 break;
1818 case '%':
1819 url_query_string += "%25";
1820 break;
1821 case '&':
1822 url_query_string += "%26";
1823 break;
1824 case ' ':
1825 ch = '+';
1826 /* fall through */
1827 default:
1828 url_query_string += ch;
1832 // add any boolean terms
1833 for (auto i = filter_map.begin(); i != filter_map.end(); ++i) {
1834 url_query_string += "&B=";
1835 url_query_string += i->second;
1837 #endif
1838 auto save_hit_no = hit_no;
1839 for (hit_no = topdoc; hit_no < last; ++hit_no)
1840 value += print_caption(args[0], param);
1841 hit_no = save_hit_no;
1842 break;
1844 case CMD_hitsperpage:
1845 value = str(hits_per_page);
1846 break;
1847 case CMD_hostname: {
1848 value = args[0];
1849 // remove URL scheme and/or path
1850 string::size_type i = value.find("://");
1851 if (i == string::npos) i = 0; else i += 3;
1852 value = value.substr(i, value.find('/', i) - i);
1853 // remove user@ or user:password@
1854 i = value.find('@');
1855 if (i != string::npos) value.erase(0, i + 1);
1856 // remove :port
1857 i = value.find(':');
1858 if (i != string::npos) value.resize(i);
1859 break;
1861 case CMD_html:
1862 value = html_escape(args[0]);
1863 break;
1864 case CMD_htmlstrip:
1865 value = html_strip(args[0]);
1866 break;
1867 case CMD_httpheader:
1868 if (!suppress_http_headers) {
1869 cout << args[0] << ": " << args[1] << endl;
1870 if (!set_content_type && args[0].length() == 12 &&
1871 strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1872 set_content_type = true;
1875 break;
1876 case CMD_id:
1877 // document id
1878 value = str(q0);
1879 break;
1880 case CMD_if:
1881 if (args.size() > 1 && !args[0].empty())
1882 value = eval(args[1], param);
1883 else if (args.size() > 2)
1884 value = eval(args[2], param);
1885 break;
1886 case CMD_include: {
1887 if (args.size() == 1) {
1888 value = eval_file(args[0]);
1889 } else {
1890 bool fallback = false;
1891 value = eval_file(args[0], &fallback);
1892 if (fallback) {
1893 value = eval(args[1], param);
1896 break;
1898 case CMD_json:
1899 value = args[0];
1900 json_escape(value);
1901 break;
1902 case CMD_jsonarray: {
1903 const string & l = args[0];
1904 string::size_type i = 0, j;
1905 if (l.empty()) {
1906 value = "[]";
1907 break;
1909 vector<string> new_args(1);
1910 value = "[";
1911 while (true) {
1912 j = l.find('\t', i);
1913 string elt(l, i, j - i);
1914 if (args.size() == 1) {
1915 value += '"';
1916 json_escape(elt);
1917 value += elt;
1918 value += '"';
1919 } else {
1920 new_args[0] = std::move(elt);
1921 value += eval(args[1], new_args);
1923 if (j == string::npos) break;
1924 value += ',';
1925 i = j + 1;
1927 value += ']';
1928 break;
1930 case CMD_jsonbool:
1931 value = args[0].empty() ? "false" : "true";
1932 break;
1933 case CMD_jsonobject: {
1934 vector<string> new_args(1);
1936 class map_range {
1937 typedef map<string, string>::const_iterator iterator;
1938 iterator b, e;
1940 public:
1941 map_range(iterator b_, iterator e_) : b(b_), e(e_) {}
1943 iterator begin() const { return b; }
1944 iterator end() const { return e; }
1947 string prefix = args[0] + ',';
1948 auto b = option.lower_bound(prefix);
1949 ++prefix.back();
1950 auto e = option.lower_bound(prefix);
1951 value = to_json(map_range(b, e),
1952 [&](const string& k) {
1953 string key(k, prefix.size());
1954 if (args.size() > 1 && !args[1].empty()) {
1955 new_args[0] = std::move(key);
1956 key = eval(args[1], new_args);
1958 return key;
1960 [&](const string& v) {
1961 if (args.size() > 2 && !args[2].empty()) {
1962 new_args[0] = v;
1963 return eval(args[2], new_args);
1965 string r(1, '"');
1966 string elt = v;
1967 json_escape(elt);
1968 r += elt;
1969 r += '"';
1970 return r;
1972 break;
1974 case CMD_jsonobject2: {
1975 vector<string> new_args(1);
1977 static string dummy;
1979 class list_range {
1980 const string& keys;
1981 const string& values;
1983 public:
1984 class iterator {
1985 const string& keys;
1986 const string& values;
1987 string::size_type ki = 0;
1988 string::size_type kj;
1989 string::size_type vi = 0;
1990 string::size_type vj;
1992 public:
1993 iterator()
1994 : keys(dummy), values(dummy),
1995 ki(string::npos), vi(string::npos) {}
1997 iterator(const string& k, const string& v)
1998 : keys(k), values(v) {
1999 if (keys.empty() && values.empty()) {
2000 // Don't treat this as: { "": "" }
2001 ki = kj = vi = vj = string::npos;
2002 } else {
2003 kj = keys.find('\t');
2004 vj = values.find('\t');
2008 pair<string, string> operator*() const {
2009 return {keys.substr(ki, kj - ki),
2010 values.substr(vi, vj - vi)};
2013 iterator& operator++() {
2014 ki = kj;
2015 if (ki != string::npos) {
2016 ++ki;
2017 kj = keys.find('\t', ki);
2019 vi = vj;
2020 if (vi != string::npos) {
2021 ++vi;
2022 vj = values.find('\t', vi);
2024 if ((ki == string::npos) !=
2025 (vi == string::npos)) {
2026 throw "$jsonobject2: Different number of keys "
2027 "and values";
2029 return *this;
2032 iterator operator++(int) {
2033 iterator r = *this;
2034 operator++();
2035 return r;
2038 bool operator==(const iterator& o) const {
2039 return ki == o.ki && vi == o.vi;
2042 bool operator!=(const iterator& o) const {
2043 return !(*this == o);
2047 list_range(const string& k, const string& v)
2048 : keys(k), values(v) { }
2050 iterator begin() const { return iterator(keys, values); }
2051 iterator end() const { return iterator(); }
2054 value = to_json(list_range(args[0], args[1]),
2055 [&](const string& k) {
2056 string key = k;
2057 if (args.size() > 2 && !args[2].empty()) {
2058 new_args[0] = std::move(key);
2059 key = eval(args[2], new_args);
2061 return key;
2063 [&](const string& v) {
2064 if (args.size() > 3 && !args[3].empty()) {
2065 new_args[0] = v;
2066 return eval(args[3], new_args);
2068 string r(1, '"');
2069 string elt = v;
2070 json_escape(elt);
2071 r += elt;
2072 r += '"';
2073 return r;
2075 break;
2077 case CMD_keys: {
2078 string prefix = args[0] + ',';
2079 auto i = option.lower_bound(prefix);
2080 for (; i != option.end() && startswith(i->first, prefix); ++i) {
2081 const string& key = i->first;
2082 if (!value.empty()) value += '\t';
2083 value.append(key, prefix.size(), string::npos);
2085 break;
2087 case CMD_last:
2088 value = str(last);
2089 break;
2090 case CMD_lastpage: {
2091 int l = mset.get_matches_estimated();
2092 if (l > 0) l = (l - 1) / hits_per_page + 1;
2093 value = str(l);
2094 break;
2096 case CMD_le:
2097 if (string_to_int(args[0]) <= string_to_int(args[1]))
2098 value = "true";
2099 break;
2100 case CMD_length:
2101 if (args[0].empty()) {
2102 value = "0";
2103 } else {
2104 size_t length = count(args[0].begin(), args[0].end(), '\t');
2105 value = str(length + 1);
2107 break;
2108 case CMD_list: {
2109 if (!args[0].empty()) {
2110 string pre, inter, interlast, post;
2111 switch (args.size()) {
2112 case 2:
2113 inter = interlast = args[1];
2114 break;
2115 case 3:
2116 inter = args[1];
2117 interlast = args[2];
2118 break;
2119 case 4:
2120 pre = args[1];
2121 inter = interlast = args[2];
2122 post = args[3];
2123 break;
2124 case 5:
2125 pre = args[1];
2126 inter = args[2];
2127 interlast = args[3];
2128 post = args[4];
2129 break;
2131 value += pre;
2132 string list = args[0];
2133 string::size_type split = 0, split2;
2134 while ((split2 = list.find('\t', split)) != string::npos) {
2135 if (split) value += inter;
2136 value.append(list, split, split2 - split);
2137 split = split2 + 1;
2139 if (split) value += interlast;
2140 value.append(list, split, string::npos);
2141 value += post;
2143 break;
2145 case CMD_log: {
2146 if (!vet_filename(args[0])) {
2147 value = "filename can't contain \"..\"";
2148 break;
2150 string logfile = log_dir + args[0];
2151 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
2152 if (fd == -1) {
2153 value = "open failed: ";
2154 value += strerror(errno);
2155 break;
2157 vector<string> noargs;
2158 noargs.resize(1);
2159 string line;
2160 if (args.size() > 1) {
2161 line = args[1];
2162 } else {
2163 line = DEFAULT_LOG_ENTRY;
2165 line = eval(line, noargs);
2166 line += '\n';
2167 if (write_all(fd, line.data(), line.length()) < 0) {
2168 value = "write failed: ";
2169 value += strerror(errno);
2171 close(fd);
2172 break;
2174 case CMD_lookup: {
2175 if (!vet_filename(args[0])) break;
2176 string cdbfile = cdb_dir + args[0];
2177 int fd = open(cdbfile.c_str(), O_RDONLY);
2178 if (fd == -1) break;
2180 struct cdb cdb;
2181 if (cdb_init(&cdb, fd) < 0) {
2182 close(fd);
2183 break;
2186 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
2187 size_t datalen = cdb_datalen(&cdb);
2188 const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
2189 if (dat) {
2190 value.assign(static_cast<const char *>(dat), datalen);
2194 cdb_free(&cdb);
2195 close(fd); // FIXME: cache fds?
2196 break;
2198 case CMD_lower:
2199 value = Xapian::Unicode::tolower(args[0]);
2200 break;
2201 case CMD_lt:
2202 if (string_to_int(args[0]) < string_to_int(args[1]))
2203 value = "true";
2204 break;
2205 case CMD_map:
2206 if (!args[0].empty()) {
2207 value = foreach(args[0], args[1], param, '\t');
2209 break;
2210 case CMD_match:
2211 omegascript_match(value, args);
2212 break;
2213 case CMD_max: {
2214 vector<string>::const_iterator i = args.begin();
2215 int val = string_to_int(*i++);
2216 for (; i != args.end(); ++i) {
2217 int x = string_to_int(*i);
2218 if (x > val) val = x;
2220 value = str(val);
2221 break;
2223 case CMD_min: {
2224 vector<string>::const_iterator i = args.begin();
2225 int val = string_to_int(*i++);
2226 for (; i != args.end(); ++i) {
2227 int x = string_to_int(*i);
2228 if (x < val) val = x;
2230 value = str(val);
2231 break;
2233 case CMD_mod: {
2234 int denom = string_to_int(args[1]);
2235 if (denom == 0) {
2236 value = "divide by 0";
2237 } else {
2238 value = str(string_to_int(args[0]) % denom);
2240 break;
2242 case CMD_msize:
2243 // Estimated number of matches.
2244 value = str(mset.get_matches_estimated());
2245 break;
2246 case CMD_msizeexact:
2247 // Is msize exact?
2248 if (mset.get_matches_lower_bound()
2249 == mset.get_matches_upper_bound())
2250 value = "true";
2251 break;
2252 case CMD_msizelower:
2253 // Lower bound on number of matches.
2254 value = str(mset.get_matches_lower_bound());
2255 break;
2256 case CMD_msizeupper:
2257 // Upper bound on number of matches.
2258 value = str(mset.get_matches_upper_bound());
2259 break;
2260 case CMD_mul: {
2261 vector<string>::const_iterator i = args.begin();
2262 int total = string_to_int(*i++);
2263 while (i != args.end())
2264 total *= string_to_int(*i++);
2265 value = str(total);
2266 break;
2268 case CMD_muldiv: {
2269 int denom = string_to_int(args[2]);
2270 if (denom == 0) {
2271 value = "divide by 0";
2272 } else {
2273 int num = string_to_int(args[0]) * string_to_int(args[1]);
2274 value = str(num / denom);
2276 break;
2278 case CMD_ne:
2279 if (args[0] != args[1]) value = "true";
2280 break;
2281 case CMD_nice: {
2282 string::const_iterator i = args[0].begin();
2283 int len = args[0].length();
2284 while (len) {
2285 value += *i++;
2286 if (--len && len % 3 == 0) value += option["thousand"];
2288 break;
2290 case CMD_not:
2291 if (args[0].empty()) value = "true";
2292 break;
2293 case CMD_now:
2294 value = str(static_cast<unsigned long>(time(NULL)));
2295 break;
2296 case CMD_opt:
2297 if (args.size() == 2) {
2298 value = option[args[0] + "," + args[1]];
2299 } else {
2300 value = option[args[0]];
2302 break;
2303 case CMD_or: {
2304 for (auto&& arg : args) {
2305 value = eval(arg, param);
2306 if (!value.empty()) break;
2308 break;
2310 case CMD_ord: {
2311 if (!args[0].empty()) {
2312 Utf8Iterator it(args[0]);
2313 value = str(*it);
2315 break;
2317 case CMD_pack: {
2318 int number;
2319 if (!parse_signed(args[0].c_str(), number)) {
2320 throw "NUMBER parameter for pack command "
2321 "must be an integer";
2323 value = int_to_binary_string(number);
2324 break;
2326 case CMD_percentage:
2327 // percentage score
2328 value = str(percent);
2329 break;
2330 case CMD_prettyterm:
2331 value = pretty_term(args[0]);
2332 break;
2333 case CMD_prettyurl:
2334 value = args[0];
2335 url_prettify(value);
2336 break;
2337 case CMD_query: {
2338 auto r = query_strings.equal_range(args.empty() ?
2339 string() : args[0]);
2340 for (auto j = r.first; j != r.second; ++j) {
2341 if (!value.empty()) value += '\t';
2342 const string & s = j->second;
2343 size_t start = 0, tab;
2344 while ((tab = s.find('\t', start)) != string::npos) {
2345 value.append(s, start, tab - start);
2346 value += ' ';
2347 start = tab + 1;
2349 value.append(s, start, string::npos);
2351 break;
2353 case CMD_querydescription:
2354 value = query.get_description();
2355 break;
2356 case CMD_queryterms:
2357 value = queryterms;
2358 break;
2359 case CMD_random: {
2360 if (!seed_set) {
2361 random_device rd;
2362 rng.seed(rd());
2363 seed_set = true;
2365 uniform_int_distribution<int>
2366 distr(0, string_to_int(args[0]));
2367 value = str(distr(rng));
2368 break;
2370 case CMD_range: {
2371 int start, end;
2372 if (!parse_signed(args[0].c_str(), start)) {
2373 throw "Start value for range command "
2374 "must be an integer";
2376 if (!parse_signed(args[1].c_str(), end)) {
2377 throw "End value for range command "
2378 "must be an integer";
2380 while (start <= end) {
2381 value += str(start);
2382 if (start < end) value += '\t';
2383 start++;
2385 break;
2387 case CMD_record: {
2388 Xapian::docid id = q0;
2389 if (!args.empty() &&
2390 (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
2391 throw "Document id for command record should be > 0";
2393 value = db.get_document(id).get_data();
2394 break;
2396 case CMD_relevant: {
2397 // document id if relevant; empty otherwise
2398 Xapian::docid id = q0;
2399 if (!args.empty() &&
2400 (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
2401 throw "Document id for command relevant should be > 0";
2403 auto i = ticked.find(id);
2404 if (i != ticked.end()) {
2405 i->second = false; // icky side-effect
2406 value = str(id);
2408 break;
2410 case CMD_relevants: {
2411 for (auto i : ticked) {
2412 if (i.second) {
2413 value += str(i.first);
2414 value += '\t';
2417 if (!value.empty()) value.erase(value.size() - 1);
2418 break;
2420 case CMD_score:
2421 // Score (0 to 10)
2422 value = str(percent / 10);
2423 break;
2424 case CMD_set:
2425 option[args[0]] = args[1];
2426 break;
2427 case CMD_seterror:
2428 error_msg = args[0];
2429 break;
2430 case CMD_setmap: {
2431 string base = args[0] + ',';
2432 if (args.size() % 2 != 1)
2433 throw string("$setmap requires an odd number of arguments");
2434 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2435 option[base + args[i]] = args[i + 1];
2437 break;
2439 case CMD_setrelevant: {
2440 string::size_type i = 0, j;
2441 while (true) {
2442 j = args[0].find_first_not_of("0123456789", i);
2443 Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2444 if (id) {
2445 rset.add_document(id);
2446 ticked[id] = true;
2448 if (j == string::npos) break;
2449 i = j + 1;
2451 break;
2453 case CMD_slice: {
2454 string list = args[0], pos = args[1];
2455 vector<string> items;
2456 string::size_type i = 0, j;
2457 while (true) {
2458 j = list.find('\t', i);
2459 items.push_back(list.substr(i, j - i));
2460 if (j == string::npos) break;
2461 i = j + 1;
2463 i = 0;
2464 bool have_added = false;
2465 while (true) {
2466 j = pos.find('\t', i);
2467 int item = string_to_int(pos.substr(i, j - i));
2468 if (item >= 0 && size_t(item) < items.size()) {
2469 if (have_added) value += '\t';
2470 value += items[item];
2471 have_added = true;
2473 if (j == string::npos) break;
2474 i = j + 1;
2476 break;
2478 case CMD_snippet: {
2479 size_t length = 200;
2480 if (args.size() > 1 && !args[1].empty()) {
2481 if (!parse_unsigned(args[1].c_str(), length)) {
2482 throw "Snippet length must be >= 0";
2485 unsigned flags = mset.SNIPPET_BACKGROUND_MODEL |
2486 mset.SNIPPET_EXHAUSTIVE;
2487 if (args.size() > 2 && !args[2].empty()) {
2488 flags = 0;
2489 const string& s = args[2];
2490 size_t i = 0;
2491 while (true) {
2492 size_t j = s.find('|', i);
2493 string flag(s, i, j - i);
2494 for (char& c : flag) {
2495 c = C_tolower(c);
2497 if (startswith(flag, "snippet_")) {
2498 flag.erase(0, CONST_STRLEN("snippet_"));
2500 if (flag == "background_model") {
2501 flags |= mset.SNIPPET_BACKGROUND_MODEL;
2502 } else if (flag == "cjk_ngram") {
2503 flags |= mset.SNIPPET_CJK_NGRAM;
2504 } else if (flag == "empty_without_match") {
2505 flags |= mset.SNIPPET_EMPTY_WITHOUT_MATCH;
2506 } else if (flag == "exhaustive") {
2507 flags |= mset.SNIPPET_EXHAUSTIVE;
2508 } else if (flag == "ngrams") {
2509 flags |= mset.SNIPPET_NGRAMS;
2510 } else if (flag == "word_breaks") {
2511 flags |= mset.SNIPPET_WORD_BREAKS;
2512 } else {
2513 throw "Unknown $snippet flag '" + flag + "'";
2515 if (j == string::npos) break;
2516 i = j + 1;
2519 string bra, ket, gap;
2520 if (args.size() > 3) {
2521 bra = args[3];
2522 } else {
2523 bra = "<strong>";
2525 if (args.size() > 4) {
2526 ket = args[4];
2527 } else {
2528 ket = "</strong>";
2530 if (args.size() > 5) {
2531 gap = args[5];
2532 } else {
2533 gap = "...";
2535 if (!stemmer)
2536 stemmer = new Xapian::Stem(option["stemmer"]);
2537 value = mset.snippet(args[0], length, *stemmer, flags,
2538 bra, ket, gap);
2539 break;
2541 case CMD_sort:
2542 omegascript_sort(args, value);
2543 break;
2544 case CMD_sortableunserialise:
2545 // FIXME: This uses printf %f - maybe we want more than 6
2546 // decimal places in some cases though...
2547 value = double_to_string(Xapian::sortable_unserialise(args[0]));
2548 break;
2549 case CMD_split: {
2550 string split;
2551 if (args.size() == 1) {
2552 split = " ";
2553 value = args[0];
2554 } else {
2555 split = args[0];
2556 value = args[1];
2558 string::size_type i = 0;
2559 while (true) {
2560 if (split.empty()) {
2561 ++i;
2562 if (i >= value.size()) break;
2563 } else {
2564 i = value.find(split, i);
2565 if (i == string::npos) break;
2567 value.replace(i, split.size(), 1, '\t');
2568 ++i;
2570 break;
2572 case CMD_srandom: {
2573 int seed = string_to_int(args[0]);
2574 rng.seed(seed);
2575 seed_set = true;
2576 break;
2578 case CMD_stoplist: {
2579 Xapian::TermIterator i = qp.stoplist_begin();
2580 Xapian::TermIterator end = qp.stoplist_end();
2581 while (i != end) {
2582 if (!value.empty()) value += '\t';
2583 value += *i;
2584 ++i;
2586 break;
2588 case CMD_sub:
2589 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2590 break;
2591 case CMD_subdb: {
2592 Xapian::docid id = q0;
2593 if (args.size() > 0 &&
2594 (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
2595 throw "Document id of the subdb command should be > 0";
2597 value = subdbs[(id - 1) % subdbs.size()].get_name();
2598 break;
2600 case CMD_subid: {
2601 Xapian::docid id = q0;
2602 if (args.size() > 0 &&
2603 (!parse_unsigned(args[0].c_str(), id) || id == 0)) {
2604 throw "Document id of the subid command should be > 0";
2606 // This is the docid in the single shard.
2607 Xapian::docid shard_did = (id - 1) / subdbs.size() + 1;
2608 // We now need to map this back to the docid in the collection
2609 // of shards specified by the DB parameter value which $subdb
2610 // returns.
2611 const SubDB& subdb = subdbs[(id - 1) % subdbs.size()];
2612 value = str(subdb.map_docid(shard_did));
2613 break;
2615 case CMD_substr: {
2616 int start;
2617 if (!parse_signed(args[1].c_str(), start)) {
2618 throw "Start value for substr command "
2619 "must be an integer";
2621 if (start < 0) {
2622 if (static_cast<size_t>(-start) >= args[0].size()) {
2623 start = 0;
2624 } else {
2625 start = static_cast<int>(args[0].size()) + start;
2627 } else {
2628 if (static_cast<size_t>(start) >= args[0].size()) break;
2630 size_t len = string::npos;
2631 if (args.size() > 2) {
2632 int int_len;
2633 if (!parse_signed(args[2].c_str(), int_len)) {
2634 throw "Length value for substr command "
2635 "must be an integer";
2637 if (int_len >= 0) {
2638 len = size_t(int_len);
2639 } else {
2640 len = args[0].size() - start;
2641 if (static_cast<size_t>(-int_len) >= len) {
2642 len = 0;
2643 } else {
2644 len -= static_cast<size_t>(-int_len);
2648 value.assign(args[0], start, len);
2649 break;
2651 case CMD_suggestion:
2652 value = qp.get_corrected_query_string();
2653 break;
2654 case CMD_switch: {
2655 const string& val = args[0];
2656 for (size_t i = 1; i < args.size(); i += 2) {
2657 if (i == args.size() - 1) {
2658 // Handle optional "else" value.
2659 value = eval(args[i], param);
2660 break;
2662 if (val == eval(args[i], param)) {
2663 value = eval(args[i + 1], param);
2664 break;
2667 break;
2669 case CMD_termprefix:
2670 (void)prefix_from_term(&value, args[0]);
2671 break;
2672 case CMD_terms: {
2673 // list of matching terms
2674 if (!enquire) break;
2675 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2676 if (args.empty()) {
2677 while (term != enquire->get_matching_terms_end(q0)) {
2678 // check term was in the typed query so we ignore
2679 // boolean filter terms
2680 const string & t = *term;
2681 if (termset.find(t) != termset.end()) {
2682 value += t;
2683 value += '\t';
2685 ++term;
2687 } else {
2688 // Return matching terms with specified prefix. We can't
2689 // use skip_to() as the terms aren't ordered by termname.
2690 const string & pfx = args[0];
2691 while (term != enquire->get_matching_terms_end(q0)) {
2692 const string & t = *term;
2693 if (startswith(t, pfx)) {
2694 value += t;
2695 value += '\t';
2697 ++term;
2701 if (!value.empty()) value.erase(value.size() - 1);
2702 break;
2704 case CMD_thispage:
2705 value = str(topdoc / hits_per_page + 1);
2706 break;
2707 case CMD_time:
2708 if (secs >= 0) {
2709 char buf[64];
2710 snprintf(buf, sizeof(buf), "%.6f", secs);
2711 // MSVC's snprintf omits the zero byte if the string is
2712 // sizeof(buf) long.
2713 buf[sizeof(buf) - 1] = '\0';
2714 value = buf;
2716 break;
2717 case CMD_topdoc:
2718 // first document on current page of hit list (counting from 0)
2719 value = str(topdoc);
2720 break;
2721 case CMD_topterms:
2722 if (enquire) {
2723 int howmany = 16;
2724 if (!args.empty()) {
2725 if (!parse_signed(args[0].c_str(), howmany)) {
2726 throw "Number of terms for command "
2727 "topterms must be an integer";
2730 if (howmany < 0) howmany = 0;
2731 // List of expand terms
2732 Xapian::ESet eset;
2733 OmegaExpandDecider decider(db, &termset);
2735 if (!rset.empty()) {
2736 set_expansion_scheme(*enquire, option);
2737 eset = enquire->get_eset(howmany * 2, rset, &decider);
2738 } else if (mset.size()) {
2739 // invent an rset
2740 Xapian::RSet tmp;
2742 int c = 5;
2743 // FIXME: what if mset does not start at first match?
2744 for (Xapian::docid did : mset) {
2745 tmp.add_document(did);
2746 if (--c == 0) break;
2749 set_expansion_scheme(*enquire, option);
2750 eset = enquire->get_eset(howmany * 2, tmp, &decider);
2753 // Don't show more than one word with the same stem.
2754 set<string> stems;
2755 Xapian::ESetIterator i;
2756 for (i = eset.begin(); i != eset.end(); ++i) {
2757 string term(*i);
2758 string stem = (*stemmer)(term);
2759 if (stems.find(stem) != stems.end()) continue;
2760 stems.insert(stem);
2761 value += term;
2762 value += '\t';
2763 if (--howmany == 0) break;
2765 if (!value.empty()) value.erase(value.size() - 1);
2767 break;
2768 case CMD_transform:
2769 omegascript_transform(value, args);
2770 break;
2771 case CMD_truncate: {
2772 unsigned int length;
2773 if (!parse_unsigned(args[1].c_str(), length)) {
2774 throw "Length for truncate command must be >= 0";
2776 value = generate_sample(args[0],
2777 length,
2778 args.size() > 2 ? args[2] : string(),
2779 args.size() > 3 ? args[3] : string());
2780 break;
2782 case CMD_uniq: {
2783 const string &list = args[0];
2784 if (list.empty()) break;
2785 string::size_type split = 0, split2;
2786 string prev;
2787 do {
2788 split2 = list.find('\t', split);
2789 string item(list, split, split2 - split);
2790 if (split == 0) {
2791 value = item;
2792 } else if (item != prev) {
2793 value += '\t';
2794 value += item;
2796 prev = item;
2797 split = UNSIGNED_OVERFLOW_OK(split2 + 1);
2798 } while (split2 != string::npos);
2799 break;
2801 case CMD_unique: {
2802 unordered_set<string> seen;
2803 const string &list = args[0];
2804 if (list.empty()) break;
2805 string::size_type split = 0, split2;
2806 do {
2807 split2 = list.find('\t', split);
2808 string item(list, split, split2 - split);
2809 if (seen.insert(item).second) {
2810 if (split != 0)
2811 value += '\t';
2812 value += item;
2814 split = UNSIGNED_OVERFLOW_OK(split2 + 1);
2815 } while (split2 != string::npos);
2816 break;
2818 case CMD_unpack:
2819 value = str(binary_string_to_int(args[0]));
2820 break;
2821 case CMD_unprefix: {
2822 size_t prefix_len = prefix_from_term(NULL, args[0]);
2823 value.assign(args[0], prefix_len, string::npos);
2824 break;
2826 case CMD_unstem: {
2827 const string &term = args[0];
2828 Xapian::TermIterator i = qp.unstem_begin(term);
2829 Xapian::TermIterator end = qp.unstem_end(term);
2830 while (i != end) {
2831 if (!value.empty()) value += '\t';
2832 value += *i;
2833 ++i;
2835 break;
2837 case CMD_upper:
2838 value = Xapian::Unicode::toupper(args[0]);
2839 break;
2840 case CMD_url:
2841 url_encode(value, args[0]);
2842 break;
2843 case CMD_value: {
2844 Xapian::docid id = q0;
2845 Xapian::valueno slot;
2846 if (!parse_unsigned(args[0].c_str(), slot)) {
2847 throw "Value slot number should be >= 0";
2849 if (args.size() > 1 &&
2850 (!parse_unsigned(args[1].c_str(), id) || id == 0)) {
2851 throw "Document id for value command must be > 0";
2853 value = db.get_document(id).get_value(slot);
2854 break;
2856 case CMD_valuelowerbound: {
2857 Xapian::valueno slot;
2858 if (!parse_unsigned(args[0].c_str(), slot)) {
2859 throw "Value slot number should be >= 0";
2861 value = db.get_value_lower_bound(slot);
2862 break;
2864 case CMD_valueupperbound: {
2865 Xapian::valueno slot;
2866 if (!parse_unsigned(args[0].c_str(), slot)) {
2867 throw "Value slot number should be >= 0";
2869 value = db.get_value_upper_bound(slot);
2870 break;
2872 case CMD_version:
2873 value = PACKAGE_STRING;
2874 break;
2875 case CMD_weight:
2876 value = double_to_string(weight);
2877 break;
2878 default: {
2879 args.insert(args.begin(), param[0]);
2880 int macro_no = func->second->tag - CMD_MACRO;
2881 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2882 // throw "Unknown function '" + var + "'";
2883 value = eval(macros[macro_no], args);
2884 break;
2887 res += value;
2888 } catch (const Xapian::Error & e) {
2889 // FIXME: this means we only see the most recent error in $error
2890 // - is that the best approach?
2891 error_msg = e.get_description();
2894 res.append(fmt, p, string::npos);
2895 return res;
2898 static string
2899 eval_file(const string& fmtfile, bool* p_not_found)
2901 // Use -1 to indicate vet_filename() failed.
2902 int eno = -1;
2903 if (vet_filename(fmtfile)) {
2904 string file = template_dir + fmtfile;
2905 string fmt;
2906 errno = 0;
2907 if (load_file(file, fmt)) {
2908 vector<string> noargs;
2909 noargs.resize(1);
2910 return eval(fmt, noargs);
2912 eno = errno;
2915 if (p_not_found) {
2916 *p_not_found = true;
2917 return string();
2920 // FIXME: report why!
2921 string msg = string("Couldn't read format template '") + fmtfile + '\'';
2922 if (eno) {
2923 msg += " (";
2924 msg += (eno < 0 ? "name contains '..'" : strerror(eno));
2925 msg += ')';
2927 throw msg;
2930 extern string
2931 pretty_term(string term)
2933 // Just leave empty strings and single characters alone.
2934 if (term.length() <= 1) return term;
2936 // Assume unprefixed terms are unstemmed.
2937 if (!C_isupper(term[0])) return term;
2939 // Handle stemmed terms.
2940 bool stemmed = (term[0] == 'Z');
2941 if (stemmed) {
2942 // First of all, check if a term in the query stemmed to this one.
2943 Xapian::TermIterator u = qp.unstem_begin(term);
2944 // There might be multiple words with the same stem, but we only want
2945 // one so just take the first.
2946 if (u != qp.unstem_end(term)) return *u;
2948 // Remove the 'Z'.
2949 term.erase(0, 1);
2952 bool add_quotes = false;
2954 // Check if the term has a prefix.
2955 if (C_isupper(term[0])) {
2956 // See if we have this prefix in the termprefix_to_userprefix map. If
2957 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2958 string prefix;
2959 size_t prefix_len = prefix_from_term(&prefix, term);
2961 map<string, string>::const_iterator i;
2962 i = termprefix_to_userprefix.find(prefix);
2963 if (i != termprefix_to_userprefix.end()) {
2964 string user_prefix = i->second;
2965 user_prefix += ':';
2966 term.replace(0, prefix_len, user_prefix);
2967 } else {
2968 // We don't have a prefix mapping for this, so just set a flag to
2969 // add quotes around the term.
2970 add_quotes = true;
2974 if (stemmed) term += '.';
2976 if (add_quotes) {
2977 term.insert(0, "\"");
2978 term.append("\"");
2981 return term;
2984 static string
2985 print_caption(const string& fmt, vector<string>& param)
2987 q0 = *(mset[hit_no]);
2989 weight = mset[hit_no].get_weight();
2990 percent = mset.convert_to_percent(mset[hit_no]);
2991 collapsed = mset[hit_no].get_collapse_count();
2993 return eval(fmt, param);
2996 void
2997 parse_omegascript()
2999 try {
3000 string output = eval_file(fmtname);
3001 if (!set_content_type && !suppress_http_headers) {
3002 cout << "Content-Type: text/html" << endl;
3003 set_content_type = true;
3005 if (!suppress_http_headers) cout << endl;
3006 cout << output;
3007 } catch (...) {
3008 // Ensure the headers have been output so that any exception gets
3009 // reported rather than giving a server error.
3010 if (!set_content_type && !suppress_http_headers) {
3011 cout << "Content-Type: text/html" << endl;
3012 set_content_type = true;
3014 if (!suppress_http_headers) cout << endl;
3015 throw;
3019 static void
3020 ensure_query_parsed()
3022 if (query_parsed) return;
3023 query_parsed = true;
3025 // Should we discard the existing R-set recorded in R CGI parameters?
3026 bool discard_rset = false;
3028 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
3029 // CGI parameters)?
3030 bool force_first_page = false;
3032 string v;
3033 // get list of terms from previous iteration of query
3034 auto val = cgi_params.find("xP");
3035 if (val != cgi_params.end()) {
3036 v = val->second;
3037 // If xP given, default to discarding any RSet and forcing the first
3038 // page of results. If the query is the same, or an extension of
3039 // the previous query, we adjust these again below.
3040 discard_rset = true;
3041 force_first_page = true;
3043 querytype result = parse_queries(v);
3044 switch (result) {
3045 case BAD_QUERY:
3046 break;
3047 case NEW_QUERY:
3048 break;
3049 case SAME_QUERY:
3050 case EXTENDED_QUERY:
3051 // If we've changed database, force the first page of hits
3052 // and discard the R-set (since the docids will have changed)
3053 val = cgi_params.find("xDB");
3054 if (val != cgi_params.end() && val->second != dbname) break;
3055 if (result == SAME_QUERY && force_first_page) {
3056 val = cgi_params.find("xFILTERS");
3057 if (val != cgi_params.end() && val->second != filters &&
3058 val->second != old_filters) {
3059 // Filters have changed since last query.
3060 } else {
3061 force_first_page = false;
3064 discard_rset = false;
3065 break;
3068 if (!force_first_page) {
3069 // Work out which mset element is the first hit we want
3070 // to display
3071 val = cgi_params.find("TOPDOC");
3072 if (val != cgi_params.end()) {
3073 if (!parse_unsigned(val->second.c_str(), topdoc)) {
3074 throw "TOPDOC parameter must be >= 0";
3078 // Handle next, previous, and page links
3079 if (cgi_params.find(">") != cgi_params.end()) {
3080 topdoc += hits_per_page;
3081 } else if (cgi_params.find("<") != cgi_params.end()) {
3082 if (topdoc >= hits_per_page)
3083 topdoc -= hits_per_page;
3084 else
3085 topdoc = 0;
3086 } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
3087 (val = cgi_params.find("#")) != cgi_params.end()) {
3088 if (!C_isdigit(val->second[0])) {
3089 throw "Page parameter must be >= 0";
3091 long page = atol(val->second.c_str());
3092 // Do something sensible for page 0 (we count pages from 1).
3093 if (page == 0) page = 1;
3094 topdoc = (page - 1) * hits_per_page;
3097 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
3098 // Normally we snap TOPDOC like this so that things work nicely if
3099 // HITSPERPAGE is in a <select> or on radio buttons. If we're
3100 // postprocessing the output of omega and want variable sized pages,
3101 // this is unhelpful.
3102 bool raw_search = false;
3103 val = cgi_params.find("RAWSEARCH");
3104 if (val != cgi_params.end()) {
3105 unsigned int temp;
3106 if (!parse_unsigned(val->second.c_str(), temp)) {
3107 throw "RAWSEARCH parameter must be >= 0";
3109 raw_search = bool(temp);
3112 if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
3115 if (!discard_rset) {
3116 // put documents marked as relevant into the rset
3117 auto g = cgi_params.equal_range("R");
3118 for (auto i = g.first; i != g.second; ++i) {
3119 const string & value = i->second;
3120 for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
3121 while (value[j] == '.') ++j;
3122 Xapian::docid d;
3123 if (!parse_unsigned(value.c_str() + j, d) || d == 0) {
3124 throw "Document id for 'R' parameter must be > 0";
3126 if (d) {
3127 rset.add_document(d);
3128 ticked[d] = true;
3135 // run query if we haven't already
3136 static void
3137 ensure_match()
3139 if (done_query) return;
3141 secs = RealTime::now();
3142 run_query();
3143 if (secs != -1)
3144 secs = RealTime::now() - secs;
3146 done_query = true;
3147 last = mset.get_matches_lower_bound();
3148 if (last == 0) {
3149 // Otherwise topdoc ends up being -6 if it's non-zero!
3150 topdoc = 0;
3151 } else {
3152 if (topdoc >= last)
3153 topdoc = ((last - 1) / hits_per_page) * hits_per_page;
3154 // last is the count of documents up to the end of the current page
3155 // (as returned by $last)
3156 if (topdoc + hits_per_page < last)
3157 last = topdoc + hits_per_page;
3161 // OmegaExpandDecider methods.
3163 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
3164 set<string> * querytermset)
3165 : db(db_)
3167 // We'll want the stemmer for testing matches anyway.
3168 if (!stemmer)
3169 stemmer = new Xapian::Stem(option["stemmer"]);
3170 if (querytermset) {
3171 set<string>::const_iterator i;
3172 for (i = querytermset->begin(); i != querytermset->end(); ++i) {
3173 string term(*i);
3174 if (term.empty()) continue;
3176 unsigned char ch = term[0];
3177 bool stemmed = (ch == 'Z');
3178 if (stemmed) {
3179 term.erase(0, 1);
3180 if (term.empty()) continue;
3181 ch = term[0];
3184 if (C_isupper(ch)) {
3185 size_t prefix_len = prefix_from_term(NULL, term);
3186 term.erase(0, prefix_len);
3189 if (!stemmed) term = (*stemmer)(term);
3191 exclude_stems.insert(term);
3196 bool
3197 OmegaExpandDecider::operator()(const string & term) const
3199 unsigned char ch = term[0];
3201 // Reject terms with a prefix.
3202 if (C_isupper(ch)) return false;
3205 MyStopper stopper;
3206 // Don't suggest stopwords.
3207 if (stopper(term)) return false;
3210 // Reject small numbers.
3211 if (term.size() < 4 && C_isdigit(ch)) return false;
3213 // Reject terms containing a space.
3214 if (term.find(' ') != string::npos) return false;
3216 // Skip terms with stems in the exclude_stems set, to avoid suggesting
3217 // terms which are already in the query in some form.
3218 string stem = (*stemmer)(term);
3219 if (exclude_stems.find(stem) != exclude_stems.end())
3220 return false;
3222 // Ignore terms that only occur once (hapaxes) since they aren't
3223 // useful for finding related documents - they only occur in a
3224 // document that's already been marked as relevant.
3225 // FIXME: add an expand option to ignore terms where
3226 // termfreq == rtermfreq.
3227 if (db.get_termfreq(term) <= 1) return false;
3229 return true;